]> git.donarmstrong.com Git - mothur.git/blob - parsefastaqcommand.cpp
working on sra and get.mimarkspackage commands. added file parameter to cluster...
[mothur.git] / parsefastaqcommand.cpp
1 /*
2  *  parsefastaqcommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 9/30/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "parsefastaqcommand.h"
11 #include "sequence.hpp"
12
13 //**********************************************************************************************************************
14 vector<string> ParseFastaQCommand::setParameters(){     
15         try {
16                 CommandParameter pfastq("fastq", "InputTypes", "", "", "none", "none", "none","",false,true,true); parameters.push_back(pfastq);
17         CommandParameter poligos("oligos", "InputTypes", "", "", "oligosGroup", "none", "none","",false,false); parameters.push_back(poligos);
18         CommandParameter pgroup("group", "InputTypes", "", "", "oligosGroup", "none", "none","",false,false); parameters.push_back(pgroup);
19         CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(ppdiffs);
20                 CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pbdiffs);
21         CommandParameter pldiffs("ldiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pldiffs);
22                 CommandParameter psdiffs("sdiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(psdiffs);
23         CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(ptdiffs);
24                 CommandParameter pfasta("fasta", "Boolean", "", "T", "", "", "","fasta",false,false); parameters.push_back(pfasta);
25                 CommandParameter pqual("qfile", "Boolean", "", "T", "", "", "","qfile",false,false); parameters.push_back(pqual);
26         CommandParameter ppacbio("pacbio", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(ppacbio);
27                 CommandParameter pformat("format", "Multiple", "sanger-illumina-solexa-illumina1.8+", "sanger", "", "", "","",false,false,true); parameters.push_back(pformat);
28         CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
29                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
30                 
31                 vector<string> myArray;
32                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
33                 return myArray;
34         }
35         catch(exception& e) {
36                 m->errorOut(e, "ParseFastaQCommand", "setParameters");
37                 exit(1);
38         }
39 }
40 //**********************************************************************************************************************
41 string ParseFastaQCommand::getHelpString(){     
42         try {
43                 string helpString = "";
44                 helpString += "The fastq.info command reads a fastq file and creates a fasta and quality file.\n";
45                 helpString += "The fastq.info command parameters are fastq, fasta, qfile, oligos, group and format; fastq is required.\n";
46         helpString += "The fastq.info command should be in the following format: fastq.info(fastaq=yourFastaQFile).\n";
47         helpString += "The oligos parameter allows you to provide an oligos file to split your fastq file into separate fastq files by barcode and primers. \n";
48         helpString += "The group parameter allows you to provide a group file to split your fastq file into separate fastq files by group. \n";
49         helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the reads. The default is pdiffs + bdiffs + sdiffs + ldiffs.\n";
50                 helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n";
51                 helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n";
52         helpString += "The ldiffs parameter is used to specify the number of differences allowed in the linker. The default is 0.\n";
53                 helpString += "The sdiffs parameter is used to specify the number of differences allowed in the spacer. The default is 0.\n";
54                 helpString += "The format parameter is used to indicate whether your sequences are sanger, solexa, illumina1.8+ or illumina, default=sanger.\n";
55         helpString += "The fasta parameter allows you to indicate whether you want a fasta file generated. Default=T.\n";
56         helpString += "The qfile parameter allows you to indicate whether you want a quality file generated. Default=T.\n";
57         helpString += "The pacbio parameter allows you to indicate .... When set to true, quality scores of 0 will results in a corresponding base of N. Default=F.\n";
58                 helpString += "Example fastq.info(fastaq=test.fastaq).\n";
59                 helpString += "Note: No spaces between parameter labels (i.e. fastq), '=' and yourFastQFile.\n";
60                 return helpString;
61         }
62         catch(exception& e) {
63                 m->errorOut(e, "ParseFastaQCommand", "getHelpString");
64                 exit(1);
65         }
66 }
67 //**********************************************************************************************************************
68 string ParseFastaQCommand::getOutputPattern(string type) {
69     try {
70         string pattern = "";
71         
72         if (type == "fasta") {  pattern = "[filename],fasta"; } 
73         else if (type == "qfile") {  pattern = "[filename],qual"; }
74         else if (type == "fastq") {  pattern = "[filename],[group],fastq"; } 
75         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
76         
77         return pattern;
78     }
79     catch(exception& e) {
80         m->errorOut(e, "ParseFastaQCommand", "getOutputPattern");
81         exit(1);
82     }
83 }
84 //**********************************************************************************************************************
85 ParseFastaQCommand::ParseFastaQCommand(){       
86         try {
87                 abort = true; calledHelp = true; 
88                 setParameters();
89                 vector<string> tempOutNames;
90                 outputTypes["fasta"] = tempOutNames;
91                 outputTypes["qfile"] = tempOutNames;
92         outputTypes["fastq"] = tempOutNames;
93         }
94         catch(exception& e) {
95                 m->errorOut(e, "ParseFastaQCommand", "ParseFastaQCommand");
96                 exit(1);
97         }
98 }
99 //**********************************************************************************************************************
100 ParseFastaQCommand::ParseFastaQCommand(string option){
101         try {
102                 abort = false; calledHelp = false;
103         split = 1;
104                 
105                 if(option == "help") {  help(); abort = true; calledHelp = true; }
106                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
107                 
108                 else {
109                         vector<string> myArray = setParameters();
110                         
111                         OptionParser parser(option);
112                         map<string,string> parameters = parser.getParameters();
113                         
114                         ValidParameters validParameter;
115                         map<string,string>::iterator it;
116
117                         //check to make sure all parameters are valid for command
118                         for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
119                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
120                         }
121                         
122                         //initialize outputTypes
123                         vector<string> tempOutNames;
124                         outputTypes["fasta"] = tempOutNames;
125                         outputTypes["qfile"] = tempOutNames;
126             outputTypes["fastq"] = tempOutNames;
127                         
128                         //if the user changes the input directory command factory will send this info to us in the output parameter 
129                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
130                         if (inputDir == "not found"){   inputDir = "";          }
131                         else {
132                                 string path;
133                                 it = parameters.find("fastq");
134                                 //user has given a template file
135                                 if(it != parameters.end()){ 
136                                         path = m->hasPath(it->second);
137                                         //if the user has not given a path then, add inputdir. else leave path alone.
138                                         if (path == "") {       parameters["fastq"] = inputDir + it->second;            }
139                                 }
140                 
141                 it = parameters.find("oligos");
142                                 //user has given a template file
143                                 if(it != parameters.end()){
144                                         path = m->hasPath(it->second);
145                                         //if the user has not given a path then, add inputdir. else leave path alone.
146                                         if (path == "") {       parameters["oligos"] = inputDir + it->second;           }
147                                 }
148                 
149                 it = parameters.find("group");
150                                 //user has given a template file
151                                 if(it != parameters.end()){
152                                         path = m->hasPath(it->second);
153                                         //if the user has not given a path then, add inputdir. else leave path alone.
154                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
155                                 }
156                         }
157                         
158                         //check for required parameters
159                         fastaQFile = validParameter.validFile(parameters, "fastq", true);
160                         if (fastaQFile == "not found") {        m->mothurOut("fastq is a required parameter for the fastq.info command.");      m->mothurOutEndLine();  abort = true;   }
161                         else if (fastaQFile == "not open")      {       fastaQFile = ""; abort = true;  }
162             
163             oligosfile = validParameter.validFile(parameters, "oligos", true);
164                         if (oligosfile == "not found") {        oligosfile = "";        }
165                         else if (oligosfile == "not open")      {       oligosfile = ""; abort = true;  }
166             else { m->setOligosFile(oligosfile); split = 2; }
167             
168             groupfile = validParameter.validFile(parameters, "group", true);
169                         if (groupfile == "not found") { groupfile = ""; }
170                         else if (groupfile == "not open")       {       groupfile = ""; abort = true;   }
171             else { m->setGroupFile(groupfile); split = 2; }
172             
173             if ((groupfile != "") && (oligosfile != "")) { m->mothurOut("You must enter ONLY ONE of the following: oligos or group."); m->mothurOutEndLine(); abort = true;  }
174                         
175                         //if the user changes the output directory command factory will send this info to us in the output parameter 
176                         outputDir = validParameter.validFile(parameters, "outputdir", false);   if (outputDir == "not found"){  outputDir = m->hasPath(fastaQFile);     }
177                         
178                         string temp;
179                         temp = validParameter.validFile(parameters, "fasta", false);    if(temp == "not found"){        temp = "T";     }
180                         fasta = m->isTrue(temp); 
181
182                         temp = validParameter.validFile(parameters, "qfile", false);    if(temp == "not found"){        temp = "T";     }
183                         qual = m->isTrue(temp);
184             
185             temp = validParameter.validFile(parameters, "pacbio", false);       if(temp == "not found"){        temp = "F";     }
186                         pacbio = m->isTrue(temp);
187
188             temp = validParameter.validFile(parameters, "bdiffs", false);               if (temp == "not found") { temp = "0"; }
189                         m->mothurConvert(temp, bdiffs);
190                         
191                         temp = validParameter.validFile(parameters, "pdiffs", false);           if (temp == "not found") { temp = "0"; }
192                         m->mothurConvert(temp, pdiffs);
193             
194             temp = validParameter.validFile(parameters, "ldiffs", false);               if (temp == "not found") { temp = "0"; }
195                         m->mothurConvert(temp, ldiffs);
196             
197             temp = validParameter.validFile(parameters, "sdiffs", false);               if (temp == "not found") { temp = "0"; }
198                         m->mothurConvert(temp, sdiffs);
199                         
200                         temp = validParameter.validFile(parameters, "tdiffs", false);           if (temp == "not found") { int tempTotal = pdiffs + bdiffs + ldiffs + sdiffs;  temp = toString(tempTotal); }
201                         m->mothurConvert(temp, tdiffs);
202                         
203                         if(tdiffs == 0){        tdiffs = bdiffs + pdiffs + ldiffs + sdiffs;     }
204
205                         
206             format = validParameter.validFile(parameters, "format", false);             if (format == "not found"){     format = "sanger";      }
207             
208             if ((format != "sanger") && (format != "illumina") && (format != "illumina1.8+") && (format != "solexa"))  { 
209                                 m->mothurOut(format + " is not a valid format. Your format choices are sanger, solexa, illumina1.8+ and illumina, aborting." ); m->mothurOutEndLine();
210                                 abort=true;
211                         }
212
213                         if ((!fasta) && (!qual)) { m->mothurOut("[ERROR]: no outputs selected. Aborting."); m->mothurOutEndLine(); abort=true; }
214
215                 }               
216         }
217         catch(exception& e) {
218                 m->errorOut(e, "ParseFastaQCommand", "ParseFastaQCommand");
219                 exit(1);
220         }
221 }
222 //**********************************************************************************************************************
223
224 int ParseFastaQCommand::execute(){
225         try {
226                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
227                 
228                 //open Output Files
229         map<string, string> variables; 
230         variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastaQFile));
231                 string fastaFile = getOutputFileName("fasta",variables);
232                 string qualFile = getOutputFileName("qfile",variables);
233                 ofstream outFasta, outQual;
234                 
235                 if (fasta) { m->openOutputFile(fastaFile, outFasta);  outputNames.push_back(fastaFile); outputTypes["fasta"].push_back(fastaFile);      }
236                 if (qual) { m->openOutputFile(qualFile, outQual);       outputNames.push_back(qualFile);  outputTypes["qfile"].push_back(qualFile);             }
237         
238         TrimOligos* trimOligos = NULL;
239         int numBarcodes, numPrimers; numBarcodes = 0; numPrimers = 0;
240         if (oligosfile != "")       {
241             readOligos(oligosfile);
242             numPrimers = primers.size(); numBarcodes = barcodes.size();
243             //find group read belongs to
244             if (pairedOligos)   {   trimOligos = new TrimOligos(pdiffs, bdiffs, 0, 0, pairedPrimers, pairedBarcodes); numBarcodes = pairedBarcodes.size(); numPrimers = pairedPrimers.size(); }
245             else                {   trimOligos = new TrimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimer, linker, spacer);  }
246
247         }
248         else if (groupfile != "")   { readGroup(groupfile);     }
249                 
250                 ifstream in;
251                 m->openInputFile(fastaQFile, in);
252         
253         //fill convert table - goes from solexa to sanger. Used fq_all2std.pl as a reference.
254         for (int i = -64; i < 65; i++) { 
255             char temp = (char) ((int)(33 + 10*log(1+pow(10,(i/10.0)))/log(10)+0.499));
256             convertTable.push_back(temp);
257         }
258                 
259         
260         int count = 0;
261                 while (!in.eof()) {
262                         
263                         if (m->control_pressed) { break; }
264             
265             bool ignore;
266             fastqRead2 thisRead = readFastq(in, ignore);
267             
268             if (!ignore) {
269                 vector<int> qualScores;
270                 if (qual) {
271                     qualScores = convertQual(thisRead.quality);
272                     outQual << ">" << thisRead.seq.getName() << endl;
273                     for (int i = 0; i < qualScores.size(); i++) { outQual << qualScores[i] << " "; }
274                     outQual << endl;
275                 }
276                 
277                 if (m->control_pressed) { break; }
278                 
279                 if (pacbio) {
280                     if (!qual) { qualScores = convertQual(thisRead.quality); } //convert if not done
281                     string sequence = thisRead.seq.getAligned();
282                     for (int i = 0; i < qualScores.size(); i++) {
283                         if (qualScores[i] == 0){ sequence[i] = 'N'; }
284                     }
285                     thisRead.seq.setAligned(sequence);
286                 }
287                 
288                 //print sequence info to files
289                 if (fasta) { thisRead.seq.printSequence(outFasta); }
290                 
291                 if (split > 1) {
292                     int barcodeIndex, primerIndex, trashCodeLength;
293                     if (oligosfile != "")      {  trashCodeLength = findGroup(thisRead, barcodeIndex, primerIndex, trimOligos, numBarcodes, numPrimers);    }
294                     else if (groupfile != "")  {  trashCodeLength = findGroup(thisRead, barcodeIndex, primerIndex, "groupMode");   }
295                     else {  m->mothurOut("[ERROR]: uh oh, we shouldn't be here...\n"); }
296                     
297                     if(trashCodeLength == 0){
298                         ofstream out;
299                         m->openOutputFileAppend(fastqFileNames[barcodeIndex][primerIndex], out);
300                         out << thisRead.wholeRead;
301                         out.close();
302                     }else{
303                         ofstream out;
304                         m->openOutputFileAppend(noMatchFile, out);
305                         out << thisRead.wholeRead;
306                         out.close();
307                     }
308                 }
309                 //report progress
310                 if((count+1) % 10000 == 0){     m->mothurOut(toString(count+1)); m->mothurOutEndLine();         }
311                 count++;
312                         }
313                 }
314                 
315                 in.close();
316                 if (fasta)      { outFasta.close();     }
317                 if (qual)       { outQual.close();      }
318         
319         //report progress
320                 if (!m->control_pressed) {   if((count) % 10000 != 0){  m->mothurOut(toString(count)); m->mothurOutEndLine();           }  }
321         
322         if (split > 1) {
323             
324             if (groupfile != "")        { delete groupMap;      }
325             else if (oligosfile != "")  { delete trimOligos;    }
326            
327                         map<string, string>::iterator it;
328                         set<string> namesToRemove;
329                         for(int i=0;i<fastqFileNames.size();i++){
330                                 for(int j=0;j<fastqFileNames[0].size();j++){
331                                         if (fastqFileNames[i][j] != "") {
332                                                 if (namesToRemove.count(fastqFileNames[i][j]) == 0) {
333                                                         if(m->isBlank(fastqFileNames[i][j])){
334                                                                 m->mothurRemove(fastqFileNames[i][j]);
335                                                                 namesToRemove.insert(fastqFileNames[i][j]);
336                             }
337                                                 }
338                                         }
339                                 }
340                         }
341             
342                         //remove names for outputFileNames, just cleans up the output
343                         for(int i = 0; i < outputNames.size(); i++) {
344                 if (namesToRemove.count(outputNames[i]) != 0) {
345                     outputNames.erase(outputNames.begin()+i);
346                     i--;
347                 }
348             }
349             if(m->isBlank(noMatchFile)){  m->mothurRemove(noMatchFile); }
350             else { outputNames.push_back(noMatchFile); outputTypes["fastq"].push_back(noMatchFile); }
351         }
352                 
353                 if (m->control_pressed) { outputTypes.clear(); outputNames.clear(); m->mothurRemove(fastaFile); m->mothurRemove(qualFile); return 0; }
354                 
355                 //set fasta file as new current fastafile
356                 string current = "";
357                 itTypes = outputTypes.find("fasta");
358                 if (itTypes != outputTypes.end()) {
359                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
360                 }
361                 
362                 itTypes = outputTypes.find("qfile");
363                 if (itTypes != outputTypes.end()) {
364                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
365                 }               
366                 
367                 m->mothurOutEndLine();
368                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
369                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
370                 m->mothurOutEndLine();
371
372                 return 0;
373         }
374         catch(exception& e) {
375                 m->errorOut(e, "ParseFastaQCommand", "execute");
376                 exit(1);
377         }
378 }
379 //**********************************************************************************************************************
380 fastqRead2 ParseFastaQCommand::readFastq(ifstream& in, bool& ignore){
381     try {
382         ignore = false;
383         string wholeRead = "";
384         
385         //read sequence name
386         string line = m->getline(in); m->gobble(in); if (split > 1) { wholeRead += line + "\n"; }
387         vector<string> pieces = m->splitWhiteSpace(line);
388         string name = "";  if (pieces.size() != 0) { name = pieces[0]; }
389         if (name == "") {  m->mothurOut("[WARNING]: Blank fasta name, ignoring read."); m->mothurOutEndLine(); ignore=true;  }
390         else if (name[0] != '@') { m->mothurOut("[WARNING]: reading " + name + " expected a name with @ as a leading character, ignoring read."); m->mothurOutEndLine(); ignore=true; }
391         else { name = name.substr(1); }
392         
393         //read sequence
394         string sequence = m->getline(in); m->gobble(in); if (split > 1) { wholeRead += sequence + "\n"; }
395         if (sequence == "") {  m->mothurOut("[WARNING]: missing sequence for " + name + ", ignoring."); ignore=true; }
396         
397         //read sequence name
398         line = m->getline(in); m->gobble(in); if (split > 1) { wholeRead += line + "\n"; }
399         pieces = m->splitWhiteSpace(line);
400         string name2 = "";  if (pieces.size() != 0) { name2 = pieces[0]; }
401         if (name2 == "") {  m->mothurOut("[WARNING]: expected a name with + as a leading character, ignoring."); ignore=true; }
402         else if (name2[0] != '+') { m->mothurOut("[WARNING]: reading " + name2 + " expected a name with + as a leading character, ignoring."); ignore=true; }
403         else { name2 = name2.substr(1); if (name2 == "") { name2 = name; } }
404         
405                 
406         //read quality scores
407         string quality = m->getline(in); m->gobble(in); if (split > 1) { wholeRead += quality + "\n"; }
408         if (quality == "") {  m->mothurOut("[WARNING]: missing quality for " + name2 + ", ignoring."); ignore=true; }
409         
410         //sanity check sequence length and number of quality scores match
411         if (name2 != "") { if (name != name2) { m->mothurOut("[WARNING]: names do not match. read " + name + " for fasta and " + name2 + " for quality, ignoring."); ignore=true; } }
412         if (quality.length() != sequence.length()) { m->mothurOut("[WARNING]: Lengths do not match for sequence " + name + ". Read " + toString(sequence.length()) + " characters for fasta and " + toString(quality.length()) + " characters for quality scores, ignoring read."); ignore=true; }
413         
414         m->checkName(name);
415         Sequence seq(name, sequence);
416         fastqRead2 read(seq, quality, wholeRead);
417             
418         if (m->debug) { m->mothurOut("[DEBUG]: " + read.seq.getName() + " " + read.seq.getAligned() + " " + quality + "\n"); }
419         
420         return read;
421     }
422     catch(exception& e) {
423         m->errorOut(e, "ParseFastaQCommand", "readFastq");
424         exit(1);
425     }
426 }
427
428 //**********************************************************************************************************************
429 vector<int> ParseFastaQCommand::convertQual(string qual) {
430         try {
431                 vector<int> qualScores;
432                 
433         bool negativeScores = false;
434         
435                 for (int i = 0; i < qual.length(); i++) { 
436             
437             int temp = 0;
438             temp = int(qual[i]);
439             if (format == "illumina") {
440                 temp -= 64; //char '@'
441             }else if (format == "illumina1.8+") {
442                 temp -= int('!'); //char '!'
443             }else if (format == "solexa") {
444                 temp = int(convertTable[temp]); //convert to sanger
445                 temp -= int('!'); //char '!'
446             }else {
447                 temp -= int('!'); //char '!'
448             }
449             if (temp < -5) { negativeScores = true; }
450                         qualScores.push_back(temp);
451                 }
452                 
453         if (negativeScores) { m->mothurOut("[ERROR]: finding negative quality scores, do you have the right format selected? http://en.wikipedia.org/wiki/FASTQ_format#Encoding \n");  m->control_pressed = true;  }
454         
455                 return qualScores;
456         }
457         catch(exception& e) {
458                 m->errorOut(e, "ParseFastaQCommand", "convertQual");
459                 exit(1);
460         }
461 }
462 //**********************************************************************************************************************
463 int ParseFastaQCommand::findGroup(fastqRead2 thisRead, int& barcode, int& primer, TrimOligos*& trimOligos, int numBarcodes, int numPrimers) {
464         try {
465         int success = 1;
466         string trashCode = "";
467         int currentSeqsDiffs = 0;
468         
469         Sequence currSeq(thisRead.seq.getName(), thisRead.seq.getAligned());
470         QualityScores currQual; currQual.setScores(convertQual(thisRead.quality));
471         
472         if(linker.size() != 0){
473             success = trimOligos->stripLinker(currSeq, currQual);
474             if(success > ldiffs)                {       trashCode += 'k';       }
475             else{ currentSeqsDiffs += success;  }
476             
477         }
478         
479         if(numBarcodes != 0){
480             success = trimOligos->stripBarcode(currSeq, currQual, barcode);
481             if(success > bdiffs)                {       trashCode += 'b';       }
482             else{ currentSeqsDiffs += success;  }
483         }
484         
485         if(spacer.size() != 0){
486             success = trimOligos->stripSpacer(currSeq, currQual);
487             if(success > sdiffs)                {       trashCode += 's';       }
488             else{ currentSeqsDiffs += success;  }
489             
490         }
491         
492         if(numPrimers != 0){
493             success = trimOligos->stripForward(currSeq, currQual, primer, true);
494             if(success > pdiffs)                {       trashCode += 'f';       }
495             else{ currentSeqsDiffs += success;  }
496         }
497         
498         if (currentSeqsDiffs > tdiffs)  {       trashCode += 't';   }
499         
500         if(revPrimer.size() != 0){
501             success = trimOligos->stripReverse(currSeq, currQual);
502             if(!success)                                {       trashCode += 'r';       }
503         }
504         
505         if (trashCode.length() == 0) { //is this sequence in the ignore group
506             string thisGroup = "";
507             
508             if(barcodes.size() != 0){
509                 thisGroup = barcodeNameVector[barcode];
510                 if (numPrimers != 0) {
511                     if (primerNameVector[primer] != "") {
512                         if(thisGroup != "") {
513                             thisGroup += "." + primerNameVector[primer];
514                         }else {
515                             thisGroup = primerNameVector[primer];
516                         }
517                     }
518                 }
519             }
520             
521             int pos = thisGroup.find("ignore");
522             if (pos != string::npos) {  trashCode += "i"; }
523         }
524
525         
526         return trashCode.length();
527     }
528         catch(exception& e) {
529                 m->errorOut(e, "ParseFastaQCommand", "findGroup");
530                 exit(1);
531         }
532 }
533 //**********************************************************************************************************************
534 int ParseFastaQCommand::findGroup(fastqRead2 thisRead, int& barcode, int& primer, string groupMode) {
535         try {
536         string trashCode = "";
537         primer = 0;
538         
539         string group = groupMap->getGroup(thisRead.seq.getName());
540         if (group == "not found") {     trashCode += "g";   } //scrap for group
541         else { //find file group
542             map<string, int>::iterator it = barcodes.find(group);
543             if (it != barcodes.end()) {
544                 barcode = it->second;
545             }else { trashCode += "g"; }
546         }
547         
548         return trashCode.length();
549     }
550         catch(exception& e) {
551                 m->errorOut(e, "ParseFastaQCommand", "findGroup");
552                 exit(1);
553         }
554 }
555 //***************************************************************************************************************
556
557 bool ParseFastaQCommand::readOligos(string oligoFile){
558         try {
559                 ifstream inOligos;
560                 m->openInputFile(oligoFile, inOligos);
561                 
562                 string type, oligo, roligo, group;
563         bool hasPrimer = false; bool hasPairedBarcodes = false; pairedOligos = false;
564         
565                 int indexPrimer = 0;
566                 int indexBarcode = 0;
567         int indexPairedPrimer = 0;
568                 int indexPairedBarcode = 0;
569         set<string> uniquePrimers;
570         set<string> uniqueBarcodes;
571                 
572                 while(!inOligos.eof()){
573             
574                         inOligos >> type;
575             
576                         if (m->debug) { m->mothurOut("[DEBUG]: reading type - " + type + ".\n"); }
577             
578                         if(type[0] == '#'){
579                                 while (!inOligos.eof()) {       char c = inOligos.get();  if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
580                                 m->gobble(inOligos);
581                         }
582                         else{
583                                 m->gobble(inOligos);
584                                 //make type case insensitive
585                                 for(int i=0;i<type.length();i++){       type[i] = toupper(type[i]);  }
586                                 
587                                 inOligos >> oligo;
588                 
589                 if (m->debug) { m->mothurOut("[DEBUG]: reading - " + oligo + ".\n"); }
590                                 
591                                 for(int i=0;i<oligo.length();i++){
592                                         oligo[i] = toupper(oligo[i]);
593                                         if(oligo[i] == 'U')     {       oligo[i] = 'T'; }
594                                 }
595                                 
596                                 if(type == "FORWARD"){
597                                         group = "";
598                                         
599                                         // get rest of line in case there is a primer name
600                                         while (!inOligos.eof()) {
601                                                 char c = inOligos.get();
602                                                 if (c == 10 || c == 13 || c == -1){     break;  }
603                                                 else if (c == 32 || c == 9){;} //space or tab
604                                                 else {  group += c;  }
605                                         }
606                                         
607                                         //check for repeat barcodes
608                                         map<string, int>::iterator itPrime = primers.find(oligo);
609                                         if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
610                                         
611                     if (m->debug) {  if (group != "") { m->mothurOut("[DEBUG]: reading group " + group + ".\n"); }else{ m->mothurOut("[DEBUG]: no group for primer " + oligo + ".\n"); }  }
612                     
613                                         primers[oligo]=indexPrimer; indexPrimer++;
614                                         primerNameVector.push_back(group);
615                                 }
616                 else if (type == "PRIMER"){
617                     m->gobble(inOligos);
618                                         
619                     inOligos >> roligo;
620                     
621                     for(int i=0;i<roligo.length();i++){
622                         roligo[i] = toupper(roligo[i]);
623                         if(roligo[i] == 'U')    {       roligo[i] = 'T';        }
624                     }
625                     roligo = reverseOligo(roligo);
626                     
627                     group = "";
628                     
629                                         // get rest of line in case there is a primer name
630                                         while (!inOligos.eof()) {
631                                                 char c = inOligos.get();
632                                                 if (c == 10 || c == 13 || c == -1){     break;  }
633                                                 else if (c == 32 || c == 9){;} //space or tab
634                                                 else {  group += c;  }
635                                         }
636                     
637                     oligosPair newPrimer(oligo, roligo);
638                     
639                     if (m->debug) { m->mothurOut("[DEBUG]: primer pair " + newPrimer.forward + " " + newPrimer.reverse + ", and group = " + group + ".\n"); }
640                                         
641                                         //check for repeat barcodes
642                     string tempPair = oligo+roligo;
643                     if (uniquePrimers.count(tempPair) != 0) { m->mothurOut("primer pair " + newPrimer.forward + " " + newPrimer.reverse + " is in your oligos file already."); m->mothurOutEndLine();  }
644                     else { uniquePrimers.insert(tempPair); }
645                                         
646                     if (m->debug) {  if (group != "") { m->mothurOut("[DEBUG]: reading group " + group + ".\n"); }else{ m->mothurOut("[DEBUG]: no group for primer pair " + newPrimer.forward + " " + newPrimer.reverse + ".\n"); }  }
647                     
648                                         pairedPrimers[indexPairedPrimer]=newPrimer; indexPairedPrimer++;
649                                         primerNameVector.push_back(group);
650                     hasPrimer = true;
651                 }
652                                 else if(type == "REVERSE"){
653                                         //Sequence oligoRC("reverse", oligo);
654                                         //oligoRC.reverseComplement();
655                     string oligoRC = reverseOligo(oligo);
656                                         revPrimer.push_back(oligoRC);
657                                 }
658                                 else if(type == "BARCODE"){
659                                         inOligos >> group;
660                     
661                     //barcode lines can look like   BARCODE   atgcatgc   groupName  - for 454 seqs
662                     //or                            BARCODE   atgcatgc   atgcatgc    groupName  - for illumina data that has forward and reverse info
663                     
664                     string temp = "";
665                     while (!inOligos.eof())     {
666                                                 char c = inOligos.get();
667                                                 if (c == 10 || c == 13 || c == -1){     break;  }
668                                                 else if (c == 32 || c == 9){;} //space or tab
669                                                 else {  temp += c;  }
670                                         }
671                                         
672                     //then this is illumina data with 4 columns
673                     if (temp != "") {
674                         hasPairedBarcodes = true;
675                         string reverseBarcode = group; //reverseOligo(group); //reverse barcode
676                         group = temp;
677                         
678                         for(int i=0;i<reverseBarcode.length();i++){
679                             reverseBarcode[i] = toupper(reverseBarcode[i]);
680                             if(reverseBarcode[i] == 'U')        {       reverseBarcode[i] = 'T';        }
681                         }
682                         
683                         reverseBarcode = reverseOligo(reverseBarcode);
684                         oligosPair newPair(oligo, reverseBarcode);
685                         
686                         if (m->debug) { m->mothurOut("[DEBUG]: barcode pair " + newPair.forward + " " + newPair.reverse + ", and group = " + group + ".\n"); }
687                         //check for repeat barcodes
688                         string tempPair = oligo+reverseBarcode;
689                         if (uniqueBarcodes.count(tempPair) != 0) { m->mothurOut("barcode pair " + newPair.forward + " " + newPair.reverse +  " is in your oligos file already, disregarding."); m->mothurOutEndLine();  }
690                         else { uniqueBarcodes.insert(tempPair); }
691                         
692                         pairedBarcodes[indexPairedBarcode]=newPair; indexPairedBarcode++;
693                         barcodeNameVector.push_back(group);
694                     }else {
695                         //check for repeat barcodes
696                         map<string, int>::iterator itBar = barcodes.find(oligo);
697                         if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
698                         
699                         barcodes[oligo]=indexBarcode; indexBarcode++;
700                         barcodeNameVector.push_back(group);
701                     }
702                                 }else if(type == "LINKER"){
703                                         linker.push_back(oligo);
704                                 }else if(type == "SPACER"){
705                                         spacer.push_back(oligo);
706                                 }
707                                 else{   m->mothurOut("[WARNING]: " + type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); }
708                         }
709                         m->gobble(inOligos);
710                 }
711                 inOligos.close();
712                 
713         if (hasPairedBarcodes || hasPrimer) {
714             pairedOligos = true;
715             if ((primers.size() != 0) || (barcodes.size() != 0) || (linker.size() != 0) || (spacer.size() != 0) || (revPrimer.size() != 0)) { m->control_pressed = true;  m->mothurOut("[ERROR]: cannot mix paired primers and barcodes with non paired or linkers and spacers, quitting."); m->mothurOutEndLine();  return 0; }
716         }
717         
718                 //add in potential combos
719                 if(barcodeNameVector.size() == 0){
720                         barcodes[""] = 0;
721                         barcodeNameVector.push_back("");
722                 }
723                 
724                 if(primerNameVector.size() == 0){
725                         primers[""] = 0;
726                         primerNameVector.push_back("");
727                 }
728                 
729                 fastqFileNames.resize(barcodeNameVector.size());
730                 for(int i=0;i<fastqFileNames.size();i++){
731                         fastqFileNames[i].assign(primerNameVector.size(), "");
732                 }
733                 
734                 
735                         set<string> uniqueNames; //used to cleanup outputFileNames
736             if (pairedOligos) {
737                 for(map<int, oligosPair>::iterator itBar = pairedBarcodes.begin();itBar != pairedBarcodes.end();itBar++){
738                     for(map<int, oligosPair>::iterator itPrimer = pairedPrimers.begin();itPrimer != pairedPrimers.end(); itPrimer++){
739                         
740                         string primerName = primerNameVector[itPrimer->first];
741                         string barcodeName = barcodeNameVector[itBar->first];
742                         
743                         if ((primerName == "ignore") || (barcodeName == "ignore")) { } //do nothing
744                         else {
745                             string comboGroupName = "";
746                             string fastqFileName = "";
747                             
748                             if(primerName == ""){
749                                 comboGroupName = barcodeNameVector[itBar->first];
750                             }
751                             else{
752                                 if(barcodeName == ""){
753                                     comboGroupName = primerNameVector[itPrimer->first];
754                                 }
755                                 else{
756                                     comboGroupName = barcodeNameVector[itBar->first] + "." + primerNameVector[itPrimer->first];
757                                 }
758                             }
759                             
760                             
761                             ofstream temp;
762                             map<string, string> variables;
763                             variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastaQFile));
764                             variables["[group]"] = comboGroupName;
765                             fastqFileName = getOutputFileName("fastq", variables);
766                             if (uniqueNames.count(fastqFileName) == 0) {
767                                 outputNames.push_back(fastqFileName);
768                                 outputTypes["fastq"].push_back(fastqFileName);
769                                 uniqueNames.insert(fastqFileName);
770                             }
771                             
772                             fastqFileNames[itBar->first][itPrimer->first] = fastqFileName;
773                             m->openOutputFile(fastqFileName, temp);             temp.close();
774                             
775                         }
776                     }
777                 }
778             }else {
779                 for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
780                     for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
781                         
782                         string primerName = primerNameVector[itPrimer->second];
783                         string barcodeName = barcodeNameVector[itBar->second];
784                         
785                         if ((primerName == "ignore") || (barcodeName == "ignore")) { } //do nothing
786                         else {
787                             string comboGroupName = "";
788                             string fastqFileName = "";
789                             
790                             if(primerName == ""){
791                                 comboGroupName = barcodeNameVector[itBar->second];
792                             }
793                             else{
794                                 if(barcodeName == ""){
795                                     comboGroupName = primerNameVector[itPrimer->second];
796                                 }
797                                 else{
798                                     comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];
799                                 }
800                             }
801                             
802                             
803                             ofstream temp;
804                             map<string, string> variables;
805                             variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastaQFile));
806                             variables["[group]"] = comboGroupName;
807                             fastqFileName = getOutputFileName("fastq", variables);
808                             if (uniqueNames.count(fastqFileName) == 0) {
809                                 outputNames.push_back(fastqFileName);
810                                 outputTypes["fastq"].push_back(fastqFileName);
811                                 uniqueNames.insert(fastqFileName);
812                             }
813                             
814                             fastqFileNames[itBar->second][itPrimer->second] = fastqFileName;
815                             m->openOutputFile(fastqFileName, temp);             temp.close();
816                             
817                         }
818                     }
819                 }
820             }
821                 
822         ofstream temp;
823         map<string, string> variables;
824         variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastaQFile));
825         variables["[group]"] = "scrap";
826         noMatchFile = getOutputFileName("fastq", variables);
827         m->openOutputFile(noMatchFile, temp);           temp.close();
828
829                 return true;
830                 
831         }
832         catch(exception& e) {
833                 m->errorOut(e, "ParseFastaQCommand", "getOligos");
834                 exit(1);
835         }
836 }
837 //***************************************************************************************************************
838 bool ParseFastaQCommand::readGroup(string groupfile){
839         try {
840         fastqFileNames.clear();
841         
842         groupMap = new GroupMap();
843         groupMap->readMap(groupfile);
844         
845         //like barcodeNameVector - no primer names
846         vector<string> groups = groupMap->getNamesOfGroups();
847                 
848                 fastqFileNames.resize(groups.size());
849         for (int i = 0; i < fastqFileNames.size(); i++) {
850             for (int j = 0; j < 1; j++) {
851                 
852                 map<string, string> variables;
853                 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastaQFile));
854                 variables["[group]"] = groups[i];
855                 string thisFilename = getOutputFileName("fastq",variables);
856                 outputNames.push_back(thisFilename);
857                 outputTypes["fastq"].push_back(thisFilename);
858                 
859                 ofstream temp;
860                 m->openOutputFileBinary(thisFilename, temp); temp.close();
861                 fastqFileNames[i].push_back(thisFilename);
862                 barcodes[groups[i]] = i;
863             }
864         }
865         
866         map<string, string> variables;
867         variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastaQFile));
868         variables["[group]"] = "scrap";
869                 noMatchFile = getOutputFileName("fastq",variables);
870         m->mothurRemove(noMatchFile);
871                 
872                 return true;
873                 
874         }
875         catch(exception& e) {
876                 m->errorOut(e, "ParseFastaQCommand", "readGroup");
877                 exit(1);
878         }
879 }
880 //********************************************************************/
881 string ParseFastaQCommand::reverseOligo(string oligo){
882         try {
883         string reverse = "";
884         
885         for(int i=oligo.length()-1;i>=0;i--){
886             
887             if(oligo[i] == 'A')         {       reverse += 'T'; }
888             else if(oligo[i] == 'T'){   reverse += 'A'; }
889             else if(oligo[i] == 'U'){   reverse += 'A'; }
890             
891             else if(oligo[i] == 'G'){   reverse += 'C'; }
892             else if(oligo[i] == 'C'){   reverse += 'G'; }
893             
894             else if(oligo[i] == 'R'){   reverse += 'Y'; }
895             else if(oligo[i] == 'Y'){   reverse += 'R'; }
896             
897             else if(oligo[i] == 'M'){   reverse += 'K'; }
898             else if(oligo[i] == 'K'){   reverse += 'M'; }
899             
900             else if(oligo[i] == 'W'){   reverse += 'W'; }
901             else if(oligo[i] == 'S'){   reverse += 'S'; }
902             
903             else if(oligo[i] == 'B'){   reverse += 'V'; }
904             else if(oligo[i] == 'V'){   reverse += 'B'; }
905             
906             else if(oligo[i] == 'D'){   reverse += 'H'; }
907             else if(oligo[i] == 'H'){   reverse += 'D'; }
908             
909             else                                                {       reverse += 'N'; }
910         }
911         
912         
913         return reverse;
914     }
915         catch(exception& e) {
916                 m->errorOut(e, "ParseFastaQCommand", "reverseOligo");
917                 exit(1);
918         }
919 }
920
921
922 //**********************************************************************************************************************
923
924
925