]> git.donarmstrong.com Git - mothur.git/blob - parsefastaqcommand.cpp
changed random forest output filename
[mothur.git] / parsefastaqcommand.cpp
1 /*
2  *  parsefastaqcommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 9/30/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "parsefastaqcommand.h"
11 #include "sequence.hpp"
12
13 //**********************************************************************************************************************
14 vector<string> ParseFastaQCommand::setParameters(){     
15         try {
16                 CommandParameter pfastq("fastq", "InputTypes", "", "", "none", "none", "none","",false,true,true); parameters.push_back(pfastq);
17                 CommandParameter pfasta("fasta", "Boolean", "", "T", "", "", "","fasta",false,false); parameters.push_back(pfasta);
18                 CommandParameter pqual("qfile", "Boolean", "", "T", "", "", "","qfile",false,false); parameters.push_back(pqual);
19         CommandParameter ppacbio("pacbio", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(ppacbio);
20                 CommandParameter pformat("format", "Multiple", "sanger-illumina-solexa-illumina1.8+", "sanger", "", "", "","",false,false,true); parameters.push_back(pformat);
21         CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
22                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
23                 
24                 vector<string> myArray;
25                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
26                 return myArray;
27         }
28         catch(exception& e) {
29                 m->errorOut(e, "ParseFastaQCommand", "setParameters");
30                 exit(1);
31         }
32 }
33 //**********************************************************************************************************************
34 string ParseFastaQCommand::getHelpString(){     
35         try {
36                 string helpString = "";
37                 helpString += "The fastq.info command reads a fastq file and creates a fasta and quality file.\n";
38                 helpString += "The fastq.info command parameters are fastq, fasta, qfile and format; fastq is required.\n";
39         helpString += "The fastq.info command should be in the following format: fastq.info(fastaq=yourFastaQFile).\n";
40                 helpString += "The format parameter is used to indicate whether your sequences are sanger, solexa, illumina1.8+ or illumina, default=sanger.\n";
41         helpString += "The fasta parameter allows you to indicate whether you want a fasta file generated. Default=T.\n";
42         helpString += "The qfile parameter allows you to indicate whether you want a quality file generated. Default=T.\n";
43         helpString += "The pacbio parameter allows you to indicate .... When set to true, quality scores of 0 will results in a corresponding base of N. Default=F.\n";
44                 helpString += "Example fastq.info(fastaq=test.fastaq).\n";
45                 helpString += "Note: No spaces between parameter labels (i.e. fastq), '=' and yourFastQFile.\n";
46                 return helpString;
47         }
48         catch(exception& e) {
49                 m->errorOut(e, "ParseFastaQCommand", "getHelpString");
50                 exit(1);
51         }
52 }
53 //**********************************************************************************************************************
54 string ParseFastaQCommand::getOutputPattern(string type) {
55     try {
56         string pattern = "";
57         
58         if (type == "fasta") {  pattern = "[filename],fasta"; } 
59         else if (type == "qfile") {  pattern = "[filename],qual"; } 
60         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
61         
62         return pattern;
63     }
64     catch(exception& e) {
65         m->errorOut(e, "ParseFastaQCommand", "getOutputPattern");
66         exit(1);
67     }
68 }
69 //**********************************************************************************************************************
70 ParseFastaQCommand::ParseFastaQCommand(){       
71         try {
72                 abort = true; calledHelp = true; 
73                 setParameters();
74                 vector<string> tempOutNames;
75                 outputTypes["fasta"] = tempOutNames;
76                 outputTypes["qfile"] = tempOutNames;
77         }
78         catch(exception& e) {
79                 m->errorOut(e, "ParseFastaQCommand", "ParseFastaQCommand");
80                 exit(1);
81         }
82 }
83 //**********************************************************************************************************************
84 ParseFastaQCommand::ParseFastaQCommand(string option){
85         try {
86                 abort = false; calledHelp = false;   
87                 
88                 if(option == "help") {  help(); abort = true; calledHelp = true; }
89                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
90                 
91                 else {
92                         vector<string> myArray = setParameters();
93                         
94                         OptionParser parser(option);
95                         map<string,string> parameters = parser.getParameters();
96                         
97                         ValidParameters validParameter;
98                         map<string,string>::iterator it;
99
100                         //check to make sure all parameters are valid for command
101                         for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
102                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
103                         }
104                         
105                         //initialize outputTypes
106                         vector<string> tempOutNames;
107                         outputTypes["fasta"] = tempOutNames;
108                         outputTypes["qfile"] = tempOutNames;
109                         
110                         //if the user changes the input directory command factory will send this info to us in the output parameter 
111                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
112                         if (inputDir == "not found"){   inputDir = "";          }
113                         else {
114                                 string path;
115                                 it = parameters.find("fastq");
116                                 //user has given a template file
117                                 if(it != parameters.end()){ 
118                                         path = m->hasPath(it->second);
119                                         //if the user has not given a path then, add inputdir. else leave path alone.
120                                         if (path == "") {       parameters["fastq"] = inputDir + it->second;            }
121                                 }
122                         }
123                         
124                         //check for required parameters
125                         fastaQFile = validParameter.validFile(parameters, "fastq", true);
126                         if (fastaQFile == "not found") {        m->mothurOut("fastq is a required parameter for the fastq.info command.");      m->mothurOutEndLine();  abort = true;   }
127                         else if (fastaQFile == "not open")      {       fastaQFile = ""; abort = true;  }       
128                         
129                         //if the user changes the output directory command factory will send this info to us in the output parameter 
130                         outputDir = validParameter.validFile(parameters, "outputdir", false);   if (outputDir == "not found"){  outputDir = m->hasPath(fastaQFile);     }
131                         
132                         string temp;
133                         temp = validParameter.validFile(parameters, "fasta", false);    if(temp == "not found"){        temp = "T";     }
134                         fasta = m->isTrue(temp); 
135
136                         temp = validParameter.validFile(parameters, "qfile", false);    if(temp == "not found"){        temp = "T";     }
137                         qual = m->isTrue(temp);
138             
139             temp = validParameter.validFile(parameters, "pacbio", false);       if(temp == "not found"){        temp = "F";     }
140                         pacbio = m->isTrue(temp);
141
142                         
143             format = validParameter.validFile(parameters, "format", false);             if (format == "not found"){     format = "sanger";      }
144             
145             if ((format != "sanger") && (format != "illumina") && (format != "illumina1.8+") && (format != "solexa"))  { 
146                                 m->mothurOut(format + " is not a valid format. Your format choices are sanger, solexa, illumina1.8+ and illumina, aborting." ); m->mothurOutEndLine();
147                                 abort=true;
148                         }
149
150                         if ((!fasta) && (!qual)) { m->mothurOut("[ERROR]: no outputs selected. Aborting."); m->mothurOutEndLine(); abort=true; }
151
152                 }               
153         }
154         catch(exception& e) {
155                 m->errorOut(e, "ParseFastaQCommand", "ParseFastaQCommand");
156                 exit(1);
157         }
158 }
159 //**********************************************************************************************************************
160
161 int ParseFastaQCommand::execute(){
162         try {
163                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
164                 
165                 //open Output Files
166         map<string, string> variables; 
167         variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastaQFile));
168                 string fastaFile = getOutputFileName("fasta",variables);
169                 string qualFile = getOutputFileName("qfile",variables);
170                 ofstream outFasta, outQual;
171                 
172                 if (fasta) { m->openOutputFile(fastaFile, outFasta);  outputNames.push_back(fastaFile); outputTypes["fasta"].push_back(fastaFile);      }
173                 if (qual) { m->openOutputFile(qualFile, outQual);       outputNames.push_back(qualFile);  outputTypes["qfile"].push_back(qualFile);             }
174                 
175                 ifstream in;
176                 m->openInputFile(fastaQFile, in);
177         
178         //fill convert table - goes from solexa to sanger. Used fq_all2std.pl as a reference.
179         for (int i = -64; i < 65; i++) { 
180             char temp = (char) ((int)(33 + 10*log(1+pow(10,(i/10.0)))/log(10)+0.499));
181             convertTable.push_back(temp);
182         }
183                 
184                 while (!in.eof()) {
185                         
186                         if (m->control_pressed) { break; }
187                 
188                         //read sequence name
189                         string name = m->getline(in); m->gobble(in);
190                         if (name == "") {  m->mothurOut("[ERROR]: Blank fasta name."); m->mothurOutEndLine(); m->control_pressed = true; break; }
191                         else if (name[0] != '@') { m->mothurOut("[ERROR]: reading " + name + " expected a name with @ as a leading character."); m->mothurOutEndLine(); m->control_pressed = true; break; }
192                         else { 
193                 name = name.substr(1); 
194                 m->checkName(name);
195             }
196                         
197                         //read sequence
198                         string sequence = m->getline(in); m->gobble(in);
199                         if (sequence == "") {  m->mothurOut("[ERROR]: missing sequence for " + name); m->mothurOutEndLine(); m->control_pressed = true; break; }
200                         
201                         //read sequence name
202                         string name2 = m->getline(in); m->gobble(in);
203                         if (name2 == "") {  m->mothurOut("[ERROR]: Blank quality name."); m->mothurOutEndLine(); m->control_pressed = true; break; }
204                         else if (name2[0] != '+') { m->mothurOut("[ERROR]: reading " + name2 + " expected a name with + as a leading character."); m->mothurOutEndLine(); m->control_pressed = true; break; }
205                         else { 
206                 name2 = name2.substr(1);  
207                 m->checkName(name2);
208             }
209                         
210                         //read quality scores
211                         string quality = m->getline(in); m->gobble(in);
212                         if (quality == "") {  m->mothurOut("[ERROR]: missing quality for " + name2); m->mothurOutEndLine(); m->control_pressed = true; break; }
213                         
214                         //sanity check sequence length and number of quality scores match
215                         if (name2 != "") { if (name != name2) { m->mothurOut("[ERROR]: names do not match. read " + name + " for fasta and " + name2 + " for quality."); m->mothurOutEndLine(); m->control_pressed = true; break; } }
216                         if (quality.length() != sequence.length()) { m->mothurOut("[ERROR]: Lengths do not match for sequence " + name + ". Read " + toString(sequence.length()) + " characters for fasta and " + toString(quality.length()) + " characters for quality scores."); m->mothurOutEndLine(); m->control_pressed = true; break; }
217                         
218             vector<int> qualScores;
219             if (qual) {
220                                 qualScores = convertQual(quality);
221                                 outQual << ">" << name << endl;
222                                 for (int i = 0; i < qualScores.size(); i++) { outQual << qualScores[i] << " "; }
223                                 outQual << endl;
224                         }
225             
226             if (m->control_pressed) { break; }
227             
228             if (pacbio) {
229                 if (!qual) { qualScores = convertQual(quality); } //get scores if we didn't already
230                 for (int i = 0; i < qualScores.size(); i++) {
231                     if (qualScores[i] == 0){ sequence[i] = 'N'; }
232                 }
233             }
234             
235                         //print sequence info to files
236                         if (fasta) { outFasta << ">" << name << endl << sequence << endl; }
237                         
238                 }
239                 
240                 in.close();
241                 if (fasta)      { outFasta.close();     }
242                 if (qual)       { outQual.close();      }
243                 
244                 if (m->control_pressed) { outputTypes.clear(); outputNames.clear(); m->mothurRemove(fastaFile); m->mothurRemove(qualFile); return 0; }
245                 
246                 //set fasta file as new current fastafile
247                 string current = "";
248                 itTypes = outputTypes.find("fasta");
249                 if (itTypes != outputTypes.end()) {
250                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
251                 }
252                 
253                 itTypes = outputTypes.find("qfile");
254                 if (itTypes != outputTypes.end()) {
255                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
256                 }               
257                 
258                 m->mothurOutEndLine();
259                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
260                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
261                 m->mothurOutEndLine();
262
263                 return 0;
264         }
265         catch(exception& e) {
266                 m->errorOut(e, "ParseFastaQCommand", "execute");
267                 exit(1);
268         }
269 }
270 //**********************************************************************************************************************
271 vector<int> ParseFastaQCommand::convertQual(string qual) {
272         try {
273                 vector<int> qualScores;
274                 
275         bool negativeScores = false;
276         
277                 for (int i = 0; i < qual.length(); i++) { 
278             
279             int temp = 0;
280             temp = int(qual[i]);
281             if (format == "illumina") {
282                 temp -= 64; //char '@'
283             }else if (format == "illumina1.8+") {
284                 temp -= int('!'); //char '!'
285             }else if (format == "solexa") {
286                 temp = int(convertTable[temp]); //convert to sanger
287                 temp -= int('!'); //char '!'
288             }else {
289                 temp -= int('!'); //char '!'
290             }
291             if (temp < -5) { negativeScores = true; }
292                         qualScores.push_back(temp);
293                 }
294                 
295         if (negativeScores) { m->mothurOut("[ERROR]: finding negative quality scores, do you have the right format selected? http://en.wikipedia.org/wiki/FASTQ_format#Encoding \n");  m->control_pressed = true;  }
296         
297                 return qualScores;
298         }
299         catch(exception& e) {
300                 m->errorOut(e, "ParseFastaQCommand", "convertQual");
301                 exit(1);
302         }
303 }
304 //**********************************************************************************************************************
305
306
307