5 * Created by Pat Schloss on 7/15/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "seqerrorcommand.h"
12 //**********************************************************************************************************************
13 vector<string> SeqErrorCommand::getValidParameters(){
15 string Array[] = {"query", "reference", "name", "threshold", "inputdir", "outputdir"};
16 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
20 m->errorOut(e, "SeqErrorCommand", "getValidParameters");
24 //**********************************************************************************************************************
25 SeqErrorCommand::SeqErrorCommand(){
28 //initialize outputTypes
29 vector<string> tempOutNames;
30 outputTypes["error"] = tempOutNames;
31 outputTypes["count"] = tempOutNames;
34 m->errorOut(e, "SeqErrorCommand", "SeqErrorCommand");
38 //**********************************************************************************************************************
39 vector<string> SeqErrorCommand::getRequiredParameters(){
41 string Array[] = {"query","reference"};
42 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
46 m->errorOut(e, "SeqErrorCommand", "getRequiredParameters");
50 //**********************************************************************************************************************
51 vector<string> SeqErrorCommand::getRequiredFiles(){
53 vector<string> myArray;
57 m->errorOut(e, "SeqErrorCommand", "getRequiredFiles");
61 //***************************************************************************************************************
63 SeqErrorCommand::SeqErrorCommand(string option) {
68 //allow user to run help
69 if(option == "help") { help(); abort = true; }
74 //valid paramters for this command
75 string AlignArray[] = {"query", "reference", "name", "threshold", "inputdir", "outputdir"};
77 //need to implement name file option
79 vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
81 OptionParser parser(option);
82 map<string,string> parameters = parser.getParameters();
84 ValidParameters validParameter;
85 map<string,string>::iterator it;
87 //check to make sure all parameters are valid for command
88 for (it = parameters.begin(); it != parameters.end(); it++) {
89 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
92 //initialize outputTypes
93 vector<string> tempOutNames;
94 outputTypes["error"] = tempOutNames;
95 outputTypes["count"] = tempOutNames;
97 //if the user changes the input directory command factory will send this info to us in the output parameter
98 string inputDir = validParameter.validFile(parameters, "inputdir", false);
99 if (inputDir == "not found"){ inputDir = ""; }
102 it = parameters.find("query");
103 //user has given a template file
104 if(it != parameters.end()){
105 path = m->hasPath(it->second);
106 //if the user has not given a path then, add inputdir. else leave path alone.
107 if (path == "") { parameters["query"] = inputDir + it->second; }
110 it = parameters.find("reference");
111 //user has given a template file
112 if(it != parameters.end()){
113 path = m->hasPath(it->second);
114 //if the user has not given a path then, add inputdir. else leave path alone.
115 if (path == "") { parameters["reference"] = inputDir + it->second; }
118 it = parameters.find("name");
119 //user has given a template file
120 if(it != parameters.end()){
121 path = m->hasPath(it->second);
122 //if the user has not given a path then, add inputdir. else leave path alone.
123 if (path == "") { parameters["name"] = inputDir + it->second; }
127 //check for required parameters
128 queryFileName = validParameter.validFile(parameters, "query", true);
129 if (queryFileName == "not found") { m->mothurOut("query is a required parameter for the seq.error command."); m->mothurOutEndLine(); abort = true; }
130 else if (queryFileName == "not open") { abort = true; }
132 referenceFileName = validParameter.validFile(parameters, "reference", true);
133 if (referenceFileName == "not found") { m->mothurOut("reference is a required parameter for the seq.error command."); m->mothurOutEndLine(); abort = true; }
134 else if (referenceFileName == "not open") { abort = true; }
136 //if the user changes the output directory command factory will send this info to us in the output parameter
137 namesFileName = validParameter.validFile(parameters, "name", true);
138 if(namesFileName == "not found"){ namesFileName = ""; }
139 cout << namesFileName << endl;
141 outputDir = validParameter.validFile(parameters, "outputdir", false);
142 if (outputDir == "not found"){
144 outputDir += m->hasPath(queryFileName); //if user entered a file with a path then preserve it
147 //check for optional parameter and set defaults
148 // ...at some point should added some additional type checking...
149 temp = validParameter.validFile(parameters, "threshold", false); if (temp == "not found") { temp = "1.00"; }
150 convert(temp, threshold);
152 errorSummaryFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.summary";
153 m->openOutputFile(errorSummaryFileName, errorSummaryFile);
154 outputNames.push_back(errorSummaryFileName); outputTypes["error.summary"].push_back(errorSummaryFileName);
157 errorSeqFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.seq";
158 m->openOutputFile(errorSeqFileName, errorSeqFile);
159 outputNames.push_back(errorSeqFileName); outputTypes["error.seq"].push_back(errorSeqFileName);
164 catch(exception& e) {
165 m->errorOut(e, "SeqErrorCommand", "SeqErrorCommand");
170 //**********************************************************************************************************************
172 void SeqErrorCommand::help(){
174 m->mothurOut("The seq.error command reads a query alignment file and a reference alignment file and creates .....\n");
178 m->mothurOut("Example seq.error(...).\n");
179 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n");
180 m->mothurOut("For more details please check out the wiki http://www.mothur.org/wiki/seq.error .\n\n");
182 catch(exception& e) {
183 m->errorOut(e, "SeqErrorCommand", "help");
188 //***************************************************************************************************************
190 SeqErrorCommand::~SeqErrorCommand(){
191 errorSummaryFile.close();
192 errorSeqFile.close();
195 //***************************************************************************************************************
197 int SeqErrorCommand::execute(){
199 if (abort == true) { return 0; }
201 getReferences(); //read in reference sequences - make sure there's no ambiguous bases
203 map<string, int> weights;
204 if(namesFileName != ""){ weights = getWeights(); }
207 m->openInputFile(queryFileName, queryFile);
210 int totalMatches = 0;
212 vector<int> misMatchCounts(11, 0);
216 map<string, int>::iterator it;
221 Sequence query(queryFile);
223 for(int i=0;i<numRefs;i++){
224 Compare currCompare = getErrors(query, referenceSeqs[i]);
226 if(currCompare.errorRate < minCompare.errorRate){
227 minCompare = currCompare;
232 if(namesFileName != ""){
233 it = weights.find(query.getName());
234 minCompare.weight = it->second;
237 minCompare.weight = 1;
240 printErrorData(minCompare);
242 if(minCompare.errorRate < threshold){
243 totalBases += (minCompare.total * minCompare.weight);
244 totalMatches += minCompare.matches * minCompare.weight;
245 if(minCompare.mismatches > maxMismatch){
246 maxMismatch = minCompare.mismatches;
247 misMatchCounts.resize(maxMismatch + 1, 0);
249 misMatchCounts[minCompare.mismatches] += minCompare.weight;
259 string errorCountFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.count";
260 ofstream errorCountFile;
261 m->openOutputFile(errorCountFileName, errorCountFile);
262 outputNames.push_back(errorCountFileName); outputTypes["error.count"].push_back(errorCountFileName);
264 m->mothurOut("Overall error rate:\t" + toString((double)(totalBases - totalMatches) / (double)totalBases) + "\n\n");
265 m->mothurOut("Errors\tSequences\n");
267 errorCountFile << "Errors\tSequences\n";
269 for(int i=0;i<misMatchCounts.size();i++){
270 m->mothurOut(toString(i) + '\t' + toString(misMatchCounts[i]) + '\n');
271 errorCountFile << i << '\t' << misMatchCounts[i] << endl;
276 catch(exception& e) {
277 m->errorOut(e, "SeqErrorCommand", "execute");
282 //***************************************************************************************************************
284 void SeqErrorCommand::getReferences(){
287 ifstream referenceFile;
288 m->openInputFile(referenceFileName, referenceFile);
290 while(referenceFile){
291 Sequence currentSeq(referenceFile);
292 int numAmbigs = currentSeq.getAmbigBases();
295 m->mothurOut("Warning: " + toString(currentSeq.getName()) + " has " + toString(numAmbigs) + " ambiguous bases, these bases will be removed\n");
296 currentSeq.removeAmbigBases();
298 referenceSeqs.push_back(currentSeq);
299 m->gobble(referenceFile);
301 numRefs = referenceSeqs.size();
303 referenceFile.close();
305 catch(exception& e) {
306 m->errorOut(e, "SeqErrorCommand", "getReferences");
311 //***************************************************************************************************************
313 Compare SeqErrorCommand::getErrors(Sequence query, Sequence reference){
315 if(query.getAlignLength() != reference.getAlignLength()){
316 m->mothurOut("Warning: " + toString(query.getName()) + " and " + toString(reference.getName()) + " are different lengths\n");
318 int alignLength = query.getAlignLength();
320 string q = query.getAligned();
321 string r = reference.getAligned();
327 for(int i=0;i<alignLength;i++){
328 if(q[i] != '.' && r[i] != '.' && (q[i] != '-' || r[i] != '-')){ // no missing data and no double gaps
332 if(r[i] == 'A'){ errors.AA++; errors.matches++; errors.sequence += 'm'; }
333 if(r[i] == 'T'){ errors.AT++; errors.sequence += 's'; }
334 if(r[i] == 'G'){ errors.AG++; errors.sequence += 's'; }
335 if(r[i] == 'C'){ errors.AC++; errors.sequence += 's'; }
336 if(r[i] == '-'){ errors.Ai++; errors.sequence += 'i'; }
338 else if(q[i] == 'T'){
339 if(r[i] == 'A'){ errors.TA++; errors.sequence += 's'; }
340 if(r[i] == 'T'){ errors.TT++; errors.matches++; errors.sequence += 'm'; }
341 if(r[i] == 'G'){ errors.TG++; errors.sequence += 's'; }
342 if(r[i] == 'C'){ errors.TC++; errors.sequence += 's'; }
343 if(r[i] == '-'){ errors.Ti++; errors.sequence += 'i'; }
345 else if(q[i] == 'G'){
346 if(r[i] == 'A'){ errors.GA++; errors.sequence += 's'; }
347 if(r[i] == 'T'){ errors.GT++; errors.sequence += 's'; }
348 if(r[i] == 'G'){ errors.GG++; errors.matches++; errors.sequence += 'm'; }
349 if(r[i] == 'C'){ errors.GC++; errors.sequence += 's'; }
350 if(r[i] == '-'){ errors.Gi++; errors.sequence += 'i'; }
352 else if(q[i] == 'C'){
353 if(r[i] == 'A'){ errors.CA++; errors.sequence += 's'; }
354 if(r[i] == 'T'){ errors.CT++; errors.sequence += 's'; }
355 if(r[i] == 'G'){ errors.CG++; errors.sequence += 's'; }
356 if(r[i] == 'C'){ errors.CC++; errors.matches++; errors.sequence += 'm'; }
357 if(r[i] == '-'){ errors.Ci++; errors.sequence += 'i'; }
359 else if(q[i] == 'N'){
360 if(r[i] == 'A'){ errors.NA++; errors.sequence += 'a'; }
361 if(r[i] == 'T'){ errors.NT++; errors.sequence += 'a'; }
362 if(r[i] == 'G'){ errors.NG++; errors.sequence += 'a'; }
363 if(r[i] == 'C'){ errors.NC++; errors.sequence += 'a'; }
364 if(r[i] == '-'){ errors.Ni++; errors.sequence += 'a'; }
366 else if(q[i] == '-' && r[i] != '-'){
367 if(r[i] == 'A'){ errors.dA++; errors.sequence += 'd'; }
368 if(r[i] == 'T'){ errors.dT++; errors.sequence += 'd'; }
369 if(r[i] == 'G'){ errors.dG++; errors.sequence += 'd'; }
370 if(r[i] == 'C'){ errors.dC++; errors.sequence += 'd'; }
375 else if(q[i] == '.' && r[i] != '.'){ // reference extends beyond query
376 if(started == 1){ break; }
378 else if(q[i] != '.' && r[i] == '.'){ // query extends beyond reference
379 m->mothurOut("Warning: " + toString(query.getName()) + " extend beyond " + toString(reference.getName()) + ". Ignoring the extra bases in the query\n");
380 if(started == 1){ break; }
382 else if(q[i] == '.' && r[i] == '.'){ // both are missing data
383 if(started == 1){ break; }
387 errors.mismatches = errors.total-errors.matches;
388 errors.errorRate = (double)(errors.total-errors.matches) / (double)errors.total;
389 errors.queryName = query.getName();
390 errors.refName = reference.getName();
394 catch(exception& e) {
395 m->errorOut(e, "SeqErrorCommand", "getErrors");
400 //***************************************************************************************************************
402 map<string, int> SeqErrorCommand::getWeights(){
404 m->openInputFile(namesFileName, nameFile);
407 string redundantSeqs;
408 map<string, int> nameCountMap;
411 nameFile >> seqName >> redundantSeqs;
412 nameCountMap[seqName] = m->getNumNames(redundantSeqs);
419 //***************************************************************************************************************
421 void SeqErrorCommand::printErrorHeader(){
423 errorSummaryFile << "query\treference\tweight\t";
424 errorSummaryFile << "AA\tAT\tAG\tAC\tTA\tTT\tTG\tTC\tGA\tGT\tGG\tGC\tCA\tCT\tCG\tCC\tNA\tNT\tNG\tNC\tAi\tTi\tGi\tCi\tNi\tdA\tdT\tdG\tdC\t";
425 errorSummaryFile << "insertions\tdeletions\tsubstitutions\tambig\tmatches\tmismatches\ttotal\terror\n";
427 errorSummaryFile << setprecision(6);
428 errorSummaryFile.setf(ios::fixed);
430 catch(exception& e) {
431 m->errorOut(e, "SeqErrorCommand", "printErrorHeader");
436 //***************************************************************************************************************
438 void SeqErrorCommand::printErrorData(Compare error){
440 errorSummaryFile << error.queryName << '\t' << error.refName << '\t' << error.weight << '\t';
441 errorSummaryFile << error.AA << '\t' << error.AT << '\t' << error.AG << '\t' << error.AC << '\t';
442 errorSummaryFile << error.TA << '\t' << error.TT << '\t' << error.TG << '\t' << error.TC << '\t';
443 errorSummaryFile << error.GA << '\t' << error.GT << '\t' << error.GG << '\t' << error.GC << '\t';
444 errorSummaryFile << error.CA << '\t' << error.CT << '\t' << error.CG << '\t' << error.CC << '\t';
445 errorSummaryFile << error.NA << '\t' << error.NT << '\t' << error.NG << '\t' << error.NC << '\t';
446 errorSummaryFile << error.Ai << '\t' << error.Ti << '\t' << error.Gi << '\t' << error.Ci << '\t' << error.Ni << '\t' ;
447 errorSummaryFile << error.dA << '\t' << error.dT << '\t' << error.dG << '\t' << error.dC << '\t';
449 errorSummaryFile << error.Ai + error.Ti + error.Gi + error.Ci << '\t'; //insertions
450 errorSummaryFile << error.dA + error.dT + error.dG + error.dC << '\t'; //deletions
451 errorSummaryFile << error.mismatches - (error.Ai + error.Ti + error.Gi + error.Ci) - (error.dA + error.dT + error.dG + error.dC) - (error.NA + error.NT + error.NG + error.NC + error.Ni) << '\t'; //substitutions
452 errorSummaryFile << error.NA + error.NT + error.NG + error.NC + error.Ni << '\t'; //ambiguities
453 errorSummaryFile << error.matches << '\t' << error.mismatches << '\t' << error.total << '\t' << error.errorRate << endl;
455 errorSeqFile << '>' << error.queryName << "\tref:" << error.refName << '\n' << error.sequence << endl;
457 catch(exception& e) {
458 m->errorOut(e, "SeqErrorCommand", "printErrorData");
463 //***************************************************************************************************************