5 * Created by Pat Schloss on 7/15/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "seqerrorcommand.h"
11 #include "reportfile.h"
12 #include "qualityscores.h"
14 //**********************************************************************************************************************
15 vector<string> SeqErrorCommand::getValidParameters(){
17 string Array[] = {"query", "reference", "name", "qfile", "report", "threshold", "inputdir", "outputdir"};
18 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
22 m->errorOut(e, "SeqErrorCommand", "getValidParameters");
26 //**********************************************************************************************************************
27 SeqErrorCommand::SeqErrorCommand(){
30 //initialize outputTypes
31 vector<string> tempOutNames;
32 outputTypes["error"] = tempOutNames;
33 outputTypes["count"] = tempOutNames;
36 m->errorOut(e, "SeqErrorCommand", "SeqErrorCommand");
40 //**********************************************************************************************************************
41 vector<string> SeqErrorCommand::getRequiredParameters(){
43 string Array[] = {"query","reference"};
44 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
48 m->errorOut(e, "SeqErrorCommand", "getRequiredParameters");
52 //**********************************************************************************************************************
53 vector<string> SeqErrorCommand::getRequiredFiles(){
55 vector<string> myArray;
59 m->errorOut(e, "SeqErrorCommand", "getRequiredFiles");
63 //***************************************************************************************************************
65 SeqErrorCommand::SeqErrorCommand(string option) {
70 //allow user to run help
71 if(option == "help") { help(); abort = true; }
76 //valid paramters for this command
77 string AlignArray[] = {"query", "reference", "name", "qfile", "report", "threshold", "inputdir", "outputdir"};
79 //need to implement name file option
81 vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
83 OptionParser parser(option);
84 map<string,string> parameters = parser.getParameters();
86 ValidParameters validParameter;
87 map<string,string>::iterator it;
89 //check to make sure all parameters are valid for command
90 for (it = parameters.begin(); it != parameters.end(); it++) {
91 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
94 //initialize outputTypes
95 vector<string> tempOutNames;
96 outputTypes["error"] = tempOutNames;
97 outputTypes["count"] = tempOutNames;
99 //if the user changes the input directory command factory will send this info to us in the output parameter
100 string inputDir = validParameter.validFile(parameters, "inputdir", false);
101 if (inputDir == "not found"){ inputDir = ""; }
104 it = parameters.find("query");
105 //user has given a template file
106 if(it != parameters.end()){
107 path = m->hasPath(it->second);
108 //if the user has not given a path then, add inputdir. else leave path alone.
109 if (path == "") { parameters["query"] = inputDir + it->second; }
112 it = parameters.find("reference");
113 //user has given a template file
114 if(it != parameters.end()){
115 path = m->hasPath(it->second);
116 //if the user has not given a path then, add inputdir. else leave path alone.
117 if (path == "") { parameters["reference"] = inputDir + it->second; }
120 it = parameters.find("name");
121 //user has given a names file
122 if(it != parameters.end()){
123 path = m->hasPath(it->second);
124 //if the user has not given a path then, add inputdir. else leave path alone.
125 if (path == "") { parameters["name"] = inputDir + it->second; }
128 it = parameters.find("qfile");
129 //user has given a quality score file
130 if(it != parameters.end()){
131 path = m->hasPath(it->second);
132 //if the user has not given a path then, add inputdir. else leave path alone.
133 if (path == "") { parameters["qfile"] = inputDir + it->second; }
136 it = parameters.find("report");
137 //user has given a alignment report file
138 if(it != parameters.end()){
139 path = m->hasPath(it->second);
140 //if the user has not given a path then, add inputdir. else leave path alone.
141 if (path == "") { parameters["report"] = inputDir + it->second; }
145 //check for required parameters
146 queryFileName = validParameter.validFile(parameters, "query", true);
147 if (queryFileName == "not found") { m->mothurOut("query is a required parameter for the seq.error command."); m->mothurOutEndLine(); abort = true; }
148 else if (queryFileName == "not open") { abort = true; }
150 referenceFileName = validParameter.validFile(parameters, "reference", true);
151 if (referenceFileName == "not found") { m->mothurOut("reference is a required parameter for the seq.error command."); m->mothurOutEndLine(); abort = true; }
152 else if (referenceFileName == "not open") { abort = true; }
155 //check for optional parameters
156 namesFileName = validParameter.validFile(parameters, "name", true);
157 if(namesFileName == "not found"){ namesFileName = ""; }
159 qualFileName = validParameter.validFile(parameters, "qfile", true);
160 if(qualFileName == "not found"){ qualFileName = ""; }
162 reportFileName = validParameter.validFile(parameters, "report", true);
163 if(reportFileName == "not found"){ reportFileName = ""; }
165 if((reportFileName != "" && qualFileName == "") || (reportFileName == "" && qualFileName != "")){
166 m->mothurOut("if you use either a qual file or a report file, you have to have both.");
167 m->mothurOutEndLine();
173 outputDir = validParameter.validFile(parameters, "outputdir", false);
174 if (outputDir == "not found"){
176 outputDir += m->hasPath(queryFileName); //if user entered a file with a path then preserve it
179 //check for optional parameter and set defaults
180 // ...at some point should added some additional type checking...
181 temp = validParameter.validFile(parameters, "threshold", false); if (temp == "not found") { temp = "1.00"; }
182 convert(temp, threshold);
184 errorSummaryFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.summary";
185 m->openOutputFile(errorSummaryFileName, errorSummaryFile);
186 outputNames.push_back(errorSummaryFileName); outputTypes["error.summary"].push_back(errorSummaryFileName);
189 errorSeqFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.seq";
190 m->openOutputFile(errorSeqFileName, errorSeqFile);
191 outputNames.push_back(errorSeqFileName); outputTypes["error.seq"].push_back(errorSeqFileName);
196 catch(exception& e) {
197 m->errorOut(e, "SeqErrorCommand", "SeqErrorCommand");
202 //**********************************************************************************************************************
204 void SeqErrorCommand::help(){
206 m->mothurOut("The seq.error command reads a query alignment file and a reference alignment file and creates .....\n");
210 m->mothurOut("Example seq.error(...).\n");
211 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n");
212 m->mothurOut("For more details please check out the wiki http://www.mothur.org/wiki/seq.error .\n\n");
214 catch(exception& e) {
215 m->errorOut(e, "SeqErrorCommand", "help");
220 //***************************************************************************************************************
222 SeqErrorCommand::~SeqErrorCommand(){
223 errorSummaryFile.close();
224 errorSeqFile.close();
227 //***************************************************************************************************************
229 int SeqErrorCommand::execute(){
231 if (abort == true) { return 0; }
233 getReferences(); //read in reference sequences - make sure there's no ambiguous bases
235 map<string, int> weights;
236 if(namesFileName != ""){ weights = getWeights(); }
239 m->openInputFile(queryFileName, queryFile);
245 QualityScores quality;
247 if(qualFileName != "" && reportFileName != ""){
248 m->openInputFile(qualFileName, qualFile);
249 report = ReportFile(reportFile, reportFileName);
253 int totalMatches = 0;
255 vector<int> misMatchCounts(11, 0);
259 map<string, int>::iterator it;
260 map<char, vector<int> > qScoreErrorMap;
261 qScoreErrorMap['m'].assign(41, 0);
262 qScoreErrorMap['s'].assign(41, 0);
263 qScoreErrorMap['i'].assign(41, 0);
264 qScoreErrorMap['a'].assign(41, 0);
268 Sequence query(queryFile);
270 for(int i=0;i<numRefs;i++){
271 Compare currCompare = getErrors(query, referenceSeqs[i]);
273 if(currCompare.errorRate < minCompare.errorRate){
274 minCompare = currCompare;
278 if(namesFileName != ""){
279 it = weights.find(query.getName());
280 minCompare.weight = it->second;
282 else { minCompare.weight = 1; }
284 printErrorData(minCompare);
286 if(qualFileName != "" && reportFileName != ""){
287 report = ReportFile(reportFile);
289 int origLength = report.getQueryLength();
290 int startBase = report.getQueryStart();
291 int endBase = report.getQueryEnd();
293 quality = QualityScores(qualFile, origLength);
294 quality.updateQScoreErrorMap(qScoreErrorMap, minCompare.sequence, startBase, endBase, minCompare.weight);
297 if(minCompare.errorRate < threshold){
298 totalBases += (minCompare.total * minCompare.weight);
299 totalMatches += minCompare.matches * minCompare.weight;
300 if(minCompare.mismatches > maxMismatch){
301 maxMismatch = minCompare.mismatches;
302 misMatchCounts.resize(maxMismatch + 1, 0);
304 misMatchCounts[minCompare.mismatches] += minCompare.weight;
314 if(qualFileName != "" && reportFileName != ""){
315 string errorQualityFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.quality";
316 ofstream errorQualityFile;
317 m->openOutputFile(errorQualityFileName, errorQualityFile);
318 outputNames.push_back(errorQualityFileName); outputTypes["error.quality"].push_back(errorQualityFileName);
320 errorQualityFile << "qscore\tmatches\tsubstitutions\tinsertions\tambiguous" << endl;
321 for(int i=0;i<41;i++){
322 errorQualityFile << i << '\t' << qScoreErrorMap['m'][i] << '\t' << qScoreErrorMap['s'][i] << '\t' << qScoreErrorMap['i'][i] << '\t'<< qScoreErrorMap['a'][i] << endl;
326 string errorCountFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.count";
327 ofstream errorCountFile;
328 m->openOutputFile(errorCountFileName, errorCountFile);
329 outputNames.push_back(errorCountFileName); outputTypes["error.count"].push_back(errorCountFileName);
331 m->mothurOut("Overall error rate:\t" + toString((double)(totalBases - totalMatches) / (double)totalBases) + "\n\n");
332 m->mothurOut("Errors\tSequences\n");
334 errorCountFile << "Errors\tSequences\n";
336 for(int i=0;i<misMatchCounts.size();i++){
337 m->mothurOut(toString(i) + '\t' + toString(misMatchCounts[i]) + '\n');
338 errorCountFile << i << '\t' << misMatchCounts[i] << endl;
343 catch(exception& e) {
344 m->errorOut(e, "SeqErrorCommand", "execute");
349 //***************************************************************************************************************
351 void SeqErrorCommand::getReferences(){
354 ifstream referenceFile;
355 m->openInputFile(referenceFileName, referenceFile);
357 while(referenceFile){
358 Sequence currentSeq(referenceFile);
359 int numAmbigs = currentSeq.getAmbigBases();
362 m->mothurOut("Warning: " + toString(currentSeq.getName()) + " has " + toString(numAmbigs) + " ambiguous bases, these bases will be removed\n");
363 currentSeq.removeAmbigBases();
365 referenceSeqs.push_back(currentSeq);
366 m->gobble(referenceFile);
368 numRefs = referenceSeqs.size();
370 referenceFile.close();
372 catch(exception& e) {
373 m->errorOut(e, "SeqErrorCommand", "getReferences");
378 //***************************************************************************************************************
380 Compare SeqErrorCommand::getErrors(Sequence query, Sequence reference){
382 if(query.getAlignLength() != reference.getAlignLength()){
383 m->mothurOut("Warning: " + toString(query.getName()) + " and " + toString(reference.getName()) + " are different lengths\n");
385 int alignLength = query.getAlignLength();
387 string q = query.getAligned();
388 string r = reference.getAligned();
394 for(int i=0;i<alignLength;i++){
395 if(q[i] != '.' && r[i] != '.' && (q[i] != '-' || r[i] != '-')){ // no missing data and no double gaps
399 if(r[i] == 'A'){ errors.AA++; errors.matches++; errors.sequence += 'm'; }
400 if(r[i] == 'T'){ errors.AT++; errors.sequence += 's'; }
401 if(r[i] == 'G'){ errors.AG++; errors.sequence += 's'; }
402 if(r[i] == 'C'){ errors.AC++; errors.sequence += 's'; }
403 if(r[i] == '-'){ errors.Ai++; errors.sequence += 'i'; }
405 else if(q[i] == 'T'){
406 if(r[i] == 'A'){ errors.TA++; errors.sequence += 's'; }
407 if(r[i] == 'T'){ errors.TT++; errors.matches++; errors.sequence += 'm'; }
408 if(r[i] == 'G'){ errors.TG++; errors.sequence += 's'; }
409 if(r[i] == 'C'){ errors.TC++; errors.sequence += 's'; }
410 if(r[i] == '-'){ errors.Ti++; errors.sequence += 'i'; }
412 else if(q[i] == 'G'){
413 if(r[i] == 'A'){ errors.GA++; errors.sequence += 's'; }
414 if(r[i] == 'T'){ errors.GT++; errors.sequence += 's'; }
415 if(r[i] == 'G'){ errors.GG++; errors.matches++; errors.sequence += 'm'; }
416 if(r[i] == 'C'){ errors.GC++; errors.sequence += 's'; }
417 if(r[i] == '-'){ errors.Gi++; errors.sequence += 'i'; }
419 else if(q[i] == 'C'){
420 if(r[i] == 'A'){ errors.CA++; errors.sequence += 's'; }
421 if(r[i] == 'T'){ errors.CT++; errors.sequence += 's'; }
422 if(r[i] == 'G'){ errors.CG++; errors.sequence += 's'; }
423 if(r[i] == 'C'){ errors.CC++; errors.matches++; errors.sequence += 'm'; }
424 if(r[i] == '-'){ errors.Ci++; errors.sequence += 'i'; }
426 else if(q[i] == 'N'){
427 if(r[i] == 'A'){ errors.NA++; errors.sequence += 'a'; }
428 if(r[i] == 'T'){ errors.NT++; errors.sequence += 'a'; }
429 if(r[i] == 'G'){ errors.NG++; errors.sequence += 'a'; }
430 if(r[i] == 'C'){ errors.NC++; errors.sequence += 'a'; }
431 if(r[i] == '-'){ errors.Ni++; errors.sequence += 'a'; }
433 else if(q[i] == '-' && r[i] != '-'){
434 if(r[i] == 'A'){ errors.dA++; errors.sequence += 'd'; }
435 if(r[i] == 'T'){ errors.dT++; errors.sequence += 'd'; }
436 if(r[i] == 'G'){ errors.dG++; errors.sequence += 'd'; }
437 if(r[i] == 'C'){ errors.dC++; errors.sequence += 'd'; }
442 else if(q[i] == '.' && r[i] != '.'){ // reference extends beyond query
443 if(started == 1){ break; }
445 else if(q[i] != '.' && r[i] == '.'){ // query extends beyond reference
446 m->mothurOut("Warning: " + toString(query.getName()) + " extend beyond " + toString(reference.getName()) + ". Ignoring the extra bases in the query\n");
447 if(started == 1){ break; }
449 else if(q[i] == '.' && r[i] == '.'){ // both are missing data
450 if(started == 1){ break; }
454 errors.mismatches = errors.total-errors.matches;
455 errors.errorRate = (double)(errors.total-errors.matches) / (double)errors.total;
456 errors.queryName = query.getName();
457 errors.refName = reference.getName();
461 catch(exception& e) {
462 m->errorOut(e, "SeqErrorCommand", "getErrors");
467 //***************************************************************************************************************
469 map<string, int> SeqErrorCommand::getWeights(){
471 m->openInputFile(namesFileName, nameFile);
474 string redundantSeqs;
475 map<string, int> nameCountMap;
478 nameFile >> seqName >> redundantSeqs;
479 nameCountMap[seqName] = m->getNumNames(redundantSeqs);
486 //***************************************************************************************************************
488 void SeqErrorCommand::printErrorHeader(){
490 errorSummaryFile << "query\treference\tweight\t";
491 errorSummaryFile << "AA\tAT\tAG\tAC\tTA\tTT\tTG\tTC\tGA\tGT\tGG\tGC\tCA\tCT\tCG\tCC\tNA\tNT\tNG\tNC\tAi\tTi\tGi\tCi\tNi\tdA\tdT\tdG\tdC\t";
492 errorSummaryFile << "insertions\tdeletions\tsubstitutions\tambig\tmatches\tmismatches\ttotal\terror\n";
494 errorSummaryFile << setprecision(6);
495 errorSummaryFile.setf(ios::fixed);
497 catch(exception& e) {
498 m->errorOut(e, "SeqErrorCommand", "printErrorHeader");
503 //***************************************************************************************************************
505 void SeqErrorCommand::printErrorData(Compare error){
507 errorSummaryFile << error.queryName << '\t' << error.refName << '\t' << error.weight << '\t';
508 errorSummaryFile << error.AA << '\t' << error.AT << '\t' << error.AG << '\t' << error.AC << '\t';
509 errorSummaryFile << error.TA << '\t' << error.TT << '\t' << error.TG << '\t' << error.TC << '\t';
510 errorSummaryFile << error.GA << '\t' << error.GT << '\t' << error.GG << '\t' << error.GC << '\t';
511 errorSummaryFile << error.CA << '\t' << error.CT << '\t' << error.CG << '\t' << error.CC << '\t';
512 errorSummaryFile << error.NA << '\t' << error.NT << '\t' << error.NG << '\t' << error.NC << '\t';
513 errorSummaryFile << error.Ai << '\t' << error.Ti << '\t' << error.Gi << '\t' << error.Ci << '\t' << error.Ni << '\t' ;
514 errorSummaryFile << error.dA << '\t' << error.dT << '\t' << error.dG << '\t' << error.dC << '\t';
516 errorSummaryFile << error.Ai + error.Ti + error.Gi + error.Ci << '\t'; //insertions
517 errorSummaryFile << error.dA + error.dT + error.dG + error.dC << '\t'; //deletions
518 errorSummaryFile << error.mismatches - (error.Ai + error.Ti + error.Gi + error.Ci) - (error.dA + error.dT + error.dG + error.dC) - (error.NA + error.NT + error.NG + error.NC + error.Ni) << '\t'; //substitutions
519 errorSummaryFile << error.NA + error.NT + error.NG + error.NC + error.Ni << '\t'; //ambiguities
520 errorSummaryFile << error.matches << '\t' << error.mismatches << '\t' << error.total << '\t' << error.errorRate << endl;
522 errorSeqFile << '>' << error.queryName << "\tref:" << error.refName << '\n' << error.sequence << endl;
524 catch(exception& e) {
525 m->errorOut(e, "SeqErrorCommand", "printErrorData");
530 //***************************************************************************************************************