X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=seqerrorcommand.cpp;h=615206b5b84c4c717ad9e17c5dcb1710091c308f;hb=bdb5d82e2a73829b4e1fa42656ad9bcb57e3e948;hp=f1b9cd8103a2d5f9c8836f657411359b0845785c;hpb=260ae19c36cb11a53ddc5a75b5e507f8dd8b31d6;p=mothur.git diff --git a/seqerrorcommand.cpp b/seqerrorcommand.cpp index f1b9cd8..615206b 100644 --- a/seqerrorcommand.cpp +++ b/seqerrorcommand.cpp @@ -8,7 +8,58 @@ */ #include "seqerrorcommand.h" +#include "reportfile.h" +#include "qualityscores.h" +//********************************************************************************************************************** +vector SeqErrorCommand::getValidParameters(){ + try { + string Array[] = {"query", "reference", "name", "qfile", "report", "threshold", "inputdir", "outputdir"}; + vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); + return myArray; + } + catch(exception& e) { + m->errorOut(e, "SeqErrorCommand", "getValidParameters"); + exit(1); + } +} +//********************************************************************************************************************** +SeqErrorCommand::SeqErrorCommand(){ + try { + abort = true; + //initialize outputTypes + vector tempOutNames; + outputTypes["error"] = tempOutNames; + outputTypes["count"] = tempOutNames; + } + catch(exception& e) { + m->errorOut(e, "SeqErrorCommand", "SeqErrorCommand"); + exit(1); + } +} +//********************************************************************************************************************** +vector SeqErrorCommand::getRequiredParameters(){ + try { + string Array[] = {"query","reference"}; + vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); + return myArray; + } + catch(exception& e) { + m->errorOut(e, "SeqErrorCommand", "getRequiredParameters"); + exit(1); + } +} +//********************************************************************************************************************** +vector SeqErrorCommand::getRequiredFiles(){ + try { + vector myArray; + return myArray; + } + catch(exception& e) { + m->errorOut(e, "SeqErrorCommand", "getRequiredFiles"); + exit(1); + } +} //*************************************************************************************************************** SeqErrorCommand::SeqErrorCommand(string option) { @@ -23,9 +74,7 @@ SeqErrorCommand::SeqErrorCommand(string option) { string temp; //valid paramters for this command - string AlignArray[] = {"query", "reference", "name", "threshold"}; - -//need to implement name file option + string AlignArray[] = {"query", "reference", "name", "qfile", "report", "threshold", "inputdir", "outputdir"}; vector myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string))); @@ -40,6 +89,11 @@ SeqErrorCommand::SeqErrorCommand(string option) { if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } } + //initialize outputTypes + vector tempOutNames; + outputTypes["error"] = tempOutNames; + outputTypes["count"] = tempOutNames; + //if the user changes the input directory command factory will send this info to us in the output parameter string inputDir = validParameter.validFile(parameters, "inputdir", false); if (inputDir == "not found"){ inputDir = ""; } @@ -62,12 +116,28 @@ SeqErrorCommand::SeqErrorCommand(string option) { } it = parameters.find("name"); - //user has given a template file + //user has given a names file if(it != parameters.end()){ path = m->hasPath(it->second); //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["name"] = inputDir + it->second; } } + + it = parameters.find("qfile"); + //user has given a quality score file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["qfile"] = inputDir + it->second; } + } + + it = parameters.find("report"); + //user has given a alignment report file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["report"] = inputDir + it->second; } + } } //check for required parameters @@ -79,10 +149,22 @@ SeqErrorCommand::SeqErrorCommand(string option) { if (referenceFileName == "not found") { m->mothurOut("reference is a required parameter for the seq.error command."); m->mothurOutEndLine(); abort = true; } else if (referenceFileName == "not open") { abort = true; } - //if the user changes the output directory command factory will send this info to us in the output parameter + + //check for optional parameters namesFileName = validParameter.validFile(parameters, "name", true); if(namesFileName == "not found"){ namesFileName = ""; } - cout << namesFileName << endl; + + qualFileName = validParameter.validFile(parameters, "qfile", true); + if(qualFileName == "not found"){ qualFileName = ""; } + + reportFileName = validParameter.validFile(parameters, "report", true); + if(reportFileName == "not found"){ reportFileName = ""; } + + if((reportFileName != "" && qualFileName == "") || (reportFileName == "" && qualFileName != "")){ + m->mothurOut("if you use either a qual file or a report file, you have to have both."); + m->mothurOutEndLine(); + abort = true; + } outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ @@ -95,9 +177,17 @@ SeqErrorCommand::SeqErrorCommand(string option) { temp = validParameter.validFile(parameters, "threshold", false); if (temp == "not found") { temp = "1.00"; } convert(temp, threshold); - errorFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".errors"; - m->openOutputFile(errorFileName, errorFile); + errorSummaryFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.summary"; + m->openOutputFile(errorSummaryFileName, errorSummaryFile); + outputNames.push_back(errorSummaryFileName); outputTypes["error.summary"].push_back(errorSummaryFileName); printErrorHeader(); + + errorSeqFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.seq"; + m->openOutputFile(errorSeqFileName, errorSeqFile); + outputNames.push_back(errorSeqFileName); outputTypes["error.seq"].push_back(errorSeqFileName); + + substitutionMatrix.resize(6); + for(int i=0;i<6;i++){ substitutionMatrix[i].assign(6,0); } } } catch(exception& e) { @@ -126,7 +216,10 @@ void SeqErrorCommand::help(){ //*************************************************************************************************************** -SeqErrorCommand::~SeqErrorCommand(){ errorFile.close(); } +SeqErrorCommand::~SeqErrorCommand(){ + errorSummaryFile.close(); + errorSeqFile.close(); +} //*************************************************************************************************************** @@ -141,7 +234,27 @@ int SeqErrorCommand::execute(){ ifstream queryFile; m->openInputFile(queryFileName, queryFile); - + + ifstream reportFile; + ifstream qualFile; + + ReportFile report; + QualityScores quality; + vector > qualForwardMap; + vector > qualReverseMap; + + if(qualFileName != "" && reportFileName != ""){ + m->openInputFile(qualFileName, qualFile); + report = ReportFile(reportFile, reportFileName); + + qualForwardMap.resize(1000); + qualReverseMap.resize(1000); + for(int i=0;i<1000;i++){ + qualForwardMap[i].assign(100,0); + qualReverseMap[i].assign(100,0); + } + } + int totalBases = 0; int totalMatches = 0; @@ -150,10 +263,32 @@ int SeqErrorCommand::execute(){ int numSeqs = 0; map::iterator it; + map > qScoreErrorMap; + qScoreErrorMap['m'].assign(41, 0); + qScoreErrorMap['s'].assign(41, 0); + qScoreErrorMap['i'].assign(41, 0); + qScoreErrorMap['a'].assign(41, 0); + + + + map > errorForward; + errorForward['m'].assign(1000,0); + errorForward['s'].assign(1000,0); + errorForward['i'].assign(1000,0); + errorForward['d'].assign(1000,0); + errorForward['a'].assign(1000,0); + + map > errorReverse; + errorReverse['m'].assign(1000,0); + errorReverse['s'].assign(1000,0); + errorReverse['i'].assign(1000,0); + errorReverse['d'].assign(1000,0); + errorReverse['a'].assign(1000,0); + + while(queryFile){ Compare minCompare; - Sequence query(queryFile); for(int i=0;isecond; } - else { - minCompare.weight = 1; - } + else { minCompare.weight = 1; } printErrorData(minCompare); + for(int i=0;iopenOutputFile(errorQualityFileName, errorQualityFile); + outputNames.push_back(errorQualityFileName); outputTypes["error.quality"].push_back(errorQualityFileName); + + errorQualityFile << "qscore\tmatches\tsubstitutions\tinsertions\tambiguous" << endl; + for(int i=0;i<41;i++){ + errorQualityFile << i << '\t' << qScoreErrorMap['m'][i] << '\t' << qScoreErrorMap['s'][i] << '\t' << qScoreErrorMap['i'][i] << '\t'<< qScoreErrorMap['a'][i] << endl; + } + errorQualityFile.close(); + + + + int lastRow = 0; + int lastColumn = 0; + + for(int i=0;iopenOutputFile(qualityForwardFileName, qualityForwardFile); + outputNames.push_back(errorQualityFileName); outputTypes["error.qual.forward"].push_back(qualityForwardFileName); + + for(int i=0;iopenOutputFile(qualityReverseFileName, qualityReverseFile); + outputNames.push_back(errorQualityFileName); outputTypes["error.qual.reverse"].push_back(qualityReverseFileName); + + for(int i=0;iopenOutputFile(errorForwardFileName, errorForwardFile); + outputNames.push_back(errorForwardFileName); outputTypes["error.forward"].push_back(errorForwardFileName); + + errorForwardFile << "position\ttotalseqs\tmatch\tsubstitution\tinsertion\tdeletion\tambiguous" << endl; + for(int i=0;i<1000;i++){ + float match = (float)errorForward['m'][i]; + float subst = (float)errorForward['s'][i]; + float insert = (float)errorForward['i'][i]; + float del = (float)errorForward['d'][i]; + float amb = (float)errorForward['a'][i]; + float total = match + subst + insert + del + amb; + if(total == 0){ break; } + errorForwardFile << i+1 << '\t' << total << '\t' << match/total << '\t' << subst/total << '\t' << insert/total << '\t' << del/total << '\t' << amb/total << endl; + } + errorForwardFile.close(); + + + string errorReverseFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.seq.reverse"; + ofstream errorReverseFile; + m->openOutputFile(errorReverseFileName, errorReverseFile); + outputNames.push_back(errorReverseFileName); outputTypes["error.reverse"].push_back(errorReverseFileName); + + errorReverseFile << "position\ttotalseqs\tmatch\tsubstitution\tinsertion\tdeletion\tambiguous" << endl; + for(int i=0;i<1000;i++){ + float match = (float)errorReverse['m'][i]; + float subst = (float)errorReverse['s'][i]; + float insert = (float)errorReverse['i'][i]; + float del = (float)errorReverse['d'][i]; + float amb = (float)errorReverse['a'][i]; + float total = match + subst + insert + del + amb; + if(total == 0){ break; } + errorReverseFile << i+1 << '\t' << total << '\t' << match/total << '\t' << subst/total << '\t' << insert/total << '\t' << del/total << '\t' << amb/total << endl; + } + errorReverseFile.close(); + + - string errorCountFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".count"; + string errorCountFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.count"; ofstream errorCountFile; m->openOutputFile(errorCountFileName, errorCountFile); - + outputNames.push_back(errorCountFileName); outputTypes["error.count"].push_back(errorCountFileName); m->mothurOut("Overall error rate:\t" + toString((double)(totalBases - totalMatches) / (double)totalBases) + "\n\n"); m->mothurOut("Errors\tSequences\n"); - - errorCountFile << "Errors\tSequences\n"; - + errorCountFile << "Errors\tSequences\n"; for(int i=0;imothurOut(toString(i) + '\t' + toString(misMatchCounts[i]) + '\n'); errorCountFile << i << '\t' << misMatchCounts[i] << endl; } + errorCountFile.close(); + + + + + + string subMatrixFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.matrix"; + ofstream subMatrixFile; + m->openOutputFile(subMatrixFileName, subMatrixFile); + outputNames.push_back(subMatrixFileName); outputTypes["error.matrix"].push_back(subMatrixFileName); + vector bases(6); + bases[0] = "A"; + bases[1] = "T"; + bases[2] = "G"; + bases[3] = "C"; + bases[4] = "Gap"; + bases[5] = "N"; + vector refSums(5,1); + + for(int i=0;i<5;i++){ + subMatrixFile << "\tr" << bases[i]; + + for(int j=0;j<6;j++){ + refSums[i] += substitutionMatrix[i][j]; + } + + } + subMatrixFile << endl; + + for(int i=0;i<6;i++){ + subMatrixFile << 'q' << bases[i]; + for(int j=0;j<5;j++){ + subMatrixFile << '\t' << substitutionMatrix[j][i]; + } + subMatrixFile << endl; + } + subMatrixFile << "total"; + for(int i=0;i<5;i++){ + subMatrixFile << '\t' << refSums[i]; + } + subMatrixFile << endl; + subMatrixFile.close(); + + return 0; } @@ -263,45 +556,45 @@ Compare SeqErrorCommand::getErrors(Sequence query, Sequence reference){ started = 1; if(q[i] == 'A'){ - if(r[i] == 'A'){ errors.AA++; errors.matches++; } - if(r[i] == 'T'){ errors.AT++; } - if(r[i] == 'G'){ errors.AG++; } - if(r[i] == 'C'){ errors.AC++; } - if(r[i] == '-'){ errors.Ai++; } + if(r[i] == 'A'){ errors.AA++; errors.matches++; errors.sequence += 'm'; } + if(r[i] == 'T'){ errors.AT++; errors.sequence += 's'; } + if(r[i] == 'G'){ errors.AG++; errors.sequence += 's'; } + if(r[i] == 'C'){ errors.AC++; errors.sequence += 's'; } + if(r[i] == '-'){ errors.Ai++; errors.sequence += 'i'; } } else if(q[i] == 'T'){ - if(r[i] == 'A'){ errors.TA++; } - if(r[i] == 'T'){ errors.TT++; errors.matches++; } - if(r[i] == 'G'){ errors.TG++; } - if(r[i] == 'C'){ errors.TC++; } - if(r[i] == '-'){ errors.Ti++; } + if(r[i] == 'A'){ errors.TA++; errors.sequence += 's'; } + if(r[i] == 'T'){ errors.TT++; errors.matches++; errors.sequence += 'm'; } + if(r[i] == 'G'){ errors.TG++; errors.sequence += 's'; } + if(r[i] == 'C'){ errors.TC++; errors.sequence += 's'; } + if(r[i] == '-'){ errors.Ti++; errors.sequence += 'i'; } } else if(q[i] == 'G'){ - if(r[i] == 'A'){ errors.GA++; } - if(r[i] == 'T'){ errors.GT++; } - if(r[i] == 'G'){ errors.GG++; errors.matches++; } - if(r[i] == 'C'){ errors.GC++; } - if(r[i] == '-'){ errors.Gi++; } + if(r[i] == 'A'){ errors.GA++; errors.sequence += 's'; } + if(r[i] == 'T'){ errors.GT++; errors.sequence += 's'; } + if(r[i] == 'G'){ errors.GG++; errors.matches++; errors.sequence += 'm'; } + if(r[i] == 'C'){ errors.GC++; errors.sequence += 's'; } + if(r[i] == '-'){ errors.Gi++; errors.sequence += 'i'; } } else if(q[i] == 'C'){ - if(r[i] == 'A'){ errors.CA++; } - if(r[i] == 'T'){ errors.CT++; } - if(r[i] == 'G'){ errors.CG++; } - if(r[i] == 'C'){ errors.CC++; errors.matches++; } - if(r[i] == '-'){ errors.Ci++; } + if(r[i] == 'A'){ errors.CA++; errors.sequence += 's'; } + if(r[i] == 'T'){ errors.CT++; errors.sequence += 's'; } + if(r[i] == 'G'){ errors.CG++; errors.sequence += 's'; } + if(r[i] == 'C'){ errors.CC++; errors.matches++; errors.sequence += 'm'; } + if(r[i] == '-'){ errors.Ci++; errors.sequence += 'i'; } } else if(q[i] == 'N'){ - if(r[i] == 'A'){ errors.NA++; } - if(r[i] == 'T'){ errors.NT++; } - if(r[i] == 'G'){ errors.NG++; } - if(r[i] == 'C'){ errors.NC++; } - if(r[i] == '-'){ errors.Ni++; } + if(r[i] == 'A'){ errors.NA++; errors.sequence += 'a'; } + if(r[i] == 'T'){ errors.NT++; errors.sequence += 'a'; } + if(r[i] == 'G'){ errors.NG++; errors.sequence += 'a'; } + if(r[i] == 'C'){ errors.NC++; errors.sequence += 'a'; } + if(r[i] == '-'){ errors.Ni++; errors.sequence += 'a'; } } else if(q[i] == '-' && r[i] != '-'){ - if(r[i] == 'A'){ errors.dA++; } - if(r[i] == 'T'){ errors.dT++; } - if(r[i] == 'G'){ errors.dG++; } - if(r[i] == 'C'){ errors.dC++; } + if(r[i] == 'A'){ errors.dA++; errors.sequence += 'd'; } + if(r[i] == 'T'){ errors.dT++; errors.sequence += 'd'; } + if(r[i] == 'G'){ errors.dG++; errors.sequence += 'd'; } + if(r[i] == 'C'){ errors.dC++; errors.sequence += 'd'; } } errors.total++; @@ -354,12 +647,12 @@ map SeqErrorCommand::getWeights(){ void SeqErrorCommand::printErrorHeader(){ try { - errorFile << "query\treference\tweight\t"; - errorFile << "AA\tAT\tAG\tAC\tTA\tTT\tTG\tTC\tGA\tGT\tGG\tGC\tCA\tCT\tCG\tCC\tNA\tNT\tNG\tNC\tAi\tTi\tGi\tCi\tNi\tdA\tdT\tdG\tdC\t"; - errorFile << "insertions\tdeletions\tsubstitutions\tambig\tmatches\tmismatches\ttotal\terror\n"; + errorSummaryFile << "query\treference\tweight\t"; + errorSummaryFile << "AA\tAT\tAG\tAC\tTA\tTT\tTG\tTC\tGA\tGT\tGG\tGC\tCA\tCT\tCG\tCC\tNA\tNT\tNG\tNC\tAi\tTi\tGi\tCi\tNi\tdA\tdT\tdG\tdC\t"; + errorSummaryFile << "insertions\tdeletions\tsubstitutions\tambig\tmatches\tmismatches\ttotal\terror\n"; - errorFile << setprecision(6); - errorFile.setf(ios::fixed); + errorSummaryFile << setprecision(6); + errorSummaryFile.setf(ios::fixed); } catch(exception& e) { m->errorOut(e, "SeqErrorCommand", "printErrorHeader"); @@ -371,20 +664,60 @@ void SeqErrorCommand::printErrorHeader(){ void SeqErrorCommand::printErrorData(Compare error){ try { - errorFile << error.queryName << '\t' << error.refName << '\t' << error.weight << '\t'; - errorFile << error.AA << '\t' << error.AT << '\t' << error.AG << '\t' << error.AC << '\t'; - errorFile << error.TA << '\t' << error.TT << '\t' << error.TG << '\t' << error.TC << '\t'; - errorFile << error.GA << '\t' << error.GT << '\t' << error.GG << '\t' << error.GC << '\t'; - errorFile << error.CA << '\t' << error.CT << '\t' << error.CG << '\t' << error.CC << '\t'; - errorFile << error.NA << '\t' << error.NT << '\t' << error.NG << '\t' << error.NC << '\t'; - errorFile << error.Ai << '\t' << error.Ti << '\t' << error.Gi << '\t' << error.Ci << '\t' << error.Ni << '\t' ; - errorFile << error.dA << '\t' << error.dT << '\t' << error.dG << '\t' << error.dC << '\t'; - - errorFile << error.Ai + error.Ti + error.Gi + error.Ci << '\t'; //insertions - errorFile << error.dA + error.dT + error.dG + error.dC << '\t'; //deletions - errorFile << error.mismatches - (error.Ai + error.Ti + error.Gi + error.Ci) - (error.dA + error.dT + error.dG + error.dC) - (error.NA + error.NT + error.NG + error.NC + error.Ni) << '\t'; //substitutions - errorFile << error.NA + error.NT + error.NG + error.NC + error.Ni << '\t'; //ambiguities - errorFile << error.matches << '\t' << error.mismatches << '\t' << error.total << '\t' << error.errorRate << endl; + errorSummaryFile << error.queryName << '\t' << error.refName << '\t' << error.weight << '\t'; + errorSummaryFile << error.AA << '\t' << error.AT << '\t' << error.AG << '\t' << error.AC << '\t'; + errorSummaryFile << error.TA << '\t' << error.TT << '\t' << error.TG << '\t' << error.TC << '\t'; + errorSummaryFile << error.GA << '\t' << error.GT << '\t' << error.GG << '\t' << error.GC << '\t'; + errorSummaryFile << error.CA << '\t' << error.CT << '\t' << error.CG << '\t' << error.CC << '\t'; + errorSummaryFile << error.NA << '\t' << error.NT << '\t' << error.NG << '\t' << error.NC << '\t'; + errorSummaryFile << error.Ai << '\t' << error.Ti << '\t' << error.Gi << '\t' << error.Ci << '\t' << error.Ni << '\t' ; + errorSummaryFile << error.dA << '\t' << error.dT << '\t' << error.dG << '\t' << error.dC << '\t'; + + errorSummaryFile << error.Ai + error.Ti + error.Gi + error.Ci << '\t'; //insertions + errorSummaryFile << error.dA + error.dT + error.dG + error.dC << '\t'; //deletions + errorSummaryFile << error.mismatches - (error.Ai + error.Ti + error.Gi + error.Ci) - (error.dA + error.dT + error.dG + error.dC) - (error.NA + error.NT + error.NG + error.NC + error.Ni) << '\t'; //substitutions + errorSummaryFile << error.NA + error.NT + error.NG + error.NC + error.Ni << '\t'; //ambiguities + errorSummaryFile << error.matches << '\t' << error.mismatches << '\t' << error.total << '\t' << error.errorRate << endl; + + errorSeqFile << '>' << error.queryName << "\tref:" << error.refName << '\n' << error.sequence << endl; + + + int a=0; int t=1; int g=2; int c=3; + int gap=4; int n=5; + + substitutionMatrix[a][a] += error.weight * error.AA; + substitutionMatrix[a][t] += error.weight * error.TA; + substitutionMatrix[a][g] += error.weight * error.GA; + substitutionMatrix[a][c] += error.weight * error.CA; + substitutionMatrix[a][gap] += error.weight * error.dA; + substitutionMatrix[a][n] += error.weight * error.NA; + + substitutionMatrix[t][a] += error.weight * error.AT; + substitutionMatrix[t][t] += error.weight * error.TT; + substitutionMatrix[t][g] += error.weight * error.GT; + substitutionMatrix[t][c] += error.weight * error.CT; + substitutionMatrix[t][gap] += error.weight * error.dT; + substitutionMatrix[t][n] += error.weight * error.NT; + + substitutionMatrix[g][a] += error.weight * error.AG; + substitutionMatrix[g][t] += error.weight * error.TG; + substitutionMatrix[g][g] += error.weight * error.GG; + substitutionMatrix[g][c] += error.weight * error.CG; + substitutionMatrix[g][gap] += error.weight * error.dG; + substitutionMatrix[g][n] += error.weight * error.NG; + + substitutionMatrix[c][a] += error.weight * error.AC; + substitutionMatrix[c][t] += error.weight * error.TC; + substitutionMatrix[c][g] += error.weight * error.GC; + substitutionMatrix[c][c] += error.weight * error.CC; + substitutionMatrix[c][gap] += error.weight * error.dC; + substitutionMatrix[c][n] += error.weight * error.NC; + + substitutionMatrix[gap][a] += error.weight * error.Ai; + substitutionMatrix[gap][t] += error.weight * error.Ti; + substitutionMatrix[gap][g] += error.weight * error.Gi; + substitutionMatrix[gap][c] += error.weight * error.Ci; + substitutionMatrix[gap][n] += error.weight * error.Ni; } catch(exception& e) {