From c27e902ede71ea4d20af65752ef04459c101611b Mon Sep 17 00:00:00 2001 From: pschloss Date: Mon, 19 Jul 2010 14:57:36 +0000 Subject: [PATCH] pat's updates on 7/19/10 --- Mothur.xcodeproj/project.pbxproj | 4 + commandfactory.cpp | 3 + getseqscommand.cpp | 2 +- sensspeccommand.cpp | 2 +- seqerrorcommand.cpp | 403 +++++++++++++++++++++++++++++++ seqerrorcommand.h | 65 +++++ sequence.cpp | 12 + sequence.hpp | 1 + 8 files changed, 490 insertions(+), 2 deletions(-) create mode 100644 seqerrorcommand.cpp create mode 100644 seqerrorcommand.h diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index 81623b1..a3e57af 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -7,6 +7,8 @@ objects = { /* Begin PBXFileReference section */ + 7E84528511EF4BEB00564975 /* seqerrorcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = seqerrorcommand.h; sourceTree = ""; }; + 7E84528611EF4BEB00564975 /* seqerrorcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = seqerrorcommand.cpp; sourceTree = ""; }; 7E85BD1C11EB5E9B00FD37C0 /* qualityscores.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = qualityscores.h; sourceTree = ""; }; 7E85BD1D11EB5E9B00FD37C0 /* qualityscores.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = qualityscores.cpp; sourceTree = ""; }; 7EA299BA11E384940022D8D3 /* sensspeccommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sensspeccommand.h; sourceTree = ""; }; @@ -542,6 +544,8 @@ A7DA2174113FECD400BF472F /* validparameter.h */, A7DA2175113FECD400BF472F /* venn.cpp */, A7DA2176113FECD400BF472F /* venn.h */, + 7E84528511EF4BEB00564975 /* seqerrorcommand.h */, + 7E84528611EF4BEB00564975 /* seqerrorcommand.cpp */, ); name = mothur; sourceTree = ""; diff --git a/commandfactory.cpp b/commandfactory.cpp index fc20ee5..f86a354 100644 --- a/commandfactory.cpp +++ b/commandfactory.cpp @@ -84,6 +84,7 @@ #include "getrelabundcommand.h" #include "sensspeccommand.h" #include "sffinfocommand.h" +#include "seqerrorcommand.h" /*******************************************************/ @@ -187,6 +188,7 @@ CommandFactory::CommandFactory(){ commands["summary.seqs"] = "MPIEnabled"; commands["cluster.split"] = "MPIEnabled"; commands["sens.spec"] = "sens.spec"; + commands["seq.error"] = "seq.error"; commands["quit"] = "MPIEnabled"; } @@ -302,6 +304,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString){ else if(commandName == "degap.seqs") { command = new DegapSeqsCommand(optionString); } else if(commandName == "get.relabund") { command = new GetRelAbundCommand(optionString); } else if(commandName == "sens.spec") { command = new SensSpecCommand(optionString); } + else if(commandName == "seq.error") { command = new SeqErrorCommand(optionString); } else if(commandName == "sffinfo") { command = new SffInfoCommand(optionString); } else { command = new NoCommand(optionString); } diff --git a/getseqscommand.cpp b/getseqscommand.cpp index 5e77a1d..ef0a1ac 100644 --- a/getseqscommand.cpp +++ b/getseqscommand.cpp @@ -22,7 +22,7 @@ GetSeqsCommand::GetSeqsCommand(string option) { else { //valid paramters for this command - string Array[] = {"fasta","name", "group", "alignreport", "accnos", "list","taxonomy","outputdir","inputdir"}; + string Array[] = {"fasta","name", "group", "alignreport", "accnos", "dups", "list","taxonomy","outputdir","inputdir"}; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); OptionParser parser(option); diff --git a/sensspeccommand.cpp b/sensspeccommand.cpp index edb1d45..3a147c5 100644 --- a/sensspeccommand.cpp +++ b/sensspeccommand.cpp @@ -128,6 +128,7 @@ SensSpecCommand::SensSpecCommand(string option) { exit(1); } } + //********************************************************************************************************************** void SensSpecCommand::help(){ @@ -147,7 +148,6 @@ void SensSpecCommand::help(){ } } - //*************************************************************************************************************** SensSpecCommand::~SensSpecCommand(){ /* do nothing */ } diff --git a/seqerrorcommand.cpp b/seqerrorcommand.cpp new file mode 100644 index 0000000..82af49e --- /dev/null +++ b/seqerrorcommand.cpp @@ -0,0 +1,403 @@ +/* + * seqerrorcommand.cpp + * Mothur + * + * Created by Pat Schloss on 7/15/10. + * Copyright 2010 Schloss Lab. All rights reserved. + * + */ + +#include "seqerrorcommand.h" + +//*************************************************************************************************************** + +SeqErrorCommand::SeqErrorCommand(string option) { + try { + + abort = false; + + //allow user to run help + if(option == "help") { help(); abort = true; } + + else { + string temp; + + //valid paramters for this command + string AlignArray[] = {"query", "reference", "name", "threshold"}; + +//need to implement name file option + + vector myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string))); + + OptionParser parser(option); + map parameters = parser.getParameters(); + + ValidParameters validParameter; + map::iterator it; + + //check to make sure all parameters are valid for command + for (it = parameters.begin(); it != parameters.end(); it++) { + if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } + } + + //if the user changes the input directory command factory will send this info to us in the output parameter + string inputDir = validParameter.validFile(parameters, "inputdir", false); + if (inputDir == "not found"){ inputDir = ""; } + else { + string path; + it = parameters.find("query"); + //user has given a template file + if(it != parameters.end()){ + path = hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["query"] = inputDir + it->second; } + } + + it = parameters.find("reference"); + //user has given a template file + if(it != parameters.end()){ + path = hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["reference"] = inputDir + it->second; } + } + + it = parameters.find("name"); + //user has given a template file + if(it != parameters.end()){ + path = hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["name"] = inputDir + it->second; } + } + + } + //check for required parameters + queryFileName = validParameter.validFile(parameters, "query", true); + if (queryFileName == "not found") { m->mothurOut("query is a required parameter for the seq.error command."); m->mothurOutEndLine(); abort = true; } + else if (queryFileName == "not open") { abort = true; } + + referenceFileName = validParameter.validFile(parameters, "reference", true); + if (referenceFileName == "not found") { m->mothurOut("reference is a required parameter for the seq.error command."); m->mothurOutEndLine(); abort = true; } + else if (referenceFileName == "not open") { abort = true; } + + //if the user changes the output directory command factory will send this info to us in the output parameter + namesFileName = validParameter.validFile(parameters, "name", true); + if(namesFileName == "not found"){ namesFileName = ""; } + cout << namesFileName << endl; + + outputDir = validParameter.validFile(parameters, "outputdir", false); + if (outputDir == "not found"){ + outputDir = ""; + outputDir += hasPath(queryFileName); //if user entered a file with a path then preserve it + } + + //check for optional parameter and set defaults + // ...at some point should added some additional type checking... + temp = validParameter.validFile(parameters, "threshold", false); if (temp == "not found") { temp = "1.00"; } + convert(temp, threshold); + + errorFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".errors"; + openOutputFile(errorFileName, errorFile); + printErrorHeader(); + } + } + catch(exception& e) { + m->errorOut(e, "SeqErrorCommand", "SeqErrorCommand"); + exit(1); + } +} + +//********************************************************************************************************************** + +void SeqErrorCommand::help(){ + try { + m->mothurOut("The seq.error command reads a query alignment file and a reference alignment file and creates .....\n"); + + + + m->mothurOut("Example seq.error(...).\n"); + m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n"); + m->mothurOut("For more details please check out the wiki http://www.mothur.org/wiki/seq.error .\n\n"); + } + catch(exception& e) { + m->errorOut(e, "SeqErrorCommand", "help"); + exit(1); + } +} + +//*************************************************************************************************************** + +SeqErrorCommand::~SeqErrorCommand(){ errorFile.close(); } + +//*************************************************************************************************************** + +int SeqErrorCommand::execute(){ + try{ + if (abort == true) { return 0; } + + getReferences(); //read in reference sequences - make sure there's no ambiguous bases + + map weights; + if(namesFileName != ""){ weights = getWeights(); } + + ifstream queryFile; + openInputFile(queryFileName, queryFile); + + int totalBases = 0; + int totalMatches = 0; + + vector misMatchCounts(11, 0); + int maxMismatch = 0; + int numSeqs = 0; + + map::iterator it; + + while(queryFile){ + Compare minCompare; + + Sequence query(queryFile); + + for(int i=0;isecond; + } + else { + minCompare.weight = 1; + } + + printErrorData(minCompare); + + if(minCompare.errorRate < threshold){ + totalBases += (minCompare.total * minCompare.weight); + totalMatches += minCompare.matches * minCompare.weight; + if(minCompare.mismatches > maxMismatch){ + maxMismatch = minCompare.mismatches; + misMatchCounts.resize(maxMismatch + 1, 0); + } + misMatchCounts[minCompare.mismatches] += minCompare.weight; + numSeqs++; + } + + } + queryFile.close(); + + int total = 0; + + + string errorCountFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".count"; + ofstream errorCountFile; + openOutputFile(errorCountFileName, errorCountFile); + + m->mothurOut("Overall error rate:\t" + toString((double)(totalBases - totalMatches) / (double)totalBases) + "\n\n"); + m->mothurOut("Errors\tSequences\n"); + + errorCountFile << "Errors\tSequences\n"; + + for(int i=0;imothurOut(toString(i) + '\t' + toString(misMatchCounts[i]) + '\n'); + errorCountFile << i << '\t' << misMatchCounts[i] << endl; + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SeqErrorCommand", "execute"); + exit(1); + } +} + +//*************************************************************************************************************** + +void SeqErrorCommand::getReferences(){ + try { + + ifstream referenceFile; + openInputFile(referenceFileName, referenceFile); + + while(referenceFile){ + Sequence currentSeq(referenceFile); + int numAmbigs = currentSeq.getAmbigBases(); + + if(numAmbigs != 0){ + m->mothurOut("Warning: " + toString(currentSeq.getName()) + " has " + toString(numAmbigs) + " ambiguous bases, these bases will be removed\n"); + currentSeq.removeAmbigBases(); + } + referenceSeqs.push_back(currentSeq); + gobble(referenceFile); + } + numRefs = referenceSeqs.size(); + + referenceFile.close(); + } + catch(exception& e) { + m->errorOut(e, "SeqErrorCommand", "getReferences"); + exit(1); + } +} + +//*************************************************************************************************************** + +Compare SeqErrorCommand::getErrors(Sequence query, Sequence reference){ + try { + if(query.getAlignLength() != reference.getAlignLength()){ + m->mothurOut("Warning: " + toString(query.getName()) + " and " + toString(reference.getName()) + " are different lengths\n"); + } + int alignLength = query.getAlignLength(); + + string q = query.getAligned(); + string r = reference.getAligned(); + + int started = 0; + Compare errors; + + for(int i=0;imothurOut("Warning: " + toString(query.getName()) + " extend beyond " + toString(reference.getName()) + ". Ignoring the extra bases in the query\n"); + if(started == 1){ break; } + } + else if(q[i] == '.' && r[i] == '.'){ // both are missing data + if(started == 1){ break; } + } + + } + errors.mismatches = errors.total-errors.matches; + errors.errorRate = (double)(errors.total-errors.matches) / (double)errors.total; + errors.queryName = query.getName(); + errors.refName = reference.getName(); + + return errors; + } + catch(exception& e) { + m->errorOut(e, "SeqErrorCommand", "getErrors"); + exit(1); + } +} + +//*************************************************************************************************************** + +map SeqErrorCommand::getWeights(){ + ifstream nameFile; + openInputFile(namesFileName, nameFile); + + string seqName; + string redundantSeqs; + map nameCountMap; + + while(nameFile){ + nameFile >> seqName >> redundantSeqs; + nameCountMap[seqName] = getNumNames(redundantSeqs); + gobble(nameFile); + } + return nameCountMap; +} + + +//*************************************************************************************************************** + +void SeqErrorCommand::printErrorHeader(){ + try { + errorFile << "query\treference\tweight\t"; + errorFile << "AA\tAT\tAG\tAC\tTA\tTT\tTG\tTC\tGA\tGT\tGG\tGC\tCA\tCT\tCG\tCC\tNA\tNT\tNG\tNC\tAi\tTi\tGi\tCi\tNi\tdA\tdT\tdG\tdC\t"; + errorFile << "insertions\tdeletions\tsubstitutions\tambig\tmatches\tmismatches\ttotal\terror\n"; + + errorFile << setprecision(6); + errorFile.setf(ios::fixed); + } + catch(exception& e) { + m->errorOut(e, "SeqErrorCommand", "printErrorHeader"); + exit(1); + } +} + +//*************************************************************************************************************** + +void SeqErrorCommand::printErrorData(Compare error){ + try { + errorFile << error.queryName << '\t' << error.refName << '\t' << error.weight << '\t'; + errorFile << error.AA << '\t' << error.AT << '\t' << error.AG << '\t' << error.AC << '\t'; + errorFile << error.TA << '\t' << error.TT << '\t' << error.TG << '\t' << error.TC << '\t'; + errorFile << error.GA << '\t' << error.GT << '\t' << error.GG << '\t' << error.GC << '\t'; + errorFile << error.CA << '\t' << error.CT << '\t' << error.CG << '\t' << error.CC << '\t'; + errorFile << error.NA << '\t' << error.NT << '\t' << error.NG << '\t' << error.NC << '\t'; + errorFile << error.Ai << '\t' << error.Ti << '\t' << error.Gi << '\t' << error.Ci << '\t' << error.Ni << '\t' ; + errorFile << error.dA << '\t' << error.dT << '\t' << error.dG << '\t' << error.dC << '\t'; + + errorFile << error.Ai + error.Ti + error.Gi + error.Ci << '\t'; //insertions + errorFile << error.dA + error.dT + error.dG + error.dC << '\t'; //deletions + errorFile << error.mismatches - (error.Ai + error.Ti + error.Gi + error.Ci) - (error.dA + error.dT + error.dG + error.dC) - (error.NA + error.NT + error.NG + error.NC + error.Ni) << '\t'; //substitutions + errorFile << error.NA + error.NT + error.NG + error.NC + error.Ni << '\t'; //ambiguities + errorFile << error.matches << '\t' << error.mismatches << '\t' << error.total << '\t' << error.errorRate << endl; + + } + catch(exception& e) { + m->errorOut(e, "SeqErrorCommand", "printErrorData"); + exit(1); + } +} + +//*************************************************************************************************************** + + + + + + + diff --git a/seqerrorcommand.h b/seqerrorcommand.h new file mode 100644 index 0000000..1eb2576 --- /dev/null +++ b/seqerrorcommand.h @@ -0,0 +1,65 @@ +#ifndef SEQERRORCOMMAND +#define SEQERRORCOMMAND + +/* + * seqerrorcommand.h + * Mothur + * + * Created by Pat Schloss on 7/15/10. + * Copyright 2010 Schloss Lab. All rights reserved. + * + */ + +#include "mothur.h" +#include "command.hpp" +#include "sequence.hpp" + +struct Compare { + int AA, AT, AG, AC, TA, TT, TG, TC, GA, GT, GG, GC, CA, CT, CG, CC, NA, NT, NG, NC, Ai, Ti, Gi, Ci, Ni, dA, dT, dG, dC; + string refName, queryName; + double errorRate; + int weight, matches, mismatches, total; + + Compare(){ + AA=0; AT=0; AG=0; AC=0; + TA=0; TT=0; TG=0; TC=0; + GA=0; GT=0; GG=0; GC=0; + CA=0; CT=0; CG=0; CC=0; + NA=0; NT=0; NG=0; NC=0; + Ai=0; Ti=0; Gi=0; Ci=0; Ni=0; + dA=0; dT=0; dG=0; dC=0; + refName = ""; + queryName = ""; + weight = 1; + matches = 0; + mismatches = 0; + total = 0; + errorRate = 1.0000; + } +}; + +class SeqErrorCommand : public Command { +public: + SeqErrorCommand(string); + ~SeqErrorCommand(); + int execute(); + void help(); + +private: + bool abort; + + void getReferences(); + map getWeights(); + Compare getErrors(Sequence, Sequence); + void printErrorHeader(); + void printErrorData(Compare); + + string queryFileName, referenceFileName, namesFileName, errorFileName, outputDir; + double threshold; + int numRefs; + ofstream errorFile; + + vector referenceSeqs; +}; + +#endif diff --git a/sequence.cpp b/sequence.cpp index b9e1c4b..38e9b62 100644 --- a/sequence.cpp +++ b/sequence.cpp @@ -480,6 +480,18 @@ int Sequence::getAmbigBases(){ //******************************************************************************************************************** +void Sequence::removeAmbigBases(){ + + for(int j=0;j