]> git.donarmstrong.com Git - mothur.git/commitdiff
pat's updates on 7/19/10
authorpschloss <pschloss>
Mon, 19 Jul 2010 14:57:36 +0000 (14:57 +0000)
committerpschloss <pschloss>
Mon, 19 Jul 2010 14:57:36 +0000 (14:57 +0000)
Mothur.xcodeproj/project.pbxproj
commandfactory.cpp
getseqscommand.cpp
sensspeccommand.cpp
seqerrorcommand.cpp [new file with mode: 0644]
seqerrorcommand.h [new file with mode: 0644]
sequence.cpp
sequence.hpp

index 81623b1dd64b8139d8c81aeb5a5c8c63930897a7..a3e57afdee5fb693f9dc9d96b3bf646982aff7d3 100644 (file)
@@ -7,6 +7,8 @@
        objects = {
 
 /* Begin PBXFileReference section */
+               7E84528511EF4BEB00564975 /* seqerrorcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = seqerrorcommand.h; sourceTree = "<group>"; };
+               7E84528611EF4BEB00564975 /* seqerrorcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = seqerrorcommand.cpp; sourceTree = "<group>"; };
                7E85BD1C11EB5E9B00FD37C0 /* qualityscores.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = qualityscores.h; sourceTree = "<group>"; };
                7E85BD1D11EB5E9B00FD37C0 /* qualityscores.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = qualityscores.cpp; sourceTree = "<group>"; };
                7EA299BA11E384940022D8D3 /* sensspeccommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sensspeccommand.h; sourceTree = "<group>"; };
                                A7DA2174113FECD400BF472F /* validparameter.h */,
                                A7DA2175113FECD400BF472F /* venn.cpp */,
                                A7DA2176113FECD400BF472F /* venn.h */,
+                               7E84528511EF4BEB00564975 /* seqerrorcommand.h */,
+                               7E84528611EF4BEB00564975 /* seqerrorcommand.cpp */,
                        );
                        name = mothur;
                        sourceTree = "<group>";
index fc20ee5abbcda080d7965c133199e9a86e4a74cb..f86a354fdcb2fd4c5fb0b744f921e70cdb45abbf 100644 (file)
@@ -84,6 +84,7 @@
 #include "getrelabundcommand.h"
 #include "sensspeccommand.h"
 #include "sffinfocommand.h"
+#include "seqerrorcommand.h"
 
 /*******************************************************/
 
@@ -187,6 +188,7 @@ CommandFactory::CommandFactory(){
        commands["summary.seqs"]                = "MPIEnabled";
        commands["cluster.split"]               = "MPIEnabled";
        commands["sens.spec"]                   = "sens.spec";
+       commands["seq.error"]                   = "seq.error";
        commands["quit"]                                = "MPIEnabled"; 
 
 }
@@ -302,6 +304,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString){
                else if(commandName == "degap.seqs")                    {       command = new DegapSeqsCommand(optionString);                           }
                else if(commandName == "get.relabund")                  {       command = new GetRelAbundCommand(optionString);                         }
                else if(commandName == "sens.spec")                             {       command = new SensSpecCommand(optionString);                            }
+               else if(commandName == "seq.error")                             {       command = new SeqErrorCommand(optionString);                            }
                else if(commandName == "sffinfo")                               {       command = new SffInfoCommand(optionString);                                     }
                else                                                                                    {       command = new NoCommand(optionString);                                          }
 
index 5e77a1d296b1d740f9ecd58209f555f33c8ac73f..ef0a1ac36de925af45e66c1c9420272272b32f2e 100644 (file)
@@ -22,7 +22,7 @@ GetSeqsCommand::GetSeqsCommand(string option)  {
                
                else {
                        //valid paramters for this command
-                       string Array[] =  {"fasta","name", "group", "alignreport", "accnos", "list","taxonomy","outputdir","inputdir"};
+                       string Array[] =  {"fasta","name", "group", "alignreport", "accnos", "dups", "list","taxonomy","outputdir","inputdir"};
                        vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
                        
                        OptionParser parser(option);
index edb1d452746191a4005e6dab0e1f70876e8b1216..3a147c5c2773df818388f8e899be80de8f7a0d17 100644 (file)
@@ -128,6 +128,7 @@ SensSpecCommand::SensSpecCommand(string option)  {
                exit(1);
        }
 }
+
 //**********************************************************************************************************************
 
 void SensSpecCommand::help(){
@@ -147,7 +148,6 @@ void SensSpecCommand::help(){
        }
 }
 
-
 //***************************************************************************************************************
 
 SensSpecCommand::~SensSpecCommand(){   /*      do nothing      */      }
diff --git a/seqerrorcommand.cpp b/seqerrorcommand.cpp
new file mode 100644 (file)
index 0000000..82af49e
--- /dev/null
@@ -0,0 +1,403 @@
+/*
+ *  seqerrorcommand.cpp
+ *  Mothur
+ *
+ *  Created by Pat Schloss on 7/15/10.
+ *  Copyright 2010 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "seqerrorcommand.h"
+
+//***************************************************************************************************************
+
+SeqErrorCommand::SeqErrorCommand(string option)  {
+       try {
+               
+               abort = false;
+               
+               //allow user to run help
+               if(option == "help") { help(); abort = true; }
+               
+               else {
+                       string temp;
+                       
+                       //valid paramters for this command
+                       string AlignArray[] =  {"query", "reference", "name", "threshold"};
+                       
+//need to implement name file option
+                       
+                       vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
+                       
+                       OptionParser parser(option);
+                       map<string,string> parameters = parser.getParameters();
+                       
+                       ValidParameters validParameter;
+                       map<string,string>::iterator it;
+                       
+                       //check to make sure all parameters are valid for command
+                       for (it = parameters.begin(); it != parameters.end(); it++) { 
+                               if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
+                       }
+                       
+                       //if the user changes the input directory command factory will send this info to us in the output parameter 
+                       string inputDir = validParameter.validFile(parameters, "inputdir", false);              
+                       if (inputDir == "not found"){   inputDir = "";          }
+                       else {
+                               string path;
+                               it = parameters.find("query");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["query"] = inputDir + it->second;            }
+                               }
+                               
+                               it = parameters.find("reference");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["reference"] = inputDir + it->second;                }
+                               }
+                               
+                               it = parameters.find("name");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["name"] = inputDir + it->second;             }
+                               }
+                               
+                       }
+                       //check for required parameters
+                       queryFileName = validParameter.validFile(parameters, "query", true);
+                       if (queryFileName == "not found") { m->mothurOut("query is a required parameter for the seq.error command."); m->mothurOutEndLine(); abort = true; }
+                       else if (queryFileName == "not open") { abort = true; } 
+                       
+                       referenceFileName = validParameter.validFile(parameters, "reference", true);
+                       if (referenceFileName == "not found") { m->mothurOut("reference is a required parameter for the seq.error command."); m->mothurOutEndLine(); abort = true; }
+                       else if (referenceFileName == "not open") { abort = true; }     
+                       
+                       //if the user changes the output directory command factory will send this info to us in the output parameter 
+                       namesFileName = validParameter.validFile(parameters, "name", true);
+                       if(namesFileName == "not found"){       namesFileName = "";     }
+                       cout << namesFileName << endl;
+                       
+                       outputDir = validParameter.validFile(parameters, "outputdir", false);
+                       if (outputDir == "not found"){  
+                               outputDir = ""; 
+                               outputDir += hasPath(queryFileName); //if user entered a file with a path then preserve it      
+                       }
+                       
+                       //check for optional parameter and set defaults
+                       // ...at some point should added some additional type checking...
+                       temp = validParameter.validFile(parameters, "threshold", false);        if (temp == "not found") { temp = "1.00"; }
+                       convert(temp, threshold);  
+                                               
+                       errorFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".errors";
+                       openOutputFile(errorFileName, errorFile);
+                       printErrorHeader();
+               }
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SeqErrorCommand", "SeqErrorCommand");
+               exit(1);
+       }
+}
+
+//**********************************************************************************************************************
+
+void SeqErrorCommand::help(){
+       try {
+               m->mothurOut("The seq.error command reads a query alignment file and a reference alignment file and creates .....\n");
+               
+               
+               
+               m->mothurOut("Example seq.error(...).\n");
+               m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n");
+               m->mothurOut("For more details please check out the wiki http://www.mothur.org/wiki/seq.error .\n\n");
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SeqErrorCommand", "help");
+               exit(1);
+       }
+}
+
+//***************************************************************************************************************
+
+SeqErrorCommand::~SeqErrorCommand(){   errorFile.close();      }
+
+//***************************************************************************************************************
+
+int SeqErrorCommand::execute(){
+       try{
+               if (abort == true) { return 0; }
+
+               getReferences();        //read in reference sequences - make sure there's no ambiguous bases
+
+               map<string, int> weights;
+               if(namesFileName != ""){        weights = getWeights(); }
+               
+               ifstream queryFile;
+               openInputFile(queryFileName, queryFile);
+                               
+               int totalBases = 0;
+               int totalMatches = 0;
+               
+               vector<int> misMatchCounts(11, 0);
+               int maxMismatch = 0;
+               int numSeqs = 0;
+               
+               map<string, int>::iterator it;
+               
+               while(queryFile){
+                       Compare minCompare;
+                       
+                       Sequence query(queryFile);
+                       
+                       for(int i=0;i<numRefs;i++){
+                               Compare currCompare = getErrors(query, referenceSeqs[i]);
+                               
+                               if(currCompare.errorRate < minCompare.errorRate){
+                                       minCompare = currCompare;
+                               }
+                               
+                       }
+
+                       if(namesFileName != ""){
+                               it = weights.find(query.getName());
+                               minCompare.weight = it->second;
+                       }
+                       else {
+                               minCompare.weight = 1;
+                       }
+
+                       printErrorData(minCompare);
+
+                       if(minCompare.errorRate < threshold){
+                               totalBases += (minCompare.total * minCompare.weight);
+                               totalMatches += minCompare.matches * minCompare.weight;
+                               if(minCompare.mismatches > maxMismatch){
+                                       maxMismatch = minCompare.mismatches;
+                                       misMatchCounts.resize(maxMismatch + 1, 0);
+                               }                               
+                               misMatchCounts[minCompare.mismatches] += minCompare.weight;
+                               numSeqs++;
+                       }
+                       
+               }
+               queryFile.close();
+               
+               int total = 0;
+               
+               
+               string errorCountFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".count";
+               ofstream errorCountFile;
+               openOutputFile(errorCountFileName, errorCountFile);
+               
+               m->mothurOut("Overall error rate:\t" + toString((double)(totalBases - totalMatches) / (double)totalBases) + "\n\n");
+               m->mothurOut("Errors\tSequences\n");
+               
+               errorCountFile << "Errors\tSequences\n";
+               
+               for(int i=0;i<misMatchCounts.size();i++){
+                       m->mothurOut(toString(i) + '\t' + toString(misMatchCounts[i]) + '\n');
+                       errorCountFile << i << '\t' << misMatchCounts[i] << endl;
+               }
+               
+               return 0;       
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SeqErrorCommand", "execute");
+               exit(1);
+       }
+}
+
+//***************************************************************************************************************
+
+void SeqErrorCommand::getReferences(){
+       try {
+               
+               ifstream referenceFile;
+               openInputFile(referenceFileName, referenceFile);
+               
+               while(referenceFile){
+                       Sequence currentSeq(referenceFile);
+                       int numAmbigs = currentSeq.getAmbigBases();
+                       
+                       if(numAmbigs != 0){
+                               m->mothurOut("Warning: " + toString(currentSeq.getName()) + " has " + toString(numAmbigs) + " ambiguous bases, these bases will be removed\n");
+                               currentSeq.removeAmbigBases();
+                       }
+                       referenceSeqs.push_back(currentSeq);
+                       gobble(referenceFile);
+               }
+               numRefs = referenceSeqs.size();
+               
+               referenceFile.close();
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SeqErrorCommand", "getReferences");
+               exit(1);
+       }
+}
+
+//***************************************************************************************************************
+
+Compare SeqErrorCommand::getErrors(Sequence query, Sequence reference){
+       try {
+               if(query.getAlignLength() != reference.getAlignLength()){
+                       m->mothurOut("Warning: " + toString(query.getName()) + " and " + toString(reference.getName()) + " are different lengths\n");
+               }
+               int alignLength = query.getAlignLength();
+       
+               string q = query.getAligned();
+               string r = reference.getAligned();
+
+               int started = 0;
+               Compare errors;
+
+               for(int i=0;i<alignLength;i++){
+                       if(q[i] != '.' && r[i] != '.' && (q[i] != '-' || r[i] != '-')){                 //      no missing data and no double gaps
+                               started = 1;
+                               
+                               if(q[i] == 'A'){
+                                       if(r[i] == 'A'){        errors.AA++;    errors.matches++;       }
+                                       if(r[i] == 'T'){        errors.AT++;    }
+                                       if(r[i] == 'G'){        errors.AG++;    }
+                                       if(r[i] == 'C'){        errors.AC++;    }
+                                       if(r[i] == '-'){        errors.Ai++;    }
+                               }
+                               else if(q[i] == 'T'){
+                                       if(r[i] == 'A'){        errors.TA++;    }
+                                       if(r[i] == 'T'){        errors.TT++;    errors.matches++;       }
+                                       if(r[i] == 'G'){        errors.TG++;    }
+                                       if(r[i] == 'C'){        errors.TC++;    }
+                                       if(r[i] == '-'){        errors.Ti++;    }
+                               }
+                               else if(q[i] == 'G'){
+                                       if(r[i] == 'A'){        errors.GA++;    }
+                                       if(r[i] == 'T'){        errors.GT++;    }
+                                       if(r[i] == 'G'){        errors.GG++;    errors.matches++;       }
+                                       if(r[i] == 'C'){        errors.GC++;    }
+                                       if(r[i] == '-'){        errors.Gi++;    }
+                               }
+                               else if(q[i] == 'C'){
+                                       if(r[i] == 'A'){        errors.CA++;    }
+                                       if(r[i] == 'T'){        errors.CT++;    }
+                                       if(r[i] == 'G'){        errors.CG++;    }
+                                       if(r[i] == 'C'){        errors.CC++;    errors.matches++;       }
+                                       if(r[i] == '-'){        errors.Ci++;    }
+                               }
+                               else if(q[i] == 'N'){
+                                       if(r[i] == 'A'){        errors.NA++;    }
+                                       if(r[i] == 'T'){        errors.NT++;    }
+                                       if(r[i] == 'G'){        errors.NG++;    }
+                                       if(r[i] == 'C'){        errors.NC++;    }
+                                       if(r[i] == '-'){        errors.Ni++;    }
+                               }
+                               else if(q[i] == '-' && r[i] != '-'){
+                                       if(r[i] == 'A'){        errors.dA++;    }
+                                       if(r[i] == 'T'){        errors.dT++;    }
+                                       if(r[i] == 'G'){        errors.dG++;    }
+                                       if(r[i] == 'C'){        errors.dC++;    }
+                               }
+                               errors.total++; 
+                               
+                       }
+                       else if(q[i] == '.' && r[i] != '.'){            //      reference extends beyond query
+                               if(started == 1){       break;  }
+                       }
+                       else if(q[i] != '.' && r[i] == '.'){            //      query extends beyond reference
+                               m->mothurOut("Warning: " + toString(query.getName()) + " extend beyond " + toString(reference.getName()) + ".  Ignoring the extra bases in the query\n");
+                               if(started == 1){       break;  }
+                       }
+                       else if(q[i] == '.' && r[i] == '.'){            //      both are missing data
+                               if(started == 1){       break;  }                       
+                       }
+                       
+               }
+               errors.mismatches = errors.total-errors.matches;
+               errors.errorRate = (double)(errors.total-errors.matches) / (double)errors.total;
+               errors.queryName = query.getName();
+               errors.refName = reference.getName();
+               
+               return errors;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SeqErrorCommand", "getErrors");
+               exit(1);
+       }
+}
+
+//***************************************************************************************************************
+
+map<string, int> SeqErrorCommand::getWeights(){
+       ifstream nameFile;
+       openInputFile(namesFileName, nameFile);
+       
+       string seqName;
+       string redundantSeqs;
+       map<string, int> nameCountMap;
+       
+       while(nameFile){
+               nameFile >> seqName >> redundantSeqs;
+               nameCountMap[seqName] = getNumNames(redundantSeqs); 
+               gobble(nameFile);
+       }
+       return nameCountMap;
+}
+
+
+//***************************************************************************************************************
+
+void SeqErrorCommand::printErrorHeader(){
+       try {
+               errorFile << "query\treference\tweight\t";
+               errorFile << "AA\tAT\tAG\tAC\tTA\tTT\tTG\tTC\tGA\tGT\tGG\tGC\tCA\tCT\tCG\tCC\tNA\tNT\tNG\tNC\tAi\tTi\tGi\tCi\tNi\tdA\tdT\tdG\tdC\t";
+               errorFile << "insertions\tdeletions\tsubstitutions\tambig\tmatches\tmismatches\ttotal\terror\n";
+               
+               errorFile << setprecision(6);
+               errorFile.setf(ios::fixed);
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SeqErrorCommand", "printErrorHeader");
+               exit(1);
+       }
+}
+
+//***************************************************************************************************************
+
+void SeqErrorCommand::printErrorData(Compare error){
+       try {
+               errorFile << error.queryName << '\t' << error.refName << '\t' << error.weight << '\t';
+               errorFile << error.AA << '\t' << error.AT << '\t' << error.AG << '\t' << error.AC << '\t';
+               errorFile << error.TA << '\t' << error.TT << '\t' << error.TG << '\t' << error.TC << '\t';
+               errorFile << error.GA << '\t' << error.GT << '\t' << error.GG << '\t' << error.GC << '\t';
+               errorFile << error.CA << '\t' << error.CT << '\t' << error.CG << '\t' << error.CC << '\t';
+               errorFile << error.NA << '\t' << error.NT << '\t' << error.NG << '\t' << error.NC << '\t';
+               errorFile << error.Ai << '\t' << error.Ti << '\t' << error.Gi << '\t' << error.Ci << '\t' << error.Ni << '\t' ;
+               errorFile << error.dA << '\t' << error.dT << '\t' << error.dG << '\t' << error.dC << '\t';
+               
+               errorFile << error.Ai + error.Ti + error.Gi + error.Ci << '\t';                 //insertions
+               errorFile << error.dA + error.dT + error.dG + error.dC << '\t';                 //deletions
+               errorFile << error.mismatches - (error.Ai + error.Ti + error.Gi + error.Ci) - (error.dA + error.dT + error.dG + error.dC) - (error.NA + error.NT + error.NG + error.NC + error.Ni) << '\t';     //substitutions
+               errorFile << error.NA + error.NT + error.NG + error.NC + error.Ni << '\t';      //ambiguities
+               errorFile << error.matches << '\t' << error.mismatches << '\t' << error.total << '\t' << error.errorRate << endl;
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SeqErrorCommand", "printErrorData");
+               exit(1);
+       }
+}
+
+//***************************************************************************************************************
+
+
+
+
+
+
+
diff --git a/seqerrorcommand.h b/seqerrorcommand.h
new file mode 100644 (file)
index 0000000..1eb2576
--- /dev/null
@@ -0,0 +1,65 @@
+#ifndef SEQERRORCOMMAND
+#define SEQERRORCOMMAND
+
+/*
+ *  seqerrorcommand.h
+ *  Mothur
+ *
+ *  Created by Pat Schloss on 7/15/10.
+ *  Copyright 2010 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "mothur.h"
+#include "command.hpp"
+#include "sequence.hpp"
+
+struct Compare {
+       int AA, AT, AG, AC,     TA, TT, TG, TC, GA, GT, GG, GC, CA, CT, CG, CC, NA, NT, NG, NC, Ai, Ti, Gi, Ci, Ni, dA, dT, dG, dC;
+       string refName, queryName;
+       double errorRate;
+       int weight, matches, mismatches, total;
+       
+       Compare(){
+               AA=0; AT=0; AG=0; AC=0;
+               TA=0; TT=0; TG=0; TC=0;
+               GA=0; GT=0; GG=0; GC=0;
+               CA=0; CT=0; CG=0; CC=0;
+               NA=0; NT=0; NG=0; NC=0;
+               Ai=0; Ti=0; Gi=0; Ci=0; Ni=0;
+               dA=0; dT=0; dG=0; dC=0;
+               refName = "";
+               queryName = "";
+               weight = 1;
+               matches = 0;
+               mismatches = 0;
+               total = 0;
+               errorRate = 1.0000;
+       }
+};
+
+class SeqErrorCommand : public Command {
+public:
+       SeqErrorCommand(string);
+       ~SeqErrorCommand();
+       int execute();
+       void help();
+       
+private:
+       bool abort;
+
+       void getReferences();
+       map<string,int> getWeights();
+       Compare getErrors(Sequence, Sequence);
+       void printErrorHeader();
+       void printErrorData(Compare);
+       
+       string queryFileName, referenceFileName, namesFileName, errorFileName, outputDir;
+       double threshold;
+       int numRefs;
+       ofstream errorFile;
+       
+       vector<Sequence> referenceSeqs;
+};
+
+#endif
index b9e1c4b3ffbc4951d23204b3b761a74708003edc..38e9b6230838424dd136562111f25967f6822099 100644 (file)
@@ -480,6 +480,18 @@ int Sequence::getAmbigBases(){
 
 //********************************************************************************************************************
 
+void Sequence::removeAmbigBases(){
+       
+       for(int j=0;j<alignmentLength;j++){
+               if(aligned[j] != 'A' && aligned[j] != 'T' && aligned[j] != 'G' && aligned[j] != 'C'){
+                       aligned[j] = '-';
+               }
+       }
+       setUnaligned(aligned);
+}
+       
+//********************************************************************************************************************
+
 int Sequence::getLongHomoPolymer(){
        if(longHomoPolymer == -1){
                longHomoPolymer = 1;
index 95c3e51839029393798dc2f57848d5e325c6c2df..c634f78b6522e43742800b0a4541d910eb7d6be3 100644 (file)
@@ -48,6 +48,7 @@ public:
        int getEndPos();
        int getAlignLength();
        int getAmbigBases();
+       void removeAmbigBases();
        int getLongHomoPolymer();
        bool getIsAligned();
        void printSequence(ostream&);