]> git.donarmstrong.com Git - mothur.git/commitdiff
modified trim.seqs to account for primers of different lengths
authorwestcott <westcott>
Fri, 7 May 2010 16:45:56 +0000 (16:45 +0000)
committerwestcott <westcott>
Fri, 7 May 2010 16:45:56 +0000 (16:45 +0000)
Mothur.xcodeproj/project.pbxproj
makegroupcommand.cpp [new file with mode: 0644]
makegroupcommand.h [new file with mode: 0644]
trimseqscommand.cpp
trimseqscommand.h

index 2084dc5fffb240afa30d8f790448006fc0b0d814..eb9edec42e9c830b0e25c4da9464d7e0831623d1 100644 (file)
@@ -7,6 +7,8 @@
        objects = {
 
 /* Begin PBXFileReference section */
+               A703FE931194645F002C397E /* makegroupcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = makegroupcommand.h; sourceTree = SOURCE_ROOT; };
+               A703FE941194645F002C397E /* makegroupcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = makegroupcommand.cpp; sourceTree = SOURCE_ROOT; };
                A72B3A62118B37FD004B9F8D /* phylodiversitycommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = phylodiversitycommand.h; sourceTree = "<group>"; };
                A72B3A63118B37FD004B9F8D /* phylodiversitycommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = phylodiversitycommand.cpp; sourceTree = "<group>"; };
                A72B3A7B118B4D1B004B9F8D /* phylodiversity.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = phylodiversity.h; sourceTree = "<group>"; };
                                A7DA207F113FECD400BF472F /* helpcommand.h */,
                                A7DA208D113FECD400BF472F /* libshuffcommand.cpp */,
                                A7DA208E113FECD400BF472F /* libshuffcommand.h */,
-                               A7DA208F113FECD400BF472F /* listseqscommand.cpp */,
                                A7DA2090113FECD400BF472F /* listseqscommand.h */,
+                               A7DA208F113FECD400BF472F /* listseqscommand.cpp */,
+                               A703FE931194645F002C397E /* makegroupcommand.h */,
+                               A703FE941194645F002C397E /* makegroupcommand.cpp */,
                                A7DA2098113FECD400BF472F /* matrixoutputcommand.cpp */,
                                A7DA2099113FECD400BF472F /* matrixoutputcommand.h */,
                                A7DA209A113FECD400BF472F /* mergefilecommand.cpp */,
diff --git a/makegroupcommand.cpp b/makegroupcommand.cpp
new file mode 100644 (file)
index 0000000..24c2de2
--- /dev/null
@@ -0,0 +1,142 @@
+/*
+ *  makegroupcommand.cpp
+ *  Mothur
+ *
+ *  Created by westcott on 5/7/10.
+ *  Copyright 2010 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "makegroupcommand.h"
+
+//**********************************************************************************************************************
+
+MakeGroupCommand::MakeGroupCommand(string option)  {
+       try {
+               
+               abort = false;
+       
+               //allow user to run help
+               if(option == "help") { help(); abort = true; }
+               
+               else {
+                       
+                       //valid paramters for this command
+                       string AlignArray[] =  {"fasta","groups","outputdir","inputdir"};
+                       vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
+                       
+                       OptionParser parser(option);
+                       map<string, string> parameters = parser.getParameters(); 
+                       
+                       ValidParameters validParameter;
+                       map<string, string>::iterator it;
+                       
+                       //check to make sure all parameters are valid for command
+                       for (it = parameters.begin(); it != parameters.end(); it++) { 
+                               if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
+                       }
+
+                       //if the user changes the output directory command factory will send this info to us in the output parameter 
+                       outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
+                       
+
+                       //if the user changes the input directory command factory will send this info to us in the output parameter 
+                       string inputDir = validParameter.validFile(parameters, "inputdir", false);              
+                       if (inputDir == "not found"){   inputDir = "";          }
+                       
+
+                       fastaFileName = validParameter.validFile(parameters, "fasta", false);
+                       if (fastaFileName == "not found") { m->mothurOut("fasta is a required parameter for the make.group command."); m->mothurOutEndLine(); abort = true;  }
+                       else { 
+                               splitAtDash(fastaFileName, fastaFileNames);
+                               
+                               //go through files and make sure they are good, if not, then disregard them
+                               for (int i = 0; i < fastaFileNames.size(); i++) {
+                                       if (inputDir != "") {
+                                               string path = hasPath(fastaFileNames[i]);
+                                               //if the user has not given a path then, add inputdir. else leave path alone.
+                                               if (path == "") {       fastaFileNames[i] = inputDir + fastaFileNames[i];               }
+                                       }
+       
+                                       int ableToOpen;
+                                       ifstream in;
+                                       
+                                       ableToOpen = openInputFile(fastaFileNames[i], in);
+                                       in.close();
+
+                                       if (ableToOpen == 1) { 
+                                               m->mothurOut(fastaFileNames[i] + " will be disregarded."); m->mothurOutEndLine(); 
+                                               //erase from file list
+                                               fastaFileNames.erase(fastaFileNames.begin()+i);
+                                               i--;
+                                       }
+                                       
+                               }
+                               
+                               //make sure there is at least one valid file left
+                               if (fastaFileNames.size() == 0) { m->mothurOut("no valid files."); m->mothurOutEndLine(); abort = true; }
+                       }
+               
+               }
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "MakeGroupCommand", "MakeGroupCommand");
+               exit(1);
+       }
+}
+
+//**********************************************************************************************************************
+
+MakeGroupCommand::~MakeGroupCommand(){ }
+
+//**********************************************************************************************************************
+
+void MakeGroupCommand::help(){
+       try {
+               m->mothurOut("The align.seqs command reads a file containing sequences and creates an alignment file and a report file.\n");
+               m->mothurOut("The align.seqs command parameters are template, candidate, search, ksize, align, match, mismatch, gapopen and gapextend.\n");
+               m->mothurOut("The template and candidate parameters are required. You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amzon.fasta \n");
+               m->mothurOut("The search parameter allows you to specify the method to find most similar template.  Your options are: suffix, kmer and blast. The default is kmer.\n");
+               m->mothurOut("The align parameter allows you to specify the alignment method to use.  Your options are: gotoh, needleman, blast and noalign. The default is needleman.\n");
+               m->mothurOut("The ksize parameter allows you to specify the kmer size for finding most similar template to candidate.  The default is 8.\n");
+               m->mothurOut("The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n");
+               m->mothurOut("The mistmatch parameter allows you to specify the penalty for having different bases.  The default is -1.0.\n");
+               m->mothurOut("The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -2.0.\n");
+               m->mothurOut("The gapextend parameter allows you to specify the penalty for extending a gap in an alignment.  The default is -1.0.\n");
+               m->mothurOut("The flip parameter is used to specify whether or not you want mothur to try the reverse complement if a sequence falls below the threshold.  The default is false.\n");
+               m->mothurOut("The threshold is used to specify a cutoff at which an alignment is deemed 'bad' and the reverse complement may be tried. The default threshold is 0.50, meaning 50% of the bases are removed in the alignment.\n");
+               m->mothurOut("If the flip parameter is set to true the reverse complement of the sequence is aligned and the better alignment is reported.\n");
+               m->mothurOut("The default for the threshold parameter is 0.50, meaning at least 50% of the bases must remain or the sequence is reported as potentially reversed.\n");
+               m->mothurOut("The align.seqs command should be in the following format: \n");
+               m->mothurOut("align.seqs(template=yourTemplateFile, candidate=yourCandidateFile, align=yourAlignmentMethod, search=yourSearchmethod, ksize=yourKmerSize, match=yourMatchBonus, mismatch=yourMismatchpenalty, gapopen=yourGapopenPenalty, gapextend=yourGapExtendPenalty) \n");
+               m->mothurOut("Example align.seqs(candidate=candidate.fasta, template=core.filtered, align=kmer, search=gotoh, ksize=8, match=2.0, mismatch=3.0, gapopen=-2.0, gapextend=-1.0)\n");
+               m->mothurOut("Note: No spaces between parameter labels (i.e. candidate), '=' and parameters (i.e.yourFastaFile).\n\n");
+       }
+       catch(exception& e) {
+               m->errorOut(e, "MakeGroupCommand", "help");
+               exit(1);
+       }
+}
+
+
+//**********************************************************************************************************************
+
+int MakeGroupCommand::execute(){
+       try {
+               if (abort == true) {    return 0;       }
+               
+               
+               m->mothurOutEndLine();
+               m->mothurOut("Output File Names: "); m->mothurOutEndLine();
+               //for (int i = 0; i < outputNames.size(); i++) {        m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
+               m->mothurOutEndLine();
+
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "MakeGroupCommand", "execute");
+               exit(1);
+       }
+}
+
diff --git a/makegroupcommand.h b/makegroupcommand.h
new file mode 100644 (file)
index 0000000..76435bf
--- /dev/null
@@ -0,0 +1,33 @@
+#ifndef MAKEGROUPCOMMAND_H
+#define MAKEGROUPCOMMAND_H
+
+/*
+ *  makegroupcommand.h
+ *  Mothur
+ *
+ *  Created by westcott on 5/7/10.
+ *  Copyright 2010 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "command.hpp"
+
+class MakeGroupCommand : public Command {
+       
+public:
+       MakeGroupCommand(string);       
+       ~MakeGroupCommand();
+       int execute(); 
+       void help();    
+       
+private:
+               
+       string fastaFileName, groups, outputDir;
+       vector<string> fastaFileNames;
+       vector<string> groupsNames;
+       
+       bool abort;
+};
+
+#endif
+
index 06bd0eda506882346f98bd834edcfb74e7f149b9..b12da1cfe126bbdfc562172772093cf84e733457 100644 (file)
@@ -23,7 +23,7 @@ TrimSeqsCommand::TrimSeqsCommand(string option)  {
                else {
                        //valid paramters for this command
                        string AlignArray[] =  {"fasta", "flip", "oligos", "maxambig", "maxhomop", "minlength", "maxlength", "qfile", 
-                                                                       "qthreshold", "qaverage", "allfiles", "qtrim","diffs", "processors", "outputdir","inputdir"};
+                                                                       "qthreshold", "qaverage", "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"};
                        
                        vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
                        
@@ -104,8 +104,14 @@ TrimSeqsCommand::TrimSeqsCommand(string option)  {
                        temp = validParameter.validFile(parameters, "maxlength", false);        if (temp == "not found") { temp = "0"; }
                        convert(temp, maxLength);
                        
-                       temp = validParameter.validFile(parameters, "diffs", false);            if (temp == "not found") { temp = "0"; }
-                       convert(temp, diffs);
+                       temp = validParameter.validFile(parameters, "tdiffs", false);           if (temp == "not found") { temp = "0"; }
+                       convert(temp, tdiffs);
+                       
+                       temp = validParameter.validFile(parameters, "bdiffs", false);           if (temp == "not found") { temp = "0"; }
+                       convert(temp, bdiffs);
+                       
+                       temp = validParameter.validFile(parameters, "pdiffs", false);           if (temp == "not found") { temp = "0"; }
+                       convert(temp, pdiffs);
                        
                        temp = validParameter.validFile(parameters, "qfile", true);     
                        if (temp == "not found")        {       qFileName = "";         }
@@ -160,7 +166,9 @@ void TrimSeqsCommand::help(){
                m->mothurOut("The maxhomop parameter .... The default is 0.\n");
                m->mothurOut("The minlength parameter .... The default is 0.\n");
                m->mothurOut("The maxlength parameter .... The default is 0.\n");
-               m->mothurOut("The diffs parameter .... The default is 0.\n");
+               m->mothurOut("The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is 0.\n");
+               m->mothurOut("The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n");
+               m->mothurOut("The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n");
                m->mothurOut("The qfile parameter .....\n");
                m->mothurOut("The qthreshold parameter .... The default is 0.\n");
                m->mothurOut("The qaverage parameter .... The default is 0.\n");
@@ -357,6 +365,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
                        if (origSeq != "") {
                                int group;
                                string trashCode = "";
+                               int currentSeqsDiffs = 0;
                                
                                if(qFileName != ""){
                                        if(qThreshold != 0)             {       success = stripQualThreshold(currSeq, qFile);   }
@@ -370,13 +379,17 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
                                if(barcodes.size() != 0){
                                        success = stripBarcode(currSeq, group);
                                        if(!success){   trashCode += 'b';       }
+                                       else{ currentSeqsDiffs += currentSeqsTdiffs;  }
                                }
                        
                                if(numFPrimers != 0){
                                        success = stripForward(currSeq);
                                        if(!success){   trashCode += 'f';       }
+                                       else{ currentSeqsDiffs += currentSeqsTdiffs;  }
                                }
-                                       
+                               
+                               if (currentSeqsDiffs > tdiffs) { trashCode += 't';   }
+
                                if(numRPrimers != 0){
                                        success = stripReverse(currSeq);
                                        if(!success){   trashCode += 'r';       }
@@ -609,7 +622,7 @@ bool TrimSeqsCommand::stripBarcode(Sequence& seq, int& group){
                }
                
                //if you found the barcode or if you don't want to allow for diffs
-               if ((diffs == 0) || (success == 1)) { return success;  }
+               if ((bdiffs == 0) || (success == 1)) { return success;  }
                
                else { //try aligning and see if you can find it
                        
@@ -618,7 +631,7 @@ bool TrimSeqsCommand::stripBarcode(Sequence& seq, int& group){
                                map<string,int>::iterator it=barcodes.begin();
                                string temp = it->first;
                                
-                               alignment = new NeedlemanOverlap(-2.0, 1.0, -1.0, (temp.length()+diffs+1));  
+                               alignment = new NeedlemanOverlap(-2.0, 1.0, -1.0, (temp.length()+bdiffs+1));  
                        }else{ alignment = NULL; } 
                        
 
@@ -633,13 +646,13 @@ bool TrimSeqsCommand::stripBarcode(Sequence& seq, int& group){
                                }
                                
                                //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
-                               alignment->align(oligo, rawSequence.substr(0,length+diffs));
+                               alignment->align(oligo, rawSequence.substr(0,length+bdiffs));
                                oligo = alignment->getSeqAAln();
                                string temp = alignment->getSeqBAln();
                //cout << "barcode = " << oligo << " raw = " << rawSequence.substr(0,oligo.length()) << " raw aligned = " << temp << endl;                      
                                
                                int newStart=0;
-                               if(compareDNASeq(oligo, temp, length, newStart)){
+                               if(compareDNASeq(oligo, temp, length, newStart, bdiffs)){
                                        group = it->second;
                                        seq.setUnaligned(rawSequence.substr(newStart));
                                        success = 1;
@@ -682,32 +695,33 @@ bool TrimSeqsCommand::stripForward(Sequence& seq){
                }
                
                //if you found the primer or if you don't want to allow for diffs
-               if ((diffs == 0) || (success == 1)) { return success;  }
+               if ((pdiffs == 0) || (success == 1)) { return success;  }
                
                else { //try aligning and see if you can find it
                        
-                       
                        Alignment* alignment;
-                       if (numFPrimers > 0) { alignment = new NeedlemanOverlap(-2.0, 1.0, -1.0, (forPrimer[0].length()+diffs+1));  } //assumes primers are all the same length
+                       if (numFPrimers > 0) {  alignment = new NeedlemanOverlap(-2.0, 1.0, -1.0, (forPrimer[0].length()+pdiffs+1));  } 
                        else{ alignment = NULL; } 
-                       
                        //can you find the primer
                        for(int i=0;i<numFPrimers;i++){
                                string oligo = forPrimer[i];
                                int length = oligo.length();
-                               
+                       
                                if(rawSequence.length() < oligo.length()){      
                                        success = 0;
                                        break;
                                }
+                       
+                               //resize if neccessary
+                               if ((length+pdiffs+1) > alignment->getnRows()) { alignment->resize(length+pdiffs+1);    }
                                
                                //use needleman to align first primer.length()+numdiffs of sequence to each primer
-                               alignment->align(oligo, rawSequence.substr(0,length+diffs));
+                               alignment->align(oligo, rawSequence.substr(0,length+pdiffs));
                                oligo = alignment->getSeqAAln();
                                string temp = alignment->getSeqBAln();
-                               
+                       
                                int newStart = 0;
-                               if(compareDNASeq(oligo, temp, length, newStart)){
+                               if(compareDNASeq(oligo, temp, length, newStart, pdiffs)){
                                        seq.setUnaligned(rawSequence.substr(newStart));
                                        success = 1;
                                        break;
@@ -856,24 +870,22 @@ bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){
 }
 //***************************************************************************************************************
 
-bool TrimSeqsCommand::compareDNASeq(string oligo, string seq, int numBases, int& end){
+bool TrimSeqsCommand::compareDNASeq(string oligo, string seq, int numBases, int& end, int diffs){
        try {
                bool success = 1;
                int length = oligo.length();
-               end = length;
+               end = numBases;
                int countBases = 0;
                int countDiffs = 0;
                
                if (length != 0) {
-                       if ((oligo[0] == '-') || (oligo[0] == '.')) {  success = 0; return success;  } //no gaps allowed at beginning
+                       if ((oligo[0] == '-') || (oligo[0] == '.')) {  success = 0;  return success;  } //no gaps allowed at beginning
                }
                
                for(int i=0;i<length;i++){
                        
                        if ((oligo[i] != '-') && (oligo[i] != '.'))  { countBases++; } 
-                       
-       //cout << oligo[i] << " " << seq[i] << " diffs = " << countDiffs << " countBases = " << countBases << endl;     
-               
+                                       
                        if(oligo[i] != seq[i]){
                                if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.')      {       countDiffs++;   }
                                else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       countDiffs++;   }
@@ -894,9 +906,12 @@ bool TrimSeqsCommand::compareDNASeq(string oligo, string seq, int numBases, int&
                                success = 1;
                        }
                        
-                       if (countBases >= numBases) { end = i; break; } //stop checking after end of barcode or primer
+                       if (countBases >= numBases) { end = countBases; break; } //stop checking after end of barcode or primer
                }
-       
+               
+               //if it's a success we want to check for total diffs in driver, so save it.
+               if (success == 1) {  currentSeqsTdiffs = countDiffs; }
+               
                return success;
        }
        catch(exception& e) {
index 4a1d3d72c4f78e7c81fc22d17d996c68ff093984..f9a251c9323a9388af285ce6933fd5b676fe3838 100644 (file)
@@ -39,13 +39,13 @@ private:
        bool cullHomoP(Sequence&);
        bool cullAmbigs(Sequence&);
        bool compareDNASeq(string, string);
-       bool compareDNASeq(string, string, int, int&);
+       bool compareDNASeq(string, string, int, int&, int);
 
        bool abort;
        string fastaFile, oligoFile, qFileName, outputDir;
        
        bool flip, allFiles, qtrim;
-       int numFPrimers, numRPrimers, maxAmbig, maxHomoP, minLength, maxLength, qThreshold, qAverage, processors, diffs;
+       int numFPrimers, numRPrimers, maxAmbig, maxHomoP, minLength, maxLength, qThreshold, qAverage, processors, tdiffs, bdiffs, pdiffs, currentSeqsTdiffs;
        vector<string> forPrimer, revPrimer, outputNames;
        map<string, int> barcodes;
        vector<string> groupVector;