From 17aafaea968f87e581297063b16695ad515bea53 Mon Sep 17 00:00:00 2001 From: westcott Date: Fri, 7 May 2010 16:45:56 +0000 Subject: [PATCH] modified trim.seqs to account for primers of different lengths --- Mothur.xcodeproj/project.pbxproj | 6 +- makegroupcommand.cpp | 142 +++++++++++++++++++++++++++++++ makegroupcommand.h | 33 +++++++ trimseqscommand.cpp | 65 ++++++++------ trimseqscommand.h | 4 +- 5 files changed, 222 insertions(+), 28 deletions(-) create mode 100644 makegroupcommand.cpp create mode 100644 makegroupcommand.h diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index 2084dc5..eb9edec 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -7,6 +7,8 @@ objects = { /* Begin PBXFileReference section */ + A703FE931194645F002C397E /* makegroupcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = makegroupcommand.h; sourceTree = SOURCE_ROOT; }; + A703FE941194645F002C397E /* makegroupcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = makegroupcommand.cpp; sourceTree = SOURCE_ROOT; }; A72B3A62118B37FD004B9F8D /* phylodiversitycommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = phylodiversitycommand.h; sourceTree = ""; }; A72B3A63118B37FD004B9F8D /* phylodiversitycommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = phylodiversitycommand.cpp; sourceTree = ""; }; A72B3A7B118B4D1B004B9F8D /* phylodiversity.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = phylodiversity.h; sourceTree = ""; }; @@ -694,8 +696,10 @@ A7DA207F113FECD400BF472F /* helpcommand.h */, A7DA208D113FECD400BF472F /* libshuffcommand.cpp */, A7DA208E113FECD400BF472F /* libshuffcommand.h */, - A7DA208F113FECD400BF472F /* listseqscommand.cpp */, A7DA2090113FECD400BF472F /* listseqscommand.h */, + A7DA208F113FECD400BF472F /* listseqscommand.cpp */, + A703FE931194645F002C397E /* makegroupcommand.h */, + A703FE941194645F002C397E /* makegroupcommand.cpp */, A7DA2098113FECD400BF472F /* matrixoutputcommand.cpp */, A7DA2099113FECD400BF472F /* matrixoutputcommand.h */, A7DA209A113FECD400BF472F /* mergefilecommand.cpp */, diff --git a/makegroupcommand.cpp b/makegroupcommand.cpp new file mode 100644 index 0000000..24c2de2 --- /dev/null +++ b/makegroupcommand.cpp @@ -0,0 +1,142 @@ +/* + * makegroupcommand.cpp + * Mothur + * + * Created by westcott on 5/7/10. + * Copyright 2010 Schloss Lab. All rights reserved. + * + */ + +#include "makegroupcommand.h" + +//********************************************************************************************************************** + +MakeGroupCommand::MakeGroupCommand(string option) { + try { + + abort = false; + + //allow user to run help + if(option == "help") { help(); abort = true; } + + else { + + //valid paramters for this command + string AlignArray[] = {"fasta","groups","outputdir","inputdir"}; + vector myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string))); + + OptionParser parser(option); + map parameters = parser.getParameters(); + + ValidParameters validParameter; + map::iterator it; + + //check to make sure all parameters are valid for command + for (it = parameters.begin(); it != parameters.end(); it++) { + if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } + } + + //if the user changes the output directory command factory will send this info to us in the output parameter + outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } + + + //if the user changes the input directory command factory will send this info to us in the output parameter + string inputDir = validParameter.validFile(parameters, "inputdir", false); + if (inputDir == "not found"){ inputDir = ""; } + + + fastaFileName = validParameter.validFile(parameters, "fasta", false); + if (fastaFileName == "not found") { m->mothurOut("fasta is a required parameter for the make.group command."); m->mothurOutEndLine(); abort = true; } + else { + splitAtDash(fastaFileName, fastaFileNames); + + //go through files and make sure they are good, if not, then disregard them + for (int i = 0; i < fastaFileNames.size(); i++) { + if (inputDir != "") { + string path = hasPath(fastaFileNames[i]); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { fastaFileNames[i] = inputDir + fastaFileNames[i]; } + } + + int ableToOpen; + ifstream in; + + ableToOpen = openInputFile(fastaFileNames[i], in); + in.close(); + + if (ableToOpen == 1) { + m->mothurOut(fastaFileNames[i] + " will be disregarded."); m->mothurOutEndLine(); + //erase from file list + fastaFileNames.erase(fastaFileNames.begin()+i); + i--; + } + + } + + //make sure there is at least one valid file left + if (fastaFileNames.size() == 0) { m->mothurOut("no valid files."); m->mothurOutEndLine(); abort = true; } + } + + } + + } + catch(exception& e) { + m->errorOut(e, "MakeGroupCommand", "MakeGroupCommand"); + exit(1); + } +} + +//********************************************************************************************************************** + +MakeGroupCommand::~MakeGroupCommand(){ } + +//********************************************************************************************************************** + +void MakeGroupCommand::help(){ + try { + m->mothurOut("The align.seqs command reads a file containing sequences and creates an alignment file and a report file.\n"); + m->mothurOut("The align.seqs command parameters are template, candidate, search, ksize, align, match, mismatch, gapopen and gapextend.\n"); + m->mothurOut("The template and candidate parameters are required. You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amzon.fasta \n"); + m->mothurOut("The search parameter allows you to specify the method to find most similar template. Your options are: suffix, kmer and blast. The default is kmer.\n"); + m->mothurOut("The align parameter allows you to specify the alignment method to use. Your options are: gotoh, needleman, blast and noalign. The default is needleman.\n"); + m->mothurOut("The ksize parameter allows you to specify the kmer size for finding most similar template to candidate. The default is 8.\n"); + m->mothurOut("The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n"); + m->mothurOut("The mistmatch parameter allows you to specify the penalty for having different bases. The default is -1.0.\n"); + m->mothurOut("The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -2.0.\n"); + m->mothurOut("The gapextend parameter allows you to specify the penalty for extending a gap in an alignment. The default is -1.0.\n"); + m->mothurOut("The flip parameter is used to specify whether or not you want mothur to try the reverse complement if a sequence falls below the threshold. The default is false.\n"); + m->mothurOut("The threshold is used to specify a cutoff at which an alignment is deemed 'bad' and the reverse complement may be tried. The default threshold is 0.50, meaning 50% of the bases are removed in the alignment.\n"); + m->mothurOut("If the flip parameter is set to true the reverse complement of the sequence is aligned and the better alignment is reported.\n"); + m->mothurOut("The default for the threshold parameter is 0.50, meaning at least 50% of the bases must remain or the sequence is reported as potentially reversed.\n"); + m->mothurOut("The align.seqs command should be in the following format: \n"); + m->mothurOut("align.seqs(template=yourTemplateFile, candidate=yourCandidateFile, align=yourAlignmentMethod, search=yourSearchmethod, ksize=yourKmerSize, match=yourMatchBonus, mismatch=yourMismatchpenalty, gapopen=yourGapopenPenalty, gapextend=yourGapExtendPenalty) \n"); + m->mothurOut("Example align.seqs(candidate=candidate.fasta, template=core.filtered, align=kmer, search=gotoh, ksize=8, match=2.0, mismatch=3.0, gapopen=-2.0, gapextend=-1.0)\n"); + m->mothurOut("Note: No spaces between parameter labels (i.e. candidate), '=' and parameters (i.e.yourFastaFile).\n\n"); + } + catch(exception& e) { + m->errorOut(e, "MakeGroupCommand", "help"); + exit(1); + } +} + + +//********************************************************************************************************************** + +int MakeGroupCommand::execute(){ + try { + if (abort == true) { return 0; } + + + m->mothurOutEndLine(); + m->mothurOut("Output File Names: "); m->mothurOutEndLine(); + //for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } + m->mothurOutEndLine(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "MakeGroupCommand", "execute"); + exit(1); + } +} + diff --git a/makegroupcommand.h b/makegroupcommand.h new file mode 100644 index 0000000..76435bf --- /dev/null +++ b/makegroupcommand.h @@ -0,0 +1,33 @@ +#ifndef MAKEGROUPCOMMAND_H +#define MAKEGROUPCOMMAND_H + +/* + * makegroupcommand.h + * Mothur + * + * Created by westcott on 5/7/10. + * Copyright 2010 Schloss Lab. All rights reserved. + * + */ + +#include "command.hpp" + +class MakeGroupCommand : public Command { + +public: + MakeGroupCommand(string); + ~MakeGroupCommand(); + int execute(); + void help(); + +private: + + string fastaFileName, groups, outputDir; + vector fastaFileNames; + vector groupsNames; + + bool abort; +}; + +#endif + diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp index 06bd0ed..b12da1c 100644 --- a/trimseqscommand.cpp +++ b/trimseqscommand.cpp @@ -23,7 +23,7 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { else { //valid paramters for this command string AlignArray[] = {"fasta", "flip", "oligos", "maxambig", "maxhomop", "minlength", "maxlength", "qfile", - "qthreshold", "qaverage", "allfiles", "qtrim","diffs", "processors", "outputdir","inputdir"}; + "qthreshold", "qaverage", "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"}; vector myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string))); @@ -104,8 +104,14 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "0"; } convert(temp, maxLength); - temp = validParameter.validFile(parameters, "diffs", false); if (temp == "not found") { temp = "0"; } - convert(temp, diffs); + temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { temp = "0"; } + convert(temp, tdiffs); + + temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found") { temp = "0"; } + convert(temp, bdiffs); + + temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; } + convert(temp, pdiffs); temp = validParameter.validFile(parameters, "qfile", true); if (temp == "not found") { qFileName = ""; } @@ -160,7 +166,9 @@ void TrimSeqsCommand::help(){ m->mothurOut("The maxhomop parameter .... The default is 0.\n"); m->mothurOut("The minlength parameter .... The default is 0.\n"); m->mothurOut("The maxlength parameter .... The default is 0.\n"); - m->mothurOut("The diffs parameter .... The default is 0.\n"); + m->mothurOut("The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is 0.\n"); + m->mothurOut("The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n"); + m->mothurOut("The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n"); m->mothurOut("The qfile parameter .....\n"); m->mothurOut("The qthreshold parameter .... The default is 0.\n"); m->mothurOut("The qaverage parameter .... The default is 0.\n"); @@ -357,6 +365,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string if (origSeq != "") { int group; string trashCode = ""; + int currentSeqsDiffs = 0; if(qFileName != ""){ if(qThreshold != 0) { success = stripQualThreshold(currSeq, qFile); } @@ -370,13 +379,17 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string if(barcodes.size() != 0){ success = stripBarcode(currSeq, group); if(!success){ trashCode += 'b'; } + else{ currentSeqsDiffs += currentSeqsTdiffs; } } if(numFPrimers != 0){ success = stripForward(currSeq); if(!success){ trashCode += 'f'; } + else{ currentSeqsDiffs += currentSeqsTdiffs; } } - + + if (currentSeqsDiffs > tdiffs) { trashCode += 't'; } + if(numRPrimers != 0){ success = stripReverse(currSeq); if(!success){ trashCode += 'r'; } @@ -609,7 +622,7 @@ bool TrimSeqsCommand::stripBarcode(Sequence& seq, int& group){ } //if you found the barcode or if you don't want to allow for diffs - if ((diffs == 0) || (success == 1)) { return success; } + if ((bdiffs == 0) || (success == 1)) { return success; } else { //try aligning and see if you can find it @@ -618,7 +631,7 @@ bool TrimSeqsCommand::stripBarcode(Sequence& seq, int& group){ map::iterator it=barcodes.begin(); string temp = it->first; - alignment = new NeedlemanOverlap(-2.0, 1.0, -1.0, (temp.length()+diffs+1)); + alignment = new NeedlemanOverlap(-2.0, 1.0, -1.0, (temp.length()+bdiffs+1)); }else{ alignment = NULL; } @@ -633,13 +646,13 @@ bool TrimSeqsCommand::stripBarcode(Sequence& seq, int& group){ } //use needleman to align first barcode.length()+numdiffs of sequence to each barcode - alignment->align(oligo, rawSequence.substr(0,length+diffs)); + alignment->align(oligo, rawSequence.substr(0,length+bdiffs)); oligo = alignment->getSeqAAln(); string temp = alignment->getSeqBAln(); //cout << "barcode = " << oligo << " raw = " << rawSequence.substr(0,oligo.length()) << " raw aligned = " << temp << endl; int newStart=0; - if(compareDNASeq(oligo, temp, length, newStart)){ + if(compareDNASeq(oligo, temp, length, newStart, bdiffs)){ group = it->second; seq.setUnaligned(rawSequence.substr(newStart)); success = 1; @@ -682,32 +695,33 @@ bool TrimSeqsCommand::stripForward(Sequence& seq){ } //if you found the primer or if you don't want to allow for diffs - if ((diffs == 0) || (success == 1)) { return success; } + if ((pdiffs == 0) || (success == 1)) { return success; } else { //try aligning and see if you can find it - Alignment* alignment; - if (numFPrimers > 0) { alignment = new NeedlemanOverlap(-2.0, 1.0, -1.0, (forPrimer[0].length()+diffs+1)); } //assumes primers are all the same length + if (numFPrimers > 0) { alignment = new NeedlemanOverlap(-2.0, 1.0, -1.0, (forPrimer[0].length()+pdiffs+1)); } else{ alignment = NULL; } - //can you find the primer for(int i=0;i alignment->getnRows()) { alignment->resize(length+pdiffs+1); } //use needleman to align first primer.length()+numdiffs of sequence to each primer - alignment->align(oligo, rawSequence.substr(0,length+diffs)); + alignment->align(oligo, rawSequence.substr(0,length+pdiffs)); oligo = alignment->getSeqAAln(); string temp = alignment->getSeqBAln(); - + int newStart = 0; - if(compareDNASeq(oligo, temp, length, newStart)){ + if(compareDNASeq(oligo, temp, length, newStart, pdiffs)){ seq.setUnaligned(rawSequence.substr(newStart)); success = 1; break; @@ -856,24 +870,22 @@ bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){ } //*************************************************************************************************************** -bool TrimSeqsCommand::compareDNASeq(string oligo, string seq, int numBases, int& end){ +bool TrimSeqsCommand::compareDNASeq(string oligo, string seq, int numBases, int& end, int diffs){ try { bool success = 1; int length = oligo.length(); - end = length; + end = numBases; int countBases = 0; int countDiffs = 0; if (length != 0) { - if ((oligo[0] == '-') || (oligo[0] == '.')) { success = 0; return success; } //no gaps allowed at beginning + if ((oligo[0] == '-') || (oligo[0] == '.')) { success = 0; return success; } //no gaps allowed at beginning } for(int i=0;i