From: pschloss Date: Thu, 23 Dec 2010 21:37:11 +0000 (+0000) Subject: addition of trim.flows X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=d635b39347cd81943ea50de7b813a0a5d743b0c0 addition of trim.flows --- diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index 6f1e88d..febae68 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -9,6 +9,10 @@ /* Begin PBXFileReference section */ 7E13BDD112BE3FEE004B8A53 /* reportfile.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = reportfile.h; sourceTree = ""; }; 7E13BDD212BE3FEF004B8A53 /* reportfile.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = reportfile.cpp; sourceTree = ""; }; + 7E1CA7D712C2A384003F10B2 /* flowdata.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = flowdata.h; sourceTree = ""; }; + 7E1CA7D812C2A384003F10B2 /* flowdata.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = flowdata.cpp; sourceTree = ""; }; + 7E1CA7D912C2A425003F10B2 /* trimflowscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = trimflowscommand.h; sourceTree = ""; }; + 7E1CA7DA12C2A425003F10B2 /* trimflowscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = trimflowscommand.cpp; sourceTree = ""; }; 7E4EBD43122018FB00D85E7B /* simpsoneven.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = simpsoneven.h; sourceTree = ""; }; 7E4EBD44122018FB00D85E7B /* simpsoneven.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = simpsoneven.cpp; sourceTree = ""; }; 7E5B28DC121FEFCC0005339C /* shannoneven.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = shannoneven.h; sourceTree = ""; }; @@ -662,6 +666,8 @@ A7DA2176113FECD400BF472F /* venn.h */, 7E13BDD112BE3FEE004B8A53 /* reportfile.h */, 7E13BDD212BE3FEF004B8A53 /* reportfile.cpp */, + 7E1CA7D712C2A384003F10B2 /* flowdata.h */, + 7E1CA7D812C2A384003F10B2 /* flowdata.cpp */, ); name = mothur; sourceTree = ""; @@ -829,6 +835,8 @@ A7CB593E11402F110010EB83 /* commands */ = { isa = PBXGroup; children = ( + 7E1CA7D912C2A425003F10B2 /* trimflowscommand.h */, + 7E1CA7DA12C2A425003F10B2 /* trimflowscommand.cpp */, A7DA202B113FECD400BF472F /* command.hpp */, A7DA1FEF113FECD400BF472F /* aligncommand.h */, A7DA1FEE113FECD400BF472F /* aligncommand.cpp */, diff --git a/commandfactory.cpp b/commandfactory.cpp index f1a7105..5d9679a 100644 --- a/commandfactory.cpp +++ b/commandfactory.cpp @@ -103,6 +103,7 @@ #include "removeotuscommand.h" #include "indicatorcommand.h" #include "consensusseqscommand.h" +#include "trimflowscommand.h" #include "corraxescommand.h" /*******************************************************/ @@ -167,6 +168,7 @@ CommandFactory::CommandFactory(){ commands["help"] = "help"; commands["reverse.seqs"] = "reverse.seqs"; commands["trim.seqs"] = "trim.seqs"; + commands["trim.flows"] = "trim.flows"; commands["list.seqs"] = "list.seqs"; commands["get.seqs"] = "get.seqs"; commands["remove.seqs"] = "get.seqs"; @@ -310,6 +312,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString){ else if(commandName == "screen.seqs") { command = new ScreenSeqsCommand(optionString); } else if(commandName == "reverse.seqs") { command = new ReverseSeqsCommand(optionString); } else if(commandName == "trim.seqs") { command = new TrimSeqsCommand(optionString); } + else if(commandName == "trim.flows") { command = new TrimFlowsCommand(optionString); } else if(commandName == "chimera.seqs") { command = new ChimeraSeqsCommand(optionString); } else if(commandName == "list.seqs") { command = new ListSeqsCommand(optionString); } else if(commandName == "get.seqs") { command = new GetSeqsCommand(optionString); } @@ -433,6 +436,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString, str else if(commandName == "screen.seqs") { pipecommand = new ScreenSeqsCommand(optionString); } else if(commandName == "reverse.seqs") { pipecommand = new ReverseSeqsCommand(optionString); } else if(commandName == "trim.seqs") { pipecommand = new TrimSeqsCommand(optionString); } + else if(commandName == "trim.flows") { pipecommand = new TrimFlowsCommand(optionString); } else if(commandName == "chimera.seqs") { pipecommand = new ChimeraSeqsCommand(optionString); } else if(commandName == "list.seqs") { pipecommand = new ListSeqsCommand(optionString); } else if(commandName == "get.seqs") { pipecommand = new GetSeqsCommand(optionString); } @@ -543,6 +547,7 @@ Command* CommandFactory::getCommand(string commandName){ else if(commandName == "screen.seqs") { shellcommand = new ScreenSeqsCommand(); } else if(commandName == "reverse.seqs") { shellcommand = new ReverseSeqsCommand(); } else if(commandName == "trim.seqs") { shellcommand = new TrimSeqsCommand(); } + else if(commandName == "trim.flows") { shellcommand = new TrimFlowsCommand(); } else if(commandName == "chimera.seqs") { shellcommand = new ChimeraSeqsCommand(); } else if(commandName == "list.seqs") { shellcommand = new ListSeqsCommand(); } else if(commandName == "get.seqs") { shellcommand = new GetSeqsCommand(); } diff --git a/flowdata.cpp b/flowdata.cpp new file mode 100644 index 0000000..f40fdd6 --- /dev/null +++ b/flowdata.cpp @@ -0,0 +1,241 @@ +/* + * flowdata.cpp + * Mothur + * + * Created by Pat Schloss on 12/22/10. + * Copyright 2010 Schloss Lab. All rights reserved. + * + */ + +#include "flowdata.h" + +//********************************************************************************************************************** + +FlowData::FlowData(){} + +//********************************************************************************************************************** + +FlowData::FlowData(ifstream& flowFile, float signal, float noise, int maxHomoP){ + + try { + m = MothurOut::getInstance(); + + baseFlow = "TACG"; + seqName = ""; + numFlows = 0; + locationString = ""; + seqLength = 0; + + string lengthString; + string flowString; + + flowFile >> seqName >> locationString >> lengthString >> flowString; + + convert(lengthString.substr(7), seqLength); + convert(flowString.substr(9), numFlows); + + flowData.resize(numFlows); + + if (seqName == "") { + m->mothurOut("Error reading quality file, name blank at position, " + toString(flowFile.tellg())); + m->mothurOutEndLine(); + } + else{ + seqName = seqName.substr(1); + for(int i=0;i> flowData[i]; } + } + + findDeadSpot(signal, noise, maxHomoP); + translateFlow(); + + m->gobble(flowFile); + } + catch(exception& e) { + m->errorOut(e, "FlowData", "FlowData"); + exit(1); + } + +} + +//********************************************************************************************************************** + +void FlowData::findDeadSpot(float signalIntensity, float noiseIntensity, int maxHomoP){ + try{ + + int currLength = 0; + float maxIntensity = (float) maxHomoP + 0.49; + + deadSpot = 0; + while(currLength < seqLength + 4){ + int signal = 0; + int noise = 0; + + for(int i=0;i<4;i++){ + float intensity = flowData[i + 4 * deadSpot]; + if(intensity > signalIntensity){ + signal++; + + if(intensity < noiseIntensity || intensity > maxIntensity){ + noise++; + } + } + currLength += (int)(intensity+0.5); + } + + if(noise > 0 || signal == 0){ + break; + } + + deadSpot++; + } + deadSpot *= 4; + seqLength = currLength; + + } + catch(exception& e) { + m->errorOut(e, "FlowData", "findDeadSpot"); + exit(1); + } +} + +//********************************************************************************************************************** + +void FlowData::translateFlow(){ + + try{ + sequence = ""; + for(int i=0;i 4){ + sequence = sequence.substr(4); + } + else{ + sequence = "NNNN"; + } + } + catch(exception& e) { + m->errorOut(e, "FlowData", "translateFlow"); + exit(1); + } +} + +//********************************************************************************************************************** + +void FlowData::capFlows(int maxFlows){ + + try{ + + numFlows = maxFlows; + if(deadSpot > maxFlows){ deadSpot = maxFlows; } + + } + catch(exception& e) { + m->errorOut(e, "FlowData", "capFlows"); + exit(1); + } +} + +//********************************************************************************************************************** + +bool FlowData::hasMinFlows(int minFlows){ + + try{ + bool pastMin = 0; + + if(deadSpot >= minFlows){ pastMin = 1; } + return pastMin; + } + catch(exception& e) { + m->errorOut(e, "FlowData", "hasMinFlows"); + exit(1); + } +} + +//********************************************************************************************************************** + +Sequence FlowData::getSequence(){ + + try{ + return Sequence(seqName, sequence); + } + catch(exception& e) { + m->errorOut(e, "FlowData", "getSequence"); + exit(1); + } +} + +//********************************************************************************************************************** + +int FlowData::getSeqLength(){ + + try{ + return seqLength; + } + catch(exception& e) { + m->errorOut(e, "FlowData", "getSeqLength"); + exit(1); + } +} + +//********************************************************************************************************************** + +void FlowData::printFlows(ofstream& outFlowFile){ + try{ + // outFlowFile << '>' << seqName << locationString << " length=" << seqLength << " numflows=" << maxFlows << endl; + outFlowFile << seqName << ' ' << deadSpot << ' ' << setprecision(2); + + for(int i=0;ierrorOut(e, "FlowData", "printFlows"); + exit(1); + } +} + +//********************************************************************************************************************** + +void FlowData::printFlows(ofstream& outFlowFile, string scrapCode){ + try{ + + // outFlowFile << '>' << seqName << locationString << " length=" << seqLength << " numflows=" << maxFlows << endl; + + outFlowFile << seqName << '|' << scrapCode << ' ' << deadSpot << ' ' << setprecision(2); + + for(int i=0;ierrorOut(e, "FlowData", "printFlows"); + exit(1); + } +} + +//********************************************************************************************************************** + +void FlowData::printFASTA(ofstream& outFASTA){ + try{ + + outFASTA << '>' << seqName << endl; + outFASTA << sequence << endl; + + } + catch(exception& e) { + m->errorOut(e, "FlowData", "printFlows"); + exit(1); + } +} + + +//********************************************************************************************************************** diff --git a/flowdata.h b/flowdata.h new file mode 100644 index 0000000..dafeab3 --- /dev/null +++ b/flowdata.h @@ -0,0 +1,42 @@ +#ifndef FLOWDATA_H +#define FLOWDATA_H + +/* + * flowdata.h + * Mothur + * + * Created by Pat Schloss on 12/22/10. + * Copyright 2010 Schloss Lab. All rights reserved. + * + */ + +#include "mothur.h" +#include "mothurout.h" +#include "sequence.hpp" + +class FlowData { + +public: + FlowData(); + FlowData(ifstream&, float, float, int); + ~FlowData(){}; + void capFlows(int); + bool hasMinFlows(int); + Sequence getSequence(); + + int getSeqLength(); + void printFlows(ofstream&); + void printFlows(ofstream&, string); + void printFASTA(ofstream&); +private: + MothurOut* m; + + void findDeadSpot(float, float, int); + void translateFlow(); + + string seqName, locationString, sequence, baseFlow; + int numFlows, seqLength, deadSpot; + vector flowData; +}; + +#endif diff --git a/mothur b/mothur index 2578194..83751b3 100755 Binary files a/mothur and b/mothur differ diff --git a/sffinfocommand.cpp b/sffinfocommand.cpp index 66409e8..1a9b73f 100644 --- a/sffinfocommand.cpp +++ b/sffinfocommand.cpp @@ -316,7 +316,7 @@ int SffInfoCommand::extractSffInfo(string input, string accnos){ if (sfftxt) { m->openOutputFile(sfftxtFileName, outSfftxt); outSfftxt.setf(ios::fixed, ios::floatfield); outSfftxt.setf(ios::showpoint); outputNames.push_back(sfftxtFileName); outputTypes["sfftxt"].push_back(sfftxtFileName); } if (fasta) { m->openOutputFile(outFastaFileName, outFasta); outputNames.push_back(outFastaFileName); outputTypes["fasta"].push_back(outFastaFileName); } if (qual) { m->openOutputFile(outQualFileName, outQual); outputNames.push_back(outQualFileName); outputTypes["qual"].push_back(outQualFileName); } - if (flow) { m->openOutputFile(outFlowFileName, outFlow); outputNames.push_back(outFlowFileName); outputTypes["flow"].push_back(outFlowFileName); } + if (flow) { m->openOutputFile(outFlowFileName, outFlow); outputNames.push_back(outFlowFileName); outFlow.setf(ios::fixed, ios::floatfield); outFlow.setf(ios::showpoint); outputTypes["flow"].push_back(outFlowFileName); } ifstream in; in.open(input.c_str(), ios::binary); @@ -778,10 +778,12 @@ int SffInfoCommand::printQualSeqData(ofstream& out, seqRead& read, Header& heade //********************************************************************************************************************** int SffInfoCommand::printFlowSeqData(ofstream& out, seqRead& read, Header& header) { try { + if(header.clipQualRight > header.clipQualLeft){ + out << ">" << header.name << " xy=" << header.xy << " length=" << (header.clipQualRight-header.clipQualLeft) << " numflows=" << read.flowgram.size() << endl; + for (int i = 0; i < read.flowgram.size(); i++) { out << setprecision(2) << (read.flowgram[i]/(float)100) << ' '; } + out << endl; + } - out << ">" << header.name << " xy=" << header.xy << endl; - for (int i = 0; i < read.flowgram.size(); i++) { out << setprecision(2) << (read.flowgram[i]/(float)100) << '\t'; } - out << endl; return 0; } diff --git a/trimflowscommand.cpp b/trimflowscommand.cpp new file mode 100644 index 0000000..a2c3a88 --- /dev/null +++ b/trimflowscommand.cpp @@ -0,0 +1,903 @@ +/* + * trimflowscommand.cpp + * Mothur + * + * Created by Pat Schloss on 12/22/10. + * Copyright 2010 Schloss Lab. All rights reserved. + * + */ + +#include "trimflowscommand.h" +#include "needlemanoverlap.hpp" + +//********************************************************************************************************************** + +vector TrimFlowsCommand::getValidParameters(){ + try { + string Array[] = {"flow", "totalflows", "minflows", + "fasta", "minlength", "maxlength", "maxhomop", "signal", "noise" + "oligos", "pdiffs", "bdiffs", "tdiffs", + "allfiles", + + +// "group", +// "processors", + "outputdir","inputdir" + + }; + vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); + return myArray; + } + catch(exception& e) { + m->errorOut(e, "TrimFlowsCommand", "getValidParameters"); + exit(1); + } +} + +//********************************************************************************************************************** + +vector TrimFlowsCommand::getRequiredParameters(){ + try { + string Array[] = {"flow"}; + vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); + return myArray; + } + catch(exception& e) { + m->errorOut(e, "TrimFlowsCommand", "getRequiredParameters"); + exit(1); + } +} + +//********************************************************************************************************************** + +vector TrimFlowsCommand::getRequiredFiles(){ + try { + vector myArray; + return myArray; + } + catch(exception& e) { + m->errorOut(e, "TrimFlowsCommand", "getRequiredFiles"); + exit(1); + } +} + +//********************************************************************************************************************** + +TrimFlowsCommand::TrimFlowsCommand(){ + try { + abort = true; + //initialize outputTypes + vector tempOutNames; + outputTypes["flow"] = tempOutNames; + outputTypes["fasta"] = tempOutNames; + } + catch(exception& e) { + m->errorOut(e, "TrimFlowsCommand", "TrimFlowsCommand"); + exit(1); + } +} + +//*************************************************************************************************************** + +TrimFlowsCommand::~TrimFlowsCommand(){ /* do nothing */ } + +//*************************************************************************************************************** + +void TrimFlowsCommand::help(){ + try { + m->mothurOut("The trim.flows command reads a flowgram file and creates .....\n"); + m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n"); + m->mothurOut("For more details please check out the wiki http://www.mothur.org/wiki/Trim.flows.\n\n"); + + } + catch(exception& e) { + m->errorOut(e, "TrimFlowsCommand", "help"); + exit(1); + } +} + +//********************************************************************************************************************** + +TrimFlowsCommand::TrimFlowsCommand(string option) { + try { + + abort = false; + comboStarts = 0; + + //allow user to run help + if(option == "help") { help(); abort = true; } + + else { + //valid paramters for this command + string AlignArray[] = {"flow", "totalflows", "minflows", + "fasta", "minlength", "maxlength", "maxhomop", "signal", "noise", + "oligos", "pdiffs", "bdiffs", "tdiffs", + "allfiles", + + // "group", + // "processors", + "outputdir","inputdir" + + }; + + vector myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string))); + + OptionParser parser(option); + map parameters = parser.getParameters(); + + ValidParameters validParameter; + map::iterator it; + + //check to make sure all parameters are valid for command + for (it = parameters.begin(); it != parameters.end(); it++) { + if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } + } + + //initialize outputTypes + vector tempOutNames; + outputTypes["flow"] = tempOutNames; + // outputTypes["fasta"] = tempOutNames; + // outputTypes["group"] = tempOutNames; + + //if the user changes the input directory command factory will send this info to us in the output parameter + string inputDir = validParameter.validFile(parameters, "inputdir", false); + if (inputDir == "not found"){ inputDir = ""; } + else { + string path; + it = parameters.find("flow"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["flow"] = inputDir + it->second; } + } + +// it = parameters.find("oligos"); +// //user has given a template file +// if(it != parameters.end()){ +// path = m->hasPath(it->second); +// //if the user has not given a path then, add inputdir. else leave path alone. +// if (path == "") { parameters["oligos"] = inputDir + it->second; } +// } + + +// it = parameters.find("group"); +// //user has given a template file +// if(it != parameters.end()){ +// path = m->hasPath(it->second); +// //if the user has not given a path then, add inputdir. else leave path alone. +// if (path == "") { parameters["group"] = inputDir + it->second; } +// } + } + + + //check for required parameters + flowFileName = validParameter.validFile(parameters, "flow", true); + if (flowFileName == "not found") { m->mothurOut("flow is a required parameter for the trim.flows command."); m->mothurOutEndLine(); abort = true; } + else if (flowFileName == "not open") { abort = true; } + + //if the user changes the output directory command factory will send this info to us in the output parameter + outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ + outputDir = ""; + outputDir += m->hasPath(flowFileName); //if user entered a file with a path then preserve it + } + + + //check for optional parameter and set defaults + // ...at some point should added some additional type checking... + + string temp; + temp = validParameter.validFile(parameters, "minflows", false); if (temp == "not found") { temp = "360"; } + convert(temp, minFlows); + + temp = validParameter.validFile(parameters, "totalflows", false); if (temp == "not found") { temp = "720"; } + convert(temp, totalFlows); + + + temp = validParameter.validFile(parameters, "oligos", true); + if (temp == "not found") { oligoFileName = ""; } + else if(temp == "not open") { abort = true; } + else { oligoFileName = temp; } + + temp = validParameter.validFile(parameters, "fasta", false); if (temp == "not found"){ fasta = 0; } + else if(m->isTrue(temp)) { fasta = 1; } + + temp = validParameter.validFile(parameters, "maxhomop", false); if (temp == "not found"){ temp = "9"; } + convert(temp, maxHomoP); + + temp = validParameter.validFile(parameters, "signal", false); if (temp == "not found"){ temp = "0.50"; } + convert(temp, signal); + + temp = validParameter.validFile(parameters, "noise", false); if (temp == "not found"){ temp = "0.70"; } + convert(temp, noise); + + temp = validParameter.validFile(parameters, "minlength", false); if (temp == "not found"){ temp = "0"; } + convert(temp, minLength); + + temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found"){ temp = "0"; } + convert(temp, maxLength); + + temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found"){ temp = "0"; } + convert(temp, bdiffs); + + temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found"){ temp = "0"; } + convert(temp, pdiffs); + + temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found"){ int tempTotal = pdiffs + bdiffs; temp = toString(tempTotal); } + convert(temp, tdiffs); + if(tdiffs == 0){ tdiffs = bdiffs + pdiffs; } + + temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "T"; } + allFiles = m->isTrue(temp); + +// temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found") { temp = "1"; } +// convert(temp, processors); + + if(allFiles && oligoFileName == ""){ + m->mothurOut("You selected allfiles, but didn't enter an oligos file. Ignoring the allfiles request."); m->mothurOutEndLine(); + allFiles = 0; + } + + } + + } + catch(exception& e) { + m->errorOut(e, "TrimFlowsCommand", "TrimSeqsCommand"); + exit(1); + } +} + +//*************************************************************************************************************** + +int TrimFlowsCommand::execute(){ + try{ + + if (abort == true) { return 0; } + + string trimFlowFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + "trim.flow"; + outputNames.push_back(trimFlowFileName); outputTypes["flow"].push_back(trimFlowFileName); + + string scrapFlowFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + "scrap.flow"; + outputNames.push_back(scrapFlowFileName); outputTypes["flow"].push_back(scrapFlowFileName); + + string fastaFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + "flow.fasta"; + if(fasta){ + outputNames.push_back(fastaFileName); outputTypes["fasta"].push_back(fastaFileName); + } + + driverCreateTrim(flowFileName, trimFlowFileName, scrapFlowFileName, fastaFileName); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "TrimSeqsCommand", "execute"); + exit(1); + } +} + +//*************************************************************************************************************** + +int TrimFlowsCommand::driverCreateTrim(string flowFileName, string trimFlowFileName, string scrapFlowFileName, string fastaFileName){ + + try { + + ofstream trimFlowFile; + m->openOutputFile(trimFlowFileName, trimFlowFile); + trimFlowFile.setf(ios::fixed, ios::floatfield); trimFlowFile.setf(ios::showpoint); + + ofstream scrapFlowFile; + m->openOutputFile(scrapFlowFileName, scrapFlowFile); + scrapFlowFile.setf(ios::fixed, ios::floatfield); scrapFlowFile.setf(ios::showpoint); + + ifstream flowFile; + m->openInputFile(flowFileName, flowFile); + + ofstream fastaFile; + if(fasta){ m->openOutputFile(fastaFileName, fastaFile); } + + vector > barcodePrimerCombos; + if(oligoFileName != ""){ getOligos(barcodePrimerCombos); } + +// inFASTA.seekg(line->start); + +// bool done = false; + int count = 0; + + while (flowFile) { + + int success = 1; + int currentSeqDiffs = 0; + + string trashCode = ""; + + FlowData currFlow(flowFile, signal, noise, maxHomoP); + m->gobble(flowFile); + currFlow.capFlows(totalFlows); + Sequence currSeq = currFlow.getSequence(); + + + if(!currFlow.hasMinFlows(minFlows)){ //screen to see if sequence is of a minimum number of flows + success = 0; + trashCode += 'l'; + } + + if(minLength > 0 || maxLength > 0){ //screen to see if sequence is above and below a specific number of bases + int seqLength = currFlow.getSeqLength(); + if(seqLength < minLength || seqLength > maxLength){ + success = 0; + trashCode += 'l'; + } + } + + int primerIndex, barcodeIndex; + if(barcodes.size() != 0){ + success = stripBarcode(currSeq, barcodeIndex); + if(success > bdiffs) { trashCode += 'b'; } + else{ currentSeqDiffs += success; } + } + + if(numFPrimers != 0){ + success = stripForward(currSeq, primerIndex); + if(success > pdiffs) { trashCode += 'f'; } + else{ currentSeqDiffs += success; } + } + + if (currentSeqDiffs > tdiffs) { trashCode += 't'; } + + if(numRPrimers != 0){ + success = stripReverse(currSeq); + if(!success) { trashCode += 'r'; } + } + + + if(trashCode.length() == 0){ + currFlow.printFlows(trimFlowFile); + + if(fasta){ + currFlow.printFASTA(fastaFile); + } + + if(barcodes.size() != 0 || primers.size() != 0){ + string fileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + barcodePrimerCombos[barcodeIndex][primerIndex] + ".flow"; + ofstream output; + m->openOutputFileAppend(fileName, output); + currFlow.printFlows(output); + output.close(); + } + + } + else{ + currFlow.printFlows(scrapFlowFile, trashCode); + } + +// if(barcodes.size() != 0){ +// string thisGroup = groupVector[groupBar]; +// int indexToFastaFile = groupBar; +// if (primers.size() != 0){ +// //does this primer have a group +// if (groupVector[groupPrime] != "") { +// thisGroup += "." + groupVector[groupPrime]; +// indexToFastaFile = combos[thisGroup]; +// } +// } +// outGroups << currSeq.getName() << '\t' << thisGroup << endl; +// if(allFiles){ +// ofstream outTemp; +// m->openOutputFileAppend(fastaNames[indexToFastaFile], outTemp); +// //currSeq.printSequence(*fastaFileNames[indexToFastaFile]); +// currSeq.printSequence(outTemp); +// outTemp.close(); +// +// if(qFileName != ""){ +// //currQual.printQScores(*qualFileNames[indexToFastaFile]); +// ofstream outTemp2; +// m->openOutputFileAppend(qualNames[indexToFastaFile], outTemp2); +// currQual.printQScores(outTemp2); +// outTemp2.close(); +// } +// } +// } +// } +// else{ +// currSeq.setName(currSeq.getName() + '|' + trashCode); +// currSeq.setUnaligned(origSeq); +// currSeq.setAligned(origSeq); +// currSeq.printSequence(scrapFASTA); +// } + + count++; + + //report progress + if((count) % 1000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); } + + } + //report progress + if((count) % 1000 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); } + + + trimFlowFile.close(); + scrapFlowFile.close(); + flowFile.close(); + + return count; + } + catch(exception& e) { + m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim"); + exit(1); + } +} + +//*************************************************************************************************************** + +void TrimFlowsCommand::getOligos(vector >& outFlowFileNames){ + try { + ifstream oligosFile; + m->openInputFile(oligoFileName, oligosFile); + + string type, oligo, group; + + int indexPrimer = 0; + int indexBarcode = 0; + + while(!oligosFile.eof()){ + + oligosFile >> type; m->gobble(oligosFile); //get the first column value of the row - is it a comment or a feature we are interested in? + + if(type[0] == '#'){ //igore the line because there's a comment + while (!oligosFile.eof()) { char c = oligosFile.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there + } + else{ //there's a feature we're interested in + + for(int i=0;i> oligo; //get the DNA sequence for the feature + + for(int i=0;i::iterator itPrimer = primers.find(oligo); + if (itPrimer != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); } + + primers[oligo]=indexPrimer; indexPrimer++; + primerNameVector.push_back(group); + + } + else if(type == "REVERSE"){ + Sequence oligoRC("reverse", oligo); + oligoRC.reverseComplement(); + revPrimer.push_back(oligoRC.getUnaligned()); + } + else if(type == "BARCODE"){ + oligosFile >> group; + + //check for repeat barcodes + map::iterator itBar = barcodes.find(oligo); + if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); } + + barcodes[oligo]=indexBarcode; indexBarcode++; + barcodeNameVector.push_back(group); + } + else{ + m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); + } + } + + m->gobble(oligosFile); + } + oligosFile.close(); + + //add in potential combos + outFlowFileNames.resize(barcodeNameVector.size()); + for(int i=0;i::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){ + for(map::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){ + + string primerName = primerNameVector[itPrimer->second]; + string barcodeName = barcodeNameVector[itBar->second]; + + string comboGroupName = ""; + string fileName = ""; + + if(primerName == ""){ + comboGroupName = barcodeNameVector[itBar->second]; + fileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + comboGroupName + ".flow"; + } + else{ + comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second]; + fileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + comboGroupName + ".flow"; + } + + outputNames.push_back(fileName); + outputTypes["flow"].push_back(fileName); + outFlowFileNames[itBar->second][itPrimer->second] = comboGroupName; + + ofstream temp; + m->openOutputFile(fileName, temp); + temp.close(); + } + } + } + + numFPrimers = primers.size(); + numRPrimers = revPrimer.size(); + + } + catch(exception& e) { + m->errorOut(e, "TrimSeqsCommand", "getOligos"); + exit(1); + } +} + +//*************************************************************************************************************** + +int TrimFlowsCommand::stripBarcode(Sequence& seq, int& group){ + try { + + string rawSequence = seq.getUnaligned(); + int success = bdiffs + 1; //guilty until proven innocent + + //can you find the barcode + for(map::iterator it=barcodes.begin();it!=barcodes.end();it++){ + string oligo = it->first; + if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length + success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out + break; + } + + if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ + group = it->second; + seq.setUnaligned(rawSequence.substr(oligo.length())); + + success = 0; + break; + } + } + + //if you found the barcode or if you don't want to allow for diffs + // cout << success; + if ((bdiffs == 0) || (success == 0)) { return success; } + + else { //try aligning and see if you can find it + // cout << endl; + + int maxLength = 0; + + Alignment* alignment; + if (barcodes.size() > 0) { + map::iterator it=barcodes.begin(); + + for(it;it!=barcodes.end();it++){ + if(it->first.length() > maxLength){ + maxLength = it->first.length(); + } + } + alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1)); + + }else{ alignment = NULL; } + + //can you find the barcode + int minDiff = 1e6; + int minCount = 1; + int minGroup = -1; + int minPos = 0; + + for(map::iterator it=barcodes.begin();it!=barcodes.end();it++){ + string oligo = it->first; + // int length = oligo.length(); + + if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length + success = bdiffs + 10; + break; + } + + //use needleman to align first barcode.length()+numdiffs of sequence to each barcode + alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs)); + oligo = alignment->getSeqAAln(); + string temp = alignment->getSeqBAln(); + + int alnLength = oligo.length(); + + for(int i=oligo.length()-1;i>=0;i--){ + if(oligo[i] != '-'){ alnLength = i+1; break; } + } + oligo = oligo.substr(0,alnLength); + temp = temp.substr(0,alnLength); + + int newStart=0; + int numDiff = countDiffs(oligo, temp); + + if(numDiff < minDiff){ + minDiff = numDiff; + minCount = 1; + minGroup = it->second; + minPos = 0; + for(int i=0;i bdiffs) { success = minDiff; } //no good matches + else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes + else{ //use the best match + group = minGroup; + seq.setUnaligned(rawSequence.substr(minPos)); + success = minDiff; + } + + if (alignment != NULL) { delete alignment; } + + } + + return success; + + } + catch(exception& e) { + m->errorOut(e, "TrimFlowsCommand", "stripBarcode"); + exit(1); + } + +} + +//*************************************************************************************************************** + +int TrimFlowsCommand::stripForward(Sequence& seq, int& group){ + try { + + string rawSequence = seq.getUnaligned(); + int success = pdiffs + 1; //guilty until proven innocent + + //can you find the primer + for(map::iterator it=primers.begin();it!=primers.end();it++){ + string oligo = it->first; + if(rawSequence.length() < oligo.length()){ //let's just assume that the primers are the same length + success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out + break; + } + + if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ + group = it->second; + seq.setUnaligned(rawSequence.substr(oligo.length())); + success = 0; + break; + } + } + + //if you found the barcode or if you don't want to allow for diffs + // cout << success; + if ((pdiffs == 0) || (success == 0)) { return success; } + + else { //try aligning and see if you can find it + // cout << endl; + + int maxLength = 0; + + Alignment* alignment; + if (primers.size() > 0) { + map::iterator it=primers.begin(); + + for(it;it!=primers.end();it++){ + if(it->first.length() > maxLength){ + maxLength = it->first.length(); + } + } + alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1)); + + }else{ alignment = NULL; } + + //can you find the barcode + int minDiff = 1e6; + int minCount = 1; + int minGroup = -1; + int minPos = 0; + + for(map::iterator it=primers.begin();it!=primers.end();it++){ + string oligo = it->first; + // int length = oligo.length(); + + if(rawSequence.length() < maxLength){ + success = pdiffs + 100; + break; + } + + //use needleman to align first barcode.length()+numdiffs of sequence to each barcode + alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs)); + oligo = alignment->getSeqAAln(); + string temp = alignment->getSeqBAln(); + + int alnLength = oligo.length(); + + for(int i=oligo.length()-1;i>=0;i--){ + if(oligo[i] != '-'){ alnLength = i+1; break; } + } + oligo = oligo.substr(0,alnLength); + temp = temp.substr(0,alnLength); + + int newStart=0; + int numDiff = countDiffs(oligo, temp); + + // cout << oligo << '\t' << temp << '\t' << numDiff << endl; + + if(numDiff < minDiff){ + minDiff = numDiff; + minCount = 1; + minGroup = it->second; + minPos = 0; + for(int i=0;i pdiffs) { success = minDiff; } //no good matches + else if(minCount > 1) { success = pdiffs + 10; } //can't tell the difference between multiple primers + else{ //use the best match + group = minGroup; + seq.setUnaligned(rawSequence.substr(minPos)); + success = minDiff; + } + + if (alignment != NULL) { delete alignment; } + + } + + return success; + + } + catch(exception& e) { + m->errorOut(e, "TrimFlowsCommand", "stripForward"); + exit(1); + } +} + +//*************************************************************************************************************** + +bool TrimFlowsCommand::stripReverse(Sequence& seq){ + try { + + string rawSequence = seq.getUnaligned(); + bool success = 0; //guilty until proven innocent + + for(int i=0;ierrorOut(e, "TrimFlowsCommand", "stripReverse"); + exit(1); + } +} + + +//*************************************************************************************************************** + +bool TrimFlowsCommand::compareDNASeq(string oligo, string seq){ + try { + bool success = 1; + int length = oligo.length(); + + for(int i=0;ierrorOut(e, "TrimFlowsCommand", "compareDNASeq"); + exit(1); + } + +} + +//*************************************************************************************************************** + +int TrimFlowsCommand::countDiffs(string oligo, string seq){ + try { + + int length = oligo.length(); + int countDiffs = 0; + + for(int i=0;ierrorOut(e, "TrimFlowsCommand", "countDiffs"); + exit(1); + } + +} + +//*************************************************************************************************************** + + + + + + + + + + + + + diff --git a/trimflowscommand.h b/trimflowscommand.h new file mode 100644 index 0000000..215b1a2 --- /dev/null +++ b/trimflowscommand.h @@ -0,0 +1,89 @@ +#ifndef TRIMFLOWSCOMMAND_H +#define TRIMFLOWSCOMMAND_H + +/* + * trimflowscommand.h + * Mothur + * + * Created by Pat Schloss on 12/22/10. + * Copyright 2010 Schloss Lab. All rights reserved. + * + */ + +#include "mothur.h" +#include "command.hpp" +#include "sequence.hpp" +#include "flowdata.h" +#include "groupmap.h" + +class TrimFlowsCommand : public Command { +public: + TrimFlowsCommand(string); + TrimFlowsCommand(); + ~TrimFlowsCommand(); + vector getRequiredParameters(); + vector getValidParameters(); + vector getRequiredFiles(); + map > getOutputFiles() { return outputTypes; } + int execute(); + void help(); + +private: + bool abort; + +// GroupMap* groupMap; + + struct linePair { + unsigned long int start; + unsigned long int end; + linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {} + }; + int comboStarts; + vector processIDS; //processid + vector lines; + vector qLines; + map > outputTypes; + vector outputNames; + set filesToRemove; + + + + void getOligos(vector >&); //a rewrite of what is in trimseqscommand.h + int stripBarcode(Sequence&, int&); //largely redundant with trimseqscommand.h + int stripForward(Sequence&, int&); //largely redundant with trimseqscommand.h + bool stripReverse(Sequence&); //largely redundant with trimseqscommand.h + bool compareDNASeq(string, string); //largely redundant with trimseqscommand.h + int countDiffs(string, string); //largely redundant with trimseqscommand.h + + + bool allFiles; +// int processors; + int numFPrimers, numRPrimers; + int totalFlows, minFlows, minLength, maxLength, maxHomoP, tdiffs, bdiffs, pdiffs; + float signal, noise; + bool fasta; + + + string flowFileName, oligoFileName, outputDir; + + + map barcodes; + map primers; + vector revPrimer; + + vector primerNameVector; //needed here? + vector barcodeNameVector; //needed here? + + map combos; //needed here? + map groupToIndex; //needed here? + + + int driverCreateTrim(string, string, string, string); + +// int createProcessesCreateTrim(string, string, string, string, string, string, string, vector, vector){}; + int setLines(string, string, vector&, vector&){}; + +}; + + +#endif diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp index 9fc3abb..b40b7e5 100644 --- a/trimseqscommand.cpp +++ b/trimseqscommand.cpp @@ -11,6 +11,7 @@ #include "needlemanoverlap.hpp" //********************************************************************************************************************** + vector TrimSeqsCommand::getValidParameters(){ try { string Array[] = {"fasta", "flip", "oligos", "maxambig", "maxhomop", "group","minlength", "maxlength", "qfile", @@ -25,7 +26,9 @@ vector TrimSeqsCommand::getValidParameters(){ exit(1); } } + //********************************************************************************************************************** + TrimSeqsCommand::TrimSeqsCommand(){ try { abort = true; @@ -40,7 +43,9 @@ TrimSeqsCommand::TrimSeqsCommand(){ exit(1); } } + //********************************************************************************************************************** + vector TrimSeqsCommand::getRequiredParameters(){ try { string Array[] = {"fasta"}; @@ -52,7 +57,9 @@ vector TrimSeqsCommand::getRequiredParameters(){ exit(1); } } + //********************************************************************************************************************** + vector TrimSeqsCommand::getRequiredFiles(){ try { vector myArray; @@ -63,6 +70,7 @@ vector TrimSeqsCommand::getRequiredFiles(){ exit(1); } } + //*************************************************************************************************************** TrimSeqsCommand::TrimSeqsCommand(string option) { @@ -254,6 +262,7 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { exit(1); } } + //********************************************************************************************************************** void TrimSeqsCommand::help(){ @@ -1433,6 +1442,7 @@ bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){ } } + //*************************************************************************************************************** int TrimSeqsCommand::countDiffs(string oligo, string seq){ diff --git a/trimseqscommand.h b/trimseqscommand.h index 21a5d41..e4c2e0b 100644 --- a/trimseqscommand.h +++ b/trimseqscommand.h @@ -37,11 +37,12 @@ private: unsigned long int end; linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {} }; - + void getOligos(vector&, vector&); int stripBarcode(Sequence&, QualityScores&, int&); int stripForward(Sequence&, QualityScores&, int&); bool stripReverse(Sequence&, QualityScores&); + bool keepFirstTrim(Sequence&, QualityScores&); bool removeLastTrim(Sequence&, QualityScores&); @@ -74,7 +75,6 @@ private: int driverCreateTrim(string, string, string, string, string, string, string, vector, vector, linePair*, linePair*); int createProcessesCreateTrim(string, string, string, string, string, string, string, vector, vector); int setLines(string, string, vector&, vector&); - }; #endif