From 0fb6d165c8dc8dc7153a101513a05f457431d0bc Mon Sep 17 00:00:00 2001 From: pschloss Date: Wed, 14 Jul 2010 12:14:53 +0000 Subject: [PATCH] pat's changes to trim.seqs --- Mothur.xcodeproj/project.pbxproj | 4 + qualityscores.cpp | 295 +++++++++++++++++++++++ qualityscores.h | 44 ++++ sharedthetayc.cpp | 49 +++- trimseqscommand.cpp | 402 ++++++++++++++++++++----------- trimseqscommand.h | 21 +- 6 files changed, 659 insertions(+), 156 deletions(-) create mode 100644 qualityscores.cpp create mode 100644 qualityscores.h diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index dae9fe3..81623b1 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -7,6 +7,8 @@ objects = { /* Begin PBXFileReference section */ + 7E85BD1C11EB5E9B00FD37C0 /* qualityscores.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = qualityscores.h; sourceTree = ""; }; + 7E85BD1D11EB5E9B00FD37C0 /* qualityscores.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = qualityscores.cpp; sourceTree = ""; }; 7EA299BA11E384940022D8D3 /* sensspeccommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sensspeccommand.h; sourceTree = ""; }; 7EA299BB11E384940022D8D3 /* sensspeccommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sensspeccommand.cpp; sourceTree = ""; }; 7EC61A0911BEA6AF00F668D9 /* weightedlinkage.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = weightedlinkage.cpp; sourceTree = ""; }; @@ -821,6 +823,8 @@ A7CB594211402F430010EB83 /* containers */ = { isa = PBXGroup; children = ( + 7E85BD1C11EB5E9B00FD37C0 /* qualityscores.h */, + 7E85BD1D11EB5E9B00FD37C0 /* qualityscores.cpp */, A7DA1FF3113FECD400BF472F /* alignmentcell.hpp */, A7DA1FF2113FECD400BF472F /* alignmentcell.cpp */, A7DA1FF4113FECD400BF472F /* alignmentdb.cpp */, diff --git a/qualityscores.cpp b/qualityscores.cpp new file mode 100644 index 0000000..4b315b1 --- /dev/null +++ b/qualityscores.cpp @@ -0,0 +1,295 @@ +/* + * qualityscores.cpp + * Mothur + * + * Created by Pat Schloss on 7/12/10. + * Copyright 2010 Schloss Lab. All rights reserved. + * + */ + +#include "qualityscores.h" + +/**************************************************************************************************/ + +QualityScores::QualityScores(){ + try { + m = MothurOut::getInstance(); + seqName = ""; + seqLength = -1; + } + catch(exception& e) { + m->errorOut(e, "QualityScores", "QualityScores"); + exit(1); + } +} + +/**************************************************************************************************/ + +QualityScores::QualityScores(ifstream& qFile){ + try { + + m = MothurOut::getInstance(); + + seqName = ""; + seqLength = -1; + int score; + + string line; + getline(qFile, line); + istringstream nameStream(line); + + nameStream >> seqName; + seqName = seqName.substr(1); + + getline(qFile, line); + istringstream qualStream(line); + + while(qualStream){ + qualStream >> score; + qScores.push_back(score); + } + qScores.pop_back(); + + seqLength = qScores.size(); + } + catch(exception& e) { + m->errorOut(e, "QualityScores", "QualityScores"); + exit(1); + } + +} + +/**************************************************************************************************/ + +string QualityScores::getName(){ + + try { + return seqName; + } + catch(exception& e) { + m->errorOut(e, "QualityScores", "getName"); + exit(1); + } +} + +/**************************************************************************************************/ + +void QualityScores::printQScores(ofstream& qFile){ + try { + + double aveQScore = calculateAverage(); + + qFile << '>' << seqName << '\t' << aveQScore << endl; + + for(int i=0;ierrorOut(e, "QualityScores", "printQScores"); + exit(1); + } +} + +/**************************************************************************************************/ + +void QualityScores::trimQScores(int start, int end){ + try { + vector hold; + + if(end == -1){ + hold = vector(qScores.begin()+start, qScores.end()); + qScores = hold; + } + if(start == -1){ + hold = vector(qScores.begin(), qScores.begin()+end); //not sure if indexing is correct + qScores = hold; + } + + seqLength = qScores.size(); + } + catch(exception& e) { + m->errorOut(e, "QualityScores", "trimQScores"); + exit(1); + } +} + +/**************************************************************************************************/ + +void QualityScores::flipQScores(){ + try { + + vector temp = qScores; + for(int i=0;ierrorOut(e, "QualityScores", "flipQScores"); + exit(1); + } +} + +/**************************************************************************************************/ + +bool QualityScores::stripQualThreshold(Sequence& sequence, double qThreshold){ + try { + string rawSequence = sequence.getUnaligned(); + int seqLength = sequence.getNumBases(); + + if(seqName != sequence.getName()){ + m->mothurOut("sequence name mismatch btwn fasta: " + sequence.getName() + " and qual file: " + seqName); + m->mothurOutEndLine(); + } + + int end; + for(int i=0;ierrorOut(e, "QualityScores", "flipQScores"); + exit(1); + } + +} + +/**************************************************************************************************/ + +bool QualityScores::stripQualRollingAverage(Sequence& sequence, double qThreshold){ + try { + string rawSequence = sequence.getUnaligned(); + int seqLength = sequence.getNumBases(); + + if(seqName != sequence.getName()){ + m->mothurOut("sequence name mismatch btwn fasta: " + sequence.getName() + " and qual file: " + seqName); + m->mothurOutEndLine(); + } + + int end = -1; + double rollingSum = 0.0000; + + for(int i=0;ierrorOut(e, "QualityScores", "flipQScores"); + exit(1); + } + +} + +/**************************************************************************************************/ + +bool QualityScores::stripQualWindowAverage(Sequence& sequence, int stepSize, int windowSize, double qThreshold){ + try { + string rawSequence = sequence.getUnaligned(); + int seqLength = sequence.getNumBases(); + + if(seqName != sequence.getName()){ + m->mothurOut("sequence name mismatch btwn fasta: " + sequence.getName() + " and qual file: " + seqName); + m->mothurOutEndLine(); + } + + int end = windowSize; + int start = 0; + + + while(start < seqLength){ + double windowSum = 0.0000; + + for(int i=start;i= seqLength){ end = seqLength - 1; } + } + + + if(end == -1){ end = seqLength; } + + sequence.setUnaligned(rawSequence.substr(0,end)); + trimQScores(-1, end); + + return 1; + } + catch(exception& e) { + m->errorOut(e, "QualityScores", "flipQScores"); + exit(1); + } + +} + +/**************************************************************************************************/ + +double QualityScores::calculateAverage(){ + + double aveQScore = 0.0000; + + for(int i=0;imothurOut("sequence name mismatch btwn fasta: " + sequence.getName() + " and qual file: " + seqName); + m->mothurOutEndLine(); + } + + double aveQScore = calculateAverage(); + + if(aveQScore >= qAverage) { success = 1; } + else { success = 0; } + + return success; + } + catch(exception& e) { + m->errorOut(e, "TrimSeqsCommand", "cullQualAverage"); + exit(1); + } +} + +/**************************************************************************************************/ diff --git a/qualityscores.h b/qualityscores.h new file mode 100644 index 0000000..710200a --- /dev/null +++ b/qualityscores.h @@ -0,0 +1,44 @@ +#ifndef QUALITYSCORES +#define QUALITYSCORES + +/* + * qualityscores.h + * Mothur + * + * Created by Pat Schloss on 7/12/10. + * Copyright 2010 Schloss Lab. All rights reserved. + * + */ + + +#include "mothur.h" +#include "mothurout.h" +#include "sequence.hpp" + +/**************************************************************************************************/ + +class QualityScores { +public: + QualityScores(); + QualityScores(ifstream&); + string getName(); + void printQScores(ofstream&); + void trimQScores(int, int); + void flipQScores(); + bool stripQualThreshold(Sequence&, double); + bool stripQualRollingAverage(Sequence&, double); + bool stripQualWindowAverage(Sequence&, int, int, double); + bool cullQualAverage(Sequence&, double); +private: + + double calculateAverage(); + MothurOut* m; + vector qScores; + + string seqName; + int seqLength; +}; + +/**************************************************************************************************/ + +#endif diff --git a/sharedthetayc.cpp b/sharedthetayc.cpp index 45dce49..6f3f250 100644 --- a/sharedthetayc.cpp +++ b/sharedthetayc.cpp @@ -12,38 +12,63 @@ /***********************************************************************/ EstOutput ThetaYC::getValues(vector shared) { try { - data.resize(1,0); + data.resize(3,0.0000); - int Atotal = 0; - int Btotal = 0; + float Atotal = 0; + float Btotal = 0; float thetaYC = 0; - float relA = 0; - float relB = 0; + float pi = 0; + float qi = 0; float a = 0; float b = 0; + float d = 0; + + float sumPcubed = 0; + float sumQcubed = 0; + float sumPQsq = 0; + float sumPsqQ = 0; //get the total values we need to calculate the theta denominator sums for (int i = 0; i < shared[0]->size(); i++) { //store in temps to avoid multiple repetitive function calls - Atotal += shared[0]->getAbundance(i); - Btotal += shared[1]->getAbundance(i); + Atotal += (float)shared[0]->getAbundance(i); + Btotal += (float)shared[1]->getAbundance(i); } //calculate the theta denominator sums for (int j = 0; j < shared[0]->size(); j++) { //store in temps to avoid multiple repetitive function calls - relA = shared[0]->getAbundance(j) / (float)Atotal; - relB = shared[1]->getAbundance(j) / (float)Btotal; + pi = shared[0]->getAbundance(j) / Atotal; + qi = shared[1]->getAbundance(j) / Btotal; - a += relA * relB; - b += pow((relA-relB),2); + a += pi * pi; + b += qi * qi; + d += pi * qi; + + sumPcubed += pi * pi * pi; + sumQcubed += qi * qi * qi; + sumPQsq += pi * qi * qi; + sumPsqQ += pi * pi * qi; } - thetaYC = a / (float) (b+a); + thetaYC = d / (a + b - d); if (isnan(thetaYC) || isinf(thetaYC)) { thetaYC = 0; } + float varA = 4 / Atotal * (sumPcubed - a * a); + float varB = 4 / Btotal * (sumQcubed - b * b); + float varD = sumPQsq / Atotal + sumPsqQ / Btotal - d * d * (1/Atotal + 1/Btotal); + float covAD = 2 / Atotal * (sumPsqQ - a * d); + float covBD = 2 / Btotal * (sumPQsq - b* d); + + float varT = d * d * (varA + varB) / pow(a + b - d, (float)4.0) + pow(a+b, (float)2.0) * varD / pow(a+b-d, (float)4.0) + - 2.0 * (a + b) * d / pow(a + b - d, (float)4.0) * (covAD + covBD); + + float ci = 1.95 * sqrt(varT); + data[0] = thetaYC; + data[1] = thetaYC - ci; + data[2] = thetaYC + ci; return data; } diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp index af3bfc8..5a01249 100644 --- a/trimseqscommand.cpp +++ b/trimseqscommand.cpp @@ -24,7 +24,7 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { else { //valid paramters for this command string AlignArray[] = {"fasta", "flip", "oligos", "maxambig", "maxhomop", "minlength", "maxlength", "qfile", - "qthreshold", "qaverage", "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"}; + "qthreshold", "qwindowaverage", "qstepsize", "qwindowsize", "qaverage", "rollaverage", "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"}; vector myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string))); @@ -81,6 +81,7 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { outputDir += hasPath(fastaFile); //if user entered a file with a path then preserve it } + //check for optional parameter and set defaults // ...at some point should added some additional type checking... string temp; @@ -105,7 +106,6 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "0"; } convert(temp, maxLength); - temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found") { temp = "0"; } convert(temp, bdiffs); @@ -125,16 +125,28 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { temp = validParameter.validFile(parameters, "qthreshold", false); if (temp == "not found") { temp = "0"; } convert(temp, qThreshold); - temp = validParameter.validFile(parameters, "qtrim", false); if (temp == "not found") { temp = "F"; } + temp = validParameter.validFile(parameters, "qtrim", false); if (temp == "not found") { temp = "F"; } qtrim = isTrue(temp); + temp = validParameter.validFile(parameters, "rollaverage", false); if (temp == "not found") { temp = "0"; } + convert(temp, qRollAverage); + + temp = validParameter.validFile(parameters, "qwindowaverage", false);if (temp == "not found") { temp = "0"; } + convert(temp, qWindowAverage); + + temp = validParameter.validFile(parameters, "qwindowsize", false); if (temp == "not found") { temp = "100"; } + convert(temp, qWindowSize); + + temp = validParameter.validFile(parameters, "qstepsize", false); if (temp == "not found") { temp = "10"; } + convert(temp, qWindowStep); + temp = validParameter.validFile(parameters, "qaverage", false); if (temp == "not found") { temp = "0"; } convert(temp, qAverage); temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "F"; } allFiles = isTrue(temp); - temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = "1"; } + temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found") { temp = "1"; } convert(temp, processors); if(allFiles && oligoFile == ""){ @@ -211,12 +223,17 @@ int TrimSeqsCommand::execute(){ outputNames.push_back(trimSeqFile); string scrapSeqFile = outputDir + getRootName(getSimpleName(fastaFile)) + "scrap.fasta"; outputNames.push_back(scrapSeqFile); + string trimQualFile = outputDir + getRootName(getSimpleName(fastaFile)) + "trim.qual"; + outputNames.push_back(trimQualFile); + string scrapQualFile = outputDir + getRootName(getSimpleName(fastaFile)) + "scrap.qual"; + outputNames.push_back(scrapQualFile); string groupFile = outputDir + getRootName(getSimpleName(fastaFile)) + "groups"; vector fastaFileNames; + vector qualFileNames; if(oligoFile != ""){ outputNames.push_back(groupFile); - getOligos(fastaFileNames); + getOligos(fastaFileNames, qualFileNames); } if(qFileName != "") { setLines(qFileName, qLines); } @@ -232,36 +249,69 @@ int TrimSeqsCommand::execute(){ lines.push_back(new linePair(0, numSeqs)); - driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, groupFile, fastaFileNames, lines[0], lines[0]); + driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, groupFile, fastaFileNames, qualFileNames, lines[0], lines[0]); for (int j = 0; j < fastaFileNames.size(); j++) { rename((fastaFileNames[j] + toString(getpid()) + ".temp").c_str(), fastaFileNames[j].c_str()); } + if(qFileName != ""){ + for (int j = 0; j < qualFileNames.size(); j++) { + rename((qualFileNames[j] + toString(getpid()) + ".temp").c_str(), qualFileNames[j].c_str()); + } + } }else{ setLines(fastaFile, lines); if(qFileName == "") { qLines = lines; } - createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, groupFile, fastaFileNames); + createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, groupFile, fastaFileNames, qualFileNames); rename((trimSeqFile + toString(processIDS[0]) + ".temp").c_str(), trimSeqFile.c_str()); rename((scrapSeqFile + toString(processIDS[0]) + ".temp").c_str(), scrapSeqFile.c_str()); rename((groupFile + toString(processIDS[0]) + ".temp").c_str(), groupFile.c_str()); + + if(qFileName != ""){ + rename((trimQualFile + toString(processIDS[0]) + ".temp").c_str(), trimQualFile.c_str()); + rename((scrapQualFile + toString(processIDS[0]) + ".temp").c_str(), scrapQualFile.c_str()); + } + + for (int j = 0; j < fastaFileNames.size(); j++) { rename((fastaFileNames[j] + toString(processIDS[0]) + ".temp").c_str(), fastaFileNames[j].c_str()); } + if(qFileName != ""){ + for (int j = 0; j < qualFileNames.size(); j++) { + rename((qualFileNames[j] + toString(getpid()) + ".temp").c_str(), qualFileNames[j].c_str()); + } + } + //append files for(int i=1;icontrol_pressed) { return 0; } #endif for(int i=0;i 0) { remove(fastaFileNames[i].c_str()); } else { ifstream inFASTA; string seqName; - //openInputFile(getRootName(fastaFile) + groupVector[i] + ".fasta", inFASTA); openInputFile(fastaFileNames[i], inFASTA); ofstream outGroups; string outGroupFilename = outputDir + getRootName(getSimpleName(fastaFileNames[i])) + "groups"; openOutputFile(outGroupFilename, outGroups); - //openOutputFile(outputDir + getRootName(getSimpleName(fastaFile)) + groupVector[i] + ".groups", outGroups); outputNames.push_back(outGroupFilename); string thisGroup = ""; @@ -315,6 +363,31 @@ int TrimSeqsCommand::execute(){ } } + if(qFileName != ""){ + for(int i=0;i 0) { remove(qualFileNames[i].c_str()); } + else { + ifstream inQual; + string seqName; + openInputFile(qualFileNames[i], inQual); +// ofstream outGroups; +// +// string thisGroup = ""; +// if (i > comboStarts) { +// map::iterator itCombo; +// for(itCombo=combos.begin();itCombo!=combos.end(); itCombo++){ +// if(itCombo->second == i){ thisGroup = itCombo->first; combos.erase(itCombo); break; } +// } +// } +// else{ thisGroup = groupVector[i]; } + + inQual.close(); + } + } + } + + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; @@ -335,7 +408,9 @@ int TrimSeqsCommand::execute(){ } /**************************************************************************************/ -int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFile, string scrapFile, string groupFile, vector fastaNames, linePair* line, linePair* qline) { + +int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFile, string scrapFile, string trimQFile, string scrapQFile, string groupFile, vector fastaNames, vector qualNames, linePair* line, linePair* qline) { + try { ofstream outFASTA; @@ -344,15 +419,31 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string ofstream scrapFASTA; openOutputFile(scrapFile, scrapFASTA); + ofstream outQual; + ofstream scrapQual; + if(qFileName != ""){ + openOutputFile(trimQFile, outQual); + openOutputFile(scrapQFile, scrapQual); + } + ofstream outGroups; vector fastaFileNames; + vector qualFileNames; + + if (oligoFile != "") { openOutputFile(groupFile, outGroups); for (int i = 0; i < fastaNames.size(); i++) { #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) fastaFileNames.push_back(new ofstream((fastaNames[i] + toString(getpid()) + ".temp").c_str(), ios::ate)); + if(qFileName != ""){ + qualFileNames.push_back(new ofstream((qualNames[i] + toString(getpid()) + ".temp").c_str(), ios::ate)); + } #else fastaFileNames.push_back(new ofstream((fastaNames[i] + toString(i) + ".temp").c_str(), ios::ate)); + if(qFileName != ""){ + qualFileNames.push_back(new ofstream((qualNames[i] + toString(i) + ".temp").c_str(), ios::ate)); + } #endif } } @@ -369,70 +460,86 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string for(int i=0;inum;i++){ if (m->control_pressed) { - inFASTA.close(); outFASTA.close(); scrapFASTA.close(); if (oligoFile != "") { outGroups.close(); } if(qFileName != "") { qFile.close(); } + inFASTA.close(); outFASTA.close(); scrapFASTA.close(); + if (oligoFile != "") { outGroups.close(); } + for(int i=0;iclose(); delete fastaFileNames[i]; } + + if(qFileName != ""){ + qFile.close(); + for(int i=0;iclose(); delete qualFileNames[i]; } + } for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } + return 0; } int success = 1; Sequence currSeq(inFASTA); - + QualityScores currQual; + if(qFileName != ""){ + currQual = QualityScores(qFile); + } + string origSeq = currSeq.getUnaligned(); if (origSeq != "") { int groupBar, groupPrime; string trashCode = ""; int currentSeqsDiffs = 0; - + if(qFileName != ""){ - if(qThreshold != 0) { success = stripQualThreshold(currSeq, qFile); } - else if(qAverage != 0) { success = cullQualAverage(currSeq, qFile); } - + if(qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, qThreshold); } + else if(qAverage != 0) { success = currQual.cullQualAverage(currSeq, qAverage); } + else if(qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, qRollAverage); } + else if(qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage); } + if (qtrim == 1 && (origSeq.length() != currSeq.getUnaligned().length())) { success = 0; //if you don't want to trim and the sequence does not meet quality requirements, move to scrap } - if(!success) { trashCode += 'q'; } + if(!success) { trashCode += 'q'; } } if(barcodes.size() != 0){ - success = stripBarcode(currSeq, groupBar); - if(success > bdiffs){ trashCode += 'b'; } + success = stripBarcode(currSeq, currQual, groupBar); + if(success > bdiffs) { trashCode += 'b'; } else{ currentSeqsDiffs += success; } } if(numFPrimers != 0){ - success = stripForward(currSeq, groupPrime); - if(success > pdiffs){ trashCode += 'f'; } + success = stripForward(currSeq, currQual, groupPrime); + if(success > pdiffs) { trashCode += 'f'; } else{ currentSeqsDiffs += success; } } - if (currentSeqsDiffs > tdiffs) { trashCode += 't'; } + if (currentSeqsDiffs > tdiffs) { trashCode += 't'; } if(numRPrimers != 0){ - success = stripReverse(currSeq); - if(!success){ trashCode += 'r'; } + success = stripReverse(currSeq, currQual); + if(!success) { trashCode += 'r'; } } if(minLength > 0 || maxLength > 0){ success = cullLength(currSeq); - if(!success){ trashCode += 'l'; } + if(!success) { trashCode += 'l'; } } if(maxHomoP > 0){ success = cullHomoP(currSeq); - if(!success){ trashCode += 'h'; } + if(!success) { trashCode += 'h'; } } if(maxAmbig != -1){ success = cullAmbigs(currSeq); - if(!success){ trashCode += 'n'; } + if(!success) { trashCode += 'n'; } } - if(flip){ currSeq.reverseComplement(); } // should go last + if(flip){ currSeq.reverseComplement(); currQual.flipQScores(); } // should go last if(trashCode.length() == 0){ currSeq.setAligned(currSeq.getUnaligned()); currSeq.printSequence(outFASTA); + currQual.printQScores(outQual); + if(barcodes.size() != 0){ string thisGroup = groupVector[groupBar]; int indexToFastaFile = groupBar; @@ -445,7 +552,11 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string } outGroups << currSeq.getName() << '\t' << thisGroup << endl; if(allFiles){ - currSeq.printSequence(*fastaFileNames[indexToFastaFile]); + currSeq.printSequence(*fastaFileNames[indexToFastaFile]); + + if(qFileName != ""){ + currQual.printQScores(*qualFileNames[indexToFastaFile]); + } } } } @@ -454,22 +565,31 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string currSeq.setUnaligned(origSeq); currSeq.setAligned(origSeq); currSeq.printSequence(scrapFASTA); + currQual.printQScores(scrapQual); } } gobble(inFASTA); + gobble(qFile); } inFASTA.close(); outFASTA.close(); scrapFASTA.close(); if (oligoFile != "") { outGroups.close(); } - if(qFileName != "") { qFile.close(); } + if(qFileName != "") { qFile.close(); scrapQual.close(); outQual.close(); } for(int i=0;iclose(); delete fastaFileNames[i]; } + if(qFileName != ""){ + for(int i=0;iclose(); + delete qualFileNames[i]; + } + } + return 0; } catch(exception& e) { @@ -477,8 +597,10 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string exit(1); } } + /**************************************************************************************************/ -int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFile, string scrapFile, string groupFile, vector fastaNames) { + +int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFile, string scrapFile, string trimQFile, string scrapQFile, string groupFile, vector fastaNames, vector qualNames) { try { #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) int process = 0; @@ -493,7 +615,7 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later process++; }else if (pid == 0){ - driverCreateTrim(filename, qFileName, (trimFile + toString(getpid()) + ".temp"), (scrapFile + toString(getpid()) + ".temp"), (groupFile + toString(getpid()) + ".temp"), fastaNames, lines[process], qLines[process]); + driverCreateTrim(filename, qFileName, (trimFile + toString(getpid()) + ".temp"), (scrapFile + toString(getpid()) + ".temp"), (trimQFile + toString(getpid()) + ".temp"), (scrapQFile + toString(getpid()) + ".temp"), (groupFile + toString(getpid()) + ".temp"), fastaNames, qualNames, lines[process], qLines[process]); exit(0); }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); } } @@ -512,6 +634,7 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName exit(1); } } + /**************************************************************************************************/ int TrimSeqsCommand::setLines(string filename, vector& lines) { @@ -570,7 +693,7 @@ int TrimSeqsCommand::setLines(string filename, vector& lines) { } //*************************************************************************************************************** -void TrimSeqsCommand::getOligos(vector& outFASTAVec){ //vector& outFASTAVec +void TrimSeqsCommand::getOligos(vector& outFASTAVec, vector& outQualVec){ try { ifstream inOligos; openInputFile(oligoFile, inOligos); @@ -617,11 +740,20 @@ void TrimSeqsCommand::getOligos(vector& outFASTAVec){ //vector& outFASTAVec){ //vectormothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); } } @@ -658,9 +794,14 @@ void TrimSeqsCommand::getOligos(vector& outFASTAVec){ //vector::iterator itBar = barcodes.begin(); itBar != barcodes.end(); itBar++) { for (map::iterator itPrime = primers.begin(); itPrime != primers.end(); itPrime++) { if (groupVector[itPrime->second] != "") { //there is a group for this primer - outputNames.push_back((outputDir + getRootName(getSimpleName(fastaFile)) + toString(itBar->second) + "." + groupVector[itBar->second] + "." + toString(itPrime->second) + "." + groupVector[itPrime->second] + ".fasta")); - outFASTAVec.push_back((outputDir + getRootName(getSimpleName(fastaFile)) + toString(itBar->second) + "." + groupVector[itBar->second] + "." + toString(itPrime->second) + "." + groupVector[itPrime->second] + ".fasta")); + outputNames.push_back((outputDir + getRootName(getSimpleName(qFileName)) + groupVector[itBar->second] + "." + groupVector[itPrime->second] + ".fasta")); + outFASTAVec.push_back((outputDir + getRootName(getSimpleName(fastaFile)) + groupVector[itBar->second] + "." + groupVector[itPrime->second] + ".fasta")); combos[(groupVector[itBar->second] + "." + groupVector[itPrime->second])] = outFASTAVec.size()-1; + + if(qFileName != ""){ + outQualVec.push_back((outputDir + getRootName(getSimpleName(qFileName)) + groupVector[itBar->second] + "." + groupVector[itPrime->second] + ".qual")); + outputNames.push_back((outputDir + getRootName(getSimpleName(qFileName)) + groupVector[itBar->second] + "." + groupVector[itPrime->second] + ".qual")); + } } } } @@ -677,7 +818,7 @@ void TrimSeqsCommand::getOligos(vector& outFASTAVec){ //vectorsecond; seq.setUnaligned(rawSequence.substr(oligo.length())); + + if(qual.getName() != ""){ + qual.trimQScores(oligo.length(), -1); + } + success = 0; break; } @@ -776,6 +922,10 @@ int TrimSeqsCommand::stripBarcode(Sequence& seq, int& group){ else{ //use the best match group = minGroup; seq.setUnaligned(rawSequence.substr(minPos)); + + if(qual.getName() != ""){ + qual.trimQScores(minPos, -1); + } success = minDiff; } @@ -796,7 +946,7 @@ int TrimSeqsCommand::stripBarcode(Sequence& seq, int& group){ //*************************************************************************************************************** -int TrimSeqsCommand::stripForward(Sequence& seq, int& group){ +int TrimSeqsCommand::stripForward(Sequence& seq, QualityScores& qual, int& group){ try { string rawSequence = seq.getUnaligned(); int success = pdiffs + 1; //guilty until proven innocent @@ -812,6 +962,10 @@ int TrimSeqsCommand::stripForward(Sequence& seq, int& group){ if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ group = it->second; seq.setUnaligned(rawSequence.substr(oligo.length())); + if(qual.getName() != ""){ + qual.trimQScores(oligo.length(), -1); + + } success = 0; break; } @@ -894,6 +1048,9 @@ int TrimSeqsCommand::stripForward(Sequence& seq, int& group){ else{ //use the best match group = minGroup; seq.setUnaligned(rawSequence.substr(minPos)); + if(qual.getName() != ""){ + qual.trimQScores(minPos, -1); + } success = minDiff; } @@ -912,7 +1069,7 @@ int TrimSeqsCommand::stripForward(Sequence& seq, int& group){ //*************************************************************************************************************** -bool TrimSeqsCommand::stripReverse(Sequence& seq){ +bool TrimSeqsCommand::stripReverse(Sequence& seq, QualityScores& qual){ try { string rawSequence = seq.getUnaligned(); bool success = 0; //guilty until proven innocent @@ -927,6 +1084,9 @@ bool TrimSeqsCommand::stripReverse(Sequence& seq){ if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){ seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length())); + if(qual.getName() != ""){ + qual.trimQScores(-1, rawSequence.length()-oligo.length()); + } success = 1; break; } @@ -1075,102 +1235,76 @@ int TrimSeqsCommand::countDiffs(string oligo, string seq){ } //*************************************************************************************************************** -bool TrimSeqsCommand::stripQualThreshold(Sequence& seq, ifstream& qFile){ - try { +//bool TrimSeqsCommand::stripQualThreshold(Sequence& seq, ifstream& qFile){ +// try { +// // string rawSequence = seq.getUnaligned(); -// int seqLength; // = rawSequence.length(); -// string name, temp, temp2; +// int seqLength = seq.getNumBases(); +// bool success = 0; //guilty until proven innocent +// string name; // // qFile >> name; +// if (name[0] == '>') { if(name.substr(1) != seq.getName()) { m->mothurOut("sequence name mismatch btwn fasta: " + seq.getName() + " and qual file: " + name); m->mothurOutEndLine(); } } +// +// while (!qFile.eof()) { char c = qFile.get(); if (c == 10 || c == 13){ break; } } +// +// int score; +// int end = seqLength; // -// //get rest of line -// temp = ""; -// while (!qFile.eof()) { -// char c = qFile.get(); -// if (c == 10 || c == 13){ break; } -// else { temp += c; } -// } -// -// int pos = temp.find("length"); -// if (pos == temp.npos) { m->mothurOut("Cannot find length in qfile for " + seq.getName()); m->mothurOutEndLine(); seqLength = 0; } -// else { -// string tempLength = temp.substr(pos); -// istringstream iss (tempLength,istringstream::in); -// iss >> temp; +// for(int i=0;i> score; +// +// if(score < qThreshold){ +// end = i; +// break; +// } +// } +// for(int i=end+1;i> score; // } // -// splitAtEquals(temp2, temp); //separates length=242, temp=length, temp2=242 -// convert(temp, seqLength); //converts string to int -// -// if (name.length() != 0) { if(name.substr(1) != seq.getName()) { m->mothurOut("sequence name mismatch btwn fasta and qual file"); m->mothurOutEndLine(); } } - - string rawSequence = seq.getUnaligned(); - int seqLength = seq.getNumBases(); - bool success = 0; //guilty until proven innocent - string name; - - qFile >> name; - if (name[0] == '>') { if(name.substr(1) != seq.getName()) { m->mothurOut("sequence name mismatch btwn fasta: " + seq.getName() + " and qual file: " + name); m->mothurOutEndLine(); } } - - while (!qFile.eof()) { char c = qFile.get(); if (c == 10 || c == 13){ break; } } - - int score; - int end = seqLength; - - for(int i=0;i> score; - - if(score < qThreshold){ - end = i; - break; - } - } - for(int i=end+1;i> score; - } - - seq.setUnaligned(rawSequence.substr(0,end)); - - return 1; - } - catch(exception& e) { - m->errorOut(e, "TrimSeqsCommand", "stripQualThreshold"); - exit(1); - } -} +// seq.setUnaligned(rawSequence.substr(0,end)); +// +// return 1; +// } +// catch(exception& e) { +// m->errorOut(e, "TrimSeqsCommand", "stripQualThreshold"); +// exit(1); +// } +//} //*************************************************************************************************************** -bool TrimSeqsCommand::cullQualAverage(Sequence& seq, ifstream& qFile){ - try { - string rawSequence = seq.getUnaligned(); - int seqLength = seq.getNumBases(); - bool success = 0; //guilty until proven innocent - string name; - - qFile >> name; - if (name[0] == '>') { if(name.substr(1) != seq.getName()) { m->mothurOut("sequence name mismatch btwn fasta: " + seq.getName() + " and qual file: " + name); m->mothurOutEndLine(); } } - - while (!qFile.eof()) { char c = qFile.get(); if (c == 10 || c == 13){ break; } } - - float score; - float average = 0; - - for(int i=0;i> score; - average += score; - } - average /= seqLength; - - if(average >= qAverage) { success = 1; } - else { success = 0; } - - return success; - } - catch(exception& e) { - m->errorOut(e, "TrimSeqsCommand", "cullQualAverage"); - exit(1); - } -} +//bool TrimSeqsCommand::cullQualAverage(Sequence& seq, ifstream& qFile){ +// try { +// string rawSequence = seq.getUnaligned(); +// int seqLength = seq.getNumBases(); +// bool success = 0; //guilty until proven innocent +// string name; +// +// qFile >> name; +// if (name[0] == '>') { if(name.substr(1) != seq.getName()) { m->mothurOut("sequence name mismatch btwn fasta: " + seq.getName() + " and qual file: " + name); m->mothurOutEndLine(); } } +// +// while (!qFile.eof()) { char c = qFile.get(); if (c == 10 || c == 13){ break; } } +// +// float score; +// float average = 0; +// +// for(int i=0;i> score; +// average += score; +// } +// average /= seqLength; +// +// if(average >= qAverage) { success = 1; } +// else { success = 0; } +// +// return success; +// } +// catch(exception& e) { +// m->errorOut(e, "TrimSeqsCommand", "cullQualAverage"); +// exit(1); +// } +//} //*************************************************************************************************************** diff --git a/trimseqscommand.h b/trimseqscommand.h index ab692d3..a904a10 100644 --- a/trimseqscommand.h +++ b/trimseqscommand.h @@ -13,6 +13,7 @@ #include "mothur.h" #include "command.hpp" #include "sequence.hpp" +#include "qualityscores.h" class TrimSeqsCommand : public Command { public: @@ -29,23 +30,23 @@ private: linePair(unsigned long int i, int j) : start(i), num(j) {} }; - void getOligos(vector&); - bool stripQualThreshold(Sequence&, ifstream&); - bool cullQualAverage(Sequence&, ifstream&); - int stripBarcode(Sequence&, int&); - int stripForward(Sequence&, int&); - bool stripReverse(Sequence&); + void getOligos(vector&, vector&); + int stripBarcode(Sequence&, QualityScores&, int&); + int stripForward(Sequence&, QualityScores&, int&); + bool stripReverse(Sequence&, QualityScores&); bool cullLength(Sequence&); bool cullHomoP(Sequence&); bool cullAmbigs(Sequence&); bool compareDNASeq(string, string); - int countDiffs(string, string);//, int, int&, int); + int countDiffs(string, string); bool abort; string fastaFile, oligoFile, qFileName, outputDir; bool flip, allFiles, qtrim; - int numFPrimers, numRPrimers, maxAmbig, maxHomoP, minLength, maxLength, qThreshold, qAverage, processors, tdiffs, bdiffs, pdiffs, comboStarts; + int numFPrimers, numRPrimers, maxAmbig, maxHomoP, minLength, maxLength, processors, tdiffs, bdiffs, pdiffs, comboStarts; + int qWindowSize, qWindowStep; + double qRollAverage, qThreshold, qWindowAverage, qAverage; vector revPrimer, outputNames; set filesToRemove; map barcodes; @@ -57,8 +58,8 @@ private: vector lines; vector qLines; - int driverCreateTrim(string, string, string, string, string, vector, linePair*, linePair*); - int createProcessesCreateTrim(string, string, string, string, string, vector); + int driverCreateTrim(string, string, string, string, string, string, string, vector, vector, linePair*, linePair*); + int createProcessesCreateTrim(string, string, string, string, string, string, string, vector, vector); int setLines(string, vector&); }; -- 2.39.2