From 7a18f1438113f6c52c66f97ae0044cfa365dd221 Mon Sep 17 00:00:00 2001 From: Pat Schloss Date: Thu, 3 Jan 2013 08:45:00 -0500 Subject: [PATCH] Changes to seq.error so that an alignment is not necessary --- aligncommand.cpp | 12 +- chimeraperseuscommand.cpp | 4 +- qualityscores.cpp | 4 +- qualityscores.h | 2 +- refchimeratest.cpp | 421 +++++++++++++++++++++++++++++++++++++- refchimeratest.h | 19 +- seqerrorcommand.cpp | 310 +++++++++++++++++----------- trimseqscommand.cpp | 6 +- 8 files changed, 635 insertions(+), 143 deletions(-) diff --git a/aligncommand.cpp b/aligncommand.cpp index 9d4a609..a871244 100644 --- a/aligncommand.cpp +++ b/aligncommand.cpp @@ -27,8 +27,8 @@ vector AlignCommand::setParameters(){ CommandParameter pmatch("match", "Number", "", "1.0", "", "", "","",false,false); parameters.push_back(pmatch); CommandParameter palign("align", "Multiple", "needleman-gotoh-blast-noalign", "needleman", "", "", "","",false,false,true); parameters.push_back(palign); CommandParameter pmismatch("mismatch", "Number", "", "-1.0", "", "", "","",false,false); parameters.push_back(pmismatch); - CommandParameter pgapopen("gapopen", "Number", "", "-2.0", "", "", "","",false,false); parameters.push_back(pgapopen); - CommandParameter pgapextend("gapextend", "Number", "", "-1.0", "", "", "","",false,false); parameters.push_back(pgapextend); + CommandParameter pgapopen("gapopen", "Number", "", "-5.0", "", "", "","",false,false); parameters.push_back(pgapopen); + CommandParameter pgapextend("gapextend", "Number", "", "-2.0", "", "", "","",false,false); parameters.push_back(pgapextend); CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors); CommandParameter pflip("flip", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pflip); CommandParameter psave("save", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(psave); @@ -57,8 +57,8 @@ string AlignCommand::getHelpString(){ helpString += "The ksize parameter allows you to specify the kmer size for finding most similar template to candidate. The default is 8."; helpString += "The match parameter allows you to specify the bonus for having the same base. The default is 1.0."; helpString += "The mistmatch parameter allows you to specify the penalty for having different bases. The default is -1.0."; - helpString += "The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -2.0."; - helpString += "The gapextend parameter allows you to specify the penalty for extending a gap in an alignment. The default is -1.0."; + helpString += "The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -5.0."; + helpString += "The gapextend parameter allows you to specify the penalty for extending a gap in an alignment. The default is -2.0."; helpString += "The flip parameter is used to specify whether or not you want mothur to try the reverse complement if a sequence falls below the threshold. The default is false."; helpString += "The threshold is used to specify a cutoff at which an alignment is deemed 'bad' and the reverse complement may be tried. The default threshold is 0.50, meaning 50% of the bases are removed in the alignment."; helpString += "If the flip parameter is set to true the reverse complement of the sequence is aligned and the better alignment is reported."; @@ -249,10 +249,10 @@ AlignCommand::AlignCommand(string option) { temp = validParameter.validFile(parameters, "mismatch", false); if (temp == "not found"){ temp = "-1.0"; } m->mothurConvert(temp, misMatch); - temp = validParameter.validFile(parameters, "gapopen", false); if (temp == "not found"){ temp = "-2.0"; } + temp = validParameter.validFile(parameters, "gapopen", false); if (temp == "not found"){ temp = "-5.0"; } m->mothurConvert(temp, gapOpen); - temp = validParameter.validFile(parameters, "gapextend", false); if (temp == "not found"){ temp = "-1.0"; } + temp = validParameter.validFile(parameters, "gapextend", false); if (temp == "not found"){ temp = "-2.0"; } m->mothurConvert(temp, gapExtend); temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } diff --git a/chimeraperseuscommand.cpp b/chimeraperseuscommand.cpp index e1ad14e..1e5f9d4 100644 --- a/chimeraperseuscommand.cpp +++ b/chimeraperseuscommand.cpp @@ -911,8 +911,8 @@ int ChimeraPerseusCommand::driver(string chimeraFileName, vector& seque type = "chimera"; chimeraRefSeq = myPerseus.stitchBimera(alignments, leftParentBi, rightParentBi, breakPointBi, leftMaps, rightMaps); } - ; - if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; } + + if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; } double chimeraDist = myPerseus.modeledPairwiseAlignSeqs(sequences[i].sequence, chimeraRefSeq, dummyA, dummyB, correctModel); diff --git a/qualityscores.cpp b/qualityscores.cpp index 566b44c..0b7bd06 100644 --- a/qualityscores.cpp +++ b/qualityscores.cpp @@ -423,13 +423,13 @@ void QualityScores::updateReverseMap(vector >& reverseMap, int start try { int index = 0; - for(int i=stop-1;i>=start;i--){ + for(int i=stop-1;i>=start-1;i--){ reverseMap[index++][qScores[i]] += weight; } } catch(exception& e) { - m->errorOut(e, "QualityScores", "updateForwardMap"); + m->errorOut(e, "QualityScores", "updateReverseMap"); exit(1); } } diff --git a/qualityscores.h b/qualityscores.h index 49034b8..77c3ee0 100644 --- a/qualityscores.h +++ b/qualityscores.h @@ -24,7 +24,7 @@ public: QualityScores(); QualityScores(ifstream&); string getName(); - + int getLength(){ return (int)qScores.size(); } vector getQualityScores() { return qScores; } void printQScores(ofstream&); void trimQScores(int, int); diff --git a/refchimeratest.cpp b/refchimeratest.cpp index ab55ca4..178d0c4 100644 --- a/refchimeratest.cpp +++ b/refchimeratest.cpp @@ -8,6 +8,7 @@ */ #include "refchimeratest.h" +#include "myPerseus.h" #include "mothur.h" int MAXINT = numeric_limits::max(); @@ -41,9 +42,27 @@ int RefChimeraTest::printHeader(ofstream& chimeraReportFile){ exit(1); } } + //*************************************************************************************************************** int RefChimeraTest::analyzeQuery(string queryName, string querySeq, ofstream& chimeraReportFile){ + + int numParents = -1; + + if(aligned){ + numParents = analyzeAlignedQuery(queryName, querySeq, chimeraReportFile); + } + else{ + numParents = analyzeUnalignedQuery(queryName, querySeq, chimeraReportFile); + } + + return numParents; + +} + +//*************************************************************************************************************** + +int RefChimeraTest::analyzeAlignedQuery(string queryName, string querySeq, ofstream& chimeraReportFile){ vector > left(numRefSeqs); vector singleLeft, bestLeft; @@ -55,7 +74,9 @@ int RefChimeraTest::analyzeQuery(string queryName, string querySeq, ofstream& ch } right = left; - int bestSequenceMismatch = getMismatches(querySeq, left, right, bestMatch); + int bestSequenceMismatch = getAlignedMismatches(querySeq, left, right, bestMatch); + + int leftParentBi, rightParentBi, breakPointBi; int minMismatchToChimera = getChimera(left, right, leftParentBi, rightParentBi, breakPointBi, singleLeft, bestLeft, singleRight, bestRight); @@ -72,8 +93,10 @@ int RefChimeraTest::analyzeQuery(string queryName, string querySeq, ofstream& ch chimeraRefSeq = referenceSeqs[bestMatch]; } - - double distToChimera = calcDistToChimera(querySeq, chimeraRefSeq); + bestRefAlignment = chimeraRefSeq; + bestQueryAlignment = querySeq; + + double distToChimera = calcDistToChimera(bestQueryAlignment, bestRefAlignment); chimeraReportFile << queryName << '\t' << referenceNames[bestMatch] << '\t' << bestSequenceMismatch << '\t'; chimeraReportFile << referenceNames[leftParentBi] << ',' << referenceNames[rightParentBi] << '\t' << breakPointBi << '\t'; @@ -83,9 +106,383 @@ int RefChimeraTest::analyzeQuery(string queryName, string querySeq, ofstream& ch return nMera; } +//*************************************************************************************************************** + +int RefChimeraTest::analyzeUnalignedQuery(string queryName, string querySeq, ofstream& chimeraReportFile){ + + int nMera = 0; + + int seqLength = querySeq.length(); + + vector queryAlign(numRefSeqs); + vector refAlign(numRefSeqs); + + vector > leftDiffs(numRefSeqs, 0); + vector > rightDiffs(numRefSeqs, 0); + vector > leftMaps(numRefSeqs, 0); + vector > rightMaps(numRefSeqs, 0); + + int bestRefIndex = -1; + int bestRefDiffs = numeric_limits::max(); + double bestRefLength = 0; + + for(int i=0;i= 3){ + for(int i=0;i singleLeft(seqLength, numeric_limits::max()); + vector bestLeft(seqLength, -1); + + for(int l=0;l singleRight(seqLength, numeric_limits::max()); + vector bestRight(seqLength, -1); + + for(int l=0;l::max(); + int leftParent = 0; + int rightParent = 0; + int breakPoint = 0; + + for(int l=0;l= 3){// || (minMismatchToChimera == 0 && bestSequenceMismatch != 0)){ + nMera = 2; + + int breakLeft = leftMaps[leftParent][breakPoint]; + int breakRight = rightMaps[rightParent][rightMaps[rightParent].size() - breakPoint - 2]; + + string left = refAlign[leftParent]; + string right = refAlign[rightParent]; + + for(int i=0;i<=breakLeft;i++){ + + if (m->control_pressed) { return 0; } + + if(left[i] != '-' && left[i] != '.'){ + reference += left[i]; + } + } + + + for(int i=breakRight;icontrol_pressed) { return 0; } + + if(right[i] != '-' && right[i] != '.'){ + reference += right[i]; + } + } + + } + else{ + nMera = 1; + reference = referenceSeqs[bestRefIndex]; + } + + double alignLength; + double finalDiffs = alignQueryToReferences(querySeq, reference, bestQueryAlignment, bestRefAlignment, alignLength); + double finalDistance = finalDiffs / alignLength; + + chimeraReportFile << queryName << '\t' << referenceNames[bestRefIndex] << '\t' << bestRefDiffs << '\t'; + chimeraReportFile << referenceNames[leftParent] << ',' << referenceNames[rightParent] << '\t' << breakPoint << '\t'; + chimeraReportFile << bestChimeraMismatches << '\t'; + chimeraReportFile << '\t' << finalDistance << '\t' << nMera << endl; + } + else{ + bestQueryAlignment = queryAlign[bestRefIndex]; + bestRefAlignment = refAlign[bestRefIndex]; + nMera = 1; + + chimeraReportFile << queryName << '\t' << referenceNames[bestRefIndex] << '\t' << bestRefDiffs << '\t'; + chimeraReportFile << "NA\tNA\tNA\tNA\t1" << endl; + } + + bestMatch = bestRefIndex; + return nMera; +} + /**************************************************************************************************/ -int RefChimeraTest::getMismatches(string& querySeq, vector >& left, vector >& right, int& bestRefSeq){ +double RefChimeraTest::alignQueryToReferences(string query, string reference, string& qAlign, string& rAlign, double& length){ + + + try { + double GAP = -5; + double MATCH = 1; + double MISMATCH = -1; + + int queryLength = query.length(); + int refLength = reference.length(); + + vector > alignMatrix(queryLength + 1); + vector > alignMoves(queryLength + 1); + + for(int i=0;i<=queryLength;i++){ + if (m->control_pressed) { return 0; } + alignMatrix[i].resize(refLength + 1, 0); + alignMoves[i].resize(refLength + 1, 'x'); + } + + for(int i=0;i<=queryLength;i++){ + if (m->control_pressed) { return 0; } + alignMatrix[i][0] = 0;//GAP * i; + alignMoves[i][0] = 'u'; + } + + for(int i=0;i<=refLength;i++){ + if (m->control_pressed) { return 0; } + alignMatrix[0][i] = 0;//GAP * i; + alignMoves[0][i] = 'l'; + } + + for(int i=1;i<=queryLength;i++){ + + if (m->control_pressed) { return 0; } + + for(int j=1;j<=refLength;j++){ + + double nogapScore; + if(query[i-1] == reference[j-1]){ nogapScore = alignMatrix[i-1][j-1] + MATCH; } + else { nogapScore = alignMatrix[i-1][j-1] + MISMATCH; } + + double leftScore; + if(i == queryLength) { leftScore = alignMatrix[i][j-1]; } + else { leftScore = alignMatrix[i][j-1] + GAP; } + + + double upScore; + if(j == refLength) { upScore = alignMatrix[i-1][j]; } + else { upScore = alignMatrix[i-1][j] + GAP; } + + if(nogapScore > leftScore){ + if(nogapScore > upScore){ + alignMoves[i][j] = 'd'; + alignMatrix[i][j] = nogapScore; + } + else{ + alignMoves[i][j] = 'u'; + alignMatrix[i][j] = upScore; + } + } + else{ + if(leftScore > upScore){ + alignMoves[i][j] = 'l'; + alignMatrix[i][j] = leftScore; + } + else{ + alignMoves[i][j] = 'u'; + alignMatrix[i][j] = upScore; + } + } + } + } + + int end = refLength - 1; + int maxRow = 0; + double maxRowValue = -100000000000; + for(int i=0;i maxRowValue){ + maxRow = i; + maxRowValue = alignMatrix[i][end]; + } + } + + end = queryLength - 1; + int maxColumn = 0; + double maxColumnValue = -100000000000; + + for(int j=0;j maxColumnValue){ + maxColumn = j; + maxColumnValue = alignMatrix[end][j]; + } + } + + int row = queryLength-1; + int column = refLength-1; + + if(maxColumn == column && maxRow == row){} // if the max values are the lower right corner, then we're good + else if(alignMatrix[row][maxColumn] < alignMatrix[maxRow][column]){ + for(int i=maxRow+1;i 0 && j > 0){ + + if (m->control_pressed) { return 0; } + + if(alignMoves[i][j] == 'd'){ + qAlign = query[i-1] + qAlign; + rAlign = reference[j-1] + rAlign; + + if(query[i-1] != reference[j-1]){ diffs++; } + length++; + + i--; + j--; + } + else if(alignMoves[i][j] == 'u'){ + qAlign = query[i-1] + qAlign; + + if(j != refLength) { rAlign = '-' + rAlign; diffs++; length++; } + else { rAlign = '.' + rAlign; } + i--; + } + else if(alignMoves[i][j] == 'l'){ + rAlign = reference[j-1] + rAlign; + + if(i != queryLength){ qAlign = '-' + qAlign; diffs++; length++; } + else { qAlign = '.' + qAlign; } + j--; + } + } + + if(i>0){ + qAlign = query.substr(0, i) + qAlign; + rAlign = string(i, '.') + rAlign; + } + else if(j>0){ + qAlign = string(j, '.') + qAlign; + rAlign = reference.substr(0, j) + rAlign; + } + + + return diffs; + } + catch(exception& e) { + m->errorOut(e, "RefChimeraTest", "alignQueryToReferences"); + exit(1); + } +} + +/**************************************************************************************************/ + +int RefChimeraTest::getUnalignedDiffs(string qAlign, string rAlign, vector& leftDiffs, vector& leftMap, vector& rightDiffs, vector& rightMap){ + try { + int alignLength = qAlign.length(); + + int lDiffs = 0; + int lCount = 0; + for(int l=0;lcontrol_pressed) { return 0; } + + if(qAlign[l] == '-'){ + lDiffs++; + } + else if(qAlign[l] != '.'){ + + if(rAlign[l] == '-'){ + lDiffs++; + } + else if(qAlign[l] != rAlign[l]){;// && rAlign[l] != '.'){ + lDiffs++; + } + leftDiffs[lCount] = lDiffs; + leftMap[lCount] = l; + + lCount++; + } + } + + int rDiffs = 0; + int rCount = 0; + for(int l=alignLength-1;l>=0;l--){ + + if (m->control_pressed) { return 0; } + + if(qAlign[l] == '-'){ + rDiffs++; + } + else if(qAlign[l] != '.'){ + + if(rAlign[l] == '-'){ + rDiffs++; + } + else if(qAlign[l] != rAlign[l]){;// && rAlign[l] != '.'){ + rDiffs++; + } + + rightDiffs[rCount] = rDiffs; + rightMap[rCount] = l; + rCount++; + } + + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "RefChimeraTest", "getUnalignedDiffs"); + exit(1); + } +} + +/**************************************************************************************************/ + +int RefChimeraTest::getAlignedMismatches(string& querySeq, vector >& left, vector >& right, int& bestRefSeq){ int bestSequenceMismatch = MAXINT; @@ -262,3 +659,19 @@ int RefChimeraTest::getClosestRefIndex(){ } //*************************************************************************************************************** + +string RefChimeraTest::getQueryAlignment(){ + + return bestQueryAlignment; + +} + +//*************************************************************************************************************** + +string RefChimeraTest::getClosestRefAlignment(){ + + return bestRefAlignment; + +} + +//*************************************************************************************************************** diff --git a/refchimeratest.h b/refchimeratest.h index 41df4eb..5ea0801 100644 --- a/refchimeratest.h +++ b/refchimeratest.h @@ -16,13 +16,22 @@ class RefChimeraTest { public: - RefChimeraTest(vector&, bool); + RefChimeraTest(){}; + RefChimeraTest(vector&, bool); int printHeader(ofstream&); - int analyzeQuery(string, string, ofstream&); + int analyzeQuery(string, string, ofstream&); int getClosestRefIndex(); + string getClosestRefAlignment(); + string getQueryAlignment(); + private: - int getMismatches(string&, vector >&, vector >&, int&); - int getChimera(vector >&, vector >&, int&, int&, int&, vector&, vector&, vector&, vector&); + int getAlignedMismatches(string&, vector >&, vector >&, int&); + int analyzeAlignedQuery(string, string, ofstream&); + int analyzeUnalignedQuery(string, string, ofstream&); + double alignQueryToReferences(string, string, string&, string&, double&); + int getUnalignedDiffs(string, string, vector&, vector&, vector&, vector&); + + int getChimera(vector >&, vector >&, int&, int&, int&, vector&, vector&, vector&, vector&); int getTrimera(vector >&, vector >&, int&, int&, int&, int&, int&, vector&, vector&, vector&, vector&); string stitchBimera(int, int, int); string stitchTrimera(int, int, int, int, int); @@ -33,6 +42,8 @@ private: int numRefSeqs; int alignLength; int bestMatch; + string bestRefAlignment; + string bestQueryAlignment; //ofstream chimeraReportFile; bool aligned; diff --git a/seqerrorcommand.cpp b/seqerrorcommand.cpp index 4ff1b6f..1fe60e8 100644 --- a/seqerrorcommand.cpp +++ b/seqerrorcommand.cpp @@ -11,10 +11,12 @@ #include "reportfile.h" #include "qualityscores.h" #include "refchimeratest.h" +#include "myPerseus.h" #include "filterseqscommand.h" //********************************************************************************************************************** + vector SeqErrorCommand::setParameters(){ try { CommandParameter pquery("fasta", "InputTypes", "", "", "none", "none", "none","errorType",false,true,true); parameters.push_back(pquery); @@ -24,7 +26,7 @@ vector SeqErrorCommand::setParameters(){ CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none","",false,false); parameters.push_back(pname); CommandParameter pignorechimeras("ignorechimeras", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pignorechimeras); CommandParameter pthreshold("threshold", "Number", "", "1.0", "", "", "","",false,false); parameters.push_back(pthreshold); - CommandParameter paligned("aligned", "Boolean", "T", "", "", "", "","",false,false); parameters.push_back(paligned); + CommandParameter paligned("aligned", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(paligned); CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors); CommandParameter psave("save", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(psave); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); @@ -39,7 +41,9 @@ vector SeqErrorCommand::setParameters(){ exit(1); } } + //********************************************************************************************************************** + string SeqErrorCommand::getHelpString(){ try { string helpString = ""; @@ -63,7 +67,9 @@ string SeqErrorCommand::getHelpString(){ exit(1); } } + //********************************************************************************************************************** + string SeqErrorCommand::getOutputPattern(string type) { try { string pattern = ""; @@ -88,7 +94,9 @@ string SeqErrorCommand::getOutputPattern(string type) { exit(1); } } + //********************************************************************************************************************** + SeqErrorCommand::SeqErrorCommand(){ try { abort = true; calledHelp = true; @@ -111,6 +119,7 @@ SeqErrorCommand::SeqErrorCommand(){ exit(1); } } + //*************************************************************************************************************** SeqErrorCommand::SeqErrorCommand(string option) { @@ -228,12 +237,6 @@ SeqErrorCommand::SeqErrorCommand(string option) { if(reportFileName == "not found"){ reportFileName = ""; } else if (reportFileName == "not open") { reportFileName = ""; abort = true; } - if((reportFileName != "" && qualFileName == "") || (reportFileName == "" && qualFileName != "")){ - m->mothurOut("if you use either a qual file or a report file, you have to have both."); - m->mothurOutEndLine(); - abort = true; - } - outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; @@ -245,12 +248,6 @@ SeqErrorCommand::SeqErrorCommand(string option) { temp = validParameter.validFile(parameters, "threshold", false); if (temp == "not found") { temp = "1.00"; } m->mothurConvert(temp, threshold); - temp = validParameter.validFile(parameters, "aligned", true); if (temp == "not found"){ temp = "t"; } - aligned = m->isTrue(temp); -// rdb->aligned = aligned; #do we need these lines for aligned? -// if (aligned) { //clear out old references -// rdb->clearMemory(); -// } temp = validParameter.validFile(parameters, "save", false); if (temp == "not found"){ temp = "f"; } save = m->isTrue(temp); @@ -277,6 +274,9 @@ SeqErrorCommand::SeqErrorCommand(string option) { temp = validParameter.validFile(parameters, "ignorechimeras", false); if (temp == "not found") { temp = "T"; } ignoreChimeras = m->isTrue(temp); + temp = validParameter.validFile(parameters, "aligned", false); if (temp == "not found"){ temp = "t"; } + aligned = m->isTrue(temp); + temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } m->setProcessors(temp); m->mothurConvert(temp, processors); @@ -288,6 +288,22 @@ SeqErrorCommand::SeqErrorCommand(string option) { vector files; files.push_back(queryFileName); parser.getNameFile(files); } + + if(aligned == true){ + if((reportFileName != "" && qualFileName == "") || (reportFileName == "" && qualFileName != "")){ + m->mothurOut("if you use either a qual file or a report file, you have to have both."); + m->mothurOutEndLine(); + abort = true; + } + } + else{ + if(reportFileName != ""){ + m->mothurOut("we are ignoring the report file if your sequences are not aligned. we will check that the sequences in your fasta and and qual fileare the same length."); + m->mothurOutEndLine(); + } + } + + } } catch(exception& e) { @@ -295,6 +311,7 @@ SeqErrorCommand::SeqErrorCommand(string option) { exit(1); } } + //*************************************************************************************************************** int SeqErrorCommand::execute(){ @@ -302,7 +319,7 @@ int SeqErrorCommand::execute(){ if (abort == true) { if (calledHelp) { return 0; } return 2; } int start = time(NULL); - maxLength = 2000; + maxLength = 5000; totalBases = 0; totalMatches = 0; @@ -333,22 +350,23 @@ int SeqErrorCommand::execute(){ for (int i = 0; i < (fastaFilePos.size()-1); i++) { lines.push_back(linePair(fastaFilePos[i], fastaFilePos[(i+1)])); if (qualFileName != "") { qLines.push_back(linePair(qFilePos[i], qFilePos[(i+1)])); } - if (reportFileName != "") { rLines.push_back(linePair(reportFilePos[i], reportFilePos[(i+1)])); } + if (reportFileName != "" && aligned == true) { rLines.push_back(linePair(reportFilePos[i], reportFilePos[(i+1)])); } } if(qualFileName == "") { qLines = lines; rLines = lines; } //fills with duds - + if(aligned == false){ rLines = lines; } int numSeqs = 0; #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) if(processors == 1){ numSeqs = driver(queryFileName, qualFileName, reportFileName, errorSummaryFileName, errorSeqFileName, errorChimeraFileName, lines[0], qLines[0], rLines[0]); + }else{ numSeqs = createProcesses(queryFileName, qualFileName, reportFileName, errorSummaryFileName, errorSeqFileName, errorChimeraFileName); } #else numSeqs = driver(queryFileName, qualFileName, reportFileName, errorSummaryFileName, errorSeqFileName, errorChimeraFileName, lines[0], qLines[0], rLines[0]); #endif - - if(qualFileName != "" && reportFileName != ""){ + + if(qualFileName != ""){ printErrorQuality(qScoreErrorMap); printQualityFR(qualForwardMap, qualReverseMap); } @@ -370,19 +388,21 @@ int SeqErrorCommand::execute(){ } errorCountFile.close(); - if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } +// if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } printSubMatrix(); - + string megAlignmentFileName = getOutputFileName("errorref-query",variables); ofstream megAlignmentFile; m->openOutputFile(megAlignmentFileName, megAlignmentFile); outputNames.push_back(megAlignmentFileName); outputTypes["errorref-query"].push_back(megAlignmentFileName); - + for(int i=0;imothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences."); m->mothurOutEndLine(); @@ -399,7 +419,9 @@ int SeqErrorCommand::execute(){ exit(1); } } + //********************************************************************************************************************** + int SeqErrorCommand::createProcesses(string filename, string qFileName, string rFileName, string summaryFileName, string errorOutputFileName, string chimeraOutputFileName) { try { int process = 1; @@ -637,7 +659,9 @@ int SeqErrorCommand::createProcesses(string filename, string qFileName, string r exit(1); } } + //********************************************************************************************************************** + int SeqErrorCommand::driver(string filename, string qFileName, string rFileName, string summaryFileName, string errorOutputFileName, string chimeraOutputFileName, linePair line, linePair qline, linePair rline) { try { @@ -669,11 +693,12 @@ int SeqErrorCommand::driver(string filename, string qFileName, string rFileName, //open inputfiles and go to beginning place for this processor ifstream queryFile; m->openInputFile(filename, queryFile); + queryFile.seekg(line.start); ifstream reportFile; ifstream qualFile; - if(qFileName != "" && rFileName != ""){ + if((qFileName != "" && rFileName != "" && aligned)){ m->openInputFile(qFileName, qualFile); qualFile.seekg(qline.start); @@ -691,13 +716,28 @@ int SeqErrorCommand::driver(string filename, string qFileName, string rFileName, qualReverseMap[i].assign(41,0); } } - + else if(qFileName != "" && !aligned){ + + m->openInputFile(qFileName, qualFile); + qualFile.seekg(qline.start); + + qualForwardMap.resize(maxLength); + qualReverseMap.resize(maxLength); + for(int i=0;iopenOutputFile(chimeraOutputFileName, outChimeraReport); - RefChimeraTest chimeraTest(referenceSeqs, aligned); - if (line.start == 0) { chimeraTest.printHeader(outChimeraReport); } + RefChimeraTest chimeraTest; + + chimeraTest = RefChimeraTest(referenceSeqs, aligned); + if (line.start == 0) { chimeraTest.printHeader(outChimeraReport); } + ofstream errorSummaryFile; m->openOutputFile(summaryFileName, errorSummaryFile); @@ -706,26 +746,34 @@ int SeqErrorCommand::driver(string filename, string qFileName, string rFileName, ofstream errorSeqFile; m->openOutputFile(errorOutputFileName, errorSeqFile); - megaAlignVector.resize(numRefs, ""); + megaAlignVector.assign(numRefs, ""); int index = 0; bool ignoreSeq = 0; bool moreSeqs = 1; while (moreSeqs) { - - if (m->control_pressed) { queryFile.close(); if(qFileName != "" && rFileName != ""){ reportFile.close(); qualFile.close(); } outChimeraReport.close(); errorSummaryFile.close();errorSeqFile.close(); return 0; } - + Sequence query(queryFile); - - int numParentSeqs = chimeraTest.analyzeQuery(query.getName(), query.getAligned(), outChimeraReport); - int closestRefIndex = chimeraTest.getClosestRefIndex(); - + Sequence reference; + int numParentSeqs = -1; + int closestRefIndex = -1; + + numParentSeqs = chimeraTest.analyzeQuery(query.getName(), query.getAligned(), outChimeraReport); + + closestRefIndex = chimeraTest.getClosestRefIndex(); + + reference = referenceSeqs[closestRefIndex]; + + reference.setAligned(chimeraTest.getClosestRefAlignment()); + query.setAligned(chimeraTest.getQueryAlignment()); + if(numParentSeqs > 1 && ignoreChimeras == 1) { ignoreSeq = 1; } - else { ignoreSeq = 0; } - + else { ignoreSeq = 0; } + Compare minCompare; - getErrors(query, referenceSeqs[closestRefIndex], minCompare); + + getErrors(query, reference, minCompare); if(namesFileName != ""){ it = weights.find(query.getName()); @@ -733,21 +781,20 @@ int SeqErrorCommand::driver(string filename, string qFileName, string rFileName, } else{ minCompare.weight = 1; } + printErrorData(minCompare, numParentSeqs, errorSummaryFile, errorSeqFile); if(!ignoreSeq){ - for(int i=0;i maxMismatch){ @@ -792,12 +851,15 @@ int SeqErrorCommand::driver(string filename, string qFileName, string rFileName, if(index % 100 == 0){ m->mothurOut(toString(index)); m->mothurOutEndLine(); } } queryFile.close(); - if(qFileName != "" && rFileName != ""){ reportFile.close(); qualFile.close(); } - errorSummaryFile.close(); + outChimeraReport.close(); + errorSummaryFile.close(); errorSeqFile.close(); - + + if(qFileName != "" && rFileName != "") { reportFile.close(); qualFile.close(); } + else if(qFileName != "" && aligned == false){ qualFile.close(); } + //report progress - if(index % 100 != 0){ m->mothurOut(toString(index)); m->mothurOutEndLine(); } + m->mothurOut(toString(index)); m->mothurOutEndLine(); return index; } @@ -806,6 +868,7 @@ int SeqErrorCommand::driver(string filename, string qFileName, string rFileName, exit(1); } } + //*************************************************************************************************************** void SeqErrorCommand::getReferences(){ @@ -901,9 +964,10 @@ int SeqErrorCommand::getErrors(Sequence query, Sequence reference, Compare& erro int started = 0; //Compare errors; - + + errors.sequence = ""; for(int i=0;i SeqErrorCommand::getWeights(){ nameCountMap[seqName] = m->getNumNames(redundantSeqs); m->gobble(nameFile); } + + nameFile.close(); + return nameCountMap; } - //*************************************************************************************************************** void SeqErrorCommand::printErrorHeader(ofstream& errorSummaryFile){ @@ -1139,6 +1205,7 @@ void SeqErrorCommand::printSubMatrix(){ exit(1); } } + //*************************************************************************************************************** void SeqErrorCommand::printErrorFRFile(map > errorForward, map > errorReverse){ @@ -1207,12 +1274,11 @@ void SeqErrorCommand::printErrorQuality(map > qScoreErrorMap){ errorQualityFile.close(); } catch(exception& e) { - m->errorOut(e, "SeqErrorCommand", "printErrorFRFile"); + m->errorOut(e, "SeqErrorCommand", "printErrorQuality"); exit(1); } } - //*************************************************************************************************************** void SeqErrorCommand::printQualityFR(vector > qualForwardMap, vector > qualReverseMap){ @@ -1266,11 +1332,12 @@ void SeqErrorCommand::printQualityFR(vector > qualForwardMap, vector qualityReverseFile.close(); } catch(exception& e) { - m->errorOut(e, "SeqErrorCommand", "printErrorFRFile"); + m->errorOut(e, "SeqErrorCommand", "printQualityFR"); exit(1); } } + /**************************************************************************************************/ int SeqErrorCommand::setLines(string filename, string qfilename, string rfilename, vector& fastaFilePos, vector& qfileFilePos, vector& rfileFilePos) { @@ -1349,65 +1416,65 @@ int SeqErrorCommand::setLines(string filename, string qfilename, string rfilenam qfileFilePos.push_back(size); - //seach for filePos of each first name in the rfile and save in rfileFilePos - string junk; - ifstream inR; - m->openInputFile(rfilename, inR); - - //read column headers - for (int i = 0; i < 16; i++) { - if (!inR.eof()) { inR >> junk; } - else { break; } - } - - while(!inR.eof()){ - - if (m->control_pressed) { inR.close(); return processors; } - - input = m->getline(inR); - - if (input.length() != 0) { - - istringstream nameStream(input); - string sname = ""; nameStream >> sname; - - map::iterator it = firstSeqNamesReport.find(sname); - - if(it != firstSeqNamesReport.end()) { //this is the start of a new chunk - unsigned long long pos = inR.tellg(); - rfileFilePos.push_back(pos - input.length() - 1); - firstSeqNamesReport.erase(it); - } - } - - if (firstSeqNamesReport.size() == 0) { break; } - m->gobble(inR); - } - inR.close(); - - if (firstSeqNamesReport.size() != 0) { - for (map::iterator it = firstSeqNamesReport.begin(); it != firstSeqNamesReport.end(); it++) { - m->mothurOut(it->first + " is in your fasta file and not in your report file, aborting."); m->mothurOutEndLine(); - } - m->control_pressed = true; - return processors; - } - - //get last file position of qfile - FILE * rFile; - unsigned long long sizeR; - - //get num bytes in file - rFile = fopen (rfilename.c_str(),"rb"); - if (rFile==NULL) perror ("Error opening file"); - else{ - fseek (rFile, 0, SEEK_END); - sizeR=ftell (rFile); - fclose (rFile); + if(aligned){ + //seach for filePos of each first name in the rfile and save in rfileFilePos + string junk; + ifstream inR; + + m->openInputFile(rfilename, inR); + + //read column headers + for (int i = 0; i < 16; i++) { + if (!inR.eof()) { inR >> junk; } + else { break; } + } + + while(!inR.eof()){ + + input = m->getline(inR); + + if (input.length() != 0) { + + istringstream nameStream(input); + string sname = ""; nameStream >> sname; + + map::iterator it = firstSeqNamesReport.find(sname); + + if(it != firstSeqNamesReport.end()) { //this is the start of a new chunk + unsigned long long pos = inR.tellg(); + rfileFilePos.push_back(pos - input.length() - 1); + firstSeqNamesReport.erase(it); + } + } + + if (firstSeqNamesReport.size() == 0) { break; } + m->gobble(inR); + } + inR.close(); + + if (firstSeqNamesReport.size() != 0) { + for (map::iterator it = firstSeqNamesReport.begin(); it != firstSeqNamesReport.end(); it++) { + m->mothurOut(it->first + " is in your fasta file and not in your report file, aborting."); m->mothurOutEndLine(); + } + m->control_pressed = true; + return processors; + } + + //get last file position of qfile + FILE * rFile; + unsigned long long sizeR; + + //get num bytes in file + rFile = fopen (rfilename.c_str(),"rb"); + if (rFile==NULL) perror ("Error opening file"); + else{ + fseek (rFile, 0, SEEK_END); + sizeR=ftell (rFile); + fclose (rFile); + } + + rfileFilePos.push_back(sizeR); } - - rfileFilePos.push_back(sizeR); - return processors; #else @@ -1449,4 +1516,5 @@ int SeqErrorCommand::setLines(string filename, string qfilename, string rfilenam exit(1); } } + //*************************************************************************************************************** diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp index eb31e78..ed2b046 100644 --- a/trimseqscommand.cpp +++ b/trimseqscommand.cpp @@ -109,7 +109,7 @@ string TrimSeqsCommand::getOutputPattern(string type) { else if (type == "fasta") { pattern = "[filename],[tag],fasta"; } else if (type == "group") { pattern = "[filename],groups"; } else if (type == "name") { pattern = "[filename],[tag],names"; } - else if (type == "count") { pattern = "[filename],[tag],count_table"; } + else if (type == "count") { pattern = "[filename],[tag],count_table-[filename],count_table"; } else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; } return pattern; @@ -495,8 +495,8 @@ int TrimSeqsCommand::execute(){ map myvariables; myvariables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(it->first)); string thisGroupName = ""; - if (countfile == "") { thisGroupName = getOutputFileName("group",variables); outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName); } - else { thisGroupName = getOutputFileName("count",variables); outputNames.push_back(thisGroupName); outputTypes["count"].push_back(thisGroupName); } + if (countfile == "") { thisGroupName = getOutputFileName("group",myvariables); outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName); } + else { thisGroupName = getOutputFileName("count",myvariables); outputNames.push_back(thisGroupName); outputTypes["count"].push_back(thisGroupName); } m->openOutputFile(thisGroupName, out); if (countfile != "") { out << "Representative_Sequence\ttotal\t" << it->second << endl; } -- 2.39.2