From: Pat Schloss Date: Thu, 3 Jan 2013 14:24:17 +0000 (-0500) Subject: Merge remote-tracking branch 'mothur/master' X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=23f4ffb61f28ec1e6b837633f77f7c6a81c46352;hp=0bd3a2d33b478f0b09fd6b8ce562e9ab41227535 Merge remote-tracking branch 'mothur/master' --- diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index 0c8c2dd..c5b5b5b 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -9,6 +9,7 @@ /* Begin PBXBuildFile section */ 219C1DE01552C4BD004209F9 /* newcommandtemplate.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 219C1DDF1552C4BD004209F9 /* newcommandtemplate.cpp */; }; 219C1DE41559BCCF004209F9 /* getcoremicrobiomecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 219C1DE31559BCCD004209F9 /* getcoremicrobiomecommand.cpp */; }; + 7E16B6F51695CB5C00C0DC4F /* main.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 7E16B6F41695CB5C00C0DC4F /* main.cpp */; }; 7E6BE10A12F710D8007ADDBE /* refchimeratest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 7E6BE10912F710D8007ADDBE /* refchimeratest.cpp */; }; 834D9D581656D7C400E7FAB9 /* regularizedrandomforest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 834D9D561656D7C400E7FAB9 /* regularizedrandomforest.cpp */; }; 834D9D5C1656DEC800E7FAB9 /* regularizeddecisiontree.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 834D9D5A1656DEC700E7FAB9 /* regularizeddecisiontree.cpp */; }; @@ -385,6 +386,7 @@ 219C1DE11552C508004209F9 /* newcommandtemplate.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = newcommandtemplate.h; sourceTree = ""; }; 219C1DE31559BCCD004209F9 /* getcoremicrobiomecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getcoremicrobiomecommand.cpp; sourceTree = ""; }; 219C1DE51559BCF2004209F9 /* getcoremicrobiomecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getcoremicrobiomecommand.h; sourceTree = ""; }; + 7E16B6F41695CB5C00C0DC4F /* main.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = main.cpp; path = pdsMakeContigs/pdsMakeContigs/main.cpp; sourceTree = ""; }; 7E6BE10812F710D8007ADDBE /* refchimeratest.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = refchimeratest.h; sourceTree = ""; }; 7E6BE10912F710D8007ADDBE /* refchimeratest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = refchimeratest.cpp; sourceTree = ""; }; 7E78911B135F3E8600E725D2 /* eachgapdistignorens.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = eachgapdistignorens.h; sourceTree = ""; }; @@ -1187,6 +1189,7 @@ A7E9B7AD12D37EC400DA6239 /* rarefactioncurvedata.h */, 7E6BE10812F710D8007ADDBE /* refchimeratest.h */, 7E6BE10912F710D8007ADDBE /* refchimeratest.cpp */, + 7E16B6F41695CB5C00C0DC4F /* main.cpp */, A77410F514697C300098E6AC /* seqnoise.h */, A77410F414697C300098E6AC /* seqnoise.cpp */, A7E9BA5312D39A5E00DA6239 /* read */, @@ -2304,6 +2307,7 @@ 834D9D581656D7C400E7FAB9 /* regularizedrandomforest.cpp in Sources */, 834D9D5C1656DEC800E7FAB9 /* regularizeddecisiontree.cpp in Sources */, A7496D2E167B531B00CC7D7C /* kruskalwalliscommand.cpp in Sources */, + 7E16B6F51695CB5C00C0DC4F /* main.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/aligncommand.cpp b/aligncommand.cpp index 9d4a609..a871244 100644 --- a/aligncommand.cpp +++ b/aligncommand.cpp @@ -27,8 +27,8 @@ vector AlignCommand::setParameters(){ CommandParameter pmatch("match", "Number", "", "1.0", "", "", "","",false,false); parameters.push_back(pmatch); CommandParameter palign("align", "Multiple", "needleman-gotoh-blast-noalign", "needleman", "", "", "","",false,false,true); parameters.push_back(palign); CommandParameter pmismatch("mismatch", "Number", "", "-1.0", "", "", "","",false,false); parameters.push_back(pmismatch); - CommandParameter pgapopen("gapopen", "Number", "", "-2.0", "", "", "","",false,false); parameters.push_back(pgapopen); - CommandParameter pgapextend("gapextend", "Number", "", "-1.0", "", "", "","",false,false); parameters.push_back(pgapextend); + CommandParameter pgapopen("gapopen", "Number", "", "-5.0", "", "", "","",false,false); parameters.push_back(pgapopen); + CommandParameter pgapextend("gapextend", "Number", "", "-2.0", "", "", "","",false,false); parameters.push_back(pgapextend); CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors); CommandParameter pflip("flip", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pflip); CommandParameter psave("save", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(psave); @@ -57,8 +57,8 @@ string AlignCommand::getHelpString(){ helpString += "The ksize parameter allows you to specify the kmer size for finding most similar template to candidate. The default is 8."; helpString += "The match parameter allows you to specify the bonus for having the same base. The default is 1.0."; helpString += "The mistmatch parameter allows you to specify the penalty for having different bases. The default is -1.0."; - helpString += "The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -2.0."; - helpString += "The gapextend parameter allows you to specify the penalty for extending a gap in an alignment. The default is -1.0."; + helpString += "The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -5.0."; + helpString += "The gapextend parameter allows you to specify the penalty for extending a gap in an alignment. The default is -2.0."; helpString += "The flip parameter is used to specify whether or not you want mothur to try the reverse complement if a sequence falls below the threshold. The default is false."; helpString += "The threshold is used to specify a cutoff at which an alignment is deemed 'bad' and the reverse complement may be tried. The default threshold is 0.50, meaning 50% of the bases are removed in the alignment."; helpString += "If the flip parameter is set to true the reverse complement of the sequence is aligned and the better alignment is reported."; @@ -249,10 +249,10 @@ AlignCommand::AlignCommand(string option) { temp = validParameter.validFile(parameters, "mismatch", false); if (temp == "not found"){ temp = "-1.0"; } m->mothurConvert(temp, misMatch); - temp = validParameter.validFile(parameters, "gapopen", false); if (temp == "not found"){ temp = "-2.0"; } + temp = validParameter.validFile(parameters, "gapopen", false); if (temp == "not found"){ temp = "-5.0"; } m->mothurConvert(temp, gapOpen); - temp = validParameter.validFile(parameters, "gapextend", false); if (temp == "not found"){ temp = "-1.0"; } + temp = validParameter.validFile(parameters, "gapextend", false); if (temp == "not found"){ temp = "-2.0"; } m->mothurConvert(temp, gapExtend); temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } diff --git a/chimeraperseuscommand.cpp b/chimeraperseuscommand.cpp index e1ad14e..1e5f9d4 100644 --- a/chimeraperseuscommand.cpp +++ b/chimeraperseuscommand.cpp @@ -911,8 +911,8 @@ int ChimeraPerseusCommand::driver(string chimeraFileName, vector& seque type = "chimera"; chimeraRefSeq = myPerseus.stitchBimera(alignments, leftParentBi, rightParentBi, breakPointBi, leftMaps, rightMaps); } - ; - if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; } + + if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; } double chimeraDist = myPerseus.modeledPairwiseAlignSeqs(sequences[i].sequence, chimeraRefSeq, dummyA, dummyB, correctModel); diff --git a/pdsMakeContigs/pdsMakeContigs/main.cpp b/pdsMakeContigs/pdsMakeContigs/main.cpp new file mode 100644 index 0000000..f467955 --- /dev/null +++ b/pdsMakeContigs/pdsMakeContigs/main.cpp @@ -0,0 +1,18 @@ +// +// main.cpp +// pdsMakeContigs +// +// Created by Patrick Schloss on 1/2/13. +// Copyright (c) 2013 University of Michigan. All rights reserved. +// + +#include + +int main (int argc, const char * argv[]) +{ + + // insert code here... + std::cout << "Hello, World!\n"; + return 0; +} + diff --git a/qualityscores.cpp b/qualityscores.cpp index 566b44c..0b7bd06 100644 --- a/qualityscores.cpp +++ b/qualityscores.cpp @@ -423,13 +423,13 @@ void QualityScores::updateReverseMap(vector >& reverseMap, int start try { int index = 0; - for(int i=stop-1;i>=start;i--){ + for(int i=stop-1;i>=start-1;i--){ reverseMap[index++][qScores[i]] += weight; } } catch(exception& e) { - m->errorOut(e, "QualityScores", "updateForwardMap"); + m->errorOut(e, "QualityScores", "updateReverseMap"); exit(1); } } diff --git a/qualityscores.h b/qualityscores.h index 49034b8..77c3ee0 100644 --- a/qualityscores.h +++ b/qualityscores.h @@ -24,7 +24,7 @@ public: QualityScores(); QualityScores(ifstream&); string getName(); - + int getLength(){ return (int)qScores.size(); } vector getQualityScores() { return qScores; } void printQScores(ofstream&); void trimQScores(int, int); diff --git a/refchimeratest.cpp b/refchimeratest.cpp index 2f397e8..3685ed1 100644 --- a/refchimeratest.cpp +++ b/refchimeratest.cpp @@ -8,13 +8,14 @@ */ #include "refchimeratest.h" +#include "myPerseus.h" #include "mothur.h" int MAXINT = numeric_limits::max(); //*************************************************************************************************************** -RefChimeraTest::RefChimeraTest(vector& refs){ +RefChimeraTest::RefChimeraTest(vector& refs, bool aligned) : aligned(aligned){ m = MothurOut::getInstance(); @@ -29,7 +30,6 @@ RefChimeraTest::RefChimeraTest(vector& refs){ alignLength = referenceSeqs[0].length(); - } //*************************************************************************************************************** @@ -42,9 +42,27 @@ int RefChimeraTest::printHeader(ofstream& chimeraReportFile){ exit(1); } } + //*************************************************************************************************************** int RefChimeraTest::analyzeQuery(string queryName, string querySeq, ofstream& chimeraReportFile){ + + int numParents = -1; + + if(aligned){ + numParents = analyzeAlignedQuery(queryName, querySeq, chimeraReportFile); + } + else{ + numParents = analyzeUnalignedQuery(queryName, querySeq, chimeraReportFile); + } + + return numParents; + +} + +//*************************************************************************************************************** + +int RefChimeraTest::analyzeAlignedQuery(string queryName, string querySeq, ofstream& chimeraReportFile){ vector > left(numRefSeqs); vector singleLeft, bestLeft; @@ -56,63 +74,415 @@ int RefChimeraTest::analyzeQuery(string queryName, string querySeq, ofstream& ch } right = left; - int bestSequenceMismatch = getMismatches(querySeq, left, right, bestMatch); + int bestSequenceMismatch = getAlignedMismatches(querySeq, left, right, bestMatch); + + int leftParentBi, rightParentBi, breakPointBi; int minMismatchToChimera = getChimera(left, right, leftParentBi, rightParentBi, breakPointBi, singleLeft, bestLeft, singleRight, bestRight); -// int minMismatchToTrimera = MAXINT; -// int leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB; - int nMera = 0; string chimeraRefSeq = ""; if(bestSequenceMismatch - minMismatchToChimera >= 3){// || (minMismatchToChimera == 0 && bestSequenceMismatch != 0)){ - nMera = 2; chimeraRefSeq = stitchBimera(leftParentBi, rightParentBi, breakPointBi); - -// minMismatchToTrimera = getTrimera(left, right, leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB, singleLeft, bestLeft, singleRight, bestRight); -// -// if(minMismatchToChimera - minMismatchToTrimera <= 3){ -// nMera = 2; -// chimeraRefSeq = stitchBimera(leftParentBi, rightParentBi, breakPointBi); -// } -// else{ -// nMera = 3; -// chimeraRefSeq = stitchTrimera(leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB); -// } - } else{ nMera = 1; chimeraRefSeq = referenceSeqs[bestMatch]; } - - double distToChimera = calcDistToChimera(querySeq, chimeraRefSeq); - -// double loonIndex = calcLoonIndex(querySeq, referenceSeqs[leftParentBi], referenceSeqs[rightParentBi], breakPointBi, binMatrix); + bestRefAlignment = chimeraRefSeq; + bestQueryAlignment = querySeq; + + double distToChimera = calcDistToChimera(bestQueryAlignment, bestRefAlignment); chimeraReportFile << queryName << '\t' << referenceNames[bestMatch] << '\t' << bestSequenceMismatch << '\t'; chimeraReportFile << referenceNames[leftParentBi] << ',' << referenceNames[rightParentBi] << '\t' << breakPointBi << '\t'; chimeraReportFile << minMismatchToChimera << '\t'; - -// if(nMera == 1){ -// chimeraReportFile << "NA" << '\t' << "NA" << '\t' << "NA"; -// } -// else{ -// chimeraReportFile << referenceNames[leftParentTri] << ',' << referenceNames[middleParentTri] << ',' << referenceNames[rightParentTri] << '\t' << breakPointTriA << ',' << breakPointTriB << '\t' << minMismatchToTrimera; -// } - - chimeraReportFile << '\t' << distToChimera << '\t' << nMera << endl; + chimeraReportFile << '\t' << distToChimera << '\t' << nMera << endl; return nMera; } +//*************************************************************************************************************** + +int RefChimeraTest::analyzeUnalignedQuery(string queryName, string querySeq, ofstream& chimeraReportFile){ + + int nMera = 0; + + int seqLength = querySeq.length(); + + vector queryAlign(numRefSeqs); + vector refAlign(numRefSeqs); + + vector > leftDiffs(numRefSeqs, 0); + vector > rightDiffs(numRefSeqs, 0); + vector > leftMaps(numRefSeqs, 0); + vector > rightMaps(numRefSeqs, 0); + + int bestRefIndex = -1; + int bestRefDiffs = numeric_limits::max(); + double bestRefLength = 0; + + for(int i=0;i= 3){ + for(int i=0;i singleLeft(seqLength, numeric_limits::max()); + vector bestLeft(seqLength, -1); + + for(int l=0;l singleRight(seqLength, numeric_limits::max()); + vector bestRight(seqLength, -1); + + for(int l=0;l::max(); + int leftParent = 0; + int rightParent = 0; + int breakPoint = 0; + + for(int l=0;l= 3){// || (minMismatchToChimera == 0 && bestSequenceMismatch != 0)){ + nMera = 2; + + int breakLeft = leftMaps[leftParent][breakPoint]; + int breakRight = rightMaps[rightParent][rightMaps[rightParent].size() - breakPoint - 2]; + + string left = refAlign[leftParent]; + string right = refAlign[rightParent]; + + for(int i=0;i<=breakLeft;i++){ + + if (m->control_pressed) { return 0; } + + if(left[i] != '-' && left[i] != '.'){ + reference += left[i]; + } + } + + + for(int i=breakRight;icontrol_pressed) { return 0; } + + if(right[i] != '-' && right[i] != '.'){ + reference += right[i]; + } + } + + } + else{ + nMera = 1; + reference = referenceSeqs[bestRefIndex]; + } + + double alignLength; + double finalDiffs = alignQueryToReferences(querySeq, reference, bestQueryAlignment, bestRefAlignment, alignLength); + double finalDistance = finalDiffs / alignLength; + + chimeraReportFile << queryName << '\t' << referenceNames[bestRefIndex] << '\t' << bestRefDiffs << '\t'; + chimeraReportFile << referenceNames[leftParent] << ',' << referenceNames[rightParent] << '\t' << breakPoint << '\t'; + chimeraReportFile << bestChimeraMismatches << '\t'; + chimeraReportFile << '\t' << finalDistance << '\t' << nMera << endl; + } + else{ + bestQueryAlignment = queryAlign[bestRefIndex]; + bestRefAlignment = refAlign[bestRefIndex]; + nMera = 1; + + chimeraReportFile << queryName << '\t' << referenceNames[bestRefIndex] << '\t' << bestRefDiffs << '\t'; + chimeraReportFile << "NA\tNA\tNA\tNA\t1" << endl; + } + + bestMatch = bestRefIndex; + return nMera; +} + /**************************************************************************************************/ -int RefChimeraTest::getMismatches(string& querySeq, vector >& left, vector >& right, int& bestRefSeq){ +double RefChimeraTest::alignQueryToReferences(string query, string reference, string& qAlign, string& rAlign, double& length){ + + + try { + double GAP = -5; + double MATCH = 1; + double MISMATCH = -1; + + int queryLength = query.length(); + int refLength = reference.length(); + + vector > alignMatrix(queryLength + 1); + vector > alignMoves(queryLength + 1); + + for(int i=0;i<=queryLength;i++){ + if (m->control_pressed) { return 0; } + alignMatrix[i].resize(refLength + 1, 0); + alignMoves[i].resize(refLength + 1, 'x'); + } + + for(int i=0;i<=queryLength;i++){ + if (m->control_pressed) { return 0; } + alignMatrix[i][0] = 0;//GAP * i; + alignMoves[i][0] = 'u'; + } + + for(int i=0;i<=refLength;i++){ + if (m->control_pressed) { return 0; } + alignMatrix[0][i] = 0;//GAP * i; + alignMoves[0][i] = 'l'; + } + + for(int i=1;i<=queryLength;i++){ + + if (m->control_pressed) { return 0; } + + for(int j=1;j<=refLength;j++){ + + double nogapScore; + if(query[i-1] == reference[j-1]){ nogapScore = alignMatrix[i-1][j-1] + MATCH; } + else { nogapScore = alignMatrix[i-1][j-1] + MISMATCH; } + + double leftScore; + if(i == queryLength) { leftScore = alignMatrix[i][j-1]; } + else { leftScore = alignMatrix[i][j-1] + GAP; } + + + double upScore; + if(j == refLength) { upScore = alignMatrix[i-1][j]; } + else { upScore = alignMatrix[i-1][j] + GAP; } + + if(nogapScore > leftScore){ + if(nogapScore > upScore){ + alignMoves[i][j] = 'd'; + alignMatrix[i][j] = nogapScore; + } + else{ + alignMoves[i][j] = 'u'; + alignMatrix[i][j] = upScore; + } + } + else{ + if(leftScore > upScore){ + alignMoves[i][j] = 'l'; + alignMatrix[i][j] = leftScore; + } + else{ + alignMoves[i][j] = 'u'; + alignMatrix[i][j] = upScore; + } + } + } + } + + int end = refLength - 1; + int maxRow = 0; + double maxRowValue = -100000000000; + for(int i=0;i maxRowValue){ + maxRow = i; + maxRowValue = alignMatrix[i][end]; + } + } + + end = queryLength - 1; + int maxColumn = 0; + double maxColumnValue = -100000000000; + + for(int j=0;j maxColumnValue){ + maxColumn = j; + maxColumnValue = alignMatrix[end][j]; + } + } + + int row = queryLength-1; + int column = refLength-1; + + if(maxColumn == column && maxRow == row){} // if the max values are the lower right corner, then we're good + else if(alignMatrix[row][maxColumn] < alignMatrix[maxRow][column]){ + for(int i=maxRow+1;i 0 && j > 0){ + + if (m->control_pressed) { return 0; } + + if(alignMoves[i][j] == 'd'){ + qAlign = query[i-1] + qAlign; + rAlign = reference[j-1] + rAlign; + + if(query[i-1] != reference[j-1]){ diffs++; } + length++; + + i--; + j--; + } + else if(alignMoves[i][j] == 'u'){ + qAlign = query[i-1] + qAlign; + + if(j != refLength) { rAlign = '-' + rAlign; diffs++; length++; } + else { rAlign = '.' + rAlign; } + i--; + } + else if(alignMoves[i][j] == 'l'){ + rAlign = reference[j-1] + rAlign; + + if(i != queryLength){ qAlign = '-' + qAlign; diffs++; length++; } + else { qAlign = '.' + qAlign; } + j--; + } + } + + if(i>0){ + qAlign = query.substr(0, i) + qAlign; + rAlign = string(i, '.') + rAlign; + } + else if(j>0){ + qAlign = string(j, '.') + qAlign; + rAlign = reference.substr(0, j) + rAlign; + } + + + return diffs; + } + catch(exception& e) { + m->errorOut(e, "RefChimeraTest", "alignQueryToReferences"); + exit(1); + } +} + +/**************************************************************************************************/ + +int RefChimeraTest::getUnalignedDiffs(string qAlign, string rAlign, vector& leftDiffs, vector& leftMap, vector& rightDiffs, vector& rightMap){ + try { + int alignLength = qAlign.length(); + + int lDiffs = 0; + int lCount = 0; + for(int l=0;lcontrol_pressed) { return 0; } + + if(qAlign[l] == '-'){ + lDiffs++; + } + else if(qAlign[l] != '.'){ + + if(rAlign[l] == '-'){ + lDiffs++; + } + else if(qAlign[l] != rAlign[l]){;// && rAlign[l] != '.'){ + lDiffs++; + } + leftDiffs[lCount] = lDiffs; + leftMap[lCount] = l; + + lCount++; + } + } + + int rDiffs = 0; + int rCount = 0; + for(int l=alignLength-1;l>=0;l--){ + + if (m->control_pressed) { return 0; } + + if(qAlign[l] == '-'){ + rDiffs++; + } + else if(qAlign[l] != '.'){ + + if(rAlign[l] == '-'){ + rDiffs++; + } + else if(qAlign[l] != rAlign[l]){;// && rAlign[l] != '.'){ + rDiffs++; + } + + rightDiffs[rCount] = rDiffs; + rightMap[rCount] = l; + rCount++; + } + + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "RefChimeraTest", "getUnalignedDiffs"); + exit(1); + } +} + +/**************************************************************************************************/ + +int RefChimeraTest::getAlignedMismatches(string& querySeq, vector >& left, vector >& right, int& bestRefSeq){ int bestSequenceMismatch = MAXINT; @@ -121,8 +491,6 @@ int RefChimeraTest::getMismatches(string& querySeq, vector >& left, int lDiffs = 0; for(int l=0;l >& left, int rDiffs = 0; int index = 0; for(int l=alignLength-1;l>=0;l--){ -// if(querySeq[l] != '.' && querySeq[l] != referenceSeqs[i][l]){ if(querySeq[l] != '.' && referenceSeqs[i][l] != '.' && querySeq[l] != referenceSeqs[i][l] && referenceSeqs[i][l] != 'N'){ rDiffs++; } @@ -292,3 +659,19 @@ int RefChimeraTest::getClosestRefIndex(){ } //*************************************************************************************************************** + +string RefChimeraTest::getQueryAlignment(){ + + return bestQueryAlignment; + +} + +//*************************************************************************************************************** + +string RefChimeraTest::getClosestRefAlignment(){ + + return bestRefAlignment; + +} + +//*************************************************************************************************************** diff --git a/refchimeratest.h b/refchimeratest.h index d4983e0..5ea0801 100644 --- a/refchimeratest.h +++ b/refchimeratest.h @@ -16,13 +16,22 @@ class RefChimeraTest { public: - RefChimeraTest(vector&); + RefChimeraTest(){}; + RefChimeraTest(vector&, bool); int printHeader(ofstream&); - int analyzeQuery(string, string, ofstream&); + int analyzeQuery(string, string, ofstream&); int getClosestRefIndex(); + string getClosestRefAlignment(); + string getQueryAlignment(); + private: - int getMismatches(string&, vector >&, vector >&, int&); - int getChimera(vector >&, vector >&, int&, int&, int&, vector&, vector&, vector&, vector&); + int getAlignedMismatches(string&, vector >&, vector >&, int&); + int analyzeAlignedQuery(string, string, ofstream&); + int analyzeUnalignedQuery(string, string, ofstream&); + double alignQueryToReferences(string, string, string&, string&, double&); + int getUnalignedDiffs(string, string, vector&, vector&, vector&, vector&); + + int getChimera(vector >&, vector >&, int&, int&, int&, vector&, vector&, vector&, vector&); int getTrimera(vector >&, vector >&, int&, int&, int&, int&, int&, vector&, vector&, vector&, vector&); string stitchBimera(int, int, int); string stitchTrimera(int, int, int, int, int); @@ -33,8 +42,11 @@ private: int numRefSeqs; int alignLength; int bestMatch; + string bestRefAlignment; + string bestQueryAlignment; //ofstream chimeraReportFile; - + bool aligned; + MothurOut* m; }; diff --git a/seqerrorcommand.cpp b/seqerrorcommand.cpp index 87d4524..1fe60e8 100644 --- a/seqerrorcommand.cpp +++ b/seqerrorcommand.cpp @@ -11,10 +11,12 @@ #include "reportfile.h" #include "qualityscores.h" #include "refchimeratest.h" +#include "myPerseus.h" #include "filterseqscommand.h" //********************************************************************************************************************** + vector SeqErrorCommand::setParameters(){ try { CommandParameter pquery("fasta", "InputTypes", "", "", "none", "none", "none","errorType",false,true,true); parameters.push_back(pquery); @@ -24,6 +26,7 @@ vector SeqErrorCommand::setParameters(){ CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none","",false,false); parameters.push_back(pname); CommandParameter pignorechimeras("ignorechimeras", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pignorechimeras); CommandParameter pthreshold("threshold", "Number", "", "1.0", "", "", "","",false,false); parameters.push_back(pthreshold); + CommandParameter paligned("aligned", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(paligned); CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors); CommandParameter psave("save", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(psave); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); @@ -38,7 +41,9 @@ vector SeqErrorCommand::setParameters(){ exit(1); } } + //********************************************************************************************************************** + string SeqErrorCommand::getHelpString(){ try { string helpString = ""; @@ -62,7 +67,9 @@ string SeqErrorCommand::getHelpString(){ exit(1); } } + //********************************************************************************************************************** + string SeqErrorCommand::getOutputPattern(string type) { try { string pattern = ""; @@ -87,7 +94,9 @@ string SeqErrorCommand::getOutputPattern(string type) { exit(1); } } + //********************************************************************************************************************** + SeqErrorCommand::SeqErrorCommand(){ try { abort = true; calledHelp = true; @@ -110,6 +119,7 @@ SeqErrorCommand::SeqErrorCommand(){ exit(1); } } + //*************************************************************************************************************** SeqErrorCommand::SeqErrorCommand(string option) { @@ -227,12 +237,6 @@ SeqErrorCommand::SeqErrorCommand(string option) { if(reportFileName == "not found"){ reportFileName = ""; } else if (reportFileName == "not open") { reportFileName = ""; abort = true; } - if((reportFileName != "" && qualFileName == "") || (reportFileName == "" && qualFileName != "")){ - m->mothurOut("if you use either a qual file or a report file, you have to have both."); - m->mothurOutEndLine(); - abort = true; - } - outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; @@ -243,6 +247,7 @@ SeqErrorCommand::SeqErrorCommand(string option) { // ...at some point should added some additional type checking... temp = validParameter.validFile(parameters, "threshold", false); if (temp == "not found") { temp = "1.00"; } m->mothurConvert(temp, threshold); + temp = validParameter.validFile(parameters, "save", false); if (temp == "not found"){ temp = "f"; } save = m->isTrue(temp); @@ -269,6 +274,9 @@ SeqErrorCommand::SeqErrorCommand(string option) { temp = validParameter.validFile(parameters, "ignorechimeras", false); if (temp == "not found") { temp = "T"; } ignoreChimeras = m->isTrue(temp); + temp = validParameter.validFile(parameters, "aligned", false); if (temp == "not found"){ temp = "t"; } + aligned = m->isTrue(temp); + temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } m->setProcessors(temp); m->mothurConvert(temp, processors); @@ -280,6 +288,22 @@ SeqErrorCommand::SeqErrorCommand(string option) { vector files; files.push_back(queryFileName); parser.getNameFile(files); } + + if(aligned == true){ + if((reportFileName != "" && qualFileName == "") || (reportFileName == "" && qualFileName != "")){ + m->mothurOut("if you use either a qual file or a report file, you have to have both."); + m->mothurOutEndLine(); + abort = true; + } + } + else{ + if(reportFileName != ""){ + m->mothurOut("we are ignoring the report file if your sequences are not aligned. we will check that the sequences in your fasta and and qual fileare the same length."); + m->mothurOutEndLine(); + } + } + + } } catch(exception& e) { @@ -287,6 +311,7 @@ SeqErrorCommand::SeqErrorCommand(string option) { exit(1); } } + //*************************************************************************************************************** int SeqErrorCommand::execute(){ @@ -294,7 +319,7 @@ int SeqErrorCommand::execute(){ if (abort == true) { if (calledHelp) { return 0; } return 2; } int start = time(NULL); - maxLength = 2000; + maxLength = 5000; totalBases = 0; totalMatches = 0; @@ -325,22 +350,23 @@ int SeqErrorCommand::execute(){ for (int i = 0; i < (fastaFilePos.size()-1); i++) { lines.push_back(linePair(fastaFilePos[i], fastaFilePos[(i+1)])); if (qualFileName != "") { qLines.push_back(linePair(qFilePos[i], qFilePos[(i+1)])); } - if (reportFileName != "") { rLines.push_back(linePair(reportFilePos[i], reportFilePos[(i+1)])); } + if (reportFileName != "" && aligned == true) { rLines.push_back(linePair(reportFilePos[i], reportFilePos[(i+1)])); } } if(qualFileName == "") { qLines = lines; rLines = lines; } //fills with duds - + if(aligned == false){ rLines = lines; } int numSeqs = 0; #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) if(processors == 1){ numSeqs = driver(queryFileName, qualFileName, reportFileName, errorSummaryFileName, errorSeqFileName, errorChimeraFileName, lines[0], qLines[0], rLines[0]); + }else{ numSeqs = createProcesses(queryFileName, qualFileName, reportFileName, errorSummaryFileName, errorSeqFileName, errorChimeraFileName); } #else numSeqs = driver(queryFileName, qualFileName, reportFileName, errorSummaryFileName, errorSeqFileName, errorChimeraFileName, lines[0], qLines[0], rLines[0]); #endif - - if(qualFileName != "" && reportFileName != ""){ + + if(qualFileName != ""){ printErrorQuality(qScoreErrorMap); printQualityFR(qualForwardMap, qualReverseMap); } @@ -362,19 +388,21 @@ int SeqErrorCommand::execute(){ } errorCountFile.close(); - if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } +// if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } printSubMatrix(); - + string megAlignmentFileName = getOutputFileName("errorref-query",variables); ofstream megAlignmentFile; m->openOutputFile(megAlignmentFileName, megAlignmentFile); outputNames.push_back(megAlignmentFileName); outputTypes["errorref-query"].push_back(megAlignmentFileName); - + for(int i=0;imothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences."); m->mothurOutEndLine(); @@ -391,7 +419,9 @@ int SeqErrorCommand::execute(){ exit(1); } } + //********************************************************************************************************************** + int SeqErrorCommand::createProcesses(string filename, string qFileName, string rFileName, string summaryFileName, string errorOutputFileName, string chimeraOutputFileName) { try { int process = 1; @@ -629,7 +659,9 @@ int SeqErrorCommand::createProcesses(string filename, string qFileName, string r exit(1); } } + //********************************************************************************************************************** + int SeqErrorCommand::driver(string filename, string qFileName, string rFileName, string summaryFileName, string errorOutputFileName, string chimeraOutputFileName, linePair line, linePair qline, linePair rline) { try { @@ -661,11 +693,12 @@ int SeqErrorCommand::driver(string filename, string qFileName, string rFileName, //open inputfiles and go to beginning place for this processor ifstream queryFile; m->openInputFile(filename, queryFile); + queryFile.seekg(line.start); ifstream reportFile; ifstream qualFile; - if(qFileName != "" && rFileName != ""){ + if((qFileName != "" && rFileName != "" && aligned)){ m->openInputFile(qFileName, qualFile); qualFile.seekg(qline.start); @@ -683,11 +716,28 @@ int SeqErrorCommand::driver(string filename, string qFileName, string rFileName, qualReverseMap[i].assign(41,0); } } - + else if(qFileName != "" && !aligned){ + + m->openInputFile(qFileName, qualFile); + qualFile.seekg(qline.start); + + qualForwardMap.resize(maxLength); + qualReverseMap.resize(maxLength); + for(int i=0;iopenOutputFile(chimeraOutputFileName, outChimeraReport); - RefChimeraTest chimeraTest(referenceSeqs); - if (line.start == 0) { chimeraTest.printHeader(outChimeraReport); } + + + RefChimeraTest chimeraTest; + + chimeraTest = RefChimeraTest(referenceSeqs, aligned); + if (line.start == 0) { chimeraTest.printHeader(outChimeraReport); } + ofstream errorSummaryFile; m->openOutputFile(summaryFileName, errorSummaryFile); @@ -696,26 +746,34 @@ int SeqErrorCommand::driver(string filename, string qFileName, string rFileName, ofstream errorSeqFile; m->openOutputFile(errorOutputFileName, errorSeqFile); - megaAlignVector.resize(numRefs, ""); + megaAlignVector.assign(numRefs, ""); int index = 0; bool ignoreSeq = 0; bool moreSeqs = 1; while (moreSeqs) { - - if (m->control_pressed) { queryFile.close(); if(qFileName != "" && rFileName != ""){ reportFile.close(); qualFile.close(); } outChimeraReport.close(); errorSummaryFile.close();errorSeqFile.close(); return 0; } - + Sequence query(queryFile); - - int numParentSeqs = chimeraTest.analyzeQuery(query.getName(), query.getAligned(), outChimeraReport); - int closestRefIndex = chimeraTest.getClosestRefIndex(); - + Sequence reference; + int numParentSeqs = -1; + int closestRefIndex = -1; + + numParentSeqs = chimeraTest.analyzeQuery(query.getName(), query.getAligned(), outChimeraReport); + + closestRefIndex = chimeraTest.getClosestRefIndex(); + + reference = referenceSeqs[closestRefIndex]; + + reference.setAligned(chimeraTest.getClosestRefAlignment()); + query.setAligned(chimeraTest.getQueryAlignment()); + if(numParentSeqs > 1 && ignoreChimeras == 1) { ignoreSeq = 1; } - else { ignoreSeq = 0; } - + else { ignoreSeq = 0; } + Compare minCompare; - getErrors(query, referenceSeqs[closestRefIndex], minCompare); + + getErrors(query, reference, minCompare); if(namesFileName != ""){ it = weights.find(query.getName()); @@ -723,21 +781,20 @@ int SeqErrorCommand::driver(string filename, string qFileName, string rFileName, } else{ minCompare.weight = 1; } + printErrorData(minCompare, numParentSeqs, errorSummaryFile, errorSeqFile); if(!ignoreSeq){ - for(int i=0;i maxMismatch){ @@ -782,12 +851,15 @@ int SeqErrorCommand::driver(string filename, string qFileName, string rFileName, if(index % 100 == 0){ m->mothurOut(toString(index)); m->mothurOutEndLine(); } } queryFile.close(); - if(qFileName != "" && rFileName != ""){ reportFile.close(); qualFile.close(); } - errorSummaryFile.close(); + outChimeraReport.close(); + errorSummaryFile.close(); errorSeqFile.close(); - + + if(qFileName != "" && rFileName != "") { reportFile.close(); qualFile.close(); } + else if(qFileName != "" && aligned == false){ qualFile.close(); } + //report progress - if(index % 100 != 0){ m->mothurOut(toString(index)); m->mothurOutEndLine(); } + m->mothurOut(toString(index)); m->mothurOutEndLine(); return index; } @@ -796,6 +868,7 @@ int SeqErrorCommand::driver(string filename, string qFileName, string rFileName, exit(1); } } + //*************************************************************************************************************** void SeqErrorCommand::getReferences(){ @@ -891,9 +964,10 @@ int SeqErrorCommand::getErrors(Sequence query, Sequence reference, Compare& erro int started = 0; //Compare errors; - + + errors.sequence = ""; for(int i=0;i SeqErrorCommand::getWeights(){ nameCountMap[seqName] = m->getNumNames(redundantSeqs); m->gobble(nameFile); } + + nameFile.close(); + return nameCountMap; } - //*************************************************************************************************************** void SeqErrorCommand::printErrorHeader(ofstream& errorSummaryFile){ @@ -1129,6 +1205,7 @@ void SeqErrorCommand::printSubMatrix(){ exit(1); } } + //*************************************************************************************************************** void SeqErrorCommand::printErrorFRFile(map > errorForward, map > errorReverse){ @@ -1197,12 +1274,11 @@ void SeqErrorCommand::printErrorQuality(map > qScoreErrorMap){ errorQualityFile.close(); } catch(exception& e) { - m->errorOut(e, "SeqErrorCommand", "printErrorFRFile"); + m->errorOut(e, "SeqErrorCommand", "printErrorQuality"); exit(1); } } - //*************************************************************************************************************** void SeqErrorCommand::printQualityFR(vector > qualForwardMap, vector > qualReverseMap){ @@ -1256,11 +1332,12 @@ void SeqErrorCommand::printQualityFR(vector > qualForwardMap, vector qualityReverseFile.close(); } catch(exception& e) { - m->errorOut(e, "SeqErrorCommand", "printErrorFRFile"); + m->errorOut(e, "SeqErrorCommand", "printQualityFR"); exit(1); } } + /**************************************************************************************************/ int SeqErrorCommand::setLines(string filename, string qfilename, string rfilename, vector& fastaFilePos, vector& qfileFilePos, vector& rfileFilePos) { @@ -1339,65 +1416,65 @@ int SeqErrorCommand::setLines(string filename, string qfilename, string rfilenam qfileFilePos.push_back(size); - //seach for filePos of each first name in the rfile and save in rfileFilePos - string junk; - ifstream inR; - m->openInputFile(rfilename, inR); - - //read column headers - for (int i = 0; i < 16; i++) { - if (!inR.eof()) { inR >> junk; } - else { break; } - } - - while(!inR.eof()){ - - if (m->control_pressed) { inR.close(); return processors; } - - input = m->getline(inR); - - if (input.length() != 0) { - - istringstream nameStream(input); - string sname = ""; nameStream >> sname; - - map::iterator it = firstSeqNamesReport.find(sname); - - if(it != firstSeqNamesReport.end()) { //this is the start of a new chunk - unsigned long long pos = inR.tellg(); - rfileFilePos.push_back(pos - input.length() - 1); - firstSeqNamesReport.erase(it); - } - } - - if (firstSeqNamesReport.size() == 0) { break; } - m->gobble(inR); - } - inR.close(); - - if (firstSeqNamesReport.size() != 0) { - for (map::iterator it = firstSeqNamesReport.begin(); it != firstSeqNamesReport.end(); it++) { - m->mothurOut(it->first + " is in your fasta file and not in your report file, aborting."); m->mothurOutEndLine(); - } - m->control_pressed = true; - return processors; - } - - //get last file position of qfile - FILE * rFile; - unsigned long long sizeR; - - //get num bytes in file - rFile = fopen (rfilename.c_str(),"rb"); - if (rFile==NULL) perror ("Error opening file"); - else{ - fseek (rFile, 0, SEEK_END); - sizeR=ftell (rFile); - fclose (rFile); + if(aligned){ + //seach for filePos of each first name in the rfile and save in rfileFilePos + string junk; + ifstream inR; + + m->openInputFile(rfilename, inR); + + //read column headers + for (int i = 0; i < 16; i++) { + if (!inR.eof()) { inR >> junk; } + else { break; } + } + + while(!inR.eof()){ + + input = m->getline(inR); + + if (input.length() != 0) { + + istringstream nameStream(input); + string sname = ""; nameStream >> sname; + + map::iterator it = firstSeqNamesReport.find(sname); + + if(it != firstSeqNamesReport.end()) { //this is the start of a new chunk + unsigned long long pos = inR.tellg(); + rfileFilePos.push_back(pos - input.length() - 1); + firstSeqNamesReport.erase(it); + } + } + + if (firstSeqNamesReport.size() == 0) { break; } + m->gobble(inR); + } + inR.close(); + + if (firstSeqNamesReport.size() != 0) { + for (map::iterator it = firstSeqNamesReport.begin(); it != firstSeqNamesReport.end(); it++) { + m->mothurOut(it->first + " is in your fasta file and not in your report file, aborting."); m->mothurOutEndLine(); + } + m->control_pressed = true; + return processors; + } + + //get last file position of qfile + FILE * rFile; + unsigned long long sizeR; + + //get num bytes in file + rFile = fopen (rfilename.c_str(),"rb"); + if (rFile==NULL) perror ("Error opening file"); + else{ + fseek (rFile, 0, SEEK_END); + sizeR=ftell (rFile); + fclose (rFile); + } + + rfileFilePos.push_back(sizeR); } - - rfileFilePos.push_back(sizeR); - return processors; #else @@ -1439,4 +1516,5 @@ int SeqErrorCommand::setLines(string filename, string qfilename, string rfilenam exit(1); } } + //*************************************************************************************************************** diff --git a/seqerrorcommand.h b/seqerrorcommand.h index 6c7af2c..9f400b3 100644 --- a/seqerrorcommand.h +++ b/seqerrorcommand.h @@ -92,7 +92,7 @@ private: string queryFileName, referenceFileName, qualFileName, reportFileName, namesFileName, outputDir; double threshold; - bool ignoreChimeras, save; + bool ignoreChimeras, save, aligned; int numRefs, processors; int maxLength, totalBases, totalMatches; //ofstream errorSummaryFile, errorSeqFile;