X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=blobdiff_plain;f=refchimeratest.cpp;h=c084bffd40c94ef9c14a43d05de39e768dc96bea;hp=4d4a658fe471ba9d0970fbb7fa57151d93c53807;hb=615301e57c25e241356a9c2380648d117709458d;hpb=0bcfddf7bc721a334bdae42d86a580019303537d diff --git a/refchimeratest.cpp b/refchimeratest.cpp index 4d4a658..c084bff 100644 --- a/refchimeratest.cpp +++ b/refchimeratest.cpp @@ -8,36 +8,62 @@ */ #include "refchimeratest.h" +#include "myPerseus.h" #include "mothur.h" int MAXINT = numeric_limits::max(); //*************************************************************************************************************** -RefChimeraTest::RefChimeraTest(vector& refs, string chimeraReportFileName){ +RefChimeraTest::RefChimeraTest(vector& refs, bool aligned) : aligned(aligned){ m = MothurOut::getInstance(); - m->openOutputFile(chimeraReportFileName, chimeraReportFile); numRefSeqs = refs.size(); referenceSeqs.resize(numRefSeqs); referenceNames.resize(numRefSeqs); for(int i=0;ierrorOut(e, "RefChimeraTest", "execute"); + exit(1); + } } //*************************************************************************************************************** -int RefChimeraTest::analyzeQuery(string queryName, string querySeq){ +int RefChimeraTest::analyzeQuery(string queryName, string querySeq, ofstream& chimeraReportFile){ + + int numParents = -1; + + if(aligned){ + numParents = analyzeAlignedQuery(queryName, querySeq, chimeraReportFile); + } + else{ + numParents = analyzeUnalignedQuery(queryName, querySeq, chimeraReportFile); + } + + return numParents; + +} + +//*************************************************************************************************************** + +int RefChimeraTest::analyzeAlignedQuery(string queryName, string querySeq, ofstream& chimeraReportFile){ vector > left(numRefSeqs); vector singleLeft, bestLeft; @@ -49,70 +75,424 @@ int RefChimeraTest::analyzeQuery(string queryName, string querySeq){ } right = left; - int bestSequenceMismatch = getMismatches(querySeq, left, right, bestMatch); + int bestSequenceMismatch = getAlignedMismatches(querySeq, left, right, bestMatch); + + int leftParentBi, rightParentBi, breakPointBi; int minMismatchToChimera = getChimera(left, right, leftParentBi, rightParentBi, breakPointBi, singleLeft, bestLeft, singleRight, bestRight); -// int minMismatchToTrimera = MAXINT; -// int leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB; - int nMera = 0; string chimeraRefSeq = ""; - if(bestSequenceMismatch - minMismatchToChimera <= 3){ - nMera = 1; - chimeraRefSeq = referenceSeqs[bestMatch]; - } - else { - + if(bestSequenceMismatch - minMismatchToChimera >= 3){// || (minMismatchToChimera == 0 && bestSequenceMismatch != 0)){ nMera = 2; chimeraRefSeq = stitchBimera(leftParentBi, rightParentBi, breakPointBi); - -// minMismatchToTrimera = getTrimera(left, right, leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB, singleLeft, bestLeft, singleRight, bestRight); -// -// if(minMismatchToChimera - minMismatchToTrimera <= 3){ -// nMera = 2; -// chimeraRefSeq = stitchBimera(leftParentBi, rightParentBi, breakPointBi); -// } -// else{ -// nMera = 3; -// chimeraRefSeq = stitchTrimera(leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB); -// } - } - double distToChimera = calcDistToChimera(querySeq, chimeraRefSeq); + else{ + nMera = 1; + chimeraRefSeq = referenceSeqs[bestMatch]; + } -// double loonIndex = calcLoonIndex(querySeq, referenceSeqs[leftParentBi], referenceSeqs[rightParentBi], breakPointBi, binMatrix); + bestRefAlignment = chimeraRefSeq; + bestQueryAlignment = querySeq; + + double distToChimera = calcDistToChimera(bestQueryAlignment, bestRefAlignment); chimeraReportFile << queryName << '\t' << referenceNames[bestMatch] << '\t' << bestSequenceMismatch << '\t'; chimeraReportFile << referenceNames[leftParentBi] << ',' << referenceNames[rightParentBi] << '\t' << breakPointBi << '\t'; chimeraReportFile << minMismatchToChimera << '\t'; - -// if(nMera == 1){ -// chimeraReportFile << "NA" << '\t' << "NA" << '\t' << "NA"; -// } -// else{ -// chimeraReportFile << referenceNames[leftParentTri] << ',' << referenceNames[middleParentTri] << ',' << referenceNames[rightParentTri] << '\t' << breakPointTriA << ',' << breakPointTriB << '\t' << minMismatchToTrimera; -// } - - chimeraReportFile << '\t' << distToChimera << '\t' << nMera << endl; + chimeraReportFile << '\t' << distToChimera << '\t' << nMera << endl; return nMera; } +//*************************************************************************************************************** + +int RefChimeraTest::analyzeUnalignedQuery(string queryName, string querySeq, ofstream& chimeraReportFile){ + + int nMera = 0; + + int seqLength = querySeq.length(); + + vector queryAlign(numRefSeqs); + vector refAlign(numRefSeqs); + + vector > leftDiffs(numRefSeqs); + vector > rightDiffs(numRefSeqs); + vector > leftMaps(numRefSeqs); + vector > rightMaps(numRefSeqs); + + int bestRefIndex = -1; + int bestRefDiffs = numeric_limits::max(); + double bestRefLength = 0; + + for(int i=0;i= 3){ + for(int i=0;i singleLeft(seqLength, numeric_limits::max()); + vector bestLeft(seqLength, -1); + + for(int l=0;l singleRight(seqLength, numeric_limits::max()); + vector bestRight(seqLength, -1); + + for(int l=0;l::max(); + int leftParent = 0; + int rightParent = 0; + int breakPoint = 0; + + for(int l=0;l= 3){// || (minMismatchToChimera == 0 && bestSequenceMismatch != 0)){ + nMera = 2; + + int breakLeft = leftMaps[leftParent][breakPoint]; + int breakRight = rightMaps[rightParent][rightMaps[rightParent].size() - breakPoint - 2]; + + string left = refAlign[leftParent]; + string right = refAlign[rightParent]; + + for(int i=0;i<=breakLeft;i++){ + + if (m->control_pressed) { return 0; } + + if(left[i] != '-' && left[i] != '.'){ + reference += left[i]; + } + } + + + for(int i=breakRight;icontrol_pressed) { return 0; } + + if(right[i] != '-' && right[i] != '.'){ + reference += right[i]; + } + } + + } + else{ + nMera = 1; + reference = referenceSeqs[bestRefIndex]; + } + + double alignLength; + double finalDiffs = alignQueryToReferences(querySeq, reference, bestQueryAlignment, bestRefAlignment, alignLength); + double finalDistance = finalDiffs / alignLength; + + chimeraReportFile << queryName << '\t' << referenceNames[bestRefIndex] << '\t' << bestRefDiffs << '\t'; + chimeraReportFile << referenceNames[leftParent] << ',' << referenceNames[rightParent] << '\t' << breakPoint << '\t'; + chimeraReportFile << bestChimeraMismatches << '\t'; + chimeraReportFile << '\t' << finalDistance << '\t' << nMera << endl; + } + else{ + bestQueryAlignment = queryAlign[bestRefIndex]; + bestRefAlignment = refAlign[bestRefIndex]; + nMera = 1; + + chimeraReportFile << queryName << '\t' << referenceNames[bestRefIndex] << '\t' << bestRefDiffs << '\t'; + chimeraReportFile << "NA\tNA\tNA\tNA\t1" << endl; + } + + bestMatch = bestRefIndex; + return nMera; +} + /**************************************************************************************************/ -int RefChimeraTest::getMismatches(string& querySeq, vector >& left, vector >& right, int& bestRefSeq){ +double RefChimeraTest::alignQueryToReferences(string query, string reference, string& qAlign, string& rAlign, double& length){ + + + try { + double GAP = -5; + double MATCH = 1; + double MISMATCH = -1; + + int queryLength = query.length(); + int refLength = reference.length(); + + vector > alignMatrix(queryLength + 1); + vector > alignMoves(queryLength + 1); + + for(int i=0;i<=queryLength;i++){ + if (m->control_pressed) { return 0; } + alignMatrix[i].resize(refLength + 1, 0); + alignMoves[i].resize(refLength + 1, 'x'); + } + + for(int i=0;i<=queryLength;i++){ + if (m->control_pressed) { return 0; } + alignMatrix[i][0] = 0;//GAP * i; + alignMoves[i][0] = 'u'; + } + + for(int i=0;i<=refLength;i++){ + if (m->control_pressed) { return 0; } + alignMatrix[0][i] = 0;//GAP * i; + alignMoves[0][i] = 'l'; + } + + for(int i=1;i<=queryLength;i++){ + + if (m->control_pressed) { return 0; } + + for(int j=1;j<=refLength;j++){ + + double nogapScore; + if(query[i-1] == reference[j-1]){ nogapScore = alignMatrix[i-1][j-1] + MATCH; } + else { nogapScore = alignMatrix[i-1][j-1] + MISMATCH; } + + double leftScore; + if(i == queryLength) { leftScore = alignMatrix[i][j-1]; } + else { leftScore = alignMatrix[i][j-1] + GAP; } + + + double upScore; + if(j == refLength) { upScore = alignMatrix[i-1][j]; } + else { upScore = alignMatrix[i-1][j] + GAP; } + + if(nogapScore > leftScore){ + if(nogapScore > upScore){ + alignMoves[i][j] = 'd'; + alignMatrix[i][j] = nogapScore; + } + else{ + alignMoves[i][j] = 'u'; + alignMatrix[i][j] = upScore; + } + } + else{ + if(leftScore > upScore){ + alignMoves[i][j] = 'l'; + alignMatrix[i][j] = leftScore; + } + else{ + alignMoves[i][j] = 'u'; + alignMatrix[i][j] = upScore; + } + } + } + } + + int end = refLength - 1; + int maxRow = 0; + double maxRowValue = -2147483647; + for(int i=0;i maxRowValue){ + maxRow = i; + maxRowValue = alignMatrix[i][end]; + } + } + + end = queryLength - 1; + int maxColumn = 0; + double maxColumnValue = -2147483647; + + for(int j=0;j maxColumnValue){ + maxColumn = j; + maxColumnValue = alignMatrix[end][j]; + } + } + + int row = queryLength-1; + int column = refLength-1; + + if(maxColumn == column && maxRow == row){} // if the max values are the lower right corner, then we're good + else if(alignMatrix[row][maxColumn] < alignMatrix[maxRow][column]){ + for(int i=maxRow+1;i 0 && j > 0){ + + if (m->control_pressed) { return 0; } + + if(alignMoves[i][j] == 'd'){ + qAlign = query[i-1] + qAlign; + rAlign = reference[j-1] + rAlign; + + if(query[i-1] != reference[j-1]){ diffs++; } + length++; + + i--; + j--; + } + else if(alignMoves[i][j] == 'u'){ + qAlign = query[i-1] + qAlign; + + if(j != refLength) { rAlign = '-' + rAlign; diffs++; length++; } + else { rAlign = '.' + rAlign; } + i--; + } + else if(alignMoves[i][j] == 'l'){ + rAlign = reference[j-1] + rAlign; + + if(i != queryLength){ qAlign = '-' + qAlign; diffs++; length++; } + else { qAlign = '.' + qAlign; } + j--; + } + } + + if(i>0){ + qAlign = query.substr(0, i) + qAlign; + rAlign = string(i, '.') + rAlign; + } + else if(j>0){ + qAlign = string(j, '.') + qAlign; + rAlign = reference.substr(0, j) + rAlign; + } + + + return diffs; + } + catch(exception& e) { + m->errorOut(e, "RefChimeraTest", "alignQueryToReferences"); + exit(1); + } +} + +/**************************************************************************************************/ + +int RefChimeraTest::getUnalignedDiffs(string qAlign, string rAlign, vector& leftDiffs, vector& leftMap, vector& rightDiffs, vector& rightMap){ + try { + int alignLength = qAlign.length(); + + int lDiffs = 0; + int lCount = 0; + for(int l=0;lcontrol_pressed) { return 0; } + + if(qAlign[l] == '-'){ + lDiffs++; + } + else if(qAlign[l] != '.'){ + + if(rAlign[l] == '-'){ + lDiffs++; + } + else if(qAlign[l] != rAlign[l]){;// && rAlign[l] != '.'){ + lDiffs++; + } + leftDiffs[lCount] = lDiffs; + leftMap[lCount] = l; + + lCount++; + } + } + + int rDiffs = 0; + int rCount = 0; + for(int l=alignLength-1;l>=0;l--){ + + if (m->control_pressed) { return 0; } + + if(qAlign[l] == '-'){ + rDiffs++; + } + else if(qAlign[l] != '.'){ + + if(rAlign[l] == '-'){ + rDiffs++; + } + else if(qAlign[l] != rAlign[l]){;// && rAlign[l] != '.'){ + rDiffs++; + } + + rightDiffs[rCount] = rDiffs; + rightMap[rCount] = l; + rCount++; + } + + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "RefChimeraTest", "getUnalignedDiffs"); + exit(1); + } +} + +/**************************************************************************************************/ + +int RefChimeraTest::getAlignedMismatches(string& querySeq, vector >& left, vector >& right, int& bestRefSeq){ int bestSequenceMismatch = MAXINT; for(int i=0;i >& left, int rDiffs = 0; int index = 0; for(int l=alignLength-1;l>=0;l--){ -// if(querySeq[l] != '.' && querySeq[l] != referenceSeqs[i][l]){ - if(querySeq[l] != '.' && referenceSeqs[i][l] != '.' && querySeq[l] != referenceSeqs[i][l]){ + if(querySeq[l] != '.' && referenceSeqs[i][l] != '.' && querySeq[l] != referenceSeqs[i][l] && referenceSeqs[i][l] != 'N'){ rDiffs++; } right[i][index++] = rDiffs; } + if(lDiffs < bestSequenceMismatch){ bestSequenceMismatch = lDiffs; bestRefSeq = i; @@ -257,8 +637,8 @@ double RefChimeraTest::calcDistToChimera(string& querySeq, string& chimeraRefSeq for(int i=0;i