*/
#include "refchimeratest.h"
+#include "myPerseus.h"
#include "mothur.h"
int MAXINT = numeric_limits<int>::max();
//***************************************************************************************************************
-RefChimeraTest::RefChimeraTest(vector<Sequence>& refs){
+RefChimeraTest::RefChimeraTest(vector<Sequence>& refs, bool aligned) : aligned(aligned){
m = MothurOut::getInstance();
referenceSeqs.resize(numRefSeqs);
referenceNames.resize(numRefSeqs);
for(int i=0;i<numRefSeqs;i++){
- referenceSeqs[i] = refs[i].getAligned();
+ if (aligned) { referenceSeqs[i] = refs[i].getAligned(); }
+ else { referenceSeqs[i] = refs[i].getUnaligned(); }
referenceNames[i] = refs[i].getName();
}
alignLength = referenceSeqs[0].length();
-
}
//***************************************************************************************************************
exit(1);
}
}
+
//***************************************************************************************************************
int RefChimeraTest::analyzeQuery(string queryName, string querySeq, ofstream& chimeraReportFile){
+
+ int numParents = -1;
+
+ if(aligned){
+ numParents = analyzeAlignedQuery(queryName, querySeq, chimeraReportFile);
+ }
+ else{
+ numParents = analyzeUnalignedQuery(queryName, querySeq, chimeraReportFile);
+ }
+
+ return numParents;
+
+}
+
+//***************************************************************************************************************
+
+int RefChimeraTest::analyzeAlignedQuery(string queryName, string querySeq, ofstream& chimeraReportFile){
vector<vector<int> > left(numRefSeqs);
vector<int> singleLeft, bestLeft;
}
right = left;
- int bestSequenceMismatch = getMismatches(querySeq, left, right, bestMatch);
+ int bestSequenceMismatch = getAlignedMismatches(querySeq, left, right, bestMatch);
+
+
int leftParentBi, rightParentBi, breakPointBi;
int minMismatchToChimera = getChimera(left, right, leftParentBi, rightParentBi, breakPointBi, singleLeft, bestLeft, singleRight, bestRight);
-// int minMismatchToTrimera = MAXINT;
-// int leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB;
-
int nMera = 0;
string chimeraRefSeq = "";
if(bestSequenceMismatch - minMismatchToChimera >= 3){// || (minMismatchToChimera == 0 && bestSequenceMismatch != 0)){
-
nMera = 2;
chimeraRefSeq = stitchBimera(leftParentBi, rightParentBi, breakPointBi);
-
-// minMismatchToTrimera = getTrimera(left, right, leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB, singleLeft, bestLeft, singleRight, bestRight);
-//
-// if(minMismatchToChimera - minMismatchToTrimera <= 3){
-// nMera = 2;
-// chimeraRefSeq = stitchBimera(leftParentBi, rightParentBi, breakPointBi);
-// }
-// else{
-// nMera = 3;
-// chimeraRefSeq = stitchTrimera(leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB);
-// }
-
}
else{
nMera = 1;
chimeraRefSeq = referenceSeqs[bestMatch];
}
-
- double distToChimera = calcDistToChimera(querySeq, chimeraRefSeq);
-
-// double loonIndex = calcLoonIndex(querySeq, referenceSeqs[leftParentBi], referenceSeqs[rightParentBi], breakPointBi, binMatrix);
+ bestRefAlignment = chimeraRefSeq;
+ bestQueryAlignment = querySeq;
+
+ double distToChimera = calcDistToChimera(bestQueryAlignment, bestRefAlignment);
chimeraReportFile << queryName << '\t' << referenceNames[bestMatch] << '\t' << bestSequenceMismatch << '\t';
chimeraReportFile << referenceNames[leftParentBi] << ',' << referenceNames[rightParentBi] << '\t' << breakPointBi << '\t';
chimeraReportFile << minMismatchToChimera << '\t';
-
-// if(nMera == 1){
-// chimeraReportFile << "NA" << '\t' << "NA" << '\t' << "NA";
-// }
-// else{
-// chimeraReportFile << referenceNames[leftParentTri] << ',' << referenceNames[middleParentTri] << ',' << referenceNames[rightParentTri] << '\t' << breakPointTriA << ',' << breakPointTriB << '\t' << minMismatchToTrimera;
-// }
-
- chimeraReportFile << '\t' << distToChimera << '\t' << nMera << endl;
+ chimeraReportFile << '\t' << distToChimera << '\t' << nMera << endl;
return nMera;
}
+//***************************************************************************************************************
+
+int RefChimeraTest::analyzeUnalignedQuery(string queryName, string querySeq, ofstream& chimeraReportFile){
+
+ int nMera = 0;
+
+ int seqLength = querySeq.length();
+
+ vector<string> queryAlign(numRefSeqs);
+ vector<string> refAlign(numRefSeqs);
+
+ vector<vector<int> > leftDiffs(numRefSeqs);
+ vector<vector<int> > rightDiffs(numRefSeqs);
+ vector<vector<int> > leftMaps(numRefSeqs);
+ vector<vector<int> > rightMaps(numRefSeqs);
+
+ int bestRefIndex = -1;
+ int bestRefDiffs = numeric_limits<int>::max();
+ double bestRefLength = 0;
+
+ for(int i=0;i<numRefSeqs;i++){
+ double length = 0;
+ double diffs = alignQueryToReferences(querySeq, referenceSeqs[i], queryAlign[i], refAlign[i], length);
+ if(diffs < bestRefDiffs){
+ bestRefDiffs = diffs;
+ bestRefLength = length;
+ bestRefIndex = i;
+ }
+ }
+
+ if(bestRefDiffs >= 3){
+ for(int i=0;i<numRefSeqs;i++){
+ leftDiffs[i].assign(seqLength, 0);
+ rightDiffs[i].assign(seqLength, 0);
+ leftMaps[i].assign(seqLength, 0);
+ rightMaps[i].assign(seqLength, 0);
+
+ getUnalignedDiffs(queryAlign[i], refAlign[i], leftDiffs[i], leftMaps[i], rightDiffs[i], rightMaps[i]);
+ }
+
+ vector<int> singleLeft(seqLength, numeric_limits<int>::max());
+ vector<int> bestLeft(seqLength, -1);
+
+ for(int l=0;l<seqLength;l++){
+
+ for(int i=0;i<numRefSeqs;i++){
+ if(leftDiffs[i][l] < singleLeft[l]){
+ singleLeft[l] = leftDiffs[i][l];
+ bestLeft[l] = i;
+ }
+ }
+ }
+
+ vector<int> singleRight(seqLength, numeric_limits<int>::max());
+ vector<int> bestRight(seqLength, -1);
+
+ for(int l=0;l<seqLength;l++){
+
+ for(int i=0;i<numRefSeqs;i++){
+ if(rightDiffs[i][l] < singleRight[l]){
+ singleRight[l] = rightDiffs[i][l];
+ bestRight[l] = i;
+ }
+ }
+ }
+
+ int bestChimeraMismatches = numeric_limits<int>::max();
+ int leftParent = 0;
+ int rightParent = 0;
+ int breakPoint = 0;
+
+ for(int l=0;l<seqLength-1;l++){
+
+ int chimera = singleLeft[l] + singleRight[seqLength - l - 2];
+ if(chimera < bestChimeraMismatches){
+ bestChimeraMismatches = chimera;
+ breakPoint = l;
+ leftParent = bestLeft[l];
+ rightParent = bestRight[seqLength - l - 2];
+ }
+ }
+
+ string reference;
+
+ if(bestRefDiffs - bestChimeraMismatches >= 3){// || (minMismatchToChimera == 0 && bestSequenceMismatch != 0)){
+ nMera = 2;
+
+ int breakLeft = leftMaps[leftParent][breakPoint];
+ int breakRight = rightMaps[rightParent][rightMaps[rightParent].size() - breakPoint - 2];
+
+ string left = refAlign[leftParent];
+ string right = refAlign[rightParent];
+
+ for(int i=0;i<=breakLeft;i++){
+
+ if (m->control_pressed) { return 0; }
+
+ if(left[i] != '-' && left[i] != '.'){
+ reference += left[i];
+ }
+ }
+
+
+ for(int i=breakRight;i<right.length();i++){
+
+ if (m->control_pressed) { return 0; }
+
+ if(right[i] != '-' && right[i] != '.'){
+ reference += right[i];
+ }
+ }
+
+ }
+ else{
+ nMera = 1;
+ reference = referenceSeqs[bestRefIndex];
+ }
+
+ double alignLength;
+ double finalDiffs = alignQueryToReferences(querySeq, reference, bestQueryAlignment, bestRefAlignment, alignLength);
+ double finalDistance = finalDiffs / alignLength;
+
+ chimeraReportFile << queryName << '\t' << referenceNames[bestRefIndex] << '\t' << bestRefDiffs << '\t';
+ chimeraReportFile << referenceNames[leftParent] << ',' << referenceNames[rightParent] << '\t' << breakPoint << '\t';
+ chimeraReportFile << bestChimeraMismatches << '\t';
+ chimeraReportFile << '\t' << finalDistance << '\t' << nMera << endl;
+ }
+ else{
+ bestQueryAlignment = queryAlign[bestRefIndex];
+ bestRefAlignment = refAlign[bestRefIndex];
+ nMera = 1;
+
+ chimeraReportFile << queryName << '\t' << referenceNames[bestRefIndex] << '\t' << bestRefDiffs << '\t';
+ chimeraReportFile << "NA\tNA\tNA\tNA\t1" << endl;
+ }
+
+ bestMatch = bestRefIndex;
+ return nMera;
+}
+
/**************************************************************************************************/
-int RefChimeraTest::getMismatches(string& querySeq, vector<vector<int> >& left, vector<vector<int> >& right, int& bestRefSeq){
+double RefChimeraTest::alignQueryToReferences(string query, string reference, string& qAlign, string& rAlign, double& length){
+
+
+ try {
+ double GAP = -5;
+ double MATCH = 1;
+ double MISMATCH = -1;
+
+ int queryLength = query.length();
+ int refLength = reference.length();
+
+ vector<vector<double> > alignMatrix(queryLength + 1);
+ vector<vector<char> > alignMoves(queryLength + 1);
+
+ for(int i=0;i<=queryLength;i++){
+ if (m->control_pressed) { return 0; }
+ alignMatrix[i].resize(refLength + 1, 0);
+ alignMoves[i].resize(refLength + 1, 'x');
+ }
+
+ for(int i=0;i<=queryLength;i++){
+ if (m->control_pressed) { return 0; }
+ alignMatrix[i][0] = 0;//GAP * i;
+ alignMoves[i][0] = 'u';
+ }
+
+ for(int i=0;i<=refLength;i++){
+ if (m->control_pressed) { return 0; }
+ alignMatrix[0][i] = 0;//GAP * i;
+ alignMoves[0][i] = 'l';
+ }
+
+ for(int i=1;i<=queryLength;i++){
+
+ if (m->control_pressed) { return 0; }
+
+ for(int j=1;j<=refLength;j++){
+
+ double nogapScore;
+ if(query[i-1] == reference[j-1]){ nogapScore = alignMatrix[i-1][j-1] + MATCH; }
+ else { nogapScore = alignMatrix[i-1][j-1] + MISMATCH; }
+
+ double leftScore;
+ if(i == queryLength) { leftScore = alignMatrix[i][j-1]; }
+ else { leftScore = alignMatrix[i][j-1] + GAP; }
+
+
+ double upScore;
+ if(j == refLength) { upScore = alignMatrix[i-1][j]; }
+ else { upScore = alignMatrix[i-1][j] + GAP; }
+
+ if(nogapScore > leftScore){
+ if(nogapScore > upScore){
+ alignMoves[i][j] = 'd';
+ alignMatrix[i][j] = nogapScore;
+ }
+ else{
+ alignMoves[i][j] = 'u';
+ alignMatrix[i][j] = upScore;
+ }
+ }
+ else{
+ if(leftScore > upScore){
+ alignMoves[i][j] = 'l';
+ alignMatrix[i][j] = leftScore;
+ }
+ else{
+ alignMoves[i][j] = 'u';
+ alignMatrix[i][j] = upScore;
+ }
+ }
+ }
+ }
+
+ int end = refLength - 1;
+ int maxRow = 0;
+ double maxRowValue = -2147483647;
+ for(int i=0;i<queryLength;i++){
+ if(alignMatrix[i][end] > maxRowValue){
+ maxRow = i;
+ maxRowValue = alignMatrix[i][end];
+ }
+ }
+
+ end = queryLength - 1;
+ int maxColumn = 0;
+ double maxColumnValue = -2147483647;
+
+ for(int j=0;j<refLength;j++){
+ if(alignMatrix[end][j] > maxColumnValue){
+ maxColumn = j;
+ maxColumnValue = alignMatrix[end][j];
+ }
+ }
+
+ int row = queryLength-1;
+ int column = refLength-1;
+
+ if(maxColumn == column && maxRow == row){} // if the max values are the lower right corner, then we're good
+ else if(alignMatrix[row][maxColumn] < alignMatrix[maxRow][column]){
+ for(int i=maxRow+1;i<queryLength;i++){ // decide whether sequence A or B needs the gaps at the end either set
+ alignMoves[i][column] = 'u';// the pointer upwards or...
+ }
+
+ }
+ else {
+ for(int i=maxColumn+1;i<refLength;i++){
+ alignMoves[row][i] = 'l'; // ...to the left
+ }
+ }
+
+ int i = queryLength;
+ int j = refLength;
+
+
+ qAlign = "";
+ rAlign = "";
+
+ int diffs = 0;
+ length = 0;
+
+ while(i > 0 && j > 0){
+
+ if (m->control_pressed) { return 0; }
+
+ if(alignMoves[i][j] == 'd'){
+ qAlign = query[i-1] + qAlign;
+ rAlign = reference[j-1] + rAlign;
+
+ if(query[i-1] != reference[j-1]){ diffs++; }
+ length++;
+
+ i--;
+ j--;
+ }
+ else if(alignMoves[i][j] == 'u'){
+ qAlign = query[i-1] + qAlign;
+
+ if(j != refLength) { rAlign = '-' + rAlign; diffs++; length++; }
+ else { rAlign = '.' + rAlign; }
+ i--;
+ }
+ else if(alignMoves[i][j] == 'l'){
+ rAlign = reference[j-1] + rAlign;
+
+ if(i != queryLength){ qAlign = '-' + qAlign; diffs++; length++; }
+ else { qAlign = '.' + qAlign; }
+ j--;
+ }
+ }
+
+ if(i>0){
+ qAlign = query.substr(0, i) + qAlign;
+ rAlign = string(i, '.') + rAlign;
+ }
+ else if(j>0){
+ qAlign = string(j, '.') + qAlign;
+ rAlign = reference.substr(0, j) + rAlign;
+ }
+
+
+ return diffs;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "RefChimeraTest", "alignQueryToReferences");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+int RefChimeraTest::getUnalignedDiffs(string qAlign, string rAlign, vector<int>& leftDiffs, vector<int>& leftMap, vector<int>& rightDiffs, vector<int>& rightMap){
+ try {
+ int alignLength = qAlign.length();
+
+ int lDiffs = 0;
+ int lCount = 0;
+ for(int l=0;l<alignLength;l++){
+
+ if (m->control_pressed) { return 0; }
+
+ if(qAlign[l] == '-'){
+ lDiffs++;
+ }
+ else if(qAlign[l] != '.'){
+
+ if(rAlign[l] == '-'){
+ lDiffs++;
+ }
+ else if(qAlign[l] != rAlign[l]){;// && rAlign[l] != '.'){
+ lDiffs++;
+ }
+ leftDiffs[lCount] = lDiffs;
+ leftMap[lCount] = l;
+
+ lCount++;
+ }
+ }
+
+ int rDiffs = 0;
+ int rCount = 0;
+ for(int l=alignLength-1;l>=0;l--){
+
+ if (m->control_pressed) { return 0; }
+
+ if(qAlign[l] == '-'){
+ rDiffs++;
+ }
+ else if(qAlign[l] != '.'){
+
+ if(rAlign[l] == '-'){
+ rDiffs++;
+ }
+ else if(qAlign[l] != rAlign[l]){;// && rAlign[l] != '.'){
+ rDiffs++;
+ }
+
+ rightDiffs[rCount] = rDiffs;
+ rightMap[rCount] = l;
+ rCount++;
+ }
+
+ }
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "RefChimeraTest", "getUnalignedDiffs");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+int RefChimeraTest::getAlignedMismatches(string& querySeq, vector<vector<int> >& left, vector<vector<int> >& right, int& bestRefSeq){
int bestSequenceMismatch = MAXINT;
int lDiffs = 0;
for(int l=0;l<alignLength;l++){
-// if(querySeq[l] != '.' && querySeq[l] != referenceSeqs[i][l]){
-
if(querySeq[l] != '.' && referenceSeqs[i][l] != '.' && querySeq[l] != referenceSeqs[i][l] && referenceSeqs[i][l] != 'N'){
lDiffs++;
}
int rDiffs = 0;
int index = 0;
for(int l=alignLength-1;l>=0;l--){
-// if(querySeq[l] != '.' && querySeq[l] != referenceSeqs[i][l]){
if(querySeq[l] != '.' && referenceSeqs[i][l] != '.' && querySeq[l] != referenceSeqs[i][l] && referenceSeqs[i][l] != 'N'){
rDiffs++;
}
}
//***************************************************************************************************************
+
+string RefChimeraTest::getQueryAlignment(){
+
+ return bestQueryAlignment;
+
+}
+
+//***************************************************************************************************************
+
+string RefChimeraTest::getClosestRefAlignment(){
+
+ return bestRefAlignment;
+
+}
+
+//***************************************************************************************************************