5 * Created by Pat Schloss on 1/31/11.
6 * Copyright 2011 Schloss Lab. All rights reserved.
10 #include "refchimeratest.h"
11 #include "myPerseus.h"
14 int MAXINT = numeric_limits<int>::max();
16 //***************************************************************************************************************
18 RefChimeraTest::RefChimeraTest(vector<Sequence>& refs, bool aligned) : aligned(aligned){
20 m = MothurOut::getInstance();
22 numRefSeqs = refs.size();
24 referenceSeqs.resize(numRefSeqs);
25 referenceNames.resize(numRefSeqs);
26 for(int i=0;i<numRefSeqs;i++){
27 if (aligned) { referenceSeqs[i] = refs[i].getAligned(); }
28 else { referenceSeqs[i] = refs[i].getUnaligned(); }
29 referenceNames[i] = refs[i].getName();
32 alignLength = referenceSeqs[0].length();
35 //***************************************************************************************************************
37 int RefChimeraTest::printHeader(ofstream& chimeraReportFile){
39 chimeraReportFile << "queryName\tbestRef\tbestSequenceMismatch\tleftParentChi,rightParentChi\tbreakPointChi\tminMismatchToChimera\tdistToBestMera\tnumParents" << endl;
41 }catch(exception& e) {
42 m->errorOut(e, "RefChimeraTest", "execute");
47 //***************************************************************************************************************
49 int RefChimeraTest::analyzeQuery(string queryName, string querySeq, ofstream& chimeraReportFile){
54 numParents = analyzeAlignedQuery(queryName, querySeq, chimeraReportFile);
57 numParents = analyzeUnalignedQuery(queryName, querySeq, chimeraReportFile);
64 //***************************************************************************************************************
66 int RefChimeraTest::analyzeAlignedQuery(string queryName, string querySeq, ofstream& chimeraReportFile){
68 vector<vector<int> > left(numRefSeqs);
69 vector<int> singleLeft, bestLeft;
70 vector<int> singleRight, bestRight;
72 vector<vector<int> > right(numRefSeqs);
73 for(int i=0;i<numRefSeqs;i++){
74 left[i].assign(alignLength, 0);
78 int bestSequenceMismatch = getAlignedMismatches(querySeq, left, right, bestMatch);
82 int leftParentBi, rightParentBi, breakPointBi;
83 int minMismatchToChimera = getChimera(left, right, leftParentBi, rightParentBi, breakPointBi, singleLeft, bestLeft, singleRight, bestRight);
86 string chimeraRefSeq = "";
88 if(bestSequenceMismatch - minMismatchToChimera >= 3){// || (minMismatchToChimera == 0 && bestSequenceMismatch != 0)){
90 chimeraRefSeq = stitchBimera(leftParentBi, rightParentBi, breakPointBi);
94 chimeraRefSeq = referenceSeqs[bestMatch];
97 bestRefAlignment = chimeraRefSeq;
98 bestQueryAlignment = querySeq;
100 double distToChimera = calcDistToChimera(bestQueryAlignment, bestRefAlignment);
102 chimeraReportFile << queryName << '\t' << referenceNames[bestMatch] << '\t' << bestSequenceMismatch << '\t';
103 chimeraReportFile << referenceNames[leftParentBi] << ',' << referenceNames[rightParentBi] << '\t' << breakPointBi << '\t';
104 chimeraReportFile << minMismatchToChimera << '\t';
105 chimeraReportFile << '\t' << distToChimera << '\t' << nMera << endl;
110 //***************************************************************************************************************
112 int RefChimeraTest::analyzeUnalignedQuery(string queryName, string querySeq, ofstream& chimeraReportFile){
116 int seqLength = querySeq.length();
118 vector<string> queryAlign(numRefSeqs);
119 vector<string> refAlign(numRefSeqs);
121 vector<vector<int> > leftDiffs(numRefSeqs);
122 vector<vector<int> > rightDiffs(numRefSeqs);
123 vector<vector<int> > leftMaps(numRefSeqs);
124 vector<vector<int> > rightMaps(numRefSeqs);
126 int bestRefIndex = -1;
127 int bestRefDiffs = numeric_limits<int>::max();
128 double bestRefLength = 0;
130 for(int i=0;i<numRefSeqs;i++){
132 double diffs = alignQueryToReferences(querySeq, referenceSeqs[i], queryAlign[i], refAlign[i], length);
133 if(diffs < bestRefDiffs){
134 bestRefDiffs = diffs;
135 bestRefLength = length;
140 if(bestRefDiffs >= 3){
141 for(int i=0;i<numRefSeqs;i++){
142 leftDiffs[i].assign(seqLength, 0);
143 rightDiffs[i].assign(seqLength, 0);
144 leftMaps[i].assign(seqLength, 0);
145 rightMaps[i].assign(seqLength, 0);
147 getUnalignedDiffs(queryAlign[i], refAlign[i], leftDiffs[i], leftMaps[i], rightDiffs[i], rightMaps[i]);
150 vector<int> singleLeft(seqLength, numeric_limits<int>::max());
151 vector<int> bestLeft(seqLength, -1);
153 for(int l=0;l<seqLength;l++){
155 for(int i=0;i<numRefSeqs;i++){
156 if(leftDiffs[i][l] < singleLeft[l]){
157 singleLeft[l] = leftDiffs[i][l];
163 vector<int> singleRight(seqLength, numeric_limits<int>::max());
164 vector<int> bestRight(seqLength, -1);
166 for(int l=0;l<seqLength;l++){
168 for(int i=0;i<numRefSeqs;i++){
169 if(rightDiffs[i][l] < singleRight[l]){
170 singleRight[l] = rightDiffs[i][l];
176 int bestChimeraMismatches = numeric_limits<int>::max();
181 for(int l=0;l<seqLength-1;l++){
183 int chimera = singleLeft[l] + singleRight[seqLength - l - 2];
184 if(chimera < bestChimeraMismatches){
185 bestChimeraMismatches = chimera;
187 leftParent = bestLeft[l];
188 rightParent = bestRight[seqLength - l - 2];
194 if(bestRefDiffs - bestChimeraMismatches >= 3){// || (minMismatchToChimera == 0 && bestSequenceMismatch != 0)){
197 int breakLeft = leftMaps[leftParent][breakPoint];
198 int breakRight = rightMaps[rightParent][rightMaps[rightParent].size() - breakPoint - 2];
200 string left = refAlign[leftParent];
201 string right = refAlign[rightParent];
203 for(int i=0;i<=breakLeft;i++){
205 if (m->control_pressed) { return 0; }
207 if(left[i] != '-' && left[i] != '.'){
208 reference += left[i];
213 for(int i=breakRight;i<right.length();i++){
215 if (m->control_pressed) { return 0; }
217 if(right[i] != '-' && right[i] != '.'){
218 reference += right[i];
225 reference = referenceSeqs[bestRefIndex];
229 double finalDiffs = alignQueryToReferences(querySeq, reference, bestQueryAlignment, bestRefAlignment, alignLength);
230 double finalDistance = finalDiffs / alignLength;
232 chimeraReportFile << queryName << '\t' << referenceNames[bestRefIndex] << '\t' << bestRefDiffs << '\t';
233 chimeraReportFile << referenceNames[leftParent] << ',' << referenceNames[rightParent] << '\t' << breakPoint << '\t';
234 chimeraReportFile << bestChimeraMismatches << '\t';
235 chimeraReportFile << '\t' << finalDistance << '\t' << nMera << endl;
238 bestQueryAlignment = queryAlign[bestRefIndex];
239 bestRefAlignment = refAlign[bestRefIndex];
242 chimeraReportFile << queryName << '\t' << referenceNames[bestRefIndex] << '\t' << bestRefDiffs << '\t';
243 chimeraReportFile << "NA\tNA\tNA\tNA\t1" << endl;
246 bestMatch = bestRefIndex;
250 /**************************************************************************************************/
252 double RefChimeraTest::alignQueryToReferences(string query, string reference, string& qAlign, string& rAlign, double& length){
258 double MISMATCH = -1;
260 int queryLength = query.length();
261 int refLength = reference.length();
263 vector<vector<double> > alignMatrix(queryLength + 1);
264 vector<vector<char> > alignMoves(queryLength + 1);
266 for(int i=0;i<=queryLength;i++){
267 if (m->control_pressed) { return 0; }
268 alignMatrix[i].resize(refLength + 1, 0);
269 alignMoves[i].resize(refLength + 1, 'x');
272 for(int i=0;i<=queryLength;i++){
273 if (m->control_pressed) { return 0; }
274 alignMatrix[i][0] = 0;//GAP * i;
275 alignMoves[i][0] = 'u';
278 for(int i=0;i<=refLength;i++){
279 if (m->control_pressed) { return 0; }
280 alignMatrix[0][i] = 0;//GAP * i;
281 alignMoves[0][i] = 'l';
284 for(int i=1;i<=queryLength;i++){
286 if (m->control_pressed) { return 0; }
288 for(int j=1;j<=refLength;j++){
291 if(query[i-1] == reference[j-1]){ nogapScore = alignMatrix[i-1][j-1] + MATCH; }
292 else { nogapScore = alignMatrix[i-1][j-1] + MISMATCH; }
295 if(i == queryLength) { leftScore = alignMatrix[i][j-1]; }
296 else { leftScore = alignMatrix[i][j-1] + GAP; }
300 if(j == refLength) { upScore = alignMatrix[i-1][j]; }
301 else { upScore = alignMatrix[i-1][j] + GAP; }
303 if(nogapScore > leftScore){
304 if(nogapScore > upScore){
305 alignMoves[i][j] = 'd';
306 alignMatrix[i][j] = nogapScore;
309 alignMoves[i][j] = 'u';
310 alignMatrix[i][j] = upScore;
314 if(leftScore > upScore){
315 alignMoves[i][j] = 'l';
316 alignMatrix[i][j] = leftScore;
319 alignMoves[i][j] = 'u';
320 alignMatrix[i][j] = upScore;
326 int end = refLength - 1;
328 double maxRowValue = -2147483647;
329 for(int i=0;i<queryLength;i++){
330 if(alignMatrix[i][end] > maxRowValue){
332 maxRowValue = alignMatrix[i][end];
336 end = queryLength - 1;
338 double maxColumnValue = -2147483647;
340 for(int j=0;j<refLength;j++){
341 if(alignMatrix[end][j] > maxColumnValue){
343 maxColumnValue = alignMatrix[end][j];
347 int row = queryLength-1;
348 int column = refLength-1;
350 if(maxColumn == column && maxRow == row){} // if the max values are the lower right corner, then we're good
351 else if(alignMatrix[row][maxColumn] < alignMatrix[maxRow][column]){
352 for(int i=maxRow+1;i<queryLength;i++){ // decide whether sequence A or B needs the gaps at the end either set
353 alignMoves[i][column] = 'u';// the pointer upwards or...
358 for(int i=maxColumn+1;i<refLength;i++){
359 alignMoves[row][i] = 'l'; // ...to the left
373 while(i > 0 && j > 0){
375 if (m->control_pressed) { return 0; }
377 if(alignMoves[i][j] == 'd'){
378 qAlign = query[i-1] + qAlign;
379 rAlign = reference[j-1] + rAlign;
381 if(query[i-1] != reference[j-1]){ diffs++; }
387 else if(alignMoves[i][j] == 'u'){
388 qAlign = query[i-1] + qAlign;
390 if(j != refLength) { rAlign = '-' + rAlign; diffs++; length++; }
391 else { rAlign = '.' + rAlign; }
394 else if(alignMoves[i][j] == 'l'){
395 rAlign = reference[j-1] + rAlign;
397 if(i != queryLength){ qAlign = '-' + qAlign; diffs++; length++; }
398 else { qAlign = '.' + qAlign; }
404 qAlign = query.substr(0, i) + qAlign;
405 rAlign = string(i, '.') + rAlign;
408 qAlign = string(j, '.') + qAlign;
409 rAlign = reference.substr(0, j) + rAlign;
415 catch(exception& e) {
416 m->errorOut(e, "RefChimeraTest", "alignQueryToReferences");
421 /**************************************************************************************************/
423 int RefChimeraTest::getUnalignedDiffs(string qAlign, string rAlign, vector<int>& leftDiffs, vector<int>& leftMap, vector<int>& rightDiffs, vector<int>& rightMap){
425 int alignLength = qAlign.length();
429 for(int l=0;l<alignLength;l++){
431 if (m->control_pressed) { return 0; }
433 if(qAlign[l] == '-'){
436 else if(qAlign[l] != '.'){
438 if(rAlign[l] == '-'){
441 else if(qAlign[l] != rAlign[l]){;// && rAlign[l] != '.'){
444 leftDiffs[lCount] = lDiffs;
453 for(int l=alignLength-1;l>=0;l--){
455 if (m->control_pressed) { return 0; }
457 if(qAlign[l] == '-'){
460 else if(qAlign[l] != '.'){
462 if(rAlign[l] == '-'){
465 else if(qAlign[l] != rAlign[l]){;// && rAlign[l] != '.'){
469 rightDiffs[rCount] = rDiffs;
470 rightMap[rCount] = l;
478 catch(exception& e) {
479 m->errorOut(e, "RefChimeraTest", "getUnalignedDiffs");
484 /**************************************************************************************************/
486 int RefChimeraTest::getAlignedMismatches(string& querySeq, vector<vector<int> >& left, vector<vector<int> >& right, int& bestRefSeq){
488 int bestSequenceMismatch = MAXINT;
490 for(int i=0;i<numRefSeqs;i++){
494 for(int l=0;l<alignLength;l++){
495 if(querySeq[l] != '.' && referenceSeqs[i][l] != '.' && querySeq[l] != referenceSeqs[i][l] && referenceSeqs[i][l] != 'N'){
503 for(int l=alignLength-1;l>=0;l--){
504 if(querySeq[l] != '.' && referenceSeqs[i][l] != '.' && querySeq[l] != referenceSeqs[i][l] && referenceSeqs[i][l] != 'N'){
507 right[i][index++] = rDiffs;
510 if(lDiffs < bestSequenceMismatch){
511 bestSequenceMismatch = lDiffs;
515 return bestSequenceMismatch;
518 /**************************************************************************************************/
520 int RefChimeraTest::getChimera(vector<vector<int> >& left, vector<vector<int> >& right, int& leftParent, int& rightParent, int& breakPoint, vector<int>& singleLeft, vector<int>& bestLeft, vector<int>& singleRight, vector<int>& bestRight){
522 singleLeft.resize(alignLength, MAXINT);
523 bestLeft.resize(alignLength, -1);
525 for(int l=0;l<alignLength;l++){
526 for(int i=0;i<numRefSeqs;i++){
527 if(left[i][l] <= singleLeft[l]){
528 singleLeft[l] = left[i][l];
534 singleRight.resize(alignLength, MAXINT);
535 bestRight.resize(alignLength, -1);
537 for(int l=0;l<alignLength;l++){
538 for(int i=0;i<numRefSeqs;i++){
539 if(right[i][l] <= singleRight[l]){
540 singleRight[l] = right[i][l];
546 int bestChimeraMismatches = MAXINT;
551 for(int l=0;l<alignLength;l++){
552 int chimera = singleLeft[l] + singleRight[alignLength - l - 1];
553 if(chimera < bestChimeraMismatches){
554 bestChimeraMismatches = chimera;
556 leftParent = bestLeft[l];
557 rightParent = bestRight[alignLength - l - 1];
561 return bestChimeraMismatches;
564 /**************************************************************************************************/
566 int RefChimeraTest::getTrimera(vector<vector<int> >& left, vector<vector<int> >& right, int& leftParent, int& middleParent, int& rightParent, int& breakPointA, int& breakPointB, vector<int>& singleLeft, vector<int>& bestLeft, vector<int>& singleRight, vector<int>& bestRight){
568 int bestTrimeraMismatches = MAXINT;
577 vector<vector<int> > minDelta(alignLength);
578 vector<vector<int> > minDeltaSeq(alignLength);
580 for(int i=0;i<alignLength;i++){
581 minDelta[i].assign(alignLength, MAXINT);
582 minDeltaSeq[i].assign(alignLength, -1);
585 for(int x=0;x<alignLength;x++){
586 for(int y=x;y<alignLength-1;y++){
588 for(int i=0;i<numRefSeqs;i++){
589 int delta = left[i][y] - left[i][x];
591 if(delta <= minDelta[x][y]){
592 minDelta[x][y] = delta;
593 minDeltaSeq[x][y] = i;
596 minDelta[x][y] += singleLeft[x] + singleRight[alignLength - y - 2];
598 if(minDelta[x][y] < bestTrimeraMismatches){
599 bestTrimeraMismatches = minDelta[x][y];
604 leftParent = bestLeft[x];
605 middleParent = minDeltaSeq[x][y];
606 rightParent = bestRight[alignLength - y - 2];
610 return bestTrimeraMismatches;
613 /**************************************************************************************************/
615 string RefChimeraTest::stitchBimera(int leftParent, int rightParent, int breakPoint){
617 string chimeraRefSeq = referenceSeqs[leftParent].substr(0, breakPoint) + referenceSeqs[rightParent].substr(breakPoint);
618 return chimeraRefSeq;
622 /**************************************************************************************************/
624 string RefChimeraTest::stitchTrimera(int leftParent, int middleParent, int rightParent, int breakPointA, int breakPointB){
626 string chimeraRefSeq = referenceSeqs[leftParent].substr(0, breakPointA) + referenceSeqs[middleParent].substr(breakPointA, breakPointB-breakPointA) + referenceSeqs[rightParent].substr(breakPointB);
628 return chimeraRefSeq;
631 /**************************************************************************************************/
633 double RefChimeraTest::calcDistToChimera(string& querySeq, string& chimeraRefSeq){
638 for(int i=0;i<alignLength;i++){
639 // if(querySeq[i] != '.' && chimeraRefSeq[i] != '.'){
640 if(chimeraRefSeq[i] != '.' && querySeq[i] != '.'){
641 if(querySeq[i] == '-' && chimeraRefSeq[i] == '-' && chimeraRefSeq[i] != 'N'){ /* do nothing */ }
642 else if(querySeq[i] == chimeraRefSeq[i]){
651 return (double)mismatch / (double)(mismatch + match);
654 //***************************************************************************************************************
656 int RefChimeraTest::getClosestRefIndex(){
662 //***************************************************************************************************************
664 string RefChimeraTest::getQueryAlignment(){
666 return bestQueryAlignment;
670 //***************************************************************************************************************
672 string RefChimeraTest::getClosestRefAlignment(){
674 return bestRefAlignment;
678 //***************************************************************************************************************