5 * Created by Pat Schloss on 9/5/11.
6 * Copyright 2011 Patrick D. Schloss. All rights reserved.
10 #include "myPerseus.h"
12 /**************************************************************************************************/
13 int PERSEUSMAXINT = numeric_limits<int>::max();
14 /**************************************************************************************************/
16 vector<vector<double> > Perseus::binomial(int maxOrder){
18 vector<vector<double> > binomial(maxOrder+1);
20 for(int i=0;i<=maxOrder;i++){
21 binomial[i].resize(maxOrder+1);
30 for(int i=2;i<=maxOrder;i++){
34 for(int i=2;i<=maxOrder;i++){
35 for(int j=1;j<=maxOrder;j++){
36 if(i==j){ binomial[i][j]=1; }
37 if(j>i) { binomial[i][j]=0; }
38 else { binomial[i][j]=binomial[i-1][j-1]+binomial[i-1][j]; }
45 m->errorOut(e, "Perseus", "binomial");
50 /**************************************************************************************************/
51 double Perseus::basicPairwiseAlignSeqs(string query, string reference, string& qAlign, string& rAlign, pwModel model){
53 double GAP = model.GAP_OPEN;
54 double MATCH = model.MATCH;
55 double MISMATCH = model.MISMATCH;
57 int queryLength = query.size();
58 int refLength = reference.size();
60 vector<vector<double> > alignMatrix(queryLength + 1);
61 vector<vector<char> > alignMoves(queryLength + 1);
63 for(int i=0;i<=queryLength;i++){
64 if (m->control_pressed) { return 0; }
65 alignMatrix[i].resize(refLength + 1, 0);
66 alignMoves[i].resize(refLength + 1, 'x');
69 for(int i=0;i<=queryLength;i++){
70 if (m->control_pressed) { return 0; }
71 alignMatrix[i][0] = GAP * i;
72 alignMoves[i][0] = 'u';
75 for(int i=0;i<=refLength;i++){
76 if (m->control_pressed) { return 0; }
77 alignMatrix[0][i] = GAP * i;
78 alignMoves[0][i] = 'l';
81 for(int i=1;i<=queryLength;i++){
83 if (m->control_pressed) { return 0; }
85 for(int j=1;j<=refLength;j++){
88 if(query[i-1] == reference[j-1]){ nogapScore = alignMatrix[i-1][j-1] + MATCH; }
89 else { nogapScore = alignMatrix[i-1][j-1] + MISMATCH; }
92 if(i == queryLength) { leftScore = alignMatrix[i][j-1]; }
93 else { leftScore = alignMatrix[i][j-1] + GAP; }
97 if(j == refLength) { upScore = alignMatrix[i-1][j]; }
98 else { upScore = alignMatrix[i-1][j] + GAP; }
100 if(nogapScore > leftScore){
101 if(nogapScore > upScore){
102 alignMoves[i][j] = 'd';
103 alignMatrix[i][j] = nogapScore;
106 alignMoves[i][j] = 'u';
107 alignMatrix[i][j] = upScore;
111 if(leftScore > upScore){
112 alignMoves[i][j] = 'l';
113 alignMatrix[i][j] = leftScore;
116 alignMoves[i][j] = 'u';
117 alignMatrix[i][j] = upScore;
132 while(i > 0 && j > 0){
134 if (m->control_pressed) { return 0; }
136 if(alignMoves[i][j] == 'd'){
137 qAlign = query[i-1] + qAlign;
138 rAlign = reference[j-1] + rAlign;
140 if(query[i-1] != reference[j-1]){ diffs++; }
146 else if(alignMoves[i][j] == 'u'){
147 qAlign = query[i-1] + qAlign;
149 if(j != refLength) { rAlign = '-' + rAlign; diffs++; length++; }
150 else { rAlign = '.' + rAlign; }
153 else if(alignMoves[i][j] == 'l'){
154 rAlign = reference[j-1] + rAlign;
156 if(i != queryLength){ qAlign = '-' + qAlign; diffs++; length++; }
157 else { qAlign = '.' + qAlign; }
164 if (m->control_pressed) { return 0; }
166 rAlign = '.' + rAlign;
167 qAlign = query[i-1] + qAlign;
173 if (m->control_pressed) { return 0; }
175 rAlign = reference[j-1] + rAlign;
176 qAlign = '.' + qAlign;
182 return double(diffs)/double(length);
184 catch(exception& e) {
185 m->errorOut(e, "Perseus", "basicPairwiseAlignSeqs");
190 /**************************************************************************************************/
191 int Perseus::getDiffs(string qAlign, string rAlign, vector<int>& leftDiffs, vector<int>& leftMap, vector<int>& rightDiffs, vector<int>& rightMap){
193 int alignLength = qAlign.length();
197 for(int l=0;l<alignLength;l++){
199 if (m->control_pressed) { return 0; }
201 if(qAlign[l] == '-'){
204 else if(qAlign[l] != '.'){
206 if(rAlign[l] == '-'){
209 else if(qAlign[l] != rAlign[l] && rAlign[l] != '.'){
212 leftDiffs[lCount] = lDiffs;
221 for(int l=alignLength-1;l>=0;l--){
223 if (m->control_pressed) { return 0; }
225 if(qAlign[l] == '-'){
228 else if(qAlign[l] != '.'){
230 if(rAlign[l] == '-'){
233 else if(qAlign[l] != rAlign[l] && rAlign[l] != '.'){
237 rightDiffs[rCount] = rDiffs;
238 rightMap[rCount] = l;
247 catch(exception& e) {
248 m->errorOut(e, "Perseus", "getDiffs");
252 /**************************************************************************************************/
253 int Perseus::getLastMatch(char direction, vector<vector<char> >& alignMoves, int i, int j, string& seqA, string& seqB){
255 char nullReturn = -1;
259 if (m->control_pressed) { return 0; }
261 if(direction == 'd'){
262 if(seqA[i-1] == seqB[j-1]) { return seqA[i-1]; }
263 else { return nullReturn; }
266 else if(direction == 'l') { j--; }
269 direction = alignMoves[i][j];
274 catch(exception& e) {
275 m->errorOut(e, "Perseus", "getLastMatch");
280 /**************************************************************************************************/
282 int Perseus::toInt(char b){
284 if(b == 'A') { return 0; }
285 else if(b == 'C') { return 1; }
286 else if(b == 'T') { return 2; }
287 else if(b == 'G') { return 3; }
288 else { m->mothurOut("[ERROR]: " + toString(b) + " is not ATGC."); m->mothurOutEndLine(); return -1; }
290 catch(exception& e) {
291 m->errorOut(e, "Perseus", "toInt");
296 /**************************************************************************************************/
298 double Perseus::modeledPairwiseAlignSeqs(string query, string reference, string& qAlign, string& rAlign, vector<vector<double> >& correctMatrix){
300 int queryLength = query.size();
301 int refLength = reference.size();
303 vector<vector<double> > alignMatrix(queryLength + 1);
304 vector<vector<char> > alignMoves(queryLength + 1);
306 for(int i=0;i<=queryLength;i++){
307 if (m->control_pressed) { return 0; }
308 alignMatrix[i].resize(refLength + 1, 0);
309 alignMoves[i].resize(refLength + 1, 'x');
312 for(int i=0;i<=queryLength;i++){
313 if (m->control_pressed) { return 0; }
314 alignMatrix[i][0] = 15.0 * i;
315 alignMoves[i][0] = 'u';
318 for(int i=0;i<=refLength;i++){
319 if (m->control_pressed) { return 0; }
320 alignMatrix[0][i] = 15.0 * i;
321 alignMoves[0][i] = 'l';
324 for(int i=1;i<=queryLength;i++){
326 if (m->control_pressed) { return 0; }
328 for(int j=1;j<=refLength;j++){
331 nogap = alignMatrix[i-1][j-1] + correctMatrix[toInt(query[i-1])][toInt(reference[j-1])];
336 if(i == queryLength){ //terminal gap
337 left = alignMatrix[i][j-1];
340 if(reference[j-1] == getLastMatch('l', alignMoves, i, j, query, reference)){
347 left = alignMatrix[i][j-1] + gap;
351 if(j == refLength){ //terminal gap
352 up = alignMatrix[i-1][j];
356 if(query[i-1] == getLastMatch('u', alignMoves, i, j, query, reference)){
363 up = alignMatrix[i-1][j] + gap;
369 alignMoves[i][j] = 'd';
370 alignMatrix[i][j] = nogap;
373 alignMoves[i][j] = 'u';
374 alignMatrix[i][j] = up;
379 alignMoves[i][j] = 'l';
380 alignMatrix[i][j] = left;
383 alignMoves[i][j] = 'u';
384 alignMatrix[i][j] = up;
395 while(i > 0 && j > 0){
397 if (m->control_pressed) { return 0; }
399 if(alignMoves[i][j] == 'd'){
400 qAlign = query[i-1] + qAlign;
401 rAlign = reference[j-1] + rAlign;
406 else if(alignMoves[i][j] == 'u'){
408 qAlign = query[i-1] + qAlign;
409 rAlign = '-' + rAlign;
415 else if(alignMoves[i][j] == 'l'){
416 if(i != queryLength){
417 qAlign = '-' + qAlign;
418 rAlign = reference[j-1] + rAlign;
426 return alignMatrix[queryLength][refLength] / (double)alignLength;
428 catch(exception& e) {
429 m->errorOut(e, "Perseus", "modeledPairwiseAlignSeqs");
434 /**************************************************************************************************/
435 int Perseus::getAlignments(int curSequenceIndex, vector<seqData> sequences, vector<pwAlign>& alignments, vector<vector<int> >& leftDiffs, vector<vector<int> >& leftMaps, vector<vector<int> >& rightDiffs, vector<vector<int> >& rightMaps, int& bestRefSeq, int& bestRefDiff, vector<bool>& restricted){
437 int numSeqs = sequences.size();
438 //int bestSequenceMismatch = PERSEUSMAXINT;
440 string curSequence = sequences[curSequenceIndex].sequence;
441 int curFrequency = sequences[curSequenceIndex].frequency;
446 int bestDiffs = PERSEUSMAXINT;
449 pwModel model(0, -1, -1.5);
451 for(int i=0;i<numSeqs;i++){
453 if (m->control_pressed) { return 0; }
455 if(i != curSequenceIndex && restricted[i] != 1 && sequences[i].frequency >= 2 * curFrequency){
456 string refSequence = sequences[i].sequence;
458 leftDiffs[i].assign(curSequence.length(), 0);
459 leftMaps[i].assign(curSequence.length(), 0);
460 rightDiffs[i].assign(curSequence.length(), 0);
461 rightMaps[i].assign(curSequence.length(), 0);
463 basicPairwiseAlignSeqs(curSequence, refSequence, alignments[i].query, alignments[i].reference, model);
466 getDiffs(alignments[i].query, alignments[i].reference, leftDiffs[i], leftMaps[i], rightDiffs[i], rightMaps[i]);
468 int diffs = rightDiffs[i][curSequence.length()-1];
470 if(diffs < bestDiffs){
482 bestRefSeq = bestIndex;
483 bestRefDiff = bestDiffs;
487 catch(exception& e) {
488 m->errorOut(e, "Perseus", "getAlignments");
492 /**************************************************************************************************/
493 int Perseus::getChimera(vector<seqData> sequences,
494 vector<vector<int> >& leftDiffs,
495 vector<vector<int> >& rightDiffs,
499 vector<int>& singleLeft,
500 vector<int>& bestLeft,
501 vector<int>& singleRight,
502 vector<int>& bestRight,
503 vector<bool> restricted){
505 int numRefSeqs = restricted.size();
506 int seqLength = leftDiffs[0].size();
508 singleLeft.resize(seqLength, PERSEUSMAXINT);
509 bestLeft.resize(seqLength, -1);
511 for(int l=0;l<seqLength;l++){
513 if (m->control_pressed) { return 0; }
515 for(int i=0;i<numRefSeqs;i++){
517 if(restricted[i] == 0){
518 if(((leftDiffs[i][l] < singleLeft[l]) && sequences[i].frequency) || ((leftDiffs[i][l] == singleLeft[l]) && (sequences[i].frequency > sequences[bestLeft[l]].frequency))){
519 singleLeft[l] = leftDiffs[i][l];
526 singleRight.resize(seqLength, PERSEUSMAXINT);
527 bestRight.resize(seqLength, -1);
529 for(int l=0;l<seqLength;l++){
531 if (m->control_pressed) { return 0; }
533 for(int i=0;i<numRefSeqs;i++){
535 if(restricted[i] == 0){
536 if((rightDiffs[i][l] < singleRight[l] && sequences[i].frequency) || ((rightDiffs[i][l] == singleRight[l] && sequences[i].frequency > sequences[bestRight[l]].frequency))){
537 singleRight[l] = rightDiffs[i][l];
546 int bestChimeraMismatches = PERSEUSMAXINT;
551 for(int l=0;l<seqLength-1;l++){
553 if (m->control_pressed) { return 0; }
555 int chimera = singleLeft[l] + singleRight[seqLength - l - 2];
556 if(chimera < bestChimeraMismatches){
557 bestChimeraMismatches = chimera;
559 leftParent = bestLeft[l];
560 rightParent = bestRight[seqLength - l - 2];
564 return bestChimeraMismatches;
566 catch(exception& e) {
567 m->errorOut(e, "Perseus", "getChimera");
572 /**************************************************************************************************/
574 string Perseus::stitchBimera(vector<pwAlign>& alignments, int leftParent, int rightParent, int breakPoint, vector<vector<int> >& leftMaps, vector<vector<int> >& rightMaps){
576 int breakLeft = leftMaps[leftParent][breakPoint];
577 int breakRight = rightMaps[rightParent][rightMaps[rightParent].size() - breakPoint - 2];
579 string left = alignments[leftParent].reference;
580 string right = alignments[rightParent].reference;
583 for(int i=0;i<=breakLeft;i++){
585 if (m->control_pressed) { return 0; }
587 if(left[i] != '-' && left[i] != '.'){
593 for(int i=breakRight;i<right.length();i++){
595 if (m->control_pressed) { return 0; }
597 if(right[i] != '-' && right[i] != '.'){
604 catch(exception& e) {
605 m->errorOut(e, "Perseus", "stitchBimera");
609 /**************************************************************************************************/
610 int Perseus::getTrimera(vector<seqData>& sequences,
611 vector<vector<int> >& leftDiffs,
617 vector<int>& singleLeft,
618 vector<int>& bestLeft,
619 vector<int>& singleRight,
620 vector<int>& bestRight,
621 vector<bool> restricted){
623 int numRefSeqs = leftDiffs.size();
624 int alignLength = leftDiffs[0].size();
625 int bestTrimeraMismatches = PERSEUSMAXINT;
634 vector<vector<int> > minDelta(alignLength);
635 vector<vector<int> > minDeltaSeq(alignLength);
637 for(int i=0;i<alignLength;i++){
638 if (m->control_pressed) { return 0; }
639 minDelta[i].assign(alignLength, PERSEUSMAXINT);
640 minDeltaSeq[i].assign(alignLength, -1);
643 for(int x=0;x<alignLength;x++){
644 for(int y=x;y<alignLength-1;y++){
645 for(int i=0;i<numRefSeqs;i++){
647 if (m->control_pressed) { return 0; }
649 if(restricted[i] == 0){
650 int delta = leftDiffs[i][y] - leftDiffs[i][x];
652 if(delta < minDelta[x][y] || (delta == minDelta[x][y] && sequences[i].frequency > sequences[minDeltaSeq[x][y]].frequency)){
653 minDelta[x][y] = delta;
654 minDeltaSeq[x][y] = i;
658 minDelta[x][y] += singleLeft[x] + singleRight[alignLength - y - 2];
660 if(minDelta[x][y] < bestTrimeraMismatches){
661 bestTrimeraMismatches = minDelta[x][y];
666 leftParent = bestLeft[x];
667 middleParent = minDeltaSeq[x][y];
668 rightParent = bestRight[alignLength - y - 2];
673 return bestTrimeraMismatches;
675 catch(exception& e) {
676 m->errorOut(e, "Perseus", "getTrimera");
681 /**************************************************************************************************/
683 string Perseus::stitchTrimera(vector<pwAlign> alignments, int leftParent, int middleParent, int rightParent, int breakPointA, int breakPointB, vector<vector<int> >& leftMaps, vector<vector<int> >& rightMaps){
685 int p1SplitPoint = leftMaps[leftParent][breakPointA];
686 int p2SplitPoint = leftMaps[middleParent][breakPointB];
687 int p3SplitPoint = rightMaps[rightParent][rightMaps[rightParent].size() - breakPointB - 2];
689 string chimeraRefSeq;
690 for(int i=0;i<=p1SplitPoint;i++){
691 if (m->control_pressed) { return chimeraRefSeq; }
692 if(alignments[leftParent].reference[i] != '-' && alignments[leftParent].reference[i] != '.'){
693 chimeraRefSeq += alignments[leftParent].reference[i];
697 for(int i=p1SplitPoint+1;i<=p2SplitPoint;i++){
698 if (m->control_pressed) { return chimeraRefSeq; }
699 if(alignments[middleParent].reference[i] != '-' && alignments[middleParent].reference[i] != '.'){
700 chimeraRefSeq += alignments[middleParent].reference[i];
704 for(int i=p3SplitPoint;i<alignments[rightParent].reference.length();i++){
705 if (m->control_pressed) { return chimeraRefSeq; }
706 if(alignments[rightParent].reference[i] != '-' && alignments[rightParent].reference[i] != '.'){
707 chimeraRefSeq += alignments[rightParent].reference[i];
711 return chimeraRefSeq;
713 catch(exception& e) {
714 m->errorOut(e, "Perseus", "stitchTrimera");
719 /**************************************************************************************************/
721 int Perseus::threeWayAlign(string query, string parent1, string parent2, string& qAlign, string& aAlign, string& bAlign){
723 pwModel model(1.0, -1.0, -5.0);
728 basicPairwiseAlignSeqs(query, parent1, qL, rL, model);
729 basicPairwiseAlignSeqs(query, parent2, qR, rR, model);
731 int lLength = qL.length();
732 int rLength = qR.length();
740 while(lIndex<lLength || rIndex<rLength){
742 if (m->control_pressed) { return 0; }
744 if(qL[lIndex] == qR[rIndex]){
753 else if(qL[lIndex] == '-' || qL[lIndex] == '.'){
754 //insert a gap into the right sequences
759 if(rIndex != rLength){
768 else if(qR[rIndex] == '-' || qR[rIndex] == '.'){
769 //insert a gap into the left sequences
775 if(lIndex != lLength){
795 for(int i=0;i<qAlign.length();i++){
797 if (m->control_pressed) { return 0; }
800 if(qAlign[i] == '-') { qAlign[i] = '.'; }
804 if(aAlign[i] == '-') { aAlign[i] = '.'; }
808 if(bAlign[i] == '-') { bAlign[i] = '.'; }
811 if(aStart == 1 && bStart == 1 && qStart == 1){
817 catch(exception& e) {
818 m->errorOut(e, "Perseus", "threeWayAlign");
823 /**************************************************************************************************/
825 double Perseus::calcLoonIndex(string query, string parent1, string parent2, int breakPoint, vector<vector<double> >& binMatrix){
827 string queryAln, leftParentAln, rightParentAln;
828 threeWayAlign(query, parent1, parent2, queryAln, leftParentAln, rightParentAln);
830 int alignLength = queryAln.length();
832 int endPos = alignLength;
833 for(int i=alignLength-1;i>=0; i--){
834 if(queryAln[i] != '.' && leftParentAln[i] != '.' && rightParentAln[i] != '.'){
840 int diffToLeftCount = 0;
841 vector<int> diffToLeftMap(alignLength, 0);
843 int diffToRightCount = 0;
844 vector<int> diffToRightMap(alignLength, 0);
846 for(int i=0;i<endPos;i++){
848 if (m->control_pressed) { return 0; }
850 if(queryAln[i] != leftParentAln[i]){
851 diffToLeftMap[diffToLeftCount] = i;
855 if(queryAln[i] != rightParentAln[i]){
856 diffToRightMap[diffToRightCount] = i;
863 diffToLeftMap[diffToLeftCount] = endPos;
864 diffToRightMap[diffToRightCount] = endPos;
873 splits.push_back(-1);
874 diffs.push_back(diffToRightCount);
877 while(indexL < diffToLeftCount || indexR < diffToRightCount){
879 if (m->control_pressed) { return 0; }
881 if(diffToLeftMap[indexL] <= diffToRightMap[indexR]){
882 diffs.push_back(diffs[indexS - 1] + 1);
883 splits.push_back(diffToLeftMap[indexL]);
888 else if(diffToLeftMap[indexL] > diffToRightMap[indexR]) {
889 diffs.push_back(diffs[indexS - 1] - 1);
890 splits.push_back(diffToRightMap[indexR]);
897 int minDiff = PERSEUSMAXINT;
899 for(int i=0;i<indexS;i++){
901 if (m->control_pressed) { return 0; }
903 if(diffs[i] < minDiff){
909 int splitPos = endPos;
910 if(minIndex < indexS - 1){
911 splitPos = (splits[minIndex]+splits[minIndex+1]) / 2;
914 int diffToChimera = 0;
915 int leftDiffToP1 = 0;
916 int rightDiffToP1 = 0;
917 int leftDiffToP2 = 0;
918 int rightDiffToP2 = 0;
920 for(int i=0;i<endPos;i++){
922 if (m->control_pressed) { return 0; }
924 char bQuery = queryAln[i];
925 char bP1 = leftParentAln[i];
926 char bP2 = rightParentAln[i];
928 char bConsensus = bQuery;
929 if(bP1 == bP2){ bConsensus = bP1; }
931 if(bConsensus != bQuery){
935 if(bConsensus != bP1){
943 if(bConsensus != bP2){
954 int diffToClosestParent, diffToFurtherParent;
956 double aFraction, bFraction;
958 if(diffToLeftCount <= diffToRightCount){ //if parent 1 is closer
960 diffToClosestParent = leftDiffToP1 + rightDiffToP1;
964 diffToFurtherParent = leftDiffToP2 + rightDiffToP2;
968 aFraction = double(splitPos + 1)/(double) endPos;
969 bFraction = 1 - aFraction;
972 else{ //if parent 2 is closer
974 diffToClosestParent = leftDiffToP2 + rightDiffToP2;
978 diffToFurtherParent = leftDiffToP1 + rightDiffToP1;
982 bFraction = double(splitPos + 1)/(double) endPos;
983 aFraction = 1 - bFraction;
987 double loonIndex = 0;
989 int totalDifference = diffToClosestParent + diffToChimera;
991 if(totalDifference > 0){
994 for(int i=diffToClosestParent;i<=totalDifference;i++){
995 prob += binMatrix[totalDifference][i] * pow(0.50, i) * pow(0.50, totalDifference - i);
997 loonIndex += -log(prob);
1000 if(diffToFurtherParent > 0){
1003 for(int i=yA;i<=diffToFurtherParent;i++){
1004 prob += binMatrix[diffToFurtherParent][i] * pow(aFraction, i) * pow(1-aFraction, diffToFurtherParent - i);
1006 loonIndex += -log(prob);
1009 if(diffToClosestParent > 0){
1012 for(int i=xB;i<=diffToClosestParent;i++){
1013 prob += binMatrix[diffToClosestParent][i] * pow(bFraction, i) * pow(1-bFraction, diffToClosestParent - i);
1015 loonIndex += -log(prob);
1020 catch(exception& e) {
1021 m->errorOut(e, "Perseus", "calcLoonIndex");
1026 /**************************************************************************************************/
1028 double Perseus::calcBestDistance(string query, string reference){
1030 int alignLength = query.length();
1034 for(int i=0;i<alignLength;i++){
1036 if (m->control_pressed) { return 0; }
1038 if((query[i] != '.' || reference[i] != '.') && (query[i] != '-' && reference[i] != '-')){
1039 if(query[i] != reference[i]){ mismatch++; }
1044 return (double)mismatch / (double)counter;
1046 catch(exception& e) {
1047 m->errorOut(e, "Perseus", "calcBestDistance");
1052 /**************************************************************************************************/
1054 double Perseus::classifyChimera(double singleDist, double cIndex, double loonIndex, double alpha, double beta){
1056 double difference = cIndex - singleDist; //y
1059 if(cIndex >= 0.15 || difference > 0.00){
1060 probability = 0.0000;
1063 probability = 1.0 / (1.0 + exp(-(alpha + beta * loonIndex)));
1068 catch(exception& e) {
1069 m->errorOut(e, "Perseus", "classifyChimera");
1073 /**************************************************************************************************/