1 //uchime by Robert C. Edgar http://drive5.com/uchime This code is donated to the public domain.
\r
11 void Make3Way(const SeqData &SDQ, const SeqData &SDA, const SeqData &SDB,
\r
12 const string &PathQA, const string &PathQB,
\r
13 string &Q3, string &A3, string &B3);
\r
15 void AlignChimeLocal3(const string &Q3, const string &A3, const string &B3,
\r
16 const string &QLabel, const string &ALabel, const string &BLabel,
\r
19 double GetScore2(double Y, double N, double A)
\r
21 return Y/(opt_xn*(N + opt_dn) + opt_xa*A);
\r
24 void AlignChimeGlobal3(const string &Q3, const string &A3, const string &B3,
\r
25 const string &QLabel, const string &ALabel, const string &BLabel,
\r
29 Hit.QLabel = QLabel;
\r
31 const byte *Q3Seq = (const byte *) Q3.c_str();
\r
32 const byte *A3Seq = (const byte *) A3.c_str();
\r
33 const byte *B3Seq = (const byte *) B3.c_str();
\r
35 const unsigned ColCount = SIZE(Q3);
\r
36 asserta(SIZE(A3) == ColCount && SIZE(B3) == ColCount);
\r
39 Log("Q %5u %*.*s\n", ColCount, ColCount, ColCount, Q3Seq);
\r
40 Log("A %5u %*.*s\n", ColCount, ColCount, ColCount, A3Seq);
\r
41 Log("B %5u %*.*s\n", ColCount, ColCount, ColCount, B3Seq);
\r
44 // Discard terminal gaps
\r
45 unsigned ColLo = UINT_MAX;
\r
46 unsigned ColHi = UINT_MAX;
\r
47 for (unsigned Col = 2; Col + 2 < ColCount; ++Col)
\r
49 char q = Q3Seq[Col];
\r
50 char a = A3Seq[Col];
\r
51 char b = B3Seq[Col];
\r
53 if (isacgt(q) && isacgt(a) && isacgt(b))
\r
55 if (ColLo == UINT_MAX)
\r
61 if (ColLo == UINT_MAX)
\r
67 unsigned DiffCount = 0;
\r
69 vector<unsigned> ColToQPos(ColLo, UINT_MAX);
\r
70 vector<unsigned> AccumCount(ColLo, UINT_MAX);
\r
71 vector<unsigned> AccumSameA(ColLo, UINT_MAX);
\r
72 vector<unsigned> AccumSameB(ColLo, UINT_MAX);
\r
73 vector<unsigned> AccumForA(ColLo, UINT_MAX);
\r
74 vector<unsigned> AccumForB(ColLo, UINT_MAX);
\r
75 vector<unsigned> AccumAbstain(ColLo, UINT_MAX);
\r
76 vector<unsigned> AccumAgainst(ColLo, UINT_MAX);
\r
78 unsigned SumSameA = 0;
\r
79 unsigned SumSameB = 0;
\r
80 unsigned SumSameAB = 0;
\r
82 unsigned SumForA = 0;
\r
83 unsigned SumForB = 0;
\r
84 unsigned SumAbstain = 0;
\r
85 unsigned SumAgainst = 0;
\r
86 for (unsigned Col = ColLo; Col <= ColHi; ++Col)
\r
88 char q = Q3Seq[Col];
\r
89 char a = A3Seq[Col];
\r
90 char b = B3Seq[Col];
\r
92 if (isacgt(q) && isacgt(a) && isacgt(b))
\r
100 if (q == a && q != b)
\r
102 if (q == b && q != a)
\r
104 if (a == b && q != a)
\r
106 if (q != a && q != b)
\r
111 ColToQPos.push_back(QPos);
\r
112 AccumSameA.push_back(SumSameA);
\r
113 AccumSameB.push_back(SumSameB);
\r
114 AccumCount.push_back(Sum);
\r
115 AccumForA.push_back(SumForA);
\r
116 AccumForB.push_back(SumForB);
\r
117 AccumAbstain.push_back(SumAbstain);
\r
118 AccumAgainst.push_back(SumAgainst);
\r
128 asserta(SIZE(ColToQPos) == ColHi+1);
\r
129 asserta(SIZE(AccumSameA) == ColHi+1);
\r
130 asserta(SIZE(AccumSameB) == ColHi+1);
\r
131 asserta(SIZE(AccumAbstain) == ColHi+1);
\r
132 asserta(SIZE(AccumAgainst) == ColHi+1);
\r
134 double IdQA = double(SumSameA)/Sum;
\r
135 double IdQB = double(SumSameB)/Sum;
\r
136 double IdAB = double(SumSameAB)/Sum;
\r
137 double MaxId = max(IdQA, IdQB);
\r
140 Log("IdQA=%.1f%% IdQB=%.1f%% IdAB=%.1f\n", IdQA*100.0, IdQB*100.0, IdAB*100.0);
\r
142 Log(" x AQB IdAL IdBL IdAR IdBR DivAB DivBA YAL YBL YAR YBR AbL AbR ScoreAB ScoreAB XLo Xhi\n");
\r
143 Log("----- --- ----- ----- ----- ----- ------ ------ ----- ----- ----- ----- ----- ----- ------- ------- ----- -----\n");
\r
145 unsigned BestXLo = UINT_MAX;
\r
146 unsigned BestXHi = UINT_MAX;
\r
147 double BestDiv = 0.0;
\r
148 double BestIdQM = 0.0;
\r
149 double BestScore = 0.0;
\r
151 // Find range of cols BestXLo..BestXHi that maximizes score
\r
152 bool FirstA = false;
\r
154 // NOTE: Must be < ColHi not <= because use Col+1 below
\r
155 for (unsigned Col = ColLo; Col < ColHi; ++Col)
\r
157 char q = Q3Seq[Col];
\r
158 char a = A3Seq[Col];
\r
159 char b = B3Seq[Col];
\r
161 unsigned SameAL = AccumSameA[Col];
\r
162 unsigned SameBL = AccumSameB[Col];
\r
163 unsigned SameAR = SumSameA - AccumSameA[Col];
\r
164 unsigned SameBR = SumSameB - AccumSameB[Col];
\r
166 double IdAB = double(SameAL + SameBR)/Sum;
\r
167 double IdBA = double(SameBL + SameAR)/Sum;
\r
169 unsigned ForAL = AccumForA[Col];
\r
170 unsigned ForBL = AccumForB[Col];
\r
171 unsigned ForAR = SumForA - AccumForA[Col+1];
\r
172 unsigned ForBR = SumForB - AccumForB[Col+1];
\r
173 unsigned AbL = AccumAbstain[Col];
\r
174 unsigned AbR = SumAbstain - AccumAbstain[Col+1];
\r
176 double ScoreAB = GetScore2(ForAL, ForBL, AbL)*GetScore2(ForBR, ForAR, AbR);
\r
177 double ScoreBA = GetScore2(ForBL, ForAL, AbL)*GetScore2(ForAR, ForBR, AbR);
\r
179 double DivAB = IdAB/MaxId;
\r
180 double DivBA = IdBA/MaxId;
\r
181 double MaxDiv = max(DivAB, DivBA);
\r
183 //if (MaxDiv > BestDiv)
\r
185 // BestDiv = MaxDiv;
\r
188 // FirstA = (DivAB > DivBA);
\r
190 // BestIdQM = IdAB;
\r
192 // BestIdQM = IdBA;
\r
194 //else if (MaxDiv == BestDiv)
\r
197 double MaxScore = max(ScoreAB, ScoreBA);
\r
198 if (MaxScore > BestScore)
\r
200 BestScore = MaxScore;
\r
203 FirstA = (ScoreAB > ScoreBA);
\r
208 if (MaxDiv > BestDiv)
\r
211 else if (MaxScore == BestScore)
\r
214 if (MaxDiv > BestDiv)
\r
221 char q = Q3Seq[Col];
\r
222 char a = A3Seq[Col];
\r
223 char b = B3Seq[Col];
\r
224 Log(" %c%c%c", a, q, b);
\r
225 Log(" %5u", SameAL);
\r
226 Log(" %5u", SameBL);
\r
227 Log(" %5u", SameAR);
\r
228 Log(" %5u", SameBR);
\r
229 Log(" %5.4f", DivAB);
\r
230 Log(" %5.4f", DivBA);
\r
231 Log(" %5u", ForAL);
\r
232 Log(" %5u", ForBL);
\r
233 Log(" %5u", ForAR);
\r
234 Log(" %5u", ForBR);
\r
237 Log(" %7.4f", ScoreAB);
\r
238 Log(" %7.4f", ScoreBA);
\r
239 if (BestXLo != UINT_MAX)
\r
240 Log(" %5u", BestXLo);
\r
241 if (BestXHi != UINT_MAX)
\r
242 Log(" %5u", BestXHi);
\r
248 if (BestXLo == UINT_MAX)
\r
252 Log("No crossover found.\n");
\r
257 Log("BestX col %u - %u\n", BestXLo, BestXHi);
\r
260 // Find maximum region of identity within BestXLo..BestXHi
\r
261 unsigned ColXLo = (BestXLo + BestXHi)/2;
\r
262 unsigned ColXHi = ColXLo;
\r
263 unsigned SegLo = UINT_MAX;
\r
264 unsigned SegHi = UINT_MAX;
\r
265 for (unsigned Col = BestXLo; Col <= BestXHi; ++Col)
\r
267 char q = Q3Seq[Col];
\r
268 char a = A3Seq[Col];
\r
269 char b = B3Seq[Col];
\r
271 if (q == a && q == b)
\r
273 if (SegLo == UINT_MAX)
\r
279 unsigned SegLength = SegHi - SegLo + 1;
\r
280 unsigned BestSegLength = ColXHi - ColXLo + 1;
\r
281 if (SegLength > BestSegLength)
\r
290 unsigned SegLength = SegHi - SegLo + 1;
\r
291 unsigned BestSegLength = ColXHi - ColXLo + 1;
\r
292 if (SegLength > BestSegLength)
\r
299 for (unsigned x = 0; x < ColCount; ++x)
\r
303 else if (x == ColXHi)
\r
313 Hit.ColXLo = ColXLo;
\r
314 Hit.ColXHi = ColXHi;
\r
318 // Hit.LY = AccumForA[ColXLo];
\r
319 // Hit.LN = AccumForB[ColXLo];
\r
321 // Hit.RY = SumForB - AccumForB[ColXHi];
\r
322 // Hit.RN = SumForA - AccumForA[ColXHi];
\r
326 // Hit.LY = AccumForB[ColXLo];
\r
327 // Hit.LN = AccumForA[ColXLo];
\r
328 // Hit.RY = SumForA - AccumForA[ColXHi];
\r
329 // Hit.RN = SumForB - AccumForB[ColXHi];
\r
332 //Hit.LA = AccumAgainst[ColXLo];
\r
333 //Hit.LD = AccumAbstain[ColXLo];
\r
335 //Hit.RA = SumAgainst - AccumAgainst[ColXHi];
\r
336 //Hit.RD = SumAbstain - AccumAbstain[ColXHi];
\r
338 Hit.PctIdAB = IdAB*100.0;
\r
339 Hit.PctIdQM = BestIdQM*100.0;
\r
341 Hit.Div = (BestDiv - 1.0)*100.0;
\r
345 Hit.QLabel = QLabel;
\r
350 //Hit.PathQA = PathQA;
\r
351 //Hit.PathQB = PathQB;
\r
354 Hit.ALabel = ALabel;
\r
355 Hit.BLabel = BLabel;
\r
356 Hit.PctIdQA = IdQA*100.0;
\r
357 Hit.PctIdQB = IdQB*100.0;
\r
363 Hit.ALabel = BLabel;
\r
364 Hit.BLabel = ALabel;
\r
365 Hit.PctIdQA = IdQB*100.0;
\r
366 Hit.PctIdQB = IdQA*100.0;
\r
377 //vector<float> Cons;
\r
378 //for (unsigned Col = 0; Col < ColCount; ++Col)
\r
380 // char q = Q3Seq[Col];
\r
381 // char a = A3Seq[Col];
\r
382 // char b = B3Seq[Col];
\r
383 // if (q == a && q == b && a == b)
\r
385 // Cons.push_back(1.0f);
\r
389 // bool gapq = isgap(q);
\r
390 // bool gapa = isgap(a);
\r
391 // bool gapb = isgap(b);
\r
393 // if (!gapq && !gapa && !gapb)
\r
395 // if (q == a || q == b || a == b)
\r
396 // Cons.push_back(0.75);
\r
398 // Cons.push_back(0.5);
\r
402 // if (!gapa && (a == b || a == q))
\r
403 // Cons.push_back(0.5f);
\r
404 // else if (!gapb && b == q)
\r
405 // Cons.push_back(0.5f);
\r
407 // Cons.push_back(0.0f);
\r
411 //float fLY = 0.0f;
\r
412 //float fLN = 0.0f;
\r
413 //float fLA = 0.0f;
\r
414 //float fRY = 0.0f;
\r
415 //float fRN = 0.0f;
\r
416 //float fRA = 0.0f;
\r
417 for (unsigned Col = ColLo; Col <= ColHi; ++Col)
\r
419 char q = Q3Seq[Col];
\r
420 char a = A3Seq[Col];
\r
421 char b = B3Seq[Col];
\r
422 if (q == a && q == b && a == b)
\r
425 unsigned ngaps = 0;
\r
447 //float AvgCons = (Cons[Col-2] + Cons[Col-1] + Cons[Col+1] + Cons[Col+2])/4;
\r
448 //if (Col < ColXLo)
\r
450 // if (q == a && q != b)
\r
452 // else if (q == b && q != a)
\r
457 //else if (Col > ColXHi)
\r
459 // if (q == b && q != a)
\r
461 // else if (q == a && q != b)
\r
469 if (Col > 0 && (isgap(Q3Seq[Col-1]) || isgap(A3Seq[Col-1]) || isgap(B3Seq[Col-1])))
\r
471 if (Col + 1 < ColCount && (isgap(Q3Seq[Col+1]) || isgap(A3Seq[Col+1]) || isgap(B3Seq[Col+1])))
\r
475 //if (Col > 0 && isgap(Q3Seq[Col-1]))
\r
477 //if (Col + 1 < ColCount && isgap(Q3Seq[Col+1]))
\r
482 if (q == a && q != b)
\r
484 else if (q == b && q != a)
\r
489 else if (Col > ColXHi)
\r
491 if (q == b && q != a)
\r
493 else if (q == a && q != b)
\r
500 double ScoreL = GetScore2(Hit.CS_LY, Hit.CS_LN, Hit.CS_LA);
\r
501 double ScoreR = GetScore2(Hit.CS_RY, Hit.CS_RN, Hit.CS_RA);
\r
502 Hit.Score = ScoreL*ScoreR;
\r
504 extern bool g_UchimeDeNovo;
\r
506 //if (0)//g_UchimeDeNovo)
\r
508 // double AbQ = GetAbFromLabel(QLabel.c_str());
\r
509 // double AbA = GetAbFromLabel(ALabel.c_str());
\r
510 // double AbB = GetAbFromLabel(BLabel.c_str());
\r
511 // if (AbQ > 0.0 && AbA > 0.0 && AbB > 0.0)
\r
513 // double MinAb = min(AbA, AbB);
\r
514 // double Ratio = MinAb/AbQ;
\r
515 // double t = Ratio - opt_abx;
\r
516 // // double Factor = 2.0/(1.0 + exp(-t));
\r
517 // double Factor = min(Ratio, opt_abx)/opt_abx;
\r
518 // if (opt_verbose)
\r
519 // Log("Score %.4f Ab factor %.4f >%s\n", Hit.Score, Factor, QLabel.c_str());
\r
520 // Hit.Score *= Factor;
\r
524 extern FILE *g_fUChimeAlns;
\r
525 if (g_fUChimeAlns != 0 && Hit.Div > 0.0)
\r
527 void WriteChimeHitX(FILE *f, const ChimeHit2 &Hit);
\r
528 WriteChimeHitX(g_fUChimeAlns, Hit);
\r
532 void AlignChime3(const string &Q3, const string &A3, const string &B3,
\r
533 const string &QLabel, const string &ALabel, const string &BLabel,
\r
537 AlignChimeLocal3(Q3, A3, B3, QLabel, ALabel, BLabel, Hit);
\r
539 AlignChimeGlobal3(Q3, A3, B3, QLabel, ALabel, BLabel, Hit);
\r
542 static void StripGaps(const byte *Seq, unsigned L, string &s)
\r
545 for (unsigned i = 0; i < L; ++i)
\r
553 static void StripGapsAlloc(const SeqData &SDIn, SeqData &SDOut)
\r
556 byte *s = myalloc(byte, SDIn.L);
\r
558 for (unsigned i = 0; i < SDIn.L; ++i)
\r
560 char c = SDIn.Seq[i];
\r
562 s[k++] = toupper(c);
\r
568 void AlignChime(const SeqData &QSD, const SeqData &ASD, const SeqData &BSD,
\r
569 const string &PathQA, const string &PathQB, ChimeHit2 &Hit)
\r
573 // AlignChimeLocal(QSD, ASD, BSD, PathQA, PathQB, Hit);
\r
580 Make3Way(QSD, ASD, BSD, PathQA, PathQB, Q3, A3, B3);
\r
582 AlignChime3(Q3, A3, B3, QSD.Label, ASD.Label, BSD.Label, Hit);
\r
585 void AlignChime3SDRealign(const SeqData &QSD3, const SeqData &ASD3, const SeqData &BSD3,
\r
591 StripGapsAlloc(QSD3, QSD);
\r
592 StripGapsAlloc(ASD3, ASD);
\r
593 StripGapsAlloc(BSD3, BSD);
\r
597 bool FoundQA = GlobalAlign(QSD, ASD, PathQA);
\r
598 bool FoundQB = GlobalAlign(QSD, BSD, PathQB);
\r
599 if (!FoundQA || !FoundQB)
\r
602 Hit.QLabel = QSD3.Label;
\r
606 AlignChime(QSD, ASD, BSD, PathQA, PathQB, Hit);
\r
608 myfree((void *) QSD.Seq);
\r
609 myfree((void *) ASD.Seq);
\r
610 myfree((void *) BSD.Seq);
\r
613 void AlignChime3SD(const SeqData &QSD3, const SeqData &ASD3, const SeqData &BSD3,
\r
618 AlignChime3SDRealign(QSD3, ASD3, BSD3, Hit);
\r
626 const unsigned ColCount = QSD3.L;
\r
627 asserta(ASD3.L == ColCount && BSD3.L == ColCount);
\r
629 Q3.reserve(ColCount);
\r
630 A3.reserve(ColCount);
\r
631 B3.reserve(ColCount);
\r
633 const byte *QS = QSD3.Seq;
\r
634 const byte *AS = ASD3.Seq;
\r
635 const byte *BS = BSD3.Seq;
\r
636 for (unsigned Col = 0; Col < ColCount; ++Col)
\r
638 byte q = toupper(QS[Col]);
\r
639 byte a = toupper(AS[Col]);
\r
640 byte b = toupper(BS[Col]);
\r
642 if (isgap(q) && isgap(a) && isgap(b))
\r
650 AlignChime3(Q3, A3, B3, QSD3.Label, ASD3.Label, BSD3.Label, Hit);
\r