From: Heng Li Date: Mon, 8 Nov 2010 03:23:16 +0000 (+0000) Subject: prepare for the indel caller. It is not ready yet. X-Git-Url: https://git.donarmstrong.com/?p=samtools.git;a=commitdiff_plain;h=0bb4884ffb1f4c9485efee763dd63b575f2c1381 prepare for the indel caller. It is not ready yet. --- diff --git a/Makefile b/Makefile index 7ab972e..eeae213 100644 --- a/Makefile +++ b/Makefile @@ -7,7 +7,7 @@ LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \ $(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o AOBJS= bam_tview.o bam_maqcns.o bam_plcmd.o sam_view.o \ bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \ - bamtk.o kaln.o kprobaln.o bam2bcf.o errmod.o sample.o + bamtk.o kaln.o kprobaln.o bam2bcf.o bam2bcf_indel.o errmod.o sample.o PROG= samtools INCLUDES= -I. SUBDIRS= . bcftools misc diff --git a/bam.h b/bam.h index 4594ac4..eef2ea9 100644 --- a/bam.h +++ b/bam.h @@ -495,7 +495,7 @@ extern "C" { bam1_t *b; int32_t qpos; int indel, level; - uint32_t is_del:1, is_head:1, is_tail:1, is_refskip:1; + uint32_t is_del:1, is_head:1, is_tail:1, is_refskip:1, aux:28; } bam_pileup1_t; typedef int (*bam_plp_auto_f)(void *data, bam1_t *b); diff --git a/bam2bcf.c b/bam2bcf.c index 509761d..fa9d7a4 100644 --- a/bam2bcf.c +++ b/bam2bcf.c @@ -14,18 +14,13 @@ extern void ks_introsort_uint32_t(size_t n, uint32_t a[]); #define CAP_DIST 25 -struct __bcf_callaux_t { - int max_bases, capQ, min_baseQ; - uint16_t *bases; - errmod_t *e; -}; - bcf_callaux_t *bcf_call_init(double theta, int min_baseQ) { bcf_callaux_t *bca; if (theta <= 0.) theta = CALL_DEFTHETA; bca = calloc(1, sizeof(bcf_callaux_t)); bca->capQ = 60; + bca->openQ = 40; bca->extQ = 20; bca->tandemQ = 70; bca->min_baseQ = min_baseQ; bca->e = errmod_init(1. - theta); return bca; diff --git a/bam2bcf.h b/bam2bcf.h index 4de743a..94207d0 100644 --- a/bam2bcf.h +++ b/bam2bcf.h @@ -2,10 +2,20 @@ #define BAM2BCF_H #include +#include "errmod.h" #include "bcftools/bcf.h" -struct __bcf_callaux_t; -typedef struct __bcf_callaux_t bcf_callaux_t; +#define B2B_INDEL_NULL 10000 + +typedef struct __bcf_callaux_t { + int capQ, min_baseQ; + int openQ, extQ, tandemQ; + // for internal uses + int max_bases; + int indel_types[4]; + uint16_t *bases; + errmod_t *e; +} bcf_callaux_t; typedef struct { int depth, qsum[4]; @@ -29,6 +39,7 @@ extern "C" { int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base /*4-bit*/, bcf_callaux_t *bca, bcf_callret1_t *r); int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, bcf_call_t *call); int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bcr, int is_SP); + int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref); #ifdef __cplusplus } diff --git a/bam2bcf_indel.c b/bam2bcf_indel.c new file mode 100644 index 0000000..7359cc8 --- /dev/null +++ b/bam2bcf_indel.c @@ -0,0 +1,262 @@ +#include +#include "bam.h" +#include "bam2bcf.h" +#include "ksort.h" +#include "kaln.h" + +#define INDEL_DEBUG + +#define MINUS_CONST 0x10000000 +#define INDEL_WINDOW_SIZE 50 +#define INDEL_BAD_SCORE 10000 + +static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, int is_left, int32_t *_tpos) +{ + int k, x = c->pos, y = 0, last_y = 0; + *_tpos = c->pos; + for (k = 0; k < c->n_cigar; ++k) { + int op = cigar[k] & BAM_CIGAR_MASK; + int l = cigar[k] >> BAM_CIGAR_SHIFT; + if (op == BAM_CMATCH) { + if (c->pos > tpos) return y; + if (x + l > tpos) { + *_tpos = tpos; + return y + (tpos - x); + } + x += l; y += l; + last_y = y; + } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l; + else if (op == BAM_CDEL || op == BAM_CREF_SKIP) { + if (x + l > tpos) { + *_tpos = is_left? x : x + l; + return y; + } + x += l; + } + } + *_tpos = x; + return last_y; +} +// l is the relative gap length and l_run is the length of the homopolymer on the reference +static inline int est_seqQ(const bcf_callaux_t *bca, int l, int l_run) +{ + int q, qh; + q = bca->openQ + bca->extQ * (abs(l) - 1); + qh = l_run >= 3? (int)(bca->tandemQ * (double)l / l_run + .499) : 1000; + return q < qh? q : qh; +} + +int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref) +{ + extern void ks_introsort_uint32_t(int, uint32_t*); + int i, s, j, k, t, n_types, *types, max_rd_len, left, right, max_ins, *score, N, K, l_run, ref_type; + char *inscns = 0, *ref2, *query; + if (ref == 0 || bca == 0) return -1; + // determine if there is a gap + for (s = 0; s < n; ++s) { + for (i = 0; i < n_plp[s]; ++i) + if (plp[s][i].indel != 0) break; + if (i < n_plp[s]) break; + } + if (s == n) return -1; // there is no indel at this position. + { // find out how many types of indels are present + int m; + uint32_t *aux; + aux = calloc(n + 1, 4); + m = max_rd_len = 0; + aux[m++] = MINUS_CONST; // zero indel is always a type + for (s = N = 0; s < n; ++s) { + N += n_plp[s]; // N is the total number of reads + for (i = 0; i < n_plp[s]; ++i) { + const bam_pileup1_t *p = plp[s] + i; + if (p->indel != 0) + aux[m++] = MINUS_CONST + p->indel; + j = bam_cigar2qlen(&p->b->core, bam1_cigar(p->b)); + if (j > max_rd_len) max_rd_len = j; + } + } + ks_introsort(uint32_t, m, aux); + // squeeze out identical types + for (i = 1, n_types = 1; i < m; ++i) + if (aux[i] != aux[i-1]) ++n_types; + assert(n_types > 1); // there must at least one type of non-reference indel + types = (int*)calloc(n_types, sizeof(int)); + t = 0; + types[t++] = aux[0] - MINUS_CONST; + for (i = 1; i < m; ++i) + if (aux[i] != aux[i-1]) + types[t++] = aux[i] - MINUS_CONST; + free(aux); + for (t = 0; t < n_types; ++t) + if (types[t] == 0) break; + ref_type = t; // the index of the reference type (0) + assert(n_types < 64); + } + { // calculate left and right boundary + left = pos > INDEL_WINDOW_SIZE? pos - INDEL_WINDOW_SIZE : 0; + right = pos + INDEL_WINDOW_SIZE; + if (types[0] < 0) right -= types[0]; + // in case the alignments stand out the reference + for (i = pos; i < right; ++i) + if (ref[i] == 0) break; + right = i; + } + { // the length of the homopolymer run around the current position + int c = bam_nt16_table[(int)ref[pos + 1]]; + if (c == 15) l_run = 1; + else { + for (i = pos + 2; ref[i]; ++i) + if (bam_nt16_table[(int)ref[i]] != c) break; + l_run = i; + for (i = pos; i >= 0; --i) + if (bam_nt16_table[(int)ref[i]] != c) break; + l_run -= i + 1; + } + } + // construct the consensus sequence + max_ins = types[n_types - 1]; // max_ins is at least 0 + if (max_ins > 0) { + int *inscns_aux = calloc(4 * n_types * max_ins, sizeof(int)); + // count the number of occurrences of each base at each position for each type of insertion + for (t = 0; t < n_types; ++t) { + if (types[t] > 0) { + for (s = 0; s < n; ++s) { + for (i = 0; i < n_plp[s]; ++i) { + bam_pileup1_t *p = plp[s] + i; + if (p->indel == types[t]) { + uint8_t *seq = bam1_seq(p->b); + for (k = 1; k <= p->indel; ++k) { + int c = bam_nt16_nt4_table[bam1_seqi(seq, p->qpos + k)]; + if (c < 4) ++inscns_aux[(t*max_ins+(k-1))*4 + c]; + } + } + } + } + } + } + // use the majority rule to construct the consensus + inscns = calloc(n_types * max_ins, 1); + for (t = 0; t < n_types; ++t) { + for (j = 0; j < types[t]; ++j) { + int max = 0, max_k = -1, *ia = &inscns_aux[(t*max_ins+j)*4]; + for (k = 0; k < 4; ++k) + if (ia[k] > max) + max = ia[k], max_k = k; + inscns[t*max_ins + j] = max? max_k : 4; + } + } + free(inscns_aux); + } + // compute the likelihood given each type of indel for each read + ref2 = calloc(right - left + max_ins + 2, 1); + query = calloc(right - left + max_rd_len + max_ins + 2, 1); + score = calloc(N * n_types, sizeof(int)); + for (t = 0; t < n_types; ++t) { + int l; + ka_param2_t ap = ka_param2_qual; + ap.band_width = abs(types[t]) + 3; + // write ref2 + for (k = 0, j = left; j <= pos; ++j) + ref2[k++] = bam_nt16_nt4_table[bam_nt16_table[(int)ref[j]]]; + if (types[t] <= 0) j += -types[t]; + else for (l = 0; l < types[t]; ++l) + ref2[k++] = inscns[t*max_ins + l]; + if (types[0] < 0) { // mask deleted sequences to avoid a particular error in the model. + int jj, tmp = types[t] >= 0? -types[0] : -types[0] + types[t]; + for (jj = 0; jj < tmp && j < right && ref[j]; ++jj, ++j) + ref2[k++] = 4; + } + for (; j < right && ref[j]; ++j) + ref2[k++] = bam_nt16_nt4_table[bam_nt16_table[(int)ref[j]]]; + if (j < right) right = j; + // align each read to ref2 + for (s = K = 0; s < n; ++s) { + for (i = 0; i < n_plp[s]; ++i, ++K) { + bam_pileup1_t *p = plp[s] + i; + int qbeg, qend, tbeg, tend, sc; + uint8_t *seq = bam1_seq(p->b); + // determine the start and end of sequences for alignment + qbeg = tpos2qpos(&p->b->core, bam1_cigar(p->b), left, 0, &tbeg); + qend = tpos2qpos(&p->b->core, bam1_cigar(p->b), right, 1, &tend); + assert(tbeg >= left); + // write the query sequence + for (l = qbeg; l < qend; ++l) + query[l - qbeg] = bam_nt16_nt4_table[bam1_seqi(seq, l)]; + // do alignment; this takes most of computing time for indel calling +#ifdef INDEL_DEBUG + for (sc = 0; sc < tend - tbeg + types[t]; ++sc) + fputc("ACGTN"[(int)ref2[tbeg-left+sc]], stderr); + fputc('\n', stderr); + for (sc = 0; sc < qend - qbeg; ++sc) fputc("ACGTN"[(int)query[qbeg + sc]], stderr); + fputc('\n', stderr); +#endif + sc = ka_global_score((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]), + (uint8_t*)query + qbeg, qend - qbeg, &ap); +#ifdef INDEL_DEBUG + fprintf(stderr, "pos=%d type=%d read=%d:%d name=%s score=%d\n", pos, types[t], s, i, bam1_qname(p->b), sc); +#endif + score[K*n_types + t] = -sc; + } + } + } + free(ref2); free(query); + { // choose the top 4 indel types; reference must be included + } + { // compute indelQ + int *sc, tmp, *sumq; + sc = alloca(n_types * sizeof(int)); + sumq = alloca(n_types * sizeof(int)); + memset(sumq, 0, sizeof(int) * n_types); + for (s = K = 0; s < n; ++s) { + for (i = 0; i < n_plp[s]; ++i, ++K) { + bam_pileup1_t *p = plp[s] + i; + int *sct = &score[K*n_types], indelQ; + for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t; + for (t = 1; t < n_types; ++t) // insertion sort + for (j = t; j > 0 && sc[j] < sc[j-1]; --j) + tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp; + /* errmod_cal() assumes that if the call is wrong, the + * likelihoods of other events are equal. This is about + * right for substitutions, but is not desired for + * indels. To reuse errmod_cal(), I have to make + * compromise for multi-allelic indels. + */ + if ((sc[0]&0x3f) == ref_type) { + indelQ = (sc[1]>>6) - (sc[0]>>6); + tmp = est_seqQ(bca, types[sc[1]&0x3f], l_run); + } else { + for (t = 0; t < n_types; ++t) // look for the reference type + if ((sc[t]&0x3f) == ref_type) break; + indelQ = (sc[t]>>6) - (sc[0]>>6); + tmp = est_seqQ(bca, types[sc[0]&0x3f], l_run); + } + if (indelQ > tmp) indelQ = tmp; + if (indelQ > p->b->core.qual) indelQ = p->b->core.qual; + if (indelQ > bca->capQ) indelQ = bca->capQ; + p->aux = (sc[0]&0x3f)<<8 | indelQ; + sumq[sc[0]&0x3f] += indelQ; + } + } + // determine bca->indel_types[] + for (t = 0; t < n_types; ++t) + sumq[t] = sumq[t]<<6 | t; + for (t = 1; t < n_types; ++t) // insertion sort + for (j = t; j > 0 && sumq[j] < sumq[j-1]; --j) + tmp = sumq[j], sumq[j] = sumq[j-1], sumq[j-1] = tmp; + for (t = 0; t < n_types; ++t) // look for the reference type + if ((sumq[t]&0x3f) == ref_type) break; + if (t) { // then move the reference type to the first + tmp = sumq[t]; + for (; t > 0; --t) sumq[t] = sumq[t-1]; + sumq[0] = tmp; + } + for (t = 0; t < 4; ++t) bca->indel_types[t] = B2B_INDEL_NULL; + for (t = 0; t < 4 && t < n_types; ++t) + bca->indel_types[t] = types[sumq[t]&0x3f]; + } + // FIXME: to set the inserted sequence + free(score); + // free + free(types); free(inscns); + return 0; +} diff --git a/bam_plcmd.c b/bam_plcmd.c index a3e6aeb..1b15aae 100644 --- a/bam_plcmd.c +++ b/bam_plcmd.c @@ -729,6 +729,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0, (conf->flag&MPLP_FMT_SP)); bcf_write(bp, bh, b); bcf_destroy(b); +// bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref); } else { printf("%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N'); for (i = 0; i < n; ++i) { diff --git a/kaln.c b/kaln.c index 4cba98d..01efe2e 100644 --- a/kaln.c +++ b/kaln.c @@ -73,9 +73,19 @@ int aln_sm_blast[] = { -2, -2, -2, -2, -2 }; +int aln_sm_qual[] = { + 0, -23, -23, -23, 0, + -23, 0, -23, -23, 0, + -23, -23, 0, -23, 0, + -23, -23, -23, 0, 0, + 0, 0, 0, 0, 0 +}; + ka_param_t ka_param_blast = { 5, 2, 5, 2, aln_sm_blast, 5, 50 }; ka_param_t ka_param_aa2aa = { 10, 2, 10, 2, aln_sm_blosum62, 22, 50 }; +ka_param2_t ka_param2_qual = { 35, 5, 35, 5, 35, 5, 0, 0, aln_sm_qual, 5, 50 }; + static uint32_t *ka_path2cigar32(const path_t *path, int path_len, int *n_cigar) { int i, n; @@ -176,7 +186,7 @@ static uint32_t *ka_path2cigar32(const path_t *path, int path_len, int *n_cigar) } typedef struct { - uint8_t Mt:3, It:2, Dt:2; + uint8_t Mt:3, It:2, Dt:3; } dpcell_t; typedef struct { @@ -208,7 +218,7 @@ uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const score_matrix = ap->matrix; N_MATRIX_ROW = ap->row; - *n_cigar = 0; + if (n_cigar) *n_cigar = 0; if (len1 == 0 || len2 == 0) return 0; /* calculate b1 and b2 */ @@ -370,3 +380,107 @@ uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const return cigar; } + +typedef struct { + int M, I, D; +} score_aux_t; + +#define MINUS_INF -0x40000000 + +// matrix: len2 rows and len1 columns +int ka_global_score(const uint8_t *_seq1, int len1, const uint8_t *_seq2, int len2, const ka_param2_t *ap) +{ + +#define __score_aux(_p, _q0, _sc, _io, _ie, _do, _de) { \ + int t1, t2; \ + score_aux_t *_q; \ + _q = _q0; \ + _p->M = _q->M >= _q->I? _q->M : _q->I; \ + _p->M = _p->M >= _q->D? _p->M : _q->D; \ + _p->M += (_sc); \ + ++_q; t1 = _q->M - _io - _ie; t2 = _q->I - _ie; _p->I = t1 >= t2? t1 : t2; \ + _q = _p-1; t1 = _q->M - _do - _de; t2 = _q->D - _de; _p->D = t1 >= t2? t1 : t2; \ + } + + int i, j, bw, scmat_size = ap->row, *scmat = ap->matrix, ret; + const uint8_t *seq1, *seq2; + score_aux_t *curr, *last, *swap; + bw = abs(len1 - len2) + ap->band_width; + i = len1 > len2? len1 : len2; + if (bw > i + 1) bw = i + 1; + seq1 = _seq1 - 1; seq2 = _seq2 - 1; + curr = calloc(len1 + 2, sizeof(score_aux_t)); + last = calloc(len1 + 2, sizeof(score_aux_t)); + { // the zero-th row + int x, end = len1; + score_aux_t *p; + j = 0; + x = j + bw; end = len1 < x? len1 : x; // band end + p = curr; + p->M = 0; p->I = p->D = MINUS_INF; + for (i = 1, p = &curr[1]; i <= end; ++i, ++p) + p->M = p->I = MINUS_INF, p->D = -(ap->edo + ap->ede * i); + p->M = p->I = p->D = MINUS_INF; + swap = curr; curr = last; last = swap; + } + for (j = 1; j < len2; ++j) { + int x, beg = 0, end = len1, *scrow, col_end; + score_aux_t *p; + x = j - bw; beg = 0 > x? 0 : x; // band start + x = j + bw; end = len1 < x? len1 : x; // band end + if (beg == 0) { // from zero-th column + p = curr; + p->M = p->D = MINUS_INF; p->I = -(ap->eio + ap->eie * j); + ++beg; // then beg = 1 + } + scrow = scmat + seq2[j] * scmat_size; + if (end == len1) col_end = 1, --end; + else col_end = 0; + for (i = beg, p = &curr[beg]; i <= end; ++i, ++p) + __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->iio, ap->iie, ap->ido, ap->ide); + if (col_end) { + __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->eio, ap->eie, ap->ido, ap->ide); + ++p; + } + p->M = p->I = p->D = MINUS_INF; +// for (i = 0; i <= len1; ++i) printf("(%d,%d,%d) ", curr[i].M, curr[i].I, curr[i].D); putchar('\n'); + swap = curr; curr = last; last = swap; + } + { // the last row + int x, beg = 0, *scrow; + score_aux_t *p; + j = len2; + x = j - bw; beg = 0 > x? 0 : x; // band start + if (beg == 0) { // from zero-th column + p = curr; + p->M = p->D = MINUS_INF; p->I = -(ap->eio + ap->eie * j); + ++beg; // then beg = 1 + } + scrow = scmat + seq2[j] * scmat_size; + for (i = beg, p = &curr[beg]; i < len1; ++i, ++p) + __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->iio, ap->iie, ap->edo, ap->ede); + __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->eio, ap->eie, ap->edo, ap->ede); +// for (i = 0; i <= len1; ++i) printf("(%d,%d,%d) ", curr[i].M, curr[i].I, curr[i].D); putchar('\n'); + } + ret = curr[len1].M >= curr[len1].I? curr[len1].M : curr[len1].I; + ret = ret >= curr[len1].D? ret : curr[len1].D; + free(curr); free(last); + return ret; +} + +#ifdef _MAIN +int main(int argc, char *argv[]) +{ +// int len1 = 35, len2 = 35; +// uint8_t *seq1 = (uint8_t*)"\0\0\3\3\2\0\0\0\1\0\2\1\2\1\3\2\3\3\3\0\2\3\2\1\1\3\3\3\2\3\3\1\0\0\1"; +// uint8_t *seq2 = (uint8_t*)"\0\0\3\3\2\0\0\0\1\0\2\1\2\1\3\2\3\3\3\0\2\3\2\1\1\3\3\3\2\3\3\1\0\1\0"; + int len1 = 4, len2 = 4; + uint8_t *seq1 = (uint8_t*)"\1\0\0\1"; + uint8_t *seq2 = (uint8_t*)"\1\0\1\0"; + int sc; +// ka_global_core(seq1, 2, seq2, 1, &ka_param_qual, &sc, 0); + sc = ka_global_score(seq1, len1, seq2, len2, &ka_param2_qual); + printf("%d\n", sc); + return 0; +} +#endif diff --git a/kaln.h b/kaln.h index b25dccb..1ece132 100644 --- a/kaln.h +++ b/kaln.h @@ -41,16 +41,27 @@ typedef struct { int band_width; } ka_param_t; +typedef struct { + int iio, iie, ido, ide; + int eio, eie, edo, ede; + int *matrix; + int row; + int band_width; +} ka_param2_t; + #ifdef __cplusplus extern "C" { #endif uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const ka_param_t *ap, int *_score, int *n_cigar); + int ka_global_score(const uint8_t *_seq1, int len1, const uint8_t *_seq2, int len2, const ka_param2_t *ap); #ifdef __cplusplus } #endif extern ka_param_t ka_param_blast; /* = { 5, 2, 5, 2, aln_sm_blast, 5, 50 }; */ +extern ka_param_t ka_param_qual; // only use this for global alignment!!! +extern ka_param2_t ka_param2_qual; // only use this for global alignment!!! #endif