From: Heng Li Date: Tue, 14 Sep 2010 20:32:33 +0000 (+0000) Subject: * aggressive gapped aligner is implemented in calmd. X-Git-Url: https://git.donarmstrong.com/?p=samtools.git;a=commitdiff_plain;h=addab0d26aa2f0ea4461bfcacf6304556786408a * aggressive gapped aligner is implemented in calmd. * distinguish gap_open and gap_end_open in banded alignment * make tview accepts alignment with heading and tailing D --- diff --git a/bam_md.c b/bam_md.c index 17b0a4a..63c25e9 100644 --- a/bam_md.c +++ b/bam_md.c @@ -5,6 +5,7 @@ #include "faidx.h" #include "sam.h" #include "kstring.h" +#include "kaln.h" void bam_fillmd1_core(bam1_t *b, char *ref, int is_equal, int max_nm) { @@ -108,19 +109,68 @@ void bam_fillmd1(bam1_t *b, char *ref, int is_equal) bam_fillmd1_core(b, ref, is_equal, 0); } +// local realignment + +#define MIN_REF_LEN 10 + +int bam_realn(bam1_t *b, const char *ref) +{ + int k, l_ref, score, n_cigar; + uint32_t *cigar = bam1_cigar(b); + uint8_t *s_ref = 0, *s_read = 0, *seq; + ka_param_t par; + bam1_core_t *c = &b->core; + // set S/W parameters + par = ka_param_blast; par.gap_open = 4; par.gap_ext = 1; par.gap_end_open = par.gap_end_ext = 0; + // calculate the length of the reference in the alignment + for (k = l_ref = 0; k < c->n_cigar; ++k) { + if ((cigar[k]&0xf) == BAM_CREF_SKIP) break; // do not do realignment if there is an `N' operation + if ((cigar[k]&0xf) == BAM_CMATCH || (cigar[k]&0xf) == BAM_CDEL) + l_ref += cigar[k]>>4; + } + if (k != c->n_cigar || l_ref < MIN_REF_LEN) return -1; + for (k = 0; k < l_ref; ++k) + if (ref[c->pos + k] == 0) return -1; // the read stands out of the reference + // allocate + s_ref = calloc(l_ref, 1); + s_read = calloc(c->l_qseq, 1); + for (k = 0, seq = bam1_seq(b); k < c->l_qseq; ++k) + s_read[k] = bam_nt16_nt4_table[bam1_seqi(seq, k)]; + for (k = 0; k < l_ref; ++k) + s_ref[k] = bam_nt16_nt4_table[(int)bam_nt16_table[(int)ref[c->pos + k]]]; + // do alignment + cigar = ka_global_core(s_ref, l_ref, s_read, c->l_qseq, &par, &score, &n_cigar); + if (score <= 0) { // realignment failed + free(cigar); free(s_ref); free(s_read); + } + // copy over the alignment + if (4 * (n_cigar - (int)c->n_cigar) + b->data_len > b->m_data) { // enlarge b->data + b->m_data = 4 * (n_cigar - (int)c->n_cigar) + b->data_len; + kroundup32(b->m_data); + b->data = realloc(b->data, b->m_data); + } + if (n_cigar != (int)c->n_cigar) // move data + memmove(b->data + c->l_qname + 4 * n_cigar, bam1_seq(b), b->data_len - c->l_qseq - 4 * c->n_cigar); + memcpy(bam1_cigar(b), cigar, n_cigar * 4); + c->n_cigar = n_cigar; + free(s_ref); free(s_read); free(cigar); + return 0; +} + int bam_fillmd(int argc, char *argv[]) { - int c, is_equal = 0, tid = -2, ret, len, is_bam_out, is_sam_in, is_uncompressed, max_nm = 0; + int c, is_equal = 0, tid = -2, ret, len, is_bam_out, is_sam_in, is_uncompressed, max_nm = 0, is_realn; samfile_t *fp, *fpout = 0; faidx_t *fai; char *ref = 0, mode_w[8], mode_r[8]; bam1_t *b; - is_bam_out = is_sam_in = is_uncompressed = 0; + is_bam_out = is_sam_in = is_uncompressed = is_realn = 0; mode_w[0] = mode_r[0] = 0; strcpy(mode_r, "r"); strcpy(mode_w, "w"); - while ((c = getopt(argc, argv, "eubSn:")) >= 0) { + while ((c = getopt(argc, argv, "reubSn:")) >= 0) { switch (c) { + case 'r': is_realn = 1; break; case 'e': is_equal = 1; break; case 'b': is_bam_out = 1; break; case 'u': is_uncompressed = is_bam_out = 1; break; @@ -135,11 +185,12 @@ int bam_fillmd(int argc, char *argv[]) if (is_uncompressed) strcat(mode_w, "u"); if (optind + 1 >= argc) { fprintf(stderr, "\n"); - fprintf(stderr, "Usage: samtools fillmd [-eubS] \n\n"); + fprintf(stderr, "Usage: samtools fillmd [-eubrS] \n\n"); fprintf(stderr, "Options: -e change identical bases to '='\n"); fprintf(stderr, " -u uncompressed BAM output (for piping)\n"); fprintf(stderr, " -b compressed BAM output\n"); - fprintf(stderr, " -S the input is SAM with header\n\n"); + fprintf(stderr, " -S the input is SAM with header\n"); + fprintf(stderr, " -r read-independent local realignment\n\n"); return 1; } fp = samopen(argv[optind], mode_r, 0); @@ -162,6 +213,7 @@ int bam_fillmd(int argc, char *argv[]) fprintf(stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n", fp->header->target_name[tid]); } + if (is_realn) bam_realn(b, ref); if (ref) bam_fillmd1_core(b, ref, is_equal, max_nm); } samwrite(fpout, b); diff --git a/bam_pileup.c b/bam_pileup.c index 3c41a16..db170e8 100644 --- a/bam_pileup.c +++ b/bam_pileup.c @@ -59,11 +59,13 @@ static inline int resolve_cigar(bam_pileup1_t *p, uint32_t pos) bam1_t *b = p->b; bam1_core_t *c = &b->core; uint32_t x = c->pos, y = 0; - int ret = 1, is_restart = 1; + int ret = 1, is_restart = 1, first_op, last_op; if (c->flag&BAM_FUNMAP) return 0; // unmapped read assert(x <= pos); // otherwise a bug p->qpos = -1; p->indel = 0; p->is_del = p->is_head = p->is_tail = 0; + first_op = bam1_cigar(b)[0] & BAM_CIGAR_MASK; + last_op = bam1_cigar(b)[c->n_cigar-1] & BAM_CIGAR_MASK; for (k = 0; k < c->n_cigar; ++k) { int op = bam1_cigar(b)[k] & BAM_CIGAR_MASK; // operation int l = bam1_cigar(b)[k] >> BAM_CIGAR_SHIFT; // length @@ -71,7 +73,7 @@ static inline int resolve_cigar(bam_pileup1_t *p, uint32_t pos) if (x + l > pos) { // overlap with pos p->indel = p->is_del = 0; p->qpos = y + (pos - x); - if (x == pos && is_restart) p->is_head = 1; + if (x == pos && is_restart && first_op != BAM_CDEL) p->is_head = 1; if (x + l - 1 == pos) { // come to the end of a match int has_next_match = 0; unsigned i; @@ -83,7 +85,7 @@ static inline int resolve_cigar(bam_pileup1_t *p, uint32_t pos) break; } else if (opi == BAM_CSOFT_CLIP || opi == BAM_CREF_SKIP || opi == BAM_CHARD_CLIP) break; } - if (!has_next_match) p->is_tail = 1; + if (!has_next_match && last_op != BAM_CDEL) p->is_tail = 1; if (k < c->n_cigar - 1 && has_next_match) { // there are additional operation(s) uint32_t cigar = bam1_cigar(b)[k+1]; // next CIGAR int op_next = cigar&BAM_CIGAR_MASK; // next CIGAR operation @@ -99,9 +101,11 @@ static inline int resolve_cigar(bam_pileup1_t *p, uint32_t pos) } x += l; y += l; } else if (op == BAM_CDEL) { // then set ->is_del + if (k == 0 && x == pos) p->is_head = 1; + else if (x + l - 1 == pos && k == c->n_cigar - 1) p->is_tail = 1; if (x + l > pos) { p->indel = 0; p->is_del = 1; - p->qpos = y + (pos - x); + p->qpos = y; } x += l; } else if (op == BAM_CREF_SKIP) x += l; diff --git a/kaln.c b/kaln.c index 9fa40d0..c68e80c 100644 --- a/kaln.c +++ b/kaln.c @@ -72,8 +72,8 @@ int aln_sm_blast[] = { -2, -2, -2, -2, -2 }; -ka_param_t ka_param_blast = { 5, 2, 2, aln_sm_blast, 5, 50 }; -ka_param_t ka_param_aa2aa = { 10, 2, 2, aln_sm_blosum62, 22, 50 }; +ka_param_t ka_param_blast = { 5, 2, 5, 2, aln_sm_blast, 5, 50 }; +ka_param_t ka_param_aa2aa = { 10, 2, 10, 2, aln_sm_blosum62, 22, 50 }; static uint32_t *ka_path2cigar32(const path_t *path, int path_len, int *n_cigar) { @@ -141,13 +141,13 @@ static uint32_t *ka_path2cigar32(const path_t *path, int path_len, int *n_cigar) } #define set_end_I(II, cur, p) \ { \ - if (gap_end >= 0) { \ - if ((p)->M - gap_open > (p)->I) { \ + if (gap_end_ext >= 0) { \ + if ((p)->M - gap_end_open > (p)->I) { \ (cur)->It = FROM_M; \ - (II) = (p)->M - gap_open - gap_end; \ + (II) = (p)->M - gap_end_open - gap_end_ext; \ } else { \ (cur)->It = FROM_I; \ - (II) = (p)->I - gap_end; \ + (II) = (p)->I - gap_end_ext; \ } \ } else set_I(II, cur, p); \ } @@ -163,13 +163,13 @@ static uint32_t *ka_path2cigar32(const path_t *path, int path_len, int *n_cigar) } #define set_end_D(DD, cur, p) \ { \ - if (gap_end >= 0) { \ - if ((p)->M - gap_open > (p)->D) { \ + if (gap_end_ext >= 0) { \ + if ((p)->M - gap_end_open > (p)->D) { \ (cur)->Dt = FROM_M; \ - (DD) = (p)->M - gap_open - gap_end; \ + (DD) = (p)->M - gap_end_open - gap_end_ext; \ } else { \ (cur)->Dt = FROM_D; \ - (DD) = (p)->D - gap_end; \ + (DD) = (p)->D - gap_end_ext; \ } \ } else set_D(DD, cur, p); \ } @@ -195,13 +195,14 @@ uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const uint8_t type, ctype; uint32_t *cigar = 0; - int gap_open, gap_ext, gap_end, b; + int gap_open, gap_ext, gap_end_open, gap_end_ext, b; int *score_matrix, N_MATRIX_ROW; /* initialize some align-related parameters. just for compatibility */ gap_open = ap->gap_open; gap_ext = ap->gap_ext; - gap_end = ap->gap_end; + gap_end_open = ap->gap_end_open; + gap_end_ext = ap->gap_end_ext; b = ap->band_width; score_matrix = ap->matrix; N_MATRIX_ROW = ap->row; diff --git a/kaln.h b/kaln.h index b04d8cc..4b6869c 100644 --- a/kaln.h +++ b/kaln.h @@ -33,7 +33,8 @@ typedef struct { int gap_open; int gap_ext; - int gap_end; + int gap_end_open; + int gap_end_ext; int *matrix; int row; @@ -50,6 +51,6 @@ extern "C" { } #endif -extern ka_param_t ka_param_blast; /* = { 5, 2, 2, aln_sm_blast, 5, 50 }; */ +extern ka_param_t ka_param_blast; /* = { 5, 2, 5, 2, aln_sm_blast, 5, 50 }; */ #endif