#include "faidx.h"
#include "sam.h"
#include "kstring.h"
+#include "kaln.h"
void bam_fillmd1_core(bam1_t *b, char *ref, int is_equal, int max_nm)
{
bam_fillmd1_core(b, ref, is_equal, 0);
}
+// local realignment
+
+#define MIN_REF_LEN 10
+
+int bam_realn(bam1_t *b, const char *ref)
+{
+ int k, l_ref, score, n_cigar;
+ uint32_t *cigar = bam1_cigar(b);
+ uint8_t *s_ref = 0, *s_read = 0, *seq;
+ ka_param_t par;
+ bam1_core_t *c = &b->core;
+ // set S/W parameters
+ par = ka_param_blast; par.gap_open = 4; par.gap_ext = 1; par.gap_end_open = par.gap_end_ext = 0;
+ // calculate the length of the reference in the alignment
+ for (k = l_ref = 0; k < c->n_cigar; ++k) {
+ if ((cigar[k]&0xf) == BAM_CREF_SKIP) break; // do not do realignment if there is an `N' operation
+ if ((cigar[k]&0xf) == BAM_CMATCH || (cigar[k]&0xf) == BAM_CDEL)
+ l_ref += cigar[k]>>4;
+ }
+ if (k != c->n_cigar || l_ref < MIN_REF_LEN) return -1;
+ for (k = 0; k < l_ref; ++k)
+ if (ref[c->pos + k] == 0) return -1; // the read stands out of the reference
+ // allocate
+ s_ref = calloc(l_ref, 1);
+ s_read = calloc(c->l_qseq, 1);
+ for (k = 0, seq = bam1_seq(b); k < c->l_qseq; ++k)
+ s_read[k] = bam_nt16_nt4_table[bam1_seqi(seq, k)];
+ for (k = 0; k < l_ref; ++k)
+ s_ref[k] = bam_nt16_nt4_table[(int)bam_nt16_table[(int)ref[c->pos + k]]];
+ // do alignment
+ cigar = ka_global_core(s_ref, l_ref, s_read, c->l_qseq, &par, &score, &n_cigar);
+ if (score <= 0) { // realignment failed
+ free(cigar); free(s_ref); free(s_read);
+ }
+ // copy over the alignment
+ if (4 * (n_cigar - (int)c->n_cigar) + b->data_len > b->m_data) { // enlarge b->data
+ b->m_data = 4 * (n_cigar - (int)c->n_cigar) + b->data_len;
+ kroundup32(b->m_data);
+ b->data = realloc(b->data, b->m_data);
+ }
+ if (n_cigar != (int)c->n_cigar) // move data
+ memmove(b->data + c->l_qname + 4 * n_cigar, bam1_seq(b), b->data_len - c->l_qseq - 4 * c->n_cigar);
+ memcpy(bam1_cigar(b), cigar, n_cigar * 4);
+ c->n_cigar = n_cigar;
+ free(s_ref); free(s_read); free(cigar);
+ return 0;
+}
+
int bam_fillmd(int argc, char *argv[])
{
- int c, is_equal = 0, tid = -2, ret, len, is_bam_out, is_sam_in, is_uncompressed, max_nm = 0;
+ int c, is_equal = 0, tid = -2, ret, len, is_bam_out, is_sam_in, is_uncompressed, max_nm = 0, is_realn;
samfile_t *fp, *fpout = 0;
faidx_t *fai;
char *ref = 0, mode_w[8], mode_r[8];
bam1_t *b;
- is_bam_out = is_sam_in = is_uncompressed = 0;
+ is_bam_out = is_sam_in = is_uncompressed = is_realn = 0;
mode_w[0] = mode_r[0] = 0;
strcpy(mode_r, "r"); strcpy(mode_w, "w");
- while ((c = getopt(argc, argv, "eubSn:")) >= 0) {
+ while ((c = getopt(argc, argv, "reubSn:")) >= 0) {
switch (c) {
+ case 'r': is_realn = 1; break;
case 'e': is_equal = 1; break;
case 'b': is_bam_out = 1; break;
case 'u': is_uncompressed = is_bam_out = 1; break;
if (is_uncompressed) strcat(mode_w, "u");
if (optind + 1 >= argc) {
fprintf(stderr, "\n");
- fprintf(stderr, "Usage: samtools fillmd [-eubS] <aln.bam> <ref.fasta>\n\n");
+ fprintf(stderr, "Usage: samtools fillmd [-eubrS] <aln.bam> <ref.fasta>\n\n");
fprintf(stderr, "Options: -e change identical bases to '='\n");
fprintf(stderr, " -u uncompressed BAM output (for piping)\n");
fprintf(stderr, " -b compressed BAM output\n");
- fprintf(stderr, " -S the input is SAM with header\n\n");
+ fprintf(stderr, " -S the input is SAM with header\n");
+ fprintf(stderr, " -r read-independent local realignment\n\n");
return 1;
}
fp = samopen(argv[optind], mode_r, 0);
fprintf(stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n",
fp->header->target_name[tid]);
}
+ if (is_realn) bam_realn(b, ref);
if (ref) bam_fillmd1_core(b, ref, is_equal, max_nm);
}
samwrite(fpout, b);
bam1_t *b = p->b;
bam1_core_t *c = &b->core;
uint32_t x = c->pos, y = 0;
- int ret = 1, is_restart = 1;
+ int ret = 1, is_restart = 1, first_op, last_op;
if (c->flag&BAM_FUNMAP) return 0; // unmapped read
assert(x <= pos); // otherwise a bug
p->qpos = -1; p->indel = 0; p->is_del = p->is_head = p->is_tail = 0;
+ first_op = bam1_cigar(b)[0] & BAM_CIGAR_MASK;
+ last_op = bam1_cigar(b)[c->n_cigar-1] & BAM_CIGAR_MASK;
for (k = 0; k < c->n_cigar; ++k) {
int op = bam1_cigar(b)[k] & BAM_CIGAR_MASK; // operation
int l = bam1_cigar(b)[k] >> BAM_CIGAR_SHIFT; // length
if (x + l > pos) { // overlap with pos
p->indel = p->is_del = 0;
p->qpos = y + (pos - x);
- if (x == pos && is_restart) p->is_head = 1;
+ if (x == pos && is_restart && first_op != BAM_CDEL) p->is_head = 1;
if (x + l - 1 == pos) { // come to the end of a match
int has_next_match = 0;
unsigned i;
break;
} else if (opi == BAM_CSOFT_CLIP || opi == BAM_CREF_SKIP || opi == BAM_CHARD_CLIP) break;
}
- if (!has_next_match) p->is_tail = 1;
+ if (!has_next_match && last_op != BAM_CDEL) p->is_tail = 1;
if (k < c->n_cigar - 1 && has_next_match) { // there are additional operation(s)
uint32_t cigar = bam1_cigar(b)[k+1]; // next CIGAR
int op_next = cigar&BAM_CIGAR_MASK; // next CIGAR operation
}
x += l; y += l;
} else if (op == BAM_CDEL) { // then set ->is_del
+ if (k == 0 && x == pos) p->is_head = 1;
+ else if (x + l - 1 == pos && k == c->n_cigar - 1) p->is_tail = 1;
if (x + l > pos) {
p->indel = 0; p->is_del = 1;
- p->qpos = y + (pos - x);
+ p->qpos = y;
}
x += l;
} else if (op == BAM_CREF_SKIP) x += l;
-2, -2, -2, -2, -2
};
-ka_param_t ka_param_blast = { 5, 2, 2, aln_sm_blast, 5, 50 };
-ka_param_t ka_param_aa2aa = { 10, 2, 2, aln_sm_blosum62, 22, 50 };
+ka_param_t ka_param_blast = { 5, 2, 5, 2, aln_sm_blast, 5, 50 };
+ka_param_t ka_param_aa2aa = { 10, 2, 10, 2, aln_sm_blosum62, 22, 50 };
static uint32_t *ka_path2cigar32(const path_t *path, int path_len, int *n_cigar)
{
}
#define set_end_I(II, cur, p) \
{ \
- if (gap_end >= 0) { \
- if ((p)->M - gap_open > (p)->I) { \
+ if (gap_end_ext >= 0) { \
+ if ((p)->M - gap_end_open > (p)->I) { \
(cur)->It = FROM_M; \
- (II) = (p)->M - gap_open - gap_end; \
+ (II) = (p)->M - gap_end_open - gap_end_ext; \
} else { \
(cur)->It = FROM_I; \
- (II) = (p)->I - gap_end; \
+ (II) = (p)->I - gap_end_ext; \
} \
} else set_I(II, cur, p); \
}
}
#define set_end_D(DD, cur, p) \
{ \
- if (gap_end >= 0) { \
- if ((p)->M - gap_open > (p)->D) { \
+ if (gap_end_ext >= 0) { \
+ if ((p)->M - gap_end_open > (p)->D) { \
(cur)->Dt = FROM_M; \
- (DD) = (p)->M - gap_open - gap_end; \
+ (DD) = (p)->M - gap_end_open - gap_end_ext; \
} else { \
(cur)->Dt = FROM_D; \
- (DD) = (p)->D - gap_end; \
+ (DD) = (p)->D - gap_end_ext; \
} \
} else set_D(DD, cur, p); \
}
uint8_t type, ctype;
uint32_t *cigar = 0;
- int gap_open, gap_ext, gap_end, b;
+ int gap_open, gap_ext, gap_end_open, gap_end_ext, b;
int *score_matrix, N_MATRIX_ROW;
/* initialize some align-related parameters. just for compatibility */
gap_open = ap->gap_open;
gap_ext = ap->gap_ext;
- gap_end = ap->gap_end;
+ gap_end_open = ap->gap_end_open;
+ gap_end_ext = ap->gap_end_ext;
b = ap->band_width;
score_matrix = ap->matrix;
N_MATRIX_ROW = ap->row;