#include <string.h>
#include "bam.h"
#include "bam2bcf.h"
-#include "ksort.h"
#include "kaln.h"
#include "kprobaln.h"
#include "khash.h"
KHASH_SET_INIT_STR(rg)
+#include "ksort.h"
+KSORT_INIT_GENERIC(uint32_t)
+
#define MINUS_CONST 0x10000000
#define INDEL_WINDOW_SIZE 50
for (k = 0; k < c->n_cigar; ++k) {
int op = cigar[k] & BAM_CIGAR_MASK;
int l = cigar[k] >> BAM_CIGAR_SHIFT;
- if (op == BAM_CMATCH) {
+ if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
if (c->pos > tpos) return y;
if (x + l > tpos) {
*_tpos = tpos;
int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref,
const void *rghash)
{
- extern void ks_introsort_uint32_t(int, uint32_t*);
int i, s, j, k, t, n_types, *types, max_rd_len, left, right, max_ins, *score1, *score2, max_ref2;
int N, K, l_run, ref_type, n_alt;
char *inscns = 0, *ref2, *query, **ref_sample;
if (n_types == 1 || (double)n_alt / n_tot < bca->min_frac || n_alt < bca->min_support) { // then skip
free(aux); return -1;
}
+ if (n_types >= 64) {
+ free(aux);
+ if (bam_verbose >= 2)
+ fprintf(stderr, "[%s] excessive INDEL alleles at position %d. Skip the position.\n", __func__, pos + 1);
+ return -1;
+ }
types = (int*)calloc(n_types, sizeof(int));
t = 0;
types[t++] = aux[0] - MINUS_CONST;
for (t = 0; t < n_types; ++t)
if (types[t] == 0) break;
ref_type = t; // the index of the reference type (0)
- assert(n_types < 64);
}
{ // calculate left and right boundary
left = pos > INDEL_WINDOW_SIZE? pos - INDEL_WINDOW_SIZE : 0;
* sequence realignment helps to increase the power.
*/
{ // construct per-sample consensus
- int L = right - left + 1;
- uint32_t *cns;
+ int L = right - left + 1, max_i, max2_i;
+ uint32_t *cns, max, max2;
char *ref0, *r;
ref_sample = calloc(n, sizeof(void*));
cns = calloc(L, 4);
for (k = 0; k < b->core.n_cigar; ++k) {
int op = cigar[k]&0xf;
int j, l = cigar[k]>>4;
- if (op == BAM_CMATCH) {
+ if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
for (j = 0; j < l; ++j)
if (x + j >= left && x + j < right)
cns[x+j-left] += (bam1_seqi(seq, y+j) == ref0[x+j-left])? 1 : 0x10000;
}
}
// determine the consensus
- for (i = 0; i < right - left; ++i)
- r[i] = (cns[i] && (double)(cns[i]&0xffff) / ((cns[i]&0xffff)+(cns[i]>>16&0xffff)) < 0.7)? 15 : ref0[i];
+ for (i = 0; i < right - left; ++i) r[i] = ref0[i];
+ max = max2 = 0; max_i = max2_i = -1;
+ for (i = 0; i < right - left; ++i) {
+ if (cns[i]>>16 >= max>>16) max2 = max, max2_i = max_i, max = cns[i], max_i = i;
+ else if (cns[i]>>16 >= max2>>16) max2 = cns[i], max2_i = i;
+ }
+ if ((double)(max&0xffff) / ((max&0xffff) + (max>>16)) >= 0.7) max_i = -1;
+ if ((double)(max2&0xffff) / ((max2&0xffff) + (max2>>16)) >= 0.7) max2_i = -1;
+ if (max_i >= 0) r[max_i] = 15;
+ if (max2_i >= 0) r[max2_i] = 15;
// for (i = 0; i < right - left; ++i) fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], stderr); fputc('\n', stderr);
}
free(ref0); free(cns);
// align each read to ref2
for (i = 0; i < n_plp[s]; ++i, ++K) {
bam_pileup1_t *p = plp[s] + i;
- int qbeg, qend, tbeg, tend, sc;
+ int qbeg, qend, tbeg, tend, sc, kk;
uint8_t *seq = bam1_seq(p->b);
+ uint32_t *cigar = bam1_cigar(p->b);
+ if (p->b->core.flag&4) continue; // unmapped reads
+ // FIXME: the following CIGAR scan would be better moved outside; its cost is minor, though, because realignment is much slower anyway.
+ for (kk = 0; kk < p->b->core.n_cigar; ++kk)
+ if ((cigar[kk]&BAM_CIGAR_MASK) == BAM_CREF_SKIP) break;
+ if (kk < p->b->core.n_cigar) continue;
// FIXME: the following skips soft clips, but using them may be more sensitive.
// determine the start and end of sequences for alignment
qbeg = tpos2qpos(&p->b->core, bam1_cigar(p->b), left, 0, &tbeg);
indelQ2 = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ2 + .499);
// pick the smaller between indelQ1 and indelQ2
indelQ = indelQ1 < indelQ2? indelQ1 : indelQ2;
- p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ;
+ if (indelQ > 255) indelQ = 255;
+ if (seqQ > 255) seqQ = 255;
+ p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; // use 22 bits in total
sumq[sc[0]&0x3f] += indelQ < seqQ? indelQ : seqQ;
-// fprintf(stderr, "pos=%d read=%d:%d name=%s call=%d q=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ);
+// fprintf(stderr, "pos=%d read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ);
}
}
// determine bca->indel_types[] and bca->inscns