From 91142420b2c2b9cc218e5f1f36b557cae26f926f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 30 Oct 2010 03:17:22 +0000 Subject: [PATCH] * samtools-0.1.9-2 (r787) * Allow to set a maximum per-sample depth to reduce memory. However, BAQ computation is still applied to every read. The speed is not improved. --- bam.h | 6 ++++-- bam_pileup.c | 19 +++++++++++++++++-- bam_plcmd.c | 17 ++++++++++++++--- bamtk.c | 2 +- 4 files changed, 36 insertions(+), 8 deletions(-) diff --git a/bam.h b/bam.h index 4c8536f..4594ac4 100644 --- a/bam.h +++ b/bam.h @@ -508,6 +508,7 @@ extern "C" { const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp); const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp); void bam_plp_set_mask(bam_plp_t iter, int mask); + void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt); void bam_plp_reset(bam_plp_t iter); void bam_plp_destroy(bam_plp_t iter); @@ -516,6 +517,7 @@ extern "C" { bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data); void bam_mplp_destroy(bam_mplp_t iter); + void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt); int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp); /*! @typedef @@ -708,8 +710,8 @@ static inline bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc) { uint8_t *data = bdst->data; int m_data = bdst->m_data; // backup data and m_data - if (m_data < bsrc->m_data) { // double the capacity - m_data = bsrc->m_data; kroundup32(m_data); + if (m_data < bsrc->data_len) { // double the capacity + m_data = bsrc->data_len; kroundup32(m_data); data = (uint8_t*)realloc(data, m_data); } memcpy(data, bsrc->data, bsrc->data_len); // copy var-len data diff --git a/bam_pileup.c b/bam_pileup.c index a49f795..55e51e2 100644 --- a/bam_pileup.c +++ b/bam_pileup.c @@ -152,7 +152,7 @@ struct __bam_plp_t { mempool_t *mp; lbnode_t *head, *tail, *dummy; int32_t tid, pos, max_tid, max_pos; - int is_eof, flag_mask, max_plp, error; + int is_eof, flag_mask, max_plp, error, maxcnt; bam_pileup1_t *plp; // for the "auto" interface only bam1_t *b; @@ -169,6 +169,7 @@ bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data) iter->dummy = mp_alloc(iter->mp); iter->max_tid = iter->max_pos = -1; iter->flag_mask = BAM_DEF_MASK; + iter->maxcnt = 8000; if (func) { iter->func = func; iter->data = data; @@ -240,6 +241,7 @@ int bam_plp_push(bam_plp_t iter, const bam1_t *b) if (b) { if (b->core.tid < 0) return 0; if (b->core.flag & iter->flag_mask) return 0; + if (iter->tid == b->core.tid && iter->pos == b->core.pos && iter->mp->cnt > iter->maxcnt) return 0; bam_copy1(&iter->tail->b, b); iter->tail->beg = b->core.pos; iter->tail->end = bam_calend(&b->core, bam1_cigar(b)); iter->tail->s = g_cstate_null; iter->tail->s.end = iter->tail->end - 1; // initialize cstate_t @@ -267,7 +269,7 @@ const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_ const bam_pileup1_t *plp; if (iter->func == 0 || iter->error) { *_n_plp = -1; return 0; } if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp; - else { + else { // no pileup line can be obtained; read alignments *_n_plp = 0; if (iter->is_eof) return 0; while (iter->func(iter->data, iter->b) >= 0) { @@ -276,6 +278,7 @@ const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_ return 0; } if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp; + // otherwise no pileup line can be returned; read the next alignment. } bam_plp_push(iter, 0); if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp; @@ -302,6 +305,11 @@ void bam_plp_set_mask(bam_plp_t iter, int mask) iter->flag_mask = mask < 0? BAM_DEF_MASK : (BAM_FUNMAP | mask); } +void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt) +{ + iter->maxcnt = maxcnt; +} + /***************** * callback APIs * *****************/ @@ -389,6 +397,13 @@ bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data) return iter; } +void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt) +{ + int i; + for (i = 0; i < iter->n; ++i) + iter->iter[i]->maxcnt = maxcnt; +} + void bam_mplp_destroy(bam_mplp_t iter) { int i; diff --git a/bam_plcmd.c b/bam_plcmd.c index c23726e..f44627f 100644 --- a/bam_plcmd.c +++ b/bam_plcmd.c @@ -532,7 +532,7 @@ int bam_pileup(int argc, char *argv[]) #define MPLP_FMT_SP 0x200 typedef struct { - int max_mq, min_mq, flag, min_baseQ, capQ_thres; + int max_mq, min_mq, flag, min_baseQ, capQ_thres, max_depth; char *reg, *fn_pos; faidx_t *fai; kh_64_t *hash; @@ -601,7 +601,7 @@ static void group_smpl(mplp_pileup_t *m, bam_sample_t *sm, kstring_t *buf, static int mpileup(mplp_conf_t *conf, int n, char **fn) { mplp_aux_t **data; - int i, tid, pos, *n_plp, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid; + int i, tid, pos, *n_plp, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid, max_depth; const bam_pileup1_t **plp; bam_mplp_t iter; bam_header_t *h = 0; @@ -694,6 +694,14 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) } ref_tid = -1; ref = 0; iter = bam_mplp_init(n, mplp_func, (void**)data); + max_depth = conf->max_depth; + if (max_depth * sm->n > 1<<20) + fprintf(stderr, "(%s) Max depth is above 1M. Potential memory hog!\n", __func__); + if (max_depth * sm->n < 8000) { + max_depth = 8000 / sm->n; + fprintf(stderr, "<%s> Set max per-sample depth to %d\n", __func__, max_depth); + } + bam_mplp_set_maxcnt(iter, max_depth); while (bam_mplp_auto(iter, &tid, &pos, n_plp, plp) > 0) { if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested if (hash) { @@ -771,13 +779,15 @@ int bam_mpileup(int argc, char *argv[]) mplp.max_mq = 60; mplp.min_baseQ = 13; mplp.capQ_thres = 0; + mplp.max_depth = 250; mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN; - while ((c = getopt(argc, argv, "gf:r:l:M:q:Q:uaORC:BDS")) >= 0) { + while ((c = getopt(argc, argv, "gf:r:l:M:q:Q:uaORC:BDSd:")) >= 0) { switch (c) { case 'f': mplp.fai = fai_load(optarg); if (mplp.fai == 0) return 1; break; + case 'd': mplp.max_depth = atoi(optarg); break; case 'r': mplp.reg = strdup(optarg); break; case 'l': mplp.fn_pos = strdup(optarg); break; case 'g': mplp.flag |= MPLP_GLF; break; @@ -803,6 +813,7 @@ int bam_mpileup(int argc, char *argv[]) fprintf(stderr, " -M INT cap mapping quality at INT [%d]\n", mplp.max_mq); fprintf(stderr, " -Q INT min base quality [%d]\n", mplp.min_baseQ); fprintf(stderr, " -q INT filter out alignment with MQ smaller than INT [%d]\n", mplp.min_mq); + fprintf(stderr, " -d INT max per-sample depth [%d]\n", mplp.max_depth); fprintf(stderr, " -g generate BCF output\n"); fprintf(stderr, " -u do not compress BCF output\n"); fprintf(stderr, " -B disable BAQ computation\n"); diff --git a/bamtk.c b/bamtk.c index 741afee..4f881c0 100644 --- a/bamtk.c +++ b/bamtk.c @@ -9,7 +9,7 @@ #endif #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.1.9-1 (r786)" +#define PACKAGE_VERSION "0.1.9-2 (r787)" #endif int bam_taf2baf(int argc, char *argv[]); -- 2.39.2