From: Petr Danecek Date: Tue, 12 Mar 2013 13:24:16 +0000 (+0000) Subject: backup commit X-Git-Url: https://git.donarmstrong.com/?p=samtools.git;a=commitdiff_plain;h=6842e4470dcbd381d0893690b7d07344fd08e831 backup commit --- diff --git a/bam2bcf.c b/bam2bcf.c index b4f7fa6..0b41c6f 100644 --- a/bam2bcf.c +++ b/bam2bcf.c @@ -28,11 +28,14 @@ bcf_callaux_t *bcf_call_init(double theta, int min_baseQ) bca->min_frac = 0.002; bca->min_support = 1; bca->per_sample_flt = 0; - return bca; + bca->npos = 100; + bca->ref_pos = calloc(bca->npos, sizeof(int)); + bca->alt_pos = calloc(bca->npos, sizeof(int)); + return bca; } -int dist_from_end(const bam_pileup1_t *p) +static int get_position(const bam_pileup1_t *p, int *len) { int icig, n_tot_bases = 0, iread = 0, edist = p->qpos + 1; for (icig=0; icigb->core.n_cigar; icig++) @@ -58,9 +61,7 @@ int dist_from_end(const bam_pileup1_t *p) if ( iread<=p->qpos ) edist -= ncig; } } - // distance from either end - if ( edist > n_tot_bases/2. ) edist = n_tot_bases - edist + 1; - if ( edist<0 ) { fprintf(stderr,"uh: %d %d -> %d (%d)\n", p->qpos,n_tot_bases,edist,p->b->core.pos+1); exit(1); } + *len = n_tot_bases; return edist; } @@ -89,7 +90,6 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t bca->bases = (uint16_t*)realloc(bca->bases, 2 * bca->max_bases); } // fill the bases array - bca->read_len = 0; for (i = n = r->n_supp = 0; i < _n; ++i) { const bam_pileup1_t *p = pl + i; int q, b, mapQ, baseQ, is_diff, min_dist, seqQ; @@ -122,32 +122,20 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t if (min_dist > p->qpos) min_dist = p->qpos; if (min_dist > CAP_DIST) min_dist = CAP_DIST; r->anno[1<<2|is_diff<<1|0] += baseQ; - r->anno[1<<2|is_diff<<1|1] += baseQ * baseQ; // FIXME: signed int is not enough for thousands of samples + r->anno[1<<2|is_diff<<1|1] += baseQ * baseQ; r->anno[2<<2|is_diff<<1|0] += mapQ; - r->anno[2<<2|is_diff<<1|1] += mapQ * mapQ; // FIXME: signed int is not enough for thousands of samples + r->anno[2<<2|is_diff<<1|1] += mapQ * mapQ; r->anno[3<<2|is_diff<<1|0] += min_dist; r->anno[3<<2|is_diff<<1|1] += min_dist * min_dist; // collect read positions for ReadPosBias - int epos = dist_from_end(p); - int npos = p->b->core.l_qseq; - if ( bca->npos < npos ) - { - bca->ref_pos = realloc(bca->ref_pos, sizeof(int)*npos); - bca->alt_pos = realloc(bca->alt_pos, sizeof(int)*npos); - int j; - for (j=bca->npos; jref_pos[j] = 0; - for (j=bca->npos; jalt_pos[j] = 0; - bca->npos = npos; - } + int len, pos = get_position(p, &len); + int epos = (double)pos/len * bca->npos; if ( bam1_seqi(bam1_seq(p->b),p->qpos) == ref_base ) bca->ref_pos[epos]++; else bca->alt_pos[epos]++; - bca->read_len += npos; - //fprintf(stderr,"qpos=%d epos=%d fwd=%d var=%d len=%d %s\n", p->qpos, epos, p->b->core.flag&BAM_FREVERSE ? 0 : 1, bam1_seqi(bam1_seq(p->b),p->qpos) == ref_base ? 1 : 0, p->b->core.l_qseq, bam1_qname(p->b)); } - if ( n ) bca->read_len /= n; r->depth = n; r->ori_depth = ori_depth; // glfgen errmod_cal(bca->e, n, 5, bca->bases, r->p); @@ -170,9 +158,18 @@ void calc_ReadPosBias(bcf_callaux_t *bca, bcf_call_t *call) nref += bca->ref_pos[i]; nalt += bca->alt_pos[i]; U += nref*bca->alt_pos[i]; + } + double var = 0, avg = (double)(nref+nalt)/bca->npos; + for (i=0; inpos; i++) + { + double ediff = bca->ref_pos[i] + bca->alt_pos[i] - avg; + var += ediff*ediff; bca->ref_pos[i] = 0; bca->alt_pos[i] = 0; } + call->read_pos.avg = avg; + call->read_pos.var = sqrt(var/bca->npos); + call->read_pos.dp = nref+nalt; if ( !nref || !nalt ) { call->read_pos_bias = -1; @@ -230,7 +227,7 @@ float mean_diff_to_prob(float mdiff, int dp, int readlen) float m, v; if ( dp>=24 ) { - m = (int)readlen/8.; + m = readlen/8.; if (dp>100) dp = 100; v = 1.476/(0.182*pow(dp,0.514)); v = v*(readlen/100.); @@ -239,7 +236,7 @@ float mean_diff_to_prob(float mdiff, int dp, int readlen) { m = mv[dp][0]; v = mv[dp][1]; - m = (int)m*readlen/100.; + m = m*readlen/100.; v = v*readlen/100.; v *= 1.2; // allow more variability } @@ -253,10 +250,11 @@ void calc_vdb(bcf_callaux_t *bca, bcf_call_t *call) for (i=0; inpos; i++) { if ( !bca->alt_pos[i] ) continue; - dp += bca->alt_pos[i]; - mean_pos += bca->alt_pos[i]*i; + dp += bca->alt_pos[i]; + int j = inpos/2 ? i : bca->npos - i; + mean_pos += bca->alt_pos[i]*j; } - if ( !dp ) + if ( dp<2 ) { call->vdb = -1; return; @@ -265,10 +263,11 @@ void calc_vdb(bcf_callaux_t *bca, bcf_call_t *call) for (i=0; inpos; i++) { if ( !bca->alt_pos[i] ) continue; - mean_diff += bca->alt_pos[i] * fabs(i - mean_pos); + int j = inpos/2 ? i : bca->npos - i; + mean_diff += bca->alt_pos[i] * fabs(j - mean_pos); } mean_diff /= dp; - call->vdb = mean_diff_to_prob(mean_diff, dp, bca->read_len); + call->vdb = mean_diff_to_prob(mean_diff, dp, bca->npos); } /** @@ -417,11 +416,12 @@ int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bc if (i) kputc(',', &s); kputw(bc->anno[i], &s); } + ksprintf(&s,";RPS=%d,%f,%f", bc->read_pos.dp,bc->read_pos.avg,bc->read_pos.var); ksprintf(&s,";QS=%f,%f,%f,%f", bc->qsum[0],bc->qsum[1],bc->qsum[2],bc->qsum[3]); if (bc->vdb != -1) - ksprintf(&s, ";VDB=%.4f", bc->vdb); + ksprintf(&s, ";VDB=%e", bc->vdb); if (bc->read_pos_bias != -1 ) - ksprintf(&s, ";RPB=%.4f", bc->read_pos_bias); + ksprintf(&s, ";RPB=%e", bc->read_pos_bias); kputc('\0', &s); // FMT kputs("PL", &s); diff --git a/bam2bcf.h b/bam2bcf.h index 46f91f9..b2b1825 100644 --- a/bam2bcf.h +++ b/bam2bcf.h @@ -31,7 +31,7 @@ typedef struct __bcf_callaux_t { typedef struct { int depth, n_supp, ori_depth, qsum[4]; - int anno[16]; + unsigned int anno[16]; float p[25]; } bcf_callret1_t; @@ -40,10 +40,11 @@ typedef struct { float qsum[4]; int n, n_alleles, shift, ori_ref, unseen; int n_supp; // number of supporting non-reference reads - int anno[16], depth, ori_depth; + unsigned int anno[16], depth, ori_depth; uint8_t *PL; float vdb; // variant distance bias float read_pos_bias; + struct { float avg, var; int dp; } read_pos; } bcf_call_t; #ifdef __cplusplus diff --git a/bam2bcf_indel.c b/bam2bcf_indel.c index 9aea5f9..7ccd1cd 100644 --- a/bam2bcf_indel.c +++ b/bam2bcf_indel.c @@ -109,6 +109,9 @@ static inline int est_indelreg(int pos, const char *ref, int l, char *ins4) return max_i - pos; } +/* + * @n: number of samples + */ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref, const void *rghash) { diff --git a/bam_plcmd.c b/bam_plcmd.c index 3f5d424..e45c8c8 100644 --- a/bam_plcmd.c +++ b/bam_plcmd.c @@ -215,6 +215,10 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) } data[i]->conf = conf; h_tmp = bam_header_read(data[i]->fp); + if ( !h_tmp ) { + fprintf(stderr,"[%s] fail to read the header of %s\n", __func__, fn[i]); + exit(1); + } data[i]->h = i? h : h_tmp; // for i==0, "h" has not been set yet bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : h_tmp->text); rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list); diff --git a/bcftools/call1.c b/bcftools/call1.c index 240f0ee..eb58498 100644 --- a/bcftools/call1.c +++ b/bcftools/call1.c @@ -290,6 +290,8 @@ static void write_header(bcf_hdr_t *h) kputs("##INFO=\n", &str); if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); if (!strstr(str.s, "##INFO=\n", &str); if (!strstr(str.s, "##INFO=