#include "kstring.h"
#include "sam_header.h"
-int bam_is_be = 0;
+int bam_is_be = 0, bam_verbose = 2;
char *bam_flag2char_table = "pPuUrR12sfd\0\0\0\0\0";
/**************************
@copyright Genome Research Ltd.
*/
-#define BAM_VERSION "0.1.14 (r933:176)"
+#define BAM_VERSION "0.1.14 (r947:199)"
#include <stdint.h>
#include <stdlib.h>
*/
extern int bam_is_be;
+/*!
+ @abstract Verbose level between 0 and 3; 0 is supposed to disable all
+ debugging information, though this may not have been implemented.
+ */
+extern int bam_verbose;
+
/*! @abstract Table for converting a nucleotide character to the 4-bit encoding. */
extern unsigned char bam_nt16_table[256];
while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0) { // come to the next covered position
if (pos < beg || pos >= end) continue; // out of range; skip
if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; // not in BED; skip
- printf("%s\t%d", h->target_name[tid], pos+1);
+ fputs(h->target_name[tid], stdout); printf("\t%d", pos+1);
for (i = 0; i < n; ++i) {
int j, m = 0;
for (j = 0; j < n_plp[i]; ++j) {
if (id < 0) id = bam_smpl_rg2smid(sm, fn[i], 0, buf);
if (id < 0 || id >= m->n) {
assert(q); // otherwise a bug
- fprintf(stderr, "[%s] Read group %s used in file %s but not defined in the header.\n", __func__, (char*)q+1, fn[i]);
+ fprintf(stderr, "[%s] Read group %s used in file %s but absent from the header or an alignment missing read group.\n", __func__, (char*)q+1, fn[i]);
exit(1);
}
if (m->n_plp[id] == m->m_plp[id]) {
fprintf(stderr, "(%s) Max depth is above 1M. Potential memory hog!\n", __func__);
if (max_depth * sm->n < 8000) {
max_depth = 8000 / sm->n;
- fprintf(stderr, "<%s> Set max per-sample depth to %d\n", __func__, max_depth);
+ fprintf(stderr, "<%s> Set max per-file depth to %d\n", __func__, max_depth);
}
max_indel_depth = conf->max_indel_depth * sm->n;
bam_mplp_set_maxcnt(iter, max_depth);
void bed_destroy(void *_h);
int bed_overlap(const void *_h, const char *chr, int beg, int end);
+static double test_hwe(const double g[3])
+{
+ extern double kf_gammaq(double p, double x);
+ double fexp, chi2, f[3], n;
+ int i;
+ n = g[0] + g[1] + g[2];
+ fexp = (2. * g[2] + g[1]) / (2. * n);
+ if (fexp > 1. - 1e-10) fexp = 1. - 1e-10;
+ if (fexp < 1e-10) fexp = 1e-10;
+ f[0] = n * (1. - fexp) * (1. - fexp);
+ f[1] = n * 2. * fexp * (1. - fexp);
+ f[2] = n * fexp * fexp;
+ for (i = 0, chi2 = 0.; i < 3; ++i)
+ chi2 += (g[i] - f[i]) * (g[i] - f[i]) / f[i];
+ return kf_gammaq(.5, chi2 / 2.);
+}
+
typedef struct {
double p[4];
int mq, depth, is_tested, d[4];
kputs(b->info, &s);
if (b->info[0]) kputc(';', &s);
// ksprintf(&s, "AF1=%.4lg;AFE=%.4lg;CI95=%.4lg,%.4lg", 1.-pr->f_em, 1.-pr->f_exp, pr->cil, pr->cih);
- ksprintf(&s, "AF1=%.4g;CI95=%.4g,%.4g", 1.-pr->f_em, pr->cil, pr->cih);
+ ksprintf(&s, "AF1=%.4g;CI95=%.4g,%.4g;G3=%.4g,%.4g,%.4g", 1.-pr->f_em, pr->cil, pr->cih, pr->g[2], pr->g[1], pr->g[0]);
+ if (n_smpl > 5) {
+ double hwe = test_hwe(pr->g);
+ if (hwe < 0.1) ksprintf(&s, ";HWE=%.4g", hwe);
+ }
if (has_I16) ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq);
fq = pr->p_ref_folded < 0.5? -4.343 * log(pr->p_ref_folded) : 4.343 * log(pr->p_var_folded);
if (fq < -999) fq = -999;
kputs("##INFO=<ID=FQ,Number=1,Type=Float,Description=\"Phred probability of all samples being the same\">\n", &str);
if (!strstr(str.s, "##INFO=<ID=AF1,"))
kputs("##INFO=<ID=AF1,Number=1,Type=Float,Description=\"Max-likelihood estimate of the site allele frequency of the first ALT allele\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=G3,"))
+ kputs("##INFO=<ID=G3,Number=3,Type=Float,Description=\"ML estimate of genotype frequencies\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=HWE,"))
+ kputs("##INFO=<ID=HWE,Number=1,Type=Float,Description=\"Chi^2 based HWE test P-value based on G3\">\n", &str);
if (!strstr(str.s, "##INFO=<ID=CI95,"))
kputs("##INFO=<ID=CI95,Number=2,Type=Float,Description=\"Equal-tail Bayesian credible interval of the site allele frequency at the 95% level\">\n", &str);
if (!strstr(str.s, "##INFO=<ID=PV4,"))
kstring_t s;
s.m = s.l = 0; s.s = 0;
if (*b->info) kputc(';', &s);
- ksprintf(&s, "NEIR=%.3f;NEIF=%.3f,%.3f", r2, f[0]+f[2], f[0]+f[1]);
+ ksprintf(&s, "NEIR=%.3f;NEIF4=%.3f,%.3f,%.3f,%.3f", r2, f[0], f[1], f[2], f[3]);
bcf_append_info(b, s.s, s.l);
free(s.s);
}
KSTREAM_INIT(gzFile, gzread, 16384)
#define MC_MAX_EM_ITER 16
-#define MC_EM_EPS 1e-4
+#define MC_EM_EPS 1e-5
#define MC_DEF_INDEL 0.15
unsigned char seq_nt4_table[256] = {
return f;
}
+static double mc_gtfreq_iter(double g[3], const bcf_p1aux_t *ma, int beg, int end)
+{
+ double err, gg[3];
+ int i;
+ gg[0] = gg[1] = gg[2] = 0.;
+ for (i = beg; i < end; ++i) {
+ double *pdg, sum, tmp[3];
+ pdg = ma->pdg + i * 3;
+ tmp[0] = pdg[0] * g[0]; tmp[1] = pdg[1] * g[1]; tmp[2] = pdg[2] * g[2];
+ sum = (tmp[0] + tmp[1] + tmp[2]) * (end - beg);
+ gg[0] += tmp[0] / sum; gg[1] += tmp[1] / sum; gg[2] += tmp[2] / sum;
+ }
+ err = fabs(gg[0] - g[0]) > fabs(gg[1] - g[1])? fabs(gg[0] - g[0]) : fabs(gg[1] - g[1]);
+ err = err > fabs(gg[2] - g[2])? err : fabs(gg[2] - g[2]);
+ g[0] = gg[0]; g[1] = gg[1]; g[2] = gg[2];
+ return err;
+}
+
int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k)
{
double sum, g[3];
}
}
}
+ { // compute g[3]
+ rst->g[0] = (1. - rst->f_em) * (1. - rst->f_em);
+ rst->g[1] = 2. * rst->f_em * (1. - rst->f_em);
+ rst->g[2] = rst->f_em * rst->f_em;
+ for (i = 0; i < MC_MAX_EM_ITER; ++i)
+ if (mc_gtfreq_iter(rst->g, ma, 0, ma->n) < MC_EM_EPS) break;
+ }
{ // estimate equal-tail credible interval (95% level)
int l, h;
double p;
h = i;
rst->cil = (double)(ma->M - h) / ma->M; rst->cih = (double)(ma->M - l) / ma->M;
}
- rst->g[0] = rst->g[1] = rst->g[2] = -1.;
rst->cmp[0] = rst->cmp[1] = rst->cmp[2] = rst->p_chi2 = -1.0;
if (rst->p_var > 0.1) // skip contrast2() if the locus is a strong non-variant
rst->p_chi2 = contrast2(ma, rst->cmp);
append_header_text(fp->header, textheader->text, textheader->l_text);
bam_header_destroy(textheader);
}
- if (fp->header->n_targets == 0)
+ if (fp->header->n_targets == 0 && bam_verbose >= 1)
fprintf(stderr, "[samopen] no @SQ lines in the header.\n");
- } else fprintf(stderr, "[samopen] SAM header is present: %d sequences.\n", fp->header->n_targets);
+ } else if (bam_verbose >= 2) fprintf(stderr, "[samopen] SAM header is present: %d sequences.\n", fp->header->n_targets);
}
} else if (strchr(mode, 'w')) { // write
fp->header = bam_header_dup((const bam_header_t*)aux);
sam_header_parse(alt);
alt->l_text = 0; alt->text = 0;
// check if there are @SQ lines in the header
- fwrite(fp->header->text, 1, fp->header->l_text, fp->x.tamw);
+ fwrite(fp->header->text, 1, fp->header->l_text, fp->x.tamw); // FIXME: better to skip the trailing NULL
if (alt->n_targets) { // then write the header text without dumping ->target_{name,len}
- if (alt->n_targets != fp->header->n_targets)
- fprintf(stderr, "[samopen] inconsistent number of target sequences.\n");
+ if (alt->n_targets != fp->header->n_targets && bam_verbose >= 1)
+ fprintf(stderr, "[samopen] inconsistent number of target sequences. Output the text header.\n");
} else { // then dump ->target_{name,len}
for (i = 0; i < fp->header->n_targets; ++i)
fprintf(fp->x.tamw, "@SQ\tSN:%s\tLN:%d\n", fp->header->target_name[i], fp->header->target_len[i]);
if (access(fn_ref, R_OK) == -1) {
fprintf(stderr, "[samfaipath] fail to read file %s.\n", fn_ref);
} else {
- fprintf(stderr, "[samfaipath] build FASTA index...\n");
+ if (bam_verbose >= 3) fprintf(stderr, "[samfaipath] build FASTA index...\n");
if (fai_build(fn_ref) == -1) {
fprintf(stderr, "[samfaipath] fail to build FASTA index.\n");
free(fn_list); fn_list = 0;
const char *text;
char *buf=NULL;
size_t nbuf = 0;
+ int tovalidate = 0;
if ( !headerText )
return 0;
while ( (text=nextline(&buf, &nbuf, text)) )
{
hline = sam_header_line_parse(buf);
- if ( hline && sam_header_line_validate(hline) )
+ if ( hline && (!tovalidate || sam_header_line_validate(hline)) )
// With too many (~250,000) reference sequences the header parsing was too slow with list_append.
hlines = list_append_to_end(hlines, hline);
else
#include "sam_header.h"
#include "sam.h"
#include "faidx.h"
+#include "kstring.h"
#include "khash.h"
KHASH_SET_INIT_STR(rg)
return 0;
}
+static char *drop_rg(char *hdtxt, rghash_t h, int *len)
+{
+ char *p = hdtxt, *q, *r, *s;
+ kstring_t str;
+ memset(&str, 0, sizeof(kstring_t));
+ while (1) {
+ int toprint = 0;
+ q = strchr(p, '\n');
+ if (q == 0) q = p + strlen(p);
+ if (q - p < 3) break; // the line is too short; then stop
+ if (strncmp(p, "@RG\t", 4) == 0) {
+ int c;
+ khint_t k;
+ if ((r = strstr(p, "\tID:")) != 0) {
+ r += 4;
+ for (s = r; *s != '\0' && *s != '\n' && *s != '\t'; ++s);
+ c = *s; *s = '\0';
+ k = kh_get(rg, h, r);
+ *s = c;
+ if (k != kh_end(h)) toprint = 1;
+ }
+ } else toprint = 1;
+ if (toprint) {
+ kputsn(p, q - p, &str); kputc('\n', &str);
+ }
+ p = q + 1;
+ }
+ *len = str.l;
+ return str.s;
+}
+
// callback function for bam_fetch() that prints nonskipped records
static int view_func(const bam1_t *b, void *data)
{
ret = 1;
goto view_end;
}
+ if (g_rghash) { // FIXME: I do not know what "bam_header_t::n_text" is for...
+ char *tmp;
+ int l;
+ tmp = drop_rg(in->header->text, g_rghash, &l);
+ free(in->header->text);
+ in->header->text = tmp;
+ in->header->l_text = l;
+ }
if (!is_count && (out = samopen(fn_out? fn_out : "-", out_mode, in->header)) == 0) {
fprintf(stderr, "[main_samview] fail to open \"%s\" for writing.\n", fn_out? fn_out : "standard output");
ret = 1;
p = q > r? q : r;
++n;
}
-// if (n == 0) add_pair(sm, sm2id, fn, fn);
- add_pair(sm, sm2id, fn, fn);
+ if (n == 0) add_pair(sm, sm2id, fn, fn);
+// add_pair(sm, sm2id, fn, fn);
free(buf.s);
return 0;
}