From d382711a770f67a72b9af3bfd98a88fbced34f64 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 18 Mar 2011 20:33:51 +0000 Subject: [PATCH] * allow to change the compression level in BAM output * use compress level 1 in sorting * association test * concatenate BAMs * allow to threshold on max depth in INDEL calling --- Makefile | 2 +- NEWS | 27 ++++++++- bam.h | 2 +- bam_plcmd.c | 18 +++--- bam_sort.c | 13 ++-- bamtk.c | 3 + bcftools/bcf.h | 4 ++ bcftools/bcf.tex | 13 ++-- bcftools/bcftools.1 | 131 +++++++++++++++++++++++++++------------ bcftools/bcfutils.c | 75 ++++++++++++++++++++--- bcftools/call1.c | 75 ++++++++++++++++++----- bcftools/prob1.c | 145 +++++++++++++++++++++++++++++++++++--------- bcftools/prob1.h | 4 +- bgzf.c | 51 ++++++++++------ bgzf.h | 3 +- sam.c | 20 +++--- sam_view.c | 16 +++-- samtools.1 | 26 +++++++- 18 files changed, 481 insertions(+), 147 deletions(-) diff --git a/Makefile b/Makefile index af93d9c..f025942 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ DFLAGS= -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE -D_USE_KNETFILE -D_CURSES_ KNETFILE_O= knetfile.o LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \ bam_pileup.o bam_lpileup.o bam_md.o glf.o razf.o faidx.o \ - $(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o kprobaln.o + $(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o kprobaln.o bam_cat.o AOBJS= bam_tview.o bam_maqcns.o bam_plcmd.o sam_view.o \ bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \ bamtk.o kaln.o bam2bcf.o bam2bcf_indel.o errmod.o sample.o \ diff --git a/NEWS b/NEWS index 8455b48..946ba7b 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,28 @@ +Beta release 0.1.14 (16 March, 2011) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This release implements a method for testing associations for case-control +data. The method does not call genotypes but instead sums over all genotype +configurations to compute a chi^2 based test statistics. It can be potentially +applied to comparing a pair of samples (e.g. a tumor-normal pair), but this +has not been evaluated on real data. + +Another new feature is to make X chromosome variant calls when female and male +samples are both present. The user needs to provide a file indicating the +ploidy of each sample. + +Other new features: + + * Added `samtools mpileup -L' to skip INDEL calling in regions with + excessively high coverage. Such regions dramatically slow down mpileup. + + * Added `bcftools view -F' to parse BCF files generated by samtools r921 or + older which encode PL in a different way. + +(0.1.14: 16 March 2011, r930:163) + + + Beta release 0.1.13 (1 March, 2011) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -56,7 +81,7 @@ Other notable changes in bcftools: * Updated the BCF spec. * Added the `FQ' VCF INFO field, which gives the phred-scaled probability - of all samples being the samei (identical to the reference or all homozygous + of all samples being the same (identical to the reference or all homozygous variants). Option `view -f' has been dropped. * Implementated of "vcfutils.pl vcf2fq" to generate a consensus sequence diff --git a/bam.h b/bam.h index 4ad2644..84253ae 100644 --- a/bam.h +++ b/bam.h @@ -40,7 +40,7 @@ @copyright Genome Research Ltd. */ -#define BAM_VERSION "0.1.13 (r926:134)" +#define BAM_VERSION "0.1.13 (r926:161)" #include #include diff --git a/bam_plcmd.c b/bam_plcmd.c index bba62cb..368502c 100644 --- a/bam_plcmd.c +++ b/bam_plcmd.c @@ -536,7 +536,7 @@ int bam_pileup(int argc, char *argv[]) #define MPLP_EXT_BAQ 0x800 typedef struct { - int max_mq, min_mq, flag, min_baseQ, capQ_thres, max_depth; + int max_mq, min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth; int openQ, extQ, tandemQ, min_support; // for indels double min_frac; // for indels char *reg, *fn_pos, *pl_list; @@ -617,7 +617,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list); extern void bcf_call_del_rghash(void *rghash); mplp_aux_t **data; - int i, tid, pos, *n_plp, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid, max_depth; + int i, tid, pos, *n_plp, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid, max_depth, max_indel_depth; const bam_pileup1_t **plp; bam_mplp_t iter; bam_header_t *h = 0; @@ -722,6 +722,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) max_depth = 8000 / sm->n; fprintf(stderr, "<%s> Set max per-sample depth to %d\n", __func__, max_depth); } + max_indel_depth = conf->max_indel_depth * sm->n; bam_mplp_set_maxcnt(iter, max_depth); while (bam_mplp_auto(iter, &tid, &pos, n_plp, plp) > 0) { if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested @@ -737,8 +738,9 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) ref_tid = tid; } if (conf->flag & MPLP_GLF) { - int _ref0, ref16; + int total_depth, _ref0, ref16; bcf1_t *b = calloc(1, sizeof(bcf1_t)); + for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i]; group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp); _ref0 = (ref && pos < ref_len)? ref[pos] : 'N'; ref16 = bam_nt16_table[_ref0]; @@ -750,7 +752,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) bcf_write(bp, bh, b); bcf_destroy(b); // call indels - if (!(conf->flag&MPLP_NO_INDEL) && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) { + if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) { for (i = 0; i < gplp.n; ++i) bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i); if (bcf_call_combine(gplp.n, bcr, -1, &bc) >= 0) { @@ -866,11 +868,11 @@ int bam_mpileup(int argc, char *argv[]) mplp.max_mq = 60; mplp.min_baseQ = 13; mplp.capQ_thres = 0; - mplp.max_depth = 250; + mplp.max_depth = 250; mplp.max_indel_depth = 250; mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 100; mplp.min_frac = 0.002; mplp.min_support = 1; mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN; - while ((c = getopt(argc, argv, "Agf:r:l:M:q:Q:uaRC:BDSd:b:P:o:e:h:Im:F:EG:")) >= 0) { + while ((c = getopt(argc, argv, "Agf:r:l:M:q:Q:uaRC:BDSd:L:b:P:o:e:h:Im:F:EG:")) >= 0) { switch (c) { case 'f': mplp.fai = fai_load(optarg); @@ -900,6 +902,7 @@ int bam_mpileup(int argc, char *argv[]) case 'A': use_orphan = 1; break; case 'F': mplp.min_frac = atof(optarg); break; case 'm': mplp.min_support = atoi(optarg); break; + case 'L': mplp.max_indel_depth = atoi(optarg); break; case 'G': { FILE *fp_rg; char buf[1024]; @@ -924,7 +927,8 @@ int bam_mpileup(int argc, char *argv[]) fprintf(stderr, " -M INT cap mapping quality at INT [%d]\n", mplp.max_mq); fprintf(stderr, " -Q INT min base quality [%d]\n", mplp.min_baseQ); fprintf(stderr, " -q INT filter out alignment with MQ smaller than INT [%d]\n", mplp.min_mq); - fprintf(stderr, " -d INT max per-sample depth [%d]\n", mplp.max_depth); + fprintf(stderr, " -d INT max per-BAM depth [%d]\n", mplp.max_depth); + fprintf(stderr, " -L INT max per-sample depth for INDEL calling [%d]\n", mplp.max_indel_depth); fprintf(stderr, " -P STR comma separated list of platforms for indels [all]\n"); fprintf(stderr, " -o INT Phred-scaled gap open sequencing error probability [%d]\n", mplp.openQ); fprintf(stderr, " -e INT Phred-scaled gap extension seq error probability [%d]\n", mplp.extQ); diff --git a/bam_sort.c b/bam_sort.c index 38f15d6..7aeccff 100644 --- a/bam_sort.c +++ b/bam_sort.c @@ -298,14 +298,19 @@ KSORT_INIT(sort, bam1_p, bam1_lt) static void sort_blocks(int n, int k, bam1_p *buf, const char *prefix, const bam_header_t *h, int is_stdout) { - char *name; + char *name, mode[3]; int i; bamFile fp; ks_mergesort(sort, k, buf, 0); name = (char*)calloc(strlen(prefix) + 20, 1); - if (n >= 0) sprintf(name, "%s.%.4d.bam", prefix, n); - else sprintf(name, "%s.bam", prefix); - fp = is_stdout? bam_dopen(fileno(stdout), "w") : bam_open(name, "w"); + if (n >= 0) { + sprintf(name, "%s.%.4d.bam", prefix, n); + strcpy(mode, "w1"); + } else { + sprintf(name, "%s.bam", prefix); + strcpy(mode, "w"); + } + fp = is_stdout? bam_dopen(fileno(stdout), mode) : bam_open(name, mode); if (fp == 0) { fprintf(stderr, "[sort_blocks] fail to create file %s.\n", name); free(name); diff --git a/bamtk.c b/bamtk.c index 1d11519..5309d5e 100644 --- a/bamtk.c +++ b/bamtk.c @@ -25,6 +25,7 @@ int main_import(int argc, char *argv[]); int main_reheader(int argc, char *argv[]); int main_cut_target(int argc, char *argv[]); int main_phase(int argc, char *argv[]); +int main_cat(int argc, char *argv[]); int faidx_main(int argc, char *argv[]); int glf3_view_main(int argc, char *argv[]); @@ -92,6 +93,7 @@ static int usage() fprintf(stderr, " merge merge sorted alignments\n"); fprintf(stderr, " rmdup remove PCR duplicates\n"); fprintf(stderr, " reheader replace BAM header\n"); + fprintf(stderr, " cat concatenate BAMs\n"); fprintf(stderr, " targetcut cut fosmid regions (for fosmid pool only)\n"); fprintf(stderr, " phase phase heterozygotes\n"); fprintf(stderr, "\n"); @@ -131,6 +133,7 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "calmd") == 0) return bam_fillmd(argc-1, argv+1); else if (strcmp(argv[1], "fillmd") == 0) return bam_fillmd(argc-1, argv+1); else if (strcmp(argv[1], "reheader") == 0) return main_reheader(argc-1, argv+1); + else if (strcmp(argv[1], "cat") == 0) return main_cat(argc-1, argv+1); else if (strcmp(argv[1], "targetcut") == 0) return main_cut_target(argc-1, argv+1); else if (strcmp(argv[1], "phase") == 0) return main_phase(argc-1, argv+1); #if _CURSES_LIB != 0 diff --git a/bcftools/bcf.h b/bcftools/bcf.h index f545a91..ba6dbe9 100644 --- a/bcftools/bcf.h +++ b/bcftools/bcf.h @@ -146,6 +146,10 @@ extern "C" { int bcf_is_indel(const bcf1_t *b); bcf_hdr_t *bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int *list); int bcf_subsam(int n_smpl, int *list, bcf1_t *b); + // move GT to the first FORMAT field + int bcf_fix_gt(bcf1_t *b); + // update PL generated by old samtools + int bcf_fix_pl(bcf1_t *b); // string hash table void *bcf_build_refhash(bcf_hdr_t *h); diff --git a/bcftools/bcf.tex b/bcftools/bcf.tex index 6f2171f..442fc2a 100644 --- a/bcftools/bcf.tex +++ b/bcftools/bcf.tex @@ -33,13 +33,14 @@ \end{center} \begin{center} -\begin{tabular}{cll} +\begin{tabular}{clp{9cm}} \hline \multicolumn{1}{l}{\bf Field} & \multicolumn{1}{l}{\bf Type} & \multicolumn{1}{l}{\bf Description} \\\hline {\tt DP} & {\tt uint16\_t[n]} & Read depth \\ {\tt GL} & {\tt float[n*G]} & Log10 likelihood of data; $G=\frac{A(A+1)}{2}$, $A=\#\{alleles\}$\\ {\tt GT} & {\tt uint8\_t[n]} & {\tt missing\char60\char60 7 | phased\char60\char60 6 | allele1\char60\char60 3 | allele2} \\ -{\tt \_GT} & {\tt uint8\_t+uint8\_t[n*P]} & {Generic GT; the first int equals the max ploidy $P$} \\ +{\tt \_GT} & {\tt uint8\_t+uint8\_t[n*P]} & {Generic GT; the first int equals the max ploidy $P$. If the highest bit is set, + the allele is not present (e.g. due to different ploidy between samples).} \\ {\tt GQ} & {\tt uint8\_t[n]} & {Genotype quality}\\ {\tt HQ} & {\tt uint8\_t[n*2]} & {Haplotype quality}\\ {\tt \_HQ} & {\tt uint8\_t+uint8\_t[n*P]} & {Generic HQ}\\ @@ -67,10 +68,10 @@ BCF proposal). \item Predefined {\tt FORMAT} fields can be missing from VCF headers, but custom {\tt FORMAT} fields are required to be explicitly defined in the headers. -\item A {\tt FORMAT} field with its name starting with `{\tt \_}' gives an alternative - binary representation of the corresponding VCF field. The alternative representation - is used when the default representation is unable to keep the genotype information, - for example, when the ploidy is over 2 or there are more than 8 alleles. +\item A {\tt FORMAT} field with its name starting with `{\tt \_}' is specific to BCF only. + It gives an alternative binary representation of the corresponding VCF field, in case + the default representation is unable to keep the genotype information, + for example, when the ploidy is not 2 or there are more than 8 alleles. \end{itemize} \end{document} diff --git a/bcftools/bcftools.1 b/bcftools/bcftools.1 index 6c7403b..236abe4 100644 --- a/bcftools/bcftools.1 +++ b/bcftools/bcftools.1 @@ -1,4 +1,4 @@ -.TH bcftools 1 "2 October 2010" "bcftools" "Bioinformatics tools" +.TH bcftools 1 "16 March 2011" "bcftools" "Bioinformatics tools" .SH NAME .PP bcftools - Utilities for the Binary Call Format (BCF) and VCF. @@ -20,53 +20,55 @@ estimating site allele frequencies and allele frequency spectrums. .TP 10 .B view .B bcftools view -.RB [ \-cbuSAGgHvNQ ] -.RB [ \-1 -.IR nGroup1 ] +.RB [ \-AbFGNQSucgv ] +.RB [ \-D +.IR seqDict ] .RB [ \-l -.IR listFile ] +.IR listLoci ] +.RB [ \-s +.IR listSample ] +.RB [ \-i +.IR gapSNPratio ] .RB [ \-t .IR mutRate ] .RB [ \-p .IR varThres ] .RB [ \-P .IR prior ] +.RB [ \-1 +.IR nGroup1 ] +.RB [ \-U +.IR nPerm ] +.RB [ \-X +.IR permThres ] .I in.bcf .RI [ region ] Convert between BCF and VCF, call variant candidates and estimate allele frequencies. -.B OPTIONS: .RS +.TP +.B Input/Output Options: +.TP 10 +.B -A +Retain all possible alternate alleles at variant sites. By default, the view +command discards unlikely alleles. .TP 10 .B -b Output in the BCF format. The default is VCF. .TP -.B -c -Call variants. +.BI -D \ FILE +Sequence dictionary (list of chromosome names) for VCF->BCF conversion [null] .TP -.B -v -Output variant sites only (force -c) -.TP -.B -g -Call per-sample genotypes at variant sites (force -c) -.TP -.B -u -Uncompressed BCF output (force -b). -.TP -.B -S -The input is VCF instead of BCF. -.TP -.B -A -Retain all possible alternate alleles at variant sites. By default, this -command discards unlikely alleles. +.B -F +Indicate PL is generated by r921 or before (ordering is different). .TP .B -G Suppress all individual genotype information. .TP -.B -H -Perform Hardy-Weiberg Equilibrium test. This will add computation time, sometimes considerably. +.BI -l \ FILE +List of sites at which information are outputted [all sites] .TP .B -N Skip sites where the REF field is not A/C/G/T @@ -74,31 +76,63 @@ Skip sites where the REF field is not A/C/G/T .B -Q Output the QCALL likelihood format .TP -.B -f -Reference-free variant calling mode. In this mode, the prior will be -folded; a variant is called iff the sample(s) contains at least two -alleles; the QUAL field in the VCF/BCF output is changed accordingly. +.BI -s \ FILE +List of samples to use. In the output, the ordering of samples will be identical +to the one in +.IR FILE . +[null] .TP -.BI "-1 " INT -Number of group-1 samples. This option is used for dividing input into -two groups for comparing. A zero value disables this functionality. [0] +.B -S +The input is VCF instead of BCF. .TP -.BI "-l " FILE -List of sites at which information are outputted [all sites] +.B -u +Uncompressed BCF output (force -b). .TP -.BI "-t " FLOAT -Scaled muttion rate for variant calling [0.001] +.B Consensus/Variant Calling Options: +.TP 10 +.B -c +Call variants. +.TP +.B -g +Call per-sample genotypes at variant sites (force -c) .TP -.BI "-p " FLOAT +.BI -i \ FLOAT +Ratio of INDEL-to-SNP mutation rate [0.15] +.TP +.BI -p \ FLOAT A site is considered to be a variant if P(ref|D)n_gi; ++i) + if (b->gi[i].fmt == tmp) break; + if (i == b->n_gi) return 0; + // prepare + gi = b->gi + i; + PL = (uint8_t*)gi->data; + swap = alloca(gi->len); + // loop through individuals + for (i = 0; i < b->n_smpl; ++i) { + int k, l, x; + uint8_t *PLi = PL + i * gi->len; + memcpy(swap, PLi, gi->len); + for (k = x = 0; k < b->n_alleles; ++k) + for (l = k; l < b->n_alleles; ++l) + PLi[l*(l+1)/2 + k] = swap[x++]; + } + return 0; +} + static void *locate_field(const bcf1_t *b, const char *fmt, int l) { int i; @@ -188,6 +215,31 @@ int bcf_anno_max(bcf1_t *b) return 0; } +// FIXME: only data are shuffled; the header is NOT +int bcf_shuffle(bcf1_t *b, int seed) +{ + int i, j, *a; + if (seed > 0) srand48(seed); + a = malloc(b->n_smpl * sizeof(int)); + for (i = 0; i < b->n_smpl; ++i) a[i] = i; + for (i = b->n_smpl; i > 1; --i) { + int tmp; + j = (int)(drand48() * i); + tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp; + } + for (j = 0; j < b->n_gi; ++j) { + bcf_ginfo_t *gi = b->gi + j; + uint8_t *swap, *data = (uint8_t*)gi->data; + swap = malloc(gi->len * b->n_smpl); + for (i = 0; i < b->n_smpl; ++i) + memcpy(swap + gi->len * a[i], data + gi->len * i, gi->len); + free(gi->data); + gi->data = swap; + } + free(a); + return 0; +} + bcf_hdr_t *bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int *list) { int i, ret, j; @@ -197,12 +249,15 @@ bcf_hdr_t *bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int kstring_t s; s.l = s.m = 0; s.s = 0; hash = kh_init(str2id); - for (i = 0; i < n; ++i) - k = kh_put(str2id, hash, samples[i], &ret); - for (i = j = 0; i < h0->n_smpl; ++i) { - if (kh_get(str2id, hash, h0->sns[i]) != kh_end(hash)) { - list[j++] = i; - kputs(h0->sns[i], &s); kputc('\0', &s); + for (i = 0; i < h0->n_smpl; ++i) { + k = kh_put(str2id, hash, h0->sns[i], &ret); + kh_val(hash, k) = i; + } + for (i = j = 0; i < n; ++i) { + k = kh_get(str2id, hash, samples[i]); + if (k != kh_end(hash)) { + list[j++] = kh_val(hash, k); + kputs(samples[i], &s); kputc('\0', &s); } } if (j < n) fprintf(stderr, "<%s> %d samples in the list but not in BCF.", __func__, n - j); @@ -217,13 +272,17 @@ bcf_hdr_t *bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int return h; } -int bcf_subsam(int n_smpl, int *list, bcf1_t *b) // list MUST BE sorted +int bcf_subsam(int n_smpl, int *list, bcf1_t *b) { int i, j; for (j = 0; j < b->n_gi; ++j) { bcf_ginfo_t *gi = b->gi + j; + uint8_t *swap; + swap = malloc(gi->len * b->n_smpl); for (i = 0; i < n_smpl; ++i) - if (i != list[i]) memcpy((uint8_t*)gi->data + i * gi->len, (uint8_t*)gi->data + list[i] * gi->len, gi->len); + memcpy(swap + i * gi->len, (uint8_t*)gi->data + list[i] * gi->len, gi->len); + free(gi->data); + gi->data = swap; } b->n_smpl = n_smpl; return 0; diff --git a/bcftools/call1.c b/bcftools/call1.c index 9c0b498..a520e3c 100644 --- a/bcftools/call1.c +++ b/bcftools/call1.c @@ -6,6 +6,7 @@ #include "bcf.h" #include "prob1.h" #include "kstring.h" +#include "time.h" #include "khash.h" KHASH_SET_INIT_INT64(set64) @@ -26,12 +27,13 @@ KSTREAM_INIT(gzFile, gzread, 16384) #define VC_ADJLD 4096 #define VC_NO_INDEL 8192 #define VC_ANNO_MAX 16384 +#define VC_FIX_PL 32768 typedef struct { - int flag, prior_type, n1, n_sub, *sublist; + int flag, prior_type, n1, n_sub, *sublist, n_perm; char *fn_list, *prior_file, **subsam, *fn_dict; uint8_t *ploidy; - double theta, pref, indel_frac; + double theta, pref, indel_frac, min_perm_p; } viewconf_t; khash_t(set64) *bcf_load_pos(const char *fn, bcf_hdr_t *_h) @@ -133,11 +135,11 @@ static void rm_info(bcf1_t *b, const char *key) static int update_bcf1(int n_smpl, bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p1rst_t *pr, double pref, int flag) { kstring_t s; - int is_var = (pr->p_ref < pref); - double r = is_var? pr->p_ref : pr->p_var, fq; + int has_I16, is_var = (pr->p_ref < pref); + double fq, r = is_var? pr->p_ref : pr->p_var; anno16_t a; - test16(b, &a); + has_I16 = test16(b, &a) >= 0? 1 : 0; rm_info(b, "I16="); memset(&s, 0, sizeof(kstring_t)); @@ -147,15 +149,24 @@ static int update_bcf1(int n_smpl, bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p if (b->info[0]) kputc(';', &s); // ksprintf(&s, "AF1=%.4lg;AFE=%.4lg;CI95=%.4lg,%.4lg", 1.-pr->f_em, 1.-pr->f_exp, pr->cil, pr->cih); ksprintf(&s, "AF1=%.4g;CI95=%.4g,%.4g", 1.-pr->f_em, pr->cil, pr->cih); - ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq); + if (has_I16) ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq); fq = pr->p_ref_folded < 0.5? -4.343 * log(pr->p_ref_folded) : 4.343 * log(pr->p_var_folded); if (fq < -999) fq = -999; if (fq > 999) fq = 999; ksprintf(&s, ";FQ=%.3g", fq); - if (a.is_tested) { - if (pr->pc[0] >= 0.) ksprintf(&s, ";PC4=%g,%g,%g,%g", pr->pc[0], pr->pc[1], pr->pc[2], pr->pc[3]); - ksprintf(&s, ";PV4=%.2g,%.2g,%.2g,%.2g", a.p[0], a.p[1], a.p[2], a.p[3]); + if (pr->cmp[0] >= 0.) { + int i, q[3], pq; + for (i = 1; i < 3; ++i) { + double x = pr->cmp[i] + pr->cmp[0]/2.; + q[i] = x == 0? 255 : (int)(-4.343 * log(x) + .499); + if (q[i] > 255) q[i] = 255; + } + pq = (int)(-4.343 * log(pr->p_chi2) + .499); + if (pr->perm_rank >= 0) ksprintf(&s, ";PR=%d", pr->perm_rank); + ksprintf(&s, ";QCHI2=%d;PCHI2=%.3g;PC2=%d,%d", pq, q[1], q[2], pr->p_chi2); +// ksprintf(&s, ",%g,%g,%g", pr->cmp[0], pr->cmp[1], pr->cmp[2]); } + if (has_I16 && a.is_tested) ksprintf(&s, ";PV4=%.2g,%.2g,%.2g,%.2g", a.p[0], a.p[1], a.p[2], a.p[3]); kputc('\0', &s); kputs(b->fmt, &s); kputc('\0', &s); free(b->str); @@ -232,7 +243,7 @@ static void write_header(bcf_hdr_t *h) if (!strstr(str.s, "##INFO=\n", &str); if (!strstr(str.s, "##INFO=\n", &str); + kputs("##INFO=\n", &str); if (!strstr(str.s, "##INFO=\n", &str); if (!strstr(str.s, "##INFO=\n", &str); if (!strstr(str.s, "##INFO=\n", &str); - if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##FORMAT=\n", &str); if (!strstr(str.s, "##FORMAT=\n", &str); @@ -264,9 +283,10 @@ int bcfview(int argc, char *argv[]) extern void bcf_p1_indel_prior(bcf_p1aux_t *ma, double x); extern int bcf_fix_gt(bcf1_t *b); extern int bcf_anno_max(bcf1_t *b); + extern int bcf_shuffle(bcf1_t *b, int seed); bcf_t *bp, *bout = 0; bcf1_t *b, *blast; - int c; + int c, *seeds = 0; uint64_t n_processed = 0; viewconf_t vc; bcf_p1aux_t *p1 = 0; @@ -277,12 +297,13 @@ int bcfview(int argc, char *argv[]) tid = begin = end = -1; memset(&vc, 0, sizeof(viewconf_t)); - vc.prior_type = vc.n1 = -1; vc.theta = 1e-3; vc.pref = 0.5; vc.indel_frac = -1.; - while ((c = getopt(argc, argv, "N1:l:cHAGvbSuP:t:p:QgLi:IMs:D:")) >= 0) { + vc.prior_type = vc.n1 = -1; vc.theta = 1e-3; vc.pref = 0.5; vc.indel_frac = -1.; vc.n_perm = 0; vc.min_perm_p = 0.01; + while ((c = getopt(argc, argv, "FN1:l:cHAGvbSuP:t:p:QgLi:IMs:D:U:X:")) >= 0) { switch (c) { case '1': vc.n1 = atoi(optarg); break; case 'l': vc.fn_list = strdup(optarg); break; case 'D': vc.fn_dict = strdup(optarg); break; + case 'F': vc.flag |= VC_FIX_PL; break; case 'N': vc.flag |= VC_ACGT_ONLY; break; case 'G': vc.flag |= VC_NO_GENO; break; case 'A': vc.flag |= VC_KEEPALT; break; @@ -299,6 +320,8 @@ int bcfview(int argc, char *argv[]) case 'i': vc.indel_frac = atof(optarg); break; case 'Q': vc.flag |= VC_QCALL; break; case 'L': vc.flag |= VC_ADJLD; break; + case 'U': vc.n_perm = atoi(optarg); break; + case 'X': vc.min_perm_p = atof(optarg); break; case 's': vc.subsam = read_samples(optarg, &vc.n_sub); vc.ploidy = calloc(vc.n_sub + 1, 1); for (tid = 0; tid < vc.n_sub; ++tid) vc.ploidy[tid] = vc.subsam[tid][strlen(vc.subsam[tid]) + 1]; @@ -323,19 +346,21 @@ int bcfview(int argc, char *argv[]) fprintf(stderr, " -S input is VCF\n"); fprintf(stderr, " -A keep all possible alternate alleles at variant sites\n"); fprintf(stderr, " -G suppress all individual genotype information\n"); - fprintf(stderr, " -H perform Hardy-Weinberg test (slower)\n"); fprintf(stderr, " -N skip sites where REF is not A/C/G/T\n"); fprintf(stderr, " -Q output the QCALL likelihood format\n"); fprintf(stderr, " -L calculate LD for adjacent sites\n"); fprintf(stderr, " -I skip indels\n"); + fprintf(stderr, " -F PL generated by r921 or before\n"); fprintf(stderr, " -D FILE sequence dictionary for VCF->BCF conversion [null]\n"); - fprintf(stderr, " -1 INT number of group-1 samples [0]\n"); fprintf(stderr, " -l FILE list of sites to output [all sites]\n"); fprintf(stderr, " -t FLOAT scaled substitution mutation rate [%.4g]\n", vc.theta); fprintf(stderr, " -i FLOAT indel-to-substitution ratio [%.4g]\n", vc.indel_frac); fprintf(stderr, " -p FLOAT variant if P(ref|D)BCF conversion please specify the sequence dictionary with -D\n", __func__); return 1; } + if (vc.n1 <= 0) vc.n_perm = 0; // TODO: give a warning here! + if (vc.n_perm > 0) { + seeds = malloc(vc.n_perm * sizeof(int)); + srand48(time(0)); + for (c = 0; c < vc.n_perm; ++c) seeds[c] = lrand48(); + } b = calloc(1, sizeof(bcf1_t)); blast = calloc(1, sizeof(bcf1_t)); strcpy(moder, "r"); @@ -399,6 +430,7 @@ int bcfview(int argc, char *argv[]) while (vcf_read(bp, hin, b) > 0) { int is_indel; if (vc.n_sub) bcf_subsam(vc.n_sub, vc.sublist, b); + if (vc.flag & VC_FIX_PL) bcf_fix_pl(b); is_indel = bcf_is_indel(b); if ((vc.flag & VC_NO_INDEL) && is_indel) continue; if ((vc.flag & VC_ACGT_ONLY) && !is_indel) { @@ -434,6 +466,16 @@ int bcfview(int argc, char *argv[]) bcf_p1_dump_afs(p1); } if (pr.p_ref >= vc.pref && (vc.flag & VC_VARONLY)) continue; + if (vc.n_perm && vc.n1 > 0 && pr.p_chi2 < vc.min_perm_p) { // permutation test + bcf_p1rst_t r; + int i, n = 0; + for (i = 0; i < vc.n_perm; ++i) { + bcf_shuffle(b, seeds[i]); + bcf_p1_cal(b, p1, &r); + if (pr.p_chi2 >= r.p_chi2) ++n; + } + pr.perm_rank = n; + } update_bcf1(hout->n_smpl, b, p1, &pr, vc.pref, vc.flag); } if (vc.flag & VC_ADJLD) { // compute LD @@ -471,6 +513,7 @@ int bcfview(int argc, char *argv[]) for (i = 0; i < vc.n_sub; ++i) free(vc.subsam[i]); free(vc.subsam); free(vc.sublist); } + if (seeds) free(seeds); if (p1) bcf_p1_destroy(p1); return 0; } diff --git a/bcftools/prob1.c b/bcftools/prob1.c index 4804e6e..503a998 100644 --- a/bcftools/prob1.c +++ b/bcftools/prob1.c @@ -39,6 +39,7 @@ struct __bcf_p1aux_t { double *phi, *phi_indel; double *z, *zswap; // aux for afs double *z1, *z2, *phi1, *phi2; // only calculated when n1 is set + double **hg; // hypergeometric distribution double t, t1, t2; double *afs, *afs1; // afs: accumulative AFS; afs1: site posterior distribution const uint8_t *PL; // point to PL @@ -173,6 +174,11 @@ int bcf_p1_set_n1(bcf_p1aux_t *b, int n1) void bcf_p1_destroy(bcf_p1aux_t *ma) { if (ma) { + int k; + if (ma->hg && ma->n1 > 0) { + for (k = 0; k <= 2*ma->n1; ++k) free(ma->hg[k]); + free(ma->hg); + } free(ma->ploidy); free(ma->q2p); free(ma->pdg); free(ma->phi); free(ma->phi_indel); free(ma->phi1); free(ma->phi2); free(ma->z); free(ma->zswap); free(ma->z1); free(ma->z2); @@ -231,7 +237,7 @@ int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k) } for (i = 0, sum = 0.; i < 3; ++i) sum += (g[i] = pdg[i] * f3[i]); - for (i = 0, max = -1., max_i = 0; i <= ploidy; ++i) { + for (i = 0, max = -1., max_i = 0; i < 3; ++i) { g[i] /= sum; if (g[i] > max) max = g[i], max_i = i; } @@ -258,24 +264,22 @@ static void mc_cal_y_core(bcf_p1aux_t *ma, int beg) last_min = last_max = 0; ma->t = 0.; if (ma->M == ma->n * 2) { + int M = 0; for (_j = beg; _j < ma->n; ++_j) { - int k, j = _j - beg, _min = last_min, _max = last_max; + int k, j = _j - beg, _min = last_min, _max = last_max, M0; double p[3], sum; + M0 = M; M += 2; pdg = ma->pdg + _j * 3; p[0] = pdg[0]; p[1] = 2. * pdg[1]; p[2] = pdg[2]; for (; _min < _max && z[0][_min] < TINY; ++_min) z[0][_min] = z[1][_min] = 0.; for (; _max > _min && z[0][_max] < TINY; --_max) z[0][_max] = z[1][_max] = 0.; _max += 2; - if (_min == 0) - k = 0, z[1][k] = (2*j+2-k)*(2*j-k+1) * p[0] * z[0][k]; - if (_min <= 1) - k = 1, z[1][k] = (2*j+2-k)*(2*j-k+1) * p[0] * z[0][k] + k*(2*j+2-k) * p[1] * z[0][k-1]; + if (_min == 0) k = 0, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k]; + if (_min <= 1) k = 1, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k] + k*(M0-k+2) * p[1] * z[0][k-1]; for (k = _min < 2? 2 : _min; k <= _max; ++k) - z[1][k] = (2*j+2-k)*(2*j-k+1) * p[0] * z[0][k] - + k*(2*j+2-k) * p[1] * z[0][k-1] - + k*(k-1)* p[2] * z[0][k-2]; + z[1][k] = (M0-k+1)*(M0-k+2) * p[0] * z[0][k] + k*(M0-k+2) * p[1] * z[0][k-1] + k*(k-1)* p[2] * z[0][k-2]; for (k = _min, sum = 0.; k <= _max; ++k) sum += z[1][k]; - ma->t += log(sum / ((2. * j + 2) * (2. * j + 1))); + ma->t += log(sum / (M * (M - 1.))); for (k = _min; k <= _max; ++k) z[1][k] /= sum; if (_min >= 1) z[1][_min-1] = 0.; if (_min >= 2) z[1][_min-2] = 0.; @@ -287,6 +291,8 @@ static void mc_cal_y_core(bcf_p1aux_t *ma, int beg) tmp = z[0]; z[0] = z[1]; z[1] = tmp; last_min = _min; last_max = _max; } + //for (_j = 0; _j < last_min; ++_j) z[0][_j] = 0.; // TODO: are these necessary? + //for (_j = last_max + 1; _j < ma->M; ++_j) z[0][_j] = 0.; } else { // this block is very similar to the block above; these two might be merged in future int j, M = 0; for (j = 0; j < ma->n; ++j) { @@ -347,26 +353,104 @@ static void mc_cal_y(bcf_p1aux_t *ma) } else mc_cal_y_core(ma, 0); } -static void contrast(bcf_p1aux_t *ma, double pc[4]) // mc_cal_y() must be called before hand +#define CONTRAST_TINY 1e-30 + +extern double kf_gammaq(double s, double z); // incomplete gamma function for chi^2 test + +static inline double chi2_test(int a, int b, int c, int d) +{ + double x, z; + x = (double)(a+b) * (c+d) * (b+d) * (a+c); + if (x == 0.) return 1; + z = a * d - b * c; + return kf_gammaq(.5, .5 * z * z * (a+b+c+d) / x); +} + +// chi2=(a+b+c+d)(ad-bc)^2/[(a+b)(c+d)(a+c)(b+d)] +static inline double contrast2_aux(const bcf_p1aux_t *p1, double sum, int n1, int n2, int k1, int k2, double x[3]) +{ + double p = p1->phi[k1+k2] * p1->z1[k1] * p1->z2[k2] / sum * p1->hg[k1][k2]; + if (p < CONTRAST_TINY) return -1; + if (.5*k1/n1 < .5*k2/n2) x[1] += p; + else if (.5*k1/n1 > .5*k2/n2) x[2] += p; + else x[0] += p; + return p * chi2_test(k1, k2, (n1<<1) - k1, (n2<<1) - k2); +} + +static double contrast2(bcf_p1aux_t *p1, double ret[3]) { - int k, n1 = ma->n1, n2 = ma->n - ma->n1; - long double sum1, sum2; - pc[0] = pc[1] = pc[2] = pc[3] = -1.; - if (n1 <= 0 || n2 <= 0) return; - for (k = 0, sum1 = 0.; k <= 2*n1; ++k) sum1 += ma->phi1[k] * ma->z1[k]; - for (k = 0, sum2 = 0.; k <= 2*n2; ++k) sum2 += ma->phi2[k] * ma->z2[k]; - pc[2] = ma->phi1[2*n1] * ma->z1[2*n1] / sum1; - pc[3] = ma->phi2[2*n2] * ma->z2[2*n2] / sum2; - for (k = 2; k < 4; ++k) { - pc[k] = pc[k] > .5? -(-4.343 * log(1. - pc[k] + TINY) + .499) : -4.343 * log(pc[k] + TINY) + .499; - pc[k] = (int)pc[k]; - if (pc[k] > 99) pc[k] = 99; - if (pc[k] < -99) pc[k] = -99; + int k, k1, k2, k10, k20, n1, n2; + double sum; + // get n1 and n2 + n1 = p1->n1; n2 = p1->n - p1->n1; + if (n1 <= 0 || n2 <= 0) return 0.; + if (p1->hg == 0) { // initialize the hypergeometric distribution + /* NB: the hg matrix may take a lot of memory when there are many samples. There is a way + to avoid precomputing this matrix, but it is slower and quite intricate. The following + computation in this block can be accelerated with a similar strategy, but perhaps this + is not a serious concern for now. */ + double tmp = lgamma(2*(n1+n2)+1) - (lgamma(2*n1+1) + lgamma(2*n2+1)); + p1->hg = calloc(2*n1+1, sizeof(void*)); + for (k1 = 0; k1 <= 2*n1; ++k1) { + p1->hg[k1] = calloc(2*n2+1, sizeof(double)); + for (k2 = 0; k2 <= 2*n2; ++k2) + p1->hg[k1][k2] = exp(lgamma(k1+k2+1) + lgamma(p1->M-k1-k2+1) - (lgamma(k1+1) + lgamma(k2+1) + lgamma(2*n1-k1+1) + lgamma(2*n2-k2+1) + tmp)); + } + } + { // compute sum1 and sum2 + long double suml = 0; + for (k = 0; k <= p1->M; ++k) suml += p1->phi[k] * p1->z[k]; + sum = suml; + } + { // get the mean k1 and k2 + double max; + int max_k; + for (k = 0, max = 0, max_k = -1; k <= 2*n1; ++k) { + double x = p1->phi1[k] * p1->z1[k]; + if (x > max) max = x, max_k = k; + } + k10 = max_k; + for (k = 0, max = 0, max_k = -1; k <= 2*n2; ++k) { + double x = p1->phi2[k] * p1->z2[k]; + if (x > max) max = x, max_k = k; + } + k20 = max_k; + } + { // We can do the following with one nested loop, but that is an O(N^2) thing. The following code block is much faster for large N. + double x[3], y; + long double z = 0.; + x[0] = x[1] = x[2] = 0; + for (k1 = k10; k1 >= 0; --k1) { + for (k2 = k20; k2 >= 0; --k2) { + if ((y = contrast2_aux(p1, sum, n1, n2, k1, k2, x)) < 0) break; + else z += y; + } + for (k2 = k20 + 1; k2 <= 2*n2; ++k2) { + if ((y = contrast2_aux(p1, sum, n1, n2, k1, k2, x)) < 0) break; + else z += y; + } + } + ret[0] = x[0]; ret[1] = x[1]; ret[2] = x[2]; + x[0] = x[1] = x[2] = 0; + for (k1 = k10 + 1; k1 <= 2*n1; ++k1) { + for (k2 = k20; k2 >= 0; --k2) { + if ((y = contrast2_aux(p1, sum, n1, n2, k1, k2, x)) < 0) break; + else z += y; + } + for (k2 = k20 + 1; k2 <= 2*n2; ++k2) { + if ((y = contrast2_aux(p1, sum, n1, n2, k1, k2, x)) < 0) break; + else z += y; + } + } + ret[0] += x[0]; ret[1] += x[1]; ret[2] += x[2]; + if (ret[0] + ret[1] + ret[2] < 0.99) { // in case of bad things happened + ret[0] = ret[1] = ret[2] = 0; + for (k1 = 0, z = 0.; k1 <= 2*n1; ++k1) + for (k2 = 0; k2 <= 2*n2; ++k2) + if ((y = contrast2_aux(p1, sum, n1, n2, k1, k2, ret)) >= 0) z += y; + } + return (double)z; } - pc[0] = ma->phi2[2*n2] * ma->z2[2*n2] / sum2 * (1. - ma->phi1[2*n1] * ma->z1[2*n1] / sum1); - pc[1] = ma->phi1[2*n1] * ma->z1[2*n1] / sum1 * (1. - ma->phi2[2*n2] * ma->z2[2*n2] / sum2); - pc[0] = pc[0] == 1.? 99 : (int)(-4.343 * log(1. - pc[0]) + .499); - pc[1] = pc[1] == 1.? 99 : (int)(-4.343 * log(1. - pc[1]) + .499); } static double mc_cal_afs(bcf_p1aux_t *ma, double *p_ref_folded, double *p_var_folded) @@ -403,6 +487,7 @@ int bcf_p1_cal(const bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst) int i, k; long double sum = 0.; ma->is_indel = bcf_is_indel(b); + rst->perm_rank = -1; // set PL and PL_len for (i = 0; i < b->n_gi; ++i) { if (b->gi[i].fmt == bcf_str2int("PL", 2)) { @@ -450,7 +535,9 @@ int bcf_p1_cal(const bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst) rst->cil = (double)(ma->M - h) / ma->M; rst->cih = (double)(ma->M - l) / ma->M; } rst->g[0] = rst->g[1] = rst->g[2] = -1.; - contrast(ma, rst->pc); + rst->cmp[0] = rst->cmp[1] = rst->cmp[2] = rst->p_chi2 = -1.0; + if (rst->p_var > 0.1) // skip contrast2() if the locus is a strong non-variant + rst->p_chi2 = contrast2(ma, rst->cmp); return 0; } diff --git a/bcftools/prob1.h b/bcftools/prob1.h index eb4dc92..3f89dda 100644 --- a/bcftools/prob1.h +++ b/bcftools/prob1.h @@ -7,10 +7,10 @@ struct __bcf_p1aux_t; typedef struct __bcf_p1aux_t bcf_p1aux_t; typedef struct { - int rank0; + int rank0, perm_rank; // NB: perm_rank is always set to -1 by bcf_p1_cal() double f_em, f_exp, f_flat, p_ref_folded, p_ref, p_var_folded, p_var; double cil, cih; - double pc[4]; + double cmp[3], p_chi2; // used by contrast2() double g[3]; } bcf_p1rst_t; diff --git a/bgzf.c b/bgzf.c index 66d6b02..db970c8 100644 --- a/bgzf.c +++ b/bgzf.c @@ -148,7 +148,7 @@ open_read(int fd) static BGZF* -open_write(int fd, bool is_uncompressed) +open_write(int fd, int compress_level) // compress_level==-1 for the default level { FILE* file = fdopen(fd, "w"); BGZF* fp; @@ -156,7 +156,9 @@ open_write(int fd, bool is_uncompressed) fp = malloc(sizeof(BGZF)); fp->file_descriptor = fd; fp->open_mode = 'w'; - fp->owned_file = 0; fp->is_uncompressed = is_uncompressed; + fp->owned_file = 0; + fp->compress_level = compress_level < 0? Z_DEFAULT_COMPRESSION : compress_level; // Z_DEFAULT_COMPRESSION==-1 + if (fp->compress_level > 9) fp->compress_level = Z_DEFAULT_COMPRESSION; #ifdef _USE_KNETFILE fp->x.fpw = file; #else @@ -195,13 +197,20 @@ bgzf_open(const char* __restrict path, const char* __restrict mode) fp = open_read(fd); #endif } else if (strchr(mode, 'w') || strchr(mode, 'W')) { - int fd, oflag = O_WRONLY | O_CREAT | O_TRUNC; + int fd, compress_level = -1, oflag = O_WRONLY | O_CREAT | O_TRUNC; #ifdef _WIN32 oflag |= O_BINARY; #endif fd = open(path, oflag, 0666); if (fd == -1) return 0; - fp = open_write(fd, strchr(mode, 'u')? 1 : 0); + { // set compress_level + int i; + for (i = 0; mode[i]; ++i) + if (mode[i] >= '0' && mode[i] <= '9') break; + if (mode[i]) compress_level = (int)mode[i] - '0'; + if (strchr(mode, 'u')) compress_level = 0; + } + fp = open_write(fd, compress_level); } if (fp != NULL) fp->owned_file = 1; return fp; @@ -214,7 +223,12 @@ bgzf_fdopen(int fd, const char * __restrict mode) if (mode[0] == 'r' || mode[0] == 'R') { return open_read(fd); } else if (mode[0] == 'w' || mode[0] == 'W') { - return open_write(fd, strstr(mode, "u")? 1 : 0); + int i, compress_level = -1; + for (i = 0; mode[i]; ++i) + if (mode[i] >= '0' && mode[i] <= '9') break; + if (mode[i]) compress_level = (int)mode[i] - '0'; + if (strchr(mode, 'u')) compress_level = 0; + return open_write(fd, compress_level); } else { return NULL; } @@ -254,7 +268,6 @@ deflate_block(BGZF* fp, int block_length) int input_length = block_length; int compressed_length = 0; while (1) { - int compress_level = fp->is_uncompressed? 0 : Z_DEFAULT_COMPRESSION; z_stream zs; zs.zalloc = NULL; zs.zfree = NULL; @@ -263,7 +276,7 @@ deflate_block(BGZF* fp, int block_length) zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH]; zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH; - int status = deflateInit2(&zs, compress_level, Z_DEFLATED, + int status = deflateInit2(&zs, fp->compress_level, Z_DEFLATED, GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY); if (status != Z_OK) { report_error(fp, "deflate init failed"); @@ -330,6 +343,7 @@ inflate_block(BGZF* fp, int block_length) // Inflate the block in fp->compressed_block into fp->uncompressed_block z_stream zs; + int status; zs.zalloc = NULL; zs.zfree = NULL; zs.next_in = fp->compressed_block + 18; @@ -337,7 +351,7 @@ inflate_block(BGZF* fp, int block_length) zs.next_out = fp->uncompressed_block; zs.avail_out = fp->uncompressed_block_size; - int status = inflateInit2(&zs, GZIP_WINDOW_BITS); + status = inflateInit2(&zs, GZIP_WINDOW_BITS); if (status != Z_OK) { report_error(fp, "inflate init failed"); return -1; @@ -431,7 +445,7 @@ int bgzf_read_block(BGZF* fp) { bgzf_byte_t header[BLOCK_HEADER_LENGTH]; - int count, size = 0; + int count, size = 0, block_length, remaining; #ifdef _USE_KNETFILE int64_t block_address = knet_tell(fp->x.fpr); if (load_block_from_cache(fp, block_address)) return 0; @@ -454,10 +468,10 @@ bgzf_read_block(BGZF* fp) report_error(fp, "invalid block header"); return -1; } - int block_length = unpackInt16((uint8_t*)&header[16]) + 1; + block_length = unpackInt16((uint8_t*)&header[16]) + 1; bgzf_byte_t* compressed_block = (bgzf_byte_t*) fp->compressed_block; memcpy(compressed_block, header, BLOCK_HEADER_LENGTH); - int remaining = block_length - BLOCK_HEADER_LENGTH; + remaining = block_length - BLOCK_HEADER_LENGTH; #ifdef _USE_KNETFILE count = knet_read(fp->x.fpr, &compressed_block[BLOCK_HEADER_LENGTH], remaining); #else @@ -494,7 +508,8 @@ bgzf_read(BGZF* fp, void* data, int length) int bytes_read = 0; bgzf_byte_t* output = data; while (bytes_read < length) { - int available = fp->block_length - fp->block_offset; + int copy_length, available = fp->block_length - fp->block_offset; + bgzf_byte_t *buffer; if (available <= 0) { if (bgzf_read_block(fp) != 0) { return -1; @@ -504,8 +519,8 @@ bgzf_read(BGZF* fp, void* data, int length) break; } } - int copy_length = bgzf_min(length-bytes_read, available); - bgzf_byte_t* buffer = fp->uncompressed_block; + copy_length = bgzf_min(length-bytes_read, available); + buffer = fp->uncompressed_block; memcpy(output, buffer + fp->block_offset, copy_length); fp->block_offset += copy_length; output += copy_length; @@ -552,6 +567,8 @@ int bgzf_flush_try(BGZF *fp, int size) int bgzf_write(BGZF* fp, const void* data, int length) { + const bgzf_byte_t *input = data; + int block_length, bytes_written; if (fp->open_mode != 'w') { report_error(fp, "file not open for writing"); return -1; @@ -560,9 +577,9 @@ int bgzf_write(BGZF* fp, const void* data, int length) if (fp->uncompressed_block == NULL) fp->uncompressed_block = malloc(fp->uncompressed_block_size); - const bgzf_byte_t* input = data; - int block_length = fp->uncompressed_block_size; - int bytes_written = 0; + input = data; + block_length = fp->uncompressed_block_size; + bytes_written = 0; while (bytes_written < length) { int copy_length = bgzf_min(block_length - fp->block_offset, length - bytes_written); bgzf_byte_t* buffer = fp->uncompressed_block; diff --git a/bgzf.h b/bgzf.h index 099ae9a..9dc7b5f 100644 --- a/bgzf.h +++ b/bgzf.h @@ -26,7 +26,6 @@ #include #include -#include #include #ifdef _USE_KNETFILE #include "knetfile.h" @@ -37,7 +36,7 @@ typedef struct { int file_descriptor; char open_mode; // 'r' or 'w' - bool owned_file, is_uncompressed; + int16_t owned_file, compress_level; #ifdef _USE_KNETFILE union { knetFile *fpr; diff --git a/sam.c b/sam.c index ecdee02..e195b0f 100644 --- a/sam.c +++ b/sam.c @@ -40,9 +40,9 @@ samfile_t *samopen(const char *fn, const char *mode, const void *aux) { samfile_t *fp; fp = (samfile_t*)calloc(1, sizeof(samfile_t)); - if (mode[0] == 'r') { // read + if (strchr(mode, 'r')) { // read fp->type |= TYPE_READ; - if (mode[1] == 'b') { // binary + if (strchr(mode, 'b')) { // binary fp->type |= TYPE_BAM; fp->x.bam = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r"); if (fp->x.bam == 0) goto open_err_ret; @@ -63,11 +63,15 @@ samfile_t *samopen(const char *fn, const char *mode, const void *aux) fprintf(stderr, "[samopen] no @SQ lines in the header.\n"); } else fprintf(stderr, "[samopen] SAM header is present: %d sequences.\n", fp->header->n_targets); } - } else if (mode[0] == 'w') { // write + } else if (strchr(mode, 'w')) { // write fp->header = bam_header_dup((const bam_header_t*)aux); - if (mode[1] == 'b') { // binary + if (strchr(mode, 'b')) { // binary char bmode[3]; - bmode[0] = 'w'; bmode[1] = strstr(mode, "u")? 'u' : 0; bmode[2] = 0; + int i, compress_level = -1; + for (i = 0; mode[i]; ++i) if (mode[i] >= '0' && mode[i] <= '9') break; + if (mode[i]) compress_level = mode[i] - '0'; + if (strchr(mode, 'u')) compress_level = 0; + bmode[0] = 'w'; bmode[1] = compress_level < 0? 0 : compress_level + '0'; bmode[2] = 0; fp->type |= TYPE_BAM; fp->x.bam = strcmp(fn, "-")? bam_open(fn, bmode) : bam_dopen(fileno(stdout), bmode); if (fp->x.bam == 0) goto open_err_ret; @@ -76,11 +80,11 @@ samfile_t *samopen(const char *fn, const char *mode, const void *aux) // open file fp->x.tamw = strcmp(fn, "-")? fopen(fn, "w") : stdout; if (fp->x.tamr == 0) goto open_err_ret; - if (strstr(mode, "X")) fp->type |= BAM_OFSTR<<2; - else if (strstr(mode, "x")) fp->type |= BAM_OFHEX<<2; + if (strchr(mode, 'X')) fp->type |= BAM_OFSTR<<2; + else if (strchr(mode, 'x')) fp->type |= BAM_OFHEX<<2; else fp->type |= BAM_OFDEC<<2; // write header - if (strstr(mode, "h")) { + if (strchr(mode, 'h')) { int i; bam_header_t *alt; // parse the header text diff --git a/sam_view.c b/sam_view.c index eb69449..170bd84 100644 --- a/sam_view.c +++ b/sam_view.c @@ -82,7 +82,7 @@ static int usage(int is_long_help); int main_samview(int argc, char *argv[]) { - int c, is_header = 0, is_header_only = 0, is_bamin = 1, ret = 0, is_uncompressed = 0, is_bamout = 0, slx2sngr = 0, is_count = 0; + int c, is_header = 0, is_header_only = 0, is_bamin = 1, ret = 0, compress_level = -1, is_bamout = 0, slx2sngr = 0, is_count = 0; int of_type = BAM_OFDEC, is_long_help = 0; int count = 0; samfile_t *in = 0, *out = 0; @@ -90,7 +90,7 @@ int main_samview(int argc, char *argv[]) /* parse command-line options */ strcpy(in_mode, "r"); strcpy(out_mode, "w"); - while ((c = getopt(argc, argv, "Sbct:hHo:q:f:F:ul:r:xX?T:CR:")) >= 0) { + while ((c = getopt(argc, argv, "Sbct:h1Ho:q:f:F:ul:r:xX?T:CR:")) >= 0) { switch (c) { case 'c': is_count = 1; break; case 'C': slx2sngr = 1; break; @@ -103,7 +103,8 @@ int main_samview(int argc, char *argv[]) case 'f': g_flag_on = strtol(optarg, 0, 0); break; case 'F': g_flag_off = strtol(optarg, 0, 0); break; case 'q': g_min_mapQ = atoi(optarg); break; - case 'u': is_uncompressed = 1; break; + case 'u': compress_level = 0; break; + case '1': compress_level = 1; break; case 'l': g_library = strdup(optarg); break; case 'r': g_rg = strdup(optarg); break; case 'R': fn_rg = strdup(optarg); break; @@ -114,7 +115,7 @@ int main_samview(int argc, char *argv[]) default: return usage(is_long_help); } } - if (is_uncompressed) is_bamout = 1; + if (compress_level >= 0) is_bamout = 1; if (is_header_only) is_header = 1; if (is_bamout) strcat(out_mode, "b"); else { @@ -123,7 +124,11 @@ int main_samview(int argc, char *argv[]) } if (is_bamin) strcat(in_mode, "b"); if (is_header) strcat(out_mode, "h"); - if (is_uncompressed) strcat(out_mode, "u"); + if (compress_level >= 0) { + char tmp[2]; + tmp[0] = compress_level + '0'; tmp[1] = '\0'; + strcat(out_mode, tmp); + } if (argc == optind) return usage(is_long_help); // potential memory leak... // read the list of read groups @@ -231,6 +236,7 @@ static int usage(int is_long_help) fprintf(stderr, " -H print header only (no alignments)\n"); fprintf(stderr, " -S input is SAM\n"); fprintf(stderr, " -u uncompressed BAM output (force -b)\n"); + fprintf(stderr, " -1 fast compression (force -b)\n"); fprintf(stderr, " -x output FLAG in HEX (samtools-C specific)\n"); fprintf(stderr, " -X output FLAG in string (samtools-C specific)\n"); fprintf(stderr, " -c print only the count of matching records\n"); diff --git a/samtools.1 b/samtools.1 index e0c7743..4834679 100644 --- a/samtools.1 +++ b/samtools.1 @@ -1,4 +1,4 @@ -.TH samtools 1 "1 March 2011" "samtools-0.1.13" "Bioinformatics tools" +.TH samtools 1 "16 March 2011" "samtools-0.1.14" "Bioinformatics tools" .SH NAME .PP samtools - Utilities for the Sequence Alignment/Map (SAM) format @@ -145,7 +145,10 @@ identifiers are absent, each input file is regarded as one sample. .B OPTIONS: .RS -.TP 8 +.TP 10 +.B -A +Do not skip anomalous read pairs in variant calling. +.TP .B -B Disable probabilistic realignment for the computation of base alignment quality (BAQ). BAQ is the Phred-scaled probability of a read base being @@ -159,6 +162,14 @@ being generated from the mapped position, the new mapping quality is about sqrt((INT-q)/INT)*INT. A zero value disables this functionality; if enabled, the recommended value for BWA is 50. [0] .TP +.BI -d \ INT +At a position, read maximally +.I INT +reads per input BAM. [250] +.TP +.B -D +Output per-sample read depth +.TP .BI -e \ INT Phred-scaled gap extension sequencing error probability. Reducing .I INT @@ -184,9 +195,17 @@ is modeled as .IR INT * s / l . [100] .TP +.B -I +Do not perform INDEL calling +.TP .BI -l \ FILE File containing a list of sites where pileup or BCF is outputted [null] .TP +.BI -L \ INT +Skip INDEL calling if the average per-sample depth is above +.IR INT . +[250] +.TP .BI -o \ INT Phred-scaled gap open sequencing error probability. Reducing .I INT @@ -210,6 +229,9 @@ Only generate pileup in region .I STR [all sites] .TP +.B -S +Output per-sample Phred-scaled strand bias P-value +.TP .B -u Similar to .B -g -- 2.39.2