KNETFILE_O= knetfile.o
LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \
bam_pileup.o bam_lpileup.o bam_md.o glf.o razf.o faidx.o \
- $(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o kprobaln.o
+ $(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o kprobaln.o bam_cat.o
AOBJS= bam_tview.o bam_maqcns.o bam_plcmd.o sam_view.o \
bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \
bamtk.o kaln.o bam2bcf.o bam2bcf_indel.o errmod.o sample.o \
+Beta release 0.1.14 (16 March, 2011)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This release implements a method for testing associations for case-control
+data. The method does not call genotypes but instead sums over all genotype
+configurations to compute a chi^2 based test statistics. It can be potentially
+applied to comparing a pair of samples (e.g. a tumor-normal pair), but this
+has not been evaluated on real data.
+
+Another new feature is to make X chromosome variant calls when female and male
+samples are both present. The user needs to provide a file indicating the
+ploidy of each sample.
+
+Other new features:
+
+ * Added `samtools mpileup -L' to skip INDEL calling in regions with
+ excessively high coverage. Such regions dramatically slow down mpileup.
+
+ * Added `bcftools view -F' to parse BCF files generated by samtools r921 or
+ older which encode PL in a different way.
+
+(0.1.14: 16 March 2011, r930:163)
+
+
+
Beta release 0.1.13 (1 March, 2011)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
* Updated the BCF spec.
* Added the `FQ' VCF INFO field, which gives the phred-scaled probability
- of all samples being the samei (identical to the reference or all homozygous
+ of all samples being the same (identical to the reference or all homozygous
variants). Option `view -f' has been dropped.
* Implementated of "vcfutils.pl vcf2fq" to generate a consensus sequence
@copyright Genome Research Ltd.
*/
-#define BAM_VERSION "0.1.13 (r926:134)"
+#define BAM_VERSION "0.1.13 (r926:161)"
#include <stdint.h>
#include <stdlib.h>
#define MPLP_EXT_BAQ 0x800
typedef struct {
- int max_mq, min_mq, flag, min_baseQ, capQ_thres, max_depth;
+ int max_mq, min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth;
int openQ, extQ, tandemQ, min_support; // for indels
double min_frac; // for indels
char *reg, *fn_pos, *pl_list;
extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list);
extern void bcf_call_del_rghash(void *rghash);
mplp_aux_t **data;
- int i, tid, pos, *n_plp, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid, max_depth;
+ int i, tid, pos, *n_plp, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid, max_depth, max_indel_depth;
const bam_pileup1_t **plp;
bam_mplp_t iter;
bam_header_t *h = 0;
max_depth = 8000 / sm->n;
fprintf(stderr, "<%s> Set max per-sample depth to %d\n", __func__, max_depth);
}
+ max_indel_depth = conf->max_indel_depth * sm->n;
bam_mplp_set_maxcnt(iter, max_depth);
while (bam_mplp_auto(iter, &tid, &pos, n_plp, plp) > 0) {
if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested
ref_tid = tid;
}
if (conf->flag & MPLP_GLF) {
- int _ref0, ref16;
+ int total_depth, _ref0, ref16;
bcf1_t *b = calloc(1, sizeof(bcf1_t));
+ for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i];
group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp);
_ref0 = (ref && pos < ref_len)? ref[pos] : 'N';
ref16 = bam_nt16_table[_ref0];
bcf_write(bp, bh, b);
bcf_destroy(b);
// call indels
- if (!(conf->flag&MPLP_NO_INDEL) && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) {
+ if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) {
for (i = 0; i < gplp.n; ++i)
bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i);
if (bcf_call_combine(gplp.n, bcr, -1, &bc) >= 0) {
mplp.max_mq = 60;
mplp.min_baseQ = 13;
mplp.capQ_thres = 0;
- mplp.max_depth = 250;
+ mplp.max_depth = 250; mplp.max_indel_depth = 250;
mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 100;
mplp.min_frac = 0.002; mplp.min_support = 1;
mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN;
- while ((c = getopt(argc, argv, "Agf:r:l:M:q:Q:uaRC:BDSd:b:P:o:e:h:Im:F:EG:")) >= 0) {
+ while ((c = getopt(argc, argv, "Agf:r:l:M:q:Q:uaRC:BDSd:L:b:P:o:e:h:Im:F:EG:")) >= 0) {
switch (c) {
case 'f':
mplp.fai = fai_load(optarg);
case 'A': use_orphan = 1; break;
case 'F': mplp.min_frac = atof(optarg); break;
case 'm': mplp.min_support = atoi(optarg); break;
+ case 'L': mplp.max_indel_depth = atoi(optarg); break;
case 'G': {
FILE *fp_rg;
char buf[1024];
fprintf(stderr, " -M INT cap mapping quality at INT [%d]\n", mplp.max_mq);
fprintf(stderr, " -Q INT min base quality [%d]\n", mplp.min_baseQ);
fprintf(stderr, " -q INT filter out alignment with MQ smaller than INT [%d]\n", mplp.min_mq);
- fprintf(stderr, " -d INT max per-sample depth [%d]\n", mplp.max_depth);
+ fprintf(stderr, " -d INT max per-BAM depth [%d]\n", mplp.max_depth);
+ fprintf(stderr, " -L INT max per-sample depth for INDEL calling [%d]\n", mplp.max_indel_depth);
fprintf(stderr, " -P STR comma separated list of platforms for indels [all]\n");
fprintf(stderr, " -o INT Phred-scaled gap open sequencing error probability [%d]\n", mplp.openQ);
fprintf(stderr, " -e INT Phred-scaled gap extension seq error probability [%d]\n", mplp.extQ);
static void sort_blocks(int n, int k, bam1_p *buf, const char *prefix, const bam_header_t *h, int is_stdout)
{
- char *name;
+ char *name, mode[3];
int i;
bamFile fp;
ks_mergesort(sort, k, buf, 0);
name = (char*)calloc(strlen(prefix) + 20, 1);
- if (n >= 0) sprintf(name, "%s.%.4d.bam", prefix, n);
- else sprintf(name, "%s.bam", prefix);
- fp = is_stdout? bam_dopen(fileno(stdout), "w") : bam_open(name, "w");
+ if (n >= 0) {
+ sprintf(name, "%s.%.4d.bam", prefix, n);
+ strcpy(mode, "w1");
+ } else {
+ sprintf(name, "%s.bam", prefix);
+ strcpy(mode, "w");
+ }
+ fp = is_stdout? bam_dopen(fileno(stdout), mode) : bam_open(name, mode);
if (fp == 0) {
fprintf(stderr, "[sort_blocks] fail to create file %s.\n", name);
free(name);
int main_reheader(int argc, char *argv[]);
int main_cut_target(int argc, char *argv[]);
int main_phase(int argc, char *argv[]);
+int main_cat(int argc, char *argv[]);
int faidx_main(int argc, char *argv[]);
int glf3_view_main(int argc, char *argv[]);
fprintf(stderr, " merge merge sorted alignments\n");
fprintf(stderr, " rmdup remove PCR duplicates\n");
fprintf(stderr, " reheader replace BAM header\n");
+ fprintf(stderr, " cat concatenate BAMs\n");
fprintf(stderr, " targetcut cut fosmid regions (for fosmid pool only)\n");
fprintf(stderr, " phase phase heterozygotes\n");
fprintf(stderr, "\n");
else if (strcmp(argv[1], "calmd") == 0) return bam_fillmd(argc-1, argv+1);
else if (strcmp(argv[1], "fillmd") == 0) return bam_fillmd(argc-1, argv+1);
else if (strcmp(argv[1], "reheader") == 0) return main_reheader(argc-1, argv+1);
+ else if (strcmp(argv[1], "cat") == 0) return main_cat(argc-1, argv+1);
else if (strcmp(argv[1], "targetcut") == 0) return main_cut_target(argc-1, argv+1);
else if (strcmp(argv[1], "phase") == 0) return main_phase(argc-1, argv+1);
#if _CURSES_LIB != 0
int bcf_is_indel(const bcf1_t *b);
bcf_hdr_t *bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int *list);
int bcf_subsam(int n_smpl, int *list, bcf1_t *b);
+ // move GT to the first FORMAT field
+ int bcf_fix_gt(bcf1_t *b);
+ // update PL generated by old samtools
+ int bcf_fix_pl(bcf1_t *b);
// string hash table
void *bcf_build_refhash(bcf_hdr_t *h);
\end{center}
\begin{center}
-\begin{tabular}{cll}
+\begin{tabular}{clp{9cm}}
\hline
\multicolumn{1}{l}{\bf Field} & \multicolumn{1}{l}{\bf Type} & \multicolumn{1}{l}{\bf Description} \\\hline
{\tt DP} & {\tt uint16\_t[n]} & Read depth \\
{\tt GL} & {\tt float[n*G]} & Log10 likelihood of data; $G=\frac{A(A+1)}{2}$, $A=\#\{alleles\}$\\
{\tt GT} & {\tt uint8\_t[n]} & {\tt missing\char60\char60 7 | phased\char60\char60 6 | allele1\char60\char60 3 | allele2} \\
-{\tt \_GT} & {\tt uint8\_t+uint8\_t[n*P]} & {Generic GT; the first int equals the max ploidy $P$} \\
+{\tt \_GT} & {\tt uint8\_t+uint8\_t[n*P]} & {Generic GT; the first int equals the max ploidy $P$. If the highest bit is set,
+ the allele is not present (e.g. due to different ploidy between samples).} \\
{\tt GQ} & {\tt uint8\_t[n]} & {Genotype quality}\\
{\tt HQ} & {\tt uint8\_t[n*2]} & {Haplotype quality}\\
{\tt \_HQ} & {\tt uint8\_t+uint8\_t[n*P]} & {Generic HQ}\\
BCF proposal).
\item Predefined {\tt FORMAT} fields can be missing from VCF headers, but custom {\tt FORMAT} fields
are required to be explicitly defined in the headers.
-\item A {\tt FORMAT} field with its name starting with `{\tt \_}' gives an alternative
- binary representation of the corresponding VCF field. The alternative representation
- is used when the default representation is unable to keep the genotype information,
- for example, when the ploidy is over 2 or there are more than 8 alleles.
+\item A {\tt FORMAT} field with its name starting with `{\tt \_}' is specific to BCF only.
+ It gives an alternative binary representation of the corresponding VCF field, in case
+ the default representation is unable to keep the genotype information,
+ for example, when the ploidy is not 2 or there are more than 8 alleles.
\end{itemize}
\end{document}
-.TH bcftools 1 "2 October 2010" "bcftools" "Bioinformatics tools"
+.TH bcftools 1 "16 March 2011" "bcftools" "Bioinformatics tools"
.SH NAME
.PP
bcftools - Utilities for the Binary Call Format (BCF) and VCF.
.TP 10
.B view
.B bcftools view
-.RB [ \-cbuSAGgHvNQ ]
-.RB [ \-1
-.IR nGroup1 ]
+.RB [ \-AbFGNQSucgv ]
+.RB [ \-D
+.IR seqDict ]
.RB [ \-l
-.IR listFile ]
+.IR listLoci ]
+.RB [ \-s
+.IR listSample ]
+.RB [ \-i
+.IR gapSNPratio ]
.RB [ \-t
.IR mutRate ]
.RB [ \-p
.IR varThres ]
.RB [ \-P
.IR prior ]
+.RB [ \-1
+.IR nGroup1 ]
+.RB [ \-U
+.IR nPerm ]
+.RB [ \-X
+.IR permThres ]
.I in.bcf
.RI [ region ]
Convert between BCF and VCF, call variant candidates and estimate allele
frequencies.
-.B OPTIONS:
.RS
+.TP
+.B Input/Output Options:
+.TP 10
+.B -A
+Retain all possible alternate alleles at variant sites. By default, the view
+command discards unlikely alleles.
.TP 10
.B -b
Output in the BCF format. The default is VCF.
.TP
-.B -c
-Call variants.
+.BI -D \ FILE
+Sequence dictionary (list of chromosome names) for VCF->BCF conversion [null]
.TP
-.B -v
-Output variant sites only (force -c)
-.TP
-.B -g
-Call per-sample genotypes at variant sites (force -c)
-.TP
-.B -u
-Uncompressed BCF output (force -b).
-.TP
-.B -S
-The input is VCF instead of BCF.
-.TP
-.B -A
-Retain all possible alternate alleles at variant sites. By default, this
-command discards unlikely alleles.
+.B -F
+Indicate PL is generated by r921 or before (ordering is different).
.TP
.B -G
Suppress all individual genotype information.
.TP
-.B -H
-Perform Hardy-Weiberg Equilibrium test. This will add computation time, sometimes considerably.
+.BI -l \ FILE
+List of sites at which information are outputted [all sites]
.TP
.B -N
Skip sites where the REF field is not A/C/G/T
.B -Q
Output the QCALL likelihood format
.TP
-.B -f
-Reference-free variant calling mode. In this mode, the prior will be
-folded; a variant is called iff the sample(s) contains at least two
-alleles; the QUAL field in the VCF/BCF output is changed accordingly.
+.BI -s \ FILE
+List of samples to use. In the output, the ordering of samples will be identical
+to the one in
+.IR FILE .
+[null]
.TP
-.BI "-1 " INT
-Number of group-1 samples. This option is used for dividing input into
-two groups for comparing. A zero value disables this functionality. [0]
+.B -S
+The input is VCF instead of BCF.
.TP
-.BI "-l " FILE
-List of sites at which information are outputted [all sites]
+.B -u
+Uncompressed BCF output (force -b).
.TP
-.BI "-t " FLOAT
-Scaled muttion rate for variant calling [0.001]
+.B Consensus/Variant Calling Options:
+.TP 10
+.B -c
+Call variants.
+.TP
+.B -g
+Call per-sample genotypes at variant sites (force -c)
.TP
-.BI "-p " FLOAT
+.BI -i \ FLOAT
+Ratio of INDEL-to-SNP mutation rate [0.15]
+.TP
+.BI -p \ FLOAT
A site is considered to be a variant if P(ref|D)<FLOAT [0.5]
.TP
-.BI "-P " STR
+.BI -P \ STR
Prior or initial allele frequency spectrum. If STR can be
.IR full ,
.IR cond2 ,
.I flat
or the file consisting of error output from a previous variant calling
run.
+.TP
+.BI -t \ FLOAT
+Scaled muttion rate for variant calling [0.001]
+.TP
+.B -v
+Output variant sites only (force -c)
+.TP
+.B Contrast Calling and Association Test Options:
+.TP
+.BI -1 \ INT
+Number of group-1 samples. This option is used for dividing the samples into
+two groups for contrast SNP calling or association test.
+When this option is in use, the following VCF INFO will be outputted:
+PC2, PCHI2 and QCHI2. [0]
+.TP
+.BI -U \ INT
+Number of permutations for association test (effective only with
+.BR -1 )
+[0]
+.TP
+.BI -X \ FLOAT
+Only perform permutations for P(chi^2)<FLOAT (effective only with
+.BR -U )
+[0.01]
.RE
.TP
Concatenate BCF files. The input files are required to be sorted and
have identical samples appearing in the same order.
.RE
+
+.SH BCFTOOLS SPECIFIC VCF TAGS
+
+.TS
+center box;
+cb | cb | cb
+l | l | l .
+Tag Format Description
+_
+AF1 double Max-likelihood estimate of the site allele frequency (AF) of the first ALT allele
+CI95 double[2] Equal-tail Bayesian credible interval of AF at the 95% level
+DP int Raw read depth (without quality filtering)
+DP4 int[4] # high-quality reference forward bases, ref reverse, alternate for and alt rev bases
+FQ int Consensus quality. Positive: sample genotypes different; negative: otherwise
+MQ int Root-Mean-Square mapping quality of covering reads
+PC2 int[2] Phred probability of AF in group1 samples being larger (,smaller) than in group2
+PCHI2 double Posterior weighted chi^2 P-value between group1 and group2 samples
+PV4 double[4] P-value for strand bias, baseQ bias, mapQ bias and tail distance bias
+QCHI2 int Phred-scaled PCHI2
+RP int # permutations yielding a smaller PCHI2
+.TE
return 0;
}
+int bcf_fix_pl(bcf1_t *b)
+{
+ int i;
+ uint32_t tmp;
+ uint8_t *PL, *swap;
+ bcf_ginfo_t *gi;
+ // pinpoint PL
+ tmp = bcf_str2int("PL", 2);
+ for (i = 0; i < b->n_gi; ++i)
+ if (b->gi[i].fmt == tmp) break;
+ if (i == b->n_gi) return 0;
+ // prepare
+ gi = b->gi + i;
+ PL = (uint8_t*)gi->data;
+ swap = alloca(gi->len);
+ // loop through individuals
+ for (i = 0; i < b->n_smpl; ++i) {
+ int k, l, x;
+ uint8_t *PLi = PL + i * gi->len;
+ memcpy(swap, PLi, gi->len);
+ for (k = x = 0; k < b->n_alleles; ++k)
+ for (l = k; l < b->n_alleles; ++l)
+ PLi[l*(l+1)/2 + k] = swap[x++];
+ }
+ return 0;
+}
+
static void *locate_field(const bcf1_t *b, const char *fmt, int l)
{
int i;
return 0;
}
+// FIXME: only data are shuffled; the header is NOT
+int bcf_shuffle(bcf1_t *b, int seed)
+{
+ int i, j, *a;
+ if (seed > 0) srand48(seed);
+ a = malloc(b->n_smpl * sizeof(int));
+ for (i = 0; i < b->n_smpl; ++i) a[i] = i;
+ for (i = b->n_smpl; i > 1; --i) {
+ int tmp;
+ j = (int)(drand48() * i);
+ tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp;
+ }
+ for (j = 0; j < b->n_gi; ++j) {
+ bcf_ginfo_t *gi = b->gi + j;
+ uint8_t *swap, *data = (uint8_t*)gi->data;
+ swap = malloc(gi->len * b->n_smpl);
+ for (i = 0; i < b->n_smpl; ++i)
+ memcpy(swap + gi->len * a[i], data + gi->len * i, gi->len);
+ free(gi->data);
+ gi->data = swap;
+ }
+ free(a);
+ return 0;
+}
+
bcf_hdr_t *bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int *list)
{
int i, ret, j;
kstring_t s;
s.l = s.m = 0; s.s = 0;
hash = kh_init(str2id);
- for (i = 0; i < n; ++i)
- k = kh_put(str2id, hash, samples[i], &ret);
- for (i = j = 0; i < h0->n_smpl; ++i) {
- if (kh_get(str2id, hash, h0->sns[i]) != kh_end(hash)) {
- list[j++] = i;
- kputs(h0->sns[i], &s); kputc('\0', &s);
+ for (i = 0; i < h0->n_smpl; ++i) {
+ k = kh_put(str2id, hash, h0->sns[i], &ret);
+ kh_val(hash, k) = i;
+ }
+ for (i = j = 0; i < n; ++i) {
+ k = kh_get(str2id, hash, samples[i]);
+ if (k != kh_end(hash)) {
+ list[j++] = kh_val(hash, k);
+ kputs(samples[i], &s); kputc('\0', &s);
}
}
if (j < n) fprintf(stderr, "<%s> %d samples in the list but not in BCF.", __func__, n - j);
return h;
}
-int bcf_subsam(int n_smpl, int *list, bcf1_t *b) // list MUST BE sorted
+int bcf_subsam(int n_smpl, int *list, bcf1_t *b)
{
int i, j;
for (j = 0; j < b->n_gi; ++j) {
bcf_ginfo_t *gi = b->gi + j;
+ uint8_t *swap;
+ swap = malloc(gi->len * b->n_smpl);
for (i = 0; i < n_smpl; ++i)
- if (i != list[i]) memcpy((uint8_t*)gi->data + i * gi->len, (uint8_t*)gi->data + list[i] * gi->len, gi->len);
+ memcpy(swap + i * gi->len, (uint8_t*)gi->data + list[i] * gi->len, gi->len);
+ free(gi->data);
+ gi->data = swap;
}
b->n_smpl = n_smpl;
return 0;
#include "bcf.h"
#include "prob1.h"
#include "kstring.h"
+#include "time.h"
#include "khash.h"
KHASH_SET_INIT_INT64(set64)
#define VC_ADJLD 4096
#define VC_NO_INDEL 8192
#define VC_ANNO_MAX 16384
+#define VC_FIX_PL 32768
typedef struct {
- int flag, prior_type, n1, n_sub, *sublist;
+ int flag, prior_type, n1, n_sub, *sublist, n_perm;
char *fn_list, *prior_file, **subsam, *fn_dict;
uint8_t *ploidy;
- double theta, pref, indel_frac;
+ double theta, pref, indel_frac, min_perm_p;
} viewconf_t;
khash_t(set64) *bcf_load_pos(const char *fn, bcf_hdr_t *_h)
static int update_bcf1(int n_smpl, bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p1rst_t *pr, double pref, int flag)
{
kstring_t s;
- int is_var = (pr->p_ref < pref);
- double r = is_var? pr->p_ref : pr->p_var, fq;
+ int has_I16, is_var = (pr->p_ref < pref);
+ double fq, r = is_var? pr->p_ref : pr->p_var;
anno16_t a;
- test16(b, &a);
+ has_I16 = test16(b, &a) >= 0? 1 : 0;
rm_info(b, "I16=");
memset(&s, 0, sizeof(kstring_t));
if (b->info[0]) kputc(';', &s);
// ksprintf(&s, "AF1=%.4lg;AFE=%.4lg;CI95=%.4lg,%.4lg", 1.-pr->f_em, 1.-pr->f_exp, pr->cil, pr->cih);
ksprintf(&s, "AF1=%.4g;CI95=%.4g,%.4g", 1.-pr->f_em, pr->cil, pr->cih);
- ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq);
+ if (has_I16) ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq);
fq = pr->p_ref_folded < 0.5? -4.343 * log(pr->p_ref_folded) : 4.343 * log(pr->p_var_folded);
if (fq < -999) fq = -999;
if (fq > 999) fq = 999;
ksprintf(&s, ";FQ=%.3g", fq);
- if (a.is_tested) {
- if (pr->pc[0] >= 0.) ksprintf(&s, ";PC4=%g,%g,%g,%g", pr->pc[0], pr->pc[1], pr->pc[2], pr->pc[3]);
- ksprintf(&s, ";PV4=%.2g,%.2g,%.2g,%.2g", a.p[0], a.p[1], a.p[2], a.p[3]);
+ if (pr->cmp[0] >= 0.) {
+ int i, q[3], pq;
+ for (i = 1; i < 3; ++i) {
+ double x = pr->cmp[i] + pr->cmp[0]/2.;
+ q[i] = x == 0? 255 : (int)(-4.343 * log(x) + .499);
+ if (q[i] > 255) q[i] = 255;
+ }
+ pq = (int)(-4.343 * log(pr->p_chi2) + .499);
+ if (pr->perm_rank >= 0) ksprintf(&s, ";PR=%d", pr->perm_rank);
+ ksprintf(&s, ";QCHI2=%d;PCHI2=%.3g;PC2=%d,%d", pq, q[1], q[2], pr->p_chi2);
+// ksprintf(&s, ",%g,%g,%g", pr->cmp[0], pr->cmp[1], pr->cmp[2]);
}
+ if (has_I16 && a.is_tested) ksprintf(&s, ";PV4=%.2g,%.2g,%.2g,%.2g", a.p[0], a.p[1], a.p[2], a.p[3]);
kputc('\0', &s);
kputs(b->fmt, &s); kputc('\0', &s);
free(b->str);
if (!strstr(str.s, "##INFO=<ID=MQ,"))
kputs("##INFO=<ID=MQ,Number=1,Type=Integer,Description=\"Root-mean-square mapping quality of covering reads\">\n", &str);
if (!strstr(str.s, "##INFO=<ID=FQ,"))
- kputs("##INFO=<ID=FQ,Number=1,Type=Float,Description=\"Phred probability that sample chromosomes are not all the same\">\n", &str);
+ kputs("##INFO=<ID=FQ,Number=1,Type=Float,Description=\"Phred probability of all samples being the same\">\n", &str);
if (!strstr(str.s, "##INFO=<ID=AF1,"))
kputs("##INFO=<ID=AF1,Number=1,Type=Float,Description=\"Max-likelihood estimate of the site allele frequency of the first ALT allele\">\n", &str);
if (!strstr(str.s, "##INFO=<ID=CI95,"))
kputs("##INFO=<ID=PV4,Number=4,Type=Float,Description=\"P-values for strand bias, baseQ bias, mapQ bias and tail distance bias\">\n", &str);
if (!strstr(str.s, "##INFO=<ID=INDEL,"))
kputs("##INFO=<ID=INDEL,Number=0,Type=Flag,Description=\"Indicates that the variant is an INDEL.\">\n", &str);
- if (!strstr(str.s, "##INFO=<ID=GT,"))
+ if (!strstr(str.s, "##INFO=<ID=PC2,"))
+ kputs("##INFO=<ID=PC2,Number=2,Type=Integer,Description=\"Phred probability of the nonRef allele frequency in group1 samples being larger (,smaller) than in group2.\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=PCHI2,"))
+ kputs("##INFO=<ID=PCHI2,Number=1,Type=Float,Description=\"Posterior weighted chi^2 P-value for testing the association between group1 and group2 samples.\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=QCHI2,"))
+ kputs("##INFO=<ID=QCHI2,Number=1,Type=Integer,Description=\"Phred scaled PCHI2.\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=RP,"))
+ kputs("##INFO=<ID=RP,Number=1,Type=Integer,Description=\"# permutations yielding a smaller PCHI2.\">\n", &str);
+ if (!strstr(str.s, "##FORMAT=<ID=GT,"))
kputs("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n", &str);
if (!strstr(str.s, "##FORMAT=<ID=GQ,"))
kputs("##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">\n", &str);
extern void bcf_p1_indel_prior(bcf_p1aux_t *ma, double x);
extern int bcf_fix_gt(bcf1_t *b);
extern int bcf_anno_max(bcf1_t *b);
+ extern int bcf_shuffle(bcf1_t *b, int seed);
bcf_t *bp, *bout = 0;
bcf1_t *b, *blast;
- int c;
+ int c, *seeds = 0;
uint64_t n_processed = 0;
viewconf_t vc;
bcf_p1aux_t *p1 = 0;
tid = begin = end = -1;
memset(&vc, 0, sizeof(viewconf_t));
- vc.prior_type = vc.n1 = -1; vc.theta = 1e-3; vc.pref = 0.5; vc.indel_frac = -1.;
- while ((c = getopt(argc, argv, "N1:l:cHAGvbSuP:t:p:QgLi:IMs:D:")) >= 0) {
+ vc.prior_type = vc.n1 = -1; vc.theta = 1e-3; vc.pref = 0.5; vc.indel_frac = -1.; vc.n_perm = 0; vc.min_perm_p = 0.01;
+ while ((c = getopt(argc, argv, "FN1:l:cHAGvbSuP:t:p:QgLi:IMs:D:U:X:")) >= 0) {
switch (c) {
case '1': vc.n1 = atoi(optarg); break;
case 'l': vc.fn_list = strdup(optarg); break;
case 'D': vc.fn_dict = strdup(optarg); break;
+ case 'F': vc.flag |= VC_FIX_PL; break;
case 'N': vc.flag |= VC_ACGT_ONLY; break;
case 'G': vc.flag |= VC_NO_GENO; break;
case 'A': vc.flag |= VC_KEEPALT; break;
case 'i': vc.indel_frac = atof(optarg); break;
case 'Q': vc.flag |= VC_QCALL; break;
case 'L': vc.flag |= VC_ADJLD; break;
+ case 'U': vc.n_perm = atoi(optarg); break;
+ case 'X': vc.min_perm_p = atof(optarg); break;
case 's': vc.subsam = read_samples(optarg, &vc.n_sub);
vc.ploidy = calloc(vc.n_sub + 1, 1);
for (tid = 0; tid < vc.n_sub; ++tid) vc.ploidy[tid] = vc.subsam[tid][strlen(vc.subsam[tid]) + 1];
fprintf(stderr, " -S input is VCF\n");
fprintf(stderr, " -A keep all possible alternate alleles at variant sites\n");
fprintf(stderr, " -G suppress all individual genotype information\n");
- fprintf(stderr, " -H perform Hardy-Weinberg test (slower)\n");
fprintf(stderr, " -N skip sites where REF is not A/C/G/T\n");
fprintf(stderr, " -Q output the QCALL likelihood format\n");
fprintf(stderr, " -L calculate LD for adjacent sites\n");
fprintf(stderr, " -I skip indels\n");
+ fprintf(stderr, " -F PL generated by r921 or before\n");
fprintf(stderr, " -D FILE sequence dictionary for VCF->BCF conversion [null]\n");
- fprintf(stderr, " -1 INT number of group-1 samples [0]\n");
fprintf(stderr, " -l FILE list of sites to output [all sites]\n");
fprintf(stderr, " -t FLOAT scaled substitution mutation rate [%.4g]\n", vc.theta);
fprintf(stderr, " -i FLOAT indel-to-substitution ratio [%.4g]\n", vc.indel_frac);
fprintf(stderr, " -p FLOAT variant if P(ref|D)<FLOAT [%.3g]\n", vc.pref);
fprintf(stderr, " -P STR type of prior: full, cond2, flat [full]\n");
fprintf(stderr, " -s FILE list of samples to use [all samples]\n");
+ fprintf(stderr, " -1 INT number of group-1 samples [0]\n");
+ fprintf(stderr, " -U INT number of permutations for association testing (effective with -1) [0]\n");
+ fprintf(stderr, " -X FLOAT only perform permutations for P(chi^2)<FLOAT [%g]\n", vc.min_perm_p);
fprintf(stderr, "\n");
return 1;
}
fprintf(stderr, "[%s] For VCF->BCF conversion please specify the sequence dictionary with -D\n", __func__);
return 1;
}
+ if (vc.n1 <= 0) vc.n_perm = 0; // TODO: give a warning here!
+ if (vc.n_perm > 0) {
+ seeds = malloc(vc.n_perm * sizeof(int));
+ srand48(time(0));
+ for (c = 0; c < vc.n_perm; ++c) seeds[c] = lrand48();
+ }
b = calloc(1, sizeof(bcf1_t));
blast = calloc(1, sizeof(bcf1_t));
strcpy(moder, "r");
while (vcf_read(bp, hin, b) > 0) {
int is_indel;
if (vc.n_sub) bcf_subsam(vc.n_sub, vc.sublist, b);
+ if (vc.flag & VC_FIX_PL) bcf_fix_pl(b);
is_indel = bcf_is_indel(b);
if ((vc.flag & VC_NO_INDEL) && is_indel) continue;
if ((vc.flag & VC_ACGT_ONLY) && !is_indel) {
bcf_p1_dump_afs(p1);
}
if (pr.p_ref >= vc.pref && (vc.flag & VC_VARONLY)) continue;
+ if (vc.n_perm && vc.n1 > 0 && pr.p_chi2 < vc.min_perm_p) { // permutation test
+ bcf_p1rst_t r;
+ int i, n = 0;
+ for (i = 0; i < vc.n_perm; ++i) {
+ bcf_shuffle(b, seeds[i]);
+ bcf_p1_cal(b, p1, &r);
+ if (pr.p_chi2 >= r.p_chi2) ++n;
+ }
+ pr.perm_rank = n;
+ }
update_bcf1(hout->n_smpl, b, p1, &pr, vc.pref, vc.flag);
}
if (vc.flag & VC_ADJLD) { // compute LD
for (i = 0; i < vc.n_sub; ++i) free(vc.subsam[i]);
free(vc.subsam); free(vc.sublist);
}
+ if (seeds) free(seeds);
if (p1) bcf_p1_destroy(p1);
return 0;
}
double *phi, *phi_indel;
double *z, *zswap; // aux for afs
double *z1, *z2, *phi1, *phi2; // only calculated when n1 is set
+ double **hg; // hypergeometric distribution
double t, t1, t2;
double *afs, *afs1; // afs: accumulative AFS; afs1: site posterior distribution
const uint8_t *PL; // point to PL
void bcf_p1_destroy(bcf_p1aux_t *ma)
{
if (ma) {
+ int k;
+ if (ma->hg && ma->n1 > 0) {
+ for (k = 0; k <= 2*ma->n1; ++k) free(ma->hg[k]);
+ free(ma->hg);
+ }
free(ma->ploidy); free(ma->q2p); free(ma->pdg);
free(ma->phi); free(ma->phi_indel); free(ma->phi1); free(ma->phi2);
free(ma->z); free(ma->zswap); free(ma->z1); free(ma->z2);
}
for (i = 0, sum = 0.; i < 3; ++i)
sum += (g[i] = pdg[i] * f3[i]);
- for (i = 0, max = -1., max_i = 0; i <= ploidy; ++i) {
+ for (i = 0, max = -1., max_i = 0; i < 3; ++i) {
g[i] /= sum;
if (g[i] > max) max = g[i], max_i = i;
}
last_min = last_max = 0;
ma->t = 0.;
if (ma->M == ma->n * 2) {
+ int M = 0;
for (_j = beg; _j < ma->n; ++_j) {
- int k, j = _j - beg, _min = last_min, _max = last_max;
+ int k, j = _j - beg, _min = last_min, _max = last_max, M0;
double p[3], sum;
+ M0 = M; M += 2;
pdg = ma->pdg + _j * 3;
p[0] = pdg[0]; p[1] = 2. * pdg[1]; p[2] = pdg[2];
for (; _min < _max && z[0][_min] < TINY; ++_min) z[0][_min] = z[1][_min] = 0.;
for (; _max > _min && z[0][_max] < TINY; --_max) z[0][_max] = z[1][_max] = 0.;
_max += 2;
- if (_min == 0)
- k = 0, z[1][k] = (2*j+2-k)*(2*j-k+1) * p[0] * z[0][k];
- if (_min <= 1)
- k = 1, z[1][k] = (2*j+2-k)*(2*j-k+1) * p[0] * z[0][k] + k*(2*j+2-k) * p[1] * z[0][k-1];
+ if (_min == 0) k = 0, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k];
+ if (_min <= 1) k = 1, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k] + k*(M0-k+2) * p[1] * z[0][k-1];
for (k = _min < 2? 2 : _min; k <= _max; ++k)
- z[1][k] = (2*j+2-k)*(2*j-k+1) * p[0] * z[0][k]
- + k*(2*j+2-k) * p[1] * z[0][k-1]
- + k*(k-1)* p[2] * z[0][k-2];
+ z[1][k] = (M0-k+1)*(M0-k+2) * p[0] * z[0][k] + k*(M0-k+2) * p[1] * z[0][k-1] + k*(k-1)* p[2] * z[0][k-2];
for (k = _min, sum = 0.; k <= _max; ++k) sum += z[1][k];
- ma->t += log(sum / ((2. * j + 2) * (2. * j + 1)));
+ ma->t += log(sum / (M * (M - 1.)));
for (k = _min; k <= _max; ++k) z[1][k] /= sum;
if (_min >= 1) z[1][_min-1] = 0.;
if (_min >= 2) z[1][_min-2] = 0.;
tmp = z[0]; z[0] = z[1]; z[1] = tmp;
last_min = _min; last_max = _max;
}
+ //for (_j = 0; _j < last_min; ++_j) z[0][_j] = 0.; // TODO: are these necessary?
+ //for (_j = last_max + 1; _j < ma->M; ++_j) z[0][_j] = 0.;
} else { // this block is very similar to the block above; these two might be merged in future
int j, M = 0;
for (j = 0; j < ma->n; ++j) {
} else mc_cal_y_core(ma, 0);
}
-static void contrast(bcf_p1aux_t *ma, double pc[4]) // mc_cal_y() must be called before hand
+#define CONTRAST_TINY 1e-30
+
+extern double kf_gammaq(double s, double z); // incomplete gamma function for chi^2 test
+
+static inline double chi2_test(int a, int b, int c, int d)
+{
+ double x, z;
+ x = (double)(a+b) * (c+d) * (b+d) * (a+c);
+ if (x == 0.) return 1;
+ z = a * d - b * c;
+ return kf_gammaq(.5, .5 * z * z * (a+b+c+d) / x);
+}
+
+// chi2=(a+b+c+d)(ad-bc)^2/[(a+b)(c+d)(a+c)(b+d)]
+static inline double contrast2_aux(const bcf_p1aux_t *p1, double sum, int n1, int n2, int k1, int k2, double x[3])
+{
+ double p = p1->phi[k1+k2] * p1->z1[k1] * p1->z2[k2] / sum * p1->hg[k1][k2];
+ if (p < CONTRAST_TINY) return -1;
+ if (.5*k1/n1 < .5*k2/n2) x[1] += p;
+ else if (.5*k1/n1 > .5*k2/n2) x[2] += p;
+ else x[0] += p;
+ return p * chi2_test(k1, k2, (n1<<1) - k1, (n2<<1) - k2);
+}
+
+static double contrast2(bcf_p1aux_t *p1, double ret[3])
{
- int k, n1 = ma->n1, n2 = ma->n - ma->n1;
- long double sum1, sum2;
- pc[0] = pc[1] = pc[2] = pc[3] = -1.;
- if (n1 <= 0 || n2 <= 0) return;
- for (k = 0, sum1 = 0.; k <= 2*n1; ++k) sum1 += ma->phi1[k] * ma->z1[k];
- for (k = 0, sum2 = 0.; k <= 2*n2; ++k) sum2 += ma->phi2[k] * ma->z2[k];
- pc[2] = ma->phi1[2*n1] * ma->z1[2*n1] / sum1;
- pc[3] = ma->phi2[2*n2] * ma->z2[2*n2] / sum2;
- for (k = 2; k < 4; ++k) {
- pc[k] = pc[k] > .5? -(-4.343 * log(1. - pc[k] + TINY) + .499) : -4.343 * log(pc[k] + TINY) + .499;
- pc[k] = (int)pc[k];
- if (pc[k] > 99) pc[k] = 99;
- if (pc[k] < -99) pc[k] = -99;
+ int k, k1, k2, k10, k20, n1, n2;
+ double sum;
+ // get n1 and n2
+ n1 = p1->n1; n2 = p1->n - p1->n1;
+ if (n1 <= 0 || n2 <= 0) return 0.;
+ if (p1->hg == 0) { // initialize the hypergeometric distribution
+ /* NB: the hg matrix may take a lot of memory when there are many samples. There is a way
+ to avoid precomputing this matrix, but it is slower and quite intricate. The following
+ computation in this block can be accelerated with a similar strategy, but perhaps this
+ is not a serious concern for now. */
+ double tmp = lgamma(2*(n1+n2)+1) - (lgamma(2*n1+1) + lgamma(2*n2+1));
+ p1->hg = calloc(2*n1+1, sizeof(void*));
+ for (k1 = 0; k1 <= 2*n1; ++k1) {
+ p1->hg[k1] = calloc(2*n2+1, sizeof(double));
+ for (k2 = 0; k2 <= 2*n2; ++k2)
+ p1->hg[k1][k2] = exp(lgamma(k1+k2+1) + lgamma(p1->M-k1-k2+1) - (lgamma(k1+1) + lgamma(k2+1) + lgamma(2*n1-k1+1) + lgamma(2*n2-k2+1) + tmp));
+ }
+ }
+ { // compute sum1 and sum2
+ long double suml = 0;
+ for (k = 0; k <= p1->M; ++k) suml += p1->phi[k] * p1->z[k];
+ sum = suml;
+ }
+ { // get the mean k1 and k2
+ double max;
+ int max_k;
+ for (k = 0, max = 0, max_k = -1; k <= 2*n1; ++k) {
+ double x = p1->phi1[k] * p1->z1[k];
+ if (x > max) max = x, max_k = k;
+ }
+ k10 = max_k;
+ for (k = 0, max = 0, max_k = -1; k <= 2*n2; ++k) {
+ double x = p1->phi2[k] * p1->z2[k];
+ if (x > max) max = x, max_k = k;
+ }
+ k20 = max_k;
+ }
+ { // We can do the following with one nested loop, but that is an O(N^2) thing. The following code block is much faster for large N.
+ double x[3], y;
+ long double z = 0.;
+ x[0] = x[1] = x[2] = 0;
+ for (k1 = k10; k1 >= 0; --k1) {
+ for (k2 = k20; k2 >= 0; --k2) {
+ if ((y = contrast2_aux(p1, sum, n1, n2, k1, k2, x)) < 0) break;
+ else z += y;
+ }
+ for (k2 = k20 + 1; k2 <= 2*n2; ++k2) {
+ if ((y = contrast2_aux(p1, sum, n1, n2, k1, k2, x)) < 0) break;
+ else z += y;
+ }
+ }
+ ret[0] = x[0]; ret[1] = x[1]; ret[2] = x[2];
+ x[0] = x[1] = x[2] = 0;
+ for (k1 = k10 + 1; k1 <= 2*n1; ++k1) {
+ for (k2 = k20; k2 >= 0; --k2) {
+ if ((y = contrast2_aux(p1, sum, n1, n2, k1, k2, x)) < 0) break;
+ else z += y;
+ }
+ for (k2 = k20 + 1; k2 <= 2*n2; ++k2) {
+ if ((y = contrast2_aux(p1, sum, n1, n2, k1, k2, x)) < 0) break;
+ else z += y;
+ }
+ }
+ ret[0] += x[0]; ret[1] += x[1]; ret[2] += x[2];
+ if (ret[0] + ret[1] + ret[2] < 0.99) { // in case of bad things happened
+ ret[0] = ret[1] = ret[2] = 0;
+ for (k1 = 0, z = 0.; k1 <= 2*n1; ++k1)
+ for (k2 = 0; k2 <= 2*n2; ++k2)
+ if ((y = contrast2_aux(p1, sum, n1, n2, k1, k2, ret)) >= 0) z += y;
+ }
+ return (double)z;
}
- pc[0] = ma->phi2[2*n2] * ma->z2[2*n2] / sum2 * (1. - ma->phi1[2*n1] * ma->z1[2*n1] / sum1);
- pc[1] = ma->phi1[2*n1] * ma->z1[2*n1] / sum1 * (1. - ma->phi2[2*n2] * ma->z2[2*n2] / sum2);
- pc[0] = pc[0] == 1.? 99 : (int)(-4.343 * log(1. - pc[0]) + .499);
- pc[1] = pc[1] == 1.? 99 : (int)(-4.343 * log(1. - pc[1]) + .499);
}
static double mc_cal_afs(bcf_p1aux_t *ma, double *p_ref_folded, double *p_var_folded)
int i, k;
long double sum = 0.;
ma->is_indel = bcf_is_indel(b);
+ rst->perm_rank = -1;
// set PL and PL_len
for (i = 0; i < b->n_gi; ++i) {
if (b->gi[i].fmt == bcf_str2int("PL", 2)) {
rst->cil = (double)(ma->M - h) / ma->M; rst->cih = (double)(ma->M - l) / ma->M;
}
rst->g[0] = rst->g[1] = rst->g[2] = -1.;
- contrast(ma, rst->pc);
+ rst->cmp[0] = rst->cmp[1] = rst->cmp[2] = rst->p_chi2 = -1.0;
+ if (rst->p_var > 0.1) // skip contrast2() if the locus is a strong non-variant
+ rst->p_chi2 = contrast2(ma, rst->cmp);
return 0;
}
typedef struct __bcf_p1aux_t bcf_p1aux_t;
typedef struct {
- int rank0;
+ int rank0, perm_rank; // NB: perm_rank is always set to -1 by bcf_p1_cal()
double f_em, f_exp, f_flat, p_ref_folded, p_ref, p_var_folded, p_var;
double cil, cih;
- double pc[4];
+ double cmp[3], p_chi2; // used by contrast2()
double g[3];
} bcf_p1rst_t;
static
BGZF*
-open_write(int fd, bool is_uncompressed)
+open_write(int fd, int compress_level) // compress_level==-1 for the default level
{
FILE* file = fdopen(fd, "w");
BGZF* fp;
fp = malloc(sizeof(BGZF));
fp->file_descriptor = fd;
fp->open_mode = 'w';
- fp->owned_file = 0; fp->is_uncompressed = is_uncompressed;
+ fp->owned_file = 0;
+ fp->compress_level = compress_level < 0? Z_DEFAULT_COMPRESSION : compress_level; // Z_DEFAULT_COMPRESSION==-1
+ if (fp->compress_level > 9) fp->compress_level = Z_DEFAULT_COMPRESSION;
#ifdef _USE_KNETFILE
fp->x.fpw = file;
#else
fp = open_read(fd);
#endif
} else if (strchr(mode, 'w') || strchr(mode, 'W')) {
- int fd, oflag = O_WRONLY | O_CREAT | O_TRUNC;
+ int fd, compress_level = -1, oflag = O_WRONLY | O_CREAT | O_TRUNC;
#ifdef _WIN32
oflag |= O_BINARY;
#endif
fd = open(path, oflag, 0666);
if (fd == -1) return 0;
- fp = open_write(fd, strchr(mode, 'u')? 1 : 0);
+ { // set compress_level
+ int i;
+ for (i = 0; mode[i]; ++i)
+ if (mode[i] >= '0' && mode[i] <= '9') break;
+ if (mode[i]) compress_level = (int)mode[i] - '0';
+ if (strchr(mode, 'u')) compress_level = 0;
+ }
+ fp = open_write(fd, compress_level);
}
if (fp != NULL) fp->owned_file = 1;
return fp;
if (mode[0] == 'r' || mode[0] == 'R') {
return open_read(fd);
} else if (mode[0] == 'w' || mode[0] == 'W') {
- return open_write(fd, strstr(mode, "u")? 1 : 0);
+ int i, compress_level = -1;
+ for (i = 0; mode[i]; ++i)
+ if (mode[i] >= '0' && mode[i] <= '9') break;
+ if (mode[i]) compress_level = (int)mode[i] - '0';
+ if (strchr(mode, 'u')) compress_level = 0;
+ return open_write(fd, compress_level);
} else {
return NULL;
}
int input_length = block_length;
int compressed_length = 0;
while (1) {
- int compress_level = fp->is_uncompressed? 0 : Z_DEFAULT_COMPRESSION;
z_stream zs;
zs.zalloc = NULL;
zs.zfree = NULL;
zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH];
zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
- int status = deflateInit2(&zs, compress_level, Z_DEFLATED,
+ int status = deflateInit2(&zs, fp->compress_level, Z_DEFLATED,
GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
if (status != Z_OK) {
report_error(fp, "deflate init failed");
// Inflate the block in fp->compressed_block into fp->uncompressed_block
z_stream zs;
+ int status;
zs.zalloc = NULL;
zs.zfree = NULL;
zs.next_in = fp->compressed_block + 18;
zs.next_out = fp->uncompressed_block;
zs.avail_out = fp->uncompressed_block_size;
- int status = inflateInit2(&zs, GZIP_WINDOW_BITS);
+ status = inflateInit2(&zs, GZIP_WINDOW_BITS);
if (status != Z_OK) {
report_error(fp, "inflate init failed");
return -1;
bgzf_read_block(BGZF* fp)
{
bgzf_byte_t header[BLOCK_HEADER_LENGTH];
- int count, size = 0;
+ int count, size = 0, block_length, remaining;
#ifdef _USE_KNETFILE
int64_t block_address = knet_tell(fp->x.fpr);
if (load_block_from_cache(fp, block_address)) return 0;
report_error(fp, "invalid block header");
return -1;
}
- int block_length = unpackInt16((uint8_t*)&header[16]) + 1;
+ block_length = unpackInt16((uint8_t*)&header[16]) + 1;
bgzf_byte_t* compressed_block = (bgzf_byte_t*) fp->compressed_block;
memcpy(compressed_block, header, BLOCK_HEADER_LENGTH);
- int remaining = block_length - BLOCK_HEADER_LENGTH;
+ remaining = block_length - BLOCK_HEADER_LENGTH;
#ifdef _USE_KNETFILE
count = knet_read(fp->x.fpr, &compressed_block[BLOCK_HEADER_LENGTH], remaining);
#else
int bytes_read = 0;
bgzf_byte_t* output = data;
while (bytes_read < length) {
- int available = fp->block_length - fp->block_offset;
+ int copy_length, available = fp->block_length - fp->block_offset;
+ bgzf_byte_t *buffer;
if (available <= 0) {
if (bgzf_read_block(fp) != 0) {
return -1;
break;
}
}
- int copy_length = bgzf_min(length-bytes_read, available);
- bgzf_byte_t* buffer = fp->uncompressed_block;
+ copy_length = bgzf_min(length-bytes_read, available);
+ buffer = fp->uncompressed_block;
memcpy(output, buffer + fp->block_offset, copy_length);
fp->block_offset += copy_length;
output += copy_length;
int bgzf_write(BGZF* fp, const void* data, int length)
{
+ const bgzf_byte_t *input = data;
+ int block_length, bytes_written;
if (fp->open_mode != 'w') {
report_error(fp, "file not open for writing");
return -1;
if (fp->uncompressed_block == NULL)
fp->uncompressed_block = malloc(fp->uncompressed_block_size);
- const bgzf_byte_t* input = data;
- int block_length = fp->uncompressed_block_size;
- int bytes_written = 0;
+ input = data;
+ block_length = fp->uncompressed_block_size;
+ bytes_written = 0;
while (bytes_written < length) {
int copy_length = bgzf_min(block_length - fp->block_offset, length - bytes_written);
bgzf_byte_t* buffer = fp->uncompressed_block;
#include <stdint.h>
#include <stdio.h>
-#include <stdbool.h>
#include <zlib.h>
#ifdef _USE_KNETFILE
#include "knetfile.h"
typedef struct {
int file_descriptor;
char open_mode; // 'r' or 'w'
- bool owned_file, is_uncompressed;
+ int16_t owned_file, compress_level;
#ifdef _USE_KNETFILE
union {
knetFile *fpr;
{
samfile_t *fp;
fp = (samfile_t*)calloc(1, sizeof(samfile_t));
- if (mode[0] == 'r') { // read
+ if (strchr(mode, 'r')) { // read
fp->type |= TYPE_READ;
- if (mode[1] == 'b') { // binary
+ if (strchr(mode, 'b')) { // binary
fp->type |= TYPE_BAM;
fp->x.bam = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r");
if (fp->x.bam == 0) goto open_err_ret;
fprintf(stderr, "[samopen] no @SQ lines in the header.\n");
} else fprintf(stderr, "[samopen] SAM header is present: %d sequences.\n", fp->header->n_targets);
}
- } else if (mode[0] == 'w') { // write
+ } else if (strchr(mode, 'w')) { // write
fp->header = bam_header_dup((const bam_header_t*)aux);
- if (mode[1] == 'b') { // binary
+ if (strchr(mode, 'b')) { // binary
char bmode[3];
- bmode[0] = 'w'; bmode[1] = strstr(mode, "u")? 'u' : 0; bmode[2] = 0;
+ int i, compress_level = -1;
+ for (i = 0; mode[i]; ++i) if (mode[i] >= '0' && mode[i] <= '9') break;
+ if (mode[i]) compress_level = mode[i] - '0';
+ if (strchr(mode, 'u')) compress_level = 0;
+ bmode[0] = 'w'; bmode[1] = compress_level < 0? 0 : compress_level + '0'; bmode[2] = 0;
fp->type |= TYPE_BAM;
fp->x.bam = strcmp(fn, "-")? bam_open(fn, bmode) : bam_dopen(fileno(stdout), bmode);
if (fp->x.bam == 0) goto open_err_ret;
// open file
fp->x.tamw = strcmp(fn, "-")? fopen(fn, "w") : stdout;
if (fp->x.tamr == 0) goto open_err_ret;
- if (strstr(mode, "X")) fp->type |= BAM_OFSTR<<2;
- else if (strstr(mode, "x")) fp->type |= BAM_OFHEX<<2;
+ if (strchr(mode, 'X')) fp->type |= BAM_OFSTR<<2;
+ else if (strchr(mode, 'x')) fp->type |= BAM_OFHEX<<2;
else fp->type |= BAM_OFDEC<<2;
// write header
- if (strstr(mode, "h")) {
+ if (strchr(mode, 'h')) {
int i;
bam_header_t *alt;
// parse the header text
int main_samview(int argc, char *argv[])
{
- int c, is_header = 0, is_header_only = 0, is_bamin = 1, ret = 0, is_uncompressed = 0, is_bamout = 0, slx2sngr = 0, is_count = 0;
+ int c, is_header = 0, is_header_only = 0, is_bamin = 1, ret = 0, compress_level = -1, is_bamout = 0, slx2sngr = 0, is_count = 0;
int of_type = BAM_OFDEC, is_long_help = 0;
int count = 0;
samfile_t *in = 0, *out = 0;
/* parse command-line options */
strcpy(in_mode, "r"); strcpy(out_mode, "w");
- while ((c = getopt(argc, argv, "Sbct:hHo:q:f:F:ul:r:xX?T:CR:")) >= 0) {
+ while ((c = getopt(argc, argv, "Sbct:h1Ho:q:f:F:ul:r:xX?T:CR:")) >= 0) {
switch (c) {
case 'c': is_count = 1; break;
case 'C': slx2sngr = 1; break;
case 'f': g_flag_on = strtol(optarg, 0, 0); break;
case 'F': g_flag_off = strtol(optarg, 0, 0); break;
case 'q': g_min_mapQ = atoi(optarg); break;
- case 'u': is_uncompressed = 1; break;
+ case 'u': compress_level = 0; break;
+ case '1': compress_level = 1; break;
case 'l': g_library = strdup(optarg); break;
case 'r': g_rg = strdup(optarg); break;
case 'R': fn_rg = strdup(optarg); break;
default: return usage(is_long_help);
}
}
- if (is_uncompressed) is_bamout = 1;
+ if (compress_level >= 0) is_bamout = 1;
if (is_header_only) is_header = 1;
if (is_bamout) strcat(out_mode, "b");
else {
}
if (is_bamin) strcat(in_mode, "b");
if (is_header) strcat(out_mode, "h");
- if (is_uncompressed) strcat(out_mode, "u");
+ if (compress_level >= 0) {
+ char tmp[2];
+ tmp[0] = compress_level + '0'; tmp[1] = '\0';
+ strcat(out_mode, tmp);
+ }
if (argc == optind) return usage(is_long_help); // potential memory leak...
// read the list of read groups
fprintf(stderr, " -H print header only (no alignments)\n");
fprintf(stderr, " -S input is SAM\n");
fprintf(stderr, " -u uncompressed BAM output (force -b)\n");
+ fprintf(stderr, " -1 fast compression (force -b)\n");
fprintf(stderr, " -x output FLAG in HEX (samtools-C specific)\n");
fprintf(stderr, " -X output FLAG in string (samtools-C specific)\n");
fprintf(stderr, " -c print only the count of matching records\n");
-.TH samtools 1 "1 March 2011" "samtools-0.1.13" "Bioinformatics tools"
+.TH samtools 1 "16 March 2011" "samtools-0.1.14" "Bioinformatics tools"
.SH NAME
.PP
samtools - Utilities for the Sequence Alignment/Map (SAM) format
.B OPTIONS:
.RS
-.TP 8
+.TP 10
+.B -A
+Do not skip anomalous read pairs in variant calling.
+.TP
.B -B
Disable probabilistic realignment for the computation of base alignment
quality (BAQ). BAQ is the Phred-scaled probability of a read base being
about sqrt((INT-q)/INT)*INT. A zero value disables this
functionality; if enabled, the recommended value for BWA is 50. [0]
.TP
+.BI -d \ INT
+At a position, read maximally
+.I INT
+reads per input BAM. [250]
+.TP
+.B -D
+Output per-sample read depth
+.TP
.BI -e \ INT
Phred-scaled gap extension sequencing error probability. Reducing
.I INT
.IR INT * s / l .
[100]
.TP
+.B -I
+Do not perform INDEL calling
+.TP
.BI -l \ FILE
File containing a list of sites where pileup or BCF is outputted [null]
.TP
+.BI -L \ INT
+Skip INDEL calling if the average per-sample depth is above
+.IR INT .
+[250]
+.TP
.BI -o \ INT
Phred-scaled gap open sequencing error probability. Reducing
.I INT
.I STR
[all sites]
.TP
+.B -S
+Output per-sample Phred-scaled strand bias P-value
+.TP
.B -u
Similar to
.B -g