@copyright Genome Research Ltd.
*/
-#define BAM_VERSION "0.1.13 (r926:161)"
+#define BAM_VERSION "0.1.14 (r933:170)"
#include <stdint.h>
#include <stdlib.h>
#define MPLP_FMT_SP 0x200
#define MPLP_NO_INDEL 0x400
#define MPLP_EXT_BAQ 0x800
+#define MPLP_ILLUMINA13 0x1000
typedef struct {
int max_mq, min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth;
skip = (rg && bcf_str2id(ma->conf->rghash, (const char*)(rg+1)) >= 0);
if (skip) continue;
}
+ if (ma->conf->flag & MPLP_ILLUMINA13) {
+ int i;
+ uint8_t *qual = bam1_qual(b);
+ for (i = 0; i < b->core.l_qseq; ++i)
+ qual[i] = qual[i] > 31? qual[i] - 31 : 0;
+ }
has_ref = (ma->ref && ma->ref_id == b->core.tid)? 1 : 0;
skip = 0;
if (has_ref && (ma->conf->flag&MPLP_REALN)) bam_prob_realn_core(b, ma->ref, (ma->conf->flag & MPLP_EXT_BAQ)? 3 : 1);
mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 100;
mplp.min_frac = 0.002; mplp.min_support = 1;
mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN;
- while ((c = getopt(argc, argv, "Agf:r:l:M:q:Q:uaRC:BDSd:L:b:P:o:e:h:Im:F:EG:")) >= 0) {
+ while ((c = getopt(argc, argv, "Agf:r:l:M:q:Q:uaRC:BDSd:L:b:P:o:e:h:Im:F:EG:6")) >= 0) {
switch (c) {
case 'f':
mplp.fai = fai_load(optarg);
case 'S': mplp.flag |= MPLP_FMT_SP; break;
case 'I': mplp.flag |= MPLP_NO_INDEL; break;
case 'E': mplp.flag |= MPLP_EXT_BAQ; break;
+ case '6': mplp.flag |= MPLP_ILLUMINA13; break;
case 'C': mplp.capQ_thres = atoi(optarg); break;
case 'M': mplp.max_mq = atoi(optarg); break;
case 'q': mplp.min_mq = atoi(optarg); break;
fprintf(stderr, " -m INT minimum gapped reads for indel candidates [%d]\n", mplp.min_support);
fprintf(stderr, " -F FLOAT minimum fraction of gapped reads for candidates [%g]\n", mplp.min_frac);
fprintf(stderr, " -G FILE exclude read groups listed in FILE [null]\n");
+ fprintf(stderr, " -6 quality is in the Illumina-1.3+ encoding\n");
fprintf(stderr, " -A use anomalous read pairs in SNP/INDEL calling\n");
fprintf(stderr, " -g generate BCF output\n");
fprintf(stderr, " -u do not compress BCF output\n");
.IR prior ]
.RB [ \-1
.IR nGroup1 ]
+.RB [ \-d
+.IR minFrac ]
.RB [ \-U
.IR nPerm ]
.RB [ \-X
Output the QCALL likelihood format
.TP
.BI -s \ FILE
-List of samples to use. In the output, the ordering of samples will be identical
-to the one in
+List of samples to use. The first column in the input gives the sample names
+and the second gives the ploidy, which can only be 1 or 2. When the 2nd column
+is absent, the sample ploidy is assumed to be 2. In the output, the ordering of
+samples will be identical to the one in
.IR FILE .
[null]
.TP
.B -c
Call variants.
.TP
+.BI -d \ FLOAT
+When
+.B -v
+is in use, skip loci where the fraction of samples covered by reads is below FLOAT. [0]
+.TP
.B -g
Call per-sample genotypes at variant sites (force -c)
.TP
return 0;
}
+int bcf_smpl_covered(const bcf1_t *b)
+{
+ int i, j, n = 0;
+ uint32_t tmp;
+ bcf_ginfo_t *gi;
+ // pinpoint PL
+ tmp = bcf_str2int("PL", 2);
+ for (i = 0; i < b->n_gi; ++i)
+ if (b->gi[i].fmt == tmp) break;
+ if (i == b->n_gi) return 0;
+ // count how many samples having PL!=[0..0]
+ gi = b->gi + i;
+ for (i = 0; i < b->n_smpl; ++i) {
+ uint8_t *PLi = ((uint8_t*)gi->data) + i * gi->len;
+ for (j = 0; j < gi->len; ++j)
+ if (PLi[j]) break;
+ if (j < gi->len) ++n;
+ }
+ return n;
+}
+
static void *locate_field(const bcf1_t *b, const char *fmt, int l)
{
int i;
int flag, prior_type, n1, n_sub, *sublist, n_perm;
char *fn_list, *prior_file, **subsam, *fn_dict;
uint8_t *ploidy;
- double theta, pref, indel_frac, min_perm_p;
+ double theta, pref, indel_frac, min_perm_p, min_smpl_frac;
} viewconf_t;
khash_t(set64) *bcf_load_pos(const char *fn, bcf_hdr_t *_h)
tid = begin = end = -1;
memset(&vc, 0, sizeof(viewconf_t));
- vc.prior_type = vc.n1 = -1; vc.theta = 1e-3; vc.pref = 0.5; vc.indel_frac = -1.; vc.n_perm = 0; vc.min_perm_p = 0.01;
- while ((c = getopt(argc, argv, "FN1:l:cHAGvbSuP:t:p:QgLi:IMs:D:U:X:")) >= 0) {
+ vc.prior_type = vc.n1 = -1; vc.theta = 1e-3; vc.pref = 0.5; vc.indel_frac = -1.; vc.n_perm = 0; vc.min_perm_p = 0.01; vc.min_smpl_frac = 0;
+ while ((c = getopt(argc, argv, "FN1:l:cHAGvbSuP:t:p:QgLi:IMs:D:U:X:d:")) >= 0) {
switch (c) {
case '1': vc.n1 = atoi(optarg); break;
case 'l': vc.fn_list = strdup(optarg); break;
case 'L': vc.flag |= VC_ADJLD; break;
case 'U': vc.n_perm = atoi(optarg); break;
case 'X': vc.min_perm_p = atof(optarg); break;
+ case 'd': vc.min_smpl_frac = atof(optarg); break;
case 's': vc.subsam = read_samples(optarg, &vc.n_sub);
vc.ploidy = calloc(vc.n_sub + 1, 1);
for (tid = 0; tid < vc.n_sub; ++tid) vc.ploidy[tid] = vc.subsam[tid][strlen(vc.subsam[tid]) + 1];
fprintf(stderr, " -L calculate LD for adjacent sites\n");
fprintf(stderr, " -I skip indels\n");
fprintf(stderr, " -F PL generated by r921 or before\n");
+ fprintf(stderr, " -d FLOAT skip loci where less than FLOAT fraction of samples covered [0]\n");
fprintf(stderr, " -D FILE sequence dictionary for VCF->BCF conversion [null]\n");
fprintf(stderr, " -l FILE list of sites to output [all sites]\n");
fprintf(stderr, " -t FLOAT scaled substitution mutation rate [%.4g]\n", vc.theta);
}
while (vcf_read(bp, hin, b) > 0) {
int is_indel;
+ if ((vc.flag & VC_VARONLY) && strcmp(b->alt, "X") == 0) continue;
+ if ((vc.flag & VC_VARONLY) && vc.min_smpl_frac > 0.) {
+ extern int bcf_smpl_covered(const bcf1_t *b);
+ int n = bcf_smpl_covered(b);
+ if ((double)n / b->n_smpl < vc.min_smpl_frac) continue;
+ }
if (vc.n_sub) bcf_subsam(vc.n_sub, vc.sublist, b);
if (vc.flag & VC_FIX_PL) bcf_fix_pl(b);
is_indel = bcf_is_indel(b);
This command is much faster than replacing the header with a
BAM->SAM->BAM conversion.
+.TP
+.B cat
+samtools cat [-h header.sam] [-o out.bam] <in1.bam> <in2.bam> [ ... ]
+
+Concatenate BAMs. The sequence dictionary of each input BAM must be identical,
+although this command does not check this. This command uses a similar trick
+to
+.B reheader
+which enables fast BAM concatenation.
+
.TP
.B sort
samtools sort [-no] [-m maxMem] <in.bam> <out.prefix>