* samtools-0.1.9-17 (r824)

author Heng Li <lh3@live.co.uk>

Tue, 16 Nov 2010 04:31:53 +0000 (04:31 +0000)

committer Heng Li <lh3@live.co.uk>

Tue, 16 Nov 2010 04:31:53 +0000 (04:31 +0000)
author Heng Li <lh3@live.co.uk>
Tue, 16 Nov 2010 04:31:53 +0000 (04:31 +0000)
committer Heng Li <lh3@live.co.uk>
Tue, 16 Nov 2010 04:31:53 +0000 (04:31 +0000)
diff --git a/bam_plcmd.c b/bam_plcmd.c

index ad6134b3ee7cdeb179c3a86fd46658b0aef83aef..e3f73aaddd03a14343a82084ee060d9c41ba42d4 100644 (file)
--- a/bam_plcmd.c
+++ b/bam_plcmd.c
@@ -532,9 +532,11 @@ int bam_pileup(int argc, char *argv[])
  #define MPLP_REALN   0x80
  #define MPLP_FMT_DP 0x100
  #define MPLP_FMT_SP 0x200
  #define MPLP_REALN   0x80
  #define MPLP_FMT_DP 0x100
  #define MPLP_FMT_SP 0x200
+#define MPLP_NO_INDEL 0x400
  
  typedef struct {
         int max_mq, min_mq, flag, min_baseQ, capQ_thres, max_depth;
  
  typedef struct {
         int max_mq, min_mq, flag, min_baseQ, capQ_thres, max_depth;
+       int openQ, extQ, tandemQ;
         char *reg, *fn_pos, *pl_list;
         faidx_t *fai;
         kh_64_t *hash;
         char *reg, *fn_pos, *pl_list;
         faidx_t *fai;
         kh_64_t *hash;
@@ -698,6 +700,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
                 bca = bcf_call_init(-1., conf->min_baseQ);
                 bcr = calloc(sm->n, sizeof(bcf_callret1_t));
                 bca->rghash = rghash;
                 bca = bcf_call_init(-1., conf->min_baseQ);
                 bcr = calloc(sm->n, sizeof(bcf_callret1_t));
                 bca->rghash = rghash;
+               bca->openQ = conf->openQ, bca->extQ = conf->extQ, bca->tandemQ = conf->tandemQ;
         }
         ref_tid = -1; ref = 0;
         iter = bam_mplp_init(n, mplp_func, (void**)data);
         }
         ref_tid = -1; ref = 0;
         iter = bam_mplp_init(n, mplp_func, (void**)data);
@@ -736,7 +739,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
                         bcf_write(bp, bh, b);
                         bcf_destroy(b);
                         // call indels
                         bcf_write(bp, bh, b);
                         bcf_destroy(b);
                         // call indels
-                       if (bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) {
+                       if (!(conf->flag&MPLP_NO_INDEL) && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) {
                                 for (i = 0; i < gplp.n; ++i)
                                         bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i);
                                 if (bcf_call_combine(gplp.n, bcr, -1, &bc) >= 0) {
                                 for (i = 0; i < gplp.n; ++i)
                                         bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i);
                                 if (bcf_call_combine(gplp.n, bcr, -1, &bc) >= 0) {
@@ -853,8 +856,9 @@ int bam_mpileup(int argc, char *argv[])
         mplp.min_baseQ = 13;
         mplp.capQ_thres = 0;
         mplp.max_depth = 250;
         mplp.min_baseQ = 13;
         mplp.capQ_thres = 0;
         mplp.max_depth = 250;
+       mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 100;
         mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN;
         mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN;
-       while ((c = getopt(argc, argv, "gf:r:l:M:q:Q:uaORC:BDSd:b:P:")) >= 0) {
+       while ((c = getopt(argc, argv, "gf:r:l:M:q:Q:uaORC:BDSd:b:P:o:e:h:I")) >= 0) {
                 switch (c) {
                 case 'f':
                         mplp.fai = fai_load(optarg);
                 switch (c) {
                 case 'f':
                         mplp.fai = fai_load(optarg);
@@ -872,11 +876,15 @@ int bam_mpileup(int argc, char *argv[])
                 case 'R': mplp.flag |= MPLP_REALN; break;
                 case 'D': mplp.flag |= MPLP_FMT_DP; break;
                 case 'S': mplp.flag |= MPLP_FMT_SP; break;
                 case 'R': mplp.flag |= MPLP_REALN; break;
                 case 'D': mplp.flag |= MPLP_FMT_DP; break;
                 case 'S': mplp.flag |= MPLP_FMT_SP; break;
+               case 'I': mplp.flag |= MPLP_NO_INDEL; break;
                 case 'C': mplp.capQ_thres = atoi(optarg); break;
                 case 'M': mplp.max_mq = atoi(optarg); break;
                 case 'q': mplp.min_mq = atoi(optarg); break;
                 case 'Q': mplp.min_baseQ = atoi(optarg); break;
          case 'b': file_list = optarg; break;
                 case 'C': mplp.capQ_thres = atoi(optarg); break;
                 case 'M': mplp.max_mq = atoi(optarg); break;
                 case 'q': mplp.min_mq = atoi(optarg); break;
                 case 'Q': mplp.min_baseQ = atoi(optarg); break;
          case 'b': file_list = optarg; break;
+               case 'o': mplp.openQ = atoi(optarg); break;
+               case 'e': mplp.extQ = atoi(optarg); break;
+               case 'h': mplp.tandemQ = atoi(optarg); break;
                 }
         }
         if (argc == 1) {
                 }
         }
         if (argc == 1) {
@@ -891,11 +899,15 @@ int bam_mpileup(int argc, char *argv[])
                 fprintf(stderr, "         -q INT      filter out alignment with MQ smaller than INT [%d]\n", mplp.min_mq);
                 fprintf(stderr, "         -d INT      max per-sample depth [%d]\n", mplp.max_depth);
                 fprintf(stderr, "         -P STR      comma separated list of platforms for indels [all]\n");
                 fprintf(stderr, "         -q INT      filter out alignment with MQ smaller than INT [%d]\n", mplp.min_mq);
                 fprintf(stderr, "         -d INT      max per-sample depth [%d]\n", mplp.max_depth);
                 fprintf(stderr, "         -P STR      comma separated list of platforms for indels [all]\n");
+               fprintf(stderr, "         -o INT      Phred-scaled gap open sequencing error probability [%d]\n", mplp.openQ);
+               fprintf(stderr, "         -e INT      Phred-scaled gap extension seq error probability [%d]\n", mplp.extQ);
+               fprintf(stderr, "         -h INT      coefficient for homopolyer errors [%d]\n", mplp.tandemQ);
                 fprintf(stderr, "         -g          generate BCF output\n");
                 fprintf(stderr, "         -u          do not compress BCF output\n");
                 fprintf(stderr, "         -B          disable BAQ computation\n");
                 fprintf(stderr, "         -D          output per-sample DP\n");
                 fprintf(stderr, "         -S          output per-sample SP (strand bias P-value, slow)\n");
                 fprintf(stderr, "         -g          generate BCF output\n");
                 fprintf(stderr, "         -u          do not compress BCF output\n");
                 fprintf(stderr, "         -B          disable BAQ computation\n");
                 fprintf(stderr, "         -D          output per-sample DP\n");
                 fprintf(stderr, "         -S          output per-sample SP (strand bias P-value, slow)\n");
+               fprintf(stderr, "         -I          do not perform indel calling\n");
                 fprintf(stderr, "\n");
                 fprintf(stderr, "Notes: Assuming diploid individuals.\n\n");
                 return 1;
                 fprintf(stderr, "\n");
                 fprintf(stderr, "Notes: Assuming diploid individuals.\n\n");
                 return 1;
diff --git a/bamtk.c b/bamtk.c

index ef11b06a7366db2a2f1f0088d46824d894fed8a6..7923b04007fbce7359c504bb6d105b4e518bd129 100644 (file)
--- a/bamtk.c
+++ b/bamtk.c
@@ -9,7 +9,7 @@
  #endif
  
  #ifndef PACKAGE_VERSION
  #endif
  
  #ifndef PACKAGE_VERSION
-#define PACKAGE_VERSION "0.1.9-16 (r823)"
+#define PACKAGE_VERSION "0.1.9-17 (r824)"
  #endif
  
  int bam_taf2baf(int argc, char *argv[]);
  #endif
  
  int bam_taf2baf(int argc, char *argv[]);
diff --git a/bcftools/call1.c b/bcftools/call1.c

index 5a5c1cd1aba3e1c7e15a00638ee468186d052a52..15f6f8bcc7095c37ea5fb589efdce630680b07b7 100644 (file)
--- a/bcftools/call1.c
+++ b/bcftools/call1.c
@@ -29,7 +29,7 @@ KSTREAM_INIT(gzFile, gzread, 16384)
  typedef struct {
         int flag, prior_type, n1;
         char *fn_list, *prior_file;
  typedef struct {
         int flag, prior_type, n1;
         char *fn_list, *prior_file;
-       double theta, pref;
+       double theta, pref, indel_frac;
  } viewconf_t;
  
  khash_t(set64) *bcf_load_pos(const char *fn, bcf_hdr_t *_h)
  } viewconf_t;
  
  khash_t(set64) *bcf_load_pos(const char *fn, bcf_hdr_t *_h)
@@ -200,6 +200,7 @@ double bcf_ld_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4]);
  int bcfview(int argc, char *argv[])
  {
         extern int bcf_2qcall(bcf_hdr_t *h, bcf1_t *b);
  int bcfview(int argc, char *argv[])
  {
         extern int bcf_2qcall(bcf_hdr_t *h, bcf1_t *b);
+       extern void bcf_p1_indel_prior(bcf_p1aux_t *ma, double x);
         bcf_t *bp, *bout = 0;
         bcf1_t *b, *blast;
         int c;
         bcf_t *bp, *bout = 0;
         bcf1_t *b, *blast;
         int c;
@@ -213,8 +214,8 @@ int bcfview(int argc, char *argv[])
  
         tid = begin = end = -1;
         memset(&vc, 0, sizeof(viewconf_t));
  
         tid = begin = end = -1;
         memset(&vc, 0, sizeof(viewconf_t));
-       vc.prior_type = vc.n1 = -1; vc.theta = 1e-3; vc.pref = 0.5;
-       while ((c = getopt(argc, argv, "N1:l:cHAGvbSuP:t:p:QgL")) >= 0) {
+       vc.prior_type = vc.n1 = -1; vc.theta = 1e-3; vc.pref = 0.5; vc.indel_frac = -1.;
+       while ((c = getopt(argc, argv, "N1:l:cHAGvbSuP:t:p:QgLi:")) >= 0) {
                 switch (c) {
                 case '1': vc.n1 = atoi(optarg); break;
                 case 'l': vc.fn_list = strdup(optarg); break;
                 switch (c) {
                 case '1': vc.n1 = atoi(optarg); break;
                 case 'l': vc.fn_list = strdup(optarg); break;
@@ -230,6 +231,7 @@ int bcfview(int argc, char *argv[])
                 case 'g': vc.flag |= VC_CALL_GT | VC_CALL; break;
                 case 't': vc.theta = atof(optarg); break;
                 case 'p': vc.pref = atof(optarg); break;
                 case 'g': vc.flag |= VC_CALL_GT | VC_CALL; break;
                 case 't': vc.theta = atof(optarg); break;
                 case 'p': vc.pref = atof(optarg); break;
+               case 'i': vc.indel_frac = atof(optarg); break;
                 case 'Q': vc.flag |= VC_QCALL; break;
                 case 'L': vc.flag |= VC_ADJLD; break;
                 case 'P':
                 case 'Q': vc.flag |= VC_QCALL; break;
                 case 'L': vc.flag |= VC_ADJLD; break;
                 case 'P':
@@ -257,7 +259,8 @@ int bcfview(int argc, char *argv[])
                 fprintf(stderr, "         -L        calculate LD for adjacent sites\n");
                 fprintf(stderr, "         -1 INT    number of group-1 samples [0]\n");
                 fprintf(stderr, "         -l FILE   list of sites to output [all sites]\n");
                 fprintf(stderr, "         -L        calculate LD for adjacent sites\n");
                 fprintf(stderr, "         -1 INT    number of group-1 samples [0]\n");
                 fprintf(stderr, "         -l FILE   list of sites to output [all sites]\n");
-               fprintf(stderr, "         -t FLOAT  scaled mutation rate [%.4lg]\n", vc.theta);
+               fprintf(stderr, "         -t FLOAT  scaled substitution mutation rate [%.4lg]\n", vc.theta);
+               fprintf(stderr, "         -i FLOAT  indel-to-substitution ratio [%.4lg]\n", vc.indel_frac);
                 fprintf(stderr, "         -p FLOAT  variant if P(ref|D)<FLOAT [%.3lg]\n", vc.pref);
                 fprintf(stderr, "         -P STR    type of prior: full, cond2, flat [full]\n");
                 fprintf(stderr, "\n");
                 fprintf(stderr, "         -p FLOAT  variant if P(ref|D)<FLOAT [%.3lg]\n", vc.pref);
                 fprintf(stderr, "         -P STR    type of prior: full, cond2, flat [full]\n");
                 fprintf(stderr, "\n");
@@ -287,6 +290,7 @@ int bcfview(int argc, char *argv[])
                         bcf_p1_set_n1(p1, vc.n1);
                         bcf_p1_init_subprior(p1, vc.prior_type, vc.theta);
                 }
                         bcf_p1_set_n1(p1, vc.n1);
                         bcf_p1_init_subprior(p1, vc.prior_type, vc.theta);
                 }
+               if (vc.indel_frac > 0.) bcf_p1_indel_prior(p1, vc.indel_frac);
         }
         if (vc.fn_list) hash = bcf_load_pos(vc.fn_list, h);
         if (optind + 1 < argc && !(vc.flag&VC_VCFIN)) {
         }
         if (vc.fn_list) hash = bcf_load_pos(vc.fn_list, h);
         if (optind + 1 < argc && !(vc.flag&VC_VCFIN)) {
diff --git a/samtools.1 b/samtools.1

index 7de9857245ca6debe24abaa4d6786f942886035e..17db53758b2e97c95b9ee450f14784a0ed0a7d74 100644 (file)
--- a/samtools.1
+++ b/samtools.1
@@ -1,4 +1,4 @@
-.TH samtools 1 "27 October 2010" "samtools-0.1.9" "Bioinformatics tools"
+.TH samtools 1 "15 November 2010" "samtools-0.1.10" "Bioinformatics tools"
  .SH NAME
  .PP
  samtools - Utilities for the Sequence Alignment/Map (SAM) format
  .SH NAME
  .PP
  samtools - Utilities for the Sequence Alignment/Map (SAM) format
@@ -20,7 +20,7 @@ samtools faidx ref.fasta
  .PP
  samtools pileup -vcf ref.fasta aln.sorted.bam
  .PP
  .PP
  samtools pileup -vcf ref.fasta aln.sorted.bam
  .PP
-samtools mpileup -C50 -agf ref.fasta -r chr3:1,000-2,000 in1.bam in2.bam
+samtools mpileup -C50 -gf ref.fasta -r chr3:1,000-2,000 in1.bam in2.bam
  .PP
  samtools tview aln.sorted.bam ref.fasta
  
  .PP
  samtools tview aln.sorted.bam ref.fasta
  
@@ -65,11 +65,11 @@ format: `chr2' (the whole chr2), `chr2:1000000' (region starting from
  .B -b
  Output in the BAM format.
  .TP
  .B -b
  Output in the BAM format.
  .TP
-.BI -f " INT"
+.BI -f \ INT
  Only output alignments with all bits in INT present in the FLAG
  field. INT can be in hex in the format of /^0x[0-9A-F]+/ [0]
  .TP
  Only output alignments with all bits in INT present in the FLAG
  field. INT can be in hex in the format of /^0x[0-9A-F]+/ [0]
  .TP
-.BI -F " INT"
+.BI -F \ INT
  Skip alignments with bits present in INT [0]
  .TP
  .B -h
  Skip alignments with bits present in INT [0]
  .TP
  .B -h
@@ -78,19 +78,19 @@ Include the header in the output.
  .B -H
  Output the header only.
  .TP
  .B -H
  Output the header only.
  .TP
-.BI -l " STR"
+.BI -l \ STR
  Only output reads in library STR [null]
  .TP
  Only output reads in library STR [null]
  .TP
-.BI -o " FILE"
+.BI -o \ FILE
  Output file [stdout]
  .TP
  Output file [stdout]
  .TP
-.BI -q " INT"
+.BI -q \ INT
  Skip alignments with MAPQ smaller than INT [0]
  .TP
  Skip alignments with MAPQ smaller than INT [0]
  .TP
-.BI -r " STR"
+.BI -r \ STR
  Only output reads in read group STR [null]
  .TP
  Only output reads in read group STR [null]
  .TP
-.BI -R " FILE"
+.BI -R \ FILE
  Output reads in read groups listed in
  .I FILE
  [null]
  Output reads in read groups listed in
  .I FILE
  [null]
@@ -109,7 +109,7 @@ and
  .B `-q'
  , are taken into account.
  .TP
  .B `-q'
  , are taken into account.
  .TP
-.BI -t " FILE"
+.BI -t \ FILE
  This file is TAB-delimited. Each line must contain the reference name
  and the length of the reference, one line for each distinct reference;
  additional fields are ignored. This file also defines the order of the
  This file is TAB-delimited. Each line must contain the reference name
  and the length of the reference, one line for each distinct reference;
  additional fields are ignored. This file also defines the order of the
@@ -135,135 +135,6 @@ press `?' for help and press `g' to check the alignment start from a
  region in the format like `chr10:10,000,000' or `=10,000,000' when
  viewing the same reference sequence.
  
  region in the format like `chr10:10,000,000' or `=10,000,000' when
  viewing the same reference sequence.
  
-.TP
-.B pileup
-samtools pileup [-2sSBicv] [-f in.ref.fasta] [-t in.ref_list] [-l
-in.site_list] [-C capMapQ] [-M maxMapQ] [-T theta] [-N nHap] [-r
-pairDiffRate] [-m mask] [-d maxIndelDepth] [-G indelPrior]
-<in.bam>|<in.sam>
-
-Print the alignment in the pileup format. In the pileup format, each
-line represents a genomic position, consisting of chromosome name,
-coordinate, reference base, read bases, read qualities and alignment
-mapping qualities. Information on match, mismatch, indel, strand,
-mapping quality and start and end of a read are all encoded at the read
-base column. At this column, a dot stands for a match to the reference
-base on the forward strand, a comma for a match on the reverse strand,
-a '>' or '<' for a reference skip, `ACGTN' for a mismatch on the forward
-strand and `acgtn' for a mismatch on the reverse strand. A pattern
-`\\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion between this
-reference position and the next reference position. The length of the
-insertion is given by the integer in the pattern, followed by the
-inserted sequence. Similarly, a pattern `-[0-9]+[ACGTNacgtn]+'
-represents a deletion from the reference. The deleted bases will be
-presented as `*' in the following lines. Also at the read base column, a
-symbol `^' marks the start of a read. The ASCII of the character
-following `^' minus 33 gives the mapping quality. A symbol `$' marks the
-end of a read segment.
-
-If option
-.B -c
-is applied, the consensus base, Phred-scaled consensus quality, SNP
-quality (i.e. the Phred-scaled probability of the consensus being
-identical to the reference) and root mean square (RMS) mapping quality
-of the reads covering the site will be inserted between the `reference
-base' and the `read bases' columns. An indel occupies an additional
-line. Each indel line consists of chromosome name, coordinate, a star,
-the genotype, consensus quality, SNP quality, RMS mapping quality, #
-covering reads, the first alllele, the second allele, # reads supporting
-the first allele, # reads supporting the second allele and # reads
-containing indels different from the top two alleles.
-
-The position of indels is offset by -1.
-
-.B OPTIONS:
-.RS
-.TP 10
-.B -B
-Disable the BAQ computation. See the
-.B mpileup
-command for details.
-.TP
-.B -c
-Call the consensus sequence using SOAPsnp consensus model. Options
-.BR -T ", " -N ", " -I " and " -r
-are only effective when
-.BR -c " or " -g
-is in use.
-.TP
-.BI -C " INT"
-Coefficient for downgrading the mapping quality of poorly mapped
-reads. See the
-.B mpileup
-command for details. [0]
-.TP
-.BI -d " INT"
-Use the first
-.I NUM
-reads in the pileup for indel calling for speed up. Zero for unlimited. [1024]
-.TP
-.BI -f " FILE"
-The reference sequence in the FASTA format. Index file
-.I FILE.fai
-will be created if
-absent.
-.TP
-.B -g
-Generate genotype likelihood in the binary GLFv3 format. This option
-suppresses -c, -i and -s. This option is deprecated by the
-.B mpileup
-command.
-.TP
-.B -i
-Only output pileup lines containing indels.
-.TP
-.BI -I " INT"
-Phred probability of an indel in sequencing/prep. [40]
-.TP
-.BI -l " FILE"
-List of sites at which pileup is output. This file is space
-delimited. The first two columns are required to be chromosome and
-1-based coordinate. Additional columns are ignored. It is
-recommended to use option
-.TP
-.BI -m " INT"
-Filter reads with flag containing bits in
-.I INT
-[1796]
-.TP
-.BI -M " INT"
-Cap mapping quality at INT [60]
-.TP
-.BI -N " INT"
-Number of haplotypes in the sample (>=2) [2]
-.TP
-.BI -r " FLOAT"
-Expected fraction of differences between a pair of haplotypes [0.001]
-.TP
-.B -s
-Print the mapping quality as the last column. This option makes the
-output easier to parse, although this format is not space efficient.
-.TP
-.B -S
-The input file is in SAM.
-.TP
-.BI -t " FILE"
-List of reference names ane sequence lengths, in the format described
-for the
-.B import
-command. If this option is present, samtools assumes the input
-.I <in.alignment>
-is in SAM format; otherwise it assumes in BAM format.
-.B -s
-together with
-.B -l
-as in the default format we may not know the mapping quality.
-.TP
-.BI -T " FLOAT"
-The theta parameter (error dependency coefficient) in the maq consensus
-calling model [0.85]
-.RE
-
  .TP
  .B mpileup
  samtools mpileup [-Bug] [-C capQcoef] [-r reg] [-f in.fa] [-l list] [-M capMapQ] [-Q minBaseQ] [-q minMapQ] in.bam [in2.bam [...]]
  .TP
  .B mpileup
  samtools mpileup [-Bug] [-C capQcoef] [-r reg] [-f in.fa] [-l list] [-M capMapQ] [-Q minBaseQ] [-q minMapQ] in.bam [in2.bam [...]]
@@ -281,37 +152,64 @@ quality (BAQ). BAQ is the Phred-scaled probability of a read base being
  misaligned. Applying this option greatly helps to reduce false SNPs
  caused by misalignments.
  .TP
  misaligned. Applying this option greatly helps to reduce false SNPs
  caused by misalignments.
  .TP
-.BI -C " INT"
+.BI -C \ INT
  Coefficient for downgrading mapping quality for reads containing
  excessive mismatches. Given a read with a phred-scaled probability q of
  being generated from the mapped position, the new mapping quality is
  about sqrt((INT-q)/INT)*INT. A zero value disables this
  Coefficient for downgrading mapping quality for reads containing
  excessive mismatches. Given a read with a phred-scaled probability q of
  being generated from the mapped position, the new mapping quality is
  about sqrt((INT-q)/INT)*INT. A zero value disables this
-functionality; if enabled, the recommended value is 50. [0]
+functionality; if enabled, the recommended value for BWA is 50. [0]
+.TP
+.BI -e \ INT
+Phred-scaled gap extension sequencing error probability. Reducing
+.I INT
+leads to longer indels. [20]
  .TP
  .TP
-.BI -f " FILE"
+.BI -f \ FILE
  The reference file [null]
  .TP
  .B -g
  Compute genotype likelihoods and output them in the binary call format (BCF).
  .TP
  The reference file [null]
  .TP
  .B -g
  Compute genotype likelihoods and output them in the binary call format (BCF).
  .TP
-.B -u
-Similar to
-.B -g
-except that the output is uncompressed BCF, which is preferred for pipeing.
-.TP
-.BI -l " FILE"
+.BI -h \ INT
+Coefficient for modeling homopolymer errors. Given an
+.IR l -long
+homopolymer
+run, the sequencing error of an indel of size
+.I s
+is modeled as
+.IR INT * s / l .
+[100]
+.TP
+.BI -l \ FILE
  File containing a list of sites where pileup or BCF is outputted [null]
  .TP
  File containing a list of sites where pileup or BCF is outputted [null]
  .TP
-.BI -q " INT"
+.BI -o \ INT
+Phred-scaled gap open sequencing error probability. Reducing
+.I INT
+leads to more indel calls. [40]
+.TP
+.BI -P \ STR
+Comma dilimited list of platforms (determined by
+.BR @RG-PL )
+from which indel candidates are obtained. It is recommended to collect
+indel candidates from sequencing technologies that have low indel error
+rate such as ILLUMINA. [all]
+.TP
+.BI -q \ INT
  Minimum mapping quality for an alignment to be used [0]
  .TP
  Minimum mapping quality for an alignment to be used [0]
  .TP
-.BI -Q " INT"
+.BI -Q \ INT
  Minimum base quality for a base to be considered [13]
  .TP
  Minimum base quality for a base to be considered [13]
  .TP
-.BI -r " STR"
+.BI -r \ STR
  Only generate pileup in region
  .I STR
  [all sites]
  Only generate pileup in region
  .I STR
  [all sites]
+.TP
+.B -u
+Similar to
+.B -g
+except that the output is uncompressed BCF, which is preferred for piping.
  .RE
  
  .TP
  .RE
  
  .TP
@@ -345,7 +243,7 @@ Output the final alignment to the standard output.
  .B -n
  Sort by read names rather than by chromosomal coordinates
  .TP
  .B -n
  Sort by read names rather than by chromosomal coordinates
  .TP
-.B -m INT
+.BI -m \ INT
  Approximately the maximum required memory. [500000000]
  .RE
  
  Approximately the maximum required memory. [500000000]
  .RE
  
@@ -368,7 +266,7 @@ and the headers of other files will be ignored.
  .B OPTIONS:
  .RS
  .TP 8
  .B OPTIONS:
  .RS
  .TP 8
-.BI -h " FILE"
+.BI -h \ FILE
  Use the lines of
  .I FILE
  as `@' headers to be copied to
  Use the lines of
  .I FILE
  as `@' headers to be copied to
@@ -379,7 +277,7 @@ replacing any header lines that would otherwise be copied from
  is actually in SAM format, though any alignment records it may contain
  are ignored.)
  .TP
  is actually in SAM format, though any alignment records it may contain
  are ignored.)
  .TP
-.BI -R " STR"
+.BI -R \ STR
  Merge files in the specified region indicated by
  .I STR
  .TP
  Merge files in the specified region indicated by
  .I STR
  .TP
@@ -479,7 +377,7 @@ Output compressed BAM
  .B -S
  The input is SAM with header lines
  .TP
  .B -S
  The input is SAM with header lines
  .TP
-.BI -C " INT"
+.BI -C \ INT
  Coefficient to cap mapping quality of poorly mapped reads. See the
  .B pileup
  command for details. [0]
  Coefficient to cap mapping quality of poorly mapped reads. See the
  .B pileup
  command for details. [0]
@@ -489,6 +387,136 @@ Perform probabilistic realignment to compute BAQ, which will be used to
  cap base quality.
  .RE
  
  cap base quality.
  .RE
  
+.TP
+.B pileup
+samtools pileup [-2sSBicv] [-f in.ref.fasta] [-t in.ref_list] [-l
+in.site_list] [-C capMapQ] [-M maxMapQ] [-T theta] [-N nHap] [-r
+pairDiffRate] [-m mask] [-d maxIndelDepth] [-G indelPrior]
+<in.bam>|<in.sam>
+
+Print the alignment in the pileup format. In the pileup format, each
+line represents a genomic position, consisting of chromosome name,
+coordinate, reference base, read bases, read qualities and alignment
+mapping qualities. Information on match, mismatch, indel, strand,
+mapping quality and start and end of a read are all encoded at the read
+base column. At this column, a dot stands for a match to the reference
+base on the forward strand, a comma for a match on the reverse strand,
+a '>' or '<' for a reference skip, `ACGTN' for a mismatch on the forward
+strand and `acgtn' for a mismatch on the reverse strand. A pattern
+`\\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion between this
+reference position and the next reference position. The length of the
+insertion is given by the integer in the pattern, followed by the
+inserted sequence. Similarly, a pattern `-[0-9]+[ACGTNacgtn]+'
+represents a deletion from the reference. The deleted bases will be
+presented as `*' in the following lines. Also at the read base column, a
+symbol `^' marks the start of a read. The ASCII of the character
+following `^' minus 33 gives the mapping quality. A symbol `$' marks the
+end of a read segment.
+
+If option
+.B -c
+is applied, the consensus base, Phred-scaled consensus quality, SNP
+quality (i.e. the Phred-scaled probability of the consensus being
+identical to the reference) and root mean square (RMS) mapping quality
+of the reads covering the site will be inserted between the `reference
+base' and the `read bases' columns. An indel occupies an additional
+line. Each indel line consists of chromosome name, coordinate, a star,
+the genotype, consensus quality, SNP quality, RMS mapping quality, #
+covering reads, the first alllele, the second allele, # reads supporting
+the first allele, # reads supporting the second allele and # reads
+containing indels different from the top two alleles.
+
+.B NOTE:
+Since 0.1.10, the `pileup' command is deprecated by `mpileup'.
+
+.B OPTIONS:
+.RS
+.TP 10
+.B -B
+Disable the BAQ computation. See the
+.B mpileup
+command for details.
+.TP
+.B -c
+Call the consensus sequence. Options
+.BR -T ", " -N ", " -I " and " -r
+are only effective when
+.BR -c " or " -g
+is in use.
+.TP
+.BI -C \ INT
+Coefficient for downgrading the mapping quality of poorly mapped
+reads. See the
+.B mpileup
+command for details. [0]
+.TP
+.BI -d \ INT
+Use the first
+.I NUM
+reads in the pileup for indel calling for speed up. Zero for unlimited. [1024]
+.TP
+.BI -f \ FILE
+The reference sequence in the FASTA format. Index file
+.I FILE.fai
+will be created if
+absent.
+.TP
+.B -g
+Generate genotype likelihood in the binary GLFv3 format. This option
+suppresses -c, -i and -s. This option is deprecated by the
+.B mpileup
+command.
+.TP
+.B -i
+Only output pileup lines containing indels.
+.TP
+.BI -I \ INT
+Phred probability of an indel in sequencing/prep. [40]
+.TP
+.BI -l \ FILE
+List of sites at which pileup is output. This file is space
+delimited. The first two columns are required to be chromosome and
+1-based coordinate. Additional columns are ignored. It is
+recommended to use option
+.TP
+.BI -m \ INT
+Filter reads with flag containing bits in
+.I INT
+[1796]
+.TP
+.BI -M \ INT
+Cap mapping quality at INT [60]
+.TP
+.BI -N \ INT
+Number of haplotypes in the sample (>=2) [2]
+.TP
+.BI -r \ FLOAT
+Expected fraction of differences between a pair of haplotypes [0.001]
+.TP
+.B -s
+Print the mapping quality as the last column. This option makes the
+output easier to parse, although this format is not space efficient.
+.TP
+.B -S
+The input file is in SAM.
+.TP
+.BI -t \ FILE
+List of reference names ane sequence lengths, in the format described
+for the
+.B import
+command. If this option is present, samtools assumes the input
+.I <in.alignment>
+is in SAM format; otherwise it assumes in BAM format.
+.B -s
+together with
+.B -l
+as in the default format we may not know the mapping quality.
+.TP
+.BI -T \ FLOAT
+The theta parameter (error dependency coefficient) in the maq consensus
+calling model [0.85]
+.RE
+
  .SH SAM FORMAT
  
  SAM is TAB-delimited. Apart from the header lines, which are started
  .SH SAM FORMAT
  
  SAM is TAB-delimited. Apart from the header lines, which are started
@@ -582,9 +610,8 @@ will be attached
  .IP o 2
  Call SNPs and short indels for one diploid individual:
  
  .IP o 2
  Call SNPs and short indels for one diploid individual:
  
-  samtools pileup -vcf ref.fa aln.bam > var.raw.plp
-  samtools.pl varFilter -D 100 var.raw.plp > var.flt.plp
-  awk '($3=="*"&&$6>=50)||($3!="*"&&$6>=20)' var.flt.plp > var.final.plp
+  samtools mpileup -ugf ref.fa aln.bam | bcftools view -bvcg - > var.raw.bcf
+  bcftools view var.raw.bcf | vcfutils.pl varFilter -D 100 > var.flt.vcf
  
  The
  .B -D
  
  The
  .B -D
@@ -592,38 +619,37 @@ option of varFilter controls the maximum read depth, which should be
  adjusted to about twice the average read depth.  One may consider to add
  .B -C50
  to
  adjusted to about twice the average read depth.  One may consider to add
  .B -C50
  to
-.B pileup
+.B mpileup
  if mapping quality is overestimated for reads containing excessive
  mismatches. Applying this option usually helps
  .B BWA-short
  if mapping quality is overestimated for reads containing excessive
  mismatches. Applying this option usually helps
  .B BWA-short
-but may not other mappers. It also potentially increases reference
-biases.
+but may not other mappers.
  
  .IP o 2
  
  .IP o 2
-Call SNPs (not short indels) for multiple diploid individuals:
+Call SNPs and short indels for multiple diploid individuals:
  
  
-  samtools mpileup -augf ref.fa *.bam | bcftools view -bcv - > snp.raw.bcf
-  bcftools view snp.raw.bcf | vcfutils.pl filter4vcf -D 2000 | bgzip > snp.flt.vcf.gz
+  samtools mpileup -P ILLUMINA -ugf ref.fa *.bam | bcftools view -bcvg - > var.raw.bcf
+  bcftools view var.raw.bcf | vcfutils.pl varFilter -D 2000 > var.flt.vcf
  
  Individuals are identified from the
  .B SM
  tags in the
  .B @RG
  header lines. Individuals can be pooled in one alignment file; one
  
  Individuals are identified from the
  .B SM
  tags in the
  .B @RG
  header lines. Individuals can be pooled in one alignment file; one
-individual can also be separated into multiple files. In addition, one
-may consider to apply
-.B -C50
-to
-.BR mpileup .
-
-SNP calling with mpileup also works for single sample and has the
-advantage of enabling more powerful filtering. The drawback is the lack
-of short indel calling, which may be implemented in future.
+individual can also be separated into multiple files. The
+.B -P
+option specifies that indel candidates should be collected only from
+read groups with the
+.B @RG-PL
+tag set to
+.IR ILLUMINA .
+Collecting indel candidates from reads sequenced by an indel-prone
+technology may affect the performance of indel calling.
  
  .IP o 2
  Derive the allele frequency spectrum (AFS) on a list of sites from multiple individuals:
  
  
  .IP o 2
  Derive the allele frequency spectrum (AFS) on a list of sites from multiple individuals:
  
-  samtools mpileup -gf ref.fa *.bam > all.bcf
+  samtools mpileup -Igf ref.fa *.bam > all.bcf
    bcftools view -bl sites.list all.bcf > sites.bcf
    bcftools view -cGP cond2 sites.bcf > /dev/null 2> sites.1.afs
    bcftools view -cGP sites.1.afs sites.bcf > /dev/null 2> sites.2.afs
    bcftools view -bl sites.list all.bcf > sites.bcf
    bcftools view -cGP cond2 sites.bcf > /dev/null 2> sites.1.afs
    bcftools view -cGP sites.1.afs sites.bcf > /dev/null 2> sites.2.afs
author	Heng Li <lh3@live.co.uk>
	Tue, 16 Nov 2010 04:31:53 +0000 (04:31 +0000)
committer	Heng Li <lh3@live.co.uk>
	Tue, 16 Nov 2010 04:31:53 +0000 (04:31 +0000)
bam_plcmd.c		patch \| blob \| history
bamtk.c		patch \| blob \| history
bcftools/call1.c		patch \| blob \| history
samtools.1		patch \| blob \| history