From 4bb71ff87774996c84e2a7ad543e40e0da8d0483 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 8 Nov 2010 19:07:33 +0000 Subject: [PATCH] Incorporate patches by Marcel Martin for read counting. --- ChangeLog | 159 +++++++++++++++++++++++++++++++++++++++++++++++++++++ sam_view.c | 45 ++++++++++++--- samtools.1 | 11 +++- 3 files changed, 206 insertions(+), 9 deletions(-) diff --git a/ChangeLog b/ChangeLog index 12908c9..d43635f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,162 @@ +------------------------------------------------------------------------ +r797 | lh3lh3 | 2010-11-08 13:39:52 -0500 (Mon, 08 Nov 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bam2bcf.h + M /trunk/samtools/bam2bcf_indel.c + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.9-2 (r797) + * mpileup: indel calling seems to be working + +------------------------------------------------------------------------ +r796 | lh3lh3 | 2010-11-08 10:54:46 -0500 (Mon, 08 Nov 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bam2bcf.h + M /trunk/samtools/bam2bcf_indel.c + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/kaln.c + +indel calling is apparently working, but more information needs to be collected + +------------------------------------------------------------------------ +r795 | lh3lh3 | 2010-11-08 00:39:18 -0500 (Mon, 08 Nov 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bam2bcf_indel.c + +fixed a few bugs in the indel caller. Probably there are more. + +------------------------------------------------------------------------ +r794 | lh3lh3 | 2010-11-07 22:23:16 -0500 (Sun, 07 Nov 2010) | 2 lines +Changed paths: + M /trunk/samtools/Makefile + M /trunk/samtools/bam.h + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bam2bcf.h + A /trunk/samtools/bam2bcf_indel.c + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/kaln.c + M /trunk/samtools/kaln.h + +prepare for the indel caller. It is not ready yet. + +------------------------------------------------------------------------ +r793 | lh3lh3 | 2010-11-05 11:28:23 -0400 (Fri, 05 Nov 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bam2bcf.h + M /trunk/samtools/bam_plcmd.c + +Revert to r790. The recent changes are not good... + +------------------------------------------------------------------------ +r792 | lh3lh3 | 2010-11-05 00:19:14 -0400 (Fri, 05 Nov 2010) | 6 lines +Changed paths: + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bam2bcf.h + M /trunk/samtools/bam_plcmd.c + + * this revision is UNSTABLE + + * indel caller seems working, but it is very insensitive and has + several things I do not quite understand. + + +------------------------------------------------------------------------ +r791 | lh3lh3 | 2010-11-04 22:58:43 -0400 (Thu, 04 Nov 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bam2bcf.h + M /trunk/samtools/bam_plcmd.c + +for backup. no effective changes + +------------------------------------------------------------------------ +r790 | lh3lh3 | 2010-11-03 15:51:24 -0400 (Wed, 03 Nov 2010) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/vcfutils.pl + M /trunk/samtools/kprobaln.c + +fixed a minor problem in the example coming with kprobaln.c + +------------------------------------------------------------------------ +r789 | lh3lh3 | 2010-11-02 15:41:27 -0400 (Tue, 02 Nov 2010) | 4 lines +Changed paths: + M /trunk/samtools/Makefile + M /trunk/samtools/bam_md.c + M /trunk/samtools/kaln.c + M /trunk/samtools/kaln.h + A /trunk/samtools/kprobaln.c + A /trunk/samtools/kprobaln.h + +Separate kaln and kprobaln as I am preparing further changes. At +present, the results should be identical to the previous. + + +------------------------------------------------------------------------ +r788 | petulda | 2010-11-02 12:19:04 -0400 (Tue, 02 Nov 2010) | 1 line +Changed paths: + M /trunk/samtools/bam_plcmd.c + +Added -b option: read file names from a file +------------------------------------------------------------------------ +r787 | lh3lh3 | 2010-10-29 23:17:22 -0400 (Fri, 29 Oct 2010) | 7 lines +Changed paths: + M /trunk/samtools/bam.h + M /trunk/samtools/bam_pileup.c + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.9-2 (r787) + + * Allow to set a maximum per-sample depth to reduce memory. However, + BAQ computation is still applied to every read. The speed is not + improved. + + +------------------------------------------------------------------------ +r786 | lh3lh3 | 2010-10-29 12:10:40 -0400 (Fri, 29 Oct 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bam2bcf.h + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/bcftools/bcf.c + M /trunk/samtools/bcftools/vcf.c + + * samtools-0.1.9-1 (r786) + * samtools: optionally perform exact test for each sample + +------------------------------------------------------------------------ +r785 | lh3lh3 | 2010-10-29 09:42:25 -0400 (Fri, 29 Oct 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bam2bcf.h + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bcftools/bcf.c + +Optionally output "DP", the individual read depth + +------------------------------------------------------------------------ +r784 | lh3lh3 | 2010-10-27 23:10:27 -0400 (Wed, 27 Oct 2010) | 2 lines +Changed paths: + M /trunk/samtools/samtools.1 + +acknowledge Petr and John who have greatly contributed to the project. + +------------------------------------------------------------------------ +r783 | lh3lh3 | 2010-10-27 22:47:47 -0400 (Wed, 27 Oct 2010) | 2 lines +Changed paths: + M /trunk/samtools/ChangeLog + M /trunk/samtools/NEWS + M /trunk/samtools/bamtk.c + M /trunk/samtools/samtools.1 + +Release samtools-0.1.9 (r783) + ------------------------------------------------------------------------ r782 | lh3lh3 | 2010-10-27 19:58:54 -0400 (Wed, 27 Oct 2010) | 2 lines Changed paths: diff --git a/sam_view.c b/sam_view.c index d0fdad2..eb69449 100644 --- a/sam_view.c +++ b/sam_view.c @@ -9,6 +9,13 @@ #include "khash.h" KHASH_SET_INIT_STR(rg) +// When counting records instead of printing them, +// data passed to the bam_fetch callback is encapsulated in this struct. +typedef struct { + bam_header_t *header; + int *count; +} count_func_data_t; + typedef khash_t(rg) *rghash_t; rghash_t g_rghash = 0; @@ -54,7 +61,7 @@ static inline int __g_skip_aln(const bam_header_t *h, const bam1_t *b) return 0; } -// callback function for bam_fetch() +// callback function for bam_fetch() that prints nonskipped records static int view_func(const bam1_t *b, void *data) { if (!__g_skip_aln(((samfile_t*)data)->header, b)) @@ -62,19 +69,30 @@ static int view_func(const bam1_t *b, void *data) return 0; } +// callback function for bam_fetch() that counts nonskipped records +static int count_func(const bam1_t *b, void *data) +{ + if (!__g_skip_aln(((count_func_data_t*)data)->header, b)) { + (*((count_func_data_t*)data)->count)++; + } + return 0; +} + static int usage(int is_long_help); int main_samview(int argc, char *argv[]) { - int c, is_header = 0, is_header_only = 0, is_bamin = 1, ret = 0, is_uncompressed = 0, is_bamout = 0, slx2sngr = 0; + int c, is_header = 0, is_header_only = 0, is_bamin = 1, ret = 0, is_uncompressed = 0, is_bamout = 0, slx2sngr = 0, is_count = 0; int of_type = BAM_OFDEC, is_long_help = 0; + int count = 0; samfile_t *in = 0, *out = 0; char in_mode[5], out_mode[5], *fn_out = 0, *fn_list = 0, *fn_ref = 0, *fn_rg = 0; /* parse command-line options */ strcpy(in_mode, "r"); strcpy(out_mode, "w"); - while ((c = getopt(argc, argv, "Sbt:hHo:q:f:F:ul:r:xX?T:CR:")) >= 0) { + while ((c = getopt(argc, argv, "Sbct:hHo:q:f:F:ul:r:xX?T:CR:")) >= 0) { switch (c) { + case 'c': is_count = 1; break; case 'C': slx2sngr = 1; break; case 'S': is_bamin = 0; break; case 'b': is_bamout = 1; break; @@ -133,7 +151,7 @@ int main_samview(int argc, char *argv[]) ret = 1; goto view_end; } - if ((out = samopen(fn_out? fn_out : "-", out_mode, in->header)) == 0) { + if (!is_count && (out = samopen(fn_out? fn_out : "-", out_mode, in->header)) == 0) { fprintf(stderr, "[main_samview] fail to open \"%s\" for writing.\n", fn_out? fn_out : "standard output"); ret = 1; goto view_end; @@ -146,7 +164,8 @@ int main_samview(int argc, char *argv[]) while ((r = samread(in, b)) >= 0) { // read one alignment from `in' if (!__g_skip_aln(in->header, b)) { if (slx2sngr) sol2sanger(b); - samwrite(out, b); // write the alignment to `out' + if (!is_count) samwrite(out, b); // write the alignment to `out' + count++; } } if (r < -1) { @@ -164,14 +183,19 @@ int main_samview(int argc, char *argv[]) goto view_end; } for (i = optind + 1; i < argc; ++i) { - int tid, beg, end; + int tid, beg, end, result; bam_parse_region(in->header, argv[i], &tid, &beg, &end); // parse a region in the format like `chr2:100-200' if (tid < 0) { // reference name is not found fprintf(stderr, "[main_samview] region \"%s\" specifies an unknown reference name. Continue anyway.\n", argv[i]); continue; } // fetch alignments - if (bam_fetch(in->x.bam, idx, tid, beg, end, out, view_func) < 0) { + if (is_count) { + count_func_data_t count_data = { in->header, &count }; + result = bam_fetch(in->x.bam, idx, tid, beg, end, &count_data, count_func); + } else + result = bam_fetch(in->x.bam, idx, tid, beg, end, out, view_func); + if (result < 0) { fprintf(stderr, "[main_samview] retrieval of region \"%s\" failed due to truncated file or corrupt BAM index file\n", argv[i]); ret = 1; break; @@ -181,6 +205,9 @@ int main_samview(int argc, char *argv[]) } view_end: + if (is_count && ret == 0) { + printf("%d\n", count); + } // close files, free and return free(fn_list); free(fn_ref); free(fn_out); free(g_library); free(g_rg); free(fn_rg); if (g_rghash) { @@ -190,7 +217,8 @@ view_end: kh_destroy(rg, g_rghash); } samclose(in); - samclose(out); + if (!is_count) + samclose(out); return ret; } @@ -205,6 +233,7 @@ static int usage(int is_long_help) fprintf(stderr, " -u uncompressed BAM output (force -b)\n"); fprintf(stderr, " -x output FLAG in HEX (samtools-C specific)\n"); fprintf(stderr, " -X output FLAG in string (samtools-C specific)\n"); + fprintf(stderr, " -c print only the count of matching records\n"); fprintf(stderr, " -t FILE list of reference names and lengths (force -S) [null]\n"); fprintf(stderr, " -T FILE reference sequence file (force -S) [null]\n"); fprintf(stderr, " -o FILE output file name [stdout]\n"); diff --git a/samtools.1 b/samtools.1 index 09cb147..7de9857 100644 --- a/samtools.1 +++ b/samtools.1 @@ -47,7 +47,7 @@ entire alignment file unless it is asked to do so. .TP 10 .B view -samtools view [-bhuHS] [-t in.refList] [-o output] [-f reqFlag] [-F +samtools view [-bchuHS] [-t in.refList] [-o output] [-f reqFlag] [-F skipFlag] [-q minMapQ] [-l library] [-r readGroup] [-R rgFile] | [region1 [...]] Extract/print all or sub alignments in SAM or BAM format. If no region @@ -100,6 +100,15 @@ Input is in SAM. If @SQ header lines are absent, the .B `-t' option is required. .TP +.B -c +Instead of printing the alignments, only count them and print the +total number. All filter options, such as +.B `-f', +.B `-F' +and +.B `-q' +, are taken into account. +.TP .BI -t " FILE" This file is TAB-delimited. Each line must contain the reference name and the length of the reference, one line for each distinct reference; -- 2.39.2