From 521aa49c29217a0398ff5e186072df97cf9d80c4 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 2 Sep 2009 09:44:48 +0000 Subject: [PATCH] * samtools-0.1.5-34 (r451) * applied the patch by John * improved the help message a little bit --- bam_md.c | 1 - bam_rmdup.c | 3 ++- bam_rmdupse.c | 3 ++- bam_sort.c | 66 ++++++++++++++++++++++++++++++++++++++++++--------- bamtk.c | 12 +++++----- samtools.1 | 49 ++++++++++++++++++++++++++++---------- 6 files changed, 102 insertions(+), 32 deletions(-) diff --git a/bam_md.c b/bam_md.c index e2fdff8..8d07487 100644 --- a/bam_md.c +++ b/bam_md.c @@ -59,7 +59,6 @@ void bam_fillmd1(bam1_t *b, char *ref, int is_equal) if (old_nm) old_nm_i = bam_aux2i(old_nm); if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); else if (nm != old_nm_i) { - uint8_t *old_data = b->data; fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam1_qname(b), old_nm_i, nm); bam_aux_del(b, old_nm); bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); diff --git a/bam_rmdup.c b/bam_rmdup.c index 1fa6cad..5da9460 100644 --- a/bam_rmdup.c +++ b/bam_rmdup.c @@ -128,7 +128,8 @@ int bam_rmdup(int argc, char *argv[]) { bamFile in, out; if (argc < 3) { - fprintf(stderr, "Usage: samtools rmdup \n"); + fprintf(stderr, "Usage: samtools rmdup \n\n"); + fprintf(stderr, "Note: Picard is recommended for this task.\n"); return 1; } in = (strcmp(argv[1], "-") == 0)? bam_dopen(fileno(stdin), "r") : bam_open(argv[1], "r"); diff --git a/bam_rmdupse.c b/bam_rmdupse.c index df03717..cf1b7bd 100644 --- a/bam_rmdupse.c +++ b/bam_rmdupse.c @@ -161,7 +161,8 @@ int bam_rmdupse(int argc, char *argv[]) samfile_t *in, *out; buffer_t *buf; if (argc < 3) { - fprintf(stderr, "Usage: samtools rmdupse \n"); + fprintf(stderr, "Usage: samtools rmdupse \n\n"); + fprintf(stderr, "Note: Picard is recommended for this task.\n"); return 1; } buf = calloc(1, sizeof(buffer_t)); diff --git a/bam_sort.c b/bam_sort.c index c43e56f..a2d3d09 100644 --- a/bam_sort.c +++ b/bam_sort.c @@ -49,23 +49,44 @@ static inline int heap_lt(const heap1_t a, const heap1_t b) KSORT_INIT(heap, heap1_t, heap_lt) +static void swap_header_text(bam_header_t *h1, bam_header_t *h2) +{ + int tempi; + char *temps; + tempi = h1->l_text, h1->l_text = h2->l_text, h2->l_text = tempi; + temps = h1->text, h1->text = h2->text, h2->text = temps; +} + /*! @abstract Merge multiple sorted BAM. @param is_by_qname whether to sort by query name @param out output BAM file name + @param headers name of SAM file from which to copy '@' header lines, + or NULL to copy them from the first file to be merged @param n number of files to be merged @param fn names of files to be merged @discussion Padding information may NOT correctly maintained. This function is NOT thread safe. */ -void bam_merge_core(int by_qname, const char *out, int n, char * const *fn) +void bam_merge_core(int by_qname, const char *out, const char *headers, int n, char * const *fn) { bamFile fpout, *fp; heap1_t *heap; bam_header_t *hout = 0; + bam_header_t *hheaders = NULL; int i, j; + if (headers) { + tamFile fpheaders = sam_open(headers); + if (fpheaders == 0) { + fprintf(stderr, "[bam_merge_core] Cannot open file `%s'. Continue anyway.\n", headers); + } else { + hheaders = sam_header_read(fpheaders); + sam_close(fpheaders); + } + } + g_is_by_qname = by_qname; fp = (bamFile*)calloc(n, sizeof(bamFile)); heap = (heap1_t*)calloc(n, sizeof(heap1_t)); @@ -74,8 +95,24 @@ void bam_merge_core(int by_qname, const char *out, int n, char * const *fn) bam_header_t *hin; assert(fp[i] = bam_open(fn[i], "r")); hin = bam_header_read(fp[i]); - if (i == 0) hout = hin; - else { // validate multiple baf + if (i == 0) { // the first SAM + hout = hin; + if (hheaders) { + // If the text headers to be swapped in include any @SQ headers, + // check that they are consistent with the existing binary list + // of reference information. + if (hheaders->n_targets > 0) { + if (hout->n_targets != hheaders->n_targets) + fprintf(stderr, "[bam_merge_core] number of @SQ headers in `%s' differs from number of target sequences", headers); + for (j = 0; j < hout->n_targets; ++j) + if (strcmp(hout->target_name[j], hheaders->target_name[j]) != 0) + fprintf(stderr, "[bam_merge_core] @SQ header '%s' in '%s' differs from target sequence", hheaders->target_name[j], headers); + } + swap_header_text(hout, hheaders); + bam_header_destroy(hheaders); + hheaders = NULL; + } + } else { // validate multiple baf if (hout->n_targets != hin->n_targets) { fprintf(stderr, "[bam_merge_core] file '%s' has different number of target sequences. Abort!\n", fn[i]); exit(1); @@ -86,9 +123,6 @@ void bam_merge_core(int by_qname, const char *out, int n, char * const *fn) hout->target_name[j], hin->target_name[j], fn[i]); exit(1); } - if (hout->target_len[j] != hin->target_len[j]) - fprintf(stderr, "[bam_merge_core] different target sequence length: %d != %d in file '%s'. Continue.\n", - hout->target_len[j], hin->target_len[j], fn[i]); } bam_header_destroy(hin); } @@ -125,16 +159,26 @@ void bam_merge_core(int by_qname, const char *out, int n, char * const *fn) int bam_merge(int argc, char *argv[]) { int c, is_by_qname = 0; - while ((c = getopt(argc, argv, "n")) >= 0) { + char *fn_headers = NULL; + + while ((c = getopt(argc, argv, "h:n")) >= 0) { switch (c) { + case 'h': fn_headers = strdup(optarg); break; case 'n': is_by_qname = 1; break; } } if (optind + 2 >= argc) { - fprintf(stderr, "Usage: samtools merge [-n] [...]\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: samtools merge [-n] [-h inh.sam] [...]\n\n"); + fprintf(stderr, "Options: -n sort by read names\n"); + fprintf(stderr, " -h FILE copy the header in FILE to [in1.bam]\n\n"); + fprintf(stderr, "Note: Samtools' merge does not reconstruct the @RG dictionary in the header. Users\n"); + fprintf(stderr, " must provide the correct header with -h, or uses Picard which properly maintains\n"); + fprintf(stderr, " the header dictionary in merging.\n\n"); return 1; } - bam_merge_core(is_by_qname, argv[optind], argc - optind - 1, argv + optind + 1); + bam_merge_core(is_by_qname, argv[optind], fn_headers, argc - optind - 1, argv + optind + 1); + free(fn_headers); return 0; } @@ -220,10 +264,10 @@ void bam_sort_core(int is_by_qname, const char *fn, const char *prefix, size_t m fns[i] = (char*)calloc(strlen(prefix) + 20, 1); sprintf(fns[i], "%s.%.4d.bam", prefix, i); } - bam_merge_core(is_by_qname, fnout, n, fns); + bam_merge_core(is_by_qname, fnout, 0, n, fns); free(fnout); for (i = 0; i < n; ++i) { -// unlink(fns[i]); + unlink(fns[i]); free(fns[i]); } free(fns); diff --git a/bamtk.c b/bamtk.c index c91200e..3742bb2 100644 --- a/bamtk.c +++ b/bamtk.c @@ -9,7 +9,7 @@ #endif #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.1.5-33 (r450)" +#define PACKAGE_VERSION "0.1.5-34 (r451)" #endif int bam_taf2baf(int argc, char *argv[]); @@ -76,10 +76,9 @@ static int usage() fprintf(stderr, "Program: samtools (Tools for alignments in the SAM format)\n"); fprintf(stderr, "Version: %s\n\n", PACKAGE_VERSION); fprintf(stderr, "Usage: samtools [options]\n\n"); - fprintf(stderr, "Command: import import from SAM (obsolete; use `view')\n"); - fprintf(stderr, " view export to the text format\n"); + fprintf(stderr, "Command: view SAM<->BAM conversion\n"); fprintf(stderr, " sort sort alignment file\n"); - fprintf(stderr, " merge merge multiple sorted alignment files\n"); + fprintf(stderr, " merge merge sorted alignments (Picard recommended)\n"); fprintf(stderr, " pileup generate pileup output\n"); fprintf(stderr, " faidx index/extract FASTA\n"); #if _CURSES_LIB != 0 @@ -87,10 +86,10 @@ static int usage() #endif fprintf(stderr, " index index alignment\n"); fprintf(stderr, " fixmate fix mate information\n"); - fprintf(stderr, " rmdup remove PCR duplicates\n"); + fprintf(stderr, " rmdup remove PCR duplicates (Picard recommended)\n"); fprintf(stderr, " glfview print GLFv3 file\n"); fprintf(stderr, " flagstat simple stats\n"); - fprintf(stderr, " fillmd recalculate MD/NM tags and '=' bases\n"); + fprintf(stderr, " calmd recalculate MD/NM tags and '=' bases\n"); fprintf(stderr, "\n"); return 1; } @@ -118,6 +117,7 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "glfview") == 0) return glf3_view_main(argc-1, argv+1); else if (strcmp(argv[1], "flagstat") == 0) return bam_flagstat(argc-1, argv+1); else if (strcmp(argv[1], "tagview") == 0) return bam_tagview(argc-1, argv+1); + else if (strcmp(argv[1], "calmd") == 0) return bam_fillmd(argc-1, argv+1); else if (strcmp(argv[1], "fillmd") == 0) return bam_fillmd(argc-1, argv+1); #if _CURSES_LIB != 0 else if (strcmp(argv[1], "tview") == 0) return bam_tview_main(argc-1, argv+1); diff --git a/samtools.1 b/samtools.1 index 11ea5b6..7b50ef9 100644 --- a/samtools.1 +++ b/samtools.1 @@ -33,12 +33,11 @@ output (stdout). Several commands can thus be combined with Unix pipes. Samtools always output warning and error messages to the standard error output (stderr). -Samtools is also able to open a BAM (not SAM) file on a remote FTP -server if the BAM file name starts with `ftp://'. Samtools checks the -current working directory for the index file and will download the index -upon absence. Samtools achieves random FTP file access with the `REST' -ftp command. It does not retrieve the entire alignment file unless it is -asked to do so. +Samtools is also able to open a BAM (not SAM) file on a remote FTP or +HTTP server if the BAM file name starts with `ftp://' or `http://'. +Samtools checks the current working directory for the index file and +will download the index upon absence. Samtools does not retrieve the +entire alignment file unless it is asked to do so. .SH COMMANDS AND OPTIONS @@ -73,17 +72,34 @@ Approximately the maximum required memory. [500000000] .TP .B merge -samtools merge [-n] [...] - -Merge multiple sorted alignments. The header of -.I +samtools merge [-h inh.sam] [-n] [...] + +Merge multiple sorted alignments. +The header reference lists of all the input BAM files, and the @SQ headers of +.IR inh.sam , +if any, must all refer to the same set of reference sequences. +The header reference list and (unless overridden by +.BR -h ) +`@' headers of +.I in1.bam will be copied to -.I +.IR out.bam , and the headers of other files will be ignored. .B OPTIONS: .RS .TP 8 +.B -h FILE +Use the lines of +.I FILE +as `@' headers to be copied to +.IR out.bam , +replacing any header lines that would otherwise be copied from +.IR in1.bam . +.RI ( FILE +is actually in SAM format, though any alignment records it may contain +are ignored.) +.TP .B -n The input alignments are sorted by read names rather than by chromosomal coordinates @@ -406,6 +422,15 @@ _ Unaligned words used in bam_import.c, bam_endian.h, bam.c and bam_aux.c. .IP o 2 CIGAR operation P is not properly handled at the moment. +.IP o 2 +In merging, the input files are required to have the same number of +reference sequences. The requirement can be relaxed. In addition, +merging does not reconstruct the header dictionaries +automatically. Endusers have to provide the correct header. Picard is +better at merging. +.IP o 2 +Samtools' rmdup does not work for single-end data and does not remove +duplicates across chromosomes. Picard is better. .SH AUTHOR .PP @@ -417,4 +442,4 @@ specification. .SH SEE ALSO .PP -Samtools website: http://samtools.sourceforge.net +Samtools website: -- 2.39.2