X-Git-Url: https://git.donarmstrong.com/?p=samtools.git;a=blobdiff_plain;f=bam.h;h=f3f37f3ecefff6509b7480c52eed0f0090a1fd2f;hp=4b3a68856e7245d4f1595a194cc724f2f95af2eb;hb=9f118264ea012adc21a46d7c03eaad4f9ce7d4d4;hpb=f93dae0d03856955f9424e8b2aaf261304ca647e diff --git a/bam.h b/bam.h index 4b3a688..f3f37f3 100644 --- a/bam.h +++ b/bam.h @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2008 Genome Research Ltd (GRL). + Copyright (c) 2008-2010 Genome Research Ltd (GRL). Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -32,33 +32,22 @@ @header BAM library provides I/O and various operations on manipulating files - in the BAM (Binary Alignment/Mapping) or TAM (Text Alignment/Mapping) - format. It now supports importing from or exporting to TAM, sorting, + in the BAM (Binary Alignment/Mapping) or SAM (Sequence Alignment/Map) + format. It now supports importing from or exporting to SAM, sorting, merging, generating pileup, and quickly retrieval of reads overlapped with a specified region. @copyright Genome Research Ltd. */ +#define BAM_VERSION "0.1.18-r572" + #include -#include #include #include #include -#if _IOLIB == 1 -#define BAM_TRUE_OFFSET -#include "razf.h" -/*! @abstract BAM file handler */ -typedef RAZF *bamFile; -#define bam_open(fn, mode) razf_open(fn, mode) -#define bam_dopen(fd, mode) razf_dopen(fd, mode) -#define bam_close(fp) razf_close(fp) -#define bam_read(fp, buf, size) razf_read(fp, buf, size) -#define bam_write(fp, buf, size) razf_write(fp, buf, size) -#define bam_tell(fp) razf_tell(fp) -#define bam_seek(fp, pos, dir) razf_seek(fp, pos, dir) -#elif _IOLIB == 2 +#ifndef BAM_LITE #define BAM_VIRTUAL_OFFSET16 #include "bgzf.h" /*! @abstract BAM file handler */ @@ -70,18 +59,15 @@ typedef BGZF *bamFile; #define bam_write(fp, buf, size) bgzf_write(fp, buf, size) #define bam_tell(fp) bgzf_tell(fp) #define bam_seek(fp, pos, dir) bgzf_seek(fp, pos, dir) -#elif _IOLIB == 3 -#define BAM_VIRTUAL_OFFSET16 -#include "razf.h" -/*! @abstract BAM file handler */ -typedef RAZF *bamFile; -#define bam_open(fn, mode) razf_open2(fn, mode) -#define bam_dopen(fd, mode) razf_dopen2(fd, mode) -#define bam_close(fp) razf_close(fp) -#define bam_read(fp, buf, size) razf_read(fp, buf, size) -#define bam_write(fp, buf, size) razf_write(fp, buf, size) -#define bam_tell(fp) razf_tell2(fp) -#define bam_seek(fp, pos, dir) razf_seek2(fp, pos, dir) +#else +#define BAM_TRUE_OFFSET +#include +typedef gzFile bamFile; +#define bam_open(fn, mode) gzopen(fn, mode) +#define bam_dopen(fd, mode) gzdopen(fd, mode) +#define bam_close(fp) gzclose(fp) +#define bam_read(fp, buf, size) gzread(fp, buf, size) +/* no bam_write/bam_tell/bam_seek() here */ #endif /*! @typedef @@ -89,7 +75,9 @@ typedef RAZF *bamFile; @field n_targets number of reference sequences @field target_name names of the reference sequences @field target_len lengths of the referene sequences + @field dict header dictionary @field hash hash table for fast name lookup + @field rg2lib hash table for @RG-ID -> LB lookup @field l_text length of the plain text in the header @field text plain text @@ -100,8 +88,8 @@ typedef struct { int32_t n_targets; char **target_name; uint32_t *target_len; - void *hash; - int l_text; + void *dict, *hash, *rg2lib; + uint32_t l_text, n_text; char *text; } bam_header_t; @@ -113,11 +101,27 @@ typedef struct { #define BAM_FUNMAP 4 /*! @abstract the mate is unmapped */ #define BAM_FMUNMAP 8 +/*! @abstract the read is mapped to the reverse strand */ #define BAM_FREVERSE 16 +/*! @abstract the mate is mapped to the reverse strand */ #define BAM_FMREVERSE 32 +/*! @abstract this is read1 */ #define BAM_FREAD1 64 +/*! @abstract this is read2 */ #define BAM_FREAD2 128 +/*! @abstract not primary alignment */ #define BAM_FSECONDARY 256 +/*! @abstract QC failure */ +#define BAM_FQCFAIL 512 +/*! @abstract optical or PCR duplicate */ +#define BAM_FDUP 1024 + +#define BAM_OFDEC 0 +#define BAM_OFHEX 1 +#define BAM_OFSTR 2 + +/*! @abstract defautl mask for pileup */ +#define BAM_DEF_MASK (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) #define BAM_CORE_SIZE sizeof(bam1_core_t) @@ -130,20 +134,35 @@ typedef struct { /* CIGAR operations. */ -/*! @abstract CIGAR: match */ +/*! @abstract CIGAR: M = match or mismatch*/ #define BAM_CMATCH 0 -/*! @abstract CIGAR: insertion to the reference */ +/*! @abstract CIGAR: I = insertion to the reference */ #define BAM_CINS 1 -/*! @abstract CIGAR: deletion from the reference */ +/*! @abstract CIGAR: D = deletion from the reference */ #define BAM_CDEL 2 -/*! @abstract CIGAR: skip on the reference (e.g. spliced alignment) */ +/*! @abstract CIGAR: N = skip on the reference (e.g. spliced alignment) */ #define BAM_CREF_SKIP 3 -/*! @abstract CIGAR: clip on the read with clipped sequence present in qseq */ +/*! @abstract CIGAR: S = clip on the read with clipped sequence + present in qseq */ #define BAM_CSOFT_CLIP 4 -/*! @abstract CIGAR: clip on the read with clipped sequence trimmed off */ +/*! @abstract CIGAR: H = clip on the read with clipped sequence trimmed off */ #define BAM_CHARD_CLIP 5 -/*! @abstract CIGAR: padding */ +/*! @abstract CIGAR: P = padding */ #define BAM_CPAD 6 +/*! @abstract CIGAR: equals = match */ +#define BAM_CEQUAL 7 +/*! @abstract CIGAR: X = mismatch */ +#define BAM_CDIFF 8 +#define BAM_CBACK 9 + +#define BAM_CIGAR_STR "MIDNSHP=XB" +#define BAM_CIGAR_TYPE 0x3C1A7 + +#define bam_cigar_op(c) ((c)&BAM_CIGAR_MASK) +#define bam_cigar_oplen(c) ((c)>>BAM_CIGAR_SHIFT) +#define bam_cigar_opchr(c) (BAM_CIGAR_STR[bam_cigar_op(c)]) +#define bam_cigar_gen(l, o) ((l)<>((o)<<1)&3) // bit 1: consume query; bit 2: consume reference /*! @typedef @abstract Structure for core alignment information. @@ -175,7 +194,6 @@ typedef struct { @field data_len current length of bam1_t::data @field m_data maximum length of bam1_t::data @field data all variable-length data, concatenated; structure: cigar-qname-seq-qual-aux - @field hash hash table for fast retrieval of tag-value pairs; private @discussion Notes: @@ -187,9 +205,10 @@ typedef struct { bam1_core_t core; int l_aux, data_len, m_data; uint8_t *data; - void *hash; } bam1_t; +typedef struct __bam_iter_t *bam_iter_t; + #define bam1_strand(b) (((b)->core.flag&BAM_FREVERSE) != 0) #define bam1_mstrand(b) (((b)->core.flag&BAM_FMREVERSE) != 0) @@ -228,7 +247,7 @@ typedef struct { @param b pointer to an alignment @return pointer to quality string */ -#define bam1_qual(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + ((b)->core.l_qseq + 1)/2) +#define bam1_qual(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (((b)->core.l_qseq + 1)>>1)) /*! @function @abstract Get a base on read @@ -236,7 +255,10 @@ typedef struct { @param i The i-th position, 0-based @return 4-bit integer representing the base. */ -#define bam1_seqi(s, i) ((s)[(i)/2] >> 4*(1-(i)%2) & 0xf) +//#define bam1_seqi(s, i) ((s)[(i)/2] >> 4*(1-(i)%2) & 0xf) +#define bam1_seqi(s, i) ((s)[(i)>>1] >> ((~(i)&1)<<2) & 0xf) + +#define bam1_seq_seti(s, i, c) ( (s)[(i)>>1] = ((s)[(i)>>1] & 0xf<<(((i)&1)<<2)) | (c)<<((~(i)&1)<<2) ) /*! @function @abstract Get query sequence and quality @@ -245,12 +267,6 @@ typedef struct { */ #define bam1_aux(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (b)->core.l_qseq + ((b)->core.l_qseq + 1)/2) -typedef struct { - int32_t qbeg, qend; - int32_t tbeg, tend; - int32_t cbeg, cend; -} bam_segreg_t; - #ifndef kroundup32 /*! @function @abstract Round an integer to the next closest power-2 integer. @@ -266,6 +282,14 @@ typedef struct { */ extern int bam_is_be; +/*! + @abstract Verbose level between 0 and 3; 0 is supposed to disable all + debugging information, though this may not have been implemented. + */ +extern int bam_verbose; + +extern int bam_no_B; + /*! @abstract Table for converting a nucleotide character to the 4-bit encoding. */ extern unsigned char bam_nt16_table[256]; @@ -278,25 +302,29 @@ extern char bam_nt16_nt4_table[]; extern "C" { #endif + /********************* + * Low-level SAM I/O * + *********************/ + /*! @abstract TAM file handler */ typedef struct __tamFile_t *tamFile; /*! - @abstract Open a TAM file, either uncompressed or compressed by gzip/zlib. - @param fn TAM file name - @return TAM file handler + @abstract Open a SAM file for reading, either uncompressed or compressed by gzip/zlib. + @param fn SAM file name + @return SAM file handler */ tamFile sam_open(const char *fn); /*! - @abstract Close a TAM file handler - @param fp TAM file handler + @abstract Close a SAM file handler + @param fp SAM file handler */ void sam_close(tamFile fp); /*! - @abstract Read one alignment from a TAM file handler - @param fp TAM file handler + @abstract Read one alignment from a SAM file handler + @param fp SAM file handler @param header header information (ordered names of chromosomes) @param b read alignment; all members in b will be updated @return 0 if successful; otherwise negative @@ -313,8 +341,52 @@ extern "C" { */ bam_header_t *sam_header_read2(const char *fn_list); + /*! + @abstract Read header from a SAM file (if present) + @param fp SAM file handler + @return pointer to header struct; 0 if no @SQ lines available + */ + bam_header_t *sam_header_read(tamFile fp); + + /*! + @abstract Parse @SQ lines a update a header struct + @param h pointer to the header struct to be updated + @return number of target sequences + + @discussion bam_header_t::{n_targets,target_len,target_name} will + be destroyed in the first place. + */ + int sam_header_parse(bam_header_t *h); + int32_t bam_get_tid(const bam_header_t *header, const char *seq_name); + + /*! + @abstract Parse @RG lines a update a header struct + @param h pointer to the header struct to be updated + @return number of @RG lines + + @discussion bam_header_t::rg2lib will be destroyed in the first + place. + */ + int sam_header_parse_rg(bam_header_t *h); + #define sam_write1(header, b) bam_view1(header, b) + + /******************************** + * APIs for string dictionaries * + ********************************/ + + int bam_strmap_put(void *strmap, const char *rg, const char *lib); + const char *bam_strmap_get(const void *strmap, const char *rg); + void *bam_strmap_dup(const void*); + void *bam_strmap_init(); + void bam_strmap_destroy(void *strmap); + + + /********************* + * Low-level BAM I/O * + *********************/ + /*! @abstract Initialize a header structure. @return the pointer to the header structure @@ -363,6 +435,8 @@ extern "C" { */ int bam_read1(bamFile fp, bam1_t *b); + int bam_remove_B(bam1_t *b); + /*! @abstract Write an alignment to BAM. @param fp BAM file handler @@ -397,44 +471,41 @@ extern "C" { @abstract Free the memory allocated for an alignment. @param b pointer to an alignment */ -#define bam_destroy1(b) do { \ - if ((b)->hash) bam_aux_destroy(b); free((b)->data); free(b); \ +#define bam_destroy1(b) do { \ + if (b) { free((b)->data); free(b); } \ } while (0) /*! - @abstract Print an alignment to the standard output in TAM format. + @abstract Format a BAM record in the SAM format @param header pointer to the header structure @param b alignment to print + @return a pointer to the SAM string */ - void bam_view1(const bam_header_t *header, const bam1_t *b); + char *bam_format1(const bam_header_t *header, const bam1_t *b); - /*! - @abstract Merge multiple sorted BAM. - @param is_by_qname whether to sort by query name - @param out output BAM file name - @param n number of files to be merged - @param fn names of files to be merged - - @discussion Padding information may NOT correctly maintained. This - function is NOT thread safe. - */ - void bam_merge_core(int is_by_qname, const char *out, int n, char * const *fn); + char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of); /*! - @abstract Sort an unsorted BAM file based on the chromosome order - and the leftmost position of an alignment - - @param is_by_qname whether to sort by query name - @param fn name of the file to be sorted - @param prefix prefix of the output and the temporary files; upon - sucessess, prefix.bam will be written. - @param max_mem approxiate maximum memory (very inaccurate) - - @discussion It may create multiple temporary subalignment files - and then merge them by calling bam_merge_core(). This function is - NOT thread safe. + @abstract Check whether a BAM record is plausibly valid + @param header associated header structure, or NULL if unavailable + @param b alignment to validate + @return 0 if the alignment is invalid; non-zero otherwise + + @discussion Simple consistency check of some of the fields of the + alignment record. If the header is provided, several additional checks + are made. Not all fields are checked, so a non-zero result is not a + guarantee that the record is valid. However it is usually good enough + to detect when bam_seek() has been called with a virtual file offset + that is not the offset of an alignment record. */ - void bam_sort_core(int is_by_qname, const char *fn, const char *prefix, size_t max_mem); + int bam_validate1(const bam_header_t *header, const bam1_t *b); + + const char *bam_get_library(bam_header_t *header, const bam1_t *b); + + + /*************** + * pileup APIs * + ***************/ /*! @typedef @abstract Structure for one alignment covering the pileup position. @@ -454,12 +525,30 @@ extern "C" { bam1_t *b; int32_t qpos; int indel, level; - uint32_t is_del:1, is_head:1, is_tail:1; + uint32_t is_del:1, is_head:1, is_tail:1, is_refskip:1, aux:28; } bam_pileup1_t; - struct __bam_plbuf_t; - /*! @abstract pileup buffer */ - typedef struct __bam_plbuf_t bam_plbuf_t; + typedef int (*bam_plp_auto_f)(void *data, bam1_t *b); + + struct __bam_plp_t; + typedef struct __bam_plp_t *bam_plp_t; + + bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data); + int bam_plp_push(bam_plp_t iter, const bam1_t *b); + const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp); + const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp); + void bam_plp_set_mask(bam_plp_t iter, int mask); + void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt); + void bam_plp_reset(bam_plp_t iter); + void bam_plp_destroy(bam_plp_t iter); + + struct __bam_mplp_t; + typedef struct __bam_mplp_t *bam_mplp_t; + + bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data); + void bam_mplp_destroy(bam_mplp_t iter); + void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt); + int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp); /*! @typedef @abstract Type of function to be called by bam_plbuf_push(). @@ -472,51 +561,19 @@ extern "C" { */ typedef int (*bam_pileup_f)(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data); - void bam_plbuf_reset(bam_plbuf_t *buf); + typedef struct { + bam_plp_t iter; + bam_pileup_f func; + void *data; + } bam_plbuf_t; - /*! - @abstract Initialize a buffer for pileup. - @param func fucntion to be called by bam_pileup_core() - @param data user provided data - @return pointer to the pileup buffer - */ + void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask); + void bam_plbuf_reset(bam_plbuf_t *buf); bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data); - - /*! - @abstract Destroy a pileup buffer. - @param buf pointer to the pileup buffer - */ void bam_plbuf_destroy(bam_plbuf_t *buf); - - /*! - @abstract Push an alignment to the pileup buffer. - @param b alignment to be pushed - @param buf pileup buffer - @see bam_plbuf_init() - @return always 0 currently - - @discussion If all the alignments covering a particular site have - been collected, this function will call the user defined function - as is provided to bam_plbuf_init(). The coordinate of the site the - all the alignments will be transferred to the user defined - function as function parameters. - - When all the alignments are pushed to the buffer, this function - needs to be called with b equal to NULL. This will flush the - buffer. A pileup buffer cannot be reused. - */ int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf); - /*! - @abstract A more convenient interface to bam_plbuf_push() - @param fp BAM file handler - @param func user defined function - @param func_data user provided data - - @discussion The file position indicator must be placed right - before the start of an alignment. See also bam_plbuf_push(). - */ - int bam_pileup_file(bamFile fp, bam_pileup_f func, void *func_data); + int bam_pileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data); struct __bam_lplbuf_t; typedef struct __bam_lplbuf_t bam_lplbuf_t; @@ -532,8 +589,10 @@ extern "C" { /*! @abstract bam_plbuf_push() equivalent with level calculated. */ int bam_lplbuf_push(const bam1_t *b, bam_lplbuf_t *buf); - /*! @abstract bam_plbuf_file() equivalent with level calculated. */ - int bam_lpileup_file(bamFile fp, bam_pileup_f func, void *func_data); + + /********************* + * BAM indexing APIs * + *********************/ struct __bam_index_t; typedef struct __bam_index_t bam_index_t; @@ -583,6 +642,10 @@ extern "C" { */ int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func); + bam_iter_t bam_iter_query(const bam_index_t *idx, int tid, int beg, int end); + int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b); + void bam_iter_destroy(bam_iter_t iter); + /*! @abstract Parse a region in the format: "chr2:100,000-200,000". @discussion bam_header_t::hash will be initialized if empty. @@ -591,14 +654,42 @@ extern "C" { @param ref_id the returned chromosome ID @param begin the returned start coordinate @param end the returned end coordinate + @return 0 on success; -1 on failure */ - void bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *begin, int *end); + int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *begin, int *end); + + + /************************** + * APIs for optional tags * + **************************/ + + /*! + @abstract Retrieve data of a tag + @param b pointer to an alignment struct + @param tag two-character tag to be retrieved - int32_t bam_aux_geti(bam1_t *b, const char tag[2], int *err); - float bam_aux_getf(bam1_t *b, const char tag[2], int *err); - char bam_aux_getc(bam1_t *b, const char tag[2], int *err); - char *bam_aux_getZH(bam1_t *b, const char tag[2], int *err); - void bam_aux_destroy(bam1_t *b); + @return pointer to the type and data. The first character is the + type that can be 'iIsScCdfAZH'. + + @discussion Use bam_aux2?() series to convert the returned data to + the corresponding type. + */ + uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]); + + int32_t bam_aux2i(const uint8_t *s); + float bam_aux2f(const uint8_t *s); + double bam_aux2d(const uint8_t *s); + char bam_aux2A(const uint8_t *s); + char *bam_aux2Z(const uint8_t *s); + + int bam_aux_del(bam1_t *b, uint8_t *s); + void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data); + uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2]); // an alias of bam_aux_get() + + + /***************** + * Miscellaneous * + *****************/ /*! @abstract Calculate the rightmost coordinate of an alignment on the @@ -618,8 +709,6 @@ extern "C" { */ int32_t bam_cigar2qlen(const bam1_core_t *c, const uint32_t *cigar); - int bam_segreg(int32_t pos, const bam1_core_t *c, const uint32_t *cigar, bam_segreg_t *reg); - #ifdef __cplusplus } #endif @@ -641,12 +730,18 @@ static inline int bam_reg2bin(uint32_t beg, uint32_t end) return 0; } -static inline void bam_copy1(bam1_t *bdst, const bam1_t *bsrc) +/*! + @abstract Copy an alignment + @param bdst destination alignment struct + @param bsrc source alignment struct + @return pointer to the destination alignment struct + */ +static inline bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc) { uint8_t *data = bdst->data; int m_data = bdst->m_data; // backup data and m_data - if (m_data < bsrc->m_data) { // double the capacity - m_data = bsrc->m_data; kroundup32(m_data); + if (m_data < bsrc->data_len) { // double the capacity + m_data = bsrc->data_len; kroundup32(m_data); data = (uint8_t*)realloc(data, m_data); } memcpy(data, bsrc->data, bsrc->data_len); // copy var-len data @@ -654,6 +749,44 @@ static inline void bam_copy1(bam1_t *bdst, const bam1_t *bsrc) // restore the backup bdst->m_data = m_data; bdst->data = data; + return bdst; } +/*! + @abstract Duplicate an alignment + @param src source alignment struct + @return pointer to the destination alignment struct + */ +static inline bam1_t *bam_dup1(const bam1_t *src) +{ + bam1_t *b; + b = bam_init1(); + *b = *src; + b->m_data = b->data_len; + b->data = (uint8_t*)calloc(b->data_len, 1); + memcpy(b->data, src->data, b->data_len); + return b; +} + +static inline int bam_aux_type2size(int x) +{ + if (x == 'C' || x == 'c' || x == 'A') return 1; + else if (x == 'S' || x == 's') return 2; + else if (x == 'I' || x == 'i' || x == 'f') return 4; + else return 0; +} + +/********************************* + *** Compatibility with htslib *** + *********************************/ + +typedef bam_header_t bam_hdr_t; + +#define bam_get_qname(b) bam1_qname(b) +#define bam_get_cigar(b) bam1_cigar(b) + +#define bam_hdr_read(fp) bam_header_read(fp) +#define bam_hdr_write(fp, h) bam_header_write(fp, h) +#define bam_hdr_destroy(fp) bam_header_destroy(fp) + #endif