/* The MIT License
- Copyright (c) 2008 Genome Research Ltd (GRL).
+ Copyright (c) 2008-2010 Genome Research Ltd (GRL).
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
@header
BAM library provides I/O and various operations on manipulating files
- in the BAM (Binary Alignment/Mapping) or TAM (Text Alignment/Mapping)
- format. It now supports importing from or exporting to TAM, sorting,
+ in the BAM (Binary Alignment/Mapping) or SAM (Sequence Alignment/Map)
+ format. It now supports importing from or exporting to SAM, sorting,
merging, generating pileup, and quickly retrieval of reads overlapped
with a specified region.
@copyright Genome Research Ltd.
*/
+#define BAM_VERSION "0.1.18-r580"
+
#include <stdint.h>
-#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
-#if _IOLIB == 1 && !defined(_NO_RAZF)
-#define BAM_TRUE_OFFSET
-#include "razf.h"
-/*! @abstract BAM file handler */
-typedef RAZF *bamFile;
-#define bam_open(fn, mode) razf_open(fn, mode)
-#define bam_dopen(fd, mode) razf_dopen(fd, mode)
-#define bam_close(fp) razf_close(fp)
-#define bam_read(fp, buf, size) razf_read(fp, buf, size)
-#define bam_write(fp, buf, size) razf_write(fp, buf, size)
-#define bam_tell(fp) razf_tell(fp)
-#define bam_seek(fp, pos, dir) razf_seek(fp, pos, dir)
-#elif _IOLIB == 2
+#ifndef BAM_LITE
#define BAM_VIRTUAL_OFFSET16
#include "bgzf.h"
/*! @abstract BAM file handler */
#define bam_write(fp, buf, size) bgzf_write(fp, buf, size)
#define bam_tell(fp) bgzf_tell(fp)
#define bam_seek(fp, pos, dir) bgzf_seek(fp, pos, dir)
-#elif _IOLIB == 3
-#define BAM_VIRTUAL_OFFSET16
-#include "razf.h"
-/*! @abstract BAM file handler */
-typedef RAZF *bamFile;
-#define bam_open(fn, mode) razf_open2(fn, mode)
-#define bam_dopen(fd, mode) razf_dopen2(fd, mode)
-#define bam_close(fp) razf_close(fp)
-#define bam_read(fp, buf, size) razf_read(fp, buf, size)
-#define bam_write(fp, buf, size) razf_write(fp, buf, size)
-#define bam_tell(fp) razf_tell2(fp)
-#define bam_seek(fp, pos, dir) razf_seek2(fp, pos, dir)
+#else
+#define BAM_TRUE_OFFSET
+#include <zlib.h>
+typedef gzFile bamFile;
+#define bam_open(fn, mode) gzopen(fn, mode)
+#define bam_dopen(fd, mode) gzdopen(fd, mode)
+#define bam_close(fp) gzclose(fp)
+#define bam_read(fp, buf, size) gzread(fp, buf, size)
+/* no bam_write/bam_tell/bam_seek() here */
#endif
/*! @typedef
@field n_targets number of reference sequences
@field target_name names of the reference sequences
@field target_len lengths of the referene sequences
+ @field dict header dictionary
@field hash hash table for fast name lookup
+ @field rg2lib hash table for @RG-ID -> LB lookup
@field l_text length of the plain text in the header
@field text plain text
int32_t n_targets;
char **target_name;
uint32_t *target_len;
- void *hash;
- int l_text;
+ void *dict, *hash, *rg2lib;
+ uint32_t l_text, n_text;
char *text;
} bam_header_t;
#define BAM_FUNMAP 4
/*! @abstract the mate is unmapped */
#define BAM_FMUNMAP 8
+/*! @abstract the read is mapped to the reverse strand */
#define BAM_FREVERSE 16
+/*! @abstract the mate is mapped to the reverse strand */
#define BAM_FMREVERSE 32
+/*! @abstract this is read1 */
#define BAM_FREAD1 64
+/*! @abstract this is read2 */
#define BAM_FREAD2 128
+/*! @abstract not primary alignment */
#define BAM_FSECONDARY 256
+/*! @abstract QC failure */
#define BAM_FQCFAIL 512
+/*! @abstract optical or PCR duplicate */
#define BAM_FDUP 1024
+#define BAM_OFDEC 0
+#define BAM_OFHEX 1
+#define BAM_OFSTR 2
+
+/*! @abstract defautl mask for pileup */
#define BAM_DEF_MASK (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP)
#define BAM_CORE_SIZE sizeof(bam1_core_t)
/*
CIGAR operations.
*/
-/*! @abstract CIGAR: match */
+/*! @abstract CIGAR: M = match or mismatch*/
#define BAM_CMATCH 0
-/*! @abstract CIGAR: insertion to the reference */
+/*! @abstract CIGAR: I = insertion to the reference */
#define BAM_CINS 1
-/*! @abstract CIGAR: deletion from the reference */
+/*! @abstract CIGAR: D = deletion from the reference */
#define BAM_CDEL 2
-/*! @abstract CIGAR: skip on the reference (e.g. spliced alignment) */
+/*! @abstract CIGAR: N = skip on the reference (e.g. spliced alignment) */
#define BAM_CREF_SKIP 3
-/*! @abstract CIGAR: clip on the read with clipped sequence present in qseq */
+/*! @abstract CIGAR: S = clip on the read with clipped sequence
+ present in qseq */
#define BAM_CSOFT_CLIP 4
-/*! @abstract CIGAR: clip on the read with clipped sequence trimmed off */
+/*! @abstract CIGAR: H = clip on the read with clipped sequence trimmed off */
#define BAM_CHARD_CLIP 5
-/*! @abstract CIGAR: padding */
+/*! @abstract CIGAR: P = padding */
#define BAM_CPAD 6
+/*! @abstract CIGAR: equals = match */
+#define BAM_CEQUAL 7
+/*! @abstract CIGAR: X = mismatch */
+#define BAM_CDIFF 8
+#define BAM_CBACK 9
+
+#define BAM_CIGAR_STR "MIDNSHP=XB"
+#define BAM_CIGAR_TYPE 0x3C1A7
+
+#define bam_cigar_op(c) ((c)&BAM_CIGAR_MASK)
+#define bam_cigar_oplen(c) ((c)>>BAM_CIGAR_SHIFT)
+#define bam_cigar_opchr(c) (BAM_CIGAR_STR[bam_cigar_op(c)])
+#define bam_cigar_gen(l, o) ((l)<<BAM_CIGAR_SHIFT|(o))
+#define bam_cigar_type(o) (BAM_CIGAR_TYPE>>((o)<<1)&3) // bit 1: consume query; bit 2: consume reference
/*! @typedef
@abstract Structure for core alignment information.
@field tid chromosome ID, defined by bam_header_t
@field pos 0-based leftmost coordinate
- @field strand strand; 0 for forward and 1 otherwise
@field bin bin calculated by bam_reg2bin()
@field qual mapping quality
@field l_qname length of the query name
@field l_aux length of auxiliary data
@field data_len current length of bam1_t::data
@field m_data maximum length of bam1_t::data
- @field data all variable-length data, concatenated; structure: cigar-qname-seq-qual-aux
+ @field data all variable-length data, concatenated; structure: qname-cigar-seq-qual-aux
@discussion Notes:
1. qname is zero tailing and core.l_qname includes the tailing '\0'.
2. l_qseq is calculated from the total length of an alignment block
on reading or from CIGAR.
+ 3. cigar data is encoded 4 bytes per CIGAR operation.
+ 4. seq is nybble-encoded according to bam_nt16_table.
*/
typedef struct {
bam1_core_t core;
uint8_t *data;
} bam1_t;
+typedef struct __bam_iter_t *bam_iter_t;
+
#define bam1_strand(b) (((b)->core.flag&BAM_FREVERSE) != 0)
#define bam1_mstrand(b) (((b)->core.flag&BAM_FMREVERSE) != 0)
@param b pointer to an alignment
@return pointer to quality string
*/
-#define bam1_qual(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + ((b)->core.l_qseq + 1)/2)
+#define bam1_qual(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (((b)->core.l_qseq + 1)>>1))
/*! @function
@abstract Get a base on read
@param i The i-th position, 0-based
@return 4-bit integer representing the base.
*/
-#define bam1_seqi(s, i) ((s)[(i)/2] >> 4*(1-(i)%2) & 0xf)
+//#define bam1_seqi(s, i) ((s)[(i)/2] >> 4*(1-(i)%2) & 0xf)
+#define bam1_seqi(s, i) ((s)[(i)>>1] >> ((~(i)&1)<<2) & 0xf)
+
+#define bam1_seq_seti(s, i, c) ( (s)[(i)>>1] = ((s)[(i)>>1] & 0xf<<(((i)&1)<<2)) | (c)<<((~(i)&1)<<2) )
/*! @function
@abstract Get query sequence and quality
*/
#define bam1_aux(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (b)->core.l_qseq + ((b)->core.l_qseq + 1)/2)
-typedef struct {
- int32_t qbeg, qend;
- int32_t tbeg, tend;
- int32_t cbeg, cend;
-} bam_segreg_t;
-
#ifndef kroundup32
/*! @function
@abstract Round an integer to the next closest power-2 integer.
*/
extern int bam_is_be;
+/*!
+ @abstract Verbose level between 0 and 3; 0 is supposed to disable all
+ debugging information, though this may not have been implemented.
+ */
+extern int bam_verbose;
+
+extern int bam_no_B;
+
/*! @abstract Table for converting a nucleotide character to the 4-bit encoding. */
extern unsigned char bam_nt16_table[256];
extern "C" {
#endif
+ /*********************
+ * Low-level SAM I/O *
+ *********************/
+
/*! @abstract TAM file handler */
typedef struct __tamFile_t *tamFile;
/*!
- @abstract Open a TAM file, either uncompressed or compressed by gzip/zlib.
- @param fn TAM file name
- @return TAM file handler
+ @abstract Open a SAM file for reading, either uncompressed or compressed by gzip/zlib.
+ @param fn SAM file name
+ @return SAM file handler
*/
tamFile sam_open(const char *fn);
/*!
- @abstract Close a TAM file handler
- @param fp TAM file handler
+ @abstract Close a SAM file handler
+ @param fp SAM file handler
*/
void sam_close(tamFile fp);
/*!
- @abstract Read one alignment from a TAM file handler
- @param fp TAM file handler
+ @abstract Read one alignment from a SAM file handler
+ @param fp SAM file handler
@param header header information (ordered names of chromosomes)
@param b read alignment; all members in b will be updated
@return 0 if successful; otherwise negative
*/
bam_header_t *sam_header_read2(const char *fn_list);
+ /*!
+ @abstract Read header from a SAM file (if present)
+ @param fp SAM file handler
+ @return pointer to header struct; 0 if no @SQ lines available
+ */
bam_header_t *sam_header_read(tamFile fp);
+
+ /*!
+ @abstract Parse @SQ lines a update a header struct
+ @param h pointer to the header struct to be updated
+ @return number of target sequences
+
+ @discussion bam_header_t::{n_targets,target_len,target_name} will
+ be destroyed in the first place.
+ */
int sam_header_parse(bam_header_t *h);
+ int32_t bam_get_tid(const bam_header_t *header, const char *seq_name);
+
+ /*!
+ @abstract Parse @RG lines a update a header struct
+ @param h pointer to the header struct to be updated
+ @return number of @RG lines
+
+ @discussion bam_header_t::rg2lib will be destroyed in the first
+ place.
+ */
+ int sam_header_parse_rg(bam_header_t *h);
#define sam_write1(header, b) bam_view1(header, b)
+
+ /********************************
+ * APIs for string dictionaries *
+ ********************************/
+
+ int bam_strmap_put(void *strmap, const char *rg, const char *lib);
+ const char *bam_strmap_get(const void *strmap, const char *rg);
+ void *bam_strmap_dup(const void*);
+ void *bam_strmap_init();
+ void bam_strmap_destroy(void *strmap);
+
+
+ /*********************
+ * Low-level BAM I/O *
+ *********************/
+
/*!
@abstract Initialize a header structure.
@return the pointer to the header structure
*/
int bam_read1(bamFile fp, bam1_t *b);
+ int bam_remove_B(bam1_t *b);
+
/*!
@abstract Write an alignment to BAM.
@param fp BAM file handler
@abstract Free the memory allocated for an alignment.
@param b pointer to an alignment
*/
-#define bam_destroy1(b) do { \
- free((b)->data); free(b); \
+#define bam_destroy1(b) do { \
+ if (b) { free((b)->data); free(b); } \
} while (0)
/*!
*/
char *bam_format1(const bam_header_t *header, const bam1_t *b);
- /*!
- @abstract Merge multiple sorted BAM.
- @param is_by_qname whether to sort by query name
- @param out output BAM file name
- @param n number of files to be merged
- @param fn names of files to be merged
-
- @discussion Padding information may NOT correctly maintained. This
- function is NOT thread safe.
- */
- void bam_merge_core(int is_by_qname, const char *out, int n, char * const *fn);
+ char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of);
/*!
- @abstract Sort an unsorted BAM file based on the chromosome order
- and the leftmost position of an alignment
-
- @param is_by_qname whether to sort by query name
- @param fn name of the file to be sorted
- @param prefix prefix of the output and the temporary files; upon
- sucessess, prefix.bam will be written.
- @param max_mem approxiate maximum memory (very inaccurate)
-
- @discussion It may create multiple temporary subalignment files
- and then merge them by calling bam_merge_core(). This function is
- NOT thread safe.
+ @abstract Check whether a BAM record is plausibly valid
+ @param header associated header structure, or NULL if unavailable
+ @param b alignment to validate
+ @return 0 if the alignment is invalid; non-zero otherwise
+
+ @discussion Simple consistency check of some of the fields of the
+ alignment record. If the header is provided, several additional checks
+ are made. Not all fields are checked, so a non-zero result is not a
+ guarantee that the record is valid. However it is usually good enough
+ to detect when bam_seek() has been called with a virtual file offset
+ that is not the offset of an alignment record.
*/
- void bam_sort_core(int is_by_qname, const char *fn, const char *prefix, size_t max_mem);
+ int bam_validate1(const bam_header_t *header, const bam1_t *b);
+
+ const char *bam_get_library(bam_header_t *header, const bam1_t *b);
+
+
+ /***************
+ * pileup APIs *
+ ***************/
/*! @typedef
@abstract Structure for one alignment covering the pileup position.
bam1_t *b;
int32_t qpos;
int indel, level;
- uint32_t is_del:1, is_head:1, is_tail:1;
+ uint32_t is_del:1, is_head:1, is_tail:1, is_refskip:1, aux:28;
} bam_pileup1_t;
- struct __bam_plbuf_t;
- /*! @abstract pileup buffer */
- typedef struct __bam_plbuf_t bam_plbuf_t;
+ typedef int (*bam_plp_auto_f)(void *data, bam1_t *b);
- void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask);
+ struct __bam_plp_t;
+ typedef struct __bam_plp_t *bam_plp_t;
+
+ bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data);
+ int bam_plp_push(bam_plp_t iter, const bam1_t *b);
+ const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp);
+ const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp);
+ void bam_plp_set_mask(bam_plp_t iter, int mask);
+ void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt);
+ void bam_plp_reset(bam_plp_t iter);
+ void bam_plp_destroy(bam_plp_t iter);
+
+ struct __bam_mplp_t;
+ typedef struct __bam_mplp_t *bam_mplp_t;
+
+ bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data);
+ void bam_mplp_destroy(bam_mplp_t iter);
+ void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt);
+ int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp);
/*! @typedef
@abstract Type of function to be called by bam_plbuf_push().
*/
typedef int (*bam_pileup_f)(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data);
- void bam_plbuf_reset(bam_plbuf_t *buf);
+ typedef struct {
+ bam_plp_t iter;
+ bam_pileup_f func;
+ void *data;
+ } bam_plbuf_t;
- /*!
- @abstract Initialize a buffer for pileup.
- @param func fucntion to be called by bam_pileup_core()
- @param data user provided data
- @return pointer to the pileup buffer
- */
+ void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask);
+ void bam_plbuf_reset(bam_plbuf_t *buf);
bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data);
-
- /*!
- @abstract Destroy a pileup buffer.
- @param buf pointer to the pileup buffer
- */
void bam_plbuf_destroy(bam_plbuf_t *buf);
-
- /*!
- @abstract Push an alignment to the pileup buffer.
- @param b alignment to be pushed
- @param buf pileup buffer
- @see bam_plbuf_init()
- @return always 0 currently
-
- @discussion If all the alignments covering a particular site have
- been collected, this function will call the user defined function
- as is provided to bam_plbuf_init(). The coordinate of the site the
- all the alignments will be transferred to the user defined
- function as function parameters.
-
- When all the alignments are pushed to the buffer, this function
- needs to be called with b equal to NULL. This will flush the
- buffer. A pileup buffer cannot be reused.
- */
int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf);
+ int bam_pileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data);
+
struct __bam_lplbuf_t;
typedef struct __bam_lplbuf_t bam_lplbuf_t;
/*! @abstract bam_plbuf_push() equivalent with level calculated. */
int bam_lplbuf_push(const bam1_t *b, bam_lplbuf_t *buf);
- /*! @abstract bam_plbuf_file() equivalent with level calculated. */
- int bam_lpileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data);
+
+ /*********************
+ * BAM indexing APIs *
+ *********************/
struct __bam_index_t;
typedef struct __bam_index_t bam_index_t;
*/
int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func);
+ bam_iter_t bam_iter_query(const bam_index_t *idx, int tid, int beg, int end);
+ int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b);
+ void bam_iter_destroy(bam_iter_t iter);
+
/*!
@abstract Parse a region in the format: "chr2:100,000-200,000".
@discussion bam_header_t::hash will be initialized if empty.
@param ref_id the returned chromosome ID
@param begin the returned start coordinate
@param end the returned end coordinate
+ @return 0 on success; -1 on failure
*/
- void bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *begin, int *end);
+ int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *begin, int *end);
+
+
+ /**************************
+ * APIs for optional tags *
+ **************************/
+
+ /*!
+ @abstract Retrieve data of a tag
+ @param b pointer to an alignment struct
+ @param tag two-character tag to be retrieved
+
+ @return pointer to the type and data. The first character is the
+ type that can be 'iIsScCdfAZH'.
+
+ @discussion Use bam_aux2?() series to convert the returned data to
+ the corresponding type.
+ */
+ uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]);
- void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data);
- uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2]); // an alias of bam_aux_get()
- uint8_t *bam_aux_get(bam1_t *b, const char tag[2]);
int32_t bam_aux2i(const uint8_t *s);
float bam_aux2f(const uint8_t *s);
double bam_aux2d(const uint8_t *s);
char bam_aux2A(const uint8_t *s);
char *bam_aux2Z(const uint8_t *s);
- /*!
- @abstract Get the color encoding the previous and current base
- @param b pointer to an alignment
- @param i The i-th position, 0-based
- @return color
-
- @discussion Returns 0 no color information is found.
- */
- char bam_aux_getCSi(bam1_t *b, int i);
-
- /*!
- @abstract Get the color quality of the color encoding the previous and current base
- @param b pointer to an alignment
- @param i The i-th position, 0-based
- @return color quality
-
- @discussion Returns 0 no color information is found.
- */
- char bam_aux_getCQi(bam1_t *b, int i);
+ int bam_aux_del(bam1_t *b, uint8_t *s);
+ void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data);
+ uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2]); // an alias of bam_aux_get()
- /*!
- @abstract Get the color error profile at the give position
- @param b pointer to an alignment
- @return the original color if the color was an error, '-' (dash) otherwise
- @discussion Returns 0 no color information is found.
- */
- char bam_aux_getCEi(bam1_t *b, int i);
+ /*****************
+ * Miscellaneous *
+ *****************/
/*!
@abstract Calculate the rightmost coordinate of an alignment on the
*/
int32_t bam_cigar2qlen(const bam1_core_t *c, const uint32_t *cigar);
- int bam_segreg(int32_t pos, const bam1_core_t *c, const uint32_t *cigar, bam_segreg_t *reg);
-
#ifdef __cplusplus
}
#endif
return 0;
}
+/*!
+ @abstract Copy an alignment
+ @param bdst destination alignment struct
+ @param bsrc source alignment struct
+ @return pointer to the destination alignment struct
+ */
static inline bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
{
uint8_t *data = bdst->data;
int m_data = bdst->m_data; // backup data and m_data
- if (m_data < bsrc->m_data) { // double the capacity
- m_data = bsrc->m_data; kroundup32(m_data);
+ if (m_data < bsrc->data_len) { // double the capacity
+ m_data = bsrc->data_len; kroundup32(m_data);
data = (uint8_t*)realloc(data, m_data);
}
memcpy(data, bsrc->data, bsrc->data_len); // copy var-len data
return bdst;
}
+/*!
+ @abstract Duplicate an alignment
+ @param src source alignment struct
+ @return pointer to the destination alignment struct
+ */
static inline bam1_t *bam_dup1(const bam1_t *src)
{
bam1_t *b;
return b;
}
+static inline int bam_aux_type2size(int x)
+{
+ if (x == 'C' || x == 'c' || x == 'A') return 1;
+ else if (x == 'S' || x == 's') return 2;
+ else if (x == 'I' || x == 'i' || x == 'f' || x == 'F') return 4;
+ else return 0;
+}
+
+/*********************************
+ *** Compatibility with htslib ***
+ *********************************/
+
+typedef bam_header_t bam_hdr_t;
+
+#define bam_get_qname(b) bam1_qname(b)
+#define bam_get_cigar(b) bam1_cigar(b)
+
+#define bam_hdr_read(fp) bam_header_read(fp)
+#define bam_hdr_write(fp, h) bam_header_write(fp, h)
+#define bam_hdr_destroy(fp) bam_header_destroy(fp)
+
#endif