X-Git-Url: https://git.donarmstrong.com/?p=samtools.git;a=blobdiff_plain;f=bam.h;h=a3305e3cf4b758edaaeaf201ea4fbc87ab8f919b;hp=fb71a49b46dafea0512a99e157735104cd3832a3;hb=0ccd9d36ebf35ce620a8248ecf4336c84065e6c0;hpb=a958954399757774ee26bfcf0b5b95e9ec9b62f4 diff --git a/bam.h b/bam.h index fb71a49..a3305e3 100644 --- a/bam.h +++ b/bam.h @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2008 Genome Research Ltd (GRL). + Copyright (c) 2008-2010 Genome Research Ltd (GRL). Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -33,13 +33,15 @@ BAM library provides I/O and various operations on manipulating files in the BAM (Binary Alignment/Mapping) or SAM (Sequence Alignment/Map) - format. It now supports importing from or exporting to TAM, sorting, + format. It now supports importing from or exporting to SAM, sorting, merging, generating pileup, and quickly retrieval of reads overlapped with a specified region. @copyright Genome Research Ltd. */ +#define BAM_VERSION "0.1.18-r580" + #include #include #include @@ -87,7 +89,7 @@ typedef struct { char **target_name; uint32_t *target_len; void *dict, *hash, *rg2lib; - size_t l_text, n_text; + uint32_t l_text, n_text; char *text; } bam_header_t; @@ -132,26 +134,40 @@ typedef struct { /* CIGAR operations. */ -/*! @abstract CIGAR: match */ +/*! @abstract CIGAR: M = match or mismatch*/ #define BAM_CMATCH 0 -/*! @abstract CIGAR: insertion to the reference */ +/*! @abstract CIGAR: I = insertion to the reference */ #define BAM_CINS 1 -/*! @abstract CIGAR: deletion from the reference */ +/*! @abstract CIGAR: D = deletion from the reference */ #define BAM_CDEL 2 -/*! @abstract CIGAR: skip on the reference (e.g. spliced alignment) */ +/*! @abstract CIGAR: N = skip on the reference (e.g. spliced alignment) */ #define BAM_CREF_SKIP 3 -/*! @abstract CIGAR: clip on the read with clipped sequence present in qseq */ +/*! @abstract CIGAR: S = clip on the read with clipped sequence + present in qseq */ #define BAM_CSOFT_CLIP 4 -/*! @abstract CIGAR: clip on the read with clipped sequence trimmed off */ +/*! @abstract CIGAR: H = clip on the read with clipped sequence trimmed off */ #define BAM_CHARD_CLIP 5 -/*! @abstract CIGAR: padding */ +/*! @abstract CIGAR: P = padding */ #define BAM_CPAD 6 +/*! @abstract CIGAR: equals = match */ +#define BAM_CEQUAL 7 +/*! @abstract CIGAR: X = mismatch */ +#define BAM_CDIFF 8 +#define BAM_CBACK 9 + +#define BAM_CIGAR_STR "MIDNSHP=XB" +#define BAM_CIGAR_TYPE 0x3C1A7 + +#define bam_cigar_op(c) ((c)&BAM_CIGAR_MASK) +#define bam_cigar_oplen(c) ((c)>>BAM_CIGAR_SHIFT) +#define bam_cigar_opchr(c) (BAM_CIGAR_STR[bam_cigar_op(c)]) +#define bam_cigar_gen(l, o) ((l)<>((o)<<1)&3) // bit 1: consume query; bit 2: consume reference /*! @typedef @abstract Structure for core alignment information. @field tid chromosome ID, defined by bam_header_t @field pos 0-based leftmost coordinate - @field strand strand; 0 for forward and 1 otherwise @field bin bin calculated by bam_reg2bin() @field qual mapping quality @field l_qname length of the query name @@ -176,13 +192,15 @@ typedef struct { @field l_aux length of auxiliary data @field data_len current length of bam1_t::data @field m_data maximum length of bam1_t::data - @field data all variable-length data, concatenated; structure: cigar-qname-seq-qual-aux + @field data all variable-length data, concatenated; structure: qname-cigar-seq-qual-aux @discussion Notes: 1. qname is zero tailing and core.l_qname includes the tailing '\0'. 2. l_qseq is calculated from the total length of an alignment block on reading or from CIGAR. + 3. cigar data is encoded 4 bytes per CIGAR operation. + 4. seq is nybble-encoded according to bam_nt16_table. */ typedef struct { bam1_core_t core; @@ -230,7 +248,7 @@ typedef struct __bam_iter_t *bam_iter_t; @param b pointer to an alignment @return pointer to quality string */ -#define bam1_qual(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + ((b)->core.l_qseq + 1)/2) +#define bam1_qual(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (((b)->core.l_qseq + 1)>>1)) /*! @function @abstract Get a base on read @@ -238,7 +256,10 @@ typedef struct __bam_iter_t *bam_iter_t; @param i The i-th position, 0-based @return 4-bit integer representing the base. */ -#define bam1_seqi(s, i) ((s)[(i)/2] >> 4*(1-(i)%2) & 0xf) +//#define bam1_seqi(s, i) ((s)[(i)/2] >> 4*(1-(i)%2) & 0xf) +#define bam1_seqi(s, i) ((s)[(i)>>1] >> ((~(i)&1)<<2) & 0xf) + +#define bam1_seq_seti(s, i, c) ( (s)[(i)>>1] = ((s)[(i)>>1] & 0xf<<(((i)&1)<<2)) | (c)<<((~(i)&1)<<2) ) /*! @function @abstract Get query sequence and quality @@ -262,6 +283,14 @@ typedef struct __bam_iter_t *bam_iter_t; */ extern int bam_is_be; +/*! + @abstract Verbose level between 0 and 3; 0 is supposed to disable all + debugging information, though this may not have been implemented. + */ +extern int bam_verbose; + +extern int bam_no_B; + /*! @abstract Table for converting a nucleotide character to the 4-bit encoding. */ extern unsigned char bam_nt16_table[256]; @@ -329,6 +358,7 @@ extern "C" { be destroyed in the first place. */ int sam_header_parse(bam_header_t *h); + int32_t bam_get_tid(const bam_header_t *header, const char *seq_name); /*! @abstract Parse @RG lines a update a header struct @@ -406,6 +436,8 @@ extern "C" { */ int bam_read1(bamFile fp, bam1_t *b); + int bam_remove_B(bam1_t *b); + /*! @abstract Write an alignment to BAM. @param fp BAM file handler @@ -454,6 +486,21 @@ extern "C" { char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of); + /*! + @abstract Check whether a BAM record is plausibly valid + @param header associated header structure, or NULL if unavailable + @param b alignment to validate + @return 0 if the alignment is invalid; non-zero otherwise + + @discussion Simple consistency check of some of the fields of the + alignment record. If the header is provided, several additional checks + are made. Not all fields are checked, so a non-zero result is not a + guarantee that the record is valid. However it is usually good enough + to detect when bam_seek() has been called with a virtual file offset + that is not the offset of an alignment record. + */ + int bam_validate1(const bam_header_t *header, const bam1_t *b); + const char *bam_get_library(bam_header_t *header, const bam1_t *b); @@ -479,7 +526,7 @@ extern "C" { bam1_t *b; int32_t qpos; int indel, level; - uint32_t is_del:1, is_head:1, is_tail:1; + uint32_t is_del:1, is_head:1, is_tail:1, is_refskip:1, aux:28; } bam_pileup1_t; typedef int (*bam_plp_auto_f)(void *data, bam1_t *b); @@ -492,6 +539,7 @@ extern "C" { const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp); const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp); void bam_plp_set_mask(bam_plp_t iter, int mask); + void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt); void bam_plp_reset(bam_plp_t iter); void bam_plp_destroy(bam_plp_t iter); @@ -500,6 +548,7 @@ extern "C" { bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data); void bam_mplp_destroy(bam_mplp_t iter); + void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt); int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp); /*! @typedef @@ -692,8 +741,8 @@ static inline bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc) { uint8_t *data = bdst->data; int m_data = bdst->m_data; // backup data and m_data - if (m_data < bsrc->m_data) { // double the capacity - m_data = bsrc->m_data; kroundup32(m_data); + if (m_data < bsrc->data_len) { // double the capacity + m_data = bsrc->data_len; kroundup32(m_data); data = (uint8_t*)realloc(data, m_data); } memcpy(data, bsrc->data, bsrc->data_len); // copy var-len data @@ -720,4 +769,25 @@ static inline bam1_t *bam_dup1(const bam1_t *src) return b; } +static inline int bam_aux_type2size(int x) +{ + if (x == 'C' || x == 'c' || x == 'A') return 1; + else if (x == 'S' || x == 's') return 2; + else if (x == 'I' || x == 'i' || x == 'f' || x == 'F') return 4; + else return 0; +} + +/********************************* + *** Compatibility with htslib *** + *********************************/ + +typedef bam_header_t bam_hdr_t; + +#define bam_get_qname(b) bam1_qname(b) +#define bam_get_cigar(b) bam1_cigar(b) + +#define bam_hdr_read(fp) bam_header_read(fp) +#define bam_hdr_write(fp, h) bam_header_write(fp, h) +#define bam_hdr_destroy(fp) bam_header_destroy(fp) + #endif