From df50415f5380510f5b44bee2361413fc39c8bbcf Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 23 Sep 2011 15:14:21 +0000 Subject: [PATCH] converted padded SAM to unpadded SAM --- Makefile | 2 +- bam.h | 7 ++++ bam_mate.c | 46 +++++++++++++++++--- bamtk.c | 3 ++ kstring.c | 4 +- kstring.h | 36 +++++++++++++++- padding.c | 120 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 209 insertions(+), 9 deletions(-) create mode 100644 padding.c diff --git a/Makefile b/Makefile index db18333..99f7eca 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \ AOBJS= bam_tview.o bam_plcmd.o sam_view.o \ bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \ bamtk.o kaln.o bam2bcf.o bam2bcf_indel.o errmod.o sample.o \ - cut_target.o phase.o bam2depth.o + cut_target.o phase.o bam2depth.o padding.o PROG= samtools INCLUDES= -I. SUBDIRS= . bcftools misc diff --git a/bam.h b/bam.h index 346c750..376e324 100644 --- a/bam.h +++ b/bam.h @@ -154,6 +154,13 @@ typedef struct { /*! @abstract CIGAR: X = mismatch */ #define BAM_CDIFF 8 +#define BAM_CIGAR_STR "MIDNSHP=X" + +#define bam_cigar_op(c) ((c)&BAM_CIGAR_MASK) +#define bam_cigar_oplen(c) ((c)>>BAM_CIGAR_SHIFT) +#define bam_cigar_opchr(c) (BAM_CIGAR_STR[bam_cigar_op(c)]) +#define bam_cigar_gen(o, l) ((o)< #include +#include "kstring.h" #include "bam.h" +void bam_template_cigar(bam1_t *b1, bam1_t *b2, kstring_t *str) +{ + bam1_t *swap; + int i, end; + uint32_t *cigar; + str->l = 0; + if (b1->core.tid != b2->core.tid || b1->core.tid < 0) return; // coordinateless or not on the same chr; skip + if (b1->core.pos > b2->core.pos) swap = b1, b1 = b2, b2 = swap; // make sure b1 has a smaller coordinate + kputc((b1->core.flag & BAM_FREAD1)? '1' : '2', str); // segment index + kputc((b1->core.flag & BAM_FREVERSE)? 'R' : 'F', str); // strand + for (i = 0, cigar = bam1_cigar(b1); i < b1->core.n_cigar; ++i) { + kputw(bam_cigar_oplen(cigar[i]), str); + kputc(bam_cigar_opchr(cigar[i]), str); + } + end = bam_calend(&b1->core, cigar); + kputw(b2->core.pos - end, str); + kputc('T', str); + kputc((b2->core.flag & BAM_FREAD1)? '1' : '2', str); // segment index + kputc((b2->core.flag & BAM_FREVERSE)? 'R' : 'F', str); // strand + for (i = 0, cigar = bam1_cigar(b2); i < b2->core.n_cigar; ++i) { + kputw(bam_cigar_oplen(cigar[i]), str); + kputc(bam_cigar_opchr(cigar[i]), str); + } + bam_aux_append(b1, "CT", 'Z', str->l+1, (uint8_t*)str->s); +} + // currently, this function ONLY works if each read has one hit void bam_mating_core(bamFile in, bamFile out) { bam_header_t *header; bam1_t *b[2]; - int curr, has_prev; + int curr, has_prev, pre_end, cur_end; + kstring_t str; + str.l = str.m = 0; str.s = 0; header = bam_header_read(in); bam_header_write(out, header); @@ -17,16 +46,20 @@ void bam_mating_core(bamFile in, bamFile out) curr = 0; has_prev = 0; while (bam_read1(in, b[curr]) >= 0) { bam1_t *cur = b[curr], *pre = b[1-curr]; + if (cur->core.tid < 0) continue; + cur_end = bam_calend(&cur->core, bam1_cigar(cur)); + if (cur_end > (int)header->target_len[cur->core.tid]) cur->core.flag |= BAM_FUNMAP; + if (cur->core.flag & BAM_FSECONDARY) continue; // skip secondary alignments if (has_prev) { if (strcmp(bam1_qname(cur), bam1_qname(pre)) == 0) { // identical pair name cur->core.mtid = pre->core.tid; cur->core.mpos = pre->core.pos; pre->core.mtid = cur->core.tid; pre->core.mpos = cur->core.pos; if (pre->core.tid == cur->core.tid && !(cur->core.flag&(BAM_FUNMAP|BAM_FMUNMAP)) - && !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))) + && !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))) // set TLEN/ISIZE { uint32_t cur5, pre5; - cur5 = (cur->core.flag&BAM_FREVERSE)? bam_calend(&cur->core, bam1_cigar(cur)) : cur->core.pos; - pre5 = (pre->core.flag&BAM_FREVERSE)? bam_calend(&pre->core, bam1_cigar(pre)) : pre->core.pos; + cur5 = (cur->core.flag&BAM_FREVERSE)? cur_end : cur->core.pos; + pre5 = (pre->core.flag&BAM_FREVERSE)? pre_end : pre->core.pos; cur->core.isize = pre5 - cur5; pre->core.isize = cur5 - pre5; } else cur->core.isize = pre->core.isize = 0; if (pre->core.flag&BAM_FREVERSE) cur->core.flag |= BAM_FMREVERSE; @@ -35,6 +68,7 @@ void bam_mating_core(bamFile in, bamFile out) else pre->core.flag &= ~BAM_FMREVERSE; if (cur->core.flag & BAM_FUNMAP) { pre->core.flag |= BAM_FMUNMAP; pre->core.flag &= ~BAM_FPROPER_PAIR; } if (pre->core.flag & BAM_FUNMAP) { cur->core.flag |= BAM_FMUNMAP; cur->core.flag &= ~BAM_FPROPER_PAIR; } + bam_template_cigar(pre, cur, &str); bam_write1(out, pre); bam_write1(out, cur); has_prev = 0; @@ -48,18 +82,20 @@ void bam_mating_core(bamFile in, bamFile out) } } else has_prev = 1; curr = 1 - curr; + pre_end = cur_end; } if (has_prev) bam_write1(out, b[1-curr]); bam_header_destroy(header); bam_destroy1(b[0]); bam_destroy1(b[1]); + free(str.s); } int bam_mating(int argc, char *argv[]) { bamFile in, out; if (argc < 3) { - fprintf(stderr, "samtools fixmate \n"); + fprintf(stderr, "Usage: samtools fixmate \n"); return 1; } in = (strcmp(argv[1], "-") == 0)? bam_dopen(fileno(stdin), "r") : bam_open(argv[1], "r"); diff --git a/bamtk.c b/bamtk.c index 8ba2581..6700b55 100644 --- a/bamtk.c +++ b/bamtk.c @@ -27,6 +27,7 @@ int main_phase(int argc, char *argv[]); int main_cat(int argc, char *argv[]); int main_depth(int argc, char *argv[]); int main_bam2fq(int argc, char *argv[]); +int main_pad2unpad(int argc, char *argv[]); int faidx_main(int argc, char *argv[]); @@ -55,6 +56,7 @@ static int usage() fprintf(stderr, " cat concatenate BAMs\n"); fprintf(stderr, " targetcut cut fosmid regions (for fosmid pool only)\n"); fprintf(stderr, " phase phase heterozygotes\n"); + fprintf(stderr, " pad2unpad convert padded BAM to unpadded BAM\n"); fprintf(stderr, "\n"); #ifdef _WIN32 fprintf(stderr, "\ @@ -94,6 +96,7 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "phase") == 0) return main_phase(argc-1, argv+1); else if (strcmp(argv[1], "depth") == 0) return main_depth(argc-1, argv+1); else if (strcmp(argv[1], "bam2fq") == 0) return main_bam2fq(argc-1, argv+1); + else if (strcmp(argv[1], "pad2unpad") == 0) return main_pad2unpad(argc-1, argv+1); else if (strcmp(argv[1], "pileup") == 0) { fprintf(stderr, "[main] The `pileup' command has been removed. Please use `mpileup' instead.\n"); return 1; diff --git a/kstring.c b/kstring.c index b2a0dab..b8ff45c 100644 --- a/kstring.c +++ b/kstring.c @@ -98,13 +98,13 @@ typedef unsigned char ubyte_t; static int *ksBM_prep(const ubyte_t *pat, int m) { int i, *suff, *prep, *bmGs, *bmBc; - prep = calloc(m + 256, sizeof(int)); + prep = (int*)calloc(m + 256, sizeof(int)); bmGs = prep; bmBc = prep + m; { // preBmBc() for (i = 0; i < 256; ++i) bmBc[i] = m; for (i = 0; i < m - 1; ++i) bmBc[pat[i]] = m - i - 1; } - suff = calloc(m, sizeof(int)); + suff = (int*)calloc(m, sizeof(int)); { // suffixes() int f = 0, g; suff[m - 1] = m; diff --git a/kstring.h b/kstring.h index ec5775b..89a7920 100644 --- a/kstring.h +++ b/kstring.h @@ -1,3 +1,28 @@ +/* The MIT License + + Copyright (c) by Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + #ifndef KSTRING_H #define KSTRING_H @@ -42,7 +67,16 @@ extern "C" { #ifdef __cplusplus } #endif - + +static inline void ks_resize(kstring_t *s, size_t size) +{ + if (s->m < size) { + s->m = size; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } +} + static inline int kputsn(const char *p, int l, kstring_t *s) { if (s->l + l + 1 >= s->m) { diff --git a/padding.c b/padding.c new file mode 100644 index 0000000..d76cb31 --- /dev/null +++ b/padding.c @@ -0,0 +1,120 @@ +#include +#include +#include "kstring.h" +#include "bam.h" + +static void replace_cigar(bam1_t *b, int n, uint32_t *cigar) +{ + if (n != b->core.n_cigar) { + int o = b->core.l_qname + b->core.n_cigar * 4; + if (b->data_len + (n - b->core.n_cigar) * 4 > b->m_data) { + b->m_data = b->data_len + (n - b->core.n_cigar) * 4; + kroundup32(b->m_data); + b->data = (uint8_t*)realloc(b->data, b->m_data); + } + memmove(b->data + b->core.l_qname + n * 4, b->data + o, b->data_len - o); + memcpy(b->data + b->core.l_qname, cigar, n * 4); + b->data_len += (n - b->core.n_cigar) * 4; + b->core.n_cigar = n; + } else memcpy(b->data + b->core.l_qname, cigar, n * 4); +} + +#define write_cigar(_c, _n, _m, _v) do { \ + if (_n == _m) { \ + _m = _m? _m<<1 : 4; \ + _c = (uint32_t*)realloc(_c, _m * 4); \ + } \ + _c[_n++] = (_v); \ + } while (0) + +static void unpad_seq(bam1_t *b, kstring_t *s) +{ + int k, j, i; + uint32_t *cigar = bam1_cigar(b); + uint8_t *seq = bam1_seq(b); + ks_resize(s, b->core.l_qseq); + for (k = 0, s->l = 0, j = 0; k < b->core.n_cigar; ++k) { + int op, ol; + op = bam_cigar_op(cigar[k]); + ol = bam_cigar_oplen(cigar[k]); + assert(op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CSOFT_CLIP); + if (op == BAM_CMATCH) { + for (i = 0; i < ol; ++i) s->s[s->l++] = bam1_seqi(seq, j); + ++j; + } else if (op == BAM_CSOFT_CLIP) { + j += ol; + } else { + for (i = 0; i < ol; ++i) s->s[s->l++] = 0; + } + } +} + +int bam_pad2unpad(bamFile in, bamFile out) +{ + bam_header_t *h; + bam1_t *b; + kstring_t r, q; + uint32_t *cigar2 = 0; + int n2 = 0, m2 = 0, *posmap = 0; + + h = bam_header_read(in); + bam_header_write(out, h); + b = bam_init1(); + r.l = r.m = q.l = q.m = 0; r.s = q.s = 0; + while (bam_read1(in, b) >= 0) { + uint32_t *cigar = bam1_cigar(b); + n2 = 0; + if (b->core.pos == 0 && b->core.tid >= 0 && strcmp(bam1_qname(b), h->target_name[b->core.tid]) == 0) { + int i, k; + unpad_seq(b, &r); + write_cigar(cigar2, n2, m2, bam_cigar_gen(b->core.l_qseq, BAM_CMATCH)); + replace_cigar(b, n2, cigar2); + posmap = realloc(posmap, r.m * sizeof(int)); + for (i = k = 0; i < r.l; ++i) { + posmap[i] = k; // note that a read should NOT start at a padding + if (r.s[i]) ++k; + } + } else { + int i, k, op; + unpad_seq(b, &q); + if (bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP) write_cigar(cigar2, n2, m2, cigar[0]); + for (i = 0, k = b->core.pos; i < q.l; ++i, ++k) + q.s[i] = q.s[i]? (r.s[k]? BAM_CMATCH : BAM_CINS) : (r.s[k]? BAM_CDEL : BAM_CPAD); + for (i = k = 1, op = q.s[0]; i < q.l; ++i) { + if (op != q.s[i]) { + write_cigar(cigar2, n2, m2, bam_cigar_gen(k, op)); + op = q.s[i]; k = 1; + } else ++k; + } + write_cigar(cigar2, n2, m2, bam_cigar_gen(k, op)); + if (bam_cigar_op(cigar[b->core.n_cigar-1]) == BAM_CSOFT_CLIP) write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-1]); + for (i = 2; i < n2; ++i) + if (bam_cigar_op(cigar2[i]) == BAM_CMATCH && bam_cigar_op(cigar2[i-1]) == BAM_CPAD && bam_cigar_op(cigar2[i-2]) == BAM_CMATCH) + cigar2[i] += cigar2[i-2], cigar2[i-2] = cigar2[i-1] = 0; + for (i = k = 0; i < n2; ++i) + if (cigar2[i]) cigar2[k++] = cigar2[i]; + n2 = k; + replace_cigar(b, n2, cigar2); + b->core.pos = posmap[b->core.pos]; + } + bam_write1(out, b); + } + free(r.s); free(q.s); free(posmap); + bam_destroy1(b); + bam_header_destroy(h); + return 0; +} + +int main_pad2unpad(int argc, char *argv[]) +{ + bamFile in, out; + if (argc == 1) { + fprintf(stderr, "Usage: samtools pad2unpad \n"); + return 1; + } + in = strcmp(argv[1], "-")? bam_open(argv[1], "r") : bam_dopen(fileno(stdin), "r"); + out = bam_dopen(fileno(stdout), "w"); + bam_pad2unpad(in, out); + bam_close(in); bam_close(out); + return 0; +} -- 2.39.2