X-Git-Url: https://git.donarmstrong.com/?p=samtools.git;a=blobdiff_plain;f=padding.c;h=d5fd15fb6a7cf49e782d506ebe74201748f9467e;hp=af3df94b57d6c0b33582559f9539532b3c435f1a;hb=8919a745585a68d671a1301b444e6edf7f4fdd4b;hpb=ededaf72960a46d036269b1aa7a5579df4798fd3 diff --git a/padding.c b/padding.c index af3df94..d5fd15f 100644 --- a/padding.c +++ b/padding.c @@ -5,6 +5,7 @@ #include "sam_header.h" #include "sam.h" #include "bam.h" +#include "faidx.h" static void replace_cigar(bam1_t *b, int n, uint32_t *cigar) { @@ -68,7 +69,52 @@ static void unpad_seq(bam1_t *b, kstring_t *s) assert(length == s->l); } -int bam_pad2unpad(samfile_t *in, samfile_t *out) +int load_unpadded_ref(faidx_t *fai, char *ref_name, int ref_len, kstring_t *seq) +{ + char base; + char *fai_ref = 0; + int fai_ref_len = 0, k; + + fai_ref = fai_fetch(fai, ref_name, &fai_ref_len); + if (fai_ref_len != ref_len) { + fprintf(stderr, "[depad] ERROR: FASTA sequence %s length %i, expected %i\n", ref_name, fai_ref_len, ref_len); + free(fai_ref); + return -1; + } + ks_resize(seq, ref_len); + seq->l = 0; + for (k = 0; k < ref_len; ++k) { + base = fai_ref[k]; + if (base == '-' || base == '*') { + // Map gaps to null to match unpad_seq function + seq->s[seq->l++] = 0; + } else { + int i = bam_nt16_table[(int)base]; + if (i == 0 || i==16) { // Equals maps to 0, anything unexpected to 16 + fprintf(stderr, "[depad] ERROR: Invalid character %c (ASCII %i) in FASTA sequence %s\n", base, (int)base, ref_name); + free(fai_ref); + return -1; + } + seq->s[seq->l++] = i; + } + } + assert(ref_len == seq->l); + free(fai_ref); + return 0; +} + +inline int * update_posmap(int *posmap, kstring_t ref) +{ + int i, k; + posmap = realloc(posmap, ref.m * sizeof(int)); + for (i = k = 0; i < ref.l; ++i) { + posmap[i] = k; + if (ref.s[i]) ++k; + } + return posmap; +} + +int bam_pad2unpad(samfile_t *in, samfile_t *out, faidx_t *fai) { bam_header_t *h; bam1_t *b; @@ -85,30 +131,46 @@ int bam_pad2unpad(samfile_t *in, samfile_t *out) uint32_t *cigar = bam1_cigar(b); n2 = 0; if (b->core.pos == 0 && b->core.tid >= 0 && strcmp(bam1_qname(b), h->target_name[b->core.tid]) == 0) { - int i, k; - /* - fprintf(stderr, "[depad] Found embedded reference %s\n", bam1_qname(b)); - */ + // fprintf(stderr, "[depad] Found embedded reference %s\n", bam1_qname(b)); r_tid = b->core.tid; unpad_seq(b, &r); if (h->target_len[r_tid] != r.l) { fprintf(stderr, "[depad] ERROR: (Padded) length of %s is %i in BAM header, but %ld in embedded reference\n", bam1_qname(b), h->target_len[r_tid], r.l); return -1; } + if (fai) { + // Check the embedded reference matches the FASTA file + if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &q)) return -1; + assert(r.l == q.l); + int i; + for (i = 0; i < r.l; ++i) { + if (r.s[i] != q.s[i]) { + // Show gaps as ASCII 45 + fprintf(stderr, "[depad] ERROR: Embedded sequence and reference FASTA don't match for %s base %i, '%c' vs '%c'\n", + h->target_name[b->core.tid], i+1, + r.s[i] ? bam_nt16_rev_table[r.s[i]] : 45, + q.s[i] ? bam_nt16_rev_table[q.s[i]] : 45); + return -1; + } + } + } write_cigar(cigar2, n2, m2, bam_cigar_gen(b->core.l_qseq, BAM_CMATCH)); replace_cigar(b, n2, cigar2); - posmap = realloc(posmap, r.m * sizeof(int)); - for (i = k = 0; i < r.l; ++i) { - posmap[i] = k; - if (r.s[i]) ++k; - } + posmap = update_posmap(posmap, r); } else if (b->core.n_cigar > 0) { int i, k, op; if (b->core.tid < 0) { fprintf(stderr, "[depad] ERROR: Read %s has CIGAR but no RNAME\n", bam1_qname(b)); return -1; - } else if (b->core.tid != r_tid) { - fprintf(stderr, "[depad] ERROR: Missing %s embedded reference sequence\n", h->target_name[b->core.tid]); + } else if (b->core.tid == r_tid) { + ; // good case, reference available + } else if (fai) { + if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &r)) return -1; + posmap = update_posmap(posmap, r); + r_tid = b->core.tid; + // fprintf(stderr, "[depad] Loaded %s from FASTA file\n", h->target_name[b->core.tid]); + } else { + fprintf(stderr, "[depad] ERROR: Missing %s embedded reference sequence (and no FASTA file)\n", h->target_name[b->core.tid]); return -1; } unpad_seq(b, &q); @@ -185,6 +247,7 @@ static int usage(int is_long_help); int main_pad2unpad(int argc, char *argv[]) { samfile_t *in = 0, *out = 0; + faidx_t *fai = 0; int c, is_bamin = 1, compress_level = -1, is_bamout = 1, is_long_help = 0; char in_mode[5], out_mode[5], *fn_out = 0, *fn_list = 0, *fn_ref = 0; int ret=0; @@ -215,34 +278,37 @@ int main_pad2unpad(int argc, char *argv[]) } // Load FASTA reference (also needed for SAM -> BAM if missing header) - if (fn_ref) fn_list = samfaipath(fn_ref); - + if (fn_ref) { + fn_list = samfaipath(fn_ref); + fai = fai_load(fn_ref); + } // open file handlers if ((in = samopen(argv[optind], in_mode, fn_list)) == 0) { - fprintf(stderr, "[depad] fail to open \"%s\" for reading.\n", argv[optind]); + fprintf(stderr, "[depad] failed to open \"%s\" for reading.\n", argv[optind]); ret = 1; goto depad_end; } if (in->header == 0) { - fprintf(stderr, "[depad] fail to read the header from \"%s\".\n", argv[optind]); + fprintf(stderr, "[depad] failed to read the header from \"%s\".\n", argv[optind]); ret = 1; goto depad_end; } /* TODO - The reference sequence lengths in the BAM + SAM headers should be updated */ if ((out = samopen(fn_out? fn_out : "-", out_mode, in->header)) == 0) { - fprintf(stderr, "[depad] fail to open \"%s\" for writing.\n", fn_out? fn_out : "standard output"); + fprintf(stderr, "[depad] failed to open \"%s\" for writing.\n", fn_out? fn_out : "standard output"); ret = 1; goto depad_end; } // Do the depad - ret = bam_pad2unpad(in, out); + ret = bam_pad2unpad(in, out, fai); depad_end: // close files, free and return free(fn_list); free(fn_out); samclose(in); samclose(out); + fai_destroy(fai); return ret; }