From: Heng Li Date: Tue, 5 May 2009 21:09:00 +0000 (+0000) Subject: * samtools-0.1.3-14 (r262) X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=39cfa14f71a83dd06f77e4035852d975360d7a44;p=samtools.git * samtools-0.1.3-14 (r262) * make samopen() recognize @SQ header lines --- diff --git a/bam.h b/bam.h index 7167dc1..163dc89 100644 --- a/bam.h +++ b/bam.h @@ -315,6 +315,9 @@ extern "C" { */ bam_header_t *sam_header_read2(const char *fn_list); + bam_header_t *sam_header_read(tamFile fp); + int sam_header_parse(bam_header_t *h); + #define sam_write1(header, b) bam_view1(header, b) /*! diff --git a/bam_import.c b/bam_import.c index 9daa29f..fe9af91 100644 --- a/bam_import.c +++ b/bam_import.c @@ -42,6 +42,7 @@ struct __tamFile_t { kstream_t *ks; kstring_t *str; uint64_t n_lines; + int is_first; }; char **__bam_get_lines(const char *fn, int *_n) // for bam_plcmd.c only @@ -144,28 +145,89 @@ static inline void append_text(bam_header_t *header, kstring_t *str) header->l_text += str->l + 1; header->text[header->l_text] = 0; } -int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b) + +int sam_header_parse(bam_header_t *h) { - int ret, doff, doff0, dret, z = 0; - bam1_core_t *c = &b->core; - kstring_t *str = fp->str; - kstream_t *ks = fp->ks; + int i; + char *s, *p, *q, *r; + // free + free(h->target_len); free(h->target_name); + h->n_targets = 0; h->target_len = 0; h->target_name = 0; + if (h->l_text < 3) return 0; + // count number of @SQ + s = h->text; + while ((s = strstr(s, "@SQ")) != 0) { + ++h->n_targets; + s += 3; + } + if (h->n_targets == 0) return 0; + h->target_len = (uint32_t*)calloc(h->n_targets, 4); + h->target_name = (char**)calloc(h->n_targets, sizeof(void*)); + // parse @SQ lines + i = 0; + s = h->text; + while ((s = strstr(s, "@SQ")) != 0) { + s += 3; + r = s; + if ((p = strstr(s, "SN:")) != 0) { + q = p + 3; + for (p = q; *p && *p != '\t' && *p != '\r' && *p != '\n'; ++p); + h->target_name[i] = (char*)calloc(p - q + 1, 1); + strncpy(h->target_name[i], q, p - q); + } else goto header_err_ret; + if (r < p) r = p; + if ((p = strstr(s, "LN:")) != 0) h->target_len[i] = strtol(p + 3, 0, 10); + else goto header_err_ret; + if (r < p) r = p; + s = r + 3; + ++i; + } + return h->n_targets; + +header_err_ret: + fprintf(stderr, "[bam_header_parse] missing SN or LN tag in @SQ lines.\n"); + free(h->target_len); free(h->target_name); + h->n_targets = 0; h->target_len = 0; h->target_name = 0; + return 0; +} + +bam_header_t *sam_header_read(tamFile fp) +{ + int ret, dret; + bam_header_t *header = bam_header_init(); + kstring_t *str = fp->str; while ((ret = ks_getuntil(fp->ks, KS_SEP_TAB, str, &dret)) >= 0 && str->s[0] == '@') { // skip header - z += str->l + 1; str->s[str->l] = dret; // note that str->s is NOT null terminated!! append_text(header, str); if (dret != '\n') { ret = ks_getuntil(fp->ks, '\n', str, &dret); - z += str->l + 1; str->s[str->l] = '\n'; // NOT null terminated!! append_text(header, str); } ++fp->n_lines; } - while (ret == 0) { // special consideration for "\r\n" and empty lines - ret = ks_getuntil(fp->ks, KS_SEP_TAB, str, &dret); - if (ret >= 0) z += str->l + 1; + sam_header_parse(header); + bam_init_header_hash(header); + fp->is_first = 1; + return header; +} + +int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b) +{ + int ret, doff, doff0, dret, z = 0; + bam1_core_t *c = &b->core; + kstring_t *str = fp->str; + kstream_t *ks = fp->ks; + + if (fp->is_first) { + fp->is_first = 0; + ret = str->l; + } else { + do { // special consideration for empty lines + ret = ks_getuntil(fp->ks, KS_SEP_TAB, str, &dret); + if (ret >= 0) z += str->l + 1; + } while (ret == 0); } if (ret < 0) return -1; ++fp->n_lines; @@ -179,6 +241,8 @@ int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b) { // flag, tid, pos, qual ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->flag = atoi(str->s); ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->tid = bam_get_tid(header, str->s); + if (c->tid < 0 && strcmp(str->s, "*")) + fprintf(stderr, "[sam_read1] reference '%s' is recognized as '*'.\n", str->s); ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->pos = isdigit(str->s[0])? atoi(str->s) - 1 : -1; ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->qual = isdigit(str->s[0])? atoi(str->s) : 0; if (ret < 0) return -2; @@ -335,7 +399,6 @@ tamFile sam_open(const char *fn) fp->str = (kstring_t*)calloc(1, sizeof(kstring_t)); fp->fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r"); fp->ks = ks_init(fp->fp); - fp->n_lines = 0; return fp; } diff --git a/bamtk.c b/bamtk.c index a8972f9..316b128 100644 --- a/bamtk.c +++ b/bamtk.c @@ -3,7 +3,7 @@ #include "bam.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.1.3-13 (r260)" +#define PACKAGE_VERSION "0.1.3-14 (r262)" #endif int bam_taf2baf(int argc, char *argv[]); diff --git a/sam.c b/sam.c index dd6cd3a..671b504 100644 --- a/sam.c +++ b/sam.c @@ -22,92 +22,61 @@ bam_header_t *bam_header_dup(const bam_header_t *h0) return h; } -bam_header_t *bam_header_parse(const char *text) -{ - bam_header_t *h; - int i; - char *s, *p, *q, *r; - - i = strlen(text); - if (i < 3) return 0; // headerless - h = bam_header_init(); - h->l_text = i; - h->text = strdup(text); - s = h->text; - while ((s = strstr(s, "@SQ")) != 0) { - ++h->n_targets; - s += 3; - } - h->target_len = (uint32_t*)calloc(h->n_targets, 4); - h->target_name = (char**)calloc(h->n_targets, sizeof(void*)); - i = 0; - s = h->text; - while ((s = strstr(s, "@SQ")) != 0) { - s += 3; - r = s; - if ((p = strstr(s, "SN:")) != 0) { - q = p + 3; - for (p = q; *p && *p != '\t'; ++p); - h->target_name[i] = (char*)calloc(p - q + 1, 1); - strncpy(h->target_name[i], q, p - q); - } else goto header_err_ret; - if (r < p) r = p; - if ((p = strstr(s, "LN:")) != 0) h->target_len[i] = strtol(p + 3, 0, 10); - else goto header_err_ret; - if (r < p) r = p; - s = r + 3; - ++i; - } - if (h->n_targets == 0) { - bam_header_destroy(h); - return 0; - } else return h; - -header_err_ret: - fprintf(stderr, "[bam_header_parse] missing SN tag in a @SQ line.\n"); - bam_header_destroy(h); - return 0; -} - samfile_t *samopen(const char *fn, const char *mode, const void *aux) { samfile_t *fp; fp = (samfile_t*)calloc(1, sizeof(samfile_t)); - if (mode[0] == 'r') { - const char *fn_list = (const char*)aux; + if (mode[0] == 'r') { // read fp->type |= TYPE_READ; if (mode[1] == 'b') { // binary fp->type |= TYPE_BAM; fp->x.bam = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r"); if (fp->x.bam == 0) goto open_err_ret; fp->header = bam_header_read(fp->x.bam); - } else { + } else { // text fp->x.tamr = sam_open(fn); if (fp->x.tamr == 0) goto open_err_ret; - fp->header = sam_header_read2(fn_list); + fp->header = sam_header_read(fp->x.tamr); + if (fp->header->n_targets == 0) { // no @SQ fields + if (aux) { // check if aux is present + bam_header_destroy(fp->header); + fp->header = sam_header_read2((const char*)aux); + } + if (fp->header->n_targets == 0) + fprintf(stderr, "[samopen] empty header.\n"); + } else fprintf(stderr, "[samopen] SAM header is present: %d sequences.\n", fp->header->n_targets); } - } else if (mode[0] == 'w') { + } else if (mode[0] == 'w') { // write fp->header = bam_header_dup((const bam_header_t*)aux); if (mode[1] == 'b') { // binary fp->type |= TYPE_BAM; fp->x.bam = strcmp(fn, "-")? bam_open(fn, "w") : bam_dopen(fileno(stdout), "w"); if (fp->x.bam == 0) goto open_err_ret; bam_header_write(fp->x.bam, fp->header); - } else { - int i; - bam_header_t *alt = 0; - alt = bam_header_parse(fp->header->text); + } else { // text + // open file fp->x.tamw = strcmp(fn, "-")? fopen(fn, "w") : stdout; if (fp->x.tamr == 0) goto open_err_ret; - if (alt) { - if (alt->n_targets != fp->header->n_targets) - fprintf(stderr, "[samopen] inconsistent number of target sequences.\n"); + // write header + if (strstr(mode, "h")) { + int i; + bam_header_t *alt; + // parse the header text + alt = bam_header_init(); + alt->l_text = fp->header->l_text; alt->text = fp->header->text; + sam_header_parse(alt); + alt->l_text = 0; alt->text = 0; + // check if there are @SQ lines in the header + if (alt->n_targets) { // then write the header text without dumping ->target_{name,len} + if (alt->n_targets != fp->header->n_targets) + fprintf(stderr, "[samopen] inconsistent number of target sequences.\n"); + fwrite(fp->header->text, 1, fp->header->l_text, fp->x.tamw); + } else { // then dump ->target_{name,len} + for (i = 0; i < fp->header->n_targets; ++i) + fprintf(fp->x.tamw, "@SQ\tSN:%s\tLN:%d\n", fp->header->target_name[i], fp->header->target_len[i]); + } bam_header_destroy(alt); - fwrite(fp->header->text, 1, fp->header->l_text, fp->x.tamw); } - if (strstr(mode, "h")) // print header - for (i = 0; i < fp->header->n_targets; ++i) - fprintf(fp->x.tamw, "@SQ\tSN:%s\tLN:%d\n", fp->header->target_name[i], fp->header->target_len[i]); } } return fp; diff --git a/sam_view.c b/sam_view.c index cefac2f..f070b32 100644 --- a/sam_view.c +++ b/sam_view.c @@ -26,15 +26,16 @@ int main_samview(int argc, char *argv[]) /* parse command-line options */ strcpy(in_mode, "r"); strcpy(out_mode, "w"); - while ((c = getopt(argc, argv, "bt:hHo:q:f:F:")) >= 0) { + while ((c = getopt(argc, argv, "Sbt:hHo:q:f:F:")) >= 0) { switch (c) { + case 'S': is_bamin = 0; break; case 'b': strcat(out_mode, "b"); break; case 't': fn_list = strdup(optarg); is_bamin = 0; break; case 'h': is_header = 1; break; case 'H': is_header_only = 1; break; case 'o': fn_out = strdup(optarg); break; case 'f': g_flag_on = strtol(optarg, 0, 0); break; - case 'F': g_flag_on = strtol(optarg, 0, 0); break; + case 'F': g_flag_off = strtol(optarg, 0, 0); break; case 'q': g_min_mapQ = atoi(optarg); break; default: return usage(); } @@ -98,7 +99,8 @@ static int usage() fprintf(stderr, "Options: -b output BAM\n"); fprintf(stderr, " -h print header for the SAM output\n"); fprintf(stderr, " -H print header only (no alignments)\n"); - fprintf(stderr, " -t FILE list of reference names and lengths, assuming SAM input [null]\n"); + fprintf(stderr, " -S input is SAM\n"); + fprintf(stderr, " -t FILE list of reference names and lengths (force -S) [null]\n"); fprintf(stderr, " -o FILE output file name [stdout]\n"); fprintf(stderr, " -f INT required flag, 0 for unset [0]\n"); fprintf(stderr, " -F INT filtering flag, 0 for unset [0]\n");