]> git.donarmstrong.com Git - samtools.git/commitdiff
* samtools-0.1.3-14 (r262)
authorHeng Li <lh3@live.co.uk>
Tue, 5 May 2009 21:09:00 +0000 (21:09 +0000)
committerHeng Li <lh3@live.co.uk>
Tue, 5 May 2009 21:09:00 +0000 (21:09 +0000)
 * make samopen() recognize @SQ header lines

bam.h
bam_import.c
bamtk.c
sam.c
sam_view.c

diff --git a/bam.h b/bam.h
index 7167dc155588642ea79ce30b282b0e4dd84c0cef..163dc893e192d6eaa9ebfcc3b3b8614d142ca211 100644 (file)
--- a/bam.h
+++ b/bam.h
@@ -315,6 +315,9 @@ extern "C" {
         */
        bam_header_t *sam_header_read2(const char *fn_list);
 
+       bam_header_t *sam_header_read(tamFile fp);
+       int sam_header_parse(bam_header_t *h);
+
 #define sam_write1(header, b) bam_view1(header, b)
 
        /*!
index 9daa29f42a1676077b29f3f63ab7a7027896dbad..fe9af911f890bdb6a70f56e6206ea6f8923f718b 100644 (file)
@@ -42,6 +42,7 @@ struct __tamFile_t {
        kstream_t *ks;
        kstring_t *str;
        uint64_t n_lines;
+       int is_first;
 };
 
 char **__bam_get_lines(const char *fn, int *_n) // for bam_plcmd.c only
@@ -144,28 +145,89 @@ static inline void append_text(bam_header_t *header, kstring_t *str)
        header->l_text += str->l + 1;
        header->text[header->l_text] = 0;
 }
-int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b)
+
+int sam_header_parse(bam_header_t *h)
 {
-       int ret, doff, doff0, dret, z = 0;
-       bam1_core_t *c = &b->core;
-       kstring_t *str = fp->str;
-       kstream_t *ks = fp->ks;
+       int i;
+       char *s, *p, *q, *r;
 
+       // free
+       free(h->target_len); free(h->target_name);
+       h->n_targets = 0; h->target_len = 0; h->target_name = 0;
+       if (h->l_text < 3) return 0;
+       // count number of @SQ
+       s = h->text;
+       while ((s = strstr(s, "@SQ")) != 0) {
+               ++h->n_targets;
+               s += 3;
+       }
+       if (h->n_targets == 0) return 0;
+       h->target_len = (uint32_t*)calloc(h->n_targets, 4);
+       h->target_name = (char**)calloc(h->n_targets, sizeof(void*));
+       // parse @SQ lines
+       i = 0;
+       s = h->text;
+       while ((s = strstr(s, "@SQ")) != 0) {
+               s += 3;
+               r = s;
+               if ((p = strstr(s, "SN:")) != 0) {
+                       q = p + 3;
+                       for (p = q; *p && *p != '\t' && *p != '\r' && *p != '\n'; ++p);
+                       h->target_name[i] = (char*)calloc(p - q + 1, 1);
+                       strncpy(h->target_name[i], q, p - q);
+               } else goto header_err_ret;
+               if (r < p) r = p;
+               if ((p = strstr(s, "LN:")) != 0) h->target_len[i] = strtol(p + 3, 0, 10);
+               else goto header_err_ret;
+               if (r < p) r = p;
+               s = r + 3;
+               ++i;
+       }
+       return h->n_targets;
+
+header_err_ret:
+       fprintf(stderr, "[bam_header_parse] missing SN or LN tag in @SQ lines.\n");
+       free(h->target_len); free(h->target_name);
+       h->n_targets = 0; h->target_len = 0; h->target_name = 0;
+       return 0;
+}
+
+bam_header_t *sam_header_read(tamFile fp)
+{
+       int ret, dret;
+       bam_header_t *header = bam_header_init();
+       kstring_t *str = fp->str;
        while ((ret = ks_getuntil(fp->ks, KS_SEP_TAB, str, &dret)) >= 0 && str->s[0] == '@') { // skip header
-               z += str->l + 1;
                str->s[str->l] = dret; // note that str->s is NOT null terminated!!
                append_text(header, str);
                if (dret != '\n') {
                        ret = ks_getuntil(fp->ks, '\n', str, &dret);
-                       z += str->l + 1;
                        str->s[str->l] = '\n'; // NOT null terminated!!
                        append_text(header, str);
                }
                ++fp->n_lines;
        }
-       while (ret == 0) { // special consideration for "\r\n" and empty lines
-               ret = ks_getuntil(fp->ks, KS_SEP_TAB, str, &dret);
-               if (ret >= 0) z += str->l + 1;
+       sam_header_parse(header);
+       bam_init_header_hash(header);
+       fp->is_first = 1;
+       return header;
+}
+
+int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b)
+{
+       int ret, doff, doff0, dret, z = 0;
+       bam1_core_t *c = &b->core;
+       kstring_t *str = fp->str;
+       kstream_t *ks = fp->ks;
+
+       if (fp->is_first) {
+               fp->is_first = 0;
+               ret = str->l;
+       } else {
+               do { // special consideration for empty lines
+                       ret = ks_getuntil(fp->ks, KS_SEP_TAB, str, &dret);
+                       if (ret >= 0) z += str->l + 1;
+               } while (ret == 0);
        }
        if (ret < 0) return -1;
        ++fp->n_lines;
@@ -179,6 +241,8 @@ int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b)
        { // flag, tid, pos, qual
                ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->flag = atoi(str->s);
                ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->tid = bam_get_tid(header, str->s);
+               if (c->tid < 0 && strcmp(str->s, "*"))
+                       fprintf(stderr, "[sam_read1] reference '%s' is recognized as '*'.\n", str->s);
                ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->pos = isdigit(str->s[0])? atoi(str->s) - 1 : -1;
                ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->qual = isdigit(str->s[0])? atoi(str->s) : 0;
                if (ret < 0) return -2;
@@ -335,7 +399,6 @@ tamFile sam_open(const char *fn)
        fp->str = (kstring_t*)calloc(1, sizeof(kstring_t));
        fp->fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r");
        fp->ks = ks_init(fp->fp);
-       fp->n_lines = 0;
        return fp;
 }
 
diff --git a/bamtk.c b/bamtk.c
index a8972f91f2fb61d5895e935cb952f4fc1dc59329..316b1285c0fb367a35f770cd39644ecfe8662a36 100644 (file)
--- a/bamtk.c
+++ b/bamtk.c
@@ -3,7 +3,7 @@
 #include "bam.h"
 
 #ifndef PACKAGE_VERSION
-#define PACKAGE_VERSION "0.1.3-13 (r260)"
+#define PACKAGE_VERSION "0.1.3-14 (r262)"
 #endif
 
 int bam_taf2baf(int argc, char *argv[]);
diff --git a/sam.c b/sam.c
index dd6cd3a8d8fa007bda51b897a59dd2694d25d3d7..671b5042bc17ec1948c83fe54093c03ff7065120 100644 (file)
--- a/sam.c
+++ b/sam.c
@@ -22,92 +22,61 @@ bam_header_t *bam_header_dup(const bam_header_t *h0)
        return h;
 }
 
-bam_header_t *bam_header_parse(const char *text)
-{
-       bam_header_t *h;
-       int i;
-       char *s, *p, *q, *r;
-
-       i = strlen(text);
-       if (i < 3) return 0; // headerless
-       h = bam_header_init();
-       h->l_text = i;
-       h->text = strdup(text);
-       s = h->text;
-       while ((s = strstr(s, "@SQ")) != 0) {
-               ++h->n_targets;
-               s += 3;
-       }
-       h->target_len = (uint32_t*)calloc(h->n_targets, 4);
-       h->target_name = (char**)calloc(h->n_targets, sizeof(void*));
-       i = 0;
-       s = h->text;
-       while ((s = strstr(s, "@SQ")) != 0) {
-               s += 3;
-               r = s;
-               if ((p = strstr(s, "SN:")) != 0) {
-                       q = p + 3;
-                       for (p = q; *p && *p != '\t'; ++p);
-                       h->target_name[i] = (char*)calloc(p - q + 1, 1);
-                       strncpy(h->target_name[i], q, p - q);
-               } else goto header_err_ret;
-               if (r < p) r = p;
-               if ((p = strstr(s, "LN:")) != 0) h->target_len[i] = strtol(p + 3, 0, 10);
-               else goto header_err_ret;
-               if (r < p) r = p;
-               s = r + 3;
-               ++i;
-       }
-       if (h->n_targets == 0) {
-               bam_header_destroy(h);
-               return 0;
-       } else return h;
-
-header_err_ret:
-       fprintf(stderr, "[bam_header_parse] missing SN tag in a @SQ line.\n");
-       bam_header_destroy(h);
-       return 0;
-}
-
 samfile_t *samopen(const char *fn, const char *mode, const void *aux)
 {
        samfile_t *fp;
        fp = (samfile_t*)calloc(1, sizeof(samfile_t));
-       if (mode[0] == 'r') {
-               const char *fn_list = (const char*)aux;
+       if (mode[0] == 'r') { // read
                fp->type |= TYPE_READ;
                if (mode[1] == 'b') { // binary
                        fp->type |= TYPE_BAM;
                        fp->x.bam = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r");
                        if (fp->x.bam == 0) goto open_err_ret;
                        fp->header = bam_header_read(fp->x.bam);
-               } else {
+               } else { // text
                        fp->x.tamr = sam_open(fn);
                        if (fp->x.tamr == 0) goto open_err_ret;
-                       fp->header = sam_header_read2(fn_list);
+                       fp->header = sam_header_read(fp->x.tamr);
+                       if (fp->header->n_targets == 0) { // no @SQ fields
+                               if (aux) { // check if aux is present
+                                       bam_header_destroy(fp->header);
+                                       fp->header = sam_header_read2((const char*)aux);
+                               }
+                               if (fp->header->n_targets == 0)
+                                       fprintf(stderr, "[samopen] empty header.\n");
+                       } else fprintf(stderr, "[samopen] SAM header is present: %d sequences.\n", fp->header->n_targets);
                }
-       } else if (mode[0] == 'w') {
+       } else if (mode[0] == 'w') { // write
                fp->header = bam_header_dup((const bam_header_t*)aux);
                if (mode[1] == 'b') { // binary
                        fp->type |= TYPE_BAM;
                        fp->x.bam = strcmp(fn, "-")? bam_open(fn, "w") : bam_dopen(fileno(stdout), "w");
                        if (fp->x.bam == 0) goto open_err_ret;
                        bam_header_write(fp->x.bam, fp->header);
-               } else {
-                       int i;
-                       bam_header_t *alt = 0;
-                       alt = bam_header_parse(fp->header->text);
+               } else { // text
+                       // open file
                        fp->x.tamw = strcmp(fn, "-")? fopen(fn, "w") : stdout;
                        if (fp->x.tamr == 0) goto open_err_ret;
-                       if (alt) {
-                               if (alt->n_targets != fp->header->n_targets)
-                                       fprintf(stderr, "[samopen] inconsistent number of target sequences.\n");
+                       // write header
+                       if (strstr(mode, "h")) {
+                               int i;
+                               bam_header_t *alt;
+                               // parse the header text 
+                               alt = bam_header_init();
+                               alt->l_text = fp->header->l_text; alt->text = fp->header->text;
+                               sam_header_parse(alt);
+                               alt->l_text = 0; alt->text = 0;
+                               // check if there are @SQ lines in the header
+                               if (alt->n_targets) { // then write the header text without dumping ->target_{name,len}
+                                       if (alt->n_targets != fp->header->n_targets)
+                                               fprintf(stderr, "[samopen] inconsistent number of target sequences.\n");
+                                       fwrite(fp->header->text, 1, fp->header->l_text, fp->x.tamw);
+                               } else { // then dump ->target_{name,len}
+                                       for (i = 0; i < fp->header->n_targets; ++i)
+                                               fprintf(fp->x.tamw, "@SQ\tSN:%s\tLN:%d\n", fp->header->target_name[i], fp->header->target_len[i]);
+                               }
                                bam_header_destroy(alt);
-                               fwrite(fp->header->text, 1, fp->header->l_text, fp->x.tamw);
                        }
-                       if (strstr(mode, "h")) // print header
-                               for (i = 0; i < fp->header->n_targets; ++i)
-                                       fprintf(fp->x.tamw, "@SQ\tSN:%s\tLN:%d\n", fp->header->target_name[i], fp->header->target_len[i]);
                }
        }
        return fp;
index cefac2f4835e84c4d6846862b85dc59b9032badf..f070b32a6f671c995be3733e6c9d0f1fbe166473 100644 (file)
@@ -26,15 +26,16 @@ int main_samview(int argc, char *argv[])
 
        /* parse command-line options */
        strcpy(in_mode, "r"); strcpy(out_mode, "w");
-       while ((c = getopt(argc, argv, "bt:hHo:q:f:F:")) >= 0) {
+       while ((c = getopt(argc, argv, "Sbt:hHo:q:f:F:")) >= 0) {
                switch (c) {
+               case 'S': is_bamin = 0; break;
                case 'b': strcat(out_mode, "b"); break;
                case 't': fn_list = strdup(optarg); is_bamin = 0; break;
                case 'h': is_header = 1; break;
                case 'H': is_header_only = 1; break;
                case 'o': fn_out = strdup(optarg); break;
                case 'f': g_flag_on = strtol(optarg, 0, 0); break;
-               case 'F': g_flag_on = strtol(optarg, 0, 0); break;
+               case 'F': g_flag_off = strtol(optarg, 0, 0); break;
                case 'q': g_min_mapQ = atoi(optarg); break;
                default: return usage();
                }
@@ -98,7 +99,8 @@ static int usage()
        fprintf(stderr, "Options: -b       output BAM\n");
        fprintf(stderr, "         -h       print header for the SAM output\n");
        fprintf(stderr, "         -H       print header only (no alignments)\n");
-       fprintf(stderr, "         -t FILE  list of reference names and lengths, assuming SAM input [null]\n");
+       fprintf(stderr, "         -S       input is SAM\n");
+       fprintf(stderr, "         -t FILE  list of reference names and lengths (force -S) [null]\n");
        fprintf(stderr, "         -o FILE  output file name [stdout]\n");
        fprintf(stderr, "         -f INT   required flag, 0 for unset [0]\n");
        fprintf(stderr, "         -F INT   filtering flag, 0 for unset [0]\n");