#include <stdlib.h>
#include <unistd.h>
#include <assert.h>
+#include "kstring.h"
#include "bam.h"
#include "kseq.h"
#include "khash.h"
kstream_t *ks;
kstring_t *str;
uint64_t n_lines;
+ int is_first;
};
char **__bam_get_lines(const char *fn, int *_n) // for bam_plcmd.c only
header->l_text += str->l + 1;
header->text[header->l_text] = 0;
}
-int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b)
+
+int sam_header_parse_rg(bam_header_t *h)
{
- int ret, doff, doff0, dret, z = 0;
- bam1_core_t *c = &b->core;
- kstring_t *str = fp->str;
- kstream_t *ks = fp->ks;
+ kstring_t *rgid, *rglib;
+ char *p, *q, *s, *r;
+ int n = 0;
+
+ // free
+ if (h == 0) return 0;
+ bam_strmap_destroy(h->rg2lib); h->rg2lib = 0;
+ if (h->l_text < 3) return 0;
+ // parse @RG lines
+ h->rg2lib = bam_strmap_init();
+ rgid = calloc(1, sizeof(kstring_t));
+ rglib = calloc(1, sizeof(kstring_t));
+ s = h->text;
+ while ((s = strstr(s, "@RG")) != 0) {
+ if (rgid->l && rglib->l) {
+ bam_strmap_put(h->rg2lib, rgid->s, rglib->s);
+ ++n;
+ }
+ rgid->l = rglib->l = 0;
+ s += 3;
+ r = s;
+ if ((p = strstr(s, "ID:")) != 0) {
+ q = p + 3;
+ for (p = q; *p && *p != '\t' && *p != '\r' && *p != '\n'; ++p);
+ kputsn(q, p - q, rgid);
+ } else {
+ fprintf(stderr, "[bam_header_parse] missing ID tag in @RG lines.\n");
+ break;
+ }
+ if (r < p) r = p;
+ if ((p = strstr(s, "LB:")) != 0) {
+ q = p + 3;
+ for (p = q; *p && *p != '\t' && *p != '\r' && *p != '\n'; ++p);
+ kputsn(q, p - q, rglib);
+ } else {
+ fprintf(stderr, "[bam_header_parse] missing LB tag in @RG lines.\n");
+ break;
+ }
+ if (r < p) r = p;
+ s = r + 3;
+ }
+ if (rgid->l && rglib->l) {
+ bam_strmap_put(h->rg2lib, rgid->s, rglib->s);
+ ++n;
+ }
+ free(rgid->s); free(rgid);
+ free(rglib->s); free(rglib);
+ if (n == 0) {
+ bam_strmap_destroy(h->rg2lib);
+ h->rg2lib = 0;
+ }
+ return n;
+}
+
+int sam_header_parse(bam_header_t *h)
+{
+ int i;
+ char *s, *p, *q, *r;
+
+ // free
+ free(h->target_len); free(h->target_name);
+ h->n_targets = 0; h->target_len = 0; h->target_name = 0;
+ if (h->l_text < 3) return 0;
+ // count number of @SQ
+ s = h->text;
+ while ((s = strstr(s, "@SQ")) != 0) {
+ ++h->n_targets;
+ s += 3;
+ }
+ if (h->n_targets == 0) return 0;
+ h->target_len = (uint32_t*)calloc(h->n_targets, 4);
+ h->target_name = (char**)calloc(h->n_targets, sizeof(void*));
+ // parse @SQ lines
+ i = 0;
+ s = h->text;
+ while ((s = strstr(s, "@SQ")) != 0) {
+ s += 3;
+ r = s;
+ if ((p = strstr(s, "SN:")) != 0) {
+ q = p + 3;
+ for (p = q; *p && *p != '\t' && *p != '\r' && *p != '\n'; ++p);
+ h->target_name[i] = (char*)calloc(p - q + 1, 1);
+ strncpy(h->target_name[i], q, p - q);
+ } else goto header_err_ret;
+ if (r < p) r = p;
+ if ((p = strstr(s, "LN:")) != 0) h->target_len[i] = strtol(p + 3, 0, 10);
+ else goto header_err_ret;
+ if (r < p) r = p;
+ s = r + 3;
+ ++i;
+ }
+ sam_header_parse_rg(h);
+ return h->n_targets;
+header_err_ret:
+ fprintf(stderr, "[bam_header_parse] missing SN or LN tag in @SQ lines.\n");
+ free(h->target_len); free(h->target_name);
+ h->n_targets = 0; h->target_len = 0; h->target_name = 0;
+ return 0;
+}
+
+bam_header_t *sam_header_read(tamFile fp)
+{
+ int ret, dret;
+ bam_header_t *header = bam_header_init();
+ kstring_t *str = fp->str;
while ((ret = ks_getuntil(fp->ks, KS_SEP_TAB, str, &dret)) >= 0 && str->s[0] == '@') { // skip header
- z += str->l + 1;
str->s[str->l] = dret; // note that str->s is NOT null terminated!!
append_text(header, str);
if (dret != '\n') {
ret = ks_getuntil(fp->ks, '\n', str, &dret);
- z += str->l + 1;
str->s[str->l] = '\n'; // NOT null terminated!!
append_text(header, str);
}
++fp->n_lines;
}
- while (ret == 0) { // special consideration for "\r\n" and empty lines
- ret = ks_getuntil(fp->ks, KS_SEP_TAB, str, &dret);
- if (ret >= 0) z += str->l + 1;
+ sam_header_parse(header);
+ bam_init_header_hash(header);
+ fp->is_first = 1;
+ return header;
+}
+
+int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b)
+{
+ int ret, doff, doff0, dret, z = 0;
+ bam1_core_t *c = &b->core;
+ kstring_t *str = fp->str;
+ kstream_t *ks = fp->ks;
+
+ if (fp->is_first) {
+ fp->is_first = 0;
+ ret = str->l;
+ } else {
+ do { // special consideration for empty lines
+ ret = ks_getuntil(fp->ks, KS_SEP_TAB, str, &dret);
+ if (ret >= 0) z += str->l + 1;
+ } while (ret == 0);
}
if (ret < 0) return -1;
++fp->n_lines;
{ // flag, tid, pos, qual
ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->flag = atoi(str->s);
ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->tid = bam_get_tid(header, str->s);
+ if (c->tid < 0 && strcmp(str->s, "*")) {
+ if (header->n_targets == 0) {
+ fprintf(stderr, "[sam_read1] missing header? Abort!\n");
+ exit(1);
+ } else fprintf(stderr, "[sam_read1] reference '%s' is recognized as '*'.\n", str->s);
+ }
ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->pos = isdigit(str->s[0])? atoi(str->s) - 1 : -1;
ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->qual = isdigit(str->s[0])? atoi(str->s) : 0;
if (ret < 0) return -2;
if (c->n_cigar && c->l_qseq != (int32_t)bam_cigar2qlen(c, bam1_cigar(b)))
parse_error(fp->n_lines, "CIGAR and sequence length are inconsistent");
p = (uint8_t*)alloc_data(b, doff + c->l_qseq + (c->l_qseq+1)/2) + doff;
- bzero(p, (c->l_qseq+1)/2);
+ memset(p, 0, (c->l_qseq+1)/2);
for (i = 0; i < c->l_qseq; ++i)
p[i/2] |= bam_nt16_table[(int)str->s[i]] << 4*(1-i%2);
if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -6; // qual
fp->str = (kstring_t*)calloc(1, sizeof(kstring_t));
fp->fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r");
fp->ks = ks_init(fp->fp);
- fp->n_lines = 0;
return fp;
}