*/
bam_header_t *sam_header_read2(const char *fn_list);
+ bam_header_t *sam_header_read(tamFile fp);
+ int sam_header_parse(bam_header_t *h);
+
#define sam_write1(header, b) bam_view1(header, b)
/*!
kstream_t *ks;
kstring_t *str;
uint64_t n_lines;
+ int is_first;
};
char **__bam_get_lines(const char *fn, int *_n) // for bam_plcmd.c only
header->l_text += str->l + 1;
header->text[header->l_text] = 0;
}
-int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b)
+
+int sam_header_parse(bam_header_t *h)
{
- int ret, doff, doff0, dret, z = 0;
- bam1_core_t *c = &b->core;
- kstring_t *str = fp->str;
- kstream_t *ks = fp->ks;
+ int i;
+ char *s, *p, *q, *r;
+ // free
+ free(h->target_len); free(h->target_name);
+ h->n_targets = 0; h->target_len = 0; h->target_name = 0;
+ if (h->l_text < 3) return 0;
+ // count number of @SQ
+ s = h->text;
+ while ((s = strstr(s, "@SQ")) != 0) {
+ ++h->n_targets;
+ s += 3;
+ }
+ if (h->n_targets == 0) return 0;
+ h->target_len = (uint32_t*)calloc(h->n_targets, 4);
+ h->target_name = (char**)calloc(h->n_targets, sizeof(void*));
+ // parse @SQ lines
+ i = 0;
+ s = h->text;
+ while ((s = strstr(s, "@SQ")) != 0) {
+ s += 3;
+ r = s;
+ if ((p = strstr(s, "SN:")) != 0) {
+ q = p + 3;
+ for (p = q; *p && *p != '\t' && *p != '\r' && *p != '\n'; ++p);
+ h->target_name[i] = (char*)calloc(p - q + 1, 1);
+ strncpy(h->target_name[i], q, p - q);
+ } else goto header_err_ret;
+ if (r < p) r = p;
+ if ((p = strstr(s, "LN:")) != 0) h->target_len[i] = strtol(p + 3, 0, 10);
+ else goto header_err_ret;
+ if (r < p) r = p;
+ s = r + 3;
+ ++i;
+ }
+ return h->n_targets;
+
+header_err_ret:
+ fprintf(stderr, "[bam_header_parse] missing SN or LN tag in @SQ lines.\n");
+ free(h->target_len); free(h->target_name);
+ h->n_targets = 0; h->target_len = 0; h->target_name = 0;
+ return 0;
+}
+
+bam_header_t *sam_header_read(tamFile fp)
+{
+ int ret, dret;
+ bam_header_t *header = bam_header_init();
+ kstring_t *str = fp->str;
while ((ret = ks_getuntil(fp->ks, KS_SEP_TAB, str, &dret)) >= 0 && str->s[0] == '@') { // skip header
- z += str->l + 1;
str->s[str->l] = dret; // note that str->s is NOT null terminated!!
append_text(header, str);
if (dret != '\n') {
ret = ks_getuntil(fp->ks, '\n', str, &dret);
- z += str->l + 1;
str->s[str->l] = '\n'; // NOT null terminated!!
append_text(header, str);
}
++fp->n_lines;
}
- while (ret == 0) { // special consideration for "\r\n" and empty lines
- ret = ks_getuntil(fp->ks, KS_SEP_TAB, str, &dret);
- if (ret >= 0) z += str->l + 1;
+ sam_header_parse(header);
+ bam_init_header_hash(header);
+ fp->is_first = 1;
+ return header;
+}
+
+int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b)
+{
+ int ret, doff, doff0, dret, z = 0;
+ bam1_core_t *c = &b->core;
+ kstring_t *str = fp->str;
+ kstream_t *ks = fp->ks;
+
+ if (fp->is_first) {
+ fp->is_first = 0;
+ ret = str->l;
+ } else {
+ do { // special consideration for empty lines
+ ret = ks_getuntil(fp->ks, KS_SEP_TAB, str, &dret);
+ if (ret >= 0) z += str->l + 1;
+ } while (ret == 0);
}
if (ret < 0) return -1;
++fp->n_lines;
{ // flag, tid, pos, qual
ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->flag = atoi(str->s);
ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->tid = bam_get_tid(header, str->s);
+ if (c->tid < 0 && strcmp(str->s, "*"))
+ fprintf(stderr, "[sam_read1] reference '%s' is recognized as '*'.\n", str->s);
ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->pos = isdigit(str->s[0])? atoi(str->s) - 1 : -1;
ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->qual = isdigit(str->s[0])? atoi(str->s) : 0;
if (ret < 0) return -2;
fp->str = (kstring_t*)calloc(1, sizeof(kstring_t));
fp->fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r");
fp->ks = ks_init(fp->fp);
- fp->n_lines = 0;
return fp;
}
#include "bam.h"
#ifndef PACKAGE_VERSION
-#define PACKAGE_VERSION "0.1.3-13 (r260)"
+#define PACKAGE_VERSION "0.1.3-14 (r262)"
#endif
int bam_taf2baf(int argc, char *argv[]);
return h;
}
-bam_header_t *bam_header_parse(const char *text)
-{
- bam_header_t *h;
- int i;
- char *s, *p, *q, *r;
-
- i = strlen(text);
- if (i < 3) return 0; // headerless
- h = bam_header_init();
- h->l_text = i;
- h->text = strdup(text);
- s = h->text;
- while ((s = strstr(s, "@SQ")) != 0) {
- ++h->n_targets;
- s += 3;
- }
- h->target_len = (uint32_t*)calloc(h->n_targets, 4);
- h->target_name = (char**)calloc(h->n_targets, sizeof(void*));
- i = 0;
- s = h->text;
- while ((s = strstr(s, "@SQ")) != 0) {
- s += 3;
- r = s;
- if ((p = strstr(s, "SN:")) != 0) {
- q = p + 3;
- for (p = q; *p && *p != '\t'; ++p);
- h->target_name[i] = (char*)calloc(p - q + 1, 1);
- strncpy(h->target_name[i], q, p - q);
- } else goto header_err_ret;
- if (r < p) r = p;
- if ((p = strstr(s, "LN:")) != 0) h->target_len[i] = strtol(p + 3, 0, 10);
- else goto header_err_ret;
- if (r < p) r = p;
- s = r + 3;
- ++i;
- }
- if (h->n_targets == 0) {
- bam_header_destroy(h);
- return 0;
- } else return h;
-
-header_err_ret:
- fprintf(stderr, "[bam_header_parse] missing SN tag in a @SQ line.\n");
- bam_header_destroy(h);
- return 0;
-}
-
samfile_t *samopen(const char *fn, const char *mode, const void *aux)
{
samfile_t *fp;
fp = (samfile_t*)calloc(1, sizeof(samfile_t));
- if (mode[0] == 'r') {
- const char *fn_list = (const char*)aux;
+ if (mode[0] == 'r') { // read
fp->type |= TYPE_READ;
if (mode[1] == 'b') { // binary
fp->type |= TYPE_BAM;
fp->x.bam = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r");
if (fp->x.bam == 0) goto open_err_ret;
fp->header = bam_header_read(fp->x.bam);
- } else {
+ } else { // text
fp->x.tamr = sam_open(fn);
if (fp->x.tamr == 0) goto open_err_ret;
- fp->header = sam_header_read2(fn_list);
+ fp->header = sam_header_read(fp->x.tamr);
+ if (fp->header->n_targets == 0) { // no @SQ fields
+ if (aux) { // check if aux is present
+ bam_header_destroy(fp->header);
+ fp->header = sam_header_read2((const char*)aux);
+ }
+ if (fp->header->n_targets == 0)
+ fprintf(stderr, "[samopen] empty header.\n");
+ } else fprintf(stderr, "[samopen] SAM header is present: %d sequences.\n", fp->header->n_targets);
}
- } else if (mode[0] == 'w') {
+ } else if (mode[0] == 'w') { // write
fp->header = bam_header_dup((const bam_header_t*)aux);
if (mode[1] == 'b') { // binary
fp->type |= TYPE_BAM;
fp->x.bam = strcmp(fn, "-")? bam_open(fn, "w") : bam_dopen(fileno(stdout), "w");
if (fp->x.bam == 0) goto open_err_ret;
bam_header_write(fp->x.bam, fp->header);
- } else {
- int i;
- bam_header_t *alt = 0;
- alt = bam_header_parse(fp->header->text);
+ } else { // text
+ // open file
fp->x.tamw = strcmp(fn, "-")? fopen(fn, "w") : stdout;
if (fp->x.tamr == 0) goto open_err_ret;
- if (alt) {
- if (alt->n_targets != fp->header->n_targets)
- fprintf(stderr, "[samopen] inconsistent number of target sequences.\n");
+ // write header
+ if (strstr(mode, "h")) {
+ int i;
+ bam_header_t *alt;
+ // parse the header text
+ alt = bam_header_init();
+ alt->l_text = fp->header->l_text; alt->text = fp->header->text;
+ sam_header_parse(alt);
+ alt->l_text = 0; alt->text = 0;
+ // check if there are @SQ lines in the header
+ if (alt->n_targets) { // then write the header text without dumping ->target_{name,len}
+ if (alt->n_targets != fp->header->n_targets)
+ fprintf(stderr, "[samopen] inconsistent number of target sequences.\n");
+ fwrite(fp->header->text, 1, fp->header->l_text, fp->x.tamw);
+ } else { // then dump ->target_{name,len}
+ for (i = 0; i < fp->header->n_targets; ++i)
+ fprintf(fp->x.tamw, "@SQ\tSN:%s\tLN:%d\n", fp->header->target_name[i], fp->header->target_len[i]);
+ }
bam_header_destroy(alt);
- fwrite(fp->header->text, 1, fp->header->l_text, fp->x.tamw);
}
- if (strstr(mode, "h")) // print header
- for (i = 0; i < fp->header->n_targets; ++i)
- fprintf(fp->x.tamw, "@SQ\tSN:%s\tLN:%d\n", fp->header->target_name[i], fp->header->target_len[i]);
}
}
return fp;
/* parse command-line options */
strcpy(in_mode, "r"); strcpy(out_mode, "w");
- while ((c = getopt(argc, argv, "bt:hHo:q:f:F:")) >= 0) {
+ while ((c = getopt(argc, argv, "Sbt:hHo:q:f:F:")) >= 0) {
switch (c) {
+ case 'S': is_bamin = 0; break;
case 'b': strcat(out_mode, "b"); break;
case 't': fn_list = strdup(optarg); is_bamin = 0; break;
case 'h': is_header = 1; break;
case 'H': is_header_only = 1; break;
case 'o': fn_out = strdup(optarg); break;
case 'f': g_flag_on = strtol(optarg, 0, 0); break;
- case 'F': g_flag_on = strtol(optarg, 0, 0); break;
+ case 'F': g_flag_off = strtol(optarg, 0, 0); break;
case 'q': g_min_mapQ = atoi(optarg); break;
default: return usage();
}
fprintf(stderr, "Options: -b output BAM\n");
fprintf(stderr, " -h print header for the SAM output\n");
fprintf(stderr, " -H print header only (no alignments)\n");
- fprintf(stderr, " -t FILE list of reference names and lengths, assuming SAM input [null]\n");
+ fprintf(stderr, " -S input is SAM\n");
+ fprintf(stderr, " -t FILE list of reference names and lengths (force -S) [null]\n");
fprintf(stderr, " -o FILE output file name [stdout]\n");
fprintf(stderr, " -f INT required flag, 0 for unset [0]\n");
fprintf(stderr, " -F INT filtering flag, 0 for unset [0]\n");