X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=bam_import.c;h=746dc5bbbde0415911794b28d666a4a53ef86be8;hb=8cc87acf088966330e253d44e67791696d74f35b;hp=6b3b4bc69352a92d0c9e74379fe8ddd2e4564c43;hpb=f93dae0d03856955f9424e8b2aaf261304ca647e;p=samtools.git diff --git a/bam_import.c b/bam_import.c index 6b3b4bc..746dc5b 100644 --- a/bam_import.c +++ b/bam_import.c @@ -44,29 +44,27 @@ struct __tamFile_t { uint64_t n_lines; }; -char **bam_load_pos(const char *fn, int *_n) +char **__bam_get_lines(const char *fn, int *_n) // for bam_plcmd.c only { char **list = 0, *s; - int n = 0, dret, m = 0, c; + int n = 0, dret, m = 0; gzFile fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r"); kstream_t *ks; kstring_t *str; str = (kstring_t*)calloc(1, sizeof(kstring_t)); ks = ks_init(fp); - while (ks_getuntil(ks, 0, str, &dret) > 0) { + while (ks_getuntil(ks, '\n', str, &dret) > 0) { if (n == m) { m = m? m << 1 : 16; list = (char**)realloc(list, m * sizeof(char*)); } - s = list[n++] = (char*)calloc(str->l + 5, 1); + if (str->s[str->l-1] == '\r') + str->s[--str->l] = '\0'; + s = list[n++] = (char*)calloc(str->l + 1, 1); strcpy(s, str->s); - s += str->l + 1; - ks_getuntil(ks, 0, str, &dret); - *((uint32_t*)s) = atoi(str->s); - if (dret != '\n') - while ((c = ks_getc(fp)) >= 0 && c != '\n'); } ks_destroy(ks); + gzclose(fp); free(str->s); free(str); *_n = n; return list; @@ -104,7 +102,7 @@ bam_header_t *sam_header_read2(const char *fn) assert(fp); ks = ks_init(fp); str = (kstring_t*)calloc(1, sizeof(kstring_t)); - while (ks_getuntil(ks, 0, str, &dret) >= 0) { + while (ks_getuntil(ks, 0, str, &dret) > 0) { char *s = strdup(str->s); int len, i; i = kh_size(hash); @@ -153,7 +151,7 @@ int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b) kstring_t *str = fp->str; kstream_t *ks = fp->ks; - while ((ret = ks_getuntil(fp->ks, 0, str, &dret)) >= 0 && str->s[0] == '@') { // skip header + while ((ret = ks_getuntil(fp->ks, KS_SEP_TAB, str, &dret)) >= 0 && str->s[0] == '@') { // skip header str->s[str->l] = dret; // note that str->s is NOT null terminated!! append_text(header, str); if (dret != '\n') { @@ -163,7 +161,7 @@ int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b) } ++fp->n_lines; } - while (ret == 0) ret = ks_getuntil(fp->ks, 0, str, &dret); // special consideration for "\r\n" + while (ret == 0) ret = ks_getuntil(fp->ks, KS_SEP_TAB, str, &dret); // special consideration for "\r\n" if (ret < 0) return -1; ++fp->n_lines; doff = 0; @@ -174,10 +172,10 @@ int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b) doff += c->l_qname; } { // flag, tid, pos, qual - ret = ks_getuntil(ks, 0, str, &dret); c->flag = atoi(str->s); - ret = ks_getuntil(ks, 0, str, &dret); c->tid = bam_get_tid(header, str->s); - ret = ks_getuntil(ks, 0, str, &dret); c->pos = isdigit(str->s[0])? atoi(str->s) - 1 : -1; - ret = ks_getuntil(ks, 0, str, &dret); c->qual = isdigit(str->s[0])? atoi(str->s) : 0; + ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); c->flag = atoi(str->s); + ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); c->tid = bam_get_tid(header, str->s); + ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); c->pos = isdigit(str->s[0])? atoi(str->s) - 1 : -1; + ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); c->qual = isdigit(str->s[0])? atoi(str->s) : 0; if (ret < 0) return -2; } { // cigar @@ -185,7 +183,7 @@ int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b) int i, op; long x; c->n_cigar = 0; - if (ks_getuntil(ks, 0, str, &dret) < 0) return -3; + if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -3; if (str->s[0] != '*') { for (s = str->s; *s; ++s) { if (isalpha(*s)) ++c->n_cigar; @@ -209,18 +207,18 @@ int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b) if (*s) parse_error(fp->n_lines, "unmatched CIGAR operation"); c->bin = bam_reg2bin(c->pos, bam_calend(c, bam1_cigar(b))); doff += c->n_cigar * 4; - } + } else c->bin = bam_reg2bin(c->pos, c->pos + 1); } { // mtid, mpos, isize - ret = ks_getuntil(ks, 0, str, &dret); c->mtid = strcmp(str->s, "=")? bam_get_tid(header, str->s) : c->tid; - ret = ks_getuntil(ks, 0, str, &dret); c->mpos = isdigit(str->s[0])? atoi(str->s) - 1 : -1; - ret = ks_getuntil(ks, 0, str, &dret); c->isize = (str->s[0] == '-' || isdigit(str->s[0]))? atoi(str->s) : 0; + ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); c->mtid = strcmp(str->s, "=")? bam_get_tid(header, str->s) : c->tid; + ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); c->mpos = isdigit(str->s[0])? atoi(str->s) - 1 : -1; + ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); c->isize = (str->s[0] == '-' || isdigit(str->s[0]))? atoi(str->s) : 0; if (ret < 0) return -4; } { // seq and qual int i; uint8_t *p; - if (ks_getuntil(ks, 0, str, &dret) < 0) return -5; // seq + if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -5; // seq c->l_qseq = strlen(str->s); if (c->n_cigar && c->l_qseq != (int32_t)bam_cigar2qlen(c, bam1_cigar(b))) parse_error(fp->n_lines, "CIGAR and sequence length are inconsistent"); @@ -228,16 +226,17 @@ int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b) bzero(p, (c->l_qseq+1)/2); for (i = 0; i < c->l_qseq; ++i) p[i/2] |= bam_nt16_table[(int)str->s[i]] << 4*(1-i%2); - if (ks_getuntil(ks, 0, str, &dret) < 0) return -6; // qual - if (c->l_qseq != strlen(str->s)) + if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -6; // qual + if (strcmp(str->s, "*") && c->l_qseq != strlen(str->s)) parse_error(fp->n_lines, "sequence and quality are inconsistent"); p += (c->l_qseq+1)/2; - for (i = 0; i < c->l_qseq; ++i) p[i] = str->s[i] - 33; + if (strcmp(str->s, "*") == 0) for (i = 0; i < c->l_qseq; ++i) p[i] = 0xff; + else for (i = 0; i < c->l_qseq; ++i) p[i] = str->s[i] - 33; doff += c->l_qseq + (c->l_qseq+1)/2; } doff0 = doff; if (dret != '\n' && dret != '\r') { // aux - while (ks_getuntil(ks, 0, str, &dret) >= 0) { + while (ks_getuntil(ks, KS_SEP_TAB, str, &dret) >= 0) { uint8_t *s, type, key[2]; if (str->l < 6 || str->s[2] != ':' || str->s[4] != ':') parse_error(fp->n_lines, "missing colon in auxiliary data"); @@ -245,9 +244,9 @@ int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b) type = str->s[3]; s = alloc_data(b, doff + 3) + doff; s[0] = key[0]; s[1] = key[1]; s += 2; doff += 2; - if (type == 'A' || type == 'a') { + if (type == 'A' || type == 'a' || type == 'c' || type == 'C') { // c and C for backward compatibility s = alloc_data(b, doff + 2) + doff; - *s++ = type; *s = str->s[5]; + *s++ = 'A'; *s = str->s[5]; doff += 2; } else if (type == 'I' || type == 'i') { long long x; @@ -287,6 +286,11 @@ int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b) *s++ = 'f'; *(float*)s = (float)atof(str->s + 5); s += 4; doff += 5; + } else if (type == 'd') { + s = alloc_data(b, doff + 9) + doff; + *s++ = 'd'; + *(float*)s = (float)atof(str->s + 9); + s += 8; doff += 9; } else if (type == 'Z' || type == 'H') { int size = 1 + (str->l - 5) + 1; if (type == 'H') { // check whether the hex string is valid @@ -341,7 +345,7 @@ static void taf2baf_core(const char *fntaf, const char *fnbaf, bam_header_t *hea int ret; b = (bam1_t*)calloc(1, sizeof(bam1_t)); - fpbaf = bam_open(fnbaf, "w"); + fpbaf = (strcmp(fnbaf, "-") == 0)? bam_dopen(fileno(stdout), "w") : bam_open(fnbaf, "w"); fp = sam_open(fntaf); ret = sam_read1(fp, header, b); bam_header_write(fpbaf, header);