From 48a5939e574874106f1450fd278f602b731d2a83 Mon Sep 17 00:00:00 2001 From: Daniel Caleb Jones Date: Wed, 2 Mar 2011 19:47:33 -0800 Subject: [PATCH] a new and improved parser --- src/Makefile.am | 6 +- src/fastq-common.c | 56 +++++++++++ src/fastq-common.h | 23 +++++ src/fastq-grep.c | 55 ++++------- src/fastq-kmers.c | 50 ++++------ src/fastq-match.c | 59 +++++------- src/fastq-parse.c | 235 +++++++++++++++++++++++++++++++++++++++++++++ src/fastq-parse.h | 57 +++++++++++ src/kseq.h | 223 ------------------------------------------ 9 files changed, 437 insertions(+), 327 deletions(-) create mode 100644 src/fastq-common.c create mode 100644 src/fastq-common.h create mode 100644 src/fastq-parse.c create mode 100644 src/fastq-parse.h delete mode 100644 src/kseq.h diff --git a/src/Makefile.am b/src/Makefile.am index 4d47d16..c50f420 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -3,13 +3,13 @@ SUBDIRS = swsse2 bin_PROGRAMS = fastq-grep fastq-kmers fastq-match -fastq_grep_SOURCES = kseq.h fastq-grep.c +fastq_grep_SOURCES = kseq.h fastq-grep.c fastq-parse.c fastq-common.c fastq_grep_LDADD = $(PCRE_LIBS) -lz -fastq_kmers_SOURCES = kseq.h fastq-kmers.c +fastq_kmers_SOURCES = kseq.h fastq-kmers.c fastq-parse.c fastq-common.c fastq_kmers_LDADD = -lz -fastq_match_SOURCES = kseq.h fastq-match.c +fastq_match_SOURCES = kseq.h fastq-match.c fastq-parse.c fastq-common.c fastq_match_LDADD = swsse2/libswsse2.la -lz fastq_match_CFLAGS = -msse2 diff --git a/src/fastq-common.c b/src/fastq-common.c new file mode 100644 index 0000000..ae1ecac --- /dev/null +++ b/src/fastq-common.c @@ -0,0 +1,56 @@ + +/* + * This file is part of fastq-tools. + * + * Copyright (c) 2011 by Daniel C. Jones + * + */ + + +#include "fastq-common.h" +#include + + +void or_die(int b, const char* msg) +{ + if (b == 0) { + fputs(msg, stderr); + exit(1); + } +} + + +void* malloc_or_die(size_t n) +{ + void* p = malloc(n); + if (p == NULL) { + fprintf(stderr, "Can not allocate %zu bytes.\n", n); + exit(1); + } + return p; +} + + +void* realloc_or_die(void* ptr, size_t n) +{ + void* p = realloc(ptr, n); + if (p == NULL) { + fprintf(stderr, "Can not allocate %zu bytes.\n", n); + exit(1); + } + return p; +} + + +FILE* fopen_or_die(const char* path, const char* mode) +{ + FILE* f = fopen(path, mode); + if (f == NULL) { + fprintf(stderr, "Can not open file %s with mode %s.\n", path, mode); + exit(1); + } + return f; +} + + + diff --git a/src/fastq-common.h b/src/fastq-common.h new file mode 100644 index 0000000..cdc89e8 --- /dev/null +++ b/src/fastq-common.h @@ -0,0 +1,23 @@ +/* + * This file is part of fastq-tools. + * + * Copyright (c) 2011 by Daniel C. Jones + * + * common : + * A few common functions, primarily for crashing whilst retaining our dignity. + * + */ + +#ifndef FASTQ_TOOLS_COMMON_H +#define FASTQ_TOOLS_COMMON_H + +#include + +void or_die(int b, const char* msg); + +void* malloc_or_die(size_t); +void* realloc_or_die(void*, size_t); +FILE* fopen_or_die(const char*, const char*); + +#endif + diff --git a/src/fastq-grep.c b/src/fastq-grep.c index b177357..f7b3913 100644 --- a/src/fastq-grep.c +++ b/src/fastq-grep.c @@ -1,22 +1,23 @@ - -/* - * fastq-grep: regular expression searches of the sequences within a fastq file +/* + * This file is part of fastq-tools. + * + * Copyright (c) 2011 by Daniel C. Jones * - * Febuary 2011 / Daniel Jones + * fastq-grep : + * Regular expression searches of the sequences within a FASTQ file. * */ +#include "fastq-common.h" +#include "fastq-parse.h" #include #include #include #include #include -#include "kseq.h" -KSEQ_INIT(gzFile, gzread) - #if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__) # include @@ -46,30 +47,30 @@ static int count_flag; -void print_fastq_entry( FILE* fout, kseq_t* seq ) +void print_fastq_entry(FILE* fout, seq_t* seq) { fprintf(fout, "@%s\n%s\n+%s\n%s\n", - seq->name.s, + seq->id1.s, seq->seq.s, - seq->comment.s, + seq->id2.s, seq->qual.s ); } -void fastq_grep( gzFile fin, FILE* fout, pcre* re ) +void fastq_grep(FILE* fin, FILE* fout, pcre* re) { int rc; int ovector[3]; size_t count = 0; - kseq_t* seq; - seq = kseq_init(fin); + fastq_t* fqf = fastq_open(fin); + seq_t* seq = fastq_alloc_seq(); - while (kseq_read(seq) >= 0) { + while (fastq_next(fqf, seq)) { rc = pcre_exec(re, /* pattern */ NULL, /* extre data */ seq->seq.s, /* subject */ - seq->seq.l, /* subject length */ + seq->seq.n, /* subject length */ 0, /* subject offset */ 0, /* options */ ovector, /* output vector */ @@ -81,7 +82,8 @@ void fastq_grep( gzFile fin, FILE* fout, pcre* re ) } } - kseq_destroy(seq); + fastq_free_seq(seq); + fastq_close(fqf); if (count_flag) fprintf(fout, "%zu\n", count); } @@ -99,7 +101,6 @@ int main(int argc, char* argv[]) int pat_error_offset; FILE* fin; - gzFile gzfin; invert_flag = 0; @@ -172,15 +173,7 @@ int main(int argc, char* argv[]) if (optind >= argc || (argc - optind == 1 && strcmp(argv[optind],"-") == 0)) { - gzfin = gzdopen( fileno(stdin), "rb" ); - if (gzfin == NULL) { - fprintf(stderr, "Malformed file 'stdin'.\n"); - return 1; - } - - fastq_grep(gzfin, stdout, re); - - gzclose(gzfin); + fastq_grep(stdin, stdout, re); } else { for (; optind < argc; optind++) { @@ -190,15 +183,9 @@ int main(int argc, char* argv[]) continue; } - gzfin = gzdopen(fileno(fin), "rb"); - if (gzfin == NULL) { - fprintf(stderr, "Malformed file '%s'.\n", argv[optind]); - continue; - } - - fastq_grep(gzfin, stdout, re); + fastq_grep(fin, stdout, re); - gzclose(gzfin); + fclose(fin); } } diff --git a/src/fastq-kmers.c b/src/fastq-kmers.c index 43fa302..d78a453 100644 --- a/src/fastq-kmers.c +++ b/src/fastq-kmers.c @@ -1,22 +1,23 @@ -/* - * fastq-kmers: kmer frequences within fastq files +/* + * This file is part of fastq-tools. * - * Febuary 2011 / Daniel Jones + * Copyright (c) 2011 by Daniel C. Jones + * + * fastq-kmers : + * Tabulate k-mer frequencies with FASTQ files. * */ - - +#include "fastq-common.h" +#include "fastq-parse.h" +#include #include #include #include #include #include -#include "kseq.h" -KSEQ_INIT(gzFile, gzread) - #if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__) # include @@ -105,15 +106,16 @@ void unpackkmer( uint32_t kmer, char* s, int k ) } -void count_fastq_kmers(gzFile* fin, uint32_t* cs) +void count_fastq_kmers(FILE* fin, uint32_t* cs) { - kseq_t* seq = kseq_init(fin); + seq_t* seq = fastq_alloc_seq(); + fastq_t* fqf = fastq_open(fin); int i; int n; uint32_t kmer; - while (kseq_read(seq) >= 0) { - n = (int)seq->seq.l - k + 1; + while (fastq_next(fqf, seq)) { + n = (int)seq->seq.n - k + 1; for (i = 0; i < n; i++) { if( packkmer(seq->seq.s + i, &kmer, k) ) { cs[kmer]++; @@ -121,7 +123,8 @@ void count_fastq_kmers(gzFile* fin, uint32_t* cs) } } - kseq_destroy(seq); + fastq_free_seq(seq); + fastq_close(fqf); } @@ -154,7 +157,6 @@ int main(int argc, char* argv[]) uint32_t* cs; /* counts */ FILE* fin; - gzFile gzfin; int opt; int opt_idx; @@ -222,15 +224,7 @@ int main(int argc, char* argv[]) } if (optind >= argc || (argc - optind == 1 && strcmp(argv[optind],"-") == 0)) { - gzfin = gzdopen( fileno(stdin), "rb" ); - if (gzfin == NULL) { - fprintf(stderr, "Malformed file 'stdin'.\n"); - return 1; - } - - count_fastq_kmers(gzfin, cs); - - gzclose(gzfin); + count_fastq_kmers(stdin, cs); } else { for (; optind < argc; optind++) { @@ -240,15 +234,7 @@ int main(int argc, char* argv[]) continue; } - gzfin = gzdopen(fileno(fin), "rb"); - if (gzfin == NULL) { - fprintf(stderr, "Malformed file '%s'.\n", argv[optind]); - continue; - } - - count_fastq_kmers(gzfin, cs); - - gzclose(gzfin); + count_fastq_kmers(fin, cs); } } diff --git a/src/fastq-match.c b/src/fastq-match.c index ccfca73..69eff70 100644 --- a/src/fastq-match.c +++ b/src/fastq-match.c @@ -1,20 +1,25 @@ -/* Smith-Waterman alignments against sequences within a fastq file. +/* + * This file is part of fastq-tools. + * + * Copyright (c) 2011 by Daniel C. Jones + * + * fastq-match : + * Smith-Waterman alignments against sequences within a fastq file. * - * Daniel Jones */ -#include -#include -#include +#include "fastq-common.h" +#include "fastq-parse.h" #include "swsse2/blosum62.h" #include "swsse2/swsse2.h" #include "swsse2/matrix.h" #include "swsse2/swstriped.h" -#include "kseq.h" -KSEQ_INIT(gzFile, gzread) - +#include +#include +#include +#include #if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__) # include @@ -49,7 +54,7 @@ void convert_sequence(unsigned char* s, int n) } -void fastq_match(gzFile fin, FILE* fout, +void fastq_match(FILE* fin, FILE* fout, SwStripedData* sw_data, unsigned char* query, int n, SEARCH_OPTIONS* options) @@ -60,16 +65,16 @@ void fastq_match(gzFile fin, FILE* fout, int gap_ext = -options->gapExt; int threshold = options->threshold; - kseq_t* seq; - seq = kseq_init(fin); + fastq_t* fqf = fastq_open(fin); + seq_t* seq = fastq_alloc_seq(); - while (kseq_read(seq) >= 0) { + while (fastq_next(fqf, seq)) { fprintf(fout, "%s\t", seq->seq.s); - convert_sequence((unsigned char*)seq->seq.s, seq->seq.l); + convert_sequence((unsigned char*)seq->seq.s, seq->seq.n); score = swStripedByte(query, n, - (unsigned char*)seq->seq.s, seq->seq.l, + (unsigned char*)seq->seq.s, seq->seq.n, gap_init, gap_ext, sw_data->pvbQueryProf, sw_data->pvH1, @@ -78,7 +83,7 @@ void fastq_match(gzFile fin, FILE* fout, sw_data->bias); if (score >= 255) { score = swStripedWord(query, n, - (unsigned char*)seq->seq.s, seq->seq.l, + (unsigned char*)seq->seq.s, seq->seq.n, gap_init, gap_ext, sw_data->pvbQueryProf, sw_data->pvH1, @@ -89,7 +94,8 @@ void fastq_match(gzFile fin, FILE* fout, fprintf(fout, "%d\n", score); } - kseq_destroy(seq); + fastq_free_seq(seq); + fastq_close(fqf); } @@ -110,7 +116,6 @@ int main(int argc, char* argv[]) options.threshold = -1; FILE* fin; - gzFile gzfin; help_flag = 0; @@ -165,15 +170,7 @@ int main(int argc, char* argv[]) sw_data = swStripedInit(query, query_len, mat); if (optind >= argc || (argc - optind == 1 && strcmp(argv[optind],"-") == 0)) { - gzfin = gzdopen( fileno(stdin), "rb" ); - if (gzfin == NULL) { - fprintf(stderr, "Malformed file 'stdin'.\n"); - return 1; - } - - fastq_match(gzfin, stdout, sw_data, query, query_len, &options); - - gzclose(gzfin); + fastq_match(stdin, stdout, sw_data, query, query_len, &options); } else { for (; optind < argc; optind++) { @@ -183,15 +180,7 @@ int main(int argc, char* argv[]) continue; } - gzfin = gzdopen(fileno(fin), "rb"); - if (gzfin == NULL) { - fprintf(stderr, "Malformed file '%s'.\n", argv[optind]); - continue; - } - - fastq_match(gzfin, stdout, sw_data, query, query_len, &options); - - gzclose(gzfin); + fastq_match(fin, stdout, sw_data, query, query_len, &options); } } diff --git a/src/fastq-parse.c b/src/fastq-parse.c new file mode 100644 index 0000000..ec5b478 --- /dev/null +++ b/src/fastq-parse.c @@ -0,0 +1,235 @@ +/* + * This file is part of fastq-tools. + * + * Copyright (c) 2011 by Daniel C. Jones + * + */ + +#include "fastq-parse.h" +#include "fastq-common.h" +#include +#include + + +static const size_t init_str_size = 128; +static const size_t fastq_buf_size = 4096; + +static void fastq_alloc_str(str_t* s) +{ + s->s = malloc_or_die(init_str_size); + s->s[0] = '\0'; + s->n = 0; + s->size = init_str_size; +} + + +static void fastq_expand_str(str_t* s) +{ + s->size *= 2; + realloc_or_die(s->s, s->size); +} + + +seq_t* fastq_alloc_seq() +{ + seq_t* seq = malloc_or_die(sizeof(seq_t)); + fastq_alloc_str(&seq->id1); + fastq_alloc_str(&seq->seq); + fastq_alloc_str(&seq->id2); + fastq_alloc_str(&seq->qual); + + return seq; +} + + +void fastq_free_seq(seq_t* seq) +{ + free(seq->id1.s); + free(seq->seq.s); + free(seq->id2.s); + free(seq->qual.s); + free(seq); +} + + +typedef enum +{ + STATE_EOF, + STATE_ERROR, + STATE_ID1, + STATE_SEQ, + STATE_ID2, + STATE_QUAL + +} fastq_state; + + +fastq_t* fastq_open(FILE* f) +{ + fastq_t* fqf = malloc_or_die(sizeof(fastq_t)); + or_die((int)(fqf->file = gzdopen(fileno(f), "rb")), + "Can not open gzip file."); + + fqf->state = STATE_ID1; + fqf->buf = malloc_or_die(fastq_buf_size); + fqf->buf[0] = '\0'; + fqf->c = fqf->buf; + + return fqf; +} + + +void fastq_close(fastq_t* fqf) +{ + gzclose(fqf->file); + free(fqf->buf); + free(fqf); +} + + +void fastq_refill(fastq_t* f) +{ + int errnum; + const char* errmsg; + + int n = gzread(f->file, f->buf, fastq_buf_size - 1); + + if (n <= 0) { + if (gzeof(f->file)) { + f->state = STATE_EOF; + n = 0; + } + else { + errmsg = gzerror(f->file, &errnum); + fprintf(stderr, "I/O error: %s\n", errmsg); + exit(1); + } + } + + f->buf[n] = '\0'; + f->c = f->buf; +} + + +void fastq_get_line(fastq_t* f, str_t* s) +{ + int i = 0; + + if (f->state == STATE_EOF) goto fastq_get_line_done; + + while (1) { + switch (*f->c) { + case '\0': + fastq_refill(f); + if (f->state == STATE_EOF) goto fastq_get_line_done; + break; + + case '\r': + f->c++; + break; + + case '\n': + goto fastq_get_line_done; + + default: + while (s->size < i + 2) { + fastq_expand_str(s); + } + if (s) s->s[i++] = *f->c; + f->c++; + } + + } + +fastq_get_line_done: + if (s) { + s->s[i] = '\0'; + s->n = i; + } +} + + + +int fastq_next(fastq_t* f, seq_t* seq) +{ + if (f->state == STATE_EOF) return 0; + + while (1) { + + /* read more, if needed */ + if (*f->c == '\0' ) { + fastq_refill(f); + if (f->state == STATE_EOF) return 0; + continue; + } + + /* skip over leading whitespace */ + else if (isspace(*f->c)) { + /* do nothing */ + } + + /* skip comments */ + else if (*f->c == ';') { + fastq_get_line(f, NULL); + if (f->state == STATE_EOF) return 0; + } + + /* read id1 */ + else if (f->state == STATE_ID1) { + if (*f->c == '@') { + f->c++; + fastq_get_line(f, &seq->id1); + if (f->state == STATE_EOF) return 0; + + f->state = STATE_SEQ; + } + else { + fprintf(stderr, "Malformed FASTQ file: expecting an '@', saw a '%c'\n", *f->c); + exit(1); + } + } + + /* read sequence */ + else if (f->state == STATE_SEQ) { + fastq_get_line(f, &seq->seq); + if (f->state == STATE_EOF) return 0; + + f->state = STATE_ID2; + } + + /* read id2 */ + else if (f->state == STATE_ID2) { + if (*f->c == '+') { + f->c++; + fastq_get_line(f, &seq->id2); + if (f->state == STATE_EOF) return 0; + + f->state = STATE_QUAL; + } + else { + fprintf(stderr, "Malformed FASTQ file: expecting an '+', saw a '%c'\n", *f->c); + exit(1); + } + } + + /* read quality string */ + else if (f->state == STATE_QUAL) { + fastq_get_line(f, &seq->qual); + if (f->state == STATE_EOF) return 1; + + f->state = STATE_ID1; + break; + } + + else { + fputs("Inexplicable error in fastq parser.\n", stderr); + exit(1); + } + + f->c++; + } + + return 1; +} + + diff --git a/src/fastq-parse.h b/src/fastq-parse.h new file mode 100644 index 0000000..56a074a --- /dev/null +++ b/src/fastq-parse.h @@ -0,0 +1,57 @@ +/* + * This file is part of fastq-tools. + * + * Copyright (c) 2011 by Daniel C. Jones + * + * fastq-parse : + * A parser for FASTQ files. + * + * This parser is mostly derivative of Heng Li's. + * See: http://lh3lh3.users.sourceforge.net/kseq.shtml + * + */ + +#ifndef FASTQ_TOOLS_PARSE_H +#define FASTQ_TOOLS_PARSE_H + +#include +#include + + +typedef struct +{ + char* s; /* null-terminated string */ + size_t n; /* length of s */ + size_t size; /* bytes allocated for s */ +} str_t; + + + +typedef struct +{ + str_t id1; + str_t seq; + str_t id2; + str_t qual; +} seq_t; + + +seq_t* fastq_alloc_seq(); +void fastq_free_seq(seq_t*); + + +typedef struct +{ + gzFile file; + int state; + char* buf; + char* c; +} fastq_t; + + +fastq_t* fastq_open(FILE*); +void fastq_close(fastq_t*); +int fastq_next(fastq_t*, seq_t*); + +#endif + diff --git a/src/kseq.h b/src/kseq.h deleted file mode 100644 index 4fafd85..0000000 --- a/src/kseq.h +++ /dev/null @@ -1,223 +0,0 @@ -/* The MIT License - - Copyright (c) 2008 Genome Research Ltd (GRL). - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -/* Contact: Heng Li */ - -/* Last Modified: 12APR2009 */ - -#ifndef AC_KSEQ_H -#define AC_KSEQ_H - -#include -#include -#include - -#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r -#define KS_SEP_TAB 1 // isspace() && !' ' -#define KS_SEP_MAX 1 - -#define __KS_TYPE(type_t) \ - typedef struct __kstream_t { \ - char *buf; \ - int begin, end, is_eof; \ - type_t f; \ - } kstream_t; - -#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) -#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) - -#define __KS_BASIC(type_t, __bufsize) \ - static inline kstream_t *ks_init(type_t f) \ - { \ - kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ - ks->f = f; \ - ks->buf = (char*)malloc(__bufsize); \ - return ks; \ - } \ - static inline void ks_destroy(kstream_t *ks) \ - { \ - if (ks) { \ - free(ks->buf); \ - free(ks); \ - } \ - } - -#define __KS_GETC(__read, __bufsize) \ - static inline int ks_getc(kstream_t *ks) \ - { \ - if (ks->is_eof && ks->begin >= ks->end) return -1; \ - if (ks->begin >= ks->end) { \ - ks->begin = 0; \ - ks->end = __read(ks->f, ks->buf, __bufsize); \ - if (ks->end < __bufsize) ks->is_eof = 1; \ - if (ks->end == 0) return -1; \ - } \ - return (int)ks->buf[ks->begin++]; \ - } - -#ifndef KSTRING_T -#define KSTRING_T kstring_t -typedef struct __kstring_t { - size_t l, m; - char *s; -} kstring_t; -#endif - -#ifndef kroundup32 -#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) -#endif - -#define __KS_GETUNTIL(__read, __bufsize) \ - static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ - { \ - if (dret) *dret = 0; \ - str->l = 0; \ - if (ks->begin >= ks->end && ks->is_eof) return -1; \ - for (;;) { \ - int i; \ - if (ks->begin >= ks->end) { \ - if (!ks->is_eof) { \ - ks->begin = 0; \ - ks->end = __read(ks->f, ks->buf, __bufsize); \ - if (ks->end < __bufsize) ks->is_eof = 1; \ - if (ks->end == 0) break; \ - } else break; \ - } \ - if (delimiter > KS_SEP_MAX) { \ - for (i = ks->begin; i < ks->end; ++i) \ - if (ks->buf[i] == delimiter) break; \ - } else if (delimiter == KS_SEP_SPACE) { \ - for (i = ks->begin; i < ks->end; ++i) \ - if (isspace(ks->buf[i])) break; \ - } else if (delimiter == KS_SEP_TAB) { \ - for (i = ks->begin; i < ks->end; ++i) \ - if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ - } else i = 0; /* never come to here! */ \ - if (str->m - str->l < i - ks->begin + 1) { \ - str->m = str->l + (i - ks->begin) + 1; \ - kroundup32(str->m); \ - str->s = (char*)realloc(str->s, str->m); \ - } \ - memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ - str->l = str->l + (i - ks->begin); \ - ks->begin = i + 1; \ - if (i < ks->end) { \ - if (dret) *dret = ks->buf[i]; \ - break; \ - } \ - } \ - if (str->l == 0) { \ - str->m = 1; \ - str->s = (char*)calloc(1, 1); \ - } \ - str->s[str->l] = '\0'; \ - return str->l; \ - } - -#define KSTREAM_INIT(type_t, __read, __bufsize) \ - __KS_TYPE(type_t) \ - __KS_BASIC(type_t, __bufsize) \ - __KS_GETC(__read, __bufsize) \ - __KS_GETUNTIL(__read, __bufsize) - -#define __KSEQ_BASIC(type_t) \ - static inline kseq_t *kseq_init(type_t fd) \ - { \ - kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ - s->f = ks_init(fd); \ - return s; \ - } \ - static inline void kseq_rewind(kseq_t *ks) \ - { \ - ks->last_char = 0; \ - ks->f->is_eof = ks->f->begin = ks->f->end = 0; \ - } \ - static inline void kseq_destroy(kseq_t *ks) \ - { \ - if (!ks) return; \ - free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ - ks_destroy(ks->f); \ - free(ks); \ - } - -/* Return value: - >=0 length of the sequence (normal) - -1 end-of-file - -2 truncated quality string - */ -#define __KSEQ_READ \ - static int kseq_read(kseq_t *seq) \ - { \ - int c; \ - kstream_t *ks = seq->f; \ - if (seq->last_char == 0) { /* then jump to the next header line */ \ - while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ - if (c == -1) return -1; /* end of file */ \ - seq->last_char = c; \ - } /* the first header char has been read */ \ - seq->comment.l = seq->seq.l = seq->qual.l = 0; \ - if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; \ - if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); \ - while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ - if (isgraph(c)) { /* printable non-space character */ \ - if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \ - seq->seq.m = seq->seq.l + 2; \ - kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \ - seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ - } \ - seq->seq.s[seq->seq.l++] = (char)c; \ - } \ - } \ - if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ - seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ - if (c != '+') return seq->seq.l; /* FASTA */ \ - if (seq->qual.m < seq->seq.m) { /* allocate enough memory */ \ - seq->qual.m = seq->seq.m; \ - seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ - } \ - ks_getuntil(ks, '\n', &seq->comment, &c); \ - if (c == -1) return -2; /* we should not stop here */ \ - while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l) \ - if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \ - seq->qual.s[seq->qual.l] = 0; /* null terminated string */ \ - seq->last_char = 0; /* we have not come to the next header line */ \ - if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \ - return seq->seq.l; \ - } - -#define __KSEQ_TYPE(type_t) \ - typedef struct { \ - kstring_t name, comment, seq, qual; \ - int last_char; \ - kstream_t *f; \ - } kseq_t; - -#define KSEQ_INIT(type_t, __read) \ - KSTREAM_INIT(type_t, __read, 4096) \ - __KSEQ_TYPE(type_t) \ - __KSEQ_BASIC(type_t) \ - __KSEQ_READ - -#endif -- 2.39.2