10 static void str_init(str_t* str)
14 str->s = malloc_or_die(str->size);
19 static void str_free(str_t* str)
25 /* Reserve space for `size` more characters. */
26 static void str_reserve_extra(str_t* str, size_t size)
28 if (str->n + size > str->size) {
29 if (str->n + size > 2 * str->size) {
30 str->size = str->n + size;
33 str->s = realloc_or_die(str->s, str->size * sizeof(char));
38 /* Copy n characters from c to the end of str. */
39 static void str_append(str_t* str, char* c, size_t n)
41 str_reserve_extra(str, n);
42 memcpy(str->s + str->n, c, n);
44 str->s[str->n] = '\0';
50 seq_t* seq = malloc_or_die(sizeof(seq_t));
59 void seq_free(seq_t* seq)
69 /* This is MurmurHash3. The original C++ code was placed in the public domain
70 * by its author, Austin Appleby. */
72 static inline uint32_t fmix(uint32_t h)
84 static inline uint32_t rotl32(uint32_t x, int8_t r)
86 return (x << r) | (x >> (32 - r));
90 uint32_t murmurhash3(uint32_t seed, const uint8_t* data, size_t len_)
92 const int len = (int) len_;
93 const int nblocks = len / 4;
97 uint32_t c1 = 0xcc9e2d51;
98 uint32_t c2 = 0x1b873593;
103 const uint32_t * blocks = (const uint32_t*) (data + nblocks * 4);
106 for(i = -nblocks; i; i++)
108 uint32_t k1 = blocks[i];
116 h1 = h1*5+0xe6546b64;
122 const uint8_t * tail = (const uint8_t*)(data + nblocks*4);
128 case 3: k1 ^= tail[2] << 16;
129 case 2: k1 ^= tail[1] << 8;
130 case 1: k1 ^= tail[0];
131 k1 *= c1; k1 = rotl32(k1,15); k1 *= c2; h1 ^= k1;
145 uint32_t seq_hash(const seq_t* seq)
147 uint32_t h = 0xc062fb4a; /* random seed */
148 h = murmurhash3(h, (uint8_t*) seq->id1.s, seq->id1.n);
149 h = murmurhash3(h, (uint8_t*) seq->seq.s, seq->seq.n);
150 h = murmurhash3(h, (uint8_t*) seq->id2.s, seq->id2.n);
151 h = murmurhash3(h, (uint8_t*) seq->qual.s, seq->qual.n);
156 static const size_t parser_buf_size = 1000000;
170 fastq_t* fastq_create(FILE* file)
172 fastq_t* f = malloc_or_die(sizeof(fastq_t));
174 f->next = f->buf = malloc_or_die(parser_buf_size);
181 void fastq_free(fastq_t* f)
189 FASTQ_STATE_ID1, /* Reading ID1. */
190 FASTQ_STATE_SEQ, /* Reading the sequence. */
191 FASTQ_STATE_ID2, /* Reading ID2. */
192 FASTQ_STATE_QUAL, /* Reading quality scores. */
193 } fastq_parser_state_t;
196 bool fastq_read(fastq_t* f, seq_t* seq)
198 seq->id1.n = seq->seq.n = seq->id2.n = seq->qual.n = 0;
199 fastq_parser_state_t state = FASTQ_STATE_ID1;
200 char* end = f->buf + f->readlen;
202 while (f->next < end) {
203 /* Consume pointless special characters prefixing IDs */
204 if ((state == FASTQ_STATE_ID1 && f->linestart && f->next[0] == '@') ||
205 (state == FASTQ_STATE_ID2 && f->linestart && f->next[0] == '+')) {
206 f->linestart = false;
211 char* u = memchr(f->next, '\n', end - f->next);
213 f->linestart = false;
216 else f->linestart = true;
219 case FASTQ_STATE_ID1:
220 str_append(&seq->id1, f->next, u - f->next);
221 if (f->linestart) state = FASTQ_STATE_SEQ;
224 case FASTQ_STATE_SEQ:
225 str_append(&seq->seq, f->next, u - f->next);
226 if (f->linestart) state = FASTQ_STATE_ID2;
229 case FASTQ_STATE_ID2:
230 str_append(&seq->id2, f->next, u - f->next);
231 if (f->linestart) state = FASTQ_STATE_QUAL;
234 case FASTQ_STATE_QUAL:
235 str_append(&seq->qual, f->next, u - f->next);
246 /* Try to read more. */
247 f->readlen = fread(f->buf, 1, parser_buf_size, f->file);
249 end = f->buf + f->readlen;
250 } while (f->readlen);
256 void fastq_rewind(fastq_t* f)
264 void fastq_print(FILE* fout, const seq_t* seq)
266 fprintf(fout, "@%s\n%s\n+%s\n%s\n",