/* Index record fields packed as bit-fields of one uint64_t: len gets 32 bits,
 * line_len and line_blen get 16 bits each, so neither line-length field can
 * hold a value above 65535. */
9 uint64_t len:32, line_len:16, line_blen:16;
/* Instantiate the khash table type named `s` with faidx1_t values; the name `s`
 * is what the kh_init/kh_put/kh_get/kh_destroy calls in this file pass as their
 * first argument. Keys are C strings (the sequence names). */
12 KHASH_MAP_INIT_STR(s, faidx1_t)
/* Hand-written prototypes for the off_t-based stdio positioning functions. */
17 extern off_t ftello(FILE *stream);
18 extern int fseeko(FILE *stream, off_t offset, int whence);
/* Map the razf_* calls used in this file onto plain stdio. Note the argument
 * order change in razf_read: it expands to fread with an element size of 1, so
 * its result is the number of bytes actually read (0 at EOF or on error). */
20 #define razf_read(fp, buf, size) fread(buf, 1, size, fp)
21 #define razf_open(fn, mode) fopen(fn, mode)
22 #define razf_close(fp) fclose(fp)
23 #define razf_seek(fp, offset, whence) fseeko(fp, offset, whence)
24 #define razf_tell(fp) ftello(fp)
/* Round x up, in place, to the next power of two (unchanged if it already is one):
 * decrement, OR-smear the highest set bit into all lower bits using shifts of
 * 1, 2, 4, 8 and 16 (enough for a 32-bit value), then increment.
 * x must be a modifiable lvalue and is evaluated many times, so never pass an
 * expression with side effects. */
35 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
/*
 * Record one sequence in the index: store a private copy of `name` in the
 * idx->name array and insert a (len, line_len, line_blen, offset) record into
 * idx->hash under that copy.
 *
 * NOTE(review): len/line_len/line_blen arrive as int but are stored into
 * 32/16/16-bit unsigned bit-fields, so out-of-range or negative values are
 * silently truncated on assignment.
 */
38 static inline void fai_insert_index(faidx_t *idx, const char *name, int len, int line_len, int line_blen, uint64_t offset)
/* Name array is full: double its capacity, starting from 16 slots. */
43 if (idx->n == idx->m) {
44 idx->m = idx->m? idx->m<<1 : 16;
/* NOTE(review): realloc result is assigned straight back and not checked here;
 * on failure the old array would be lost and the next store would dereference NULL. */
45 idx->name = (char**)realloc(idx->name, sizeof(void*) * idx->m);
/* The duplicated string is shared: it is both the array entry and the hash key.
 * NOTE(review): strdup can return NULL; no check is visible. */
47 idx->name[idx->n] = strdup(name);
48 k = kh_put(s, idx->hash, idx->name[idx->n], &ret);
49 t.len = len; t.line_len = line_len; t.line_blen = line_blen; t.offset = offset;
/* NOTE(review): `ret` from kh_put is not inspected in the lines shown, so a
 * repeated name presumably overwrites the earlier record — confirm. */
50 kh_value(idx->hash, k) = t;
/*
 * Scan a FASTA stream one byte at a time and build an in-memory index, calling
 * fai_insert_index with the name, accumulated length, line geometry and the
 * offset captured via razf_tell.
 *
 * Each malformed-input case below prints a diagnostic to stderr and releases
 * both the name buffer and the partially built index.
 *
 * NOTE(review): some lines of this function (braces, a few declarations and
 * statements, the returns) are not shown here; the comments describe only the
 * statements that are visible.
 */
54 faidx_t *fai_build_core(RAZF *rz)
57 int l_name, m_name, ret;
58 int len, line_len, line_blen, state;
/* calloc zero-fills the index, so its fields start out as 0/NULL. */
63 idx = (faidx_t*)calloc(1, sizeof(faidx_t));
64 idx->hash = kh_init(s);
65 name = 0; l_name = m_name = 0;
/* len starts negative; the `len < 0` test below uses that to skip input,
 * presumably meaning "no header seen yet". */
66 len = line_len = line_blen = -1; state = 0; l1 = l2 = -1; offset = 0;
/* razf_read yields the byte count, so the loop ends at EOF or on a read error. */
67 while (razf_read(rz, &c, 1)) {
68 if (c == '\n') { // an empty line
70 offset = razf_tell(rz);
/* NOTE(review): the nesting of this branch relative to the newline test
 * depends on lines that are not shown. */
72 } else if ((state == 0 && len < 0) || state == 2) continue;
74 if (c == '>') { // fasta header
/* Hand the sequence accumulated so far to the index before starting a new one. */
76 fai_insert_index(idx, name, len, line_len, line_blen, offset);
/* Collect the name: bytes up to the first whitespace character or EOF.
 * NOTE(review): if c is a plain char, passing a negative value to isspace()
 * is undefined behaviour — confirm the declaration / cast to unsigned char. */
78 while ((ret = razf_read(rz, &c, 1)) != 0 && !isspace(c)) {
/* Grow the name buffer when fewer than two spare bytes remain.
 * NOTE(review): realloc result is assigned directly and not checked here. */
79 if (m_name < l_name + 2) {
82 name = (char*)realloc(name, m_name);
88 fprintf(stderr, "[fai_build_core] the last entry has no sequence\n");
/* idx came from calloc, so fai_destroy's `if (fai->rz)` close is skipped unless rz was set. */
89 free(name); fai_destroy(idx);
/* Discard the remainder of the header line (description text after the name). */
92 if (c != '\n') while (razf_read(rz, &c, 1) && c != '\n');
/* Offset now points just past the header line's newline. */
94 offset = razf_tell(rz);
97 fprintf(stderr, "[fai_build_core] inlined empty line is not allowed in sequence '%s'.\n", name);
98 free(name); fai_destroy(idx);
/* State 2 is entered below when a line's geometry differs from the recorded
 * one; seeing another line after that promotes it to state 3. */
101 if (state == 2) state = 3;
/* l2 counts only printable, non-space characters on the current line. */
105 if (isgraph(c)) ++l2;
106 } while ((ret = razf_read(rz, &c, 1)) && c != '\n');
/* A non-empty line following an off-geometry line is rejected. */
107 if (state == 3 && l2) {
108 fprintf(stderr, "[fai_build_core] different line length in sequence '%s'.\n", name);
109 free(name); fai_destroy(idx);
/* The limit matches the 16-bit line_len/line_blen bit-fields of the index record. */
114 fprintf(stderr, "[fai_build_core] line length exceeds 65535 in sequence '%s'.\n", name);
115 free(name); fai_destroy(idx);
/* State 1: this line fixes the sequence's line geometry (l1 -> line_len,
 * l2 -> line_blen). State 0: later lines are compared against it. */
118 if (state == 1) line_len = l1, line_blen = l2, state = 0;
119 else if (state == 0) {
120 if (l1 != line_len || l2 != line_blen) state = 2;
124 fai_insert_index(idx, name, len, line_len, line_blen, offset);
/*
 * Write the index to fp as text, one line per sequence in fai->name order:
 * name, len, offset, line_blen, line_len, separated by tabs.
 */
129 void fai_save(const faidx_t *fai, FILE *fp)
133 for (i = 0; i < fai->n; ++i) {
/* NOTE(review): the kh_get result is not compared against kh_end in the lines
 * shown; this relies on every stored name also being a hash key. */
135 k = kh_get(s, fai->hash, fai->name[i]);
136 x = kh_value(fai->hash, k);
/* NOTE(review): x.len is a 32-bit unsigned bit-field cast to int for %d, so a
 * length above INT_MAX would not print as its true value. */
137 fprintf(fp, "%s\t%d\t%lld\t%d\t%d\n", fai->name[i], (int)x.len, (long long)x.offset, (int)x.line_blen, (int)x.line_len);
/*
 * Parse a text index from fp into a freshly allocated faidx_t, reading the
 * fields in the same order fai_save writes them: name, len, offset,
 * line_blen, line_len.
 */
141 faidx_t *fai_read(FILE *fp)
145 int len, line_len, line_blen;
147 fai = (faidx_t*)calloc(1, sizeof(faidx_t));
148 fai->hash = kh_init(s);
/* 64 KiB line buffer; an index line longer than this would be split by fgets. */
149 buf = (char*)calloc(0x10000, 1);
/* The feof() test is redundant: fgets already returns NULL at end of file. */
150 while (!feof(fp) && fgets(buf, 0x10000, fp)) {
/* Advance p past the name, i.e. the leading run of printable non-space characters. */
151 for (p = buf; *p && isgraph(*p); ++p);
/* NOTE(review): the sscanf return value is not checked, so a malformed line
 * would insert stale or uninitialised numbers. %lld requires `offset` to be a
 * long long — confirm its declaration. */
153 sscanf(p, "%d%lld%d%d", &len, &offset, &line_blen, &line_len);
/* buf is passed as the name, so it is presumably NUL-terminated at p on a line not shown — confirm. */
154 fai_insert_index(fai, buf, len, line_len, line_blen, offset);
/*
 * Release an index: free every stored name string, destroy the hash table and
 * close the sequence file handle if one is attached.
 */
160 void fai_destroy(faidx_t *fai)
/* The name strings double as hash keys, so they are freed here rather than by khash. */
163 for (i = 0; i < fai->n; ++i) free(fai->name[i]);
165 kh_destroy(s, fai->hash);
/* rz is only set by fai_load in this file; indexes built or read without it skip the close. */
166 if (fai->rz) razf_close(fai->rz);
/*
 * Build the index for FASTA file `fn` and write it next to it as "<fn>.fai".
 * Failures to open the FASTA file or to create the index file are reported on stderr.
 */
170 int fai_build(const char *fn)
/* +5 leaves room for the ".fai" suffix (4 characters) and the terminating NUL. */
176 str = (char*)calloc(strlen(fn) + 5, 1);
177 sprintf(str, "%s.fai", fn);
178 rz = razf_open(fn, "r");
180 fprintf(stderr, "[fai_build] fail to open the FASTA file.\n");
/* NOTE(review): the visible error paths of fai_build_core destroy the index;
 * confirm a NULL result is handled before the index is used below. */
184 fai = fai_build_core(rz);
186 fp = fopen(str, "w");
188 fprintf(stderr, "[fai_build] fail to write FASTA index.\n");
189 fai_destroy(fai); free(str);
/*
 * Load the index for FASTA file `fn` from "<fn>.fai" and attach an open handle
 * on the FASTA file itself as fai->rz. If the first open of the index fails, a
 * "build FASTA index" notice is printed and the open is retried.
 */
199 faidx_t *fai_load(const char *fn)
/* +5 leaves room for the ".fai" suffix (4 characters) and the terminating NUL. */
204 str = (char*)calloc(strlen(fn) + 5, 1);
205 sprintf(str, "%s.fai", fn);
206 fp = fopen(str, "r");
/* First open failed: presumably the index is built on a line not shown before retrying. */
208 fprintf(stderr, "[fai_load] build FASTA index.\n");
210 fp = fopen(str, "r");
212 fprintf(stderr, "[fai_load] fail to open FASTA index.\n");
/* Keep the FASTA file open for later random access by fai_fetch. */
219 fai->rz = razf_open(fn, "r");
222 fprintf(stderr, "[fai_load] fail to open FASTA file.\n");
/*
 * Fetch the bases for a region string of the form "name", "name:beg" or
 * "name:beg-end" (commas and whitespace are ignored) from the indexed file
 * attached as fai->rz.
 *
 * NOTE(review): the numeric parsing of beg/end, the store through `len` and the
 * return statements are on lines not shown; faidx_main reads the length it
 * passed by address after the call.
 */
228 char *fai_fetch(const faidx_t *fai, const char *str, int *len)
/* Scratch copy of the region string; l is presumably strlen(str) — confirm. */
240 p = s = (char*)malloc(l+1);
241 /* squeeze out "," */
242 for (i = k = 0; i != l; ++i)
/* NOTE(review): str[i] is a plain char; a negative value passed to isspace() is
 * undefined behaviour unless cast to unsigned char. */
243 if (str[i] != ',' && !isspace(str[i])) s[k++] = str[i];
/* Find the ':' that separates the sequence name from the coordinates. */
245 for (i = 0; i != k; ++i) if (s[i] == ':') break;
/* Here the first `s` is the hash type name from KHASH_MAP_INIT_STR and the
 * third is the local string buffer; they are unrelated identifiers. */
247 iter = kh_get(s, h, s); /* get the ref_id */
248 if (iter == kh_end(h)) {
252 val = kh_value(h, iter);
253 if (i == k) { /* dump the whole sequence */
254 beg = 0; end = val.len;
/* p is set to the first character after ':'; the scan then looks for the '-' separator. */
256 for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break;
261 } else end = val.len;
/* Clamp both ends to the sequence length and force beg <= end. */
264 if (beg >= val.len) beg = val.len;
265 if (end >= val.len) end = val.len;
266 if (beg > end) beg = end;
269 // now retrieve the sequence
/* s is reassigned here; the scratch buffer is presumably freed on a line not shown — confirm. */
271 s = (char*)malloc(end - beg + 2);
/* File position of base `beg`: whole lines before it contribute line_len bytes
 * each, plus the remainder within its line.
 * NOTE(review): divides by val.line_blen; a record with line_blen == 0 would
 * divide by zero — confirm such records cannot be stored. */
272 razf_seek(fai->rz, val.offset + beg / val.line_blen * val.line_len + beg % val.line_blen, SEEK_SET);
/* The read happens before the count check, so one byte past the last base is
 * consumed when the limit is reached. Only printable non-space bytes are kept. */
273 while (razf_read(fai->rz, &c, 1) == 1 && l < end - beg)
274 if (isgraph(c)) s[l++] = c;
/*
 * Command-line driver. With only a FASTA file argument it builds the index;
 * with additional region arguments it loads the index and prints each region
 * in FASTA form.
 */
280 int faidx_main(int argc, char *argv[])
283 fprintf(stderr, "Usage: faidx <in.fasta> [<reg> [...]]\n");
/* NOTE(review): the result of fai_build is discarded here. */
286 if (argc == 2) fai_build(argv[1]);
291 fai = fai_load(argv[1]);
292 if (fai == 0) return 1;
293 for (i = 2; i != argc; ++i) {
/* The region string itself is used as the FASTA header of the output record. */
294 printf(">%s\n", argv[i]);
/* NOTE(review): no NULL check on the fetched buffer is visible — confirm. */
295 s = fai_fetch(fai, argv[i], &l);
/* Emit the sequence in rows of at most 60 characters. */
296 for (j = 0; j < l; j += 60) {
297 for (k = 0; k < 60 && k < l - j; ++k)
/* Program entry point: forwards the arguments to faidx_main and returns its result. */
310 int main(int argc, char *argv[]) { return faidx_main(argc, argv); }