+Beta Release 0.1.18 (2 September, 2011)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Notable changes in samtools:
+
+ * Support the new =/X CIGAR operators (by Peter Cock).
+
+ * Allow to subsample BAM while keeping the pairing intact (view -s).
+
+ * Implemented variant distance bias as a new filter (by Petr Danecek).
+
+ * Bugfix: huge memory usage during indexing
+
+ * Bugfix: use of uninitialized variable in mpileup (rare)
+
+ * Bugfix: wrong BAQ probability (rare)
+
+Notable changes in bcftools:
+
+ * Support indel in the contrast caller.
+
+ * Bugfix: LRT2=nan in rare cases
+
+(0.1.18: 2 September 2011, r982:295)
+
+
+
Beta Release 0.1.17 (6 July, 2011)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
int32_t l = 0;
for (k = 0; k < c->n_cigar; ++k) {
int op = cigar[k] & BAM_CIGAR_MASK;
- if (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CSOFT_CLIP)
+ if (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CSOFT_CLIP || op == BAM_CEQUAL || op == BAM_CDIFF)
l += cigar[k] >> BAM_CIGAR_SHIFT;
}
return l;
else {
for (i = 0; i < c->n_cigar; ++i) {
kputw(bam1_cigar(b)[i]>>BAM_CIGAR_SHIFT, &str);
- kputc("MIDNSHP"[bam1_cigar(b)[i]&BAM_CIGAR_MASK], &str);
+ kputc("MIDNSHP=X"[bam1_cigar(b)[i]&BAM_CIGAR_MASK], &str);
}
}
kputc('\t', &str);
@copyright Genome Research Ltd.
*/
-#define BAM_VERSION "0.1.17 (r973:277)"
+#define BAM_VERSION "0.1.18 (r982:295)"
#include <stdint.h>
#include <stdlib.h>
/*
CIGAR operations.
*/
-/*! @abstract CIGAR: match */
+/*! @abstract CIGAR: M = match or mismatch*/
#define BAM_CMATCH 0
-/*! @abstract CIGAR: insertion to the reference */
+/*! @abstract CIGAR: I = insertion to the reference */
#define BAM_CINS 1
-/*! @abstract CIGAR: deletion from the reference */
+/*! @abstract CIGAR: D = deletion from the reference */
#define BAM_CDEL 2
-/*! @abstract CIGAR: skip on the reference (e.g. spliced alignment) */
+/*! @abstract CIGAR: N = skip on the reference (e.g. spliced alignment) */
#define BAM_CREF_SKIP 3
-/*! @abstract CIGAR: clip on the read with clipped sequence present in qseq */
+/*! @abstract CIGAR: S = clip on the read with clipped sequence
+ present in qseq */
#define BAM_CSOFT_CLIP 4
-/*! @abstract CIGAR: clip on the read with clipped sequence trimmed off */
+/*! @abstract CIGAR: H = clip on the read with clipped sequence trimmed off */
#define BAM_CHARD_CLIP 5
-/*! @abstract CIGAR: padding */
+/*! @abstract CIGAR: P = padding */
#define BAM_CPAD 6
+/*! @abstract CIGAR: equals = match */
+#define BAM_CEQUAL 7
+/*! @abstract CIGAR: X = mismatch */
+#define BAM_CDIFF 8
/*! @typedef
@abstract Structure for core alignment information.
for (k = 0; k < c->n_cigar; ++k) {
int op = cigar[k] & BAM_CIGAR_MASK;
int l = cigar[k] >> BAM_CIGAR_SHIFT;
- if (op == BAM_CMATCH) {
+ if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
if (c->pos > tpos) return y;
if (x + l > tpos) {
*_tpos = tpos;
for (k = 0; k < b->core.n_cigar; ++k) {
int op = cigar[k]&0xf;
int j, l = cigar[k]>>4;
- if (op == BAM_CMATCH) {
+ if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
for (j = 0; j < l; ++j)
if (x + j >= left && x + j < right)
cns[x+j-left] += (bam1_seqi(seq, y+j) == ref0[x+j-left])? 1 : 0x10000;
return 0;
}
+int bam_aux_drop_other(bam1_t *b, uint8_t *s)
+{
+ if (s) {
+ uint8_t *p, *aux;
+ aux = bam1_aux(b);
+ p = s - 2;
+ __skip_tag(s);
+ memmove(aux, p, s - p);
+ b->data_len -= b->l_aux - (s - p);
+ b->l_aux = s - p;
+ } else {
+ b->data_len -= b->l_aux;
+ b->l_aux = 0;
+ }
+ return 0;
+}
+
void bam_init_header_hash(bam_header_t *header)
{
if (header->hash == 0) {
#include "kseq.h"
#include "khash.h"
-KSTREAM_INIT(gzFile, gzread, 8192)
+KSTREAM_INIT(gzFile, gzread, 16384)
KHASH_MAP_INIT_STR(ref, uint64_t)
void bam_init_header_hash(bam_header_t *header);
z += str->l + 1;
if (str->s[0] != '*') {
for (s = str->s; *s; ++s) {
- if (isalpha(*s)) ++c->n_cigar;
+ if ((isalpha(*s)) || (*s=='=')) ++c->n_cigar;
else if (!isdigit(*s)) parse_error(fp->n_lines, "invalid CIGAR character");
}
b->data = alloc_data(b, doff + c->n_cigar * 4);
for (i = 0, s = str->s; i != c->n_cigar; ++i) {
x = strtol(s, &t, 10);
op = toupper(*t);
- if (op == 'M' || op == '=' || op == 'X') op = BAM_CMATCH;
+ if (op == 'M') op = BAM_CMATCH;
else if (op == 'I') op = BAM_CINS;
else if (op == 'D') op = BAM_CDEL;
else if (op == 'N') op = BAM_CREF_SKIP;
else if (op == 'S') op = BAM_CSOFT_CLIP;
else if (op == 'H') op = BAM_CHARD_CLIP;
else if (op == 'P') op = BAM_CPAD;
+ else if (op == '=') op = BAM_CEQUAL;
+ else if (op == 'X') op = BAM_CDIFF;
else parse_error(fp->n_lines, "invalid CIGAR operation");
s = t + 1;
bam1_cigar(b)[i] = x << BAM_CIGAR_SHIFT | op;
z += str->l + 1;
if (strcmp(str->s, "*")) {
c->l_qseq = strlen(str->s);
- if (c->n_cigar && c->l_qseq != (int32_t)bam_cigar2qlen(c, bam1_cigar(b)))
- parse_error(fp->n_lines, "CIGAR and sequence length are inconsistent");
+ if (c->n_cigar && c->l_qseq != (int32_t)bam_cigar2qlen(c, bam1_cigar(b))) {
+ fprintf(stderr, "Line %ld, sequence length %i vs %i from CIGAR\n",
+ (long)fp->n_lines, c->l_qseq, (int32_t)bam_cigar2qlen(c, bam1_cigar(b)));
+ parse_error(fp->n_lines, "CIGAR and sequence length are inconsistent");
+ }
p = (uint8_t*)alloc_data(b, doff + c->l_qseq + (c->l_qseq+1)/2) + doff;
memset(p, 0, (c->l_qseq+1)/2);
for (i = 0; i < c->l_qseq; ++i)
#include "kaln.h"
#include "kprobaln.h"
+#define USE_EQUAL 1
+#define DROP_TAG 2
+#define BIN_QUAL 4
+#define UPDATE_NM 8
+#define UPDATE_MD 16
+#define HASH_QNM 32
+
char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
-void bam_fillmd1_core(bam1_t *b, char *ref, int is_equal, int max_nm)
+int bam_aux_drop_other(bam1_t *b, uint8_t *s);
+
+void bam_fillmd1_core(bam1_t *b, char *ref, int flag, int max_nm)
{
uint8_t *seq = bam1_seq(b);
uint32_t *cigar = bam1_cigar(b);
bam1_core_t *c = &b->core;
int i, x, y, u = 0;
kstring_t *str;
- uint8_t *old_md, *old_nm;
int32_t old_nm_i = -1, nm = 0;
str = (kstring_t*)calloc(1, sizeof(kstring_t));
for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
int j, l = cigar[i]>>4, op = cigar[i]&0xf;
- if (op == BAM_CMATCH) {
+ if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
for (j = 0; j < l; ++j) {
int z = y + j;
int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]];
if (ref[x+j] == 0) break; // out of boundary
if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
- if (is_equal) seq[z/2] &= (z&1)? 0xf0 : 0x0f;
+ if (flag&USE_EQUAL) seq[z/2] &= (z&1)? 0xf0 : 0x0f;
++u;
} else {
- ksprintf(str, "%d", u);
- kputc(ref[x+j], str);
+ kputw(u, str); kputc(ref[x+j], str);
u = 0; ++nm;
}
}
if (j < l) break;
x += l; y += l;
} else if (op == BAM_CDEL) {
- ksprintf(str, "%d", u);
- kputc('^', str);
+ kputw(u, str); kputc('^', str);
for (j = 0; j < l; ++j) {
if (ref[x+j] == 0) break;
kputc(ref[x+j], str);
x += l;
}
}
- ksprintf(str, "%d", u);
+ kputw(u, str);
// apply max_nm
if (max_nm > 0 && nm >= max_nm) {
for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
int j, l = cigar[i]>>4, op = cigar[i]&0xf;
- if (op == BAM_CMATCH) {
+ if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
for (j = 0; j < l; ++j) {
int z = y + j;
int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]];
}
}
// update NM
- old_nm = bam_aux_get(b, "NM");
- if (c->flag & BAM_FUNMAP) return;
- if (old_nm) old_nm_i = bam_aux2i(old_nm);
- if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
- else if (nm != old_nm_i) {
- fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam1_qname(b), old_nm_i, nm);
- bam_aux_del(b, old_nm);
- bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
+ if (flag & UPDATE_NM) {
+ uint8_t *old_nm = bam_aux_get(b, "NM");
+ if (c->flag & BAM_FUNMAP) return;
+ if (old_nm) old_nm_i = bam_aux2i(old_nm);
+ if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
+ else if (nm != old_nm_i) {
+ fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam1_qname(b), old_nm_i, nm);
+ bam_aux_del(b, old_nm);
+ bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
+ }
}
// update MD
- old_md = bam_aux_get(b, "MD");
- if (!old_md) bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s);
- else {
- int is_diff = 0;
- if (strlen((char*)old_md+1) == str->l) {
- for (i = 0; i < str->l; ++i)
- if (toupper(old_md[i+1]) != toupper(str->s[i]))
- break;
- if (i < str->l) is_diff = 1;
- } else is_diff = 1;
- if (is_diff) {
- fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam1_qname(b), old_md+1, str->s);
- bam_aux_del(b, old_md);
- bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s);
+ if (flag & UPDATE_MD) {
+ uint8_t *old_md = bam_aux_get(b, "MD");
+ if (c->flag & BAM_FUNMAP) return;
+ if (!old_md) bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s);
+ else {
+ int is_diff = 0;
+ if (strlen((char*)old_md+1) == str->l) {
+ for (i = 0; i < str->l; ++i)
+ if (toupper(old_md[i+1]) != toupper(str->s[i]))
+ break;
+ if (i < str->l) is_diff = 1;
+ } else is_diff = 1;
+ if (is_diff) {
+ fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam1_qname(b), old_md+1, str->s);
+ bam_aux_del(b, old_md);
+ bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s);
+ }
}
}
+ // drop all tags but RG
+ if (flag&DROP_TAG) {
+ uint8_t *q = bam_aux_get(b, "RG");
+ bam_aux_drop_other(b, q);
+ }
+ // reduce the resolution of base quality
+ if (flag&BIN_QUAL) {
+ uint8_t *qual = bam1_qual(b);
+ for (i = 0; i < b->core.l_qseq; ++i)
+ if (qual[i] >= 3) qual[i] = qual[i]/10*10 + 7;
+ }
free(str->s); free(str);
}
-void bam_fillmd1(bam1_t *b, char *ref, int is_equal)
+void bam_fillmd1(bam1_t *b, char *ref, int flag)
{
- bam_fillmd1_core(b, ref, is_equal, 0);
+ bam_fillmd1_core(b, ref, flag, 0);
}
int bam_cap_mapQ(bam1_t *b, char *ref, int thres)
mm = q = len = clip_l = clip_q = 0;
for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
int j, l = cigar[i]>>4, op = cigar[i]&0xf;
- if (op == BAM_CMATCH) {
+ if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
for (j = 0; j < l; ++j) {
int z = y + j;
int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]];
for (k = 0; k < c->n_cigar; ++k) {
int op, l;
op = cigar[k]&0xf; l = cigar[k]>>4;
- if (op == BAM_CMATCH) {
+ if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
if (yb < 0) yb = y;
if (xb < 0) xb = x;
ye = y + l; xe = x + l;
if (!extend_baq) { // in this block, bq[] is capped by base quality qual[]
for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) {
int op = cigar[k]&0xf, l = cigar[k]>>4;
- if (op == BAM_CMATCH) {
+ if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
for (i = y; i < y + l; ++i) {
if ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y)) bq[i] = 0;
else bq[i] = bq[i] < q[i]? bq[i] : q[i];
left = calloc(c->l_qseq, 1); rght = calloc(c->l_qseq, 1);
for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) {
int op = cigar[k]&0xf, l = cigar[k]>>4;
- if (op == BAM_CMATCH) {
+ if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
for (i = y; i < y + l; ++i)
bq[i] = ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y))? 0 : q[i];
for (left[y] = bq[y], i = y + 1; i < y + l; ++i)
int bam_fillmd(int argc, char *argv[])
{
- int c, is_equal, tid = -2, ret, len, is_bam_out, is_sam_in, is_uncompressed, max_nm, is_realn, capQ, baq_flag;
+ int c, flt_flag, tid = -2, ret, len, is_bam_out, is_sam_in, is_uncompressed, max_nm, is_realn, capQ, baq_flag;
samfile_t *fp, *fpout = 0;
faidx_t *fai;
char *ref = 0, mode_w[8], mode_r[8];
bam1_t *b;
- is_equal = is_bam_out = is_sam_in = is_uncompressed = is_realn = max_nm = capQ = baq_flag = 0;
+ flt_flag = UPDATE_NM | UPDATE_MD;
+ is_bam_out = is_sam_in = is_uncompressed = is_realn = max_nm = capQ = baq_flag = 0;
mode_w[0] = mode_r[0] = 0;
strcpy(mode_r, "r"); strcpy(mode_w, "w");
- while ((c = getopt(argc, argv, "EreubSC:n:A")) >= 0) {
+ while ((c = getopt(argc, argv, "EqreuNhbSC:n:Ad")) >= 0) {
switch (c) {
case 'r': is_realn = 1; break;
- case 'e': is_equal = 1; break;
+ case 'e': flt_flag |= USE_EQUAL; break;
+ case 'd': flt_flag |= DROP_TAG; break;
+ case 'q': flt_flag |= BIN_QUAL; break;
+ case 'h': flt_flag |= HASH_QNM; break;
+ case 'N': flt_flag &= ~(UPDATE_MD|UPDATE_NM); break;
case 'b': is_bam_out = 1; break;
case 'u': is_uncompressed = is_bam_out = 1; break;
case 'S': is_sam_in = 1; break;
int q = bam_cap_mapQ(b, ref, capQ);
if (b->core.qual > q) b->core.qual = q;
}
- if (ref) bam_fillmd1_core(b, ref, is_equal, max_nm);
+ if (ref) bam_fillmd1_core(b, ref, flt_flag, max_nm);
}
samwrite(fpout, b);
}
if (s->k == -1) { // never processed
is_head = 1;
if (c->n_cigar == 1) { // just one operation, save a loop
- if (_cop(cigar[0]) == BAM_CMATCH) s->k = 0, s->x = c->pos, s->y = 0;
+ if (_cop(cigar[0]) == BAM_CMATCH || _cop(cigar[0]) == BAM_CEQUAL || _cop(cigar[0]) == BAM_CDIFF) s->k = 0, s->x = c->pos, s->y = 0;
} else { // find the first match or deletion
for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) {
int op = _cop(cigar[k]);
int l = _cln(cigar[k]);
- if (op == BAM_CMATCH || op == BAM_CDEL) break;
+ if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CEQUAL || op == BAM_CDIFF) break;
else if (op == BAM_CREF_SKIP) s->x += l;
else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
}
if (pos - s->x >= l) { // jump to the next operation
assert(s->k < c->n_cigar); // otherwise a bug: this function should not be called in this case
op = _cop(cigar[s->k+1]);
- if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP) { // jump to the next without a loop
- if (_cop(cigar[s->k]) == BAM_CMATCH) s->y += l;
+ if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { // jump to the next without a loop
+ if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
s->x += l;
++s->k;
- } else { // find the next M/D/N
- if (_cop(cigar[s->k]) == BAM_CMATCH) s->y += l;
+ } else { // find the next M/D/N/=/X
+ if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
s->x += l;
for (k = s->k + 1; k < c->n_cigar; ++k) {
op = _cop(cigar[k]), l = _cln(cigar[k]);
- if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP) break;
+ if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) break;
else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
}
s->k = k;
for (k = s->k + 2; k < c->n_cigar; ++k) {
op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
if (op2 == BAM_CINS) l3 += l2;
- else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP) break;
+ else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP || op2 == BAM_CEQUAL || op2 == BAM_CDIFF) break;
}
if (l3 > 0) p->indel = l3;
}
}
- if (op == BAM_CMATCH) {
+ if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
p->qpos = s->y + (pos - s->x);
} else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
p->is_del = 1; p->qpos = s->y; // FIXME: distinguish D and N!!!!!
/* The MIT License
- Copyright (c) 2008 Genome Research Ltd (GRL).
+ Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor@live.co.uk>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
SOFTWARE.
*/
-/* Contact: Heng Li <lh3@sanger.ac.uk> */
-
-/*
- 2009-07-16 (lh3): in kstream_t, change "char*" to "unsigned char*"
- */
-
-/* Last Modified: 12APR2009 */
+/* Last Modified: 18AUG2011 */
#ifndef AC_KSEQ_H
#define AC_KSEQ_H
#endif
#define __KS_GETUNTIL(__read, __bufsize) \
- static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
+ static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
{ \
if (dret) *dret = 0; \
- str->l = 0; \
+ str->l = append? str->l : 0; \
if (ks->begin >= ks->end && ks->is_eof) return -1; \
for (;;) { \
int i; \
break; \
} \
} \
- if (str->l == 0) { \
+ if (str->s == 0) { \
str->m = 1; \
str->s = (char*)calloc(1, 1); \
} \
str->s[str->l] = '\0'; \
return str->l; \
- }
+ } \
+ static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
+ { return ks_getuntil2(ks, delimiter, str, dret, 0); }
#define KSTREAM_INIT(type_t, __read, __bufsize) \
__KS_TYPE(type_t) \
-1 end-of-file
-2 truncated quality string
*/
-#define __KSEQ_READ \
- static int kseq_read(kseq_t *seq) \
- { \
- int c; \
- kstream_t *ks = seq->f; \
+#define __KSEQ_READ \
+ static int kseq_read(kseq_t *seq) \
+ { \
+ int c; \
+ kstream_t *ks = seq->f; \
if (seq->last_char == 0) { /* then jump to the next header line */ \
- while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
- if (c == -1) return -1; /* end of file */ \
- seq->last_char = c; \
- } /* the first header char has been read */ \
- seq->comment.l = seq->seq.l = seq->qual.l = 0; \
- if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; \
- if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); \
+ while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
+ if (c == -1) return -1; /* end of file */ \
+ seq->last_char = c; \
+ } /* else: the first header char has been read in the previous call */ \
+ seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
+ if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
+ if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); /* read FASTA/Q comment */ \
+ if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
+ seq->seq.m = 256; \
+ seq->seq.s = (char*)malloc(seq->seq.m); \
+ } \
while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
- if (isgraph(c)) { /* printable non-space character */ \
- if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \
- seq->seq.m = seq->seq.l + 2; \
- kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \
- seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
- } \
- seq->seq.s[seq->seq.l++] = (char)c; \
- } \
- } \
+ seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
+ ks_getuntil2(ks, '\n', &seq->seq, 0, 1); /* read the rest of the line */ \
+ } \
if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
- seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
- if (c != '+') return seq->seq.l; /* FASTA */ \
- if (seq->qual.m < seq->seq.m) { /* allocate enough memory */ \
- seq->qual.m = seq->seq.m; \
- seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
- } \
+ if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
+ seq->seq.m = seq->seq.l + 2; \
+ kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
+ seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
+ } \
+ seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
+ if (c != '+') return seq->seq.l; /* FASTA */ \
+ if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \
+ seq->qual.m = seq->seq.m; \
+ seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
+ } \
while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
- if (c == -1) return -2; /* we should not stop here */ \
- while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l) \
- if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \
- seq->qual.s[seq->qual.l] = 0; /* null terminated string */ \
+ if (c == -1) return -2; /* error: no quality string */ \
+ while (ks_getuntil2(ks, '\n', &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
seq->last_char = 0; /* we have not come to the next header line */ \
- if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \
- return seq->seq.l; \
+ if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
+ return seq->seq.l; \
}
#define __KSEQ_TYPE(type_t) \
} kseq_t;
#define KSEQ_INIT(type_t, __read) \
- KSTREAM_INIT(type_t, __read, 4096) \
+ KSTREAM_INIT(type_t, __read, 16384) \
__KSEQ_TYPE(type_t) \
__KSEQ_BASIC(type_t) \
__KSEQ_READ
CXXFLAGS= $(CFLAGS)
DFLAGS= -D_FILE_OFFSET_BITS=64
OBJS=
-PROG= md5sum-lite md5fa maq2sam-short maq2sam-long wgsim
+PROG= md5sum-lite md5fa maq2sam-short maq2sam-long wgsim seqtk
INCLUDES= -I..
SUBDIRS= .
lib:
-afs2:afs2.o
- $(CC) $(CFLAGS) -o $@ afs2.o -lm -lz -L.. -lbam
+seqtk:seqtk.o
+ $(CC) $(CFLAGS) -o $@ seqtk.o -lm -lz
wgsim:wgsim.o
- $(CC) $(CFLAGS) -o $@ wgsim.o -lm
+ $(CC) $(CFLAGS) -o $@ wgsim.o -lm -lz
md5fa:md5.o md5fa.o md5.h ../kseq.h
$(CC) $(CFLAGS) -o $@ md5.o md5fa.o -lz
md5fa.o:md5.h md5fa.c
$(CC) $(CFLAGS) -c -I.. -o $@ md5fa.c
-afs2.o:afs2.c ../bam.h
- $(CC) $(CFLAGS) -c -I.. -o $@ afs2.c
+seqtk.o:seqtk.c ../khash.h ../kseq.h
+ $(CC) $(CFLAGS) -c -I.. -o $@ seqtk.c
+
+wgsim.o:wgsim.c ../kseq.h
+ $(CC) $(CFLAGS) -c -I.. -o $@ wgsim.c
cleanlocal:
rm -fr gmon.out *.o a.out *.exe *.dSYM $(PROG) *~ *.a
--- /dev/null
+#include <stdio.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <zlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <limits.h>
+
+#include "kseq.h"
+KSEQ_INIT(gzFile, gzread)
+
+typedef struct {
+ int n, m;
+ uint64_t *a;
+} reglist_t;
+
+#include "khash.h"
+KHASH_MAP_INIT_STR(reg, reglist_t)
+
+typedef kh_reg_t reghash_t;
+
+reghash_t *stk_reg_read(const char *fn)
+{
+ reghash_t *h = kh_init(reg);
+ gzFile fp;
+ kstream_t *ks;
+ int dret;
+ kstring_t *str;
+ // read the list
+ str = calloc(1, sizeof(kstring_t));
+ fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
+ ks = ks_init(fp);
+ while (ks_getuntil(ks, 0, str, &dret) >= 0) {
+ int beg = -1, end = -1;
+ reglist_t *p;
+ khint_t k = kh_get(reg, h, str->s);
+ if (k == kh_end(h)) {
+ int ret;
+ char *s = strdup(str->s);
+ k = kh_put(reg, h, s, &ret);
+ memset(&kh_val(h, k), 0, sizeof(reglist_t));
+ }
+ p = &kh_val(h, k);
+ if (dret != '\n') {
+ if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) {
+ beg = atoi(str->s);
+ if (dret != '\n') {
+ if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) {
+ end = atoi(str->s);
+ if (end < 0) end = -1;
+ }
+ }
+ }
+ }
+ // skip the rest of the line
+ if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n');
+ if (end < 0 && beg > 0) end = beg, beg = beg - 1; // if there is only one column
+ if (beg < 0) beg = 0, end = INT_MAX;
+ if (p->n == p->m) {
+ p->m = p->m? p->m<<1 : 4;
+ p->a = realloc(p->a, p->m * 8);
+ }
+ p->a[p->n++] = (uint64_t)beg<<32 | end;
+ }
+ ks_destroy(ks);
+ gzclose(fp);
+ free(str->s); free(str);
+ return h;
+}
+
+void stk_reg_destroy(reghash_t *h)
+{
+ khint_t k;
+ for (k = 0; k < kh_end(h); ++k) {
+ if (kh_exist(h, k)) {
+ free(kh_val(h, k).a);
+ free((char*)kh_key(h, k));
+ }
+ }
+ kh_destroy(reg, h);
+}
+
+/* constant table */
+
+unsigned char seq_nt16_table[256] = {
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15 /*'-'*/,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
+ 15,15, 5, 6, 8,15, 7, 9, 0,10,15,15, 15,15,15,15,
+ 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
+ 15,15, 5, 6, 8,15, 7, 9, 0,10,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
+};
+
+char *seq_nt16_rev_table = "XACMGRSVTWYHKDBN";
+unsigned char seq_nt16to4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
+int bitcnt_table[] = { 4, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
+
+/* composition */
+int stk_comp(int argc, char *argv[])
+{
+ gzFile fp;
+ kseq_t *seq;
+ int l, c, upper_only = 0;
+ reghash_t *h = 0;
+ reglist_t dummy;
+ while ((c = getopt(argc, argv, "ur:")) >= 0) {
+ switch (c) {
+ case 'u': upper_only = 1; break;
+ case 'r': h = stk_reg_read(optarg); break;
+ }
+ }
+ if (argc == optind) {
+ fprintf(stderr, "Usage: seqtk comp [-u] [-r in.bed] <in.fa>\n\n");
+ fprintf(stderr, "Output format: chr, length, #A, #C, #G, #T, #2, #3, #4, #CpG, #tv, #ts, #CpG-ts\n");
+ return 1;
+ }
+ fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r");
+ seq = kseq_init(fp);
+ dummy.n= dummy.m = 1; dummy.a = calloc(1, 8);
+ while ((l = kseq_read(seq)) >= 0) {
+ int i, k;
+ reglist_t *p = 0;
+ if (h) {
+ khint_t k = kh_get(reg, h, seq->name.s);
+ if (k != kh_end(h)) p = &kh_val(h, k);
+ } else {
+ p = &dummy;
+ dummy.a[0] = l;
+ }
+ for (k = 0; p && k < p->n; ++k) {
+ int beg = p->a[k]>>32, end = p->a[k]&0xffffffff;
+ int la, lb, lc, na, nb, nc, cnt[11];
+ if (beg > 0) la = seq->seq.s[beg-1], lb = seq_nt16_table[la], lc = bitcnt_table[lb];
+ else la = 'a', lb = -1, lc = 0;
+ na = seq->seq.s[beg]; nb = seq_nt16_table[na]; nc = bitcnt_table[nb];
+ memset(cnt, 0, 11 * sizeof(int));
+ for (i = beg; i < end; ++i) {
+ int is_CpG = 0, a, b, c;
+ a = na; b = nb; c = nc;
+ na = seq->seq.s[i+1]; nb = seq_nt16_table[na]; nc = bitcnt_table[nb];
+ if (b == 2 || b == 10) { // C or Y
+ if (nb == 4 || nb == 5) is_CpG = 1;
+ } else if (b == 4 || b == 5) { // G or R
+ if (lb == 2 || lb == 10) is_CpG = 1;
+ }
+ if (upper_only == 0 || isupper(a)) {
+ if (c > 1) ++cnt[c+2];
+ if (c == 1) ++cnt[seq_nt16to4_table[b]];
+ if (b == 10 || b == 5) ++cnt[9];
+ else if (c == 2) {
+ ++cnt[8];
+ }
+ if (is_CpG) {
+ ++cnt[7];
+ if (b == 10 || b == 5) ++cnt[10];
+ }
+ }
+ la = a; lb = b; lc = c;
+ }
+ if (h) printf("%s\t%d\t%d", seq->name.s, beg, end);
+ else printf("%s\t%d", seq->name.s, l);
+ for (i = 0; i < 11; ++i) printf("\t%d", cnt[i]);
+ putchar('\n');
+ }
+ fflush(stdout);
+ }
+ free(dummy.a);
+ kseq_destroy(seq);
+ gzclose(fp);
+ return 0;
+}
+
+int stk_randbase(int argc, char *argv[])
+{
+ gzFile fp;
+ kseq_t *seq;
+ int l;
+ if (argc == 1) {
+ fprintf(stderr, "Usage: seqtk randbase <in.fa>\n");
+ return 1;
+ }
+ fp = (strcmp(argv[1], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[1], "r");
+ seq = kseq_init(fp);
+ while ((l = kseq_read(seq)) >= 0) {
+ int i;
+ printf(">%s", seq->name.s);
+ for (i = 0; i < l; ++i) {
+ int c, b, a, j, k, m;
+ b = seq->seq.s[i];
+ c = seq_nt16_table[b];
+ a = bitcnt_table[c];
+ if (a == 2) {
+ m = (drand48() < 0.5);
+ for (j = k = 0; j < 4; ++j) {
+ if ((1<<j & c) == 0) continue;
+ if (k == m) break;
+ ++k;
+ }
+ seq->seq.s[i] = islower(b)? "acgt"[j] : "ACGT"[j];
+ }
+ if (i%60 == 0) putchar('\n');
+ putchar(seq->seq.s[i]);
+ }
+ putchar('\n');
+ }
+ kseq_destroy(seq);
+ gzclose(fp);
+ return 0;
+}
+
+int stk_hety(int argc, char *argv[])
+{
+ gzFile fp;
+ kseq_t *seq;
+ int l, c, win_size = 50000, n_start = 5, win_step, is_lower_mask = 0;
+ char *buf;
+ uint32_t cnt[3];
+ if (argc == 1) {
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Usage: seqtk hety [options] <in.fa>\n\n");
+ fprintf(stderr, "Options: -w INT window size [%d]\n", win_size);
+ fprintf(stderr, " -t INT # start positions in a window [%d]\n", n_start);
+ fprintf(stderr, " -m treat lowercases as masked\n");
+ fprintf(stderr, "\n");
+ return 1;
+ }
+ while ((c = getopt(argc, argv, "w:t:m")) >= 0) {
+ switch (c) {
+ case 'w': win_size = atoi(optarg); break;
+ case 't': n_start = atoi(optarg); break;
+ case 'm': is_lower_mask = 1; break;
+ }
+ }
+ fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r");
+ seq = kseq_init(fp);
+ win_step = win_size / n_start;
+ buf = calloc(win_size, 1);
+ while ((l = kseq_read(seq)) >= 0) {
+ int x, i, y, z, next = 0;
+ cnt[0] = cnt[1] = cnt[2] = 0;
+ for (i = 0; i <= l; ++i) {
+ if ((i >= win_size && i % win_step == 0) || i == l) {
+ if (i == l && l >= win_size) {
+ for (y = l - win_size; y < next; ++y) --cnt[(int)buf[y % win_size]];
+ }
+ if (cnt[1] + cnt[2] > 0)
+ printf("%s\t%d\t%d\t%.2lf\t%d\t%d\n", seq->name.s, next, i,
+ (double)cnt[2] / (cnt[1] + cnt[2]) * win_size, cnt[1] + cnt[2], cnt[2]);
+ next = i;
+ }
+ if (i < l) {
+ y = i % win_size;
+ c = seq->seq.s[i];
+ if (is_lower_mask && islower(c)) c = 'N';
+ c = seq_nt16_table[c];
+ x = bitcnt_table[c];
+ if (i >= win_size) --cnt[(int)buf[y]];
+ buf[y] = z = x > 2? 0 : x == 2? 2 : 1;
+ ++cnt[z];
+ }
+ }
+ }
+ free(buf);
+ kseq_destroy(seq);
+ gzclose(fp);
+ return 0;
+}
+
+/* fq2fa */
+int stk_fq2fa(int argc, char *argv[])
+{
+ gzFile fp;
+ kseq_t *seq;
+ char *buf;
+ int l, i, c, qual_thres = 0, linelen = 60;
+ while ((c = getopt(argc, argv, "q:l:")) >= 0) {
+ switch (c) {
+ case 'q': qual_thres = atoi(optarg); break;
+ case 'l': linelen = atoi(optarg); break;
+ }
+ }
+ if (argc == optind) {
+ fprintf(stderr, "Usage: seqtk fq2fa [-q qualThres=0] [-l lineLen=60] <in.fq>\n");
+ return 1;
+ }
+ buf = linelen > 0? malloc(linelen + 1) : 0;
+ fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
+ seq = kseq_init(fp);
+ while ((l = kseq_read(seq)) >= 0) {
+ if (seq->qual.l && qual_thres > 0) {
+ for (i = 0; i < l; ++i)
+ if (seq->qual.s[i] - 33 < qual_thres)
+ seq->seq.s[i] = tolower(seq->seq.s[i]);
+ }
+ putchar('>');
+ if (seq->comment.l) {
+ fputs(seq->name.s, stdout);
+ putchar(' ');
+ puts(seq->comment.s);
+ } else puts(seq->name.s);
+ if (buf) { // multi-line
+ for (i = 0; i < l; i += linelen) {
+ int x = i + linelen < l? linelen : l - i;
+ memcpy(buf, seq->seq.s + i, x);
+ buf[x] = 0;
+ puts(buf);
+ }
+ } else puts(seq->seq.s);
+ }
+ free(buf);
+ kseq_destroy(seq);
+ gzclose(fp);
+ return 0;
+}
+
+int stk_maskseq(int argc, char *argv[])
+{
+ khash_t(reg) *h = kh_init(reg);
+ gzFile fp;
+ kseq_t *seq;
+ int l, i, j, c, is_complement = 0, is_lower = 0;
+ khint_t k;
+ while ((c = getopt(argc, argv, "cl")) >= 0) {
+ switch (c) {
+ case 'c': is_complement = 1; break;
+ case 'l': is_lower = 1; break;
+ }
+ }
+ if (argc - optind < 2) {
+ fprintf(stderr, "Usage: seqtk maskseq [-cl] <in.fa> <in.bed>\n\n");
+ fprintf(stderr, "Options: -c mask the complement regions\n");
+ fprintf(stderr, " -l soft mask (to lower cases)\n");
+ return 1;
+ }
+ h = stk_reg_read(argv[optind+1]);
+ // maskseq
+ fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
+ seq = kseq_init(fp);
+ while ((l = kseq_read(seq)) >= 0) {
+ k = kh_get(reg, h, seq->name.s);
+ if (k == kh_end(h)) { // not found in the hash table
+ if (is_complement) {
+ for (j = 0; j < l; ++j)
+ seq->seq.s[j] = is_lower? tolower(seq->seq.s[j]) : 'N';
+ }
+ } else {
+ reglist_t *p = &kh_val(h, k);
+ if (!is_complement) {
+ for (i = 0; i < p->n; ++i) {
+ int beg = p->a[i]>>32, end = p->a[i];
+ if (beg >= seq->seq.l) {
+ fprintf(stderr, "[maskseq] start position >= the sequence length.\n");
+ continue;
+ }
+ if (end >= seq->seq.l) end = seq->seq.l;
+ if (is_lower) for (j = beg; j < end; ++j) seq->seq.s[j] = tolower(seq->seq.s[j]);
+ else for (j = beg; j < end; ++j) seq->seq.s[j] = 'N';
+ }
+ } else {
+ int8_t *mask = calloc(seq->seq.l, 1);
+ for (i = 0; i < p->n; ++i) {
+ int beg = p->a[i]>>32, end = p->a[i];
+ if (end >= seq->seq.l) end = seq->seq.l;
+ for (j = beg; j < end; ++j) mask[j] = 1;
+ }
+ for (j = 0; j < l; ++j)
+ if (mask[j] == 0) seq->seq.s[j] = is_lower? tolower(seq->seq.s[j]) : 'N';
+ free(mask);
+ }
+ }
+ printf(">%s", seq->name.s);
+ for (j = 0; j < seq->seq.l; ++j) {
+ if (j%60 == 0) putchar('\n');
+ putchar(seq->seq.s[j]);
+ }
+ putchar('\n');
+ }
+ // free
+ kseq_destroy(seq);
+ gzclose(fp);
+ stk_reg_destroy(h);
+ return 0;
+}
+
+/* subseq */
+
+int stk_subseq(int argc, char *argv[])
+{
+ khash_t(reg) *h = kh_init(reg);
+ gzFile fp;
+ kseq_t *seq;
+ int l, i, j, c, is_tab = 0;
+ khint_t k;
+ while ((c = getopt(argc, argv, "t")) >= 0) {
+ switch (c) {
+ case 't': is_tab = 1; break;
+ }
+ }
+ if (optind + 2 > argc) {
+ fprintf(stderr, "Usage: seqtk subseq [-t] <in.fa> <in.bed>\n\n");
+ fprintf(stderr, "Note: Use 'samtools faidx' if only a few regions are intended.\n");
+ return 1;
+ }
+ h = stk_reg_read(argv[optind+1]);
+ // subseq
+ fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
+ seq = kseq_init(fp);
+ while ((l = kseq_read(seq)) >= 0) {
+ reglist_t *p;
+ k = kh_get(reg, h, seq->name.s);
+ if (k == kh_end(h)) continue;
+ p = &kh_val(h, k);
+ for (i = 0; i < p->n; ++i) {
+ int beg = p->a[i]>>32, end = p->a[i];
+ if (beg >= seq->seq.l) {
+ fprintf(stderr, "[subseq] %s: %d >= %ld\n", seq->name.s, beg, seq->seq.l);
+ continue;
+ }
+ if (end > seq->seq.l) end = seq->seq.l;
+ if (is_tab == 0) {
+ printf("%c%s", seq->qual.l == seq->seq.l? '@' : '>', seq->name.s);
+ if (end == INT_MAX) {
+ if (beg) printf(":%d", beg+1);
+ } else printf(":%d-%d", beg+1, end);
+ } else printf("%s\t%d\t", seq->name.s, beg + 1);
+ if (end > seq->seq.l) end = seq->seq.l;
+ for (j = 0; j < end - beg; ++j) {
+ if (is_tab == 0 && j % 60 == 0) putchar('\n');
+ putchar(seq->seq.s[j + beg]);
+ }
+ putchar('\n');
+ if (seq->qual.l != seq->seq.l || is_tab) continue;
+ printf("+");
+ for (j = 0; j < end - beg; ++j) {
+ if (j % 60 == 0) putchar('\n');
+ putchar(seq->qual.s[j + beg]);
+ }
+ putchar('\n');
+ }
+ }
+ // free
+ kseq_destroy(seq);
+ gzclose(fp);
+ stk_reg_destroy(h);
+ return 0;
+}
+
+/* mergefa */
+int stk_mergefa(int argc, char *argv[])
+{
+ gzFile fp[2];
+ kseq_t *seq[2];
+ int i, l, c, is_intersect = 0, is_haploid = 0, qual = 0, is_mask = 0;
+ while ((c = getopt(argc, argv, "himq:")) >= 0) {
+ switch (c) {
+ case 'i': is_intersect = 1; break;
+ case 'h': is_haploid = 1; break;
+ case 'm': is_mask = 1; break;
+ case 'q': qual = atoi(optarg); break;
+ }
+ }
+ if (is_mask && is_intersect) {
+ fprintf(stderr, "[%s] `-i' and `-h' cannot be applied at the same time.\n", __func__);
+ return 1;
+ }
+ if (optind + 2 > argc) {
+ fprintf(stderr, "\nUsage: seqtk mergefa [options] <in1.fa> <in2.fa>\n\n");
+ fprintf(stderr, "Options: -q INT quality threshold [0]\n");
+ fprintf(stderr, " -i take intersection\n");
+ fprintf(stderr, " -m convert to lowercase when one of the input base is N.\n");
+ fprintf(stderr, " -h suppress hets in the input\n\n");
+ return 1;
+ }
+ for (i = 0; i < 2; ++i) {
+ fp[i] = strcmp(argv[optind+i], "-")? gzopen(argv[optind+i], "r") : gzdopen(fileno(stdin), "r");
+ seq[i] = kseq_init(fp[i]);
+ }
+ while (kseq_read(seq[0]) >= 0) {
+ int min_l, c[2], is_upper;
+ kseq_read(seq[1]);
+ if (strcmp(seq[0]->name.s, seq[1]->name.s))
+ fprintf(stderr, "[%s] Different sequence names: %s != %s\n", __func__, seq[0]->name.s, seq[1]->name.s);
+ if (seq[0]->seq.l != seq[1]->seq.l)
+ fprintf(stderr, "[%s] Unequal sequence length: %ld != %ld\n", __func__, seq[0]->seq.l, seq[1]->seq.l);
+ min_l = seq[0]->seq.l < seq[1]->seq.l? seq[0]->seq.l : seq[1]->seq.l;
+ printf(">%s", seq[0]->name.s);
+ for (l = 0; l < min_l; ++l) {
+ c[0] = seq[0]->seq.s[l]; c[1] = seq[1]->seq.s[l];
+ if (seq[0]->qual.l && seq[0]->qual.s[l] - 33 < qual) c[0] = tolower(c[0]);
+ if (seq[1]->qual.l && seq[1]->qual.s[l] - 33 < qual) c[1] = tolower(c[1]);
+ if (is_intersect) is_upper = (isupper(c[0]) || isupper(c[1]))? 1 : 0;
+ else if (is_mask) is_upper = (isupper(c[0]) || isupper(c[1]))? 1 : 0;
+ else is_upper = (isupper(c[0]) && isupper(c[1]))? 1 : 0;
+ c[0] = seq_nt16_table[c[0]]; c[1] = seq_nt16_table[c[1]];
+ if (c[0] == 0) c[0] = 15;
+ if (c[1] == 0) c[1] = 15;
+ if (is_haploid && (bitcnt_table[c[0]] > 1 || bitcnt_table[c[1]] > 1)) is_upper = 0;
+ if (is_intersect) {
+ c[0] = c[0] & c[1];
+ if (c[0] == 0) is_upper = 0;
+ } else if (is_mask) {
+ if (c[0] == 15 || c[1] == 15) is_upper = 0;
+ c[0] = c[0] & c[1];
+ if (c[0] == 0) is_upper = 0;
+ } else c[0] = c[0] | c[1];
+ c[0] = seq_nt16_rev_table[c[0]];
+ if (!is_upper) c[0] = tolower(c[0]);
+ if (l%60 == 0) putchar('\n');
+ putchar(c[0]);
+ }
+ putchar('\n');
+ }
+ return 0;
+}
+
+int stk_famask(int argc, char *argv[])
+{
+ gzFile fp[2];
+ kseq_t *seq[2];
+ int i, l;
+ if (argc < 3) {
+ fprintf(stderr, "Usage: seqtk famask <src.fa> <mask.fa>\n");
+ return 1;
+ }
+ for (i = 0; i < 2; ++i) {
+ fp[i] = strcmp(argv[optind+i], "-")? gzopen(argv[optind+i], "r") : gzdopen(fileno(stdin), "r");
+ seq[i] = kseq_init(fp[i]);
+ }
+ while (kseq_read(seq[0]) >= 0) {
+ int min_l, c[2];
+ kseq_read(seq[1]);
+ if (strcmp(seq[0]->name.s, seq[1]->name.s))
+ fprintf(stderr, "[%s] Different sequence names: %s != %s\n", __func__, seq[0]->name.s, seq[1]->name.s);
+ if (seq[0]->seq.l != seq[1]->seq.l)
+ fprintf(stderr, "[%s] Unequal sequence length: %ld != %ld\n", __func__, seq[0]->seq.l, seq[1]->seq.l);
+ min_l = seq[0]->seq.l < seq[1]->seq.l? seq[0]->seq.l : seq[1]->seq.l;
+ printf(">%s", seq[0]->name.s);
+ for (l = 0; l < min_l; ++l) {
+ c[0] = seq[0]->seq.s[l]; c[1] = seq[1]->seq.s[l];
+ if (c[1] == 'x') c[0] = tolower(c[0]);
+ else if (c[1] != 'X') c[0] = c[1];
+ if (l%60 == 0) putchar('\n');
+ putchar(c[0]);
+ }
+ putchar('\n');
+ }
+ return 0;
+}
+
+int stk_mutfa(int argc, char *argv[])
+{
+ khash_t(reg) *h = kh_init(reg);
+ gzFile fp;
+ kseq_t *seq;
+ kstream_t *ks;
+ int l, i, dret;
+ kstring_t *str;
+ khint_t k;
+ if (argc < 3) {
+ fprintf(stderr, "Usage: seqtk mutfa <in.fa> <in.snp>\n\n");
+ fprintf(stderr, "Note: <in.snp> contains at least four columns per line which are:\n");
+ fprintf(stderr, " 'chr 1-based-pos any base-changed-to'.\n");
+ return 1;
+ }
+ // read the list
+ str = calloc(1, sizeof(kstring_t));
+ fp = strcmp(argv[2], "-")? gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r");
+ ks = ks_init(fp);
+ while (ks_getuntil(ks, 0, str, &dret) >= 0) {
+ char *s = strdup(str->s);
+ int beg = 0, ret;
+ reglist_t *p;
+ k = kh_get(reg, h, s);
+ if (k == kh_end(h)) {
+ k = kh_put(reg, h, s, &ret);
+ memset(&kh_val(h, k), 0, sizeof(reglist_t));
+ }
+ p = &kh_val(h, k);
+ if (ks_getuntil(ks, 0, str, &dret) > 0) beg = atol(str->s) - 1; // 2nd col
+ ks_getuntil(ks, 0, str, &dret); // 3rd col
+ ks_getuntil(ks, 0, str, &dret); // 4th col
+ // skip the rest of the line
+ if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n');
+ if (isalpha(str->s[0]) && str->l == 1) {
+ if (p->n == p->m) {
+ p->m = p->m? p->m<<1 : 4;
+ p->a = realloc(p->a, p->m * 8);
+ }
+ p->a[p->n++] = (uint64_t)beg<<32 | str->s[0];
+ }
+ }
+ ks_destroy(ks);
+ gzclose(fp);
+ free(str->s); free(str);
+ // mutfa
+ fp = strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r");
+ seq = kseq_init(fp);
+ while ((l = kseq_read(seq)) >= 0) {
+ reglist_t *p;
+ k = kh_get(reg, h, seq->name.s);
+ if (k != kh_end(h)) {
+ p = &kh_val(h, k);
+ for (i = 0; i < p->n; ++i) {
+ int beg = p->a[i]>>32;
+ if (beg < seq->seq.l)
+ seq->seq.s[beg] = (int)p->a[i];
+ }
+ }
+ printf(">%s", seq->name.s);
+ for (i = 0; i < l; ++i) {
+ if (i%60 == 0) putchar('\n');
+ putchar(seq->seq.s[i]);
+ }
+ putchar('\n');
+ }
+ // free
+ kseq_destroy(seq);
+ gzclose(fp);
+ for (k = 0; k < kh_end(h); ++k) {
+ if (kh_exist(h, k)) {
+ free(kh_val(h, k).a);
+ free((char*)kh_key(h, k));
+ }
+ }
+ kh_destroy(reg, h);
+ return 0;
+}
+
+int stk_listhet(int argc, char *argv[])
+{
+ gzFile fp;
+ kseq_t *seq;
+ int i, l;
+ if (argc == 1) {
+ fprintf(stderr, "Usage: seqtk listhet <in.fa>\n");
+ return 1;
+ }
+ fp = (strcmp(argv[1], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[1], "r");
+ seq = kseq_init(fp);
+ while ((l = kseq_read(seq)) >= 0) {
+ for (i = 0; i < l; ++i) {
+ int b = seq->seq.s[i];
+ if (bitcnt_table[seq_nt16_table[b]] == 2)
+ printf("%s\t%d\t%c\n", seq->name.s, i+1, b);
+ }
+ }
+ kseq_destroy(seq);
+ gzclose(fp);
+ return 0;
+}
+
+/* cutN */
+static int cutN_min_N_tract = 1000;
+static int cutN_nonN_penalty = 10;
+
+static int find_next_cut(const kseq_t *ks, int k, int *begin, int *end)
+{
+ int i, b, e;
+ while (k < ks->seq.l) {
+ if (seq_nt16_table[(int)ks->seq.s[k]] == 15) {
+ int score, max;
+ score = 0; e = max = -1;
+ for (i = k; i < ks->seq.l && score >= 0; ++i) { /* forward */
+ if (seq_nt16_table[(int)ks->seq.s[i]] == 15) ++score;
+ else score -= cutN_nonN_penalty;
+ if (score > max) max = score, e = i;
+ }
+ score = 0; b = max = -1;
+ for (i = e; i >= 0 && score >= 0; --i) { /* backward */
+ if (seq_nt16_table[(int)ks->seq.s[i]] == 15) ++score;
+ else score -= cutN_nonN_penalty;
+ if (score > max) max = score, b = i;
+ }
+ if (e + 1 - b >= cutN_min_N_tract) {
+ *begin = b;
+ *end = e + 1;
+ return *end;
+ }
+ k = e + 1;
+ } else ++k;
+ }
+ return -1;
+}
+static void print_seq(FILE *fpout, const kseq_t *ks, int begin, int end)
+{
+ int i;
+ if (begin >= end) return; // FIXME: why may this happen? Understand it!
+ fprintf(fpout, ">%s:%d-%d", ks->name.s, begin+1, end);
+ for (i = begin; i < end && i < ks->seq.l; ++i) {
+ if ((i - begin)%60 == 0) fputc('\n', fpout);
+ fputc(ks->seq.s[i], fpout);
+ }
+ fputc('\n', fpout);
+}
+int stk_cutN(int argc, char *argv[])
+{
+ int c, l, gap_only = 0;
+ gzFile fp;
+ kseq_t *ks;
+ while ((c = getopt(argc, argv, "n:p:g")) >= 0) {
+ switch (c) {
+ case 'n': cutN_min_N_tract = atoi(optarg); break;
+ case 'p': cutN_nonN_penalty = atoi(optarg); break;
+ case 'g': gap_only = 1; break;
+ default: return 1;
+ }
+ }
+ if (argc == optind) {
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Usage: seqtk cutN [options] <in.fa>\n\n");
+ fprintf(stderr, "Options: -n INT min size of N tract [%d]\n", cutN_min_N_tract);
+ fprintf(stderr, " -p INT penalty for a non-N [%d]\n", cutN_nonN_penalty);
+ fprintf(stderr, " -g print gaps only, no sequence\n\n");
+ return 1;
+ }
+ fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r");
+ ks = kseq_init(fp);
+ while ((l = kseq_read(ks)) >= 0) {
+ int k = 0, begin = 0, end = 0;
+ while (find_next_cut(ks, k, &begin, &end) >= 0) {
+ if (begin != 0) {
+ if (gap_only) printf("%s\t%d\t%d\n", ks->name.s, begin, end);
+ else print_seq(stdout, ks, k, begin);
+ }
+ k = end;
+ }
+ if (!gap_only) print_seq(stdout, ks, k, l);
+ }
+ kseq_destroy(ks);
+ gzclose(fp);
+ return 0;
+}
+
+/* main function */
+static int usage()
+{
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Usage: seqtk <command> <arguments>\n\n");
+ fprintf(stderr, "Command: comp get the nucleotide composite of FASTA/Q\n");
+ fprintf(stderr, " hety regional heterozygosity\n");
+ fprintf(stderr, " fq2fa convert FASTQ to FASTA\n");
+ fprintf(stderr, " subseq extract subsequences from FASTA/Q\n");
+ fprintf(stderr, " maskseq mask sequences\n");
+ fprintf(stderr, " mutfa point mutate FASTA at specified positions\n");
+ fprintf(stderr, " mergefa merge two FASTA/Q files\n");
+ fprintf(stderr, " randbase choose a random base from hets\n");
+ fprintf(stderr, " cutN cut sequence at long N\n");
+ fprintf(stderr, " listhet extract the position of each het\n");
+ fprintf(stderr, "\n");
+ return 1;
+}
+
+int main(int argc, char *argv[])
+{
+ if (argc == 1) return usage();
+ if (strcmp(argv[1], "comp") == 0) stk_comp(argc-1, argv+1);
+ else if (strcmp(argv[1], "hety") == 0) stk_hety(argc-1, argv+1);
+ else if (strcmp(argv[1], "fq2fa") == 0) stk_fq2fa(argc-1, argv+1);
+ else if (strcmp(argv[1], "subseq") == 0) stk_subseq(argc-1, argv+1);
+ else if (strcmp(argv[1], "maskseq") == 0) stk_maskseq(argc-1, argv+1);
+ else if (strcmp(argv[1], "mutfa") == 0) stk_mutfa(argc-1, argv+1);
+ else if (strcmp(argv[1], "mergefa") == 0) stk_mergefa(argc-1, argv+1);
+ else if (strcmp(argv[1], "randbase") == 0) stk_randbase(argc-1, argv+1);
+ else if (strcmp(argv[1], "cutN") == 0) stk_cutN(argc-1, argv+1);
+ else if (strcmp(argv[1], "listhet") == 0) stk_listhet(argc-1, argv+1);
+ else if (strcmp(argv[1], "famask") == 0) stk_famask(argc-1, argv+1);
+ else {
+ fprintf(stderr, "[main] unrecognized commad '%s'. Abort!\n", argv[1]);
+ return 1;
+ }
+ return 0;
+}
/* The MIT License
Copyright (c) 2008 Genome Research Ltd (GRL).
+ 2011 Heng Li <lh3@live.co.uk>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
SOFTWARE.
*/
-/* Contact: Heng Li <lh3@sanger.ac.uk> */
-
/* This program is separated from maq's read simulator with Colin
- * Hercus' modification to allow longer indels. Colin is the chief
- * developer of novoalign. */
+ * Hercus' modification to allow longer indels. */
#include <stdlib.h>
#include <math.h>
#include <stdint.h>
#include <ctype.h>
#include <string.h>
+#include <zlib.h>
+#include "kseq.h"
+KSEQ_INIT(gzFile, gzread)
-#define PACKAGE_VERSION "0.2.3"
+#define PACKAGE_VERSION "0.3.0"
const uint8_t nst_nt4_table[256] = {
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
};
-const int nst_color_space_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4};
-
/* Simple normal random number generator, copied from genran.c */
double ran_normal()
}
}
-/* FASTA parser, copied from seq.c */
-
-typedef struct {
- int l, m; /* length and maximum buffer size */
- unsigned char *s; /* sequence */
-} seq_t;
-
-#define INIT_SEQ(seq) (seq).s = 0; (seq).l = (seq).m = 0
-
-static int SEQ_BLOCK_SIZE = 512;
-
-void seq_set_block_size(int size)
-{
- SEQ_BLOCK_SIZE = size;
-}
-
-int seq_read_fasta(FILE *fp, seq_t *seq, char *locus, char *comment)
-{
- int c, l, max;
- char *p;
-
- c = 0;
- while (!feof(fp) && fgetc(fp) != '>');
- if (feof(fp)) return -1;
- p = locus;
- while (!feof(fp) && (c = fgetc(fp)) != ' ' && c != '\t' && c != '\n')
- if (c != '\r') *p++ = c;
- *p = '\0';
- if (comment) {
- p = comment;
- if (c != '\n') {
- while (!feof(fp) && ((c = fgetc(fp)) == ' ' || c == '\t'));
- if (c != '\n') {
- *p++ = c;
- while (!feof(fp) && (c = fgetc(fp)) != '\n')
- if (c != '\r') *p++ = c;
- }
- }
- *p = '\0';
- } else if (c != '\n') while (!feof(fp) && fgetc(fp) != '\n');
- l = 0; max = seq->m;
- while (!feof(fp) && (c = fgetc(fp)) != '>') {
- if (isalpha(c) || c == '-' || c == '.') {
- if (l + 1 >= max) {
- max += SEQ_BLOCK_SIZE;
- seq->s = (unsigned char*)realloc(seq->s, sizeof(char) * max);
- }
- seq->s[l++] = (unsigned char)c;
- }
- }
- if (c == '>') ungetc(c,fp);
- seq->s[l] = 0;
- seq->m = max; seq->l = l;
- return l;
-}
-
-/* Error-checking open, copied from utils.c */
-
-#define xopen(fn, mode) err_xopen_core(__func__, fn, mode)
-
-FILE *err_xopen_core(const char *func, const char *fn, const char *mode)
-{
- FILE *fp = 0;
- if (strcmp(fn, "-") == 0)
- return (strstr(mode, "r"))? stdin : stdout;
- if ((fp = fopen(fn, mode)) == 0) {
- fprintf(stderr, "[%s] fail to open file '%s'. Abort!\n", func, fn);
- abort();
- }
- return fp;
-}
-
/* wgsim */
enum muttype_t {NOCHANGE = 0, INSERT = 0x1000, SUBSTITUTE = 0xe000, DELETE = 0xf000};
static double ERR_RATE = 0.02;
static double MUT_RATE = 0.001;
-static double INDEL_FRAC = 0.1;
+static double INDEL_FRAC = 0.15;
static double INDEL_EXTEND = 0.3;
-static int IS_SOLID = 0;
-static int SHOW_MM_INFO = 1;
+static double MAX_N_RATIO = 0.1;
-void maq_mut_diref(const seq_t *seq, int is_hap, mutseq_t *hap1, mutseq_t *hap2)
+void wgsim_mut_diref(const kseq_t *ks, int is_hap, mutseq_t *hap1, mutseq_t *hap2)
{
int i, deleting = 0;
mutseq_t *ret[2];
ret[0] = hap1; ret[1] = hap2;
- ret[0]->l = seq->l; ret[1]->l = seq->l;
- ret[0]->m = seq->m; ret[1]->m = seq->m;
- ret[0]->s = (mut_t *)calloc(seq->m, sizeof(mut_t));
- ret[1]->s = (mut_t *)calloc(seq->m, sizeof(mut_t));
- for (i = 0; i != seq->l; ++i) {
+ ret[0]->l = ks->seq.l; ret[1]->l = ks->seq.l;
+ ret[0]->m = ks->seq.m; ret[1]->m = ks->seq.m;
+ ret[0]->s = (mut_t *)calloc(ks->seq.m, sizeof(mut_t));
+ ret[1]->s = (mut_t *)calloc(ks->seq.m, sizeof(mut_t));
+ for (i = 0; i != ks->seq.l; ++i) {
int c;
- c = ret[0]->s[i] = ret[1]->s[i] = (mut_t)nst_nt4_table[(int)seq->s[i]];
+ c = ret[0]->s[i] = ret[1]->s[i] = (mut_t)nst_nt4_table[(int)ks->seq.s[i]];
if (deleting) {
if (drand48() < INDEL_EXTEND) {
if (deleting & 1) ret[0]->s[i] |= DELETE;
}
}
}
-void maq_print_mutref(const char *name, const seq_t *seq, mutseq_t *hap1, mutseq_t *hap2)
+void wgsim_print_mutref(const char *name, const kseq_t *ks, mutseq_t *hap1, mutseq_t *hap2)
{
int i;
- for (i = 0; i != seq->l; ++i) {
+ for (i = 0; i != ks->seq.l; ++i) {
int c[3];
- c[0] = nst_nt4_table[(int)seq->s[i]];
+ c[0] = nst_nt4_table[(int)ks->seq.s[i]];
c[1] = hap1->s[i]; c[2] = hap2->s[i];
if (c[0] >= 4) continue;
if ((c[1] & mutmsk) != NOCHANGE || (c[2] & mutmsk) != NOCHANGE) {
} else if (((c[1] & mutmsk) >> 12) <= 5) { // ins
printf("-\t");
int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4;
- while(n > 0) {
+ while (n > 0) {
putchar("ACGTN"[ins & 0x3]);
+ ins >>= 2;
n--;
}
printf("\t-\n");
int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4;
while (n > 0) {
putchar("ACGTN"[ins & 0x3]);
+ ins >>= 2;
n--;
}
printf("\t+\n");
}
}
-void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N, int dist, int std_dev, int size_l, int size_r)
+void wgsim_core(FILE *fpout1, FILE *fpout2, const char *fn, int is_hap, uint64_t N, int dist, int std_dev, int size_l, int size_r)
{
- seq_t seq;
+ kseq_t *ks;
mutseq_t rseq[2];
+ gzFile fp_fa;
uint64_t tot_len, ii;
int i, l, n_ref;
- char name[256], *qstr;
- int size[2], Q;
+ char *qstr;
+ int size[2], Q, max_size;
uint8_t *tmp_seq[2];
mut_t *target;
- INIT_SEQ(seq);
- srand48(time(0));
- seq_set_block_size(0x1000000);
l = size_l > size_r? size_l : size_r;
qstr = (char*)calloc(l+1, 1);
tmp_seq[0] = (uint8_t*)calloc(l+2, 1);
tmp_seq[1] = (uint8_t*)calloc(l+2, 1);
size[0] = size_l; size[1] = size_r;
+ max_size = size_l > size_r? size_l : size_r;
Q = (ERR_RATE == 0.0)? 'I' : (int)(-10.0 * log(ERR_RATE) / log(10.0) + 0.499) + 33;
+ fp_fa = gzopen(fn, "r");
+ ks = kseq_init(fp_fa);
tot_len = n_ref = 0;
- while ((l = seq_read_fasta(fp_fa, &seq, name, 0)) >= 0) {
+ fprintf(stderr, "[%s] calculating the total length of the reference sequence...\n", __func__);
+ while ((l = kseq_read(ks)) >= 0) {
tot_len += l;
++n_ref;
}
- fprintf(stderr, "[wgsim_core] %d sequences, total length: %llu\n", n_ref, (long long)tot_len);
- rewind(fp_fa);
+ fprintf(stderr, "[%s] %d sequences, total length: %llu\n", __func__, n_ref, (long long)tot_len);
+ kseq_destroy(ks);
+ gzclose(fp_fa);
- while ((l = seq_read_fasta(fp_fa, &seq, name, 0)) >= 0) {
+ fp_fa = gzopen(fn, "r");
+ ks = kseq_init(fp_fa);
+ while ((l = kseq_read(ks)) >= 0) {
uint64_t n_pairs = (uint64_t)((long double)l / tot_len * N + 0.5);
if (l < dist + 3 * std_dev) {
- fprintf(stderr, "[wgsim_core] kkip sequence '%s' as it is shorter than %d!\n", name, dist + 3 * std_dev);
+ fprintf(stderr, "[%s] skip sequence '%s' as it is shorter than %d!\n", __func__, ks->name.s, dist + 3 * std_dev);
continue;
}
// generate mutations and print them out
- maq_mut_diref(&seq, is_hap, rseq, rseq+1);
- maq_print_mutref(name, &seq, rseq, rseq+1);
+ wgsim_mut_diref(ks, is_hap, rseq, rseq+1);
+ wgsim_print_mutref(ks->name.s, ks, rseq, rseq+1);
for (ii = 0; ii != n_pairs; ++ii) { // the core loop
double ran;
ran = ran_normal();
ran = ran * std_dev + dist;
d = (int)(ran + 0.5);
+ d = d > max_size? d : max_size;
pos = (int)((l - d + 1) * drand48());
- } while (pos < 0 || pos >= seq.l || pos + d - 1 >= seq.l);
+ } while (pos < 0 || pos >= ks->seq.l || pos + d - 1 >= ks->seq.l);
// flip or not
if (drand48() < 0.5) {
n_sub[0] = n_sub[1] = n_indel[0] = n_indel[1] = n_err[0] = n_err[1] = 0;
#define __gen_read(x, start, iter) do { \
- for (i = (start), k = 0, ext_coor[x] = -10; i >= 0 && i < seq.l && k < s[x]; iter) { \
+ for (i = (start), k = 0, ext_coor[x] = -10; i >= 0 && i < ks->seq.l && k < s[x]; iter) { \
int c = target[i], mut_type = c & mutmsk; \
if (ext_coor[x] < 0) { \
if (mut_type != NOCHANGE && mut_type != SUBSTITUTE) continue; \
if (k != s[x]) ext_coor[x] = -10; \
} while (0)
- if (!IS_SOLID) {
- __gen_read(0, pos, ++i);
- __gen_read(1, pos + d - 1, --i);
- for (k = 0; k < s[1]; ++k) tmp_seq[1][k] = tmp_seq[1][k] < 4? 3 - tmp_seq[1][k] : 4; // complement
- } else {
- int c1, c2, c;
- ++s[0]; ++s[1]; // temporarily increase read length by 1
- if (is_flip) { // RR pair
- __gen_read(0, pos + s[0], --i);
- __gen_read(1, pos + d - 1, --i);
- } else { // FF pair
- __gen_read(0, pos, ++i);
- __gen_read(1, pos + d - 1 - s[1], ++i);
- ++ext_coor[0]; ++ext_coor[1];
- }
- // change to color sequence: (0,1,2,3) -> (A,C,G,T)
- for (j = 0; j < 2; ++j) {
- c1 = tmp_seq[j][0];
- for (i = 1; i < s[j]; ++i) {
- c2 = tmp_seq[j][i];
- c = (c1 >= 4 || c2 >= 4)? 4 : nst_color_space_table[(1<<c1)|(1<<c2)];
- tmp_seq[j][i-1] = c;
- c1 = c2;
- }
- }
- --s[0]; --s[1]; // change back
- }
+ __gen_read(0, pos, ++i);
+ __gen_read(1, pos + d - 1, --i);
+ for (k = 0; k < s[1]; ++k) tmp_seq[1][k] = tmp_seq[1][k] < 4? 3 - tmp_seq[1][k] : 4; // complement
if (ext_coor[0] < 0 || ext_coor[1] < 0) { // fail to generate the read(s)
--ii;
continue;
// generate sequencing errors
for (j = 0; j < 2; ++j) {
+ int n_n = 0;
for (i = 0; i < s[j]; ++i) {
int c = tmp_seq[j][i];
- if (c >= 4) c = 4; // actually c should be never larger than 4 if everything is correct
- else if (drand48() < ERR_RATE) {
- c = (c + (int)(drand48() * 3.0 + 1)) & 3;
+ if (c >= 4) { // actually c should be never larger than 4 if everything is correct
+ c = 4;
+ ++n_n;
+ } else if (drand48() < ERR_RATE) {
+ // c = (c + (int)(drand48() * 3.0 + 1)) & 3; // random sequencing errors
+ c = (c + 1) & 3; // recurrent sequencing errors
++n_err[j];
}
tmp_seq[j][i] = c;
}
+ if ((double)n_n / s[j] > MAX_N_RATIO) break;
+ }
+ if (j < 2) { // too many ambiguous bases on one of the reads
+ --ii;
+ continue;
}
// print
for (j = 0; j < 2; ++j) {
for (i = 0; i < s[j]; ++i) qstr[i] = Q;
qstr[i] = 0;
- if (SHOW_MM_INFO) {
- fprintf(fpo[j], "@%s_%u_%u_%d:%d:%d_%d:%d:%d_%llx/%d\n", name, ext_coor[0]+1, ext_coor[1]+1,
- n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1],
- (long long)ii, j==0? is_flip+1 : 2-is_flip);
- } else {
- fprintf(fpo[j], "@%s_%u_%u_%llx/%d %d:%d:%d_%d:%d:%d\n", name, ext_coor[0]+1, ext_coor[1]+1,
- (long long)ii, j==0? is_flip+1 : 2-is_flip,
- n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1]);
- }
+ fprintf(fpo[j], "@%s_%u_%u_%d:%d:%d_%d:%d:%d_%llx/%d\n", ks->name.s, ext_coor[0]+1, ext_coor[1]+1,
+ n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1],
+ (long long)ii, j==0? is_flip+1 : 2-is_flip);
for (i = 0; i < s[j]; ++i)
fputc("ACGTN"[(int)tmp_seq[j][i]], fpo[j]);
fprintf(fpo[j], "\n+\n%s\n", qstr);
}
free(rseq[0].s); free(rseq[1].s);
}
- free(seq.s); free(qstr);
+ kseq_destroy(ks);
+ gzclose(fp_fa);
+ free(qstr);
free(tmp_seq[0]); free(tmp_seq[1]);
}
fprintf(stderr, " -r FLOAT rate of mutations [%.4f]\n", MUT_RATE);
fprintf(stderr, " -R FLOAT fraction of indels [%.2f]\n", INDEL_FRAC);
fprintf(stderr, " -X FLOAT probability an indel is extended [%.2f]\n", INDEL_EXTEND);
- fprintf(stderr, " -c generate reads in color space (SOLiD reads)\n");
- fprintf(stderr, " -C show mismatch info in comment rather than read name\n");
+ fprintf(stderr, " -S INT seed for random generator [-1]\n");
fprintf(stderr, " -h haplotype mode\n");
fprintf(stderr, "\n");
- fprintf(stderr, "Note: For SOLiD reads, the first read is F3 and the second is R3.\n\n");
return 1;
}
{
int64_t N;
int dist, std_dev, c, size_l, size_r, is_hap = 0;
- FILE *fpout1, *fpout2, *fp_fa;
+ FILE *fpout1, *fpout2;
+ int seed = -1;
N = 1000000; dist = 500; std_dev = 50;
size_l = size_r = 70;
- while ((c = getopt(argc, argv, "e:d:s:N:1:2:r:R:hX:cC")) >= 0) {
+ while ((c = getopt(argc, argv, "e:d:s:N:1:2:r:R:hX:S:")) >= 0) {
switch (c) {
case 'd': dist = atoi(optarg); break;
case 's': std_dev = atoi(optarg); break;
case 'r': MUT_RATE = atof(optarg); break;
case 'R': INDEL_FRAC = atof(optarg); break;
case 'X': INDEL_EXTEND = atof(optarg); break;
- case 'c': IS_SOLID = 1; break;
- case 'C': SHOW_MM_INFO = 0; break;
+ case 'S': seed = atoi(optarg); break;
case 'h': is_hap = 1; break;
}
}
if (argc - optind < 3) return simu_usage();
- fp_fa = (strcmp(argv[optind+0], "-") == 0)? stdin : xopen(argv[optind+0], "r");
- fpout1 = xopen(argv[optind+1], "w");
- fpout2 = xopen(argv[optind+2], "w");
- wgsim_core(fpout1, fpout2, fp_fa, is_hap, N, dist, std_dev, size_l, size_r);
+ fpout1 = fopen(argv[optind+1], "w");
+ fpout2 = fopen(argv[optind+2], "w");
+ if (!fpout1 || !fpout2) {
+ fprintf(stderr, "[wgsim] file open error\n");
+ return 1;
+ }
+ srand48(seed > 0? seed : time(0));
+ wgsim_core(fpout1, fpout2, argv[optind], is_hap, N, dist, std_dev, size_l, size_r);
- fclose(fpout1); fclose(fpout2); fclose(fp_fa);
+ fclose(fpout1); fclose(fpout2);
return 0;
}