From: Heng Li Date: Fri, 2 Sep 2011 16:57:34 +0000 (+0000) Subject: * Updated samtools to the latest X-Git-Url: https://git.donarmstrong.com/?p=samtools.git;a=commitdiff_plain;h=4efe4317c377019fbb64bcfae519403a3a9d1f5f * Updated samtools to the latest * Added seqtk.c * Updated wgsim.c * release samtools-0.1.18 --- diff --git a/NEWS b/NEWS index 1aa2359..41a6cc8 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,30 @@ +Beta Release 0.1.18 (2 September, 2011) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes in samtools: + + * Support the new =/X CIGAR operators (by Peter Cock). + + * Allow to subsample BAM while keeping the pairing intact (view -s). + + * Implemented variant distance bias as a new filter (by Petr Danecek). + + * Bugfix: huge memory usage during indexing + + * Bugfix: use of uninitialized variable in mpileup (rare) + + * Bugfix: wrong BAQ probability (rare) + +Notable changes in bcftools: + + * Support indel in the contrast caller. + + * Bugfix: LRT2=nan in rare cases + +(0.1.18: 2 September 2011, r982:295) + + + Beta Release 0.1.17 (6 July, 2011) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/bam.c b/bam.c index 5fac17d..0055e84 100644 --- a/bam.c +++ b/bam.c @@ -32,7 +32,7 @@ int32_t bam_cigar2qlen(const bam1_core_t *c, const uint32_t *cigar) int32_t l = 0; for (k = 0; k < c->n_cigar; ++k) { int op = cigar[k] & BAM_CIGAR_MASK; - if (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CSOFT_CLIP) + if (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CSOFT_CLIP || op == BAM_CEQUAL || op == BAM_CDIFF) l += cigar[k] >> BAM_CIGAR_SHIFT; } return l; @@ -268,7 +268,7 @@ char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of) else { for (i = 0; i < c->n_cigar; ++i) { kputw(bam1_cigar(b)[i]>>BAM_CIGAR_SHIFT, &str); - kputc("MIDNSHP"[bam1_cigar(b)[i]&BAM_CIGAR_MASK], &str); + kputc("MIDNSHP=X"[bam1_cigar(b)[i]&BAM_CIGAR_MASK], &str); } } kputc('\t', &str); diff --git a/bam.h b/bam.h index 246b020..346c750 100644 --- a/bam.h +++ b/bam.h @@ -40,7 +40,7 @@ @copyright Genome Research Ltd. */ -#define BAM_VERSION "0.1.17 (r973:277)" +#define BAM_VERSION "0.1.18 (r982:295)" #include #include @@ -134,20 +134,25 @@ typedef struct { /* CIGAR operations. */ -/*! @abstract CIGAR: match */ +/*! @abstract CIGAR: M = match or mismatch*/ #define BAM_CMATCH 0 -/*! @abstract CIGAR: insertion to the reference */ +/*! @abstract CIGAR: I = insertion to the reference */ #define BAM_CINS 1 -/*! @abstract CIGAR: deletion from the reference */ +/*! @abstract CIGAR: D = deletion from the reference */ #define BAM_CDEL 2 -/*! @abstract CIGAR: skip on the reference (e.g. spliced alignment) */ +/*! @abstract CIGAR: N = skip on the reference (e.g. spliced alignment) */ #define BAM_CREF_SKIP 3 -/*! @abstract CIGAR: clip on the read with clipped sequence present in qseq */ +/*! @abstract CIGAR: S = clip on the read with clipped sequence + present in qseq */ #define BAM_CSOFT_CLIP 4 -/*! @abstract CIGAR: clip on the read with clipped sequence trimmed off */ +/*! @abstract CIGAR: H = clip on the read with clipped sequence trimmed off */ #define BAM_CHARD_CLIP 5 -/*! @abstract CIGAR: padding */ +/*! @abstract CIGAR: P = padding */ #define BAM_CPAD 6 +/*! @abstract CIGAR: equals = match */ +#define BAM_CEQUAL 7 +/*! @abstract CIGAR: X = mismatch */ +#define BAM_CDIFF 8 /*! @typedef @abstract Structure for core alignment information. diff --git a/bam2bcf_indel.c b/bam2bcf_indel.c index 2fdee9f..5142b3e 100644 --- a/bam2bcf_indel.c +++ b/bam2bcf_indel.c @@ -66,7 +66,7 @@ static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, for (k = 0; k < c->n_cigar; ++k) { int op = cigar[k] & BAM_CIGAR_MASK; int l = cigar[k] >> BAM_CIGAR_SHIFT; - if (op == BAM_CMATCH) { + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { if (c->pos > tpos) return y; if (x + l > tpos) { *_tpos = tpos; @@ -222,7 +222,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla for (k = 0; k < b->core.n_cigar; ++k) { int op = cigar[k]&0xf; int j, l = cigar[k]>>4; - if (op == BAM_CMATCH) { + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { for (j = 0; j < l; ++j) if (x + j >= left && x + j < right) cns[x+j-left] += (bam1_seqi(seq, y+j) == ref0[x+j-left])? 1 : 0x10000; diff --git a/bam_aux.c b/bam_aux.c index 2247bdf..28b22e3 100644 --- a/bam_aux.c +++ b/bam_aux.c @@ -59,6 +59,23 @@ int bam_aux_del(bam1_t *b, uint8_t *s) return 0; } +int bam_aux_drop_other(bam1_t *b, uint8_t *s) +{ + if (s) { + uint8_t *p, *aux; + aux = bam1_aux(b); + p = s - 2; + __skip_tag(s); + memmove(aux, p, s - p); + b->data_len -= b->l_aux - (s - p); + b->l_aux = s - p; + } else { + b->data_len -= b->l_aux; + b->l_aux = 0; + } + return 0; +} + void bam_init_header_hash(bam_header_t *header) { if (header->hash == 0) { diff --git a/bam_import.c b/bam_import.c index 29516c9..5518a9c 100644 --- a/bam_import.c +++ b/bam_import.c @@ -14,7 +14,7 @@ #include "kseq.h" #include "khash.h" -KSTREAM_INIT(gzFile, gzread, 8192) +KSTREAM_INIT(gzFile, gzread, 16384) KHASH_MAP_INIT_STR(ref, uint64_t) void bam_init_header_hash(bam_header_t *header); @@ -292,20 +292,22 @@ int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b) z += str->l + 1; if (str->s[0] != '*') { for (s = str->s; *s; ++s) { - if (isalpha(*s)) ++c->n_cigar; + if ((isalpha(*s)) || (*s=='=')) ++c->n_cigar; else if (!isdigit(*s)) parse_error(fp->n_lines, "invalid CIGAR character"); } b->data = alloc_data(b, doff + c->n_cigar * 4); for (i = 0, s = str->s; i != c->n_cigar; ++i) { x = strtol(s, &t, 10); op = toupper(*t); - if (op == 'M' || op == '=' || op == 'X') op = BAM_CMATCH; + if (op == 'M') op = BAM_CMATCH; else if (op == 'I') op = BAM_CINS; else if (op == 'D') op = BAM_CDEL; else if (op == 'N') op = BAM_CREF_SKIP; else if (op == 'S') op = BAM_CSOFT_CLIP; else if (op == 'H') op = BAM_CHARD_CLIP; else if (op == 'P') op = BAM_CPAD; + else if (op == '=') op = BAM_CEQUAL; + else if (op == 'X') op = BAM_CDIFF; else parse_error(fp->n_lines, "invalid CIGAR operation"); s = t + 1; bam1_cigar(b)[i] = x << BAM_CIGAR_SHIFT | op; @@ -337,8 +339,11 @@ int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b) z += str->l + 1; if (strcmp(str->s, "*")) { c->l_qseq = strlen(str->s); - if (c->n_cigar && c->l_qseq != (int32_t)bam_cigar2qlen(c, bam1_cigar(b))) - parse_error(fp->n_lines, "CIGAR and sequence length are inconsistent"); + if (c->n_cigar && c->l_qseq != (int32_t)bam_cigar2qlen(c, bam1_cigar(b))) { + fprintf(stderr, "Line %ld, sequence length %i vs %i from CIGAR\n", + (long)fp->n_lines, c->l_qseq, (int32_t)bam_cigar2qlen(c, bam1_cigar(b))); + parse_error(fp->n_lines, "CIGAR and sequence length are inconsistent"); + } p = (uint8_t*)alloc_data(b, doff + c->l_qseq + (c->l_qseq+1)/2) + doff; memset(p, 0, (c->l_qseq+1)/2); for (i = 0; i < c->l_qseq; ++i) diff --git a/bam_md.c b/bam_md.c index 20b1913..d42aa8f 100644 --- a/bam_md.c +++ b/bam_md.c @@ -9,40 +9,46 @@ #include "kaln.h" #include "kprobaln.h" +#define USE_EQUAL 1 +#define DROP_TAG 2 +#define BIN_QUAL 4 +#define UPDATE_NM 8 +#define UPDATE_MD 16 +#define HASH_QNM 32 + char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; -void bam_fillmd1_core(bam1_t *b, char *ref, int is_equal, int max_nm) +int bam_aux_drop_other(bam1_t *b, uint8_t *s); + +void bam_fillmd1_core(bam1_t *b, char *ref, int flag, int max_nm) { uint8_t *seq = bam1_seq(b); uint32_t *cigar = bam1_cigar(b); bam1_core_t *c = &b->core; int i, x, y, u = 0; kstring_t *str; - uint8_t *old_md, *old_nm; int32_t old_nm_i = -1, nm = 0; str = (kstring_t*)calloc(1, sizeof(kstring_t)); for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) { int j, l = cigar[i]>>4, op = cigar[i]&0xf; - if (op == BAM_CMATCH) { + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { for (j = 0; j < l; ++j) { int z = y + j; int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]]; if (ref[x+j] == 0) break; // out of boundary if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match - if (is_equal) seq[z/2] &= (z&1)? 0xf0 : 0x0f; + if (flag&USE_EQUAL) seq[z/2] &= (z&1)? 0xf0 : 0x0f; ++u; } else { - ksprintf(str, "%d", u); - kputc(ref[x+j], str); + kputw(u, str); kputc(ref[x+j], str); u = 0; ++nm; } } if (j < l) break; x += l; y += l; } else if (op == BAM_CDEL) { - ksprintf(str, "%d", u); - kputc('^', str); + kputw(u, str); kputc('^', str); for (j = 0; j < l; ++j) { if (ref[x+j] == 0) break; kputc(ref[x+j], str); @@ -57,12 +63,12 @@ void bam_fillmd1_core(bam1_t *b, char *ref, int is_equal, int max_nm) x += l; } } - ksprintf(str, "%d", u); + kputw(u, str); // apply max_nm if (max_nm > 0 && nm >= max_nm) { for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) { int j, l = cigar[i]>>4, op = cigar[i]&0xf; - if (op == BAM_CMATCH) { + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { for (j = 0; j < l; ++j) { int z = y + j; int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]]; @@ -79,38 +85,54 @@ void bam_fillmd1_core(bam1_t *b, char *ref, int is_equal, int max_nm) } } // update NM - old_nm = bam_aux_get(b, "NM"); - if (c->flag & BAM_FUNMAP) return; - if (old_nm) old_nm_i = bam_aux2i(old_nm); - if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); - else if (nm != old_nm_i) { - fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam1_qname(b), old_nm_i, nm); - bam_aux_del(b, old_nm); - bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); + if (flag & UPDATE_NM) { + uint8_t *old_nm = bam_aux_get(b, "NM"); + if (c->flag & BAM_FUNMAP) return; + if (old_nm) old_nm_i = bam_aux2i(old_nm); + if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); + else if (nm != old_nm_i) { + fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam1_qname(b), old_nm_i, nm); + bam_aux_del(b, old_nm); + bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); + } } // update MD - old_md = bam_aux_get(b, "MD"); - if (!old_md) bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s); - else { - int is_diff = 0; - if (strlen((char*)old_md+1) == str->l) { - for (i = 0; i < str->l; ++i) - if (toupper(old_md[i+1]) != toupper(str->s[i])) - break; - if (i < str->l) is_diff = 1; - } else is_diff = 1; - if (is_diff) { - fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam1_qname(b), old_md+1, str->s); - bam_aux_del(b, old_md); - bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s); + if (flag & UPDATE_MD) { + uint8_t *old_md = bam_aux_get(b, "MD"); + if (c->flag & BAM_FUNMAP) return; + if (!old_md) bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s); + else { + int is_diff = 0; + if (strlen((char*)old_md+1) == str->l) { + for (i = 0; i < str->l; ++i) + if (toupper(old_md[i+1]) != toupper(str->s[i])) + break; + if (i < str->l) is_diff = 1; + } else is_diff = 1; + if (is_diff) { + fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam1_qname(b), old_md+1, str->s); + bam_aux_del(b, old_md); + bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s); + } } } + // drop all tags but RG + if (flag&DROP_TAG) { + uint8_t *q = bam_aux_get(b, "RG"); + bam_aux_drop_other(b, q); + } + // reduce the resolution of base quality + if (flag&BIN_QUAL) { + uint8_t *qual = bam1_qual(b); + for (i = 0; i < b->core.l_qseq; ++i) + if (qual[i] >= 3) qual[i] = qual[i]/10*10 + 7; + } free(str->s); free(str); } -void bam_fillmd1(bam1_t *b, char *ref, int is_equal) +void bam_fillmd1(bam1_t *b, char *ref, int flag) { - bam_fillmd1_core(b, ref, is_equal, 0); + bam_fillmd1_core(b, ref, flag, 0); } int bam_cap_mapQ(bam1_t *b, char *ref, int thres) @@ -124,7 +146,7 @@ int bam_cap_mapQ(bam1_t *b, char *ref, int thres) mm = q = len = clip_l = clip_q = 0; for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) { int j, l = cigar[i]>>4, op = cigar[i]&0xf; - if (op == BAM_CMATCH) { + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { for (j = 0; j < l; ++j) { int z = y + j; int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]]; @@ -197,7 +219,7 @@ int bam_prob_realn_core(bam1_t *b, const char *ref, int flag) for (k = 0; k < c->n_cigar; ++k) { int op, l; op = cigar[k]&0xf; l = cigar[k]>>4; - if (op == BAM_CMATCH) { + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { if (yb < 0) yb = y; if (xb < 0) xb = x; ye = y + l; xe = x + l; @@ -233,7 +255,7 @@ int bam_prob_realn_core(bam1_t *b, const char *ref, int flag) if (!extend_baq) { // in this block, bq[] is capped by base quality qual[] for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) { int op = cigar[k]&0xf, l = cigar[k]>>4; - if (op == BAM_CMATCH) { + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { for (i = y; i < y + l; ++i) { if ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y)) bq[i] = 0; else bq[i] = bq[i] < q[i]? bq[i] : q[i]; @@ -248,7 +270,7 @@ int bam_prob_realn_core(bam1_t *b, const char *ref, int flag) left = calloc(c->l_qseq, 1); rght = calloc(c->l_qseq, 1); for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) { int op = cigar[k]&0xf, l = cigar[k]>>4; - if (op == BAM_CMATCH) { + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { for (i = y; i < y + l; ++i) bq[i] = ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y))? 0 : q[i]; for (left[y] = bq[y], i = y + 1; i < y + l; ++i) @@ -280,19 +302,24 @@ int bam_prob_realn(bam1_t *b, const char *ref) int bam_fillmd(int argc, char *argv[]) { - int c, is_equal, tid = -2, ret, len, is_bam_out, is_sam_in, is_uncompressed, max_nm, is_realn, capQ, baq_flag; + int c, flt_flag, tid = -2, ret, len, is_bam_out, is_sam_in, is_uncompressed, max_nm, is_realn, capQ, baq_flag; samfile_t *fp, *fpout = 0; faidx_t *fai; char *ref = 0, mode_w[8], mode_r[8]; bam1_t *b; - is_equal = is_bam_out = is_sam_in = is_uncompressed = is_realn = max_nm = capQ = baq_flag = 0; + flt_flag = UPDATE_NM | UPDATE_MD; + is_bam_out = is_sam_in = is_uncompressed = is_realn = max_nm = capQ = baq_flag = 0; mode_w[0] = mode_r[0] = 0; strcpy(mode_r, "r"); strcpy(mode_w, "w"); - while ((c = getopt(argc, argv, "EreubSC:n:A")) >= 0) { + while ((c = getopt(argc, argv, "EqreuNhbSC:n:Ad")) >= 0) { switch (c) { case 'r': is_realn = 1; break; - case 'e': is_equal = 1; break; + case 'e': flt_flag |= USE_EQUAL; break; + case 'd': flt_flag |= DROP_TAG; break; + case 'q': flt_flag |= BIN_QUAL; break; + case 'h': flt_flag |= HASH_QNM; break; + case 'N': flt_flag &= ~(UPDATE_MD|UPDATE_NM); break; case 'b': is_bam_out = 1; break; case 'u': is_uncompressed = is_bam_out = 1; break; case 'S': is_sam_in = 1; break; @@ -344,7 +371,7 @@ int bam_fillmd(int argc, char *argv[]) int q = bam_cap_mapQ(b, ref, capQ); if (b->core.qual > q) b->core.qual = q; } - if (ref) bam_fillmd1_core(b, ref, is_equal, max_nm); + if (ref) bam_fillmd1_core(b, ref, flt_flag, max_nm); } samwrite(fpout, b); } diff --git a/bam_pileup.c b/bam_pileup.c index 3e26f74..57434e0 100644 --- a/bam_pileup.c +++ b/bam_pileup.c @@ -78,12 +78,12 @@ static inline int resolve_cigar2(bam_pileup1_t *p, uint32_t pos, cstate_t *s) if (s->k == -1) { // never processed is_head = 1; if (c->n_cigar == 1) { // just one operation, save a loop - if (_cop(cigar[0]) == BAM_CMATCH) s->k = 0, s->x = c->pos, s->y = 0; + if (_cop(cigar[0]) == BAM_CMATCH || _cop(cigar[0]) == BAM_CEQUAL || _cop(cigar[0]) == BAM_CDIFF) s->k = 0, s->x = c->pos, s->y = 0; } else { // find the first match or deletion for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) { int op = _cop(cigar[k]); int l = _cln(cigar[k]); - if (op == BAM_CMATCH || op == BAM_CDEL) break; + if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CEQUAL || op == BAM_CDIFF) break; else if (op == BAM_CREF_SKIP) s->x += l; else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l; } @@ -95,16 +95,16 @@ static inline int resolve_cigar2(bam_pileup1_t *p, uint32_t pos, cstate_t *s) if (pos - s->x >= l) { // jump to the next operation assert(s->k < c->n_cigar); // otherwise a bug: this function should not be called in this case op = _cop(cigar[s->k+1]); - if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP) { // jump to the next without a loop - if (_cop(cigar[s->k]) == BAM_CMATCH) s->y += l; + if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { // jump to the next without a loop + if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l; s->x += l; ++s->k; - } else { // find the next M/D/N - if (_cop(cigar[s->k]) == BAM_CMATCH) s->y += l; + } else { // find the next M/D/N/=/X + if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l; s->x += l; for (k = s->k + 1; k < c->n_cigar; ++k) { op = _cop(cigar[k]), l = _cln(cigar[k]); - if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP) break; + if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) break; else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l; } s->k = k; @@ -126,12 +126,12 @@ static inline int resolve_cigar2(bam_pileup1_t *p, uint32_t pos, cstate_t *s) for (k = s->k + 2; k < c->n_cigar; ++k) { op2 = _cop(cigar[k]); l2 = _cln(cigar[k]); if (op2 == BAM_CINS) l3 += l2; - else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP) break; + else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP || op2 == BAM_CEQUAL || op2 == BAM_CDIFF) break; } if (l3 > 0) p->indel = l3; } } - if (op == BAM_CMATCH) { + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { p->qpos = s->y + (pos - s->x); } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) { p->is_del = 1; p->qpos = s->y; // FIXME: distinguish D and N!!!!! diff --git a/kseq.h b/kseq.h index 82face0..0bbc7dc 100644 --- a/kseq.h +++ b/kseq.h @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2008 Genome Research Ltd (GRL). + Copyright (c) 2008, 2009, 2011 Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -23,13 +23,7 @@ SOFTWARE. */ -/* Contact: Heng Li */ - -/* - 2009-07-16 (lh3): in kstream_t, change "char*" to "unsigned char*" - */ - -/* Last Modified: 12APR2009 */ +/* Last Modified: 18AUG2011 */ #ifndef AC_KSEQ_H #define AC_KSEQ_H @@ -94,10 +88,10 @@ typedef struct __kstring_t { #endif #define __KS_GETUNTIL(__read, __bufsize) \ - static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ + static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \ { \ if (dret) *dret = 0; \ - str->l = 0; \ + str->l = append? str->l : 0; \ if (ks->begin >= ks->end && ks->is_eof) return -1; \ for (;;) { \ int i; \ @@ -132,13 +126,15 @@ typedef struct __kstring_t { break; \ } \ } \ - if (str->l == 0) { \ + if (str->s == 0) { \ str->m = 1; \ str->s = (char*)calloc(1, 1); \ } \ str->s[str->l] = '\0'; \ return str->l; \ - } + } \ + static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ + { return ks_getuntil2(ks, delimiter, str, dret, 0); } #define KSTREAM_INIT(type_t, __read, __bufsize) \ __KS_TYPE(type_t) \ @@ -171,44 +167,45 @@ typedef struct __kstring_t { -1 end-of-file -2 truncated quality string */ -#define __KSEQ_READ \ - static int kseq_read(kseq_t *seq) \ - { \ - int c; \ - kstream_t *ks = seq->f; \ +#define __KSEQ_READ \ + static int kseq_read(kseq_t *seq) \ + { \ + int c; \ + kstream_t *ks = seq->f; \ if (seq->last_char == 0) { /* then jump to the next header line */ \ - while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ - if (c == -1) return -1; /* end of file */ \ - seq->last_char = c; \ - } /* the first header char has been read */ \ - seq->comment.l = seq->seq.l = seq->qual.l = 0; \ - if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; \ - if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); \ + while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ + if (c == -1) return -1; /* end of file */ \ + seq->last_char = c; \ + } /* else: the first header char has been read in the previous call */ \ + seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \ + if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \ + if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); /* read FASTA/Q comment */ \ + if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \ + seq->seq.m = 256; \ + seq->seq.s = (char*)malloc(seq->seq.m); \ + } \ while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ - if (isgraph(c)) { /* printable non-space character */ \ - if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \ - seq->seq.m = seq->seq.l + 2; \ - kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \ - seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ - } \ - seq->seq.s[seq->seq.l++] = (char)c; \ - } \ - } \ + seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \ + ks_getuntil2(ks, '\n', &seq->seq, 0, 1); /* read the rest of the line */ \ + } \ if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ - seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ - if (c != '+') return seq->seq.l; /* FASTA */ \ - if (seq->qual.m < seq->seq.m) { /* allocate enough memory */ \ - seq->qual.m = seq->seq.m; \ - seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ - } \ + if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \ + seq->seq.m = seq->seq.l + 2; \ + kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \ + seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ + } \ + seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ + if (c != '+') return seq->seq.l; /* FASTA */ \ + if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \ + seq->qual.m = seq->seq.m; \ + seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ + } \ while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ - if (c == -1) return -2; /* we should not stop here */ \ - while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l) \ - if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \ - seq->qual.s[seq->qual.l] = 0; /* null terminated string */ \ + if (c == -1) return -2; /* error: no quality string */ \ + while (ks_getuntil2(ks, '\n', &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \ seq->last_char = 0; /* we have not come to the next header line */ \ - if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \ - return seq->seq.l; \ + if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \ + return seq->seq.l; \ } #define __KSEQ_TYPE(type_t) \ @@ -219,7 +216,7 @@ typedef struct __kstring_t { } kseq_t; #define KSEQ_INIT(type_t, __read) \ - KSTREAM_INIT(type_t, __read, 4096) \ + KSTREAM_INIT(type_t, __read, 16384) \ __KSEQ_TYPE(type_t) \ __KSEQ_BASIC(type_t) \ __KSEQ_READ diff --git a/misc/Makefile b/misc/Makefile index 6c25c78..d2f8bd8 100644 --- a/misc/Makefile +++ b/misc/Makefile @@ -4,7 +4,7 @@ CFLAGS= -g -Wall -O2 #-m64 #-arch ppc CXXFLAGS= $(CFLAGS) DFLAGS= -D_FILE_OFFSET_BITS=64 OBJS= -PROG= md5sum-lite md5fa maq2sam-short maq2sam-long wgsim +PROG= md5sum-lite md5fa maq2sam-short maq2sam-long wgsim seqtk INCLUDES= -I.. SUBDIRS= . @@ -27,11 +27,11 @@ lib-recur all-recur clean-recur cleanlocal-recur install-recur: lib: -afs2:afs2.o - $(CC) $(CFLAGS) -o $@ afs2.o -lm -lz -L.. -lbam +seqtk:seqtk.o + $(CC) $(CFLAGS) -o $@ seqtk.o -lm -lz wgsim:wgsim.o - $(CC) $(CFLAGS) -o $@ wgsim.o -lm + $(CC) $(CFLAGS) -o $@ wgsim.o -lm -lz md5fa:md5.o md5fa.o md5.h ../kseq.h $(CC) $(CFLAGS) -o $@ md5.o md5fa.o -lz @@ -51,8 +51,11 @@ maq2sam-long:maq2sam.c md5fa.o:md5.h md5fa.c $(CC) $(CFLAGS) -c -I.. -o $@ md5fa.c -afs2.o:afs2.c ../bam.h - $(CC) $(CFLAGS) -c -I.. -o $@ afs2.c +seqtk.o:seqtk.c ../khash.h ../kseq.h + $(CC) $(CFLAGS) -c -I.. -o $@ seqtk.c + +wgsim.o:wgsim.c ../kseq.h + $(CC) $(CFLAGS) -c -I.. -o $@ wgsim.c cleanlocal: rm -fr gmon.out *.o a.out *.exe *.dSYM $(PROG) *~ *.a diff --git a/misc/seqtk.c b/misc/seqtk.c new file mode 100644 index 0000000..591ddff --- /dev/null +++ b/misc/seqtk.c @@ -0,0 +1,783 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kseq.h" +KSEQ_INIT(gzFile, gzread) + +typedef struct { + int n, m; + uint64_t *a; +} reglist_t; + +#include "khash.h" +KHASH_MAP_INIT_STR(reg, reglist_t) + +typedef kh_reg_t reghash_t; + +reghash_t *stk_reg_read(const char *fn) +{ + reghash_t *h = kh_init(reg); + gzFile fp; + kstream_t *ks; + int dret; + kstring_t *str; + // read the list + str = calloc(1, sizeof(kstring_t)); + fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); + ks = ks_init(fp); + while (ks_getuntil(ks, 0, str, &dret) >= 0) { + int beg = -1, end = -1; + reglist_t *p; + khint_t k = kh_get(reg, h, str->s); + if (k == kh_end(h)) { + int ret; + char *s = strdup(str->s); + k = kh_put(reg, h, s, &ret); + memset(&kh_val(h, k), 0, sizeof(reglist_t)); + } + p = &kh_val(h, k); + if (dret != '\n') { + if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) { + beg = atoi(str->s); + if (dret != '\n') { + if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) { + end = atoi(str->s); + if (end < 0) end = -1; + } + } + } + } + // skip the rest of the line + if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); + if (end < 0 && beg > 0) end = beg, beg = beg - 1; // if there is only one column + if (beg < 0) beg = 0, end = INT_MAX; + if (p->n == p->m) { + p->m = p->m? p->m<<1 : 4; + p->a = realloc(p->a, p->m * 8); + } + p->a[p->n++] = (uint64_t)beg<<32 | end; + } + ks_destroy(ks); + gzclose(fp); + free(str->s); free(str); + return h; +} + +void stk_reg_destroy(reghash_t *h) +{ + khint_t k; + for (k = 0; k < kh_end(h); ++k) { + if (kh_exist(h, k)) { + free(kh_val(h, k).a); + free((char*)kh_key(h, k)); + } + } + kh_destroy(reg, h); +} + +/* constant table */ + +unsigned char seq_nt16_table[256] = { + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15 /*'-'*/,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, + 15,15, 5, 6, 8,15, 7, 9, 0,10,15,15, 15,15,15,15, + 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, + 15,15, 5, 6, 8,15, 7, 9, 0,10,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15 +}; + +char *seq_nt16_rev_table = "XACMGRSVTWYHKDBN"; +unsigned char seq_nt16to4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; +int bitcnt_table[] = { 4, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 }; + +/* composition */ +int stk_comp(int argc, char *argv[]) +{ + gzFile fp; + kseq_t *seq; + int l, c, upper_only = 0; + reghash_t *h = 0; + reglist_t dummy; + while ((c = getopt(argc, argv, "ur:")) >= 0) { + switch (c) { + case 'u': upper_only = 1; break; + case 'r': h = stk_reg_read(optarg); break; + } + } + if (argc == optind) { + fprintf(stderr, "Usage: seqtk comp [-u] [-r in.bed] \n\n"); + fprintf(stderr, "Output format: chr, length, #A, #C, #G, #T, #2, #3, #4, #CpG, #tv, #ts, #CpG-ts\n"); + return 1; + } + fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r"); + seq = kseq_init(fp); + dummy.n= dummy.m = 1; dummy.a = calloc(1, 8); + while ((l = kseq_read(seq)) >= 0) { + int i, k; + reglist_t *p = 0; + if (h) { + khint_t k = kh_get(reg, h, seq->name.s); + if (k != kh_end(h)) p = &kh_val(h, k); + } else { + p = &dummy; + dummy.a[0] = l; + } + for (k = 0; p && k < p->n; ++k) { + int beg = p->a[k]>>32, end = p->a[k]&0xffffffff; + int la, lb, lc, na, nb, nc, cnt[11]; + if (beg > 0) la = seq->seq.s[beg-1], lb = seq_nt16_table[la], lc = bitcnt_table[lb]; + else la = 'a', lb = -1, lc = 0; + na = seq->seq.s[beg]; nb = seq_nt16_table[na]; nc = bitcnt_table[nb]; + memset(cnt, 0, 11 * sizeof(int)); + for (i = beg; i < end; ++i) { + int is_CpG = 0, a, b, c; + a = na; b = nb; c = nc; + na = seq->seq.s[i+1]; nb = seq_nt16_table[na]; nc = bitcnt_table[nb]; + if (b == 2 || b == 10) { // C or Y + if (nb == 4 || nb == 5) is_CpG = 1; + } else if (b == 4 || b == 5) { // G or R + if (lb == 2 || lb == 10) is_CpG = 1; + } + if (upper_only == 0 || isupper(a)) { + if (c > 1) ++cnt[c+2]; + if (c == 1) ++cnt[seq_nt16to4_table[b]]; + if (b == 10 || b == 5) ++cnt[9]; + else if (c == 2) { + ++cnt[8]; + } + if (is_CpG) { + ++cnt[7]; + if (b == 10 || b == 5) ++cnt[10]; + } + } + la = a; lb = b; lc = c; + } + if (h) printf("%s\t%d\t%d", seq->name.s, beg, end); + else printf("%s\t%d", seq->name.s, l); + for (i = 0; i < 11; ++i) printf("\t%d", cnt[i]); + putchar('\n'); + } + fflush(stdout); + } + free(dummy.a); + kseq_destroy(seq); + gzclose(fp); + return 0; +} + +int stk_randbase(int argc, char *argv[]) +{ + gzFile fp; + kseq_t *seq; + int l; + if (argc == 1) { + fprintf(stderr, "Usage: seqtk randbase \n"); + return 1; + } + fp = (strcmp(argv[1], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[1], "r"); + seq = kseq_init(fp); + while ((l = kseq_read(seq)) >= 0) { + int i; + printf(">%s", seq->name.s); + for (i = 0; i < l; ++i) { + int c, b, a, j, k, m; + b = seq->seq.s[i]; + c = seq_nt16_table[b]; + a = bitcnt_table[c]; + if (a == 2) { + m = (drand48() < 0.5); + for (j = k = 0; j < 4; ++j) { + if ((1<seq.s[i] = islower(b)? "acgt"[j] : "ACGT"[j]; + } + if (i%60 == 0) putchar('\n'); + putchar(seq->seq.s[i]); + } + putchar('\n'); + } + kseq_destroy(seq); + gzclose(fp); + return 0; +} + +int stk_hety(int argc, char *argv[]) +{ + gzFile fp; + kseq_t *seq; + int l, c, win_size = 50000, n_start = 5, win_step, is_lower_mask = 0; + char *buf; + uint32_t cnt[3]; + if (argc == 1) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: seqtk hety [options] \n\n"); + fprintf(stderr, "Options: -w INT window size [%d]\n", win_size); + fprintf(stderr, " -t INT # start positions in a window [%d]\n", n_start); + fprintf(stderr, " -m treat lowercases as masked\n"); + fprintf(stderr, "\n"); + return 1; + } + while ((c = getopt(argc, argv, "w:t:m")) >= 0) { + switch (c) { + case 'w': win_size = atoi(optarg); break; + case 't': n_start = atoi(optarg); break; + case 'm': is_lower_mask = 1; break; + } + } + fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r"); + seq = kseq_init(fp); + win_step = win_size / n_start; + buf = calloc(win_size, 1); + while ((l = kseq_read(seq)) >= 0) { + int x, i, y, z, next = 0; + cnt[0] = cnt[1] = cnt[2] = 0; + for (i = 0; i <= l; ++i) { + if ((i >= win_size && i % win_step == 0) || i == l) { + if (i == l && l >= win_size) { + for (y = l - win_size; y < next; ++y) --cnt[(int)buf[y % win_size]]; + } + if (cnt[1] + cnt[2] > 0) + printf("%s\t%d\t%d\t%.2lf\t%d\t%d\n", seq->name.s, next, i, + (double)cnt[2] / (cnt[1] + cnt[2]) * win_size, cnt[1] + cnt[2], cnt[2]); + next = i; + } + if (i < l) { + y = i % win_size; + c = seq->seq.s[i]; + if (is_lower_mask && islower(c)) c = 'N'; + c = seq_nt16_table[c]; + x = bitcnt_table[c]; + if (i >= win_size) --cnt[(int)buf[y]]; + buf[y] = z = x > 2? 0 : x == 2? 2 : 1; + ++cnt[z]; + } + } + } + free(buf); + kseq_destroy(seq); + gzclose(fp); + return 0; +} + +/* fq2fa */ +int stk_fq2fa(int argc, char *argv[]) +{ + gzFile fp; + kseq_t *seq; + char *buf; + int l, i, c, qual_thres = 0, linelen = 60; + while ((c = getopt(argc, argv, "q:l:")) >= 0) { + switch (c) { + case 'q': qual_thres = atoi(optarg); break; + case 'l': linelen = atoi(optarg); break; + } + } + if (argc == optind) { + fprintf(stderr, "Usage: seqtk fq2fa [-q qualThres=0] [-l lineLen=60] \n"); + return 1; + } + buf = linelen > 0? malloc(linelen + 1) : 0; + fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); + seq = kseq_init(fp); + while ((l = kseq_read(seq)) >= 0) { + if (seq->qual.l && qual_thres > 0) { + for (i = 0; i < l; ++i) + if (seq->qual.s[i] - 33 < qual_thres) + seq->seq.s[i] = tolower(seq->seq.s[i]); + } + putchar('>'); + if (seq->comment.l) { + fputs(seq->name.s, stdout); + putchar(' '); + puts(seq->comment.s); + } else puts(seq->name.s); + if (buf) { // multi-line + for (i = 0; i < l; i += linelen) { + int x = i + linelen < l? linelen : l - i; + memcpy(buf, seq->seq.s + i, x); + buf[x] = 0; + puts(buf); + } + } else puts(seq->seq.s); + } + free(buf); + kseq_destroy(seq); + gzclose(fp); + return 0; +} + +int stk_maskseq(int argc, char *argv[]) +{ + khash_t(reg) *h = kh_init(reg); + gzFile fp; + kseq_t *seq; + int l, i, j, c, is_complement = 0, is_lower = 0; + khint_t k; + while ((c = getopt(argc, argv, "cl")) >= 0) { + switch (c) { + case 'c': is_complement = 1; break; + case 'l': is_lower = 1; break; + } + } + if (argc - optind < 2) { + fprintf(stderr, "Usage: seqtk maskseq [-cl] \n\n"); + fprintf(stderr, "Options: -c mask the complement regions\n"); + fprintf(stderr, " -l soft mask (to lower cases)\n"); + return 1; + } + h = stk_reg_read(argv[optind+1]); + // maskseq + fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); + seq = kseq_init(fp); + while ((l = kseq_read(seq)) >= 0) { + k = kh_get(reg, h, seq->name.s); + if (k == kh_end(h)) { // not found in the hash table + if (is_complement) { + for (j = 0; j < l; ++j) + seq->seq.s[j] = is_lower? tolower(seq->seq.s[j]) : 'N'; + } + } else { + reglist_t *p = &kh_val(h, k); + if (!is_complement) { + for (i = 0; i < p->n; ++i) { + int beg = p->a[i]>>32, end = p->a[i]; + if (beg >= seq->seq.l) { + fprintf(stderr, "[maskseq] start position >= the sequence length.\n"); + continue; + } + if (end >= seq->seq.l) end = seq->seq.l; + if (is_lower) for (j = beg; j < end; ++j) seq->seq.s[j] = tolower(seq->seq.s[j]); + else for (j = beg; j < end; ++j) seq->seq.s[j] = 'N'; + } + } else { + int8_t *mask = calloc(seq->seq.l, 1); + for (i = 0; i < p->n; ++i) { + int beg = p->a[i]>>32, end = p->a[i]; + if (end >= seq->seq.l) end = seq->seq.l; + for (j = beg; j < end; ++j) mask[j] = 1; + } + for (j = 0; j < l; ++j) + if (mask[j] == 0) seq->seq.s[j] = is_lower? tolower(seq->seq.s[j]) : 'N'; + free(mask); + } + } + printf(">%s", seq->name.s); + for (j = 0; j < seq->seq.l; ++j) { + if (j%60 == 0) putchar('\n'); + putchar(seq->seq.s[j]); + } + putchar('\n'); + } + // free + kseq_destroy(seq); + gzclose(fp); + stk_reg_destroy(h); + return 0; +} + +/* subseq */ + +int stk_subseq(int argc, char *argv[]) +{ + khash_t(reg) *h = kh_init(reg); + gzFile fp; + kseq_t *seq; + int l, i, j, c, is_tab = 0; + khint_t k; + while ((c = getopt(argc, argv, "t")) >= 0) { + switch (c) { + case 't': is_tab = 1; break; + } + } + if (optind + 2 > argc) { + fprintf(stderr, "Usage: seqtk subseq [-t] \n\n"); + fprintf(stderr, "Note: Use 'samtools faidx' if only a few regions are intended.\n"); + return 1; + } + h = stk_reg_read(argv[optind+1]); + // subseq + fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); + seq = kseq_init(fp); + while ((l = kseq_read(seq)) >= 0) { + reglist_t *p; + k = kh_get(reg, h, seq->name.s); + if (k == kh_end(h)) continue; + p = &kh_val(h, k); + for (i = 0; i < p->n; ++i) { + int beg = p->a[i]>>32, end = p->a[i]; + if (beg >= seq->seq.l) { + fprintf(stderr, "[subseq] %s: %d >= %ld\n", seq->name.s, beg, seq->seq.l); + continue; + } + if (end > seq->seq.l) end = seq->seq.l; + if (is_tab == 0) { + printf("%c%s", seq->qual.l == seq->seq.l? '@' : '>', seq->name.s); + if (end == INT_MAX) { + if (beg) printf(":%d", beg+1); + } else printf(":%d-%d", beg+1, end); + } else printf("%s\t%d\t", seq->name.s, beg + 1); + if (end > seq->seq.l) end = seq->seq.l; + for (j = 0; j < end - beg; ++j) { + if (is_tab == 0 && j % 60 == 0) putchar('\n'); + putchar(seq->seq.s[j + beg]); + } + putchar('\n'); + if (seq->qual.l != seq->seq.l || is_tab) continue; + printf("+"); + for (j = 0; j < end - beg; ++j) { + if (j % 60 == 0) putchar('\n'); + putchar(seq->qual.s[j + beg]); + } + putchar('\n'); + } + } + // free + kseq_destroy(seq); + gzclose(fp); + stk_reg_destroy(h); + return 0; +} + +/* mergefa */ +int stk_mergefa(int argc, char *argv[]) +{ + gzFile fp[2]; + kseq_t *seq[2]; + int i, l, c, is_intersect = 0, is_haploid = 0, qual = 0, is_mask = 0; + while ((c = getopt(argc, argv, "himq:")) >= 0) { + switch (c) { + case 'i': is_intersect = 1; break; + case 'h': is_haploid = 1; break; + case 'm': is_mask = 1; break; + case 'q': qual = atoi(optarg); break; + } + } + if (is_mask && is_intersect) { + fprintf(stderr, "[%s] `-i' and `-h' cannot be applied at the same time.\n", __func__); + return 1; + } + if (optind + 2 > argc) { + fprintf(stderr, "\nUsage: seqtk mergefa [options] \n\n"); + fprintf(stderr, "Options: -q INT quality threshold [0]\n"); + fprintf(stderr, " -i take intersection\n"); + fprintf(stderr, " -m convert to lowercase when one of the input base is N.\n"); + fprintf(stderr, " -h suppress hets in the input\n\n"); + return 1; + } + for (i = 0; i < 2; ++i) { + fp[i] = strcmp(argv[optind+i], "-")? gzopen(argv[optind+i], "r") : gzdopen(fileno(stdin), "r"); + seq[i] = kseq_init(fp[i]); + } + while (kseq_read(seq[0]) >= 0) { + int min_l, c[2], is_upper; + kseq_read(seq[1]); + if (strcmp(seq[0]->name.s, seq[1]->name.s)) + fprintf(stderr, "[%s] Different sequence names: %s != %s\n", __func__, seq[0]->name.s, seq[1]->name.s); + if (seq[0]->seq.l != seq[1]->seq.l) + fprintf(stderr, "[%s] Unequal sequence length: %ld != %ld\n", __func__, seq[0]->seq.l, seq[1]->seq.l); + min_l = seq[0]->seq.l < seq[1]->seq.l? seq[0]->seq.l : seq[1]->seq.l; + printf(">%s", seq[0]->name.s); + for (l = 0; l < min_l; ++l) { + c[0] = seq[0]->seq.s[l]; c[1] = seq[1]->seq.s[l]; + if (seq[0]->qual.l && seq[0]->qual.s[l] - 33 < qual) c[0] = tolower(c[0]); + if (seq[1]->qual.l && seq[1]->qual.s[l] - 33 < qual) c[1] = tolower(c[1]); + if (is_intersect) is_upper = (isupper(c[0]) || isupper(c[1]))? 1 : 0; + else if (is_mask) is_upper = (isupper(c[0]) || isupper(c[1]))? 1 : 0; + else is_upper = (isupper(c[0]) && isupper(c[1]))? 1 : 0; + c[0] = seq_nt16_table[c[0]]; c[1] = seq_nt16_table[c[1]]; + if (c[0] == 0) c[0] = 15; + if (c[1] == 0) c[1] = 15; + if (is_haploid && (bitcnt_table[c[0]] > 1 || bitcnt_table[c[1]] > 1)) is_upper = 0; + if (is_intersect) { + c[0] = c[0] & c[1]; + if (c[0] == 0) is_upper = 0; + } else if (is_mask) { + if (c[0] == 15 || c[1] == 15) is_upper = 0; + c[0] = c[0] & c[1]; + if (c[0] == 0) is_upper = 0; + } else c[0] = c[0] | c[1]; + c[0] = seq_nt16_rev_table[c[0]]; + if (!is_upper) c[0] = tolower(c[0]); + if (l%60 == 0) putchar('\n'); + putchar(c[0]); + } + putchar('\n'); + } + return 0; +} + +int stk_famask(int argc, char *argv[]) +{ + gzFile fp[2]; + kseq_t *seq[2]; + int i, l; + if (argc < 3) { + fprintf(stderr, "Usage: seqtk famask \n"); + return 1; + } + for (i = 0; i < 2; ++i) { + fp[i] = strcmp(argv[optind+i], "-")? gzopen(argv[optind+i], "r") : gzdopen(fileno(stdin), "r"); + seq[i] = kseq_init(fp[i]); + } + while (kseq_read(seq[0]) >= 0) { + int min_l, c[2]; + kseq_read(seq[1]); + if (strcmp(seq[0]->name.s, seq[1]->name.s)) + fprintf(stderr, "[%s] Different sequence names: %s != %s\n", __func__, seq[0]->name.s, seq[1]->name.s); + if (seq[0]->seq.l != seq[1]->seq.l) + fprintf(stderr, "[%s] Unequal sequence length: %ld != %ld\n", __func__, seq[0]->seq.l, seq[1]->seq.l); + min_l = seq[0]->seq.l < seq[1]->seq.l? seq[0]->seq.l : seq[1]->seq.l; + printf(">%s", seq[0]->name.s); + for (l = 0; l < min_l; ++l) { + c[0] = seq[0]->seq.s[l]; c[1] = seq[1]->seq.s[l]; + if (c[1] == 'x') c[0] = tolower(c[0]); + else if (c[1] != 'X') c[0] = c[1]; + if (l%60 == 0) putchar('\n'); + putchar(c[0]); + } + putchar('\n'); + } + return 0; +} + +int stk_mutfa(int argc, char *argv[]) +{ + khash_t(reg) *h = kh_init(reg); + gzFile fp; + kseq_t *seq; + kstream_t *ks; + int l, i, dret; + kstring_t *str; + khint_t k; + if (argc < 3) { + fprintf(stderr, "Usage: seqtk mutfa \n\n"); + fprintf(stderr, "Note: contains at least four columns per line which are:\n"); + fprintf(stderr, " 'chr 1-based-pos any base-changed-to'.\n"); + return 1; + } + // read the list + str = calloc(1, sizeof(kstring_t)); + fp = strcmp(argv[2], "-")? gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r"); + ks = ks_init(fp); + while (ks_getuntil(ks, 0, str, &dret) >= 0) { + char *s = strdup(str->s); + int beg = 0, ret; + reglist_t *p; + k = kh_get(reg, h, s); + if (k == kh_end(h)) { + k = kh_put(reg, h, s, &ret); + memset(&kh_val(h, k), 0, sizeof(reglist_t)); + } + p = &kh_val(h, k); + if (ks_getuntil(ks, 0, str, &dret) > 0) beg = atol(str->s) - 1; // 2nd col + ks_getuntil(ks, 0, str, &dret); // 3rd col + ks_getuntil(ks, 0, str, &dret); // 4th col + // skip the rest of the line + if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); + if (isalpha(str->s[0]) && str->l == 1) { + if (p->n == p->m) { + p->m = p->m? p->m<<1 : 4; + p->a = realloc(p->a, p->m * 8); + } + p->a[p->n++] = (uint64_t)beg<<32 | str->s[0]; + } + } + ks_destroy(ks); + gzclose(fp); + free(str->s); free(str); + // mutfa + fp = strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r"); + seq = kseq_init(fp); + while ((l = kseq_read(seq)) >= 0) { + reglist_t *p; + k = kh_get(reg, h, seq->name.s); + if (k != kh_end(h)) { + p = &kh_val(h, k); + for (i = 0; i < p->n; ++i) { + int beg = p->a[i]>>32; + if (beg < seq->seq.l) + seq->seq.s[beg] = (int)p->a[i]; + } + } + printf(">%s", seq->name.s); + for (i = 0; i < l; ++i) { + if (i%60 == 0) putchar('\n'); + putchar(seq->seq.s[i]); + } + putchar('\n'); + } + // free + kseq_destroy(seq); + gzclose(fp); + for (k = 0; k < kh_end(h); ++k) { + if (kh_exist(h, k)) { + free(kh_val(h, k).a); + free((char*)kh_key(h, k)); + } + } + kh_destroy(reg, h); + return 0; +} + +int stk_listhet(int argc, char *argv[]) +{ + gzFile fp; + kseq_t *seq; + int i, l; + if (argc == 1) { + fprintf(stderr, "Usage: seqtk listhet \n"); + return 1; + } + fp = (strcmp(argv[1], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[1], "r"); + seq = kseq_init(fp); + while ((l = kseq_read(seq)) >= 0) { + for (i = 0; i < l; ++i) { + int b = seq->seq.s[i]; + if (bitcnt_table[seq_nt16_table[b]] == 2) + printf("%s\t%d\t%c\n", seq->name.s, i+1, b); + } + } + kseq_destroy(seq); + gzclose(fp); + return 0; +} + +/* cutN */ +static int cutN_min_N_tract = 1000; +static int cutN_nonN_penalty = 10; + +static int find_next_cut(const kseq_t *ks, int k, int *begin, int *end) +{ + int i, b, e; + while (k < ks->seq.l) { + if (seq_nt16_table[(int)ks->seq.s[k]] == 15) { + int score, max; + score = 0; e = max = -1; + for (i = k; i < ks->seq.l && score >= 0; ++i) { /* forward */ + if (seq_nt16_table[(int)ks->seq.s[i]] == 15) ++score; + else score -= cutN_nonN_penalty; + if (score > max) max = score, e = i; + } + score = 0; b = max = -1; + for (i = e; i >= 0 && score >= 0; --i) { /* backward */ + if (seq_nt16_table[(int)ks->seq.s[i]] == 15) ++score; + else score -= cutN_nonN_penalty; + if (score > max) max = score, b = i; + } + if (e + 1 - b >= cutN_min_N_tract) { + *begin = b; + *end = e + 1; + return *end; + } + k = e + 1; + } else ++k; + } + return -1; +} +static void print_seq(FILE *fpout, const kseq_t *ks, int begin, int end) +{ + int i; + if (begin >= end) return; // FIXME: why may this happen? Understand it! + fprintf(fpout, ">%s:%d-%d", ks->name.s, begin+1, end); + for (i = begin; i < end && i < ks->seq.l; ++i) { + if ((i - begin)%60 == 0) fputc('\n', fpout); + fputc(ks->seq.s[i], fpout); + } + fputc('\n', fpout); +} +int stk_cutN(int argc, char *argv[]) +{ + int c, l, gap_only = 0; + gzFile fp; + kseq_t *ks; + while ((c = getopt(argc, argv, "n:p:g")) >= 0) { + switch (c) { + case 'n': cutN_min_N_tract = atoi(optarg); break; + case 'p': cutN_nonN_penalty = atoi(optarg); break; + case 'g': gap_only = 1; break; + default: return 1; + } + } + if (argc == optind) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: seqtk cutN [options] \n\n"); + fprintf(stderr, "Options: -n INT min size of N tract [%d]\n", cutN_min_N_tract); + fprintf(stderr, " -p INT penalty for a non-N [%d]\n", cutN_nonN_penalty); + fprintf(stderr, " -g print gaps only, no sequence\n\n"); + return 1; + } + fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r"); + ks = kseq_init(fp); + while ((l = kseq_read(ks)) >= 0) { + int k = 0, begin = 0, end = 0; + while (find_next_cut(ks, k, &begin, &end) >= 0) { + if (begin != 0) { + if (gap_only) printf("%s\t%d\t%d\n", ks->name.s, begin, end); + else print_seq(stdout, ks, k, begin); + } + k = end; + } + if (!gap_only) print_seq(stdout, ks, k, l); + } + kseq_destroy(ks); + gzclose(fp); + return 0; +} + +/* main function */ +static int usage() +{ + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: seqtk \n\n"); + fprintf(stderr, "Command: comp get the nucleotide composite of FASTA/Q\n"); + fprintf(stderr, " hety regional heterozygosity\n"); + fprintf(stderr, " fq2fa convert FASTQ to FASTA\n"); + fprintf(stderr, " subseq extract subsequences from FASTA/Q\n"); + fprintf(stderr, " maskseq mask sequences\n"); + fprintf(stderr, " mutfa point mutate FASTA at specified positions\n"); + fprintf(stderr, " mergefa merge two FASTA/Q files\n"); + fprintf(stderr, " randbase choose a random base from hets\n"); + fprintf(stderr, " cutN cut sequence at long N\n"); + fprintf(stderr, " listhet extract the position of each het\n"); + fprintf(stderr, "\n"); + return 1; +} + +int main(int argc, char *argv[]) +{ + if (argc == 1) return usage(); + if (strcmp(argv[1], "comp") == 0) stk_comp(argc-1, argv+1); + else if (strcmp(argv[1], "hety") == 0) stk_hety(argc-1, argv+1); + else if (strcmp(argv[1], "fq2fa") == 0) stk_fq2fa(argc-1, argv+1); + else if (strcmp(argv[1], "subseq") == 0) stk_subseq(argc-1, argv+1); + else if (strcmp(argv[1], "maskseq") == 0) stk_maskseq(argc-1, argv+1); + else if (strcmp(argv[1], "mutfa") == 0) stk_mutfa(argc-1, argv+1); + else if (strcmp(argv[1], "mergefa") == 0) stk_mergefa(argc-1, argv+1); + else if (strcmp(argv[1], "randbase") == 0) stk_randbase(argc-1, argv+1); + else if (strcmp(argv[1], "cutN") == 0) stk_cutN(argc-1, argv+1); + else if (strcmp(argv[1], "listhet") == 0) stk_listhet(argc-1, argv+1); + else if (strcmp(argv[1], "famask") == 0) stk_famask(argc-1, argv+1); + else { + fprintf(stderr, "[main] unrecognized commad '%s'. Abort!\n", argv[1]); + return 1; + } + return 0; +} diff --git a/misc/wgsim.c b/misc/wgsim.c index 7b5f095..b9c513c 100644 --- a/misc/wgsim.c +++ b/misc/wgsim.c @@ -1,6 +1,7 @@ /* The MIT License Copyright (c) 2008 Genome Research Ltd (GRL). + 2011 Heng Li Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -23,11 +24,8 @@ SOFTWARE. */ -/* Contact: Heng Li */ - /* This program is separated from maq's read simulator with Colin - * Hercus' modification to allow longer indels. Colin is the chief - * developer of novoalign. */ + * Hercus' modification to allow longer indels. */ #include #include @@ -38,8 +36,11 @@ #include #include #include +#include +#include "kseq.h" +KSEQ_INIT(gzFile, gzread) -#define PACKAGE_VERSION "0.2.3" +#define PACKAGE_VERSION "0.3.0" const uint8_t nst_nt4_table[256] = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, @@ -60,8 +61,6 @@ const uint8_t nst_nt4_table[256] = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 }; -const int nst_color_space_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4}; - /* Simple normal random number generator, copied from genran.c */ double ran_normal() @@ -85,78 +84,6 @@ double ran_normal() } } -/* FASTA parser, copied from seq.c */ - -typedef struct { - int l, m; /* length and maximum buffer size */ - unsigned char *s; /* sequence */ -} seq_t; - -#define INIT_SEQ(seq) (seq).s = 0; (seq).l = (seq).m = 0 - -static int SEQ_BLOCK_SIZE = 512; - -void seq_set_block_size(int size) -{ - SEQ_BLOCK_SIZE = size; -} - -int seq_read_fasta(FILE *fp, seq_t *seq, char *locus, char *comment) -{ - int c, l, max; - char *p; - - c = 0; - while (!feof(fp) && fgetc(fp) != '>'); - if (feof(fp)) return -1; - p = locus; - while (!feof(fp) && (c = fgetc(fp)) != ' ' && c != '\t' && c != '\n') - if (c != '\r') *p++ = c; - *p = '\0'; - if (comment) { - p = comment; - if (c != '\n') { - while (!feof(fp) && ((c = fgetc(fp)) == ' ' || c == '\t')); - if (c != '\n') { - *p++ = c; - while (!feof(fp) && (c = fgetc(fp)) != '\n') - if (c != '\r') *p++ = c; - } - } - *p = '\0'; - } else if (c != '\n') while (!feof(fp) && fgetc(fp) != '\n'); - l = 0; max = seq->m; - while (!feof(fp) && (c = fgetc(fp)) != '>') { - if (isalpha(c) || c == '-' || c == '.') { - if (l + 1 >= max) { - max += SEQ_BLOCK_SIZE; - seq->s = (unsigned char*)realloc(seq->s, sizeof(char) * max); - } - seq->s[l++] = (unsigned char)c; - } - } - if (c == '>') ungetc(c,fp); - seq->s[l] = 0; - seq->m = max; seq->l = l; - return l; -} - -/* Error-checking open, copied from utils.c */ - -#define xopen(fn, mode) err_xopen_core(__func__, fn, mode) - -FILE *err_xopen_core(const char *func, const char *fn, const char *mode) -{ - FILE *fp = 0; - if (strcmp(fn, "-") == 0) - return (strstr(mode, "r"))? stdin : stdout; - if ((fp = fopen(fn, mode)) == 0) { - fprintf(stderr, "[%s] fail to open file '%s'. Abort!\n", func, fn); - abort(); - } - return fp; -} - /* wgsim */ enum muttype_t {NOCHANGE = 0, INSERT = 0x1000, SUBSTITUTE = 0xe000, DELETE = 0xf000}; @@ -170,24 +97,23 @@ typedef struct { static double ERR_RATE = 0.02; static double MUT_RATE = 0.001; -static double INDEL_FRAC = 0.1; +static double INDEL_FRAC = 0.15; static double INDEL_EXTEND = 0.3; -static int IS_SOLID = 0; -static int SHOW_MM_INFO = 1; +static double MAX_N_RATIO = 0.1; -void maq_mut_diref(const seq_t *seq, int is_hap, mutseq_t *hap1, mutseq_t *hap2) +void wgsim_mut_diref(const kseq_t *ks, int is_hap, mutseq_t *hap1, mutseq_t *hap2) { int i, deleting = 0; mutseq_t *ret[2]; ret[0] = hap1; ret[1] = hap2; - ret[0]->l = seq->l; ret[1]->l = seq->l; - ret[0]->m = seq->m; ret[1]->m = seq->m; - ret[0]->s = (mut_t *)calloc(seq->m, sizeof(mut_t)); - ret[1]->s = (mut_t *)calloc(seq->m, sizeof(mut_t)); - for (i = 0; i != seq->l; ++i) { + ret[0]->l = ks->seq.l; ret[1]->l = ks->seq.l; + ret[0]->m = ks->seq.m; ret[1]->m = ks->seq.m; + ret[0]->s = (mut_t *)calloc(ks->seq.m, sizeof(mut_t)); + ret[1]->s = (mut_t *)calloc(ks->seq.m, sizeof(mut_t)); + for (i = 0; i != ks->seq.l; ++i) { int c; - c = ret[0]->s[i] = ret[1]->s[i] = (mut_t)nst_nt4_table[(int)seq->s[i]]; + c = ret[0]->s[i] = ret[1]->s[i] = (mut_t)nst_nt4_table[(int)ks->seq.s[i]]; if (deleting) { if (drand48() < INDEL_EXTEND) { if (deleting & 1) ret[0]->s[i] |= DELETE; @@ -230,12 +156,12 @@ void maq_mut_diref(const seq_t *seq, int is_hap, mutseq_t *hap1, mutseq_t *hap2) } } } -void maq_print_mutref(const char *name, const seq_t *seq, mutseq_t *hap1, mutseq_t *hap2) +void wgsim_print_mutref(const char *name, const kseq_t *ks, mutseq_t *hap1, mutseq_t *hap2) { int i; - for (i = 0; i != seq->l; ++i) { + for (i = 0; i != ks->seq.l; ++i) { int c[3]; - c[0] = nst_nt4_table[(int)seq->s[i]]; + c[0] = nst_nt4_table[(int)ks->seq.s[i]]; c[1] = hap1->s[i]; c[2] = hap2->s[i]; if (c[0] >= 4) continue; if ((c[1] & mutmsk) != NOCHANGE || (c[2] & mutmsk) != NOCHANGE) { @@ -248,8 +174,9 @@ void maq_print_mutref(const char *name, const seq_t *seq, mutseq_t *hap1, mutseq } else if (((c[1] & mutmsk) >> 12) <= 5) { // ins printf("-\t"); int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4; - while(n > 0) { + while (n > 0) { putchar("ACGTN"[ins & 0x3]); + ins >>= 2; n--; } printf("\t-\n"); @@ -266,6 +193,7 @@ void maq_print_mutref(const char *name, const seq_t *seq, mutseq_t *hap1, mutseq int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4; while (n > 0) { putchar("ACGTN"[ins & 0x3]); + ins >>= 2; n--; } printf("\t+\n"); @@ -284,46 +212,51 @@ void maq_print_mutref(const char *name, const seq_t *seq, mutseq_t *hap1, mutseq } } -void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N, int dist, int std_dev, int size_l, int size_r) +void wgsim_core(FILE *fpout1, FILE *fpout2, const char *fn, int is_hap, uint64_t N, int dist, int std_dev, int size_l, int size_r) { - seq_t seq; + kseq_t *ks; mutseq_t rseq[2]; + gzFile fp_fa; uint64_t tot_len, ii; int i, l, n_ref; - char name[256], *qstr; - int size[2], Q; + char *qstr; + int size[2], Q, max_size; uint8_t *tmp_seq[2]; mut_t *target; - INIT_SEQ(seq); - srand48(time(0)); - seq_set_block_size(0x1000000); l = size_l > size_r? size_l : size_r; qstr = (char*)calloc(l+1, 1); tmp_seq[0] = (uint8_t*)calloc(l+2, 1); tmp_seq[1] = (uint8_t*)calloc(l+2, 1); size[0] = size_l; size[1] = size_r; + max_size = size_l > size_r? size_l : size_r; Q = (ERR_RATE == 0.0)? 'I' : (int)(-10.0 * log(ERR_RATE) / log(10.0) + 0.499) + 33; + fp_fa = gzopen(fn, "r"); + ks = kseq_init(fp_fa); tot_len = n_ref = 0; - while ((l = seq_read_fasta(fp_fa, &seq, name, 0)) >= 0) { + fprintf(stderr, "[%s] calculating the total length of the reference sequence...\n", __func__); + while ((l = kseq_read(ks)) >= 0) { tot_len += l; ++n_ref; } - fprintf(stderr, "[wgsim_core] %d sequences, total length: %llu\n", n_ref, (long long)tot_len); - rewind(fp_fa); + fprintf(stderr, "[%s] %d sequences, total length: %llu\n", __func__, n_ref, (long long)tot_len); + kseq_destroy(ks); + gzclose(fp_fa); - while ((l = seq_read_fasta(fp_fa, &seq, name, 0)) >= 0) { + fp_fa = gzopen(fn, "r"); + ks = kseq_init(fp_fa); + while ((l = kseq_read(ks)) >= 0) { uint64_t n_pairs = (uint64_t)((long double)l / tot_len * N + 0.5); if (l < dist + 3 * std_dev) { - fprintf(stderr, "[wgsim_core] kkip sequence '%s' as it is shorter than %d!\n", name, dist + 3 * std_dev); + fprintf(stderr, "[%s] skip sequence '%s' as it is shorter than %d!\n", __func__, ks->name.s, dist + 3 * std_dev); continue; } // generate mutations and print them out - maq_mut_diref(&seq, is_hap, rseq, rseq+1); - maq_print_mutref(name, &seq, rseq, rseq+1); + wgsim_mut_diref(ks, is_hap, rseq, rseq+1); + wgsim_print_mutref(ks->name.s, ks, rseq, rseq+1); for (ii = 0; ii != n_pairs; ++ii) { // the core loop double ran; @@ -335,8 +268,9 @@ void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N, ran = ran_normal(); ran = ran * std_dev + dist; d = (int)(ran + 0.5); + d = d > max_size? d : max_size; pos = (int)((l - d + 1) * drand48()); - } while (pos < 0 || pos >= seq.l || pos + d - 1 >= seq.l); + } while (pos < 0 || pos >= ks->seq.l || pos + d - 1 >= ks->seq.l); // flip or not if (drand48() < 0.5) { @@ -353,7 +287,7 @@ void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N, n_sub[0] = n_sub[1] = n_indel[0] = n_indel[1] = n_err[0] = n_err[1] = 0; #define __gen_read(x, start, iter) do { \ - for (i = (start), k = 0, ext_coor[x] = -10; i >= 0 && i < seq.l && k < s[x]; iter) { \ + for (i = (start), k = 0, ext_coor[x] = -10; i >= 0 && i < ks->seq.l && k < s[x]; iter) { \ int c = target[i], mut_type = c & mutmsk; \ if (ext_coor[x] < 0) { \ if (mut_type != NOCHANGE && mut_type != SUBSTITUTE) continue; \ @@ -374,33 +308,9 @@ void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N, if (k != s[x]) ext_coor[x] = -10; \ } while (0) - if (!IS_SOLID) { - __gen_read(0, pos, ++i); - __gen_read(1, pos + d - 1, --i); - for (k = 0; k < s[1]; ++k) tmp_seq[1][k] = tmp_seq[1][k] < 4? 3 - tmp_seq[1][k] : 4; // complement - } else { - int c1, c2, c; - ++s[0]; ++s[1]; // temporarily increase read length by 1 - if (is_flip) { // RR pair - __gen_read(0, pos + s[0], --i); - __gen_read(1, pos + d - 1, --i); - } else { // FF pair - __gen_read(0, pos, ++i); - __gen_read(1, pos + d - 1 - s[1], ++i); - ++ext_coor[0]; ++ext_coor[1]; - } - // change to color sequence: (0,1,2,3) -> (A,C,G,T) - for (j = 0; j < 2; ++j) { - c1 = tmp_seq[j][0]; - for (i = 1; i < s[j]; ++i) { - c2 = tmp_seq[j][i]; - c = (c1 >= 4 || c2 >= 4)? 4 : nst_color_space_table[(1<= 4) c = 4; // actually c should be never larger than 4 if everything is correct - else if (drand48() < ERR_RATE) { - c = (c + (int)(drand48() * 3.0 + 1)) & 3; + if (c >= 4) { // actually c should be never larger than 4 if everything is correct + c = 4; + ++n_n; + } else if (drand48() < ERR_RATE) { + // c = (c + (int)(drand48() * 3.0 + 1)) & 3; // random sequencing errors + c = (c + 1) & 3; // recurrent sequencing errors ++n_err[j]; } tmp_seq[j][i] = c; } + if ((double)n_n / s[j] > MAX_N_RATIO) break; + } + if (j < 2) { // too many ambiguous bases on one of the reads + --ii; + continue; } // print for (j = 0; j < 2; ++j) { for (i = 0; i < s[j]; ++i) qstr[i] = Q; qstr[i] = 0; - if (SHOW_MM_INFO) { - fprintf(fpo[j], "@%s_%u_%u_%d:%d:%d_%d:%d:%d_%llx/%d\n", name, ext_coor[0]+1, ext_coor[1]+1, - n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1], - (long long)ii, j==0? is_flip+1 : 2-is_flip); - } else { - fprintf(fpo[j], "@%s_%u_%u_%llx/%d %d:%d:%d_%d:%d:%d\n", name, ext_coor[0]+1, ext_coor[1]+1, - (long long)ii, j==0? is_flip+1 : 2-is_flip, - n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1]); - } + fprintf(fpo[j], "@%s_%u_%u_%d:%d:%d_%d:%d:%d_%llx/%d\n", ks->name.s, ext_coor[0]+1, ext_coor[1]+1, + n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1], + (long long)ii, j==0? is_flip+1 : 2-is_flip); for (i = 0; i < s[j]; ++i) fputc("ACGTN"[(int)tmp_seq[j][i]], fpo[j]); fprintf(fpo[j], "\n+\n%s\n", qstr); @@ -439,7 +352,9 @@ void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N, } free(rseq[0].s); free(rseq[1].s); } - free(seq.s); free(qstr); + kseq_destroy(ks); + gzclose(fp_fa); + free(qstr); free(tmp_seq[0]); free(tmp_seq[1]); } @@ -459,11 +374,9 @@ static int simu_usage() fprintf(stderr, " -r FLOAT rate of mutations [%.4f]\n", MUT_RATE); fprintf(stderr, " -R FLOAT fraction of indels [%.2f]\n", INDEL_FRAC); fprintf(stderr, " -X FLOAT probability an indel is extended [%.2f]\n", INDEL_EXTEND); - fprintf(stderr, " -c generate reads in color space (SOLiD reads)\n"); - fprintf(stderr, " -C show mismatch info in comment rather than read name\n"); + fprintf(stderr, " -S INT seed for random generator [-1]\n"); fprintf(stderr, " -h haplotype mode\n"); fprintf(stderr, "\n"); - fprintf(stderr, "Note: For SOLiD reads, the first read is F3 and the second is R3.\n\n"); return 1; } @@ -471,11 +384,12 @@ int main(int argc, char *argv[]) { int64_t N; int dist, std_dev, c, size_l, size_r, is_hap = 0; - FILE *fpout1, *fpout2, *fp_fa; + FILE *fpout1, *fpout2; + int seed = -1; N = 1000000; dist = 500; std_dev = 50; size_l = size_r = 70; - while ((c = getopt(argc, argv, "e:d:s:N:1:2:r:R:hX:cC")) >= 0) { + while ((c = getopt(argc, argv, "e:d:s:N:1:2:r:R:hX:S:")) >= 0) { switch (c) { case 'd': dist = atoi(optarg); break; case 's': std_dev = atoi(optarg); break; @@ -486,17 +400,20 @@ int main(int argc, char *argv[]) case 'r': MUT_RATE = atof(optarg); break; case 'R': INDEL_FRAC = atof(optarg); break; case 'X': INDEL_EXTEND = atof(optarg); break; - case 'c': IS_SOLID = 1; break; - case 'C': SHOW_MM_INFO = 0; break; + case 'S': seed = atoi(optarg); break; case 'h': is_hap = 1; break; } } if (argc - optind < 3) return simu_usage(); - fp_fa = (strcmp(argv[optind+0], "-") == 0)? stdin : xopen(argv[optind+0], "r"); - fpout1 = xopen(argv[optind+1], "w"); - fpout2 = xopen(argv[optind+2], "w"); - wgsim_core(fpout1, fpout2, fp_fa, is_hap, N, dist, std_dev, size_l, size_r); + fpout1 = fopen(argv[optind+1], "w"); + fpout2 = fopen(argv[optind+2], "w"); + if (!fpout1 || !fpout2) { + fprintf(stderr, "[wgsim] file open error\n"); + return 1; + } + srand48(seed > 0? seed : time(0)); + wgsim_core(fpout1, fpout2, argv[optind], is_hap, N, dist, std_dev, size_l, size_r); - fclose(fpout1); fclose(fpout2); fclose(fp_fa); + fclose(fpout1); fclose(fpout2); return 0; }