3 Copyright (c) 2008 Genome Research Ltd (GRL).
4 2011 Heng Li <lh3@live.co.uk>
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
14 The above copyright notice and this permission notice shall be
15 included in all copies or substantial portions of the Software.
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
21 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
22 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
23 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27 /* This program is separated from maq's read simulator with Colin
28 * Hercus' modification to allow longer indels. */
41 KSEQ_INIT(gzFile, gzread)
43 #define PACKAGE_VERSION "0.3.0"
45 const uint8_t nst_nt4_table[256] = {
46 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
47 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
48 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4,
49 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
50 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
51 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
52 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
53 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
54 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
55 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
56 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
57 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
58 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
59 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
60 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
61 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
64 /* Simple normal random number generator, copied from genran.c */
70 double fac, rsq, v1, v2;
73 v1 = 2.0 * drand48() - 1.0;
74 v2 = 2.0 * drand48() - 1.0;
75 rsq = v1 * v1 + v2 * v2;
76 } while (rsq >= 1.0 || rsq == 0.0);
77 fac = sqrt(-2.0 * log(rsq) / rsq);
89 enum muttype_t {NOCHANGE = 0, INSERT = 0x1000, SUBSTITUTE = 0xe000, DELETE = 0xf000};
90 typedef unsigned short mut_t;
91 static mut_t mutmsk = (mut_t)0xf000;
94 int l, m; /* length and maximum buffer size */
95 mut_t *s; /* sequence */
98 static double ERR_RATE = 0.02;
99 static double MUT_RATE = 0.001;
100 static double INDEL_FRAC = 0.15;
101 static double INDEL_EXTEND = 0.3;
102 static double MAX_N_RATIO = 0.1;
104 void wgsim_mut_diref(const kseq_t *ks, int is_hap, mutseq_t *hap1, mutseq_t *hap2)
109 ret[0] = hap1; ret[1] = hap2;
110 ret[0]->l = ks->seq.l; ret[1]->l = ks->seq.l;
111 ret[0]->m = ks->seq.m; ret[1]->m = ks->seq.m;
112 ret[0]->s = (mut_t *)calloc(ks->seq.m, sizeof(mut_t));
113 ret[1]->s = (mut_t *)calloc(ks->seq.m, sizeof(mut_t));
114 for (i = 0; i != ks->seq.l; ++i) {
116 c = ret[0]->s[i] = ret[1]->s[i] = (mut_t)nst_nt4_table[(int)ks->seq.s[i]];
118 if (drand48() < INDEL_EXTEND) {
119 if (deleting & 1) ret[0]->s[i] |= DELETE;
120 if (deleting & 2) ret[1]->s[i] |= DELETE;
124 if (c < 4 && drand48() < MUT_RATE) { // mutation
125 if (drand48() >= INDEL_FRAC) { // substitution
126 double r = drand48();
127 c = (c + (int)(r * 3.0 + 1)) & 3;
128 if (is_hap || drand48() < 0.333333) { // hom
129 ret[0]->s[i] = ret[1]->s[i] = SUBSTITUTE|c;
131 ret[drand48()<0.5?0:1]->s[i] = SUBSTITUTE|c;
134 if (drand48() < 0.5) { // deletion
135 if (is_hap || drand48() < 0.333333) { // hom-del
136 ret[0]->s[i] = ret[1]->s[i] = DELETE;
139 deleting = drand48()<0.5?1:2;
140 ret[deleting-1]->s[i] = DELETE;
142 } else { // insertion
143 int num_ins = 0, ins = 0;
146 ins = (ins << 2) | (int)(drand48() * 4.0);
147 } while (num_ins < 4 && drand48() < INDEL_EXTEND);
149 if (is_hap || drand48() < 0.333333) { // hom-ins
150 ret[0]->s[i] = ret[1]->s[i] = (num_ins << 12) | (ins << 4) | c;
152 ret[drand48()<0.5?0:1]->s[i] = (num_ins << 12) | (ins << 4) | c;
159 void wgsim_print_mutref(const char *name, const kseq_t *ks, mutseq_t *hap1, mutseq_t *hap2)
162 for (i = 0; i != ks->seq.l; ++i) {
164 c[0] = nst_nt4_table[(int)ks->seq.s[i]];
165 c[1] = hap1->s[i]; c[2] = hap2->s[i];
166 if (c[0] >= 4) continue;
167 if ((c[1] & mutmsk) != NOCHANGE || (c[2] & mutmsk) != NOCHANGE) {
168 printf("%s\t%d\t", name, i+1);
169 if (c[1] == c[2]) { // hom
170 if ((c[1]&mutmsk) == SUBSTITUTE) { // substitution
171 printf("%c\t%c\t-\n", "ACGTN"[c[0]], "ACGTN"[c[1]&0xf]);
172 } else if ((c[1]&mutmsk) == DELETE) { // del
173 printf("%c\t-\t-\n", "ACGTN"[c[0]]);
174 } else if (((c[1] & mutmsk) >> 12) <= 5) { // ins
176 int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4;
178 putchar("ACGTN"[ins & 0x3]);
185 if ((c[1]&mutmsk) == SUBSTITUTE || (c[2]&mutmsk) == SUBSTITUTE) { // substitution
186 printf("%c\t%c\t+\n", "ACGTN"[c[0]], "XACMGRSVTWYHKDBN"[1<<(c[1]&0x3)|1<<(c[2]&0x3)]);
187 } else if ((c[1]&mutmsk) == DELETE) {
188 printf("%c\t-\t+\n", "ACGTN"[c[0]]);
189 } else if ((c[2]&mutmsk) == DELETE) {
190 printf("%c\t-\t+\n", "ACGTN"[c[0]]);
191 } else if (((c[1] & mutmsk) >> 12) <= 4) { // ins1
193 int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4;
195 putchar("ACGTN"[ins & 0x3]);
200 } else if (((c[2] & mutmsk) >> 12) <= 5) { // ins2
202 int n = (c[2]&mutmsk) >> 12, ins = c[2] >> 4;
204 putchar("ACGTN"[ins & 0x3]);
215 void wgsim_core(FILE *fpout1, FILE *fpout2, const char *fn, int is_hap, uint64_t N, int dist, int std_dev, int size_l, int size_r)
220 uint64_t tot_len, ii;
223 int size[2], Q, max_size;
227 l = size_l > size_r? size_l : size_r;
228 qstr = (char*)calloc(l+1, 1);
229 tmp_seq[0] = (uint8_t*)calloc(l+2, 1);
230 tmp_seq[1] = (uint8_t*)calloc(l+2, 1);
231 size[0] = size_l; size[1] = size_r;
232 max_size = size_l > size_r? size_l : size_r;
234 Q = (ERR_RATE == 0.0)? 'I' : (int)(-10.0 * log(ERR_RATE) / log(10.0) + 0.499) + 33;
236 fp_fa = gzopen(fn, "r");
237 ks = kseq_init(fp_fa);
239 fprintf(stderr, "[%s] calculating the total length of the reference sequence...\n", __func__);
240 while ((l = kseq_read(ks)) >= 0) {
244 fprintf(stderr, "[%s] %d sequences, total length: %llu\n", __func__, n_ref, (long long)tot_len);
248 fp_fa = gzopen(fn, "r");
249 ks = kseq_init(fp_fa);
250 while ((l = kseq_read(ks)) >= 0) {
251 uint64_t n_pairs = (uint64_t)((long double)l / tot_len * N + 0.5);
252 if (l < dist + 3 * std_dev) {
253 fprintf(stderr, "[%s] skip sequence '%s' as it is shorter than %d!\n", __func__, ks->name.s, dist + 3 * std_dev);
257 // generate mutations and print them out
258 wgsim_mut_diref(ks, is_hap, rseq, rseq+1);
259 wgsim_print_mutref(ks->name.s, ks, rseq, rseq+1);
261 for (ii = 0; ii != n_pairs; ++ii) { // the core loop
263 int d, pos, s[2], is_flip = 0;
264 int n_sub[2], n_indel[2], n_err[2], ext_coor[2], j, k;
267 do { // avoid boundary failure
269 ran = ran * std_dev + dist;
270 d = (int)(ran + 0.5);
271 d = d > max_size? d : max_size;
272 pos = (int)((l - d + 1) * drand48());
273 } while (pos < 0 || pos >= ks->seq.l || pos + d - 1 >= ks->seq.l);
276 if (drand48() < 0.5) {
277 fpo[0] = fpout1; fpo[1] = fpout2;
278 s[0] = size[0]; s[1] = size[1];
280 fpo[1] = fpout1; fpo[0] = fpout2;
281 s[1] = size[0]; s[0] = size[1];
285 // generate the read sequences
286 target = rseq[drand48()<0.5?0:1].s; // haplotype from which the reads are generated
287 n_sub[0] = n_sub[1] = n_indel[0] = n_indel[1] = n_err[0] = n_err[1] = 0;
289 #define __gen_read(x, start, iter) do { \
290 for (i = (start), k = 0, ext_coor[x] = -10; i >= 0 && i < ks->seq.l && k < s[x]; iter) { \
291 int c = target[i], mut_type = c & mutmsk; \
292 if (ext_coor[x] < 0) { \
293 if (mut_type != NOCHANGE && mut_type != SUBSTITUTE) continue; \
296 if (mut_type == DELETE) ++n_indel[x]; \
297 else if (mut_type == NOCHANGE || mut_type == SUBSTITUTE) { \
298 tmp_seq[x][k++] = c & 0xf; \
299 if (mut_type == SUBSTITUTE) ++n_sub[x]; \
303 tmp_seq[x][k++] = c & 0xf; \
304 for (n = mut_type>>12, ins = c>>4; n > 0 && k < s[x]; --n, ins >>= 2) \
305 tmp_seq[x][k++] = ins & 0x3; \
308 if (k != s[x]) ext_coor[x] = -10; \
311 __gen_read(0, pos, ++i);
312 __gen_read(1, pos + d - 1, --i);
313 for (k = 0; k < s[1]; ++k) tmp_seq[1][k] = tmp_seq[1][k] < 4? 3 - tmp_seq[1][k] : 4; // complement
314 if (ext_coor[0] < 0 || ext_coor[1] < 0) { // fail to generate the read(s)
319 // generate sequencing errors
320 for (j = 0; j < 2; ++j) {
322 for (i = 0; i < s[j]; ++i) {
323 int c = tmp_seq[j][i];
324 if (c >= 4) { // actually c should be never larger than 4 if everything is correct
327 } else if (drand48() < ERR_RATE) {
328 // c = (c + (int)(drand48() * 3.0 + 1)) & 3; // random sequencing errors
329 c = (c + 1) & 3; // recurrent sequencing errors
334 if ((double)n_n / s[j] > MAX_N_RATIO) break;
336 if (j < 2) { // too many ambiguous bases on one of the reads
342 for (j = 0; j < 2; ++j) {
343 for (i = 0; i < s[j]; ++i) qstr[i] = Q;
345 fprintf(fpo[j], "@%s_%u_%u_%d:%d:%d_%d:%d:%d_%llx/%d\n", ks->name.s, ext_coor[0]+1, ext_coor[1]+1,
346 n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1],
347 (long long)ii, j==0? is_flip+1 : 2-is_flip);
348 for (i = 0; i < s[j]; ++i)
349 fputc("ACGTN"[(int)tmp_seq[j][i]], fpo[j]);
350 fprintf(fpo[j], "\n+\n%s\n", qstr);
353 free(rseq[0].s); free(rseq[1].s);
358 free(tmp_seq[0]); free(tmp_seq[1]);
361 static int simu_usage()
363 fprintf(stderr, "\n");
364 fprintf(stderr, "Program: wgsim (short read simulator)\n");
365 fprintf(stderr, "Version: %s\n", PACKAGE_VERSION);
366 fprintf(stderr, "Contact: Heng Li <lh3@sanger.ac.uk>\n\n");
367 fprintf(stderr, "Usage: wgsim [options] <in.ref.fa> <out.read1.fq> <out.read2.fq>\n\n");
368 fprintf(stderr, "Options: -e FLOAT base error rate [%.3f]\n", ERR_RATE);
369 fprintf(stderr, " -d INT outer distance between the two ends [500]\n");
370 fprintf(stderr, " -s INT standard deviation [50]\n");
371 fprintf(stderr, " -N INT number of read pairs [1000000]\n");
372 fprintf(stderr, " -1 INT length of the first read [70]\n");
373 fprintf(stderr, " -2 INT length of the second read [70]\n");
374 fprintf(stderr, " -r FLOAT rate of mutations [%.4f]\n", MUT_RATE);
375 fprintf(stderr, " -R FLOAT fraction of indels [%.2f]\n", INDEL_FRAC);
376 fprintf(stderr, " -X FLOAT probability an indel is extended [%.2f]\n", INDEL_EXTEND);
377 fprintf(stderr, " -S INT seed for random generator [-1]\n");
378 fprintf(stderr, " -h haplotype mode\n");
379 fprintf(stderr, "\n");
383 int main(int argc, char *argv[])
386 int dist, std_dev, c, size_l, size_r, is_hap = 0;
387 FILE *fpout1, *fpout2;
390 N = 1000000; dist = 500; std_dev = 50;
391 size_l = size_r = 70;
392 while ((c = getopt(argc, argv, "e:d:s:N:1:2:r:R:hX:S:")) >= 0) {
394 case 'd': dist = atoi(optarg); break;
395 case 's': std_dev = atoi(optarg); break;
396 case 'N': N = atoi(optarg); break;
397 case '1': size_l = atoi(optarg); break;
398 case '2': size_r = atoi(optarg); break;
399 case 'e': ERR_RATE = atof(optarg); break;
400 case 'r': MUT_RATE = atof(optarg); break;
401 case 'R': INDEL_FRAC = atof(optarg); break;
402 case 'X': INDEL_EXTEND = atof(optarg); break;
403 case 'S': seed = atoi(optarg); break;
404 case 'h': is_hap = 1; break;
407 if (argc - optind < 3) return simu_usage();
408 fpout1 = fopen(argv[optind+1], "w");
409 fpout2 = fopen(argv[optind+2], "w");
410 if (!fpout1 || !fpout2) {
411 fprintf(stderr, "[wgsim] file open error\n");
414 srand48(seed > 0? seed : time(0));
415 wgsim_core(fpout1, fpout2, argv[optind], is_hap, N, dist, std_dev, size_l, size_r);
417 fclose(fpout1); fclose(fpout2);