3 Copyright (c) 2008 Genome Research Ltd (GRL).
5 Permission is hereby granted, free of charge, to any person obtaining
6 a copy of this software and associated documentation files (the
7 "Software"), to deal in the Software without restriction, including
8 without limitation the rights to use, copy, modify, merge, publish,
9 distribute, sublicense, and/or sell copies of the Software, and to
10 permit persons to whom the Software is furnished to do so, subject to
11 the following conditions:
13 The above copyright notice and this permission notice shall be
14 included in all copies or substantial portions of the Software.
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26 /* Contact: Heng Li <lh3@sanger.ac.uk> */
28 /* This program is separated from maq's read simulator with Colin
29 * Hercus' modification to allow longer indels. Colin is the chief
30 * developer of novoalign. */
42 #define PACKAGE_VERSION "0.2.3"
44 const uint8_t nst_nt4_table[256] = {
45 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
46 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
47 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4,
48 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
49 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
50 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
51 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
52 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
53 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
54 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
55 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
56 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
57 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
58 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
59 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
60 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
63 const int nst_color_space_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4};
65 /* Simple normal random number generator, copied from genran.c */
71 double fac, rsq, v1, v2;
74 v1 = 2.0 * drand48() - 1.0;
75 v2 = 2.0 * drand48() - 1.0;
76 rsq = v1 * v1 + v2 * v2;
77 } while (rsq >= 1.0 || rsq == 0.0);
78 fac = sqrt(-2.0 * log(rsq) / rsq);
88 /* FASTA parser, copied from seq.c */
91 int l, m; /* length and maximum buffer size */
92 unsigned char *s; /* sequence */
95 #define INIT_SEQ(seq) (seq).s = 0; (seq).l = (seq).m = 0
97 static int SEQ_BLOCK_SIZE = 512;
99 void seq_set_block_size(int size)
101 SEQ_BLOCK_SIZE = size;
104 int seq_read_fasta(FILE *fp, seq_t *seq, char *locus, char *comment)
110 while (!feof(fp) && fgetc(fp) != '>');
111 if (feof(fp)) return -1;
113 while (!feof(fp) && (c = fgetc(fp)) != ' ' && c != '\t' && c != '\n')
114 if (c != '\r') *p++ = c;
119 while (!feof(fp) && ((c = fgetc(fp)) == ' ' || c == '\t'));
122 while (!feof(fp) && (c = fgetc(fp)) != '\n')
123 if (c != '\r') *p++ = c;
127 } else if (c != '\n') while (!feof(fp) && fgetc(fp) != '\n');
129 while (!feof(fp) && (c = fgetc(fp)) != '>') {
130 if (isalpha(c) || c == '-' || c == '.') {
132 max += SEQ_BLOCK_SIZE;
133 seq->s = (unsigned char*)realloc(seq->s, sizeof(char) * max);
135 seq->s[l++] = (unsigned char)c;
138 if (c == '>') ungetc(c,fp);
140 seq->m = max; seq->l = l;
144 /* Error-checking open, copied from utils.c */
146 #define xopen(fn, mode) err_xopen_core(__func__, fn, mode)
148 FILE *err_xopen_core(const char *func, const char *fn, const char *mode)
151 if (strcmp(fn, "-") == 0)
152 return (strstr(mode, "r"))? stdin : stdout;
153 if ((fp = fopen(fn, mode)) == 0) {
154 fprintf(stderr, "[%s] fail to open file '%s'. Abort!\n", func, fn);
162 enum muttype_t {NOCHANGE = 0, INSERT = 0x1000, SUBSTITUTE = 0xe000, DELETE = 0xf000};
163 typedef unsigned short mut_t;
164 static mut_t mutmsk = (mut_t)0xf000;
167 int l, m; /* length and maximum buffer size */
168 mut_t *s; /* sequence */
171 static double ERR_RATE = 0.02;
172 static double MUT_RATE = 0.001;
173 static double INDEL_FRAC = 0.1;
174 static double INDEL_EXTEND = 0.3;
175 static int IS_SOLID = 0;
176 static int SHOW_MM_INFO = 1;
178 void maq_mut_diref(const seq_t *seq, int is_hap, mutseq_t *hap1, mutseq_t *hap2)
183 ret[0] = hap1; ret[1] = hap2;
184 ret[0]->l = seq->l; ret[1]->l = seq->l;
185 ret[0]->m = seq->m; ret[1]->m = seq->m;
186 ret[0]->s = (mut_t *)calloc(seq->m, sizeof(mut_t));
187 ret[1]->s = (mut_t *)calloc(seq->m, sizeof(mut_t));
188 for (i = 0; i != seq->l; ++i) {
190 c = ret[0]->s[i] = ret[1]->s[i] = (mut_t)nst_nt4_table[(int)seq->s[i]];
192 if (drand48() < INDEL_EXTEND) {
193 if (deleting & 1) ret[0]->s[i] |= DELETE;
194 if (deleting & 2) ret[1]->s[i] |= DELETE;
198 if (c < 4 && drand48() < MUT_RATE) { // mutation
199 if (drand48() >= INDEL_FRAC) { // substitution
200 double r = drand48();
201 c = (c + (int)(r * 3.0 + 1)) & 3;
202 if (is_hap || drand48() < 0.333333) { // hom
203 ret[0]->s[i] = ret[1]->s[i] = SUBSTITUTE|c;
205 ret[drand48()<0.5?0:1]->s[i] = SUBSTITUTE|c;
208 if (drand48() < 0.5) { // deletion
209 if (is_hap || drand48() < 0.333333) { // hom-del
210 ret[0]->s[i] = ret[1]->s[i] = DELETE;
213 deleting = drand48()<0.5?1:2;
214 ret[deleting-1]->s[i] = DELETE;
216 } else { // insertion
217 int num_ins = 0, ins = 0;
220 ins = (ins << 2) | (int)(drand48() * 4.0);
221 } while (num_ins < 4 && drand48() < INDEL_EXTEND);
223 if (is_hap || drand48() < 0.333333) { // hom-ins
224 ret[0]->s[i] = ret[1]->s[i] = (num_ins << 12) | (ins << 4) | c;
226 ret[drand48()<0.5?0:1]->s[i] = (num_ins << 12) | (ins << 4) | c;
233 void maq_print_mutref(const char *name, const seq_t *seq, mutseq_t *hap1, mutseq_t *hap2)
236 for (i = 0; i != seq->l; ++i) {
238 c[0] = nst_nt4_table[(int)seq->s[i]];
239 c[1] = hap1->s[i]; c[2] = hap2->s[i];
240 if (c[0] >= 4) continue;
241 if ((c[1] & mutmsk) != NOCHANGE || (c[1] & mutmsk) != NOCHANGE) {
242 printf("%s\t%d\t", name, i+1);
243 if (c[1] == c[2]) { // hom
244 if ((c[1]&mutmsk) == SUBSTITUTE) { // substitution
245 printf("%c\t%c\t-\n", "ACGTN"[c[0]], "ACGTN"[c[1]&0xf]);
246 } else if ((c[1]&mutmsk) == DELETE) { // del
247 printf("%c\t-\t-\n", "ACGTN"[c[0]]);
248 } else if (((c[1] & mutmsk) >> 12) <= 5) { // ins
250 int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4;
252 putchar("ACGTN"[ins & 0x3]);
258 if ((c[1]&mutmsk) == SUBSTITUTE || (c[2]&mutmsk) == SUBSTITUTE) { // substitution
259 printf("%c\t%c\t+\n", "ACGTN"[c[0]], "XACMGRSVTWYHKDBN"[1<<(c[1]&0x3)|1<<(c[2]&0x3)]);
260 } else if ((c[1]&mutmsk) == DELETE) {
261 printf("%c\t-\t+\n", "ACGTN"[c[0]]);
262 } else if ((c[2]&mutmsk) == DELETE) {
263 printf("%c\t-\t+\n", "ACGTN"[c[0]]);
264 } else if (((c[1] & mutmsk) >> 12) <= 4) { // ins1
266 int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4;
268 putchar("ACGTN"[ins & 0x3]);
272 } else if (((c[2] & mutmsk) >> 12) <= 5) { // ins2
274 int n = (c[2]&mutmsk) >> 12, ins = c[2] >> 4;
276 putchar("ACGTN"[ins & 0x3]);
287 void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N, int dist, int std_dev, int size_l, int size_r)
291 uint64_t tot_len, ii;
293 char name[256], *qstr;
300 seq_set_block_size(0x1000000);
301 l = size_l > size_r? size_l : size_r;
302 qstr = (char*)calloc(l+1, 1);
303 tmp_seq[0] = (uint8_t*)calloc(l+2, 1);
304 tmp_seq[1] = (uint8_t*)calloc(l+2, 1);
305 size[0] = size_l; size[1] = size_r;
307 Q = (int)(-10.0 * log(ERR_RATE) / log(10.0) + 0.499) + 33;
310 while ((l = seq_read_fasta(fp_fa, &seq, name, 0)) >= 0) {
314 fprintf(stderr, "[wgsim_core] %d sequences, total length: %llu\n", n_ref, (long long)tot_len);
317 while ((l = seq_read_fasta(fp_fa, &seq, name, 0)) >= 0) {
318 uint64_t n_pairs = (uint64_t)((long double)l / tot_len * N + 0.5);
319 if (l < dist + 3 * std_dev) {
320 fprintf(stderr, "[wgsim_core] kkip sequence '%s' as it is shorter than %d!\n", name, dist + 3 * std_dev);
324 // generate mutations and print them out
325 maq_mut_diref(&seq, is_hap, rseq, rseq+1);
326 maq_print_mutref(name, &seq, rseq, rseq+1);
328 for (ii = 0; ii != n_pairs; ++ii) { // the core loop
330 int d, pos, s[2], is_flip = 0;
331 int n_sub[2], n_indel[2], n_err[2], ext_coor[2], j, k;
334 do { // avoid boundary failure
336 ran = ran * std_dev + dist;
337 d = (int)(ran + 0.5);
338 pos = (int)((l - d + 1) * drand48());
339 } while (pos < 0 || pos >= seq.l || pos + d - 1 >= seq.l);
342 if (drand48() < 0.5) {
343 fpo[0] = fpout1; fpo[1] = fpout2;
344 s[0] = size[0]; s[1] = size[1];
346 fpo[1] = fpout1; fpo[0] = fpout2;
347 s[1] = size[0]; s[0] = size[1];
351 // generate the read sequences
352 target = rseq[drand48()<0.5?0:1].s; // haplotype from which the reads are generated
353 n_sub[0] = n_sub[1] = n_indel[0] = n_indel[1] = n_err[0] = n_err[1] = 0;
355 #define __gen_read(x, start, iter) do { \
356 for (i = (start), k = 0, ext_coor[x] = -10; i >= 0 && i < seq.l && k < s[x]; iter) { \
357 int c = target[i], mut_type = c & mutmsk; \
358 if (ext_coor[x] < 0) { \
359 if (mut_type != NOCHANGE && mut_type != SUBSTITUTE) continue; \
362 if (mut_type == DELETE) ++n_indel[x]; \
363 else if (mut_type == NOCHANGE || mut_type == SUBSTITUTE) { \
364 tmp_seq[x][k++] = c & 0xf; \
365 if (mut_type == SUBSTITUTE) ++n_sub[x]; \
369 tmp_seq[x][k++] = c & 0xf; \
370 for (n = mut_type>>12, ins = c>>4; n > 0 && k < s[x]; --n, ins >>= 2) \
371 tmp_seq[x][k++] = ins & 0x3; \
374 if (k != s[x]) ext_coor[x] = -10; \
378 __gen_read(0, pos, ++i);
379 __gen_read(1, pos + d - 1, --i);
380 for (k = 0; k < s[1]; ++k) tmp_seq[1][k] = tmp_seq[1][k] < 4? 3 - tmp_seq[1][k] : 4; // complement
383 ++s[0]; ++s[1]; // temporarily increase read length by 1
384 if (is_flip) { // RR pair
385 __gen_read(0, pos + s[0], --i);
386 __gen_read(1, pos + d - 1, --i);
388 __gen_read(0, pos, ++i);
389 __gen_read(1, pos + d - 1 - s[1], ++i);
390 ++ext_coor[0]; ++ext_coor[1];
392 // change to color sequence: (0,1,2,3) -> (A,C,G,T)
393 for (j = 0; j < 2; ++j) {
395 for (i = 1; i < s[j]; ++i) {
397 c = (c1 >= 4 || c2 >= 4)? 4 : nst_color_space_table[(1<<c1)|(1<<c2)];
402 --s[0]; --s[1]; // change back
404 if (ext_coor[0] < 0 || ext_coor[1] < 0) { // fail to generate the read(s)
409 // generate sequencing errors
410 for (j = 0; j < 2; ++j) {
411 for (i = 0; i < s[j]; ++i) {
412 int c = tmp_seq[j][i];
413 if (c >= 4) c = 4; // actually c should be never larger than 4 if everything is correct
414 else if (drand48() < ERR_RATE) {
415 c = (c + (int)(drand48() * 3.0 + 1)) & 3;
423 for (j = 0; j < 2; ++j) {
424 for (i = 0; i < s[j]; ++i) qstr[i] = Q;
427 fprintf(fpo[j], "@%s_%u_%u_%d:%d:%d_%d:%d:%d_%llx/%d\n", name, ext_coor[0]+1, ext_coor[1]+1,
428 n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1],
429 (long long)ii, j==0? is_flip+1 : 2-is_flip);
431 fprintf(fpo[j], "@%s_%u_%u_%llx/%d %d:%d:%d_%d:%d:%d\n", name, ext_coor[0]+1, ext_coor[1]+1,
432 (long long)ii, j==0? is_flip+1 : 2-is_flip,
433 n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1]);
435 for (i = 0; i < s[j]; ++i)
436 fputc("ACGTN"[(int)tmp_seq[j][i]], fpo[j]);
437 fprintf(fpo[j], "\n+\n%s\n", qstr);
440 free(rseq[0].s); free(rseq[1].s);
442 free(seq.s); free(qstr);
443 free(tmp_seq[0]); free(tmp_seq[1]);
446 static int simu_usage()
448 fprintf(stderr, "\n");
449 fprintf(stderr, "Program: wgsim (short read simulator)\n");
450 fprintf(stderr, "Version: %s\n", PACKAGE_VERSION);
451 fprintf(stderr, "Contact: Heng Li <lh3@sanger.ac.uk>\n\n");
452 fprintf(stderr, "Usage: wgsim [options] <in.ref.fa> <out.read1.fq> <out.read2.fq>\n\n");
453 fprintf(stderr, "Options: -e FLOAT base error rate [%.3f]\n", ERR_RATE);
454 fprintf(stderr, " -d INT outer distance between the two ends [500]\n");
455 fprintf(stderr, " -s INT standard deviation [50]\n");
456 fprintf(stderr, " -N INT number of read pairs [1000000]\n");
457 fprintf(stderr, " -1 INT length of the first read [70]\n");
458 fprintf(stderr, " -2 INT length of the second read [70]\n");
459 fprintf(stderr, " -r FLOAT rate of mutations [%.4f]\n", MUT_RATE);
460 fprintf(stderr, " -R FLOAT fraction of indels [%.2f]\n", INDEL_FRAC);
461 fprintf(stderr, " -X FLOAT probability an indel is extended [%.2f]\n", INDEL_EXTEND);
462 fprintf(stderr, " -c generate reads in color space (SOLiD reads)\n");
463 fprintf(stderr, " -C show mismatch info in comment rather than read name\n");
464 fprintf(stderr, " -h haplotype mode\n");
465 fprintf(stderr, "\n");
466 fprintf(stderr, "Note: For SOLiD reads, the first read is F3 and the second is R3.\n\n");
470 int main(int argc, char *argv[])
473 int dist, std_dev, c, size_l, size_r, is_hap = 0;
474 FILE *fpout1, *fpout2, *fp_fa;
476 N = 1000000; dist = 500; std_dev = 50;
477 size_l = size_r = 70;
478 while ((c = getopt(argc, argv, "e:d:s:N:1:2:r:R:hX:cC")) >= 0) {
480 case 'd': dist = atoi(optarg); break;
481 case 's': std_dev = atoi(optarg); break;
482 case 'N': N = atoi(optarg); break;
483 case '1': size_l = atoi(optarg); break;
484 case '2': size_r = atoi(optarg); break;
485 case 'e': ERR_RATE = atof(optarg); break;
486 case 'r': MUT_RATE = atof(optarg); break;
487 case 'R': INDEL_FRAC = atof(optarg); break;
488 case 'X': INDEL_EXTEND = atof(optarg); break;
489 case 'c': IS_SOLID = 1; break;
490 case 'C': SHOW_MM_INFO = 0; break;
491 case 'h': is_hap = 1; break;
494 if (argc - optind < 3) return simu_usage();
495 fp_fa = (strcmp(argv[optind+0], "-") == 0)? stdin : xopen(argv[optind+0], "r");
496 fpout1 = xopen(argv[optind+1], "w");
497 fpout2 = xopen(argv[optind+2], "w");
498 wgsim_core(fpout1, fpout2, fp_fa, is_hap, N, dist, std_dev, size_l, size_r);
500 fclose(fpout1); fclose(fpout2); fclose(fp_fa);