3 Copyright (c) 2008 Genome Research Ltd (GRL).
5 Permission is hereby granted, free of charge, to any person obtaining
6 a copy of this software and associated documentation files (the
7 "Software"), to deal in the Software without restriction, including
8 without limitation the rights to use, copy, modify, merge, publish,
9 distribute, sublicense, and/or sell copies of the Software, and to
10 permit persons to whom the Software is furnished to do so, subject to
11 the following conditions:
13 The above copyright notice and this permission notice shall be
14 included in all copies or substantial portions of the Software.
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26 /* Contact: Heng Li <lh3@sanger.ac.uk> */
28 /* This program is separated from maq's read simulator with Colin
29 * Hercus' modification to allow longer indels. Colin is the chief
30 * developer of novoalign. */
42 #define PACKAGE_VERSION "0.2.1"
44 const uint8_t nst_nt4_table[256] = {
45 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
46 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
47 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4,
48 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
49 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
50 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
51 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
52 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
53 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
54 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
55 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
56 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
57 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
58 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
59 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
60 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
63 const int nst_color_space_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4};
65 /* Simple normal random number generator, copied from genran.c */
71 double fac, rsq, v1, v2;
74 v1 = 2.0 * drand48() - 1.0;
75 v2 = 2.0 * drand48() - 1.0;
76 rsq = v1 * v1 + v2 * v2;
77 } while (rsq >= 1.0 || rsq == 0.0);
78 fac = sqrt(-2.0 * log(rsq) / rsq);
88 /* FASTA parser, copied from seq.c */
91 int l, m; /* length and maximum buffer size */
92 unsigned char *s; /* sequence */
95 #define INIT_SEQ(seq) (seq).s = 0; (seq).l = (seq).m = 0
97 static int SEQ_BLOCK_SIZE = 512;
99 void seq_set_block_size(int size)
101 SEQ_BLOCK_SIZE = size;
104 int seq_read_fasta(FILE *fp, seq_t *seq, char *locus, char *comment)
110 while (!feof(fp) && fgetc(fp) != '>');
111 if (feof(fp)) return -1;
113 while (!feof(fp) && (c = fgetc(fp)) != ' ' && c != '\t' && c != '\n')
114 if (c != '\r') *p++ = c;
119 while (!feof(fp) && ((c = fgetc(fp)) == ' ' || c == '\t'));
122 while (!feof(fp) && (c = fgetc(fp)) != '\n')
123 if (c != '\r') *p++ = c;
127 } else if (c != '\n') while (!feof(fp) && fgetc(fp) != '\n');
129 while (!feof(fp) && (c = fgetc(fp)) != '>') {
130 if (isalpha(c) || c == '-' || c == '.') {
132 max += SEQ_BLOCK_SIZE;
133 seq->s = (unsigned char*)realloc(seq->s, sizeof(char) * max);
135 seq->s[l++] = (unsigned char)c;
138 if (c == '>') ungetc(c,fp);
140 seq->m = max; seq->l = l;
144 /* Error-checking open, copied from utils.c */
146 #define xopen(fn, mode) err_xopen_core(__func__, fn, mode)
148 FILE *err_xopen_core(const char *func, const char *fn, const char *mode)
151 if (strcmp(fn, "-") == 0)
152 return (strstr(mode, "r"))? stdin : stdout;
153 if ((fp = fopen(fn, mode)) == 0) {
154 fprintf(stderr, "[%s] fail to open file '%s'. Abort!\n", func, fn);
162 enum muttype_t {NOCHANGE = 0, INSERT = 0x1000, SUBSTITUTE = 0xe000, DELETE = 0xf000};
163 typedef unsigned short mut_t;
164 static mut_t mutmsk = (mut_t)0xf000;
167 int l, m; /* length and maximum buffer size */
168 mut_t *s; /* sequence */
171 static double ERR_RATE = 0.02;
172 static double MUT_RATE = 0.001;
173 static double INDEL_FRAC = 0.1;
174 static double INDEL_EXTEND = 0.3;
175 static int IS_SOLID = 0;
177 void maq_mut_diref(const seq_t *seq, int is_hap, mutseq_t *hap1, mutseq_t *hap2)
182 ret[0] = hap1; ret[1] = hap2;
183 ret[0]->l = seq->l; ret[1]->l = seq->l;
184 ret[0]->m = seq->m; ret[1]->m = seq->m;
185 ret[0]->s = (mut_t *)calloc(seq->m, sizeof(mut_t));
186 ret[1]->s = (mut_t *)calloc(seq->m, sizeof(mut_t));
187 for (i = 0; i != seq->l; ++i) {
189 c = ret[0]->s[i] = ret[1]->s[i] = (mut_t)nst_nt4_table[(int)seq->s[i]];
191 if (drand48() < INDEL_EXTEND) {
192 if (deleting & 1) ret[0]->s[i] |= DELETE;
193 if (deleting & 2) ret[1]->s[i] |= DELETE;
197 if (c < 4 && drand48() < MUT_RATE) { // mutation
198 if (drand48() >= INDEL_FRAC) { // substitution
199 double r = drand48();
200 c = (c + (int)(r * 3.0 + 1)) & 3;
201 if (is_hap || drand48() < 0.333333) { // hom
202 ret[0]->s[i] = ret[1]->s[i] = SUBSTITUTE|c;
204 ret[drand48()<0.5?0:1]->s[i] = SUBSTITUTE|c;
207 if (drand48() < 0.5) { // deletion
208 if (is_hap || drand48() < 0.333333) { // hom-del
209 ret[0]->s[i] = ret[1]->s[i] = DELETE;
212 deleting = drand48()<0.5?1:2;
213 ret[deleting-1]->s[i] = DELETE;
215 } else { // insertion
216 int num_ins = 0, ins = 0;
219 ins = (ins << 2) | (int)(drand48() * 4.0);
220 } while (num_ins < 4 && drand48() < INDEL_EXTEND);
222 if (is_hap || drand48() < 0.333333) { // hom-ins
223 ret[0]->s[i] = ret[1]->s[i] = (num_ins << 12) | (ins << 4) | c;
225 ret[drand48()<0.5?0:1]->s[i] = (num_ins << 12) | (ins << 4) | c;
232 void maq_print_mutref(const char *name, const seq_t *seq, mutseq_t *hap1, mutseq_t *hap2)
235 for (i = 0; i != seq->l; ++i) {
237 c[0] = nst_nt4_table[(int)seq->s[i]];
238 c[1] = hap1->s[i]; c[2] = hap2->s[i];
239 if (c[0] >= 4) continue;
240 if ((c[1] & mutmsk) != NOCHANGE || (c[1] & mutmsk) != NOCHANGE) {
241 printf("%s\t%d\t", name, i+1);
242 if (c[1] == c[2]) { // hom
243 if ((c[1]&mutmsk) == SUBSTITUTE) { // substitution
244 printf("%c\t%c\t-\n", "ACGTN"[c[0]], "ACGTN"[c[1]&0xf]);
245 } else if ((c[1]&mutmsk) == DELETE) { // del
246 printf("%c\t-\t-\n", "ACGTN"[c[0]]);
247 } else if (((c[1] & mutmsk) >> 12) <= 5) { // ins
249 int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4;
251 putchar("ACGTN"[ins & 0x3]);
257 if ((c[1]&mutmsk) == SUBSTITUTE || (c[2]&mutmsk) == SUBSTITUTE) { // substitution
258 printf("%c\t%c\t+\n", "ACGTN"[c[0]], "XACMGRSVTWYHKDBN"[1<<(c[1]&0x3)|1<<(c[2]&0x3)]);
259 } else if ((c[1]&mutmsk) == DELETE) {
260 printf("%c\t-\t+\n", "ACGTN"[c[0]]);
261 } else if ((c[2]&mutmsk) == DELETE) {
262 printf("%c\t-\t+\n", "ACGTN"[c[0]]);
263 } else if (((c[1] & mutmsk) >> 12) <= 4) { // ins1
265 int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4;
267 putchar("ACGTN"[ins & 0x3]);
271 } else if (((c[2] & mutmsk) >> 12) <= 5) { // ins2
273 int n = (c[2]&mutmsk) >> 12, ins = c[2] >> 4;
275 putchar("ACGTN"[ins & 0x3]);
286 void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N, int dist, int std_dev, int size_l, int size_r)
290 uint64_t tot_len, ii;
292 char name[256], *qstr;
299 seq_set_block_size(0x1000000);
300 l = size_l > size_r? size_l : size_r;
301 qstr = (char*)calloc(l+1, 1);
302 tmp_seq[0] = (uint8_t*)calloc(l+2, 1);
303 tmp_seq[1] = (uint8_t*)calloc(l+2, 1);
304 size[0] = size_l; size[1] = size_r;
306 Q = (int)(-10.0 * log(ERR_RATE) / log(10.0) + 0.499) + 33;
309 while ((l = seq_read_fasta(fp_fa, &seq, name, 0)) >= 0) {
313 fprintf(stderr, "[wgsim_core] %d sequences, total length: %llu\n", n_ref, (long long)tot_len);
316 while ((l = seq_read_fasta(fp_fa, &seq, name, 0)) >= 0) {
317 uint64_t n_pairs = (uint64_t)((long double)l / tot_len * N + 0.5);
318 if (l < dist + 3 * std_dev) {
319 fprintf(stderr, "[wgsim_core] kkip sequence '%s' as it is shorter than %d!\n", name, dist + 3 * std_dev);
323 // generate mutations and print them out
324 maq_mut_diref(&seq, is_hap, rseq, rseq+1);
325 maq_print_mutref(name, &seq, rseq, rseq+1);
327 for (ii = 0; ii != n_pairs; ++ii) { // the core loop
329 int d, pos, s[2], is_flip = 0;
330 int n_sub[2], n_indel[2], n_err[2], ext_coor[2], j, k;
333 do { // avoid boundary failure
335 ran = ran * std_dev + dist;
336 d = (int)(ran + 0.5);
337 pos = (int)((l - d + 1) * drand48());
338 } while (pos < 0 || pos >= seq.l || pos + d - 1 >= seq.l);
341 if (drand48() < 0.5) {
342 fpo[0] = fpout1; fpo[1] = fpout2;
343 s[0] = size[0]; s[1] = size[1];
345 fpo[1] = fpout1; fpo[0] = fpout2;
346 s[1] = size[0]; s[0] = size[1];
350 // generate the read sequences
351 target = rseq[drand48()<0.5?0:1].s; // haplotype from which the reads are generated
352 n_sub[0] = n_sub[1] = n_indel[0] = n_indel[1] = n_err[0] = n_err[1] = 0;
354 #define __gen_read(x, start, iter) do { \
355 for (i = (start), k = 0, ext_coor[x] = -10; i >= 0 && i < seq.l && k < s[x]; iter) { \
356 int c = target[i], mut_type = c & mutmsk; \
357 if (ext_coor[x] < 0) { \
358 if (mut_type != NOCHANGE && mut_type != SUBSTITUTE) continue; \
361 if (mut_type == DELETE) ++n_indel[x]; \
362 else if (mut_type == NOCHANGE || mut_type == SUBSTITUTE) { \
363 tmp_seq[x][k++] = c & 0xf; \
364 if (mut_type == SUBSTITUTE) ++n_sub[x]; \
368 for (n = mut_type>>12, ins = c>>4; n > 0 && k < s[x]; --n, ins >>= 2) \
369 tmp_seq[x][k++] = ins & 0x3; \
372 if (k != s[x]) ext_coor[x] = -10; \
376 __gen_read(0, pos, ++i);
377 __gen_read(1, pos + d - 1, --i);
378 for (k = 0; k < s[1]; ++k) tmp_seq[1][k] = tmp_seq[1][k] < 4? 3 - tmp_seq[1][k] : 4; // complement
381 ++s[0]; ++s[1]; // temporarily increase read length by 1
382 if (is_flip) { // RR pair
383 __gen_read(0, pos + s[0], --i);
384 __gen_read(1, pos + d - 1, --i);
386 __gen_read(0, pos, ++i);
387 __gen_read(1, pos + d - 1 - s[1], ++i);
388 ++ext_coor[0]; ++ext_coor[1];
390 // change to color sequence: (0,1,2,3) -> (A,C,G,T)
391 for (j = 0; j < 2; ++j) {
393 for (i = 1; i < s[j]; ++i) {
395 c = (c1 >= 4 || c2 >= 4)? 4 : nst_color_space_table[(1<<c1)|(1<<c2)];
400 --s[0]; --s[1]; // change back
402 if (ext_coor[0] < 0 || ext_coor[1] < 0) { // fail to generate the read(s)
407 // generate sequencing errors
408 for (j = 0; j < 2; ++j) {
409 for (i = 0; i < s[j]; ++i) {
410 int c = tmp_seq[j][i];
411 if (c >= 4) c = 4; // actually c should be never larger than 4 if everything is correct
412 else if (drand48() < ERR_RATE) {
413 c = (c + (int)(drand48() * 3.0 + 1)) & 3;
421 for (j = 0; j < 2; ++j) {
422 for (i = 0; i < s[j]; ++i) qstr[i] = Q;
424 fprintf(fpo[j], "@%s_%u_%u_%d:%d:%d_%d:%d:%d_%llx/%d\n", name, ext_coor[0]+1, ext_coor[1]+1,
425 n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1],
426 (long long)ii, j==0? is_flip+1 : 2-is_flip);
427 for (i = 0; i < s[j]; ++i)
428 fputc("ACGTN"[(int)tmp_seq[j][i]], fpo[j]);
429 fprintf(fpo[j], "\n+\n%s\n", qstr);
432 free(rseq[0].s); free(rseq[1].s);
434 free(seq.s); free(qstr);
435 free(tmp_seq[0]); free(tmp_seq[1]);
438 static int simu_usage()
440 fprintf(stderr, "\n");
441 fprintf(stderr, "Program: wgsim (short read simulator)\n");
442 fprintf(stderr, "Version: %s\n", PACKAGE_VERSION);
443 fprintf(stderr, "Contact: Heng Li <lh3@sanger.ac.uk>\n\n");
444 fprintf(stderr, "Usage: wgsim [options] <in.ref.fa> <out.read1.fq> <out.read2.fq>\n\n");
445 fprintf(stderr, "Options: -e FLOAT base error rate [%.3f]\n", ERR_RATE);
446 fprintf(stderr, " -d INT outer distance between the two ends [500]\n");
447 fprintf(stderr, " -s INT standard deviation [50]\n");
448 fprintf(stderr, " -N INT number of read pairs [1000000]\n");
449 fprintf(stderr, " -1 INT length of the first read [70]\n");
450 fprintf(stderr, " -2 INT length of the second read [70]\n");
451 fprintf(stderr, " -r FLOAT rate of mutations [%.4f]\n", MUT_RATE);
452 fprintf(stderr, " -R FLOAT fraction of indels [%.2f]\n", INDEL_FRAC);
453 fprintf(stderr, " -X FLOAT probability an indel is extended [%.2f]\n", INDEL_EXTEND);
454 fprintf(stderr, " -c generate reads in color space (SOLiD reads)\n");
455 fprintf(stderr, " -h haplotype mode\n");
456 fprintf(stderr, "\n");
457 fprintf(stderr, "Note: For SOLiD reads, the first read is F3 and the second is R3.\n\n");
461 int main(int argc, char *argv[])
464 int dist, std_dev, c, size_l, size_r, is_hap = 0;
465 FILE *fpout1, *fpout2, *fp_fa;
467 N = 1000000; dist = 500; std_dev = 50;
468 size_l = size_r = 70;
469 while ((c = getopt(argc, argv, "e:d:s:N:1:2:r:R:hX:c")) >= 0) {
471 case 'd': dist = atoi(optarg); break;
472 case 's': std_dev = atoi(optarg); break;
473 case 'N': N = atoi(optarg); break;
474 case '1': size_l = atoi(optarg); break;
475 case '2': size_r = atoi(optarg); break;
476 case 'e': ERR_RATE = atof(optarg); break;
477 case 'r': MUT_RATE = atof(optarg); break;
478 case 'R': INDEL_FRAC = atof(optarg); break;
479 case 'X': INDEL_EXTEND = atof(optarg); break;
480 case 'c': IS_SOLID = 1; break;
481 case 'h': is_hap = 1; break;
484 if (argc - optind < 3) return simu_usage();
485 fp_fa = (strcmp(argv[optind+0], "-") == 0)? stdin : xopen(argv[optind+0], "r");
486 fpout1 = xopen(argv[optind+1], "w");
487 fpout2 = xopen(argv[optind+2], "w");
488 wgsim_core(fpout1, fpout2, fp_fa, is_hap, N, dist, std_dev, size_l, size_r);
490 fclose(fpout1); fclose(fpout2); fclose(fp_fa);