3 Copyright (c) 2008 Genome Research Ltd (GRL).
5 Permission is hereby granted, free of charge, to any person obtaining
6 a copy of this software and associated documentation files (the
7 "Software"), to deal in the Software without restriction, including
8 without limitation the rights to use, copy, modify, merge, publish,
9 distribute, sublicense, and/or sell copies of the Software, and to
10 permit persons to whom the Software is furnished to do so, subject to
11 the following conditions:
13 The above copyright notice and this permission notice shall be
14 included in all copies or substantial portions of the Software.
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26 /* Contact: Heng Li <lh3@sanger.ac.uk> */
28 /* This program is separated from maq's read simulator with Colin
29 * Hercus' modification to allow longer indels. Colin is the chief
30 * developer of novoalign. */
42 #define PACKAGE_VERSION "0.2.2"
44 const uint8_t nst_nt4_table[256] = {
45 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
46 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
47 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4,
48 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
49 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
50 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
51 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
52 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
53 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
54 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
55 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
56 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
57 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
58 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
59 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
60 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
63 const int nst_color_space_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4};
65 /* Simple normal random number generator, copied from genran.c */
71 double fac, rsq, v1, v2;
74 v1 = 2.0 * drand48() - 1.0;
75 v2 = 2.0 * drand48() - 1.0;
76 rsq = v1 * v1 + v2 * v2;
77 } while (rsq >= 1.0 || rsq == 0.0);
78 fac = sqrt(-2.0 * log(rsq) / rsq);
88 /* FASTA parser, copied from seq.c */
91 int l, m; /* length and maximum buffer size */
92 unsigned char *s; /* sequence */
95 #define INIT_SEQ(seq) (seq).s = 0; (seq).l = (seq).m = 0
97 static int SEQ_BLOCK_SIZE = 512;
99 void seq_set_block_size(int size)
101 SEQ_BLOCK_SIZE = size;
104 int seq_read_fasta(FILE *fp, seq_t *seq, char *locus, char *comment)
110 while (!feof(fp) && fgetc(fp) != '>');
111 if (feof(fp)) return -1;
113 while (!feof(fp) && (c = fgetc(fp)) != ' ' && c != '\t' && c != '\n')
114 if (c != '\r') *p++ = c;
119 while (!feof(fp) && ((c = fgetc(fp)) == ' ' || c == '\t'));
122 while (!feof(fp) && (c = fgetc(fp)) != '\n')
123 if (c != '\r') *p++ = c;
127 } else if (c != '\n') while (!feof(fp) && fgetc(fp) != '\n');
129 while (!feof(fp) && (c = fgetc(fp)) != '>') {
130 if (isalpha(c) || c == '-' || c == '.') {
132 max += SEQ_BLOCK_SIZE;
133 seq->s = (unsigned char*)realloc(seq->s, sizeof(char) * max);
135 seq->s[l++] = (unsigned char)c;
138 if (c == '>') ungetc(c,fp);
140 seq->m = max; seq->l = l;
144 /* Error-checking open, copied from utils.c */
146 #define xopen(fn, mode) err_xopen_core(__func__, fn, mode)
148 FILE *err_xopen_core(const char *func, const char *fn, const char *mode)
151 if (strcmp(fn, "-") == 0)
152 return (strstr(mode, "r"))? stdin : stdout;
153 if ((fp = fopen(fn, mode)) == 0) {
154 fprintf(stderr, "[%s] fail to open file '%s'. Abort!\n", func, fn);
162 enum muttype_t {NOCHANGE = 0, INSERT = 0x1000, SUBSTITUTE = 0xe000, DELETE = 0xf000};
163 typedef unsigned short mut_t;
164 static mut_t mutmsk = (mut_t)0xf000;
167 int l, m; /* length and maximum buffer size */
168 mut_t *s; /* sequence */
171 static double ERR_RATE = 0.02;
172 static double MUT_RATE = 0.001;
173 static double INDEL_FRAC = 0.1;
174 static double INDEL_EXTEND = 0.3;
175 static int IS_SOLID = 0;
176 static int SHOW_MM_INFO = 1;
178 void maq_mut_diref(const seq_t *seq, int is_hap, mutseq_t *hap1, mutseq_t *hap2)
183 ret[0] = hap1; ret[1] = hap2;
184 ret[0]->l = seq->l; ret[1]->l = seq->l;
185 ret[0]->m = seq->m; ret[1]->m = seq->m;
186 ret[0]->s = (mut_t *)calloc(seq->m, sizeof(mut_t));
187 ret[1]->s = (mut_t *)calloc(seq->m, sizeof(mut_t));
188 for (i = 0; i != seq->l; ++i) {
190 c = ret[0]->s[i] = ret[1]->s[i] = (mut_t)nst_nt4_table[(int)seq->s[i]];
192 if (drand48() < INDEL_EXTEND) {
193 if (deleting & 1) ret[0]->s[i] |= DELETE;
194 if (deleting & 2) ret[1]->s[i] |= DELETE;
198 if (c < 4 && drand48() < MUT_RATE) { // mutation
199 if (drand48() >= INDEL_FRAC) { // substitution
200 double r = drand48();
201 c = (c + (int)(r * 3.0 + 1)) & 3;
202 if (is_hap || drand48() < 0.333333) { // hom
203 ret[0]->s[i] = ret[1]->s[i] = SUBSTITUTE|c;
205 ret[drand48()<0.5?0:1]->s[i] = SUBSTITUTE|c;
208 if (drand48() < 0.5) { // deletion
209 if (is_hap || drand48() < 0.333333) { // hom-del
210 ret[0]->s[i] = ret[1]->s[i] = DELETE;
213 deleting = drand48()<0.5?1:2;
214 ret[deleting-1]->s[i] = DELETE;
216 } else { // insertion
217 int num_ins = 0, ins = 0;
220 ins = (ins << 2) | (int)(drand48() * 4.0);
221 } while (num_ins < 4 && drand48() < INDEL_EXTEND);
223 if (is_hap || drand48() < 0.333333) { // hom-ins
224 ret[0]->s[i] = ret[1]->s[i] = (num_ins << 12) | (ins << 4) | c;
226 ret[drand48()<0.5?0:1]->s[i] = (num_ins << 12) | (ins << 4) | c;
233 void maq_print_mutref(const char *name, const seq_t *seq, mutseq_t *hap1, mutseq_t *hap2)
236 for (i = 0; i != seq->l; ++i) {
238 c[0] = nst_nt4_table[(int)seq->s[i]];
239 c[1] = hap1->s[i]; c[2] = hap2->s[i];
240 if (c[0] >= 4) continue;
241 if ((c[1] & mutmsk) != NOCHANGE || (c[1] & mutmsk) != NOCHANGE) {
242 printf("%s\t%d\t", name, i+1);
243 if (c[1] == c[2]) { // hom
244 if ((c[1]&mutmsk) == SUBSTITUTE) { // substitution
245 printf("%c\t%c\t-\n", "ACGTN"[c[0]], "ACGTN"[c[1]&0xf]);
246 } else if ((c[1]&mutmsk) == DELETE) { // del
247 printf("%c\t-\t-\n", "ACGTN"[c[0]]);
248 } else if (((c[1] & mutmsk) >> 12) <= 5) { // ins
250 int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4;
252 putchar("ACGTN"[ins & 0x3]);
258 if ((c[1]&mutmsk) == SUBSTITUTE || (c[2]&mutmsk) == SUBSTITUTE) { // substitution
259 printf("%c\t%c\t+\n", "ACGTN"[c[0]], "XACMGRSVTWYHKDBN"[1<<(c[1]&0x3)|1<<(c[2]&0x3)]);
260 } else if ((c[1]&mutmsk) == DELETE) {
261 printf("%c\t-\t+\n", "ACGTN"[c[0]]);
262 } else if ((c[2]&mutmsk) == DELETE) {
263 printf("%c\t-\t+\n", "ACGTN"[c[0]]);
264 } else if (((c[1] & mutmsk) >> 12) <= 4) { // ins1
266 int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4;
268 putchar("ACGTN"[ins & 0x3]);
272 } else if (((c[2] & mutmsk) >> 12) <= 5) { // ins2
274 int n = (c[2]&mutmsk) >> 12, ins = c[2] >> 4;
276 putchar("ACGTN"[ins & 0x3]);
287 void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N, int dist, int std_dev, int size_l, int size_r)
291 uint64_t tot_len, ii;
293 char name[256], *qstr;
300 seq_set_block_size(0x1000000);
301 l = size_l > size_r? size_l : size_r;
302 qstr = (char*)calloc(l+1, 1);
303 tmp_seq[0] = (uint8_t*)calloc(l+2, 1);
304 tmp_seq[1] = (uint8_t*)calloc(l+2, 1);
305 size[0] = size_l; size[1] = size_r;
307 Q = (int)(-10.0 * log(ERR_RATE) / log(10.0) + 0.499) + 33;
310 while ((l = seq_read_fasta(fp_fa, &seq, name, 0)) >= 0) {
314 fprintf(stderr, "[wgsim_core] %d sequences, total length: %llu\n", n_ref, (long long)tot_len);
317 while ((l = seq_read_fasta(fp_fa, &seq, name, 0)) >= 0) {
318 uint64_t n_pairs = (uint64_t)((long double)l / tot_len * N + 0.5);
319 if (l < dist + 3 * std_dev) {
320 fprintf(stderr, "[wgsim_core] kkip sequence '%s' as it is shorter than %d!\n", name, dist + 3 * std_dev);
324 // generate mutations and print them out
325 maq_mut_diref(&seq, is_hap, rseq, rseq+1);
326 maq_print_mutref(name, &seq, rseq, rseq+1);
328 for (ii = 0; ii != n_pairs; ++ii) { // the core loop
330 int d, pos, s[2], is_flip = 0;
331 int n_sub[2], n_indel[2], n_err[2], ext_coor[2], j, k;
334 do { // avoid boundary failure
336 ran = ran * std_dev + dist;
337 d = (int)(ran + 0.5);
338 pos = (int)((l - d + 1) * drand48());
339 } while (pos < 0 || pos >= seq.l || pos + d - 1 >= seq.l);
342 if (drand48() < 0.5) {
343 fpo[0] = fpout1; fpo[1] = fpout2;
344 s[0] = size[0]; s[1] = size[1];
346 fpo[1] = fpout1; fpo[0] = fpout2;
347 s[1] = size[0]; s[0] = size[1];
351 // generate the read sequences
352 target = rseq[drand48()<0.5?0:1].s; // haplotype from which the reads are generated
353 n_sub[0] = n_sub[1] = n_indel[0] = n_indel[1] = n_err[0] = n_err[1] = 0;
355 #define __gen_read(x, start, iter) do { \
356 for (i = (start), k = 0, ext_coor[x] = -10; i >= 0 && i < seq.l && k < s[x]; iter) { \
357 int c = target[i], mut_type = c & mutmsk; \
358 if (ext_coor[x] < 0) { \
359 if (mut_type != NOCHANGE && mut_type != SUBSTITUTE) continue; \
362 if (mut_type == DELETE) ++n_indel[x]; \
363 else if (mut_type == NOCHANGE || mut_type == SUBSTITUTE) { \
364 tmp_seq[x][k++] = c & 0xf; \
365 if (mut_type == SUBSTITUTE) ++n_sub[x]; \
369 for (n = mut_type>>12, ins = c>>4; n > 0 && k < s[x]; --n, ins >>= 2) \
370 tmp_seq[x][k++] = ins & 0x3; \
373 if (k != s[x]) ext_coor[x] = -10; \
377 __gen_read(0, pos, ++i);
378 __gen_read(1, pos + d - 1, --i);
379 for (k = 0; k < s[1]; ++k) tmp_seq[1][k] = tmp_seq[1][k] < 4? 3 - tmp_seq[1][k] : 4; // complement
382 ++s[0]; ++s[1]; // temporarily increase read length by 1
383 if (is_flip) { // RR pair
384 __gen_read(0, pos + s[0], --i);
385 __gen_read(1, pos + d - 1, --i);
387 __gen_read(0, pos, ++i);
388 __gen_read(1, pos + d - 1 - s[1], ++i);
389 ++ext_coor[0]; ++ext_coor[1];
391 // change to color sequence: (0,1,2,3) -> (A,C,G,T)
392 for (j = 0; j < 2; ++j) {
394 for (i = 1; i < s[j]; ++i) {
396 c = (c1 >= 4 || c2 >= 4)? 4 : nst_color_space_table[(1<<c1)|(1<<c2)];
401 --s[0]; --s[1]; // change back
403 if (ext_coor[0] < 0 || ext_coor[1] < 0) { // fail to generate the read(s)
408 // generate sequencing errors
409 for (j = 0; j < 2; ++j) {
410 for (i = 0; i < s[j]; ++i) {
411 int c = tmp_seq[j][i];
412 if (c >= 4) c = 4; // actually c should be never larger than 4 if everything is correct
413 else if (drand48() < ERR_RATE) {
414 c = (c + (int)(drand48() * 3.0 + 1)) & 3;
422 for (j = 0; j < 2; ++j) {
423 for (i = 0; i < s[j]; ++i) qstr[i] = Q;
426 fprintf(fpo[j], "@%s_%u_%u_%d:%d:%d_%d:%d:%d_%llx/%d\n", name, ext_coor[0]+1, ext_coor[1]+1,
427 n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1],
428 (long long)ii, j==0? is_flip+1 : 2-is_flip);
430 fprintf(fpo[j], "@%s_%u_%u_%llx/%d %d:%d:%d_%d:%d:%d\n", name, ext_coor[0]+1, ext_coor[1]+1,
431 (long long)ii, j==0? is_flip+1 : 2-is_flip,
432 n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1]);
434 for (i = 0; i < s[j]; ++i)
435 fputc("ACGTN"[(int)tmp_seq[j][i]], fpo[j]);
436 fprintf(fpo[j], "\n+\n%s\n", qstr);
439 free(rseq[0].s); free(rseq[1].s);
441 free(seq.s); free(qstr);
442 free(tmp_seq[0]); free(tmp_seq[1]);
445 static int simu_usage()
447 fprintf(stderr, "\n");
448 fprintf(stderr, "Program: wgsim (short read simulator)\n");
449 fprintf(stderr, "Version: %s\n", PACKAGE_VERSION);
450 fprintf(stderr, "Contact: Heng Li <lh3@sanger.ac.uk>\n\n");
451 fprintf(stderr, "Usage: wgsim [options] <in.ref.fa> <out.read1.fq> <out.read2.fq>\n\n");
452 fprintf(stderr, "Options: -e FLOAT base error rate [%.3f]\n", ERR_RATE);
453 fprintf(stderr, " -d INT outer distance between the two ends [500]\n");
454 fprintf(stderr, " -s INT standard deviation [50]\n");
455 fprintf(stderr, " -N INT number of read pairs [1000000]\n");
456 fprintf(stderr, " -1 INT length of the first read [70]\n");
457 fprintf(stderr, " -2 INT length of the second read [70]\n");
458 fprintf(stderr, " -r FLOAT rate of mutations [%.4f]\n", MUT_RATE);
459 fprintf(stderr, " -R FLOAT fraction of indels [%.2f]\n", INDEL_FRAC);
460 fprintf(stderr, " -X FLOAT probability an indel is extended [%.2f]\n", INDEL_EXTEND);
461 fprintf(stderr, " -c generate reads in color space (SOLiD reads)\n");
462 fprintf(stderr, " -C show mismatch info in comment rather than read name\n");
463 fprintf(stderr, " -h haplotype mode\n");
464 fprintf(stderr, "\n");
465 fprintf(stderr, "Note: For SOLiD reads, the first read is F3 and the second is R3.\n\n");
469 int main(int argc, char *argv[])
472 int dist, std_dev, c, size_l, size_r, is_hap = 0;
473 FILE *fpout1, *fpout2, *fp_fa;
475 N = 1000000; dist = 500; std_dev = 50;
476 size_l = size_r = 70;
477 while ((c = getopt(argc, argv, "e:d:s:N:1:2:r:R:hX:cC")) >= 0) {
479 case 'd': dist = atoi(optarg); break;
480 case 's': std_dev = atoi(optarg); break;
481 case 'N': N = atoi(optarg); break;
482 case '1': size_l = atoi(optarg); break;
483 case '2': size_r = atoi(optarg); break;
484 case 'e': ERR_RATE = atof(optarg); break;
485 case 'r': MUT_RATE = atof(optarg); break;
486 case 'R': INDEL_FRAC = atof(optarg); break;
487 case 'X': INDEL_EXTEND = atof(optarg); break;
488 case 'c': IS_SOLID = 1; break;
489 case 'C': SHOW_MM_INFO = 0; break;
490 case 'h': is_hap = 1; break;
493 if (argc - optind < 3) return simu_usage();
494 fp_fa = (strcmp(argv[optind+0], "-") == 0)? stdin : xopen(argv[optind+0], "r");
495 fpout1 = xopen(argv[optind+1], "w");
496 fpout2 = xopen(argv[optind+2], "w");
497 wgsim_core(fpout1, fpout2, fp_fa, is_hap, N, dist, std_dev, size_l, size_r);
499 fclose(fpout1); fclose(fpout2); fclose(fp_fa);