3 Copyright (c) 2008 Genome Research Ltd (GRL).
5 Permission is hereby granted, free of charge, to any person obtaining
6 a copy of this software and associated documentation files (the
7 "Software"), to deal in the Software without restriction, including
8 without limitation the rights to use, copy, modify, merge, publish,
9 distribute, sublicense, and/or sell copies of the Software, and to
10 permit persons to whom the Software is furnished to do so, subject to
11 the following conditions:
13 The above copyright notice and this permission notice shall be
14 included in all copies or substantial portions of the Software.
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26 /* Contact: Heng Li <lh3@sanger.ac.uk> */
28 /* This program is separated from maq's read simulator with Colin
29 * Hercus' modification to allow longer indels. Colin is the chief
30 * developer of novoalign. */
42 uint8_t nst_nt4_table[256] = {
43 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
44 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
45 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4,
46 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
47 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
48 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
49 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
50 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
51 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
52 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
53 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
54 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
55 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
56 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
57 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
58 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
61 /* Simple normal random number generator, copied from genran.c */
67 double fac, rsq, v1, v2;
70 v1 = 2.0 * drand48() - 1.0;
71 v2 = 2.0 * drand48() - 1.0;
72 rsq = v1 * v1 + v2 * v2;
73 } while (rsq >= 1.0 || rsq == 0.0);
74 fac = sqrt(-2.0 * log(rsq) / rsq);
84 /* FASTA parser, copied from seq.c */
87 int l, m; /* length and maximum buffer size */
88 unsigned char *s; /* sequence */
91 #define INIT_SEQ(seq) (seq).s = 0; (seq).l = (seq).m = 0
93 static int SEQ_BLOCK_SIZE = 512;
95 void seq_set_block_size(int size)
97 SEQ_BLOCK_SIZE = size;
100 int seq_read_fasta(FILE *fp, seq_t *seq, char *locus, char *comment)
106 while (!feof(fp) && fgetc(fp) != '>');
107 if (feof(fp)) return -1;
109 while (!feof(fp) && (c = fgetc(fp)) != ' ' && c != '\t' && c != '\n')
110 if (c != '\r') *p++ = c;
115 while (!feof(fp) && ((c = fgetc(fp)) == ' ' || c == '\t'));
118 while (!feof(fp) && (c = fgetc(fp)) != '\n')
119 if (c != '\r') *p++ = c;
123 } else if (c != '\n') while (!feof(fp) && fgetc(fp) != '\n');
125 while (!feof(fp) && (c = fgetc(fp)) != '>') {
126 if (isalpha(c) || c == '-' || c == '.') {
128 max += SEQ_BLOCK_SIZE;
129 seq->s = (unsigned char*)realloc(seq->s, sizeof(char) * max);
131 seq->s[l++] = (unsigned char)c;
134 if (c == '>') ungetc(c,fp);
136 seq->m = max; seq->l = l;
140 /* Error-checking open, copied from utils.c */
142 #define xopen(fn, mode) err_xopen_core(__func__, fn, mode)
144 FILE *err_xopen_core(const char *func, const char *fn, const char *mode)
147 if (strcmp(fn, "-") == 0)
148 return (strstr(mode, "r"))? stdin : stdout;
149 if ((fp = fopen(fn, mode)) == 0) {
150 fprintf(stderr, "[%s] fail to open file '%s'. Abort!\n", func, fn);
158 enum muttype_t {NOCHANGE = 0, INSERT = 0x1000, SUBSTITUTE = 0xe000, DELETE = 0xf000};
159 typedef unsigned short mut_t;
160 static mut_t mutmsk = (unsigned short)0xf000;
164 int l, m; /* length and maximum buffer size */
165 mut_t *s; /* sequence */
168 static double ERR_RATE = 0.02;
169 static double MUT_RATE = 0.001;
170 static double INDEL_FRAC = 0.1;
171 static double INDEL_EXTEND = 0.3;
173 void maq_mut_diref(const seq_t *seq, int is_hap, mutseq_t *hap1, mutseq_t *hap2)
178 ret[0] = hap1; ret[1] = hap2;
179 ret[0]->l = seq->l; ret[1]->l = seq->l;
180 ret[0]->m = seq->m; ret[1]->m = seq->m;
181 ret[0]->s = (mut_t *)calloc(seq->m, sizeof(mut_t));
182 ret[1]->s = (mut_t *)calloc(seq->m, sizeof(mut_t));
183 for (i = 0; i != seq->l; ++i) {
185 c = ret[0]->s[i] = ret[1]->s[i] = (mut_t)nst_nt4_table[(int)seq->s[i]];
187 if (drand48() < INDEL_EXTEND) {
188 if (deleting & 1) ret[0]->s[i] |= DELETE;
189 if (deleting & 2) ret[1]->s[i] |= DELETE;
193 if (c < 4 && drand48() < MUT_RATE) { // mutation
194 if (drand48() >= INDEL_FRAC) { // substitution
195 double r = drand48();
196 c = (c + (int)(r * 3.0 + 1)) & 3;
197 if (is_hap || drand48() < 0.333333) { // hom
198 ret[0]->s[i] = ret[1]->s[i] = SUBSTITUTE|c;
200 ret[drand48()<0.5?0:1]->s[i] = SUBSTITUTE|c;
203 if (drand48() < 0.5) { // deletion
204 if (is_hap || drand48() < 0.333333) { // hom-del
205 ret[0]->s[i] = ret[1]->s[i] = DELETE;
208 deleting = drand48()<0.5?1:2;
209 ret[deleting-1]->s[i] = DELETE;
211 } else { // insertion
212 int num_ins = 0, ins = 0;
215 ins = (ins << 2) | (int)(drand48() * 4.0);
216 } while(num_ins < 4 && drand48() < INDEL_EXTEND);
218 if (is_hap || drand48() < 0.333333) { // hom-ins
219 ret[0]->s[i] = ret[1]->s[i] = (num_ins << 12) | (ins << 4) | c;
221 ret[drand48()<0.5?0:1]->s[i] = (num_ins << 12) | (ins << 4) | c;
228 void maq_print_mutref(const char *name, const seq_t *seq, mutseq_t *hap1, mutseq_t *hap2)
231 for (i = 0; i != seq->l; ++i) {
233 c[0] = nst_nt4_table[(int)seq->s[i]];
234 c[1] = hap1->s[i]; c[2] = hap2->s[i];
235 if (c[0] >= 4) continue;
236 if ((c[1] & mutmsk) != NOCHANGE || (c[1] & mutmsk) != NOCHANGE) {
237 printf("%s\t%d\t", name, i+1);
238 if (c[1] == c[2]) { // hom
239 if ((c[1]&mutmsk) == SUBSTITUTE) { // substitution
240 printf("%c\t%c\t-\n", "ACGTN"[c[0]], "ACGTN"[c[1]&0xf]);
241 } else if ((c[1]&mutmsk) == DELETE) { // del
242 printf("%c\t-\t-\n", "ACGTN"[c[0]]);
243 } else if (((c[1] & mutmsk) >> 12) <= 5) { // ins
245 int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4;
247 putchar("ACGTN"[ins & 0x3]);
253 if ((c[1]&mutmsk) == SUBSTITUTE || (c[2]&mutmsk) == SUBSTITUTE) { // substitution
254 printf("%c\t%c\t+\n", "ACGTN"[c[0]], "XACMGRSVTWYHKDBN"[1<<(c[1]&0x3)|1<<(c[2]&0x3)]);
255 } else if ((c[1]&mutmsk) == DELETE) {
256 printf("%c\t-\t+\n", "ACGTN"[c[0]]);
257 } else if ((c[2]&mutmsk) == DELETE) {
258 printf("%c\t-\t+\n", "ACGTN"[c[0]]);
259 } else if (((c[1] & mutmsk) >> 12) <= 4) { // ins1
261 int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4;
263 putchar("ACGTN"[ins & 0x3]);
267 } else if (((c[2] & mutmsk) >> 12) <= 5) { // ins2
269 int n = (c[2]&mutmsk) >> 12, ins = c[2] >> 4;
271 putchar("ACGTN"[ins & 0x3]);
282 void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N, int dist, int std_dev, int size_l, int size_r)
286 uint64_t tot_len, ii;
288 char name[256], *qstr, *str;
295 seq_set_block_size(0x1000000);
296 l = size_l > size_r? size_l : size_r;
297 qstr = (char*)calloc(l+1, 1);
298 str = (char*)calloc(l+1, 1);
299 tmp_seq[0] = (uint8_t*)calloc(l+1, 1);
300 tmp_seq[1] = (uint8_t*)calloc(l+1, 1);
301 size[0] = size_l; size[1] = size_r;
303 Q = (int)(-10.0 * log(ERR_RATE) / log(10.0) + 0.499) + 33;
306 while ((l = seq_read_fasta(fp_fa, &seq, name, 0)) >= 0) {
310 fprintf(stderr, "-- %d sequences, total length: %llu\n", n_ref, (long long)tot_len);
313 while ((l = seq_read_fasta(fp_fa, &seq, name, 0)) >= 0) {
314 uint64_t n_pairs = (uint64_t)((long double)l / tot_len * N + 0.5);
315 if (l < dist + 3 * std_dev) {
316 fprintf(stderr, "[wgsim_core] Skip sequence '%s' as it is shorter than %d!\n", name, dist + 3 * std_dev);
319 maq_mut_diref(&seq, is_hap, rseq, rseq+1);
320 maq_print_mutref(name, &seq, rseq, rseq+1);
321 for (ii = 0; ii != n_pairs; ++ii) {
323 int d, pos, s[2], begin, end, is_flip = 0;
327 ran = ran * std_dev + dist;
328 d = (int)(ran + 0.5);
329 pos = (int)((l - d + 1) * drand48());
330 } while (pos < 0 || pos >= seq.l || pos + d - 1 >= seq.l);
332 if (drand48() < 0.5) {
333 fpo[0] = fpout1; fpo[1] = fpout2;
334 s[0] = size[0]; s[1] = size[1];
336 fpo[1] = fpout1; fpo[0] = fpout2;
337 s[1] = size[0]; s[0] = size[1];
340 // generate the read sequences
341 target = rseq[drand48()<0.5?0:1].s; // haploid from which the reads are generated
342 for (i = pos, k = 0, begin = 0; i < seq.l && k < s[0]; ++i) {
344 int mut_type = c & mutmsk;
345 if (mut_type == DELETE) continue; // deletion
348 if (mut_type != NOCHANGE && mut_type != SUBSTITUTE) mut_type = NOCHANGE; // skip ins at the first base
350 if(mut_type == NOCHANGE || mut_type == SUBSTITUTE) {
351 tmp_seq[0][k++] = c&0xf;
354 int n = mut_type >> 12, ins = c >> 4;
356 tmp_seq[0][k++] = ins & 0x3;
361 tmp_seq[0][k++] = c&0xf;
363 for (i = pos + d - 1, k = 0, end = 0; i >= 0 && k < s[1]; --i) {
365 if ((c&mutmsk) == DELETE) continue; // deletion
366 if (end == 0) end = i;
367 tmp_seq[1][k++] = c&0xf;
368 if((c&mutmsk) == NOCHANGE || (c&mutmsk) == SUBSTITUTE) continue;
369 int n = (c&mutmsk) >> 12, ins = c >> 4;
371 if (k == s[1]) break;
372 tmp_seq[1][k++] = ins & 0x3;
377 // start to print out
378 if (!is_flip) sprintf(str, "%s_%u_%u_%llx", name, begin+1, end+1, (long long)ii);
379 else sprintf(str, "%s_%u_%u_%llx", name, begin+1, end+1, (long long)ii);
380 // print forward read
381 fprintf(fpo[0], "@%s/%d\n", str, is_flip+1);
382 for (i = 0; i < s[0]; ++i) {
383 int c = tmp_seq[0][i];
386 if (drand48() < ERR_RATE)
387 c = (c + (int)(drand48() * 3.0 + 1)) & 3;
389 fputc("ACGTN"[c], fpo[0]);
391 for (i = 0; i < s[0]; ++i) qstr[i] = Q;
393 fprintf(fpo[0], "\n+\n%s\n", qstr);
394 // print reverse read
395 fprintf(fpo[1], "@%s/%d\n", str, 2-is_flip);
396 for (i = 0; i < s[1]; ++i) {
397 int c = tmp_seq[1][i];
400 c = 3 - c; // complement
401 if (drand48() < ERR_RATE)
402 c = (c + (int)(drand48() * 3.0 + 1)) & 3;
404 fputc("ACGTN"[c], fpo[1]);
406 for (i = 0; i < s[1]; ++i) qstr[i] = Q;
408 fprintf(fpo[1], "\n+\n%s\n", qstr);
410 free(rseq[0].s); free(rseq[1].s);
413 free(qstr); free(str);
414 free(tmp_seq[0]); free(tmp_seq[1]);
417 static int simu_usage()
419 fprintf(stderr, "\n");
420 fprintf(stderr, "Usage: wgsim [options] <in.ref.fa> <out.read1.fq> <out.read2.fq>\n\n");
421 fprintf(stderr, "Options: -e FLOAT base error rate [%.3f]\n", ERR_RATE);
422 fprintf(stderr, " -d INT outer distance between the two ends [500]\n");
423 fprintf(stderr, " -s INT standard deviation [50]\n");
424 fprintf(stderr, " -N INT number of read pairs [1000000]\n");
425 fprintf(stderr, " -1 INT length of the first read [70]\n");
426 fprintf(stderr, " -2 INT length of the second read [70]\n");
427 fprintf(stderr, " -r FLOAT rate of mutations [%.4f]\n", MUT_RATE);
428 fprintf(stderr, " -R FLOAT fraction of indels [%.2f]\n", INDEL_FRAC);
429 fprintf(stderr, " -X FLOAT probability an indel is extended [%.2f]\n", INDEL_EXTEND);
430 fprintf(stderr, " -h haploid mode\n");
431 fprintf(stderr, "\n");
435 int main(int argc, char *argv[])
438 int dist, std_dev, c, size_l, size_r, is_hap = 0;
439 FILE *fpout1, *fpout2, *fp_fa;
441 N = 1000000; dist = 500; std_dev = 50;
442 size_l = size_r = 70;
443 while ((c = getopt(argc, argv, "e:d:s:N:1:2:r:R:hX:")) >= 0) {
445 case 'd': dist = atoi(optarg); break;
446 case 's': std_dev = atoi(optarg); break;
447 case 'N': N = atoi(optarg); break;
448 case '1': size_l = atoi(optarg); break;
449 case '2': size_r = atoi(optarg); break;
450 case 'e': ERR_RATE = atof(optarg); break;
451 case 'r': MUT_RATE = atof(optarg); break;
452 case 'R': INDEL_FRAC = atof(optarg); break;
453 case 'X': INDEL_EXTEND = atof(optarg); break;
454 case 'h': is_hap = 1; break;
457 if (argc - optind < 3) return simu_usage();
458 fp_fa = (strcmp(argv[optind+0], "-") == 0)? stdin : xopen(argv[optind+0], "r");
459 fpout1 = xopen(argv[optind+1], "w");
460 fpout2 = xopen(argv[optind+2], "w");
461 wgsim_core(fpout1, fpout2, fp_fa, is_hap, N, dist, std_dev, size_l, size_r);
463 fclose(fpout1); fclose(fpout2); fclose(fp_fa);