From 38c2703b74ff9a045933206123c86c4e8f999d14 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 22 Jan 2009 15:36:48 +0000 Subject: [PATCH] * merge from branches/dev/ * all future development will happen here --- misc/Makefile | 5 +- misc/bowtie2sam.pl | 92 +++++++++ misc/export2sam.pl | 4 +- misc/soap2sam.pl | 103 ++++++++++ misc/wgsim.c | 465 +++++++++++++++++++++++++++++++++++++++++++++ misc/wgsim_eval.pl | 60 ++++++ 6 files changed, 726 insertions(+), 3 deletions(-) create mode 100755 misc/bowtie2sam.pl create mode 100755 misc/soap2sam.pl create mode 100644 misc/wgsim.c create mode 100755 misc/wgsim_eval.pl diff --git a/misc/Makefile b/misc/Makefile index f4e50ff..de37cd4 100644 --- a/misc/Makefile +++ b/misc/Makefile @@ -4,7 +4,7 @@ CFLAGS= -g -Wall -O2 -m64 #-arch ppc CXXFLAGS= $(CFLAGS) DFLAGS= #-D_FILE_OFFSET_BITS=64 OBJS= -PROG= faidx md5sum-lite md5fa maq2sam-short maq2sam-long +PROG= faidx md5sum-lite md5fa maq2sam-short maq2sam-long wgsim INCLUDES= LIBS= -lm -lz SUBDIRS= . @@ -28,6 +28,9 @@ lib-recur all-recur clean-recur cleanlocal-recur install-recur: lib: +wgsim:wgsim.o + $(CC) $(CFLAGS) -o $@ wgsim.o -lm + faidx:../faidx.c ../faidx.h $(CC) $(CFLAGS) -D_NO_RAZF -DFAIDX_MAIN -o $@ ../faidx.c diff --git a/misc/bowtie2sam.pl b/misc/bowtie2sam.pl new file mode 100755 index 0000000..4ead839 --- /dev/null +++ b/misc/bowtie2sam.pl @@ -0,0 +1,92 @@ +#!/usr/bin/perl -w + +# Contact: lh3 +# Version: 0.1.1 + +use strict; +use warnings; +use Getopt::Std; + +&bowtie2sam; +exit; + +sub bowtie2sam { + my %opts = (); + die("Usage: bowtie2sam.pl \n") if (@ARGV == 0); + # core loop + my (@s, $last, @staging, $k, $best_s, $subbest_s, $best_k); + $last = ''; + while (<>) { + my ($name, $nm) = &bowtie2sam_aux($_, \@s); # read_name, number of mismatches + if ($name eq $last) { + # I do not know whether the multiple hits are ordered on the + # number of mismatches. I assume they are not and so I have to + # keep all these multiple hits in memory. + @{$staging[$k]} = @s; + if ($best_s > $nm) { + $subbest_s = $best_s; + $best_s = $nm; + $best_k = $k; + } elsif ($subbest_s > $nm) { + $subbest_s = $nm; + } + ++$k; + } else { + if ($last) { + if ($best_s == $subbest_s) { + $staging[$best_k][4] = 0; + } elsif ($subbest_s - $best_s == 1) { + $staging[$best_k][4] = 15 if ($staging[$best_k][4] > 15); + } + print join("\t", @{$staging[$best_k]}), "\n"; + } + $k = 1; $best_s = $nm; $subbest_s = 1000; $best_k = 0; + @{$staging[0]} = @s; + $last = $name; + } + } + print join("\t", @{$staging[$best_k]}), "\n" if ($best_k >= 0); +} + +sub bowtie2sam_aux { + my ($line, $s) = @_; + chomp($line); + my @t = split("\t", $line); + my $ret; + @$s = (); + # read name + $s->[0] = $ret = $t[0]; + $s->[0] =~ s/\/[12]$//g; + # initial flag (will be updated later) + $s->[1] = 0; + # read & quality + $s->[9] = $t[4]; $s->[10] = $t[5]; + # cigar + $s->[5] = length($s->[9]) . "M"; + # coor + $s->[2] = $t[2]; $s->[3] = $t[3] + 1; + $s->[1] |= 0x10 if ($t[1] eq '-'); + # mapQ + $s->[4] = $t[6] == 0? 25 : 0; + # mate coordinate + $s->[6] = '*'; $s->[7] = $s->[8] = 0; + # aux + my $nm = @t - 7; + push(@$s, "NM:i:" . (@t-7)); + push(@$s, "X$nm:i:" . ($t[6]+1)); + my $md = ''; + if ($t[7]) { + $_ = $t[7]; + my $a = 0; + while (/(\d+):[ACGTN]>([ACGTN])/gi) { + my ($y, $z) = ($1, $2); + $md .= (int($y)-$a) . $z; + $a += $y - $a + 1; + } + $md .= length($s->[9]) - $a; + } else { + $md = length($s->[9]); + } + push(@$s, "MD:Z:$md"); + return ($ret, $nm); +} diff --git a/misc/export2sam.pl b/misc/export2sam.pl index ae82123..3cd87ae 100755 --- a/misc/export2sam.pl +++ b/misc/export2sam.pl @@ -1,7 +1,7 @@ #!/usr/bin/perl -w # Contact: lh3 -# Version: 0.1.0 +# Version: 0.1.1 (03JAN2009) use strict; use warnings; @@ -83,7 +83,7 @@ sub export2sam_aux { # coor my $has_coor = 0; $s->[2] = "*"; - if ($t[10] eq 'NM') { + if ($t[10] eq 'NM' || $t[10] eq 'QC') { $s->[1] |= 0x8; # unmapped } elsif ($t[10] =~ /(\d+):(\d+):(\d+)/) { $s->[1] |= 0x8; # TODO: should I set BAM_FUNMAP in this case? diff --git a/misc/soap2sam.pl b/misc/soap2sam.pl new file mode 100755 index 0000000..0f99987 --- /dev/null +++ b/misc/soap2sam.pl @@ -0,0 +1,103 @@ +#!/usr/bin/perl -w + +# Contact: lh3 +# Version: 0.1.0 + +use strict; +use warnings; +use Getopt::Std; + +&soap2sam; +exit; + +sub mating { + my ($s1, $s2) = @_; + my $isize = 0; + if ($s1->[2] ne '*' && $s1->[2] eq $s2->[2]) { # then calculate $isize + my $x1 = ($s1->[1] & 0x10)? $s1->[3] + length($s1->[9]) : $s1->[3]; + my $x2 = ($s2->[1] & 0x10)? $s2->[3] + length($s2->[9]) : $s2->[3]; + $isize = $x2 - $x1; + } + # update mate coordinate + if ($s2->[2] ne '*') { + @$s1[6..8] = (($s2->[2] eq $s1->[2])? "=" : $s2->[2], $s2->[3], $isize); + $s1->[1] |= 0x20 if ($s2->[1] & 0x10); + } else { + $s1->[1] |= 0x8; + } + if ($s1->[2] ne '*') { + @$s2[6..8] = (($s1->[2] eq $s2->[2])? "=" : $s1->[2], $s1->[3], -$isize); + $s2->[1] |= 0x20 if ($s1->[1] & 0x10); + } else { + $s2->[1] |= 0x8; + } +} + +sub soap2sam { + my %opts = (); + getopts("p", \%opts); + die("Usage: soap2sam.pl [-p] \n") if (@ARGV == 0); + my $is_paired = defined($opts{p}); + # core loop + my @s1 = (); + my @s2 = (); + my ($s_last, $s_curr) = (\@s1, \@s2); + while (<>) { + &soap2sam_aux($_, $s_curr, $is_paired); + if (@$s_last != 0 && $s_last->[0] eq $s_curr->[0]) { + &mating($s_last, $s_curr); + print join("\t", @$s_last), "\n"; + print join("\t", @$s_curr), "\n"; + @$s_last = (); @$s_curr = (); + } else { + print join("\t", @$s_last), "\n" if (@$s_last != 0); + my $s = $s_last; $s_last = $s_curr; $s_curr = $s; + } + } + print join("\t", @$s_last), "\n" if (@$s_last != 0); +} + +sub soap2sam_aux { + my ($line, $s, $is_paired) = @_; + chomp($line); + my @t = split("\t", $line); + @$s = (); + # read name + $s->[0] = $t[0]; + $s->[0] =~ s/\/[12]$//g; + # initial flag (will be updated later) + $s->[1] = 0; + $s->[1] |= 1 | 1<<($t[4] eq 'a'? 6 : 7); + $s->[1] |= 2 if ($is_paired); + # read & quality + $s->[9] = $t[1]; $s->[10] = $t[2]; + # cigar + $s->[5] = length($s->[9]) . "M"; + # coor + $s->[2] = $t[7]; $s->[3] = $t[8]; + $s->[1] |= 0x10 if ($t[6] eq '-'); + # mapQ + $s->[4] = $t[3] == 1? 30 : 0; + # mate coordinate + $s->[6] = '*'; $s->[7] = $s->[8] = 0; + # aux + push(@$s, "NM:i:$t[9]"); + my $md = ''; + if ($t[9]) { + my @x; + for (10 .. $#t) { + push(@x, sprintf("%.3d,$1", $2)) if ($t[$_] =~ /^([ACGT])->(\d+)/i); + } + @x = sort(@x); + my $a = 0; + for (@x) { + my ($y, $z) = split(","); + $md .= (int($y)-$a) . $z; + $a += $y - $a + 1; + } + $md .= length($t[1]) - $a; + } else { + $md = length($t[1]); + } + push(@$s, "MD:Z:$md"); +} diff --git a/misc/wgsim.c b/misc/wgsim.c new file mode 100644 index 0000000..685a8fd --- /dev/null +++ b/misc/wgsim.c @@ -0,0 +1,465 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + +/* This program is separated from maq's read simulator with Colin + * Hercus' modification to allow longer indels. Colin is the chief + * developer of novoalign. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +uint8_t nst_nt4_table[256] = { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 +}; + +/* Simple normal random number generator, copied from genran.c */ + +double ran_normal() +{ + static int iset = 0; + static double gset; + double fac, rsq, v1, v2; + if (iset == 0) { + do { + v1 = 2.0 * drand48() - 1.0; + v2 = 2.0 * drand48() - 1.0; + rsq = v1 * v1 + v2 * v2; + } while (rsq >= 1.0 || rsq == 0.0); + fac = sqrt(-2.0 * log(rsq) / rsq); + gset = v1 * fac; + iset = 1; + return v2 * fac; + } else { + iset = 0; + return gset; + } +} + +/* FASTA parser, copied from seq.c */ + +typedef struct { + int l, m; /* length and maximum buffer size */ + unsigned char *s; /* sequence */ +} seq_t; + +#define INIT_SEQ(seq) (seq).s = 0; (seq).l = (seq).m = 0 + +static int SEQ_BLOCK_SIZE = 512; + +void seq_set_block_size(int size) +{ + SEQ_BLOCK_SIZE = size; +} + +int seq_read_fasta(FILE *fp, seq_t *seq, char *locus, char *comment) +{ + int c, l, max; + char *p; + + c = 0; + while (!feof(fp) && fgetc(fp) != '>'); + if (feof(fp)) return -1; + p = locus; + while (!feof(fp) && (c = fgetc(fp)) != ' ' && c != '\t' && c != '\n') + if (c != '\r') *p++ = c; + *p = '\0'; + if (comment) { + p = comment; + if (c != '\n') { + while (!feof(fp) && ((c = fgetc(fp)) == ' ' || c == '\t')); + if (c != '\n') { + *p++ = c; + while (!feof(fp) && (c = fgetc(fp)) != '\n') + if (c != '\r') *p++ = c; + } + } + *p = '\0'; + } else if (c != '\n') while (!feof(fp) && fgetc(fp) != '\n'); + l = 0; max = seq->m; + while (!feof(fp) && (c = fgetc(fp)) != '>') { + if (isalpha(c) || c == '-' || c == '.') { + if (l + 1 >= max) { + max += SEQ_BLOCK_SIZE; + seq->s = (unsigned char*)realloc(seq->s, sizeof(char) * max); + } + seq->s[l++] = (unsigned char)c; + } + } + if (c == '>') ungetc(c,fp); + seq->s[l] = 0; + seq->m = max; seq->l = l; + return l; +} + +/* Error-checking open, copied from utils.c */ + +#define xopen(fn, mode) err_xopen_core(__func__, fn, mode) + +FILE *err_xopen_core(const char *func, const char *fn, const char *mode) +{ + FILE *fp = 0; + if (strcmp(fn, "-") == 0) + return (strstr(mode, "r"))? stdin : stdout; + if ((fp = fopen(fn, mode)) == 0) { + fprintf(stderr, "[%s] fail to open file '%s'. Abort!\n", func, fn); + abort(); + } + return fp; +} + +/* wgsim */ + +enum muttype_t {NOCHANGE = 0, INSERT = 0x1000, SUBSTITUTE = 0xe000, DELETE = 0xf000}; +typedef unsigned short mut_t; +static mut_t mutmsk = (unsigned short)0xf000; + +typedef struct +{ + int l, m; /* length and maximum buffer size */ + mut_t *s; /* sequence */ +} mutseq_t; + +static double ERR_RATE = 0.02; +static double MUT_RATE = 0.001; +static double INDEL_FRAC = 0.1; +static double INDEL_EXTEND = 0.3; + +void maq_mut_diref(const seq_t *seq, int is_hap, mutseq_t *hap1, mutseq_t *hap2) +{ + int i, deleting = 0; + mutseq_t *ret[2]; + + ret[0] = hap1; ret[1] = hap2; + ret[0]->l = seq->l; ret[1]->l = seq->l; + ret[0]->m = seq->m; ret[1]->m = seq->m; + ret[0]->s = (mut_t *)calloc(seq->m, sizeof(mut_t)); + ret[1]->s = (mut_t *)calloc(seq->m, sizeof(mut_t)); + for (i = 0; i != seq->l; ++i) { + int c; + c = ret[0]->s[i] = ret[1]->s[i] = (mut_t)nst_nt4_table[(int)seq->s[i]]; + if (deleting) { + if (drand48() < INDEL_EXTEND) { + if (deleting & 1) ret[0]->s[i] |= DELETE; + if (deleting & 2) ret[1]->s[i] |= DELETE; + continue; + } else deleting = 0; + } + if (c < 4 && drand48() < MUT_RATE) { // mutation + if (drand48() >= INDEL_FRAC) { // substitution + double r = drand48(); + c = (c + (int)(r * 3.0 + 1)) & 3; + if (is_hap || drand48() < 0.333333) { // hom + ret[0]->s[i] = ret[1]->s[i] = SUBSTITUTE|c; + } else { // het + ret[drand48()<0.5?0:1]->s[i] = SUBSTITUTE|c; + } + } else { // indel + if (drand48() < 0.5) { // deletion + if (is_hap || drand48() < 0.333333) { // hom-del + ret[0]->s[i] = ret[1]->s[i] = DELETE; + deleting = 3; + } else { // het-del + deleting = drand48()<0.5?1:2; + ret[deleting-1]->s[i] = DELETE; + } + } else { // insertion + int num_ins = 0, ins = 0; + do { + num_ins++; + ins = (ins << 2) | (int)(drand48() * 4.0); + } while(num_ins < 4 && drand48() < INDEL_EXTEND); + + if (is_hap || drand48() < 0.333333) { // hom-ins + ret[0]->s[i] = ret[1]->s[i] = (num_ins << 12) | (ins << 4) | c; + } else { // het-ins + ret[drand48()<0.5?0:1]->s[i] = (num_ins << 12) | (ins << 4) | c; + } + } + } + } + } +} +void maq_print_mutref(const char *name, const seq_t *seq, mutseq_t *hap1, mutseq_t *hap2) +{ + int i; + for (i = 0; i != seq->l; ++i) { + int c[3]; + c[0] = nst_nt4_table[(int)seq->s[i]]; + c[1] = hap1->s[i]; c[2] = hap2->s[i]; + if (c[0] >= 4) continue; + if ((c[1] & mutmsk) != NOCHANGE || (c[1] & mutmsk) != NOCHANGE) { + printf("%s\t%d\t", name, i+1); + if (c[1] == c[2]) { // hom + if ((c[1]&mutmsk) == SUBSTITUTE) { // substitution + printf("%c\t%c\t-\n", "ACGTN"[c[0]], "ACGTN"[c[1]&0xf]); + } else if ((c[1]&mutmsk) == DELETE) { // del + printf("%c\t-\t-\n", "ACGTN"[c[0]]); + } else if (((c[1] & mutmsk) >> 12) <= 5) { // ins + printf("-\t"); + int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4; + while(n > 0) { + putchar("ACGTN"[ins & 0x3]); + n--; + } + printf("\t-\n"); + } else assert(0); + } else { // het + if ((c[1]&mutmsk) == SUBSTITUTE || (c[2]&mutmsk) == SUBSTITUTE) { // substitution + printf("%c\t%c\t+\n", "ACGTN"[c[0]], "XACMGRSVTWYHKDBN"[1<<(c[1]&0x3)|1<<(c[2]&0x3)]); + } else if ((c[1]&mutmsk) == DELETE) { + printf("%c\t-\t+\n", "ACGTN"[c[0]]); + } else if ((c[2]&mutmsk) == DELETE) { + printf("%c\t-\t+\n", "ACGTN"[c[0]]); + } else if (((c[1] & mutmsk) >> 12) <= 4) { // ins1 + printf("-\t"); + int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4; + while (n > 0) { + putchar("ACGTN"[ins & 0x3]); + n--; + } + printf("\t+\n"); + } else if (((c[2] & mutmsk) >> 12) <= 5) { // ins2 + printf("-\t"); + int n = (c[2]&mutmsk) >> 12, ins = c[2] >> 4; + while (n > 0) { + putchar("ACGTN"[ins & 0x3]); + ins >>= 2; + n--; + } + printf("\t+\n"); + } else assert(0); + } + } + } +} + +void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N, int dist, int std_dev, int size_l, int size_r) +{ + seq_t seq; + mutseq_t rseq[2]; + uint64_t tot_len, ii; + int i, l, n_ref, k; + char name[256], *qstr, *str; + int size[2], Q; + uint8_t *tmp_seq[2]; + mut_t *target; + + INIT_SEQ(seq); + srand48(time(0)); + seq_set_block_size(0x1000000); + l = size_l > size_r? size_l : size_r; + qstr = (char*)calloc(l+1, 1); + str = (char*)calloc(l+1, 1); + tmp_seq[0] = (uint8_t*)calloc(l+1, 1); + tmp_seq[1] = (uint8_t*)calloc(l+1, 1); + size[0] = size_l; size[1] = size_r; + + Q = (int)(-10.0 * log(ERR_RATE) / log(10.0) + 0.499) + 33; + + tot_len = n_ref = 0; + while ((l = seq_read_fasta(fp_fa, &seq, name, 0)) >= 0) { + tot_len += l; + ++n_ref; + } + fprintf(stderr, "-- %d sequences, total length: %llu\n", n_ref, (long long)tot_len); + rewind(fp_fa); + + while ((l = seq_read_fasta(fp_fa, &seq, name, 0)) >= 0) { + uint64_t n_pairs = (uint64_t)((long double)l / tot_len * N + 0.5); + if (l < dist + 3 * std_dev) { + fprintf(stderr, "[wgsim_core] Skip sequence '%s' as it is shorter than %d!\n", name, dist + 3 * std_dev); + continue; + } + maq_mut_diref(&seq, is_hap, rseq, rseq+1); + maq_print_mutref(name, &seq, rseq, rseq+1); + for (ii = 0; ii != n_pairs; ++ii) { + double ran; + int d, pos, s[2], begin, end, is_flip = 0; + FILE *fpo[2]; + do { + ran = ran_normal(); + ran = ran * std_dev + dist; + d = (int)(ran + 0.5); + pos = (int)((l - d + 1) * drand48()); + } while (pos < 0 || pos >= seq.l || pos + d - 1 >= seq.l); + // flip or not + if (drand48() < 0.5) { + fpo[0] = fpout1; fpo[1] = fpout2; + s[0] = size[0]; s[1] = size[1]; + } else { + fpo[1] = fpout1; fpo[0] = fpout2; + s[1] = size[0]; s[0] = size[1]; + is_flip = 1; + } + // generate the read sequences + target = rseq[drand48()<0.5?0:1].s; // haploid from which the reads are generated + for (i = pos, k = 0, begin = 0; i < seq.l && k < s[0]; ++i) { + int c = target[i]; + int mut_type = c & mutmsk; + if (mut_type == DELETE) continue; // deletion + if (begin == 0) { + begin = i; + if (mut_type != NOCHANGE && mut_type != SUBSTITUTE) mut_type = NOCHANGE; // skip ins at the first base + } + if(mut_type == NOCHANGE || mut_type == SUBSTITUTE) { + tmp_seq[0][k++] = c&0xf; + continue; + } + int n = mut_type >> 12, ins = c >> 4; + while (n > 0) { + tmp_seq[0][k++] = ins & 0x3; + ins >>= 2; + n--; + if(k == s[0]) break; + } + tmp_seq[0][k++] = c&0xf; + } + for (i = pos + d - 1, k = 0, end = 0; i >= 0 && k < s[1]; --i) { + int c = target[i]; + if ((c&mutmsk) == DELETE) continue; // deletion + if (end == 0) end = i; + tmp_seq[1][k++] = c&0xf; + if((c&mutmsk) == NOCHANGE || (c&mutmsk) == SUBSTITUTE) continue; + int n = (c&mutmsk) >> 12, ins = c >> 4; + while (n > 0) { + if (k == s[1]) break; + tmp_seq[1][k++] = ins & 0x3; + ins >>= 2; + n--; + } + } + // start to print out + if (!is_flip) sprintf(str, "%s_%u_%u_%llx", name, begin+1, end+1, (long long)ii); + else sprintf(str, "%s_%u_%u_%llx", name, begin+1, end+1, (long long)ii); + // print forward read + fprintf(fpo[0], "@%s/%d\n", str, is_flip+1); + for (i = 0; i < s[0]; ++i) { + int c = tmp_seq[0][i]; + if (c > 4) c = 4; + if (c < 4) { + if (drand48() < ERR_RATE) + c = (c + (int)(drand48() * 3.0 + 1)) & 3; + } + fputc("ACGTN"[c], fpo[0]); + } + for (i = 0; i < s[0]; ++i) qstr[i] = Q; + qstr[s[0]] = 0; + fprintf(fpo[0], "\n+\n%s\n", qstr); + // print reverse read + fprintf(fpo[1], "@%s/%d\n", str, 2-is_flip); + for (i = 0; i < s[1]; ++i) { + int c = tmp_seq[1][i]; + if (c > 4) c = 4; + if (c < 4) { + c = 3 - c; // complement + if (drand48() < ERR_RATE) + c = (c + (int)(drand48() * 3.0 + 1)) & 3; + } + fputc("ACGTN"[c], fpo[1]); + } + for (i = 0; i < s[1]; ++i) qstr[i] = Q; + qstr[s[1]] = 0; + fprintf(fpo[1], "\n+\n%s\n", qstr); + } + free(rseq[0].s); free(rseq[1].s); + } + free(seq.s); + free(qstr); free(str); + free(tmp_seq[0]); free(tmp_seq[1]); +} + +static int simu_usage() +{ + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: wgsim [options] \n\n"); + fprintf(stderr, "Options: -e FLOAT base error rate [%.3f]\n", ERR_RATE); + fprintf(stderr, " -d INT outer distance between the two ends [500]\n"); + fprintf(stderr, " -s INT standard deviation [50]\n"); + fprintf(stderr, " -N INT number of read pairs [1000000]\n"); + fprintf(stderr, " -1 INT length of the first read [70]\n"); + fprintf(stderr, " -2 INT length of the second read [70]\n"); + fprintf(stderr, " -r FLOAT rate of mutations [%.4f]\n", MUT_RATE); + fprintf(stderr, " -R FLOAT fraction of indels [%.2f]\n", INDEL_FRAC); + fprintf(stderr, " -X FLOAT probability an indel is extended [%.2f]\n", INDEL_EXTEND); + fprintf(stderr, " -h haploid mode\n"); + fprintf(stderr, "\n"); + return 1; +} + +int main(int argc, char *argv[]) +{ + int64_t N; + int dist, std_dev, c, size_l, size_r, is_hap = 0; + FILE *fpout1, *fpout2, *fp_fa; + + N = 1000000; dist = 500; std_dev = 50; + size_l = size_r = 70; + while ((c = getopt(argc, argv, "e:d:s:N:1:2:r:R:hX:")) >= 0) { + switch (c) { + case 'd': dist = atoi(optarg); break; + case 's': std_dev = atoi(optarg); break; + case 'N': N = atoi(optarg); break; + case '1': size_l = atoi(optarg); break; + case '2': size_r = atoi(optarg); break; + case 'e': ERR_RATE = atof(optarg); break; + case 'r': MUT_RATE = atof(optarg); break; + case 'R': INDEL_FRAC = atof(optarg); break; + case 'X': INDEL_EXTEND = atof(optarg); break; + case 'h': is_hap = 1; break; + } + } + if (argc - optind < 3) return simu_usage(); + fp_fa = (strcmp(argv[optind+0], "-") == 0)? stdin : xopen(argv[optind+0], "r"); + fpout1 = xopen(argv[optind+1], "w"); + fpout2 = xopen(argv[optind+2], "w"); + wgsim_core(fpout1, fpout2, fp_fa, is_hap, N, dist, std_dev, size_l, size_r); + + fclose(fpout1); fclose(fpout2); fclose(fp_fa); + return 0; +} diff --git a/misc/wgsim_eval.pl b/misc/wgsim_eval.pl new file mode 100755 index 0000000..2f3499b --- /dev/null +++ b/misc/wgsim_eval.pl @@ -0,0 +1,60 @@ +#!/usr/bin/perl -w + +# Contact: lh3 +# Version: 0.1.1 + +use strict; +use warnings; +use Getopt::Std; + +&wgsim_eval; +exit; + +sub wgsim_eval { + my %opts; + getopts('p', \%opts); + die("Usage: wgsim_eval.pl [-p] \n") if (@ARGV == 0 && -t STDIN); + my (@c0, @c1); + my $max_q = 0; + while (<>) { + my @t = split; + my $line = $_; + my ($q, $is_correct, $chr, $left, $rght) = (int($t[4]/10), 1, $t[2], $t[3], $t[3]); + $max_q = $q if ($q > $max_q); + # right coordinate + $_ = $t[5]; s/(\d+)[MDN]/$rght+=$1,'x'/eg; + --$rght; + # correct for soft clipping + $left -= $1 if (/^(\d+)S/); + $rght += $1 if (/(\d+)S$/); + # skip unmapped reads + next if (($t[1]&0x4) || $chr eq '*'); + # parse read name and check + ++$c0[$q]; + if ($t[0] =~ /^(\S+)_(\d+)_(\d+)_[0-9a-f]+(\/[12])?$/) { + if ($1 ne $chr) { # different chr + $is_correct = 0; + } elsif ($2 < $3) { # should always be true if reads are generated from wgsim + if ($t[1] & 0x10) { # reverse + $is_correct = 0 if (abs($3 - $rght) > 5); # in case of indels that are close to the end of a reads + } else { + $is_correct = 0 if (abs($2 - $left) > 5); + } + } else { + die("[wgsim_eval] bug in wgsim?\n"); + } + } else { + die("[wgsim_eval] read '$t[0]' was not generated by wgsim?\n"); + } + ++$c1[$q] unless ($is_correct); + print $line if (defined($opts{p}) && !$is_correct); + } + # print + my ($cc0, $cc1) = (0, 0); + for (my $i = $max_q; $i >= 0; --$i) { + $c0[$i] = 0 unless (defined $c0[$i]); + $c1[$i] = 0 unless (defined $c1[$i]); + $cc0 += $c0[$i]; $cc1 += $c1[$i]; + printf("%.2dx %12d / %-12d %12d %.3e\n", $i, $c1[$i], $c0[$i], $cc0, $cc1/$cc0); + } +} -- 2.39.2