X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;ds=sidebyside;f=misc%2Fwgsim.c;h=7b5f095d910b06f3242419066ac8879b7cdb7454;hb=6ad0dc55ff3abf908a99935321ff8fa5efed4d88;hp=1ae7572a6a6aff77b04acfea9f6fd194f3ee6f3e;hpb=115b2e9f98151f208207aae6286acd12adf019de;p=samtools.git diff --git a/misc/wgsim.c b/misc/wgsim.c index 1ae7572..7b5f095 100644 --- a/misc/wgsim.c +++ b/misc/wgsim.c @@ -39,7 +39,7 @@ #include #include -#define PACKAGE_VERSION "0.2.0" +#define PACKAGE_VERSION "0.2.3" const uint8_t nst_nt4_table[256] = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, @@ -173,6 +173,7 @@ static double MUT_RATE = 0.001; static double INDEL_FRAC = 0.1; static double INDEL_EXTEND = 0.3; static int IS_SOLID = 0; +static int SHOW_MM_INFO = 1; void maq_mut_diref(const seq_t *seq, int is_hap, mutseq_t *hap1, mutseq_t *hap2) { @@ -237,7 +238,7 @@ void maq_print_mutref(const char *name, const seq_t *seq, mutseq_t *hap1, mutseq c[0] = nst_nt4_table[(int)seq->s[i]]; c[1] = hap1->s[i]; c[2] = hap2->s[i]; if (c[0] >= 4) continue; - if ((c[1] & mutmsk) != NOCHANGE || (c[1] & mutmsk) != NOCHANGE) { + if ((c[1] & mutmsk) != NOCHANGE || (c[2] & mutmsk) != NOCHANGE) { printf("%s\t%d\t", name, i+1); if (c[1] == c[2]) { // hom if ((c[1]&mutmsk) == SUBSTITUTE) { // substitution @@ -303,7 +304,7 @@ void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N, tmp_seq[1] = (uint8_t*)calloc(l+2, 1); size[0] = size_l; size[1] = size_r; - Q = (int)(-10.0 * log(ERR_RATE) / log(10.0) + 0.499) + 33; + Q = (ERR_RATE == 0.0)? 'I' : (int)(-10.0 * log(ERR_RATE) / log(10.0) + 0.499) + 33; tot_len = n_ref = 0; while ((l = seq_read_fasta(fp_fa, &seq, name, 0)) >= 0) { @@ -352,14 +353,11 @@ void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N, n_sub[0] = n_sub[1] = n_indel[0] = n_indel[1] = n_err[0] = n_err[1] = 0; #define __gen_read(x, start, iter) do { \ - for (i = (start), k = 0, ext_coor[x] = -1; i >= 0 && i < seq.l && k < s[x]; iter) { \ + for (i = (start), k = 0, ext_coor[x] = -10; i >= 0 && i < seq.l && k < s[x]; iter) { \ int c = target[i], mut_type = c & mutmsk; \ if (ext_coor[x] < 0) { \ + if (mut_type != NOCHANGE && mut_type != SUBSTITUTE) continue; \ ext_coor[x] = i; \ - if (mut_type != NOCHANGE && mut_type != SUBSTITUTE) { \ - ext_coor[x] = -1; \ - break; \ - } \ } \ if (mut_type == DELETE) ++n_indel[x]; \ else if (mut_type == NOCHANGE || mut_type == SUBSTITUTE) { \ @@ -368,11 +366,12 @@ void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N, } else { \ int n, ins; \ ++n_indel[x]; \ + tmp_seq[x][k++] = c & 0xf; \ for (n = mut_type>>12, ins = c>>4; n > 0 && k < s[x]; --n, ins >>= 2) \ tmp_seq[x][k++] = ins & 0x3; \ } \ } \ - if (k != s[x]) ext_coor[x] = -1; \ + if (k != s[x]) ext_coor[x] = -10; \ } while (0) if (!IS_SOLID) { @@ -388,6 +387,7 @@ void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N, } else { // FF pair __gen_read(0, pos, ++i); __gen_read(1, pos + d - 1 - s[1], ++i); + ++ext_coor[0]; ++ext_coor[1]; } // change to color sequence: (0,1,2,3) -> (A,C,G,T) for (j = 0; j < 2; ++j) { @@ -423,8 +423,15 @@ void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N, for (j = 0; j < 2; ++j) { for (i = 0; i < s[j]; ++i) qstr[i] = Q; qstr[i] = 0; - fprintf(fpo[j], "@%s_%u_%u_%d:%d:%d_%llx/%d\n", name, ext_coor[0]+1, ext_coor[1]+1, - n_err[j], n_sub[j], n_indel[j], (long long)ii, j==0? is_flip+1 : 2-is_flip); + if (SHOW_MM_INFO) { + fprintf(fpo[j], "@%s_%u_%u_%d:%d:%d_%d:%d:%d_%llx/%d\n", name, ext_coor[0]+1, ext_coor[1]+1, + n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1], + (long long)ii, j==0? is_flip+1 : 2-is_flip); + } else { + fprintf(fpo[j], "@%s_%u_%u_%llx/%d %d:%d:%d_%d:%d:%d\n", name, ext_coor[0]+1, ext_coor[1]+1, + (long long)ii, j==0? is_flip+1 : 2-is_flip, + n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1]); + } for (i = 0; i < s[j]; ++i) fputc("ACGTN"[(int)tmp_seq[j][i]], fpo[j]); fprintf(fpo[j], "\n+\n%s\n", qstr); @@ -453,6 +460,7 @@ static int simu_usage() fprintf(stderr, " -R FLOAT fraction of indels [%.2f]\n", INDEL_FRAC); fprintf(stderr, " -X FLOAT probability an indel is extended [%.2f]\n", INDEL_EXTEND); fprintf(stderr, " -c generate reads in color space (SOLiD reads)\n"); + fprintf(stderr, " -C show mismatch info in comment rather than read name\n"); fprintf(stderr, " -h haplotype mode\n"); fprintf(stderr, "\n"); fprintf(stderr, "Note: For SOLiD reads, the first read is F3 and the second is R3.\n\n"); @@ -467,7 +475,7 @@ int main(int argc, char *argv[]) N = 1000000; dist = 500; std_dev = 50; size_l = size_r = 70; - while ((c = getopt(argc, argv, "e:d:s:N:1:2:r:R:hX:c")) >= 0) { + while ((c = getopt(argc, argv, "e:d:s:N:1:2:r:R:hX:cC")) >= 0) { switch (c) { case 'd': dist = atoi(optarg); break; case 's': std_dev = atoi(optarg); break; @@ -479,6 +487,7 @@ int main(int argc, char *argv[]) case 'R': INDEL_FRAC = atof(optarg); break; case 'X': INDEL_EXTEND = atof(optarg); break; case 'c': IS_SOLID = 1; break; + case 'C': SHOW_MM_INFO = 0; break; case 'h': is_hap = 1; break; } }