]> git.donarmstrong.com Git - samtools.git/blobdiff - misc/wgsim.c
* samtools-0.1.7-6
[samtools.git] / misc / wgsim.c
index b7e7b2fe3751170944d3f75fccd5a6e213837333..7b5f095d910b06f3242419066ac8879b7cdb7454 100644 (file)
@@ -39,7 +39,7 @@
 #include <ctype.h>
 #include <string.h>
 
-#define PACKAGE_VERSION "0.2.0"
+#define PACKAGE_VERSION "0.2.3"
 
 const uint8_t nst_nt4_table[256] = {
        4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
@@ -173,6 +173,7 @@ static double MUT_RATE = 0.001;
 static double INDEL_FRAC = 0.1;
 static double INDEL_EXTEND = 0.3;
 static int IS_SOLID = 0;
+static int SHOW_MM_INFO = 1;
 
 void maq_mut_diref(const seq_t *seq, int is_hap, mutseq_t *hap1, mutseq_t *hap2)
 {
@@ -237,7 +238,7 @@ void maq_print_mutref(const char *name, const seq_t *seq, mutseq_t *hap1, mutseq
                c[0] = nst_nt4_table[(int)seq->s[i]];
                c[1] = hap1->s[i]; c[2] = hap2->s[i];
                if (c[0] >= 4) continue;
-               if ((c[1] & mutmsk) != NOCHANGE || (c[1] & mutmsk) != NOCHANGE) {
+               if ((c[1] & mutmsk) != NOCHANGE || (c[2] & mutmsk) != NOCHANGE) {
                        printf("%s\t%d\t", name, i+1);
                        if (c[1] == c[2]) { // hom
                                if ((c[1]&mutmsk) == SUBSTITUTE) { // substitution
@@ -303,7 +304,7 @@ void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N,
        tmp_seq[1] = (uint8_t*)calloc(l+2, 1);
        size[0] = size_l; size[1] = size_r;
 
-       Q = (int)(-10.0 * log(ERR_RATE) / log(10.0) + 0.499) + 33;
+       Q = (ERR_RATE == 0.0)? 'I' : (int)(-10.0 * log(ERR_RATE) / log(10.0) + 0.499) + 33;
 
        tot_len = n_ref = 0;
        while ((l = seq_read_fasta(fp_fa, &seq, name, 0)) >= 0) {
@@ -352,14 +353,11 @@ void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N,
                        n_sub[0] = n_sub[1] = n_indel[0] = n_indel[1] = n_err[0] = n_err[1] = 0;
 
 #define __gen_read(x, start, iter) do {                                                                        \
-                               for (i = (start), k = 0, ext_coor[x] = -1; i >= 0 && i < seq.l && k < s[x]; iter) {     \
+                               for (i = (start), k = 0, ext_coor[x] = -10; i >= 0 && i < seq.l && k < s[x]; iter) {    \
                                        int c = target[i], mut_type = c & mutmsk;                       \
                                        if (ext_coor[x] < 0) {                                                          \
+                                               if (mut_type != NOCHANGE && mut_type != SUBSTITUTE) continue; \
                                                ext_coor[x] = i;                                                                \
-                                               if (mut_type != NOCHANGE && mut_type != SUBSTITUTE) { \
-                                                       ext_coor[x] = -1;                                                       \
-                                                       break;                                                                          \
-                                               }                                                                                               \
                                        }                                                                                                       \
                                        if (mut_type == DELETE) ++n_indel[x];                           \
                                        else if (mut_type == NOCHANGE || mut_type == SUBSTITUTE) { \
@@ -368,11 +366,12 @@ void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N,
                                        } else {                                                                                        \
                                                int n, ins;                                                                             \
                                                ++n_indel[x];                                                                   \
+                                               tmp_seq[x][k++] = c & 0xf;                                              \
                                                for (n = mut_type>>12, ins = c>>4; n > 0 && k < s[x]; --n, ins >>= 2) \
                                                        tmp_seq[x][k++] = ins & 0x3;                            \
                                        }                                                                                                       \
                                }                                                                                                               \
-                               if (k != s[x]) ext_coor[x] = -1                                               \
+                               if (k != s[x]) ext_coor[x] = -10;                                               \
                        } while (0)
 
                        if (!IS_SOLID) {
@@ -388,6 +387,7 @@ void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N,
                                } else { // FF pair
                                        __gen_read(0, pos, ++i);
                                        __gen_read(1, pos + d - 1 - s[1], ++i);
+                                       ++ext_coor[0]; ++ext_coor[1];
                                }
                                // change to color sequence: (0,1,2,3) -> (A,C,G,T)
                                for (j = 0; j < 2; ++j) {
@@ -423,8 +423,15 @@ void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N,
                        for (j = 0; j < 2; ++j) {
                                for (i = 0; i < s[j]; ++i) qstr[i] = Q;
                                qstr[i] = 0;
-                               fprintf(fpo[j], "@%s_%u_%u_%d:%d:%d_%llx/%d\n", name, ext_coor[0]+1, ext_coor[1]+1,
-                                               n_err[j], n_sub[j], n_indel[j], (long long)ii, j==0? is_flip+1 : 2-is_flip);
+                               if (SHOW_MM_INFO) {
+                                       fprintf(fpo[j], "@%s_%u_%u_%d:%d:%d_%d:%d:%d_%llx/%d\n", name, ext_coor[0]+1, ext_coor[1]+1,
+                                                       n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1],
+                                                       (long long)ii, j==0? is_flip+1 : 2-is_flip);
+                               } else {
+                                       fprintf(fpo[j], "@%s_%u_%u_%llx/%d %d:%d:%d_%d:%d:%d\n", name, ext_coor[0]+1, ext_coor[1]+1,
+                                                       (long long)ii, j==0? is_flip+1 : 2-is_flip,
+                                                       n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1]);
+                               }
                                for (i = 0; i < s[j]; ++i)
                                        fputc("ACGTN"[(int)tmp_seq[j][i]], fpo[j]);
                                fprintf(fpo[j], "\n+\n%s\n", qstr);
@@ -453,8 +460,10 @@ static int simu_usage()
        fprintf(stderr, "         -R FLOAT      fraction of indels [%.2f]\n", INDEL_FRAC);
        fprintf(stderr, "         -X FLOAT      probability an indel is extended [%.2f]\n", INDEL_EXTEND);
        fprintf(stderr, "         -c            generate reads in color space (SOLiD reads)\n");
-       fprintf(stderr, "         -h            haploid mode\n");
+       fprintf(stderr, "         -C            show mismatch info in comment rather than read name\n");
+       fprintf(stderr, "         -h            haplotype mode\n");
        fprintf(stderr, "\n");
+       fprintf(stderr, "Note: For SOLiD reads, the first read is F3 and the second is R3.\n\n");
        return 1;
 }
 
@@ -466,7 +475,7 @@ int main(int argc, char *argv[])
 
        N = 1000000; dist = 500; std_dev = 50;
        size_l = size_r = 70;
-       while ((c = getopt(argc, argv, "e:d:s:N:1:2:r:R:hX:c")) >= 0) {
+       while ((c = getopt(argc, argv, "e:d:s:N:1:2:r:R:hX:cC")) >= 0) {
                switch (c) {
                case 'd': dist = atoi(optarg); break;
                case 's': std_dev = atoi(optarg); break;
@@ -478,6 +487,7 @@ int main(int argc, char *argv[])
                case 'R': INDEL_FRAC = atof(optarg); break;
                case 'X': INDEL_EXTEND = atof(optarg); break;
                case 'c': IS_SOLID = 1; break;
+               case 'C': SHOW_MM_INFO = 0; break;
                case 'h': is_hap = 1; break;
                }
        }