X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=phase.c;h=ef4eff952bed637e4e9f89ad1b8c10619869c59f;hb=70c740facc966321754c6bfcc6d61ea056480638;hp=7f069584b9fe4d1cee05514ae7286a08cfd8a2db;hpb=7251efa37992752a8cf62ff363da0ad5099937bd;p=samtools.git diff --git a/phase.c b/phase.c index 7f06958..ef4eff9 100644 --- a/phase.c +++ b/phase.c @@ -17,6 +17,7 @@ KSTREAM_INIT(gzFile, gzread, 16384) #define FLAG_FIX_CHIMERA 0x1 #define FLAG_LIST_EXCL 0x4 +#define FLAG_DROP_AMBI 0x8 typedef struct { // configurations, initialized in the main function @@ -34,7 +35,8 @@ typedef struct { typedef struct { int8_t seq[MAX_VARS]; // TODO: change to dynamic memory allocation! int vpos, beg, end; - uint32_t vlen:16, single:1, flip:1, phase:1, phased:1; + uint32_t vlen:16, single:1, flip:1, phase:1, phased:1, ambig:1; + uint32_t in:16, out:16; // in-phase and out-phase } frag_t, *frag_p; #define rseq_lt(a,b) ((a)->vpos < (b)->vpos) @@ -174,7 +176,9 @@ static uint64_t *fragphase(int vpos, const int8_t *path, nseq_t *hash, int flip) ++c[f->seq[i] == path[f->vpos + i] + 1? 0 : 1]; } f->phase = c[0] > c[1]? 0 : 1; - f->phased = c[0] == c[1]? 0 : 1; + f->in = c[f->phase]; f->out = c[1 - f->phase]; + f->phased = f->in == f->out? 0 : 1; + f->ambig = (f->in && f->out && f->out < 3 && f->in <= f->out + 1)? 1 : 0; // fix chimera f->flip = 0; if (flip && c[0] >= 3 && c[1] >= 3) { @@ -305,7 +309,8 @@ static int clean_seqs(int vpos, nseq_t *hash) static void dump_aln(phaseg_t *g, int min_pos, const nseq_t *hash) { - int i, is_flip; + int i, is_flip, drop_ambi; + drop_ambi = g->flag & FLAG_DROP_AMBI; is_flip = (drand48() < 0.5); for (i = 0; i < g->n; ++i) { int end, which; @@ -319,7 +324,8 @@ static void dump_aln(phaseg_t *g, int min_pos, const nseq_t *hash) if (k == kh_end(hash)) which = 3; else { frag_t *f = &kh_val(hash, k); - if (f->phased && f->flip) which = 2; + if (f->ambig) which = drop_ambi? 2 : 3; + else if (f->phased && f->flip) which = 2; else if (f->phased == 0) which = 3; else { // phased and not flipped char c = 'Y'; @@ -395,7 +401,7 @@ static int phase(phaseg_t *g, const char *chr, int vpos, uint64_t *cns, nseq_t * int8_t c[2]; c[0] = (cns[i]&0xffff)>>2 == 0? 4 : (cns[i]&3); c[1] = (cns[i]>>16&0xffff)>>2 == 0? 4 : (cns[i]>>16&3); - printf("M%d\t%s\t%d\t%d\t%c\t%c\t%d\t%d\t%d\t%d\t%d\n", sitemask[i]+1, chr, (int)(cns[0]>>32), (int)(cns[i]>>32) + 1, "ACGTX"[c[path[i]]], "ACGTX"[c[1-path[i]]], + printf("M%d\t%s\t%d\t%d\t%c\t%c\t%d\t%d\t%d\t%d\t%d\n", sitemask[i]+1, chr, (int)(cns[0]>>32) + 1, (int)(cns[i]>>32) + 1, "ACGTX"[c[path[i]]], "ACGTX"[c[1-path[i]]], i + g->vpos_shift + 1, (int)(x&0xffff), (int)(x>>16&0xffff), (int)(x>>32&0xffff), (int)(x>>48&0xffff)); } free(path); free(pcnt); free(regmask); free(sitemask); @@ -413,7 +419,7 @@ static int phase(phaseg_t *g, const char *chr, int vpos, uint64_t *cns, nseq_t * if (f->seq[j] == 0) putchar('N'); else putchar("ACGT"[f->seq[j] == 1? (c&3) : (c>>16&3)]); } - printf("\t*\tYP:i:%d\tYF:i:%d\n", f->phase, f->flip); + printf("\t*\tYP:i:%d\tYF:i:%d\tYI:i:%d\tYO:i:%d\tYS:i:%d\n", f->phase, f->flip, f->in, f->out, f->beg+1); } free(seqs); printf("//\n"); @@ -445,6 +451,7 @@ static int readaln(void *data, bam1_t *b) phaseg_t *g = (phaseg_t*)data; int ret; ret = bam_read1(g->fp, b); + if (ret < 0) return ret; if (!(b->core.flag & (BAM_FUNMAP|BAM_FSECONDARY|BAM_FQCFAIL|BAM_FDUP)) && g->pre) { if (g->n == g->m) { g->m = g->m? g->m<<1 : 16; @@ -515,8 +522,8 @@ int main_phase(int argc, char *argv[]) memset(&g, 0, sizeof(phaseg_t)); g.flag = FLAG_FIX_CHIMERA; - g.min_varLOD = 40; g.k = 13; g.min_baseQ = 13; g.max_depth = 256; - while ((c = getopt(argc, argv, "Q:eFq:k:b:l:D:")) >= 0) { + g.min_varLOD = 37; g.k = 13; g.min_baseQ = 13; g.max_depth = 256; + while ((c = getopt(argc, argv, "Q:eFq:k:b:l:D:A:")) >= 0) { switch (c) { case 'D': g.max_depth = atoi(optarg); break; case 'q': g.min_varLOD = atoi(optarg); break; @@ -524,6 +531,7 @@ int main_phase(int argc, char *argv[]) case 'k': g.k = atoi(optarg); break; case 'F': g.flag &= ~FLAG_FIX_CHIMERA; break; case 'e': g.flag |= FLAG_LIST_EXCL; break; + case 'A': g.flag |= FLAG_DROP_AMBI; break; case 'b': g.pre = strdup(optarg); break; case 'l': fn_list = strdup(optarg); break; } @@ -538,6 +546,7 @@ int main_phase(int argc, char *argv[]) fprintf(stderr, " -D INT max read depth [%d]\n", g.max_depth); // fprintf(stderr, " -l FILE list of sites to phase [null]\n"); fprintf(stderr, " -F do not attempt to fix chimeras\n"); + fprintf(stderr, " -A drop reads with ambiguous phase\n"); // fprintf(stderr, " -e do not discover SNPs (effective with -l)\n"); fprintf(stderr, "\n"); return 1; @@ -649,7 +658,7 @@ int main_phase(int argc, char *argv[]) memset(f->seq, 0, MAX_VARS); f->beg = p->b->core.pos; f->end = bam_calend(&p->b->core, bam1_cigar(p->b)); - f->vpos = vpos, f->vlen = 1, f->seq[0] = c, f->single = f->phased = f->flip = 0; + f->vpos = vpos, f->vlen = 1, f->seq[0] = c, f->single = f->phased = f->flip = f->ambig = 0; } } if (dophase) {