// collect read positions for ReadPosBias
int len, pos = get_position(p, &len);
- int epos = (double)pos/len * bca->npos;
+ int epos = (double)pos/(len+1) * bca->npos;
if ( bam1_seqi(bam1_seq(p->b),p->qpos) == ref_base )
bca->ref_pos[epos]++;
else
nref += bca->ref_pos[i];
nalt += bca->alt_pos[i];
U += nref*bca->alt_pos[i];
+ bca->ref_pos[i] = 0;
+ bca->alt_pos[i] = 0;
}
+#if 0
+//todo
double var = 0, avg = (double)(nref+nalt)/bca->npos;
for (i=0; i<bca->npos; i++)
{
call->read_pos.avg = avg;
call->read_pos.var = sqrt(var/bca->npos);
call->read_pos.dp = nref+nalt;
+#endif
if ( !nref || !nalt )
{
call->read_pos_bias = -1;
if (i) kputc(',', &s);
kputw(bc->anno[i], &s);
}
- ksprintf(&s,";RPS=%d,%f,%f", bc->read_pos.dp,bc->read_pos.avg,bc->read_pos.var);
+ //ksprintf(&s,";RPS=%d,%f,%f", bc->read_pos.dp,bc->read_pos.avg,bc->read_pos.var);
ksprintf(&s,";QS=%f,%f,%f,%f", bc->qsum[0],bc->qsum[1],bc->qsum[2],bc->qsum[3]);
if (bc->vdb != -1)
ksprintf(&s, ";VDB=%e", bc->vdb);
* reduces the power because sometimes, substitutions caused by
* indels are not distinguishable from true mutations. Multiple
* sequence realignment helps to increase the power.
+ *
+ * Masks mismatches present in at least 70% of the reads with 'N'.
*/
{ // construct per-sample consensus
int L = right - left + 1, max_i, max2_i;
if ((double)(max2&0xffff) / ((max2&0xffff) + (max2>>16)) >= 0.7) max2_i = -1;
if (max_i >= 0) r[max_i] = 15;
if (max2_i >= 0) r[max2_i] = 15;
-// for (i = 0; i < right - left; ++i) fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], stderr); fputc('\n', stderr);
+ //for (i = 0; i < right - left; ++i) fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], stderr); fputc('\n', stderr);
}
free(ref0); free(cns);
}
}
}
// construct the consensus sequence
- max_ins = types[n_types - 1]; // max_ins is at least 0
+ max_ins = types[n_types - 1]; // max_ins is at least 0
if (max_ins > 0) {
- int *inscns_aux = calloc(4 * n_types * max_ins, sizeof(int));
+ int *inscns_aux = calloc(5 * n_types * max_ins, sizeof(int));
// count the number of occurrences of each base at each position for each type of insertion
for (t = 0; t < n_types; ++t) {
if (types[t] > 0) {
uint8_t *seq = bam1_seq(p->b);
for (k = 1; k <= p->indel; ++k) {
int c = bam_nt16_nt4_table[bam1_seqi(seq, p->qpos + k)];
- if (c < 4) ++inscns_aux[(t*max_ins+(k-1))*4 + c];
+ assert(c<5);
+ ++inscns_aux[(t*max_ins+(k-1))*5 + c];
}
}
}
inscns = calloc(n_types * max_ins, 1);
for (t = 0; t < n_types; ++t) {
for (j = 0; j < types[t]; ++j) {
- int max = 0, max_k = -1, *ia = &inscns_aux[(t*max_ins+j)*4];
- for (k = 0; k < 4; ++k)
+ int max = 0, max_k = -1, *ia = &inscns_aux[(t*max_ins+j)*5];
+ for (k = 0; k < 5; ++k)
if (ia[k] > max)
max = ia[k], max_k = k;
inscns[t*max_ins + j] = max? max_k : 4;
+ if ( max_k==4 ) { types[t] = 0; break; } // discard insertions which contain N's
}
}
free(inscns_aux);
#include "sam.h"
#include "faidx.h"
#include "kstring.h"
+#include "sam_header.h"
static inline int printw(int c, FILE *fp)
{
int rflag_require, rflag_filter;
int openQ, extQ, tandemQ, min_support; // for indels
double min_frac; // for indels
- char *reg, *pl_list;
+ char *reg, *pl_list, *fai_fname;
faidx_t *fai;
void *bed, *rghash;
} mplp_conf_t;
bh->l_smpl = s.l;
bh->sname = malloc(s.l);
memcpy(bh->sname, s.s, s.l);
- bh->txt = malloc(strlen(BAM_VERSION) + 64);
- bh->l_txt = 1 + sprintf(bh->txt, "##samtoolsVersion=%s\n", BAM_VERSION);
- free(s.s);
+ s.l = 0;
+ ksprintf(&s, "##samtoolsVersion=%s\n", BAM_VERSION);
+ if (conf->fai_fname) ksprintf(&s, "##reference=file://%s\n", conf->fai_fname);
+ h->dict = sam_header_parse2(h->text);
+ int nseq;
+ const char *tags[] = {"SN","LN","UR","M5",NULL};
+ char **tbl = sam_header2tbl_n(h->dict, "SQ", tags, &nseq);
+ for (i=0; i<nseq; i++)
+ {
+ ksprintf(&s, "##contig=<ID=%s", tbl[4*i]);
+ if ( tbl[4*i+1] ) ksprintf(&s, ",length=%s", tbl[4*i+1]);
+ if ( tbl[4*i+2] ) ksprintf(&s, ",URL=%s", tbl[4*i+2]);
+ if ( tbl[4*i+3] ) ksprintf(&s, ",md5=%s", tbl[4*i+3]);
+ kputs(">\n", &s);
+ }
+ if (tbl) free(tbl);
+ bh->txt = s.s;
+ bh->l_txt = 1 + s.l;
bcf_hdr_sync(bh);
bcf_hdr_write(bp, bh);
bca = bcf_call_init(-1., conf->min_baseQ);
case 'f':
mplp.fai = fai_load(optarg);
if (mplp.fai == 0) return 1;
+ mplp.fai_fname = optarg;
break;
case 'd': mplp.max_depth = atoi(optarg); break;
case 'r': mplp.reg = strdup(optarg); break;
kputs("##INFO=<ID=PR,Number=1,Type=Integer,Description=\"# permutations yielding a smaller PCHI2.\">\n", &str);
if (!strstr(str.s, "##INFO=<ID=QBD,"))
kputs("##INFO=<ID=QBD,Number=1,Type=Float,Description=\"Quality by Depth: QUAL/#reads\">\n", &str);
- if (!strstr(str.s, "##INFO=<ID=QBDNR,"))
- kputs("##INFO=<ID=QBDNR,Number=1,Type=Float,Description=\"Quality by Depth: QUAL/#nref-reads\">\n", &str);
- if (!strstr(str.s, "##INFO=<ID=RPS,"))
- kputs("##INFO=<ID=RPS,Number=3,Type=Float,Description=\"Read Position Stats: depth, average, stddev\">\n", &str);
+ //if (!strstr(str.s, "##INFO=<ID=RPS,"))
+ // kputs("##INFO=<ID=RPS,Number=3,Type=Float,Description=\"Read Position Stats: depth, average, stddev\">\n", &str);
if (!strstr(str.s, "##INFO=<ID=RPB,"))
kputs("##INFO=<ID=RPB,Number=1,Type=Float,Description=\"Read Position Bias\">\n", &str);
if (!strstr(str.s, "##INFO=<ID=MDV,"))
while ((c = getopt(argc, argv, "FN1:l:cC:eHAGvbSuP:t:p:QgLi:IMs:D:U:X:d:T:Ywm:K:")) >= 0) {
switch (c) {
case '1': vc.n1 = atoi(optarg); break;
- case 'l': vc.bed = bed_read(optarg); if (!vc.bed) fprintf(stderr,"Could not read \"%s\"\n", optarg); return 1; break;
+ case 'l': vc.bed = bed_read(optarg); if (!vc.bed) { fprintf(stderr,"Could not read \"%s\"\n", optarg); return 1; } break;
case 'D': vc.fn_dict = strdup(optarg); break;
case 'F': vc.flag |= VC_FIX_PL; break;
case 'N': vc.flag |= VC_ACGT_ONLY; break;
// Call GTs
int isample, gts=0, ac[4] = {0,0,0,0};
- int nRR = 0, nAA = 0, nRA = 0, max_dv = 0, dp_nref = 0;
+ int nRR = 0, nAA = 0, nRA = 0, max_dv = 0;
for (isample = 0; isample < b->n_smpl; isample++)
{
int ploidy = b->ploidy ? b->ploidy[isample] : 2;
if ( als && idv>=0 && (dv=((uint16_t*)b->gi[idv].data)[isample]) )
{
if ( max_dv < dv ) max_dv = dv;
- dp_nref += dp;
}
// For HWE annotation; multiple ALT alleles treated as one
if ( a.is_tested) ksprintf(&s, ";PV4=%.2g,%.2g,%.2g,%.2g", a.p[0], a.p[1], a.p[2], a.p[3]);
ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq);
ksprintf(&s, ";QBD=%e", b->qual/(a.d[0] + a.d[1] + a.d[2] + a.d[3]));
- if ( dp_nref ) ksprintf(&s, ";QBDNR=%e", b->qual/dp_nref);
if ( max_dv ) ksprintf(&s, ";MDV=%d", max_dv);
}
if ( nAA+nRA )
return out_dict;
}
+char **sam_header2tbl_n(const void *dict, const char type[2], const char *tags[], int *n)
+{
+ int nout = 0;
+ char **out = NULL;
+
+ *n = 0;
+ list_t *l = (list_t *)dict;
+ if ( !l ) return NULL;
+
+ int i, ntags = 0;
+ while ( tags[ntags] ) ntags++;
+
+ while (l)
+ {
+ HeaderLine *hline = l->data;
+ if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] )
+ {
+ l = l->next;
+ continue;
+ }
+ out = (char**) realloc(out, sizeof(char*)*(nout+1)*ntags);
+ for (i=0; i<ntags; i++)
+ {
+ HeaderTag *key = header_line_has_tag(hline, tags[i]);
+ if ( !key )
+ {
+ out[nout*ntags+i] = NULL;
+ continue;
+ }
+ out[nout*ntags+i] = key->value;
+ }
+ nout++;
+ l = l->next;
+ }
+ *n = nout;
+ return out;
+}
void *sam_header2key_val(void *iter, const char type[2], const char key_tag[2], const char value_tag[2], const char **key, const char **value);
char **sam_header2list(const void *_dict, char type[2], char key_tag[2], int *_n);
+ /*
+ // Usage example
+ int i, j, n;
+ const char *tags[] = {"SN","LN","UR","M5",NULL};
+ void *dict = sam_header_parse2(bam->header->text);
+ char **tbl = sam_header2tbl_n(h->dict, "SQ", tags, &n);
+ for (i=0; i<n; i++)
+ {
+ for (j=0; j<4; j++)
+ if ( tbl[4*i+j] ) printf("\t%s", tbl[4*i+j]);
+ else printf("-");
+ printf("\n");
+ }
+ if (tbl) free(tbl);
+ */
+ char **sam_header2tbl_n(const void *dict, const char type[2], const char *tags[], int *n);
+
void *sam_header2tbl(const void *dict, char type[2], char key_tag[2], char value_tag[2]);
const char *sam_tbl_get(void *h, const char *key);
int sam_tbl_size(void *h);