From 3d8c0bd16da8a830aac83db0b9ac404acba0a502 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 9 Mar 2012 02:39:19 +0000 Subject: [PATCH] Two auxilary Lua scripts --- misc/r2plot.lua | 83 ++++++ misc/vcfutils.lua | 694 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 777 insertions(+) create mode 100755 misc/r2plot.lua create mode 100755 misc/vcfutils.lua diff --git a/misc/r2plot.lua b/misc/r2plot.lua new file mode 100755 index 0000000..0a1b9f1 --- /dev/null +++ b/misc/r2plot.lua @@ -0,0 +1,83 @@ +#!/usr/bin/env luajit + +function string:split(sep, n) + local a, start = {}, 1; + sep = sep or "%s+"; + repeat + local b, e = self:find(sep, start); + if b == nil then + table.insert(a, self:sub(start)); + break + end + a[#a+1] = self:sub(start, b - 1); + start = e + 1; + if n and #a == n then + table.insert(a, self:sub(start)); + break + end + until start > #self; + return a; +end + +function io.xopen(fn, mode) + mode = mode or 'r'; + if fn == nil then return io.stdin; + elseif fn == '-' then return (mode == 'r' and io.stdin) or io.stdout; + elseif fn:sub(-3) == '.gz' then return (mode == 'r' and io.popen('gzip -dc ' .. fn, 'r')) or io.popen('gzip > ' .. fn, 'w'); + elseif fn:sub(-4) == '.bz2' then return (mode == 'r' and io.popen('bzip2 -dc ' .. fn, 'r')) or io.popen('bgzip2 > ' .. fn, 'w'); + else return io.open(fn, mode) end +end + +local eps = {}; + +function eps.func(fp) + fp = fp or io.stdout + fp:write("/C { dup 255 and 255 div exch dup -8 bitshift 255 and 255 div 3 1 roll -16 bitshift 255 and 255 div 3 1 roll setrgbcolor } bind def\n") + fp:write("/L { 4 2 roll moveto lineto } bind def\n") + fp:write("/LX { dup 4 -1 roll exch moveto lineto } bind def\n") + fp:write("/LY { dup 4 -1 roll moveto exch lineto } bind def\n") + fp:write("/LS { 3 1 roll moveto show } bind def\n") + fp:write("/RS { dup stringwidth pop 4 -1 roll exch sub 3 -1 roll moveto show } bind def\n") + fp:write("/B { 4 copy 3 1 roll exch 6 2 roll 8 -2 roll moveto lineto lineto lineto closepath } bind def\n") +end + +function eps.font(ft, size, fp) + fp = fp or io.stdout + fp:write(string.format('/FS %d def\n', size)); + fp:write('/FS4 FS 4 div def\n'); + fp:write('/' .. ft .. ' findfont FS scalefont setfont\n'); +end + +local scale = 8; + +if #arg == 0 then + print("Usage: r2plot.lua "); + os.exit(1) +end + +local fp = io.xopen(arg[1]); +local n = tonumber(fp:read()); + +print('%!PS-Adobe-3.0 EPSF-3.0'); +print('%%' .. string.format('BoundingBox: -%d -%d %.3f %.3f\n', 10*scale, scale, (n+1)*scale, (n+1)*scale)); +print(string.format('%.3f setlinewidth', scale)); +print(string.format('/plot { setgray moveto 0 %d rlineto } def', scale)); +print(string.format('/plothalf { setgray moveto 0 %.2f rlineto } def', scale/2)); +eps.func(); +eps.font('Helvetica', scale-1); + +local i = 1; +for l in fp:lines() do + local t = l:split('\t'); + print(string.format("%d %d FS4 add (%s) RS", (i-1)*scale-2, (i-1)*scale, t[1])); + for j = 2, #t do + if tonumber(t[j]) > 0.01 then + print(string.format('%.2f %.2f %.2f plot stroke', (i-1+.5)*scale, (j-2)*scale, 1.-t[j])); + end + end + i = i + 1; +end +for j = 1, 21 do + print(string.format('%.2f %.2f %.2f plothalf stroke', -8*scale, (j-1) * scale/2, 1.-(j-1)/20)); +end +print('showpage'); diff --git a/misc/vcfutils.lua b/misc/vcfutils.lua new file mode 100755 index 0000000..51d374e --- /dev/null +++ b/misc/vcfutils.lua @@ -0,0 +1,694 @@ +#!/usr/bin/env luajit + +----------------------------------- +-- BEGIN: routines from klib.lua -- +----------------------------------- + +-- Description: getopt() translated from the BSD getopt(); compatible with the default Unix getopt() +--[[ Example: + for o, a in os.getopt(arg, 'a:b') do + print(o, a) + end +]]-- +function os.getopt(args, ostr) + local arg, place = nil, 0; + return function () + if place == 0 then -- update scanning pointer + place = 1 + if #args == 0 or args[1]:sub(1, 1) ~= '-' then place = 0; return nil end + if #args[1] >= 2 then + place = place + 1 + if args[1]:sub(2, 2) == '-' then -- found "--" + table.remove(args, 1); + place = 0 + return nil; + end + end + end + local optopt = place <= #args[1] and args[1]:sub(place, place) or nil + place = place + 1; + local oli = optopt and ostr:find(optopt) or nil + if optopt == ':' or oli == nil then -- unknown option + if optopt == '-' then return nil end + if place > #args[1] then + table.remove(args, 1); + place = 0; + end + return '?'; + end + oli = oli + 1; + if ostr:sub(oli, oli) ~= ':' then -- do not need argument + arg = nil; + if place > #args[1] then + table.remove(args, 1); + place = 0; + end + else -- need an argument + if place <= #args[1] then -- no white space + arg = args[1]:sub(place); + else + table.remove(args, 1); + if #args == 0 then -- an option requiring argument is the last one + place = 0; + if ostr:sub(1, 1) == ':' then return ':' end + return '?'; + else arg = args[1] end + end + table.remove(args, 1); + place = 0; + end + return optopt, arg; + end +end + +-- Description: string split +function string:split(sep, n) + local a, start = {}, 1; + sep = sep or "%s+"; + repeat + local b, e = self:find(sep, start); + if b == nil then + table.insert(a, self:sub(start)); + break + end + a[#a+1] = self:sub(start, b - 1); + start = e + 1; + if n and #a == n then + table.insert(a, self:sub(start)); + break + end + until start > #self; + return a; +end + +-- Description: smart file open +function io.xopen(fn, mode) + mode = mode or 'r'; + if fn == nil then return io.stdin; + elseif fn == '-' then return (mode == 'r' and io.stdin) or io.stdout; + elseif fn:sub(-3) == '.gz' then return (mode == 'r' and io.popen('gzip -dc ' .. fn, 'r')) or io.popen('gzip > ' .. fn, 'w'); + elseif fn:sub(-4) == '.bz2' then return (mode == 'r' and io.popen('bzip2 -dc ' .. fn, 'r')) or io.popen('bgzip2 > ' .. fn, 'w'); + else return io.open(fn, mode) end +end + +-- Description: log gamma function +-- Required by: math.lbinom() +-- Reference: AS245, 2nd algorithm, http://lib.stat.cmu.edu/apstat/245 +function math.lgamma(z) + local x; + x = 0.1659470187408462e-06 / (z+7); + x = x + 0.9934937113930748e-05 / (z+6); + x = x - 0.1385710331296526 / (z+5); + x = x + 12.50734324009056 / (z+4); + x = x - 176.6150291498386 / (z+3); + x = x + 771.3234287757674 / (z+2); + x = x - 1259.139216722289 / (z+1); + x = x + 676.5203681218835 / z; + x = x + 0.9999999999995183; + return math.log(x) - 5.58106146679532777 - z + (z-0.5) * math.log(z+6.5); +end + +-- Description: regularized incomplete gamma function +-- Dependent on: math.lgamma() +--[[ + Formulas are taken from Wiki, with additional input from Numerical + Recipes in C (for modified Lentz's algorithm) and AS245 + (http://lib.stat.cmu.edu/apstat/245). + + A good online calculator is available at: + + http://www.danielsoper.com/statcalc/calc23.aspx + + It calculates upper incomplete gamma function, which equals + math.igamma(s,z,true)*math.exp(math.lgamma(s)) +]]-- +function math.igamma(s, z, complement) + + local function _kf_gammap(s, z) + local sum, x = 1, 1; + for k = 1, 100 do + x = x * z / (s + k); + sum = sum + x; + if x / sum < 1e-14 then break end + end + return math.exp(s * math.log(z) - z - math.lgamma(s + 1.) + math.log(sum)); + end + + local function _kf_gammaq(s, z) + local C, D, f, TINY; + f = 1. + z - s; C = f; D = 0.; TINY = 1e-290; + -- Modified Lentz's algorithm for computing continued fraction. See Numerical Recipes in C, 2nd edition, section 5.2 + for j = 1, 100 do + local d; + local a, b = j * (s - j), j*2 + 1 + z - s; + D = b + a * D; + if D < TINY then D = TINY end + C = b + a / C; + if C < TINY then C = TINY end + D = 1. / D; + d = C * D; + f = f * d; + if math.abs(d - 1) < 1e-14 then break end + end + return math.exp(s * math.log(z) - z - math.lgamma(s) - math.log(f)); + end + + if complement then + return ((z <= 1 or z < s) and 1 - _kf_gammap(s, z)) or _kf_gammaq(s, z); + else + return ((z <= 1 or z < s) and _kf_gammap(s, z)) or (1 - _kf_gammaq(s, z)); + end +end + +function math.brent(func, a, b, tol) + local gold1, gold2, tiny, max_iter = 1.6180339887, 0.3819660113, 1e-20, 100 + + local fa, fb = func(a, data), func(b, data) + if fb > fa then -- swap, such that f(a) > f(b) + a, b, fa, fb = b, a, fb, fa + end + local c = b + gold1 * (b - a) + local fc = func(c) -- golden section extrapolation + while fb > fc do + local bound = b + 100.0 * (c - b) -- the farthest point where we want to go + local r = (b - a) * (fb - fc) + local q = (b - c) * (fb - fa) + if math.abs(q - r) < tiny then -- avoid 0 denominator + tmp = q > r and tiny or 0.0 - tiny + else tmp = q - r end + u = b - ((b - c) * q - (b - a) * r) / (2.0 * tmp) -- u is the parabolic extrapolation point + if (b > u and u > c) or (b < u and u < c) then -- u lies between b and c + fu = func(u) + if fu < fc then -- (b,u,c) bracket the minimum + a, b, fa, fb = b, u, fb, fu + break + elseif fu > fb then -- (a,b,u) bracket the minimum + c, fc = u, fu + break + end + u = c + gold1 * (c - b) + fu = func(u) -- golden section extrapolation + elseif (c > u and u > bound) or (c < u and u < bound) then -- u lies between c and bound + fu = func(u) + if fu < fc then -- fb > fc > fu + b, c, u = c, u, c + gold1 * (c - b) + fb, fc, fu = fc, fu, func(u) + else -- (b,c,u) bracket the minimum + a, b, c = b, c, u + fa, fb, fc = fb, fc, fu + break + end + elseif (u > bound and bound > c) or (u < bound and bound < c) then -- u goes beyond the bound + u = bound + fu = func(u) + else -- u goes the other way around, use golden section extrapolation + u = c + gold1 * (c - b) + fu = func(u) + end + a, b, c = b, c, u + fa, fb, fc = fb, fc, fu + end + if a > c then a, c = c, a end -- swap + + -- now, afb and fb tol1 then + -- related to parabolic interpolation + local r = (b - w) * (fb - fv) + local q = (b - v) * (fb - fw) + local p = (b - v) * q - (b - w) * r + q = 2.0 * (q - r) + if q > 0.0 then p = 0.0 - p + else q = 0.0 - q end + eold, e = e, d + if math.abs(p) >= math.abs(0.5 * q * eold) or p <= q * (a - b) or p >= q * (c - b) then + e = b >= mid and a - b or c - b + d = gold2 * e + else + d, u = p / q, b + d -- actual parabolic interpolation happens here + if u - a < tol2 or c - u < tol2 then + d = mid > b and tol1 or 0.0 - tol1 + end + end + else -- golden section interpolation + e = b >= min and a - b or c - b + d = gold2 * e + end + u = fabs(d) >= tol1 and b + d or b + (d > 0.0 and tol1 or -tol1); + fu = func(u) + if fu <= fb then -- u is the minimum point so far + if u >= b then a = b + else c = b end + v, w, b = w, b, u + fv, fw, fb = fw, fb, fu + else -- adjust (a,c) and (u,v,w) + if u < b then a = u + else c = u end + if fu <= fw or w == b then + v, w = w, u + fv, fw = fw, fu + elseif fu <= fv or v == b or v == w then + v, fv = u, fu; + end + end + end + return fb, b +end + +matrix = {} + +-- Description: chi^2 test for contingency tables +-- Dependent on: math.igamma() +function matrix.chi2(a) + if #a == 2 and #a[1] == 2 then -- 2x2 table + local x, z + x = (a[1][1] + a[1][2]) * (a[2][1] + a[2][2]) * (a[1][1] + a[2][1]) * (a[1][2] + a[2][2]) + if x == 0 then return 0, 1, false end + z = a[1][1] * a[2][2] - a[1][2] * a[2][1] + z = (a[1][1] + a[1][2] + a[2][1] + a[2][2]) * z * z / x + return z, math.igamma(.5, .5 * z, true), true + else -- generic table + local rs, cs, n, m, N, z = {}, {}, #a, #a[1], 0, 0 + for i = 1, n do rs[i] = 0 end + for j = 1, m do cs[j] = 0 end + for i = 1, n do -- compute column sum and row sum + for j = 1, m do cs[j], rs[i] = cs[j] + a[i][j], rs[i] + a[i][j] end + end + for i = 1, n do N = N + rs[i] end + for i = 1, n do -- compute the chi^2 statistics + for j = 1, m do + local E = rs[i] * cs[j] / N; + z = z + (a[i][j] - E) * (a[i][j] - E) / E + end + end + return z, math.igamma(.5 * (n-1) * (m-1), .5 * z, true), true; + end +end + +--------------------------------- +-- END: routines from klib.lua -- +--------------------------------- + + +-------------------------- +-- BEGIN: misc routines -- +-------------------------- + +-- precompute an array for PL->probability conversion +-- @param m maximum PL +function algo_init_q2p(m) + local q2p = {} + for i = 0, m do + q2p[i] = math.pow(10, -i / 10) + end + return q2p +end + +-- given the haplotype frequency, compute r^2 +-- @param f 4 haplotype frequencies; f[] is 0-indexed. +-- @return r^2 +function algo_r2(f) + local p = { f[0] + f[1], f[0] + f[2] } + local D = f[0] * f[3] - f[1] * f[2] + return (p[1] == 0 or p[2] == 0 or 1-p[1] == 0 or 1-p[2] == 0) and 0 or D * D / (p[1] * p[2] * (1 - p[1]) * (1 - p[2])) +end + +-- parse a VCF line to get PL +-- @param q2p is computed by algo_init_q2p() +function text_parse_pl(t, q2p, parse_GT) + parse_GT = parse_GT == nil and true or false + local ht, gt, pl = {}, {}, {} + local s, j0 = t[9]:split(':'), 0 + for j = 1, #s do + if s[j] == 'PL' then j0 = j break end + end + local has_GT = (s[1] == 'GT' and parse_GT) and true or false + for i = 10, #t do + if j0 > 0 then + local s = t[i]:split(':') + local a, b = 1, s[j0]:find(',') + pl[#pl+1] = q2p[tonumber(s[j0]:sub(a, b - 1))] + a, b = b + 1, s[j0]:find(',', b + 1) + pl[#pl+1] = q2p[tonumber(s[j0]:sub(a, b - 1))] + a, b = b + 1, s[j0]:find(',', b + 1) + pl[#pl+1] = q2p[tonumber(s[j0]:sub(a, (b and b - 1) or nil))] + end + if has_GT then + if t[i]:sub(1, 1) ~= '.' then + local g = tonumber(t[i]:sub(1, 1)) + tonumber(t[i]:sub(3, 3)); + gt[#gt+1] = 1e-6; gt[#gt+1] = 1e-6; gt[#gt+1] = 1e-6 + gt[#gt - 2 + g] = 1 + ht[#ht+1] = tonumber(t[i]:sub(1, 1)); ht[#ht+1] = tonumber(t[i]:sub(3, 3)); + else + gt[#gt+1] = 1; gt[#gt+1] = 1; gt[#gt+1] = 1 + ht[#ht+1] = -1; ht[#ht+1] = -1; + end + end +-- print(t[i], pl[#pl-2], pl[#pl-1], pl[#pl], gt[#gt-2], gt[#gt-1], gt[#gt]) + end + if #pl == 0 then pl = nil end + local x = has_GT and { t[1], t[2], ht, gt, pl } or { t[1], t[2], nil, nil, pl } + return x +end + +-- Infer haplotype frequency +-- @param pdg genotype likelihoods P(D|g) generated by text_parse_pl(). pdg[] is 1-indexed. +-- @param eps precision [1e-5] +-- @return 2-locus haplotype frequencies, 0-indexed array +function algo_hapfreq2(pdg, eps) + eps = eps or 1e-5 + local n, f = #pdg[1] / 3, {[0]=0.25, 0.25, 0.25, 0.25} + for iter = 1, 100 do + local F = {[0]=0, 0, 0, 0} + for i = 0, n - 1 do + local p1, p2 = {[0]=pdg[1][i*3+1], pdg[1][i*3+2], pdg[1][i*3+3]}, {[0]=pdg[2][i*3+1], pdg[2][i*3+2], pdg[2][i*3+3]} + local u = { [0]= + f[0] * (f[0] * p1[0] * p2[0] + f[1] * p1[0] * p2[1] + f[2] * p1[1] * p2[0] + f[3] * p1[1] * p2[1]), + f[1] * (f[0] * p1[0] * p2[1] + f[1] * p1[0] * p2[2] + f[2] * p1[1] * p2[1] + f[3] * p1[1] * p2[2]), + f[2] * (f[0] * p1[1] * p2[0] + f[1] * p1[1] * p2[1] + f[2] * p1[2] * p2[0] + f[3] * p1[2] * p2[1]), + f[3] * (f[0] * p1[1] * p2[1] + f[1] * p1[1] * p2[2] + f[2] * p1[2] * p2[1] + f[3] * p1[2] * p2[2]) + } + local s = u[0] + u[1] + u[2] + u[3] + s = 1 / (s * n) + F[0] = F[0] + u[0] * s + F[1] = F[1] + u[1] * s + F[2] = F[2] + u[2] * s + F[3] = F[3] + u[3] * s + end + local e = 0 + for k = 0, 3 do + e = math.abs(f[k] - F[k]) > e and math.abs(f[k] - F[k]) or e + end + for k = 0, 3 do f[k] = F[k] end + if e < eps then break end +-- print(f[0], f[1], f[2], f[3]) + end + return f +end + +------------------------ +-- END: misc routines -- +------------------------ + + +--------------------- +-- BEGIN: commands -- +--------------------- + +-- CMD vcf2bgl: convert PL tagged VCF to Beagle input -- +function cmd_vcf2bgl() + if #arg == 0 then + print("\nUsage: vcf2bgl.lua ") + print("\nNB: This command finds PL by matching /(\\d+),(\\d+),(\\d+)/.\n"); + os.exit(1) + end + + local lookup = {} + for i = 0, 10000 do lookup[i] = string.format("%.4f", math.pow(10, -i/10)) end + + local fp = io.xopen(arg[1]) + for l in fp:lines() do + if l:sub(1, 2) == '##' then -- meta lines; do nothing + elseif l:sub(1, 1) == '#' then -- sample lines + local t, s = l:split('\t'), {} + for i = 10, #t do s[#s+1] = t[i]; s[#s+1] = t[i]; s[#s+1] = t[i] end + print('marker', 'alleleA', 'alleleB', table.concat(s, '\t')) + else -- data line + local t = l:split('\t'); + if t[5] ~= '.' and t[5]:find(",") == nil and #t[5] == 1 and #t[4] == 1 then -- biallic SNP + local x, z = -1, {}; + if t[9]:find('PL') then + for i = 10, #t do + local AA, Aa, aa = t[i]:match('(%d+),(%d+),(%d+)') + AA = tonumber(AA); Aa = tonumber(Aa); aa = tonumber(aa); + if AA ~= nil then + z[#z+1] = lookup[AA]; z[#z+1] = lookup[Aa]; z[#z+1] = lookup[aa]; + else z[#z+1] = 1; z[#z+1] = 1; z[#z+1] = 1; end + end + print(t[1]..':'..t[2], t[4], t[5], table.concat(z, '\t')) + elseif t[9]:find('GL') then + print('Error: not implemented') + os.exit(1) + end + end + end + end + fp:close() +end + +-- CMD bgl2vcf: convert Beagle output to VCF +function cmd_bgl2vcf() + if #arg < 2 then + print('Usage: bgl2vcf.lua ') + os.exit(1) + end + + local fpp = io.xopen(arg[1]); + local fpg = io.xopen(arg[2]); + for lg in fpg:lines() do + local tp, tg, a = fpp:read():split('%s'), lg:split('%s', 4), {} + if tp[1] == 'I' then + for i = 3, #tp, 2 do a[#a+1] = tp[i] end + print('#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', table.concat(a, '\t')) + else + local chr, pos = tg[1]:match('(%S+):(%d+)$') + a = {chr, pos, '.', tg[2], tg[3], 30, '.', '.', 'GT'} + for i = 3, #tp, 2 do + a[#a+1] = ((tp[i] == tg[2] and 0) or 1) .. '|' .. ((tp[i+1] == tg[2] and 0) or 1) + end + print(table.concat(a, '\t')) + end + end + fpg:close(); fpp:close(); +end + +-- CMD freq: count alleles in each population +function cmd_freq() + -- parse the command line + local site_only = true; -- print site allele frequency or not + for c in os.getopt(arg, 's') do + if c == 's' then site_only = false end + end + if #arg == 0 then + print("\nUsage: vcfutils.lua freq [-s] [samples.txt]\n") + print("NB: 1) This command only considers biallelic variants.") + print(" 2) Apply '-s' to get the allele frequency spectrum.") + print(" 3) 'samples.txt' is TAB-delimited with each line consisting of sample and population.") + print("") + os.exit(1) + end + + -- read the sample-population pairs + local pop, sample = {}, {} + if #arg > 1 then + local fp = io.xopen(arg[2]); + for l in fp:lines() do + local s, p = l:match("^(%S+)%s+(%S+)"); -- sample, population pair + sample[s] = p; -- FIXME: check duplications + if pop[p] then table.insert(pop[p], s) + else pop[p] = {s} end + end + fp:close(); + end + pop['NA'] = {} + + -- parse VCF + fp = (#arg >= 2 and io.xopen(arg[1])) or io.stdin; + local col, cnt = {}, {}; + for k in pairs(pop) do + col[k], cnt[k] = {}, {[0]=0}; + end + for l in fp:lines() do + if l:sub(1, 2) == '##' then -- meta lines; do nothing + elseif l:sub(1, 1) == '#' then -- the sample line + local t, del_NA = l:split('\t'), true; + for i = 10, #t do + local k = sample[t[i]] + if k == nil then + k, del_NA = 'NA', false + table.insert(pop[k], t[i]) + end + table.insert(col[k], i); + table.insert(cnt[k], 0); + table.insert(cnt[k], 0); + end + if del_NA then pop['NA'], col['NA'], cnt['NA'] = nil, nil, nil end + else -- data lines + local t = l:split('\t'); + if t[5] ~= '.' and t[5]:find(",") == nil then -- biallic + if site_only == true then io.write(t[1], '\t', t[2], '\t', t[4], '\t', t[5]) end + for k, v in pairs(col) do + local ac, an = 0, 0; + for i = 1, #v do + local a1, a2 = t[v[i]]:match("^(%d).(%d)"); + if a1 ~= nil then ac, an = ac + a1 + a2, an + 2 end + end + if site_only == true then io.write('\t', k, ':', an, ':', ac) end + if an == #cnt[k] then cnt[k][ac] = cnt[k][ac] + 1 end + end + if site_only == true then io.write('\n') end + end + end + end + fp:close(); + + -- print + if site_only == false then + for k, v in pairs(cnt) do + io.write(k .. "\t" .. #v); + for i = 0, #v do io.write("\t" .. v[i]) end + io.write('\n'); + end + end +end + +function cmd_vcf2chi2() + if #arg < 3 then + print("Usage: vcfutils.lua vcf2chi2 "); + os.exit(1) + end + + local g = {}; + + -- read the list of groups + local fp = io.xopen(arg[2]); + for l in fp:lines() do local x = l:match("^(%S+)"); g[x] = 1 end -- FIXME: check duplicate + fp:close() + fp = io.xopen(arg[3]); + for l in fp:lines() do local x = l:match("^(%S+)"); g[x] = 2 end + fp:close() + + -- process VCF + fp = io.xopen(arg[1]) + local h = {{}, {}} + for l in fp:lines() do + if l:sub(1, 2) == '##' then print(l) -- meta lines; do nothing + elseif l:sub(1, 1) == '#' then -- sample lines + local t = l:split('\t'); + for i = 10, #t do + if g[t[i]] == 1 then table.insert(h[1], i) + elseif g[t[i]] == 2 then table.insert(h[2], i) end + end + while #t > 8 do table.remove(t) end + print(table.concat(t, "\t")) + else -- data line + local t = l:split('\t'); + if t[5] ~= '.' and t[5]:find(",") == nil then -- biallic + local a = {{0, 0}, {0, 0}} + for i = 1, 2 do + for _, k in pairs(h[i]) do + if t[k]:find("^0.0") then a[i][1] = a[i][1] + 2 + elseif t[k]:find("^1.1") then a[i][2] = a[i][2] + 2 + elseif t[k]:find("^0.1") or t[k]:find("^1.0") then + a[i][1], a[i][2] = a[i][1] + 1, a[i][2] + 1 + end + end + end + local chi2, p, succ = matrix.chi2(a); + while #t > 8 do table.remove(t) end + --print(a[1][1], a[1][2], a[2][1], a[2][2], chi2, p); + if succ then print(table.concat(t, "\t") .. ";PCHI2=" .. string.format("%.3g", p) + .. string.format(';AF1=%.4g;AF2=%.4g,%.4g', (a[1][2]+a[2][2]) / (a[1][1]+a[1][2]+a[2][1]+a[2][2]), + a[1][2]/(a[1][1]+a[1][2]), a[2][2]/(a[2][1]+a[2][2]))) + else print(table.concat(t, "\t")) end + end + end + end + fp:close() +end + +-- CMD: compute r^2 +function cmd_r2() + local w, is_ht, is_gt = 1, false, false + for o, a in os.getopt(arg, 'w:hg') do + if o == 'w' then w = tonumber(a) + elseif o == 'h' then is_ht, is_gt = true, true + elseif o == 'g' then is_gt = true + end + end + if #arg == 0 then + print("Usage: vcfutils.lua r2 [-hg] [-w 1] ") + os.exit(1) + end + local stack, fp, q2p = {}, io.xopen(arg[1]), algo_init_q2p(1023) + for l in fp:lines() do + if l:sub(1, 1) ~= '#' then + local t = l:split('\t') + local x = text_parse_pl(t, q2p) + if #t[5] == 1 and t[5] ~= '.' then -- biallelic + local r2 = {} + for k = 1, w do + if is_gt == false then -- use PL + if stack[k] then + local pdg = { stack[k][5], x[5] } + r2[#r2+1] = algo_r2(algo_hapfreq2(pdg)) + else r2[#r2+1] = 0 end + elseif is_ht == false then -- use unphased GT + if stack[k] then + local pdg = { stack[k][4], x[4] } + r2[#r2+1] = algo_r2(algo_hapfreq2(pdg)) + else r2[#r2+1] = 0 end + else -- use phased GT + if stack[k] then + local f, ht = { [0]=0, 0, 0, 0 }, { stack[k][3], x[3] } + for i = 1, #ht[1] do + local j = ht[1][i] * 2 + ht[2][i] + f[j] = f[j] + 1 + end + local sum = f[0] + f[1] + f[2] + f[3] + for k = 0, 3 do f[k] = f[k] / sum end + r2[#r2+1] = algo_r2(f) + else r2[#r2+1] = 0 end + end + end + for k = 1, #r2 do + r2[k] = string.format('%.3f', r2[k]) + end + print(x[1], x[2], table.concat(r2, '\t')) + if #stack == w then table.remove(stack, 1) end + stack[#stack+1] = x + end + end + end + fp:close() +end + +------------------- +-- END: commands -- +------------------- + + +------------------- +-- MAIN FUNCTION -- +------------------- + +if #arg == 0 then + print("\nUsage: vcfutils.lua \n") + print("Command: freq count biallelic alleles in each population") + print(" r2 compute r^2") + print(" vcf2chi2 compute 1-degree chi-square between two groups of samples") + print(" vcf2bgl convert PL annotated VCF to Beagle input") + print(" bgl2vcf convert Beagle input to VCF") + print("") + os.exit(1) +end + +local cmd = arg[1] +table.remove(arg, 1) +if cmd == 'vcf2bgl' then cmd_vcf2bgl() +elseif cmd == 'bgl2vcf' then cmd_bgl2vcf() +elseif cmd == 'freq' then cmd_freq() +elseif cmd == 'r2' then cmd_r2() +elseif cmd == 'vcf2chi2' then cmd_vcf2chi2() +else + print('ERROR: unknown command "' .. cmd .. '"') + os.exit(1) +end -- 2.39.2