3 # Author: lh3, converted to python and modified to add -C option by Aylwyn Scally
6 # varfilter.py is a port of Heng's samtools.pl varFilter script into
7 # python, with an additional -C INT option. This option sets a minimum
8 # consensus score, above which the script will output a pileup line
9 # wherever it _could have_ called a variant, even if none is actually
10 # called (i.e. hom-ref positions). This is important if you want to
11 # subsequently merge the calls with those for another individual to get a
12 # synoptic view of calls at each site. Without this option, and in all
13 # other respects, it behaves like samtools.pl varFilter.
15 # Aylwyn Scally as6@sanger.ac.uk
20 # C low CNS quality (hom-ref only)
23 # W too many SNPs in a window (SNP only)
24 # G close to a high-quality indel (SNP only)
25 # Q low RMS mapping quality (SNP only)
26 # g close to another indel with higher quality (indel only)
27 # s low SNP quality (SNP only)
28 # i low indel quality (indel only)
35 print '''usage: varfilter.py [options] [cns-pileup]
37 Options: -Q INT minimum RMS mapping quality for SNPs
38 -q INT minimum RMS mapping quality for gaps
39 -d INT minimum read depth
40 -D INT maximum read depth
41 -S INT minimum SNP quality
42 -i INT minimum indel quality
43 -C INT minimum consensus quality for hom-ref sites
45 -G INT min indel score for nearby SNP filtering
46 -w INT SNP within INT bp around a gap to be filtered
48 -W INT window size for filtering dense SNPs
49 -N INT max number of SNPs in a window
51 -l INT window size for filtering adjacent gaps
53 -p print filtered variants'''
55 def varFilter_aux(first, is_print):
58 sys.stdout.write("\t".join(first[4:]) + "\n")
60 sys.stderr.write("\t".join(["UQdDWGgsiCX"[first[1]]] + first[4:]) + "\n")
80 options, args = getopt.gnu_getopt(sys.argv[1:], 'pq:d:D:l:Q:w:W:N:G:S:i:C:', [])
81 except getopt.GetoptError:
84 for (oflag, oarg) in options:
85 if oflag == '-d': mindepth = int(oarg)
86 if oflag == '-D': maxdepth = int(oarg)
87 if oflag == '-l': gapgapwin = int(oarg)
88 if oflag == '-Q': minsnpmapq = int(oarg)
89 if oflag == '-q': mingapmapq = int(oarg)
90 if oflag == '-G': minindelscore = int(oarg)
91 if oflag == '-s': scorefactor = int(oarg)
92 if oflag == '-w': snpgapwin = int(oarg)
93 if oflag == '-W': densesnpwin = int(oarg)
94 if oflag == '-C': mincnsq = int(oarg)
95 if oflag == '-N': densesnps = int(oarg)
96 if oflag == '-p': printfilt = True
97 if oflag == '-S': minsnpq = int(oarg)
98 if oflag == '-i': minindelq = int(oarg)
105 # calculate the window size
106 max_dist = max(gapgapwin, snpgapwin, densesnpwin)
109 for t in (line.strip().split() for line in inp):
110 (flt, score) = (0, -1)
114 is_snp = t[2].upper() != t[3].upper()
115 if not (is_snp or mincnsq):
117 # clear the out-of-range elements
119 # Still on the same chromosome and the first element's window still affects this position?
120 if staging[0][4] == t[0] and int(staging[0][5]) + staging[0][2] + max_dist >= int(t[1]):
122 varFilter_aux(staging.pop(0), printfilt)
124 # first a simple filter
125 if int(t[7]) < mindepth:
127 elif int(t[7]) > maxdepth:
129 if t[2] == '*': # an indel
130 if minindelq and minindelq > int(t[5]):
133 if minsnpq and minsnpq> int(t[5]):
136 if mincnsq and mincnsq > int(t[4]):
139 # site dependent filters
142 if t[2] == '*': # an indel
143 # If deletion, remember the length of the deletion
144 (a,b) = t[3].split('/')
148 if a[0] == '-': dlen=alen
149 elif b[0] == '-': dlen=blen
151 if int(t[6]) < mingapmapq:
154 if int(t[5]) >= minindelscore:
155 for x in (y for y in staging if y[3]):
156 # Is it a SNP and is it outside the SNP filter window?
157 if x[0] >= 0 or int(x[5]) + x[2] + snpgapwin < int(t[1]):
162 # calculate the filtering score (different from indel quality)
165 score += scorefactor * int(t[10])
167 score += scorefactor * int(t[11])
168 # check the staging list for indel filtering
169 for x in (y for y in staging if y[3]):
170 # Is it a SNP and is it outside the gap filter window
171 if x[0] < 0 or int(x[5]) + x[2] + gapgapwin < int(t[1]):
178 else: # a SNP or hom-ref
179 if int(t[6]) < minsnpmapq:
181 # check adjacent SNPs
183 for x in (y for y in staging if y[3]):
184 if x[0] < 0 and int(x[5]) + x[2] + densesnpwin >= int(t[1]) and (x[1] == 0 or x[1] == 4 or x[1] == 5):
187 # filtering is necessary
190 for x in (y for y in staging if y[3]):
191 if x[0] < 0 and int(x[5]) + x[2] + densesnpwin >= int(t[1]) and x[1] == 0:
193 else: # then check gap filter
194 for x in (y for y in staging if y[3]):
195 if x[0] < 0 or int(x[5]) + x[2] + snpgapwin < int(t[1]):
197 if x[0] >= minindelscore:
201 staging.append([score, flt, dlen, is_snp] + t)
203 # output the last few elements in the staging list
205 varFilter_aux(staging.pop(0), printfilt)