3 # Copyright (C) 2011 Martin A. Hansen (mail@maasha.dk).
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
27 def initialize(sff_file)
31 @seq_analysis = SeqAnalyze.new(@sff_file, tmpdir)
32 @plots = PlotData.new(@sff_file, tmpdir)
33 @table_mid_join = MidTable.new(@sff_file, tmpdir)
34 @table_freq = FreqTable.new(@sff_file, tmpdir)
37 # Support templating of member data.
45 attr_reader :count, :min, :max, :mean, :bases, :gc, :hard, :soft
47 def initialize(sff_file, tmpdir)
49 @out1_file = File.join(tmpdir, "out1.txt")
50 @out2_file = File.join(tmpdir, "out2.txt")
68 STDERR.puts "Analyzing sequences ... "
70 "read_sff -i #{@sff_file} |
72 analyze_vals -k SEQ -o #{@out1_file} |
74 mean_vals -k 'GC%,HARD_MASK%,SOFT_MASK%' -o #{@out2_file} -x"
79 def parse_analyze_vals
80 File.open(@out1_file, "r") do |ios|
82 line = ios.readline.chomp
85 when /COUNT\s+(\d+)/; then @count = $1
86 when /MIN\s+(\d+)/; then @min = $1
87 when /MAX\s+(\d+)/; then @max = $1
88 when /MEAN\s+(\d+)/; then @mean = $1
89 when /SUM\s+(\d+)/; then @bases = $1
96 File.open(@out2_file, "r") do |ios|
98 line = ios.readline.chomp
101 when /GC%_MEAN: (.+)/; then @gc = $1
102 when /HARD_MASK%_MEAN: (.+)/; then @hard = $1
103 when /SOFT_MASK%_MEAN: (.+)/; then @soft = $1
111 attr_reader :lendist_unclipped, :lendist_clipped, :scores_unclipped, :scores_clipped, :mean_scores
113 def initialize(sff_file, tmpdir)
115 @plot1 = File.join(tmpdir, "plot1.png")
116 @plot2 = File.join(tmpdir, "plot2.png")
117 @plot3 = File.join(tmpdir, "plot3.png")
118 @plot4 = File.join(tmpdir, "plot4.png")
119 @plot5 = File.join(tmpdir, "plot5.png")
123 @lendist_unclipped = png2base64(@plot1)
124 @lendist_clipped = png2base64(@plot3)
125 @scores_unclipped = png2base64(@plot2)
126 @scores_clipped = png2base64(@plot4)
127 @mean_scores = png2base64(@plot5)
131 STDERR.puts "Creating plots ... "
133 "read_sff -m -i #{@sff_file} |
135 plot_distribution -k SEQ_LEN -T 'Length Distribution - unclipped' -t png -o #{@plot1} |
136 plot_scores -c -T 'Mean Quality Scores - unclipped' -t png -o #{@plot2} |
138 plot_distribution -k SEQ_LEN -T 'Length Distribution - clipped' -t png -o #{@plot3} |
139 plot_scores -c -T 'Mean Quality Scores - clipped' -t png -o #{@plot4} |
141 bin_vals -k SCORES_MEAN -b 5 |
142 plot_histogram -s num -k SCORES_MEAN_BIN -T 'Mean score bins' -X 'Bins (size 5)' -Y 'Count' -t png -o #{@plot5} -x"
144 STDERR.puts "done.\n"
152 File.open(file, "r") do |ios|
156 "data:image/png;base64," + Base64.encode64(png)
161 def initialize(sff_file, tmpdir)
163 @mid1_file = File.join(tmpdir, "mid1.tab")
164 @mid2_file = File.join(tmpdir, "mid2.tab")
165 @mid3_file = File.join(tmpdir, "mid3.tab")
166 @mid4_file = File.join(tmpdir, "mid_join.tab")
173 File.open(@mid4_file, "r") do |ios|
174 while not ios.eof? do
175 fields = ios.readline.chomp.split("\t")
176 yield MidRow.new(fields[0], fields[1], fields[2], fields[3], fields[4])
184 STDERR.puts "Finding barcodes in raw sequences ... "
186 "read_sff -i #{@sff_file} |
187 find_barcodes -p 4 -gr |
188 count_vals -k BARCODE_NAME |
189 uniq_vals -k BARCODE_NAME |
190 write_tab -c -k BARCODE_NAME,BARCODE,BARCODE_NAME_COUNT -o #{@mid1_file} -x"
192 STDERR.puts "done.\n"
193 STDERR.puts "Finding barcodes in sequences >= 250 ... "
195 "read_sff -i #{@sff_file} |
196 grab -e 'SEQ_LEN >= 250' |
197 find_barcodes -p 4 -gr |
198 count_vals -k BARCODE_NAME |
199 uniq_vals -k BARCODE_NAME |
200 write_tab -c -k BARCODE_NAME,BARCODE,BARCODE_NAME_COUNT -o #{@mid2_file} -x"
202 STDERR.puts "done.\n"
203 STDERR.puts "Finding barcodes in sequences >= 250 with mean score >= 20 ... "
205 "read_sff -i #{@sff_file} |
207 grab -e 'SEQ_LEN >= 250' |
208 grab -e 'SCORES_MEAN >= 20' |
209 find_barcodes -p 4 -gr |
210 count_vals -k BARCODE_NAME |
211 uniq_vals -k BARCODE_NAME |
212 write_tab -c -k BARCODE_NAME,BARCODE,BARCODE_NAME_COUNT -o #{@mid3_file} -x"
214 STDERR.puts "done.\n"
217 def bp_merge_mid_tables
218 STDERR.print "Joining MID tables ... "
220 "read_tab -i #{@mid1_file} |
221 rename_keys -k BARCODE_NAME,A |
222 rename_keys -k BARCODE_NAME_COUNT,TOTAL |
223 read_tab -i #{@mid2_file} |
224 rename_keys -k BARCODE_NAME,B |
225 rename_keys -k BARCODE_NAME_COUNT,L250 |
226 merge_records -k A,B |
227 read_tab -i #{@mid3_file} |
228 rename_keys -k BARCODE_NAME,C |
229 rename_keys -k BARCODE_NAME_COUNT,L250_S20 |
230 merge_records -k A,C |
231 rename_keys -k A,BARCODE_NAME |
232 sort_records -k BARCODE_NAME |
233 write_tab -ck BARCODE_NAME,BARCODE,TOTAL,L250,L250_S20 -o #{@mid4_file} -x"
235 STDERR.puts "done.\n"
239 attr_reader :mid_num, :mid_seq, :total, :l250, :l250_s20
241 def initialize(mid_num, mid_seq, total, l250, l250_s20)
252 def initialize(sff_file, tmpdir)
254 @tab_file = File.join(tmpdir, "freq.tab")
260 File.open(@tab_file, "r") do |ios|
261 while not ios.eof? do
262 fields = ios.readline.chomp.split("\t")
263 yield FreqRow.new(fields[0], fields[1], fields[2], fields[3], fields[4])
270 def bp_frequency_table
271 STDERR.puts "Creating residue frequency table ... "
273 "read_sff -i #{@sff_file} |
277 create_weight_matrix -p |
279 write_tab -o #{@tab_file} -x"
281 STDERR.puts "done.\n"
285 attr_reader :res_A, :res_C, :res_G, :res_N, :res_T
287 def initialize(res_A, res_C, res_G, res_N, res_T)
301 <title>QA 454 Report</title>
304 <h1>QA 454 Report</h1>
305 <p>Date: #{Time.now}</p>
306 <p>File: <%= @sff_file %></p>
307 <h2>Sequence analysis</h2>
309 <li>Number of sequences in the file: <%= @seq_analysis.count %></li>
310 <li>Minimum sequence length found: <%= @seq_analysis.min %></li>
311 <li>Maximum sequence length found: <%= @seq_analysis.max %></li>
312 <li>Mean sequence length found: <%= @seq_analysis.mean %></li>
313 <li>Total number of bases in the file: <%= @seq_analysis.bases %></li>
314 <li>Mean GC% content: <%= @seq_analysis.gc %></li>
315 <li>Mean of hard masked sequence (i.e. % of N's): <%= @seq_analysis.hard %></li>
316 <li>Mean of soft masked sequence (i.e. % lowercase residues = clipped sequence): <%= @seq_analysis.soft %></li>
318 <h2>Sequence length distribution</h2>
319 <p>The length distribution of unclipped reads:</p>
320 <p><img alt="plot_lendist_unclipped" src="<%= @plots.lendist_unclipped %>" width="600" /></p>
321 <p>The length distribution of clipped reads:</p>
322 <p><img alt="plot_lendist_clipped" src="<%= @plots.lendist_clipped %>" width="600" /></p>
323 <h2>Quality score means</h2>
324 <p>The mean scores of the unclipped sequences:</p>
325 <p><img alt="plot_scores_unclipped" src="<%= @plots.scores_unclipped %>" width="600" /></p>
326 <p>The mean scores of the clipped sequences:</p>
327 <p><img alt="plot_scores_clipped" src="<%= @plots.scores_clipped %>" width="600" /></p>
328 <p>Histogram of bins with mean quality scores:</p>
329 <p><img alt="plot_mean_scores" src="<%= @plots.mean_scores %>" width="600" /></p>
330 <h2>MID tag analysis</h2>
331 <p>The below table contains the identified MID tags and the number of times they were found:<p>
333 <li>BARCODE_NAME is the MID tag identifier.</li>
334 <li>BARCODE is the sequence of the MID tag.</li>
335 <li>TOTAL is the number of times this MID tag was found.</li>
336 <li>L250 is the a subset count of TOTAL af sequences longer than 250 bases</li>
337 <li>L250_S20 is a subset count of L250 af sequences with a mean score above 20</li>
340 <% @table_mid_join.each do |row| %>
342 <td><%= row.mid_num %></td>
343 <td><%= row.mid_seq %></td>
344 <td><%= row.total %></td>
345 <td><%= row.l250 %></td>
346 <td><%= row.l250_s20 %></td>
350 <h2>Residue frequency analysis</h2>
351 <p>The below table contains the residue frequency (in percent) of the first 50 bases:</p>
353 <% @table_freq.each do |row| %>
355 <td><%= row.res_A %></td>
356 <td><%= row.res_T %></td>
357 <td><%= row.res_C %></td>
358 <td><%= row.res_G %></td>
359 <td><%= row.res_N %></td>
367 html = ERB.new(template)
370 report = Report.new(file)
372 html.run(report.get_binding)