3 # Copyright (C) 2011 Martin A. Hansen (mail@maasha.dk).
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
27 def initialize(sff_file)
31 @seq_analysis = SeqAnalyze.new(@sff_file, tmpdir)
32 @plots = PlotData.new(@sff_file, tmpdir)
33 @table_mid_join = MidTable.new(@sff_file, tmpdir)
36 # Support templating of member data.
44 attr_reader :count, :min, :max, :mean, :bases, :gc, :hard, :soft
46 def initialize(sff_file, tmpdir)
48 @anal_file = File.join(tmpdir, "out1.txt")
65 STDERR.puts "Analyzing sequences ... "
67 "read_sff -i #{@sff_file} |
70 analyze_vals -k GC%,HARD_MASK%,SOFT_MASK% |
71 write_tab -co #{@anal_file} -x"
76 def parse_analyze_vals
77 File.open(@anal_file, "r") do |ios|
79 line = ios.readline.chomp
82 when /COUNT\s+(\d+)/ then @count = $1
83 when /MIN\s+(\d+)/ then @min = $1
84 when /MAX\s+(\d+)/ then @max = $1
85 when /MEAN\s+(\d+)/ then @mean = $1
86 when /SUM\s+(\d+)/ then @bases = $1
87 when /GC%_MEAN:\s+(.+)/ then @gc = $1
88 when /HARD_MASK%_MEAN:\s+(.+)/ then @hard = $1
89 when /SOFT_MASK%_MEAN:\s+(.+)/ then @soft = $1
96 File.open(@out2_file, "r") do |ios|
98 line = ios.readline.chomp
101 when /GC%_MEAN: (.+)/; then @gc = $1
102 when /HARD_MASK%_MEAN: (.+)/; then @hard = $1
103 when /SOFT_MASK%_MEAN: (.+)/; then @soft = $1
111 attr_reader :lendist_unclipped, :lendist_clipped, :scores_unclipped, :scores_clipped, :mean_scores, :nucleotide_dist500, :nucleotide_dist50
113 def initialize(sff_file, tmpdir)
115 @plot1 = File.join(tmpdir, "plot1.png")
116 @plot2 = File.join(tmpdir, "plot2.png")
117 @plot3 = File.join(tmpdir, "plot3.png")
118 @plot4 = File.join(tmpdir, "plot4.png")
119 @plot5 = File.join(tmpdir, "plot5.png")
120 @plot6 = File.join(tmpdir, "plot6.png")
121 @plot7 = File.join(tmpdir, "plot7.png")
125 @lendist_unclipped = png2base64(@plot1)
126 @lendist_clipped = png2base64(@plot3)
127 @scores_unclipped = png2base64(@plot2)
128 @scores_clipped = png2base64(@plot4)
129 @mean_scores = png2base64(@plot5)
130 @nucleotide_dist500 = png2base64(@plot6)
131 @nucleotide_dist50 = png2base64(@plot7)
135 STDERR.puts "Creating plots ... "
137 "read_sff -m -i #{@sff_file} |
139 plot_distribution -k SEQ_LEN -T 'Length Distribution - unclipped' -t png -o #{@plot1} |
140 plot_scores -c -T 'Mean Quality Scores - unclipped' -t png -o #{@plot2} |
142 plot_distribution -k SEQ_LEN -T 'Length Distribution - clipped' -t png -o #{@plot3} |
143 plot_scores -c -T 'Mean Quality Scores - clipped' -t png -o #{@plot4} |
145 bin_vals -k SCORES_MEAN -b 5 |
146 plot_histogram -s num -k SCORES_MEAN_BIN -T 'Mean score bins' -X 'Bins (size 5)' -Y 'Count' -t png -o #{@plot5} |
148 plot_nucleotide_distribution -c -t png -o #{@plot6} |
150 plot_nucleotide_distribution -t png -o #{@plot7} -x"
152 STDERR.puts "done.\n"
160 File.open(file, "r") do |ios|
164 "data:image/png;base64," + Base64.encode64(png)
169 def initialize(sff_file, tmpdir)
171 @mid1_file = File.join(tmpdir, "mid1.tab")
172 @mid2_file = File.join(tmpdir, "mid2.tab")
173 @mid3_file = File.join(tmpdir, "mid3.tab")
174 @mid4_file = File.join(tmpdir, "mid_join.tab")
181 File.open(@mid4_file, "r") do |ios|
182 while not ios.eof? do
183 fields = ios.readline.chomp.split("\t")
184 yield MidRow.new(fields[0], fields[1], fields[2], fields[3], fields[4])
192 STDERR.puts "Finding barcodes in raw sequences ... "
194 "read_sff -i #{@sff_file} |
195 find_barcodes -p 4 -gr |
196 count_vals -k BARCODE_NAME |
197 uniq_vals -k BARCODE_NAME |
198 write_tab -c -k BARCODE_NAME,BARCODE,BARCODE_NAME_COUNT -o #{@mid1_file} -x"
200 STDERR.puts "done.\n"
201 STDERR.puts "Finding barcodes in sequences >= 250 ... "
203 "read_sff -i #{@sff_file} |
204 grab -e 'SEQ_LEN >= 250' |
205 find_barcodes -p 4 -gr |
206 count_vals -k BARCODE_NAME |
207 uniq_vals -k BARCODE_NAME |
208 write_tab -c -k BARCODE_NAME,BARCODE,BARCODE_NAME_COUNT -o #{@mid2_file} -x"
210 STDERR.puts "done.\n"
211 STDERR.puts "Finding barcodes in sequences >= 250 with mean score >= 20 ... "
213 "read_sff -i #{@sff_file} |
215 grab -e 'SEQ_LEN >= 250' |
216 grab -e 'SCORES_MEAN >= 20' |
217 find_barcodes -p 4 -gr |
218 count_vals -k BARCODE_NAME |
219 uniq_vals -k BARCODE_NAME |
220 write_tab -c -k BARCODE_NAME,BARCODE,BARCODE_NAME_COUNT -o #{@mid3_file} -x"
222 STDERR.puts "done.\n"
225 def bp_merge_mid_tables
226 STDERR.print "Joining MID tables ... "
228 "read_tab -i #{@mid1_file} |
229 rename_keys -k BARCODE_NAME,A |
230 rename_keys -k BARCODE_NAME_COUNT,TOTAL |
231 read_tab -i #{@mid2_file} |
232 rename_keys -k BARCODE_NAME,B |
233 rename_keys -k BARCODE_NAME_COUNT,L250 |
234 merge_records -k A,B |
235 read_tab -i #{@mid3_file} |
236 rename_keys -k BARCODE_NAME,C |
237 rename_keys -k BARCODE_NAME_COUNT,L250_S20 |
238 merge_records -k A,C |
239 rename_keys -k A,BARCODE_NAME |
240 sort_records -k BARCODE_NAME |
241 write_tab -ck BARCODE_NAME,BARCODE,TOTAL,L250,L250_S20 -o #{@mid4_file} -x"
243 STDERR.puts "done.\n"
247 attr_reader :mid_num, :mid_seq, :total, :l250, :l250_s20
249 def initialize(mid_num, mid_seq, total, l250, l250_s20)
263 <title>QA 454 Report</title>
266 <h1>QA 454 Report</h1>
267 <p>Date: #{Time.now}</p>
268 <p>File: <%= @sff_file %></p>
269 <h2>Sequence analysis</h2>
271 <li>Number of sequences in the file: <%= @seq_analysis.count %></li>
272 <li>Minimum sequence length found: <%= @seq_analysis.min %></li>
273 <li>Maximum sequence length found: <%= @seq_analysis.max %></li>
274 <li>Mean sequence length found: <%= @seq_analysis.mean %></li>
275 <li>Total number of bases in the file: <%= @seq_analysis.bases %></li>
276 <li>Mean GC% content: <%= @seq_analysis.gc %></li>
277 <li>Mean of hard masked sequence (i.e. % of N's): <%= @seq_analysis.hard %></li>
278 <li>Mean of soft masked sequence (i.e. % lowercase residues = clipped sequence): <%= @seq_analysis.soft %></li>
280 <h2>Sequence length distribution</h2>
281 <p>The length distribution of unclipped reads:</p>
282 <p><img alt="plot_lendist_unclipped" src="<%= @plots.lendist_unclipped %>" width="600" /></p>
283 <p>The length distribution of clipped reads:</p>
284 <p><img alt="plot_lendist_clipped" src="<%= @plots.lendist_clipped %>" width="600" /></p>
285 <h2>Quality score means</h2>
286 <p>The mean scores of the unclipped sequences:</p>
287 <p><img alt="plot_scores_unclipped" src="<%= @plots.scores_unclipped %>" width="600" /></p>
288 <p>The mean scores of the clipped sequences:</p>
289 <p><img alt="plot_scores_clipped" src="<%= @plots.scores_clipped %>" width="600" /></p>
290 <p>Histogram of bins with mean quality scores:</p>
291 <p><img alt="plot_mean_scores" src="<%= @plots.mean_scores %>" width="600" /></p>
292 <h2>MID tag analysis</h2>
293 <p>The below table contains the identified MID tags and the number of times they were found:<p>
295 <li>BARCODE_NAME is the MID tag identifier.</li>
296 <li>BARCODE is the sequence of the MID tag.</li>
297 <li>TOTAL is the number of times this MID tag was found.</li>
298 <li>L250 is the a subset count of TOTAL af sequences longer than 250 bases</li>
299 <li>L250_S20 is a subset count of L250 af sequences with a mean score above 20</li>
302 <% @table_mid_join.each do |row| %>
304 <td><%= row.mid_num %></td>
305 <td><%= row.mid_seq %></td>
306 <td><%= row.total %></td>
307 <td><%= row.l250 %></td>
308 <td><%= row.l250_s20 %></td>
312 <h2>Residue frequency analysis</h2>
313 <p>Plot of nucleotide distribution in percent of the first 50 bases:</p>
314 <p><img alt="plot_nucleotide_distribution" src="<%= @plots.nucleotide_dist50 %>" width="600" /></p>
315 <p>Plot of nucleotide distribution in percent of the first 500 bases:</p>
316 <p><img alt="plot_nucleotide_distribution" src="<%= @plots.nucleotide_dist500 %>" width="600" /></p>
321 html = ERB.new(template)
324 report = Report.new(file)
326 html.run(report.get_binding)