From 46c9296a4c05233a498f4b21d7562ae84e787682 Mon Sep 17 00:00:00 2001 From: martinahansen Date: Thu, 27 Oct 2011 08:19:25 +0000 Subject: [PATCH] added QA_454_report ruby script git-svn-id: http://biopieces.googlecode.com/svn/trunk@1601 74ccb610-7750-0410-82ae-013aeee3265d --- bp_scripts/QA_454_report.rb | 375 ++++++++++++++++++++++++++++++++++++ 1 file changed, 375 insertions(+) create mode 100755 bp_scripts/QA_454_report.rb diff --git a/bp_scripts/QA_454_report.rb b/bp_scripts/QA_454_report.rb new file mode 100755 index 0000000..1201389 --- /dev/null +++ b/bp_scripts/QA_454_report.rb @@ -0,0 +1,375 @@ +#!/usr/bin/env ruby + +# Copyright (C) 2011 Martin A. Hansen (mail@maasha.dk). + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + +require 'pp' +require 'tmpdir' +require 'base64' +require 'erb' + +class Report + def initialize(sff_file) + @sff_file = sff_file + tmpdir = Dir.mktmpdir + + @seq_analysis = SeqAnalyze.new(@sff_file, tmpdir) + @plots = PlotData.new(@sff_file, tmpdir) + @table_mid_join = MidTable.new(@sff_file, tmpdir) + @table_freq = FreqTable.new(@sff_file, tmpdir) + end + + # Support templating of member data. + def get_binding + binding + end + + private + + class SeqAnalyze + attr_reader :count, :min, :max, :mean, :bases, :gc, :hard, :soft + + def initialize(sff_file, tmpdir) + @sff_file = sff_file + @out1_file = File.join(tmpdir, "out1.txt") + @out2_file = File.join(tmpdir, "out2.txt") + @count = 0 + @min = 0 + @max = 0 + @mean = 0 + @bases = 0 + @gc = 0 + @hard = 0 + @soft = 0 + + bp_seq_analyze + parse_analyze_vals + parse_mean_vals + end + + private + + def bp_seq_analyze + STDERR.puts "Analyzing sequences ... " + system( + "read_sff -i #{@sff_file} | + progress_meter | + analyze_vals -k SEQ -o #{@out1_file} | + analyze_seq | + mean_vals -k 'GC%,HARD_MASK%,SOFT_MASK%' -o #{@out2_file} -x" + ) + STDERR.puts "done.\n" + end + + def parse_analyze_vals + File.open(@out1_file, "r") do |ios| + while not ios.eof? + line = ios.readline.chomp + + case line + when /COUNT\s+(\d+)/; then @count = $1 + when /MIN\s+(\d+)/; then @min = $1 + when /MAX\s+(\d+)/; then @max = $1 + when /MEAN\s+(\d+)/; then @mean = $1 + when /SUM\s+(\d+)/; then @bases = $1 + end + end + end + end + + def parse_mean_vals + File.open(@out2_file, "r") do |ios| + while not ios.eof? + line = ios.readline.chomp + + case line + when /GC%_MEAN: (.+)/; then @gc = $1 + when /HARD_MASK%_MEAN: (.+)/; then @hard = $1 + when /SOFT_MASK%_MEAN: (.+)/; then @soft = $1 + end + end + end + end + end + + class PlotData + attr_reader :lendist_unclipped, :lendist_clipped, :scores_unclipped, :scores_clipped, :mean_scores + + def initialize(sff_file, tmpdir) + @sff_file = sff_file + @plot1 = File.join(tmpdir, "plot1.png") + @plot2 = File.join(tmpdir, "plot2.png") + @plot3 = File.join(tmpdir, "plot3.png") + @plot4 = File.join(tmpdir, "plot4.png") + @plot5 = File.join(tmpdir, "plot5.png") + + bp_plot + + @lendist_unclipped = png2base64(@plot1) + @lendist_clipped = png2base64(@plot3) + @scores_unclipped = png2base64(@plot2) + @scores_clipped = png2base64(@plot4) + @mean_scores = png2base64(@plot5) + end + + def bp_plot + STDERR.puts "Creating plots ... " + system( + "read_sff -c -i #{@sff_file} | + progress_meter | + plot_distribution -k SEQ_LEN -T 'Length Distribution - unclipped' -t png -o #{@plot1} | + plot_scores -T 'Mean Quality Scores - unclipped' -t png -o #{@plot2} | + clip_seq | + plot_distribution -k SEQ_LEN -T 'Length Distribution - clipped' -t png -o #{@plot3} | + plot_scores -T 'Mean Quality Scores - clipped' -t png -o #{@plot4} | + mean_scores | + bin_vals -k SCORES_MEAN -b 5 | + plot_histogram -s num -k SCORES_MEAN_BIN -T 'Mean score bins' -X 'Bins (size 5)' -Y 'Count' -t png -o #{@plot5} -x" + ) + STDERR.puts "done.\n" + end + + private + + def png2base64(file) + png = "" + + File.open(file, "r") do |ios| + png = ios.read + end + + "data:image/png;base64," + Base64.encode64(png) + end + end + + class MidTable + def initialize(sff_file, tmpdir) + @sff_file = sff_file + @mid1_file = File.join(tmpdir, "mid1.tab") + @mid2_file = File.join(tmpdir, "mid2.tab") + @mid3_file = File.join(tmpdir, "mid3.tab") + @mid4_file = File.join(tmpdir, "mid_join.tab") + + bp_find_mids + bp_merge_mid_tables + end + + def each + File.open(@mid4_file, "r") do |ios| + while not ios.eof? do + fields = ios.readline.chomp.split("\t") + yield MidRow.new(fields[0], fields[1], fields[2], fields[3], fields[4]) + end + end + end + + private + + def bp_find_mids + STDERR.puts "Finding barcodes in raw sequences ... " + system( + "read_sff -i #{@sff_file} | + find_barcodes -p 4 -gr | + count_vals -k BARCODE_NAME | + uniq_vals -k BARCODE_NAME | + write_tab -c -k BARCODE_NAME,BARCODE,BARCODE_NAME_COUNT -o #{@mid1_file} -x" + ) + STDERR.puts "done.\n" + STDERR.puts "Finding barcodes in sequences >= 250 ... " + system( + "read_sff -i #{@sff_file} | + grab -e 'SEQ_LEN >= 250' | + find_barcodes -p 4 -gr | + count_vals -k BARCODE_NAME | + uniq_vals -k BARCODE_NAME | + write_tab -c -k BARCODE_NAME,BARCODE,BARCODE_NAME_COUNT -o #{@mid2_file} -x" + ) + STDERR.puts "done.\n" + STDERR.puts "Finding barcodes in sequences >= 250 with mean score >= 20 ... " + system( + "read_sff -i #{@sff_file} | + mean_scores | + grab -e 'SEQ_LEN >= 250' | + grab -e 'SCORES_MEAN >= 20' | + find_barcodes -p 4 -gr | + count_vals -k BARCODE_NAME | + uniq_vals -k BARCODE_NAME | + write_tab -c -k BARCODE_NAME,BARCODE,BARCODE_NAME_COUNT -o #{@mid3_file} -x" + ) + STDERR.puts "done.\n" + end + + def bp_merge_mid_tables + STDERR.print "Joining MID tables ... " + system( + "read_tab -i #{@mid1_file} | + rename_keys -k BARCODE_NAME,A | + rename_keys -k BARCODE_NAME_COUNT,TOTAL | + read_tab -i #{@mid2_file} | + rename_keys -k BARCODE_NAME,B | + rename_keys -k BARCODE_NAME_COUNT,L250 | + merge_records -k A,B | + read_tab -i #{@mid3_file} | + rename_keys -k BARCODE_NAME,C | + rename_keys -k BARCODE_NAME_COUNT,L250_S20 | + merge_records -k A,C | + rename_keys -k A,BARCODE_NAME | + sort_records -k BARCODE_NAME | + write_tab -ck BARCODE_NAME,BARCODE,TOTAL,L250,L250_S20 -o #{@mid4_file} -x" + ) + STDERR.puts "done.\n" + end + + class MidRow + attr_reader :mid_num, :mid_seq, :total, :l250, :l250_s20 + + def initialize(mid_num, mid_seq, total, l250, l250_s20) + @mid_num = mid_num + @mid_seq = mid_seq + @total = total + @l250 = l250 + @l250_s20 = l250_s20 + end + end + end + + class FreqTable + def initialize(sff_file, tmpdir) + @sff_file = sff_file + @tab_file = File.join(tmpdir, "freq.tab") + + bp_frequency_table + end + + def each + File.open(@tab_file, "r") do |ios| + while not ios.eof? do + fields = ios.readline.chomp.split("\t") + yield FreqRow.new(fields[0], fields[1], fields[2], fields[3], fields[4]) + end + end + end + + private + + def bp_frequency_table + STDERR.puts "Creating residue frequency table ... " + system( + "read_sff -i #{@sff_file} | + progress_meter | + extract_seq -l 50 | + uppercase_seq | + create_weight_matrix -p | + flip_tab | + write_tab -o #{@tab_file} -x" + ) + STDERR.puts "done.\n" + end + + class FreqRow + attr_reader :res_A, :res_C, :res_G, :res_N, :res_T + + def initialize(res_A, res_C, res_G, res_N, res_T) + @res_A = res_A + @res_C = res_C + @res_G = res_G + @res_N = res_N + @res_T = res_T + end + end + end +end + +template = %{ + + + QA 454 Report + + +

QA 454 Report

+

Date: #{Time.now}

+

File: <%= @sff_file %>

+

Sequence analysis

+ +

Sequence length distribution

+

The length distribution of unclipped reads:

+

plot_lendist_unclipped

+

The length distribution of clipped reads:

+

plot_lendist_clipped

+

Quality score means

+

The mean scores of the unclipped sequences:

+

plot_scores_unclipped

+

The mean scores of the clipped sequences:

+

plot_scores_clipped

+

Histogram of bins with mean quality scores:

+

plot_mean_scores

+

MID tag analysis

+

The below table contains the identified MID tags and the number of times they were found:

+

+ + <% @table_mid_join.each do |row| %> + + + + + + + + <% end %> +
<%= row.mid_num %><%= row.mid_seq %><%= row.total %><%= row.l250 %><%= row.l250_s20 %>
+

Residue frequency analysis

+

The below table contains the residue frequency (in percent) of the first 50 bases:

+ + <% @table_freq.each do |row| %> + + + + + + + + <% end %> +
<%= row.res_A %><%= row.res_T %><%= row.res_C %><%= row.res_G %><%= row.res_N %>
+ + +}.gsub(/^\s+/, '') + +html = ERB.new(template) + +ARGV.each do |file| + report = Report.new(file) + + html.run(report.get_binding) +end + +__END__ -- 2.39.2