bp_scripts/QA_454_report.rb

   1 #!/usr/bin/env ruby
   2
   3 # Copyright (C) 2011 Martin A. Hansen (mail@maasha.dk).
   4
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; either version 2
   8 # of the License, or (at your option) any later version.
   9
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18
  19 # http://www.gnu.org/copyleft/gpl.html
  20
  21 require 'pp'
  22 require 'tmpdir'
  23 require 'base64'
  24 require 'erb'
  25
  26 class Report
  27   def initialize(sff_file)
  28     @sff_file = sff_file
  29     tmpdir    = Dir.mktmpdir
  30
  31     @seq_analysis   = SeqAnalyze.new(@sff_file, tmpdir)
  32     @plots          = PlotData.new(@sff_file, tmpdir)
  33     @table_mid_join = MidTable.new(@sff_file, tmpdir)
  34   end
  35
  36   # Support templating of member data.
  37   def get_binding
  38     binding
  39   end
  40
  41   private
  42
  43   class SeqAnalyze
  44     attr_reader :count, :min, :max, :mean, :bases, :gc, :hard, :soft
  45
  46     def initialize(sff_file, tmpdir)
  47       @sff_file  = sff_file
  48       @anal_file = File.join(tmpdir, "out1.txt")
  49       @count     = 0
  50       @min       = 0
  51       @max       = 0
  52       @mean      = 0
  53       @bases     = 0
  54       @gc        = 0
  55       @hard      = 0
  56       @soft      = 0
  57
  58       bp_seq_analyze
  59       parse_analyze_vals
  60     end
  61
  62     private
  63
  64     def bp_seq_analyze
  65       STDERR.puts "Analyzing sequences ... "
  66       system(
  67         "read_sff -i #{@sff_file} |
  68          progress_meter |
  69          analyze_seq |
  70          analyze_vals -k GC%,HARD_MASK%,SOFT_MASK% |
  71          write_tab -co #{@anal_file} -x"
  72        )
  73       STDERR.puts "done.\n"
  74     end
  75
  76     def parse_analyze_vals
  77       File.open(@anal_file, "r") do |ios|
  78         while not ios.eof?
  79           line = ios.readline.chomp
  80
  81           case line
  82           when /COUNT\s+(\d+)/           then @count = $1
  83           when /MIN\s+(\d+)/             then @min   = $1
  84           when /MAX\s+(\d+)/             then @max   = $1
  85           when /MEAN\s+(\d+)/            then @mean  = $1
  86           when /SUM\s+(\d+)/             then @bases = $1
  87           when /GC%_MEAN:\s+(.+)/        then @gc    = $1
  88           when /HARD_MASK%_MEAN:\s+(.+)/ then @hard  = $1
  89           when /SOFT_MASK%_MEAN:\s+(.+)/ then @soft  = $1
  90           end
  91         end
  92       end
  93     end
  94
  95     def parse_mean_vals
  96       File.open(@out2_file, "r") do |ios|
  97         while not ios.eof?
  98           line = ios.readline.chomp
  99
 100           case line
 101           when /GC%_MEAN: (.+)/;        then @gc   = $1
 102           when /HARD_MASK%_MEAN: (.+)/; then @hard = $1
 103           when /SOFT_MASK%_MEAN: (.+)/; then @soft = $1
 104           end
 105         end
 106       end
 107     end
 108   end
 109
 110   class PlotData
 111     attr_reader :lendist_unclipped, :lendist_clipped, :scores_unclipped, :scores_clipped, :mean_scores, :nucleotide_dist500, :nucleotide_dist50
 112
 113     def initialize(sff_file, tmpdir)
 114       @sff_file = sff_file
 115       @plot1    = File.join(tmpdir, "plot1.png")
 116       @plot2    = File.join(tmpdir, "plot2.png")
 117       @plot3    = File.join(tmpdir, "plot3.png")
 118       @plot4    = File.join(tmpdir, "plot4.png")
 119       @plot5    = File.join(tmpdir, "plot5.png")
 120       @plot6    = File.join(tmpdir, "plot6.png")
 121       @plot7    = File.join(tmpdir, "plot7.png")
 122
 123       bp_plot
 124
 125       @lendist_unclipped  = png2base64(@plot1)
 126       @lendist_clipped    = png2base64(@plot3)
 127       @scores_unclipped   = png2base64(@plot2)
 128       @scores_clipped     = png2base64(@plot4)
 129       @mean_scores        = png2base64(@plot5)
 130       @nucleotide_dist500 = png2base64(@plot6)
 131       @nucleotide_dist50  = png2base64(@plot7)
 132     end
 133
 134     def bp_plot
 135       STDERR.puts "Creating plots ... "
 136       system(
 137         "read_sff -m -i #{@sff_file} |
 138          progress_meter |
 139          plot_distribution -k SEQ_LEN -T 'Length Distribution - unclipped' -t png -o #{@plot1} |
 140          plot_scores -c -T 'Mean Quality Scores - unclipped' -t png -o #{@plot2} |
 141          clip_seq |
 142          plot_distribution -k SEQ_LEN -T 'Length Distribution - clipped' -t png -o #{@plot3} |
 143          plot_scores -c -T 'Mean Quality Scores - clipped' -t png -o #{@plot4} |
 144          mean_scores |
 145          bin_vals -k SCORES_MEAN -b 5 |
 146          plot_histogram -s num -k SCORES_MEAN_BIN -T 'Mean score bins' -X 'Bins (size 5)' -Y 'Count' -t png -o #{@plot5} |
 147          extract_seq -l 500 |
 148          plot_nucleotide_distribution -c -t png -o #{@plot6} |
 149          extract_seq -l 50 |
 150          plot_nucleotide_distribution -t png -o #{@plot7} -x"
 151       )
 152       STDERR.puts "done.\n"
 153     end
 154
 155     private
 156
 157     def png2base64(file)
 158       png = ""
 159
 160       File.open(file, "r") do |ios|
 161         png = ios.read
 162       end
 163
 164       "data:image/png;base64," + Base64.encode64(png)
 165     end
 166   end
 167
 168   class MidTable
 169     def initialize(sff_file, tmpdir)
 170       @sff_file  = sff_file
 171       @mid1_file = File.join(tmpdir, "mid1.tab")
 172       @mid2_file = File.join(tmpdir, "mid2.tab")
 173       @mid3_file = File.join(tmpdir, "mid3.tab")
 174       @mid4_file = File.join(tmpdir, "mid_join.tab")
 175
 176       bp_find_mids
 177       bp_merge_mid_tables
 178     end
 179
 180     def each
 181       File.open(@mid4_file, "r") do |ios|
 182         while not ios.eof? do
 183           fields = ios.readline.chomp.split("\t")
 184           yield MidRow.new(fields[0], fields[1], fields[2], fields[3], fields[4])
 185         end
 186       end
 187     end
 188
 189     private
 190
 191     def bp_find_mids
 192       STDERR.puts "Finding barcodes in raw sequences ... "
 193       system(
 194         "read_sff -i #{@sff_file} |
 195          find_barcodes -p 4 -gr |
 196          count_vals -k BARCODE_NAME |
 197          uniq_vals -k BARCODE_NAME |
 198          write_tab -c -k BARCODE_NAME,BARCODE,BARCODE_NAME_COUNT -o #{@mid1_file} -x"
 199       )
 200       STDERR.puts "done.\n"
 201       STDERR.puts "Finding barcodes in sequences >= 250 ... "
 202       system(
 203         "read_sff -i #{@sff_file} |
 204          grab -e 'SEQ_LEN >= 250' |
 205          find_barcodes -p 4 -gr |
 206          count_vals -k BARCODE_NAME |
 207          uniq_vals -k BARCODE_NAME |
 208          write_tab -c -k BARCODE_NAME,BARCODE,BARCODE_NAME_COUNT -o #{@mid2_file} -x"
 209       )
 210       STDERR.puts "done.\n"
 211       STDERR.puts "Finding barcodes in sequences >= 250 with mean score >= 20 ... "
 212       system(
 213         "read_sff -i #{@sff_file} |
 214          mean_scores |
 215          grab -e 'SEQ_LEN >= 250' |
 216          grab -e 'SCORES_MEAN >= 20' |
 217          find_barcodes -p 4 -gr |
 218          count_vals -k BARCODE_NAME |
 219          uniq_vals -k BARCODE_NAME |
 220          write_tab -c -k BARCODE_NAME,BARCODE,BARCODE_NAME_COUNT -o #{@mid3_file} -x"
 221       )
 222       STDERR.puts "done.\n"
 223     end
 224
 225     def bp_merge_mid_tables
 226       STDERR.print "Joining MID tables ... "
 227       system(
 228         "read_tab -i #{@mid1_file} |
 229          rename_keys -k BARCODE_NAME,A |
 230          rename_keys -k BARCODE_NAME_COUNT,TOTAL |
 231          read_tab -i #{@mid2_file} |
 232          rename_keys -k BARCODE_NAME,B |
 233          rename_keys -k BARCODE_NAME_COUNT,L250 |
 234          merge_records -k A,B |
 235          read_tab -i #{@mid3_file} |
 236          rename_keys -k BARCODE_NAME,C |
 237          rename_keys -k BARCODE_NAME_COUNT,L250_S20 |
 238          merge_records -k A,C |
 239          rename_keys -k A,BARCODE_NAME |
 240          sort_records -k BARCODE_NAME |
 241          write_tab -ck BARCODE_NAME,BARCODE,TOTAL,L250,L250_S20 -o #{@mid4_file} -x"
 242       )
 243       STDERR.puts "done.\n"
 244     end
 245
 246     class MidRow
 247       attr_reader :mid_num, :mid_seq, :total, :l250, :l250_s20
 248
 249       def initialize(mid_num, mid_seq, total, l250, l250_s20)
 250         @mid_num  = mid_num
 251         @mid_seq  = mid_seq
 252         @total    = total
 253         @l250     = l250
 254         @l250_s20 = l250_s20
 255       end
 256     end
 257   end
 258 end
 259
 260 template = %{
 261   <html>
 262     <head>
 263       <title>QA 454 Report</title>
 264     </head>
 265     <body>
 266       <h1>QA 454 Report</h1>
 267       <p>Date: #{Time.now}</p>
 268       <p>File: <%= @sff_file %></p>
 269       <h2>Sequence analysis</h2>
 270       <ul>
 271         <li>Number of sequences in the file: <%= @seq_analysis.count %></li>
 272         <li>Minimum sequence length found: <%= @seq_analysis.min %></li>
 273         <li>Maximum sequence length found: <%= @seq_analysis.max %></li>
 274         <li>Mean sequence length found: <%= @seq_analysis.mean %></li>
 275         <li>Total number of bases in the file: <%= @seq_analysis.bases %></li>
 276         <li>Mean GC% content: <%= @seq_analysis.gc %></li>
 277         <li>Mean of hard masked sequence (i.e. % of N's): <%= @seq_analysis.hard %></li>
 278         <li>Mean of soft masked sequence (i.e. % lowercase residues = clipped sequence): <%= @seq_analysis.soft %></li>
 279       </ul>
 280       <h2>Sequence length distribution</h2>
 281       <p>The length distribution of unclipped reads:</p>
 282       <p><img alt="plot_lendist_unclipped" src="<%= @plots.lendist_unclipped %>" width="600" /></p>
 283       <p>The length distribution of clipped reads:</p>
 284       <p><img alt="plot_lendist_clipped" src="<%= @plots.lendist_clipped %>" width="600" /></p>
 285       <h2>Quality score means</h2>
 286       <p>The mean scores of the unclipped sequences:</p>
 287       <p><img alt="plot_scores_unclipped" src="<%= @plots.scores_unclipped %>" width="600" /></p>
 288       <p>The mean scores of the clipped sequences:</p>
 289       <p><img alt="plot_scores_clipped" src="<%= @plots.scores_clipped %>" width="600" /></p>
 290       <p>Histogram of bins with mean quality scores:</p>
 291       <p><img alt="plot_mean_scores" src="<%= @plots.mean_scores %>" width="600" /></p>
 292       <h2>MID tag analysis</h2>
 293       <p>The below table contains the identified MID tags and the number of times they were found:<p>
 294       <ul>
 295       <li>BARCODE_NAME is the MID tag identifier.</li>
 296       <li>BARCODE is the sequence of the MID tag.</li>
 297       <li>TOTAL is the number of times this MID tag was found.</li>
 298       <li>L250 is the a subset count of TOTAL af sequences longer than 250 bases</li>
 299       <li>L250_S20 is a subset count of L250 af sequences with a mean score above 20</li>
 300       </ul>
 301       <table>
 302       <% @table_mid_join.each do |row| %>
 303         <tr>
 304         <td><%= row.mid_num %></td>
 305         <td><%= row.mid_seq %></td>
 306         <td><%= row.total %></td>
 307         <td><%= row.l250 %></td>
 308         <td><%= row.l250_s20 %></td>
 309         </tr>
 310       <% end %>
 311       </table>
 312       <h2>Residue frequency analysis</h2>
 313       <p>Plot of nucleotide distribution in percent of the first 50 bases:</p>
 314       <p><img alt="plot_nucleotide_distribution" src="<%= @plots.nucleotide_dist50 %>" width="600" /></p>
 315       <p>Plot of nucleotide distribution in percent of the first 500 bases:</p>
 316       <p><img alt="plot_nucleotide_distribution" src="<%= @plots.nucleotide_dist500 %>" width="600" /></p>
 317     </body>
 318   </html>
 319 }.gsub(/^\s+/, '')
 320
 321 html = ERB.new(template)
 322
 323 ARGV.each do |file|
 324   report = Report.new(file)
 325
 326   html.run(report.get_binding)
 327 end
 328
 329 __END__