bp_scripts/QA_Illumina_report.rb

   1 #!/usr/bin/env ruby
   2
   3 # Copyright (C) 2012 Martin A. Hansen (mail@maasha.dk).
   4
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; either version 2
   8 # of the License, or (at your option) any later version.
   9
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18
  19 # http://www.gnu.org/copyleft/gpl.html
  20
  21 require 'pp'
  22 require 'tmpdir'
  23 require 'base64'
  24 require 'erb'
  25
  26 class Numeric
  27   def commify
  28     self.to_s.gsub(/(^[-+]?\d+?(?=(?>(?:\d{3})+)(?!\d))|\G\d{3}(?=\d))/, '\1,')
  29   end
  30 end
  31
  32 def parse_analysis(file)
  33   data = {}
  34
  35   File.open(file, 'r') do |ios|
  36     ios.each do |line|
  37       key, val = line.chomp.split(': ')
  38         begin Integer(val)
  39           val = val.to_i.commify
  40         rescue
  41           begin Float(val)
  42             val = val.to_f.commify
  43           rescue
  44           end
  45         end
  46
  47       data[key] = val
  48     end
  49   end
  50
  51   return data
  52 end
  53
  54 def png2base64(file)
  55   png = ""
  56
  57   File.open(file, "r") do |ios|
  58     png = ios.read
  59   end
  60
  61   "data:image/png;base64," + Base64.encode64(png)
  62 end
  63
  64 if ARGV.empty?
  65   $stderr.puts "Usage: QA_Illumina_report.rb <FASTQ file> > <HTML file>"
  66   exit
  67 end
  68
  69 tmpdir   = Dir.mktmpdir
  70 seq_file = ARGV.shift
  71
  72 analyze_vals_file              = File.join(tmpdir, 'analyze_vals.txt')
  73 analyze_vals_trim_file         = File.join(tmpdir, 'analyze_vals_trim_noadapt.txt')
  74 analyze_vals_trim_noadapt_file = File.join(tmpdir, 'analyze_vals_trim.txt')
  75 lendist_file                   = File.join(tmpdir, 'lendist.png')
  76 scores_file                    = File.join(tmpdir, 'scores.png')
  77 nucdist_file                   = File.join(tmpdir, 'nucdist.png')
  78 lendist_bin_file               = File.join(tmpdir, 'lendist_bin.png')
  79 scores_bin_file                = File.join(tmpdir, 'scores_bin.png')
  80
  81 STDERR.puts "Analyzing sequences ... "
  82
  83 system(
  84   "read_fastq -e base_33 -i #{seq_file} |
  85    progress_meter |
  86    analyze_vals -k SEQ -o #{analyze_vals_file} |
  87    trim_seq -l 3 -m 25 |
  88    grab -e 'SEQ_LEN > 20' |
  89    analyze_vals -k SEQ -o #{analyze_vals_trim_file} |
  90    find_adaptor -l 6 -L 6 -f ACACGACGCTCTTCCGATCT -r AGATCGGAAGAGCACACGTC |
  91    clip_adaptor |
  92    grab -e 'SEQ_LEN > 0' |
  93    analyze_vals -k SEQ -o #{analyze_vals_trim_noadapt_file} |
  94    plot_distribution -k SEQ_LEN -T 'Sequence length distribution' -X 'Sequence length' -t png -o #{lendist_file} |
  95    plot_scores -c -t png -o #{scores_file} |
  96    plot_nucleotide_distribution -c -t png -o #{nucdist_file} |
  97    bin_vals -k SEQ_LEN -b 25 |
  98    plot_distribution -T '25 bases bin sequence length distribution' -X 'Sequence length' -k SEQ_LEN_BIN -t png -o #{lendist_bin_file} |
  99    mean_scores |
 100    bin_vals -k SCORES_MEAN -b 5 |
 101    plot_distribution -k SCORES_MEAN_BIN -T '5 bin mean score distribution' -X 'Mean scores' -t png -o #{scores_bin_file} -x"
 102 )
 103
 104 STDERR.puts "done.\n"
 105
 106 analysis1 = parse_analysis(analyze_vals_file)
 107 analysis2 = parse_analysis(analyze_vals_trim_file)
 108 analysis3 = parse_analysis(analyze_vals_trim_noadapt_file)
 109
 110 template = %{
 111   <html>
 112     <head>
 113       <title>QA Illumina Report</title>
 114     </head>
 115     <body>
 116       <h1>QA Illumina Report</h1>
 117       <p>Date: #{Time.now}</p>
 118       <p>File: <%= seq_file %></p>
 119       <h2>Sequence composition</h2>
 120       <table>
 121       <tr><td></td><td>Before trimming</td><td>After trimming</td><td>After adaptor removal</td></tr>
 122       <tr><td>Number of sequences</td><td align='right'><%= analysis1['COUNT'] %></td><td align='right'><%= analysis2['COUNT'] %></td><td align='right'><%= analysis3['COUNT'] %></td></tr>
 123       <tr><td>Number of bases</td><td align='right'><%= analysis1['SUM'] %></td><td align='right'><%= analysis2['SUM'] %></td><td align='right'><%= analysis3['SUM'] %></td></tr>
 124       <tr><td>Min sequence length</td><td align='right'><%= analysis1['MIN'] %></td><td align='right'><%= analysis2['MIN'] %></td><td align='right'><%= analysis3['MIN'] %></td></tr>
 125       <tr><td>Max sequence length</td><td align='right'><%= analysis1['MAX'] %></td><td align='right'><%= analysis2['MAX'] %></td><td align='right'><%= analysis3['MAX'] %></td></tr>
 126       <tr><td>Mean sequence length</td><td align='right'><%= analysis1['MEAN'] %></td><td align='right'><%= analysis2['MEAN'] %></td><td align='right'><%= analysis3['MEAN'] %></td></tr>
 127       </table>
 128       <p>Sequence trimming was performed by removing from the ends all residues until 3 consecutive</p>
 129       <p>residues with quality score larger than or equal to 25.</p>
 130       <p>All plots are after sequence trimming and adaptor removal.</p>
 131       <h2>Sequence length distribution</h2>
 132       <p><img src="<%= png2base64(lendist_file) %>" width="600" /></p>
 133       <p><img src="<%= png2base64(lendist_bin_file) %>" width="600" /></p>
 134       <h2>Sequence quality scores</h2>
 135       <p><img src="<%= png2base64(scores_file) %>" width="600" /></p>
 136       <p><img src="<%= png2base64(scores_bin_file) %>" width="600" /></p>
 137       <h2>Sequence nucleotide distribution</h2>
 138       <p><img src="<%= png2base64(nucdist_file) %>" width="600" /></p>
 139     </body>
 140   </html>
 141 }.gsub(/^\s+/, '')
 142
 143 html = ERB.new(template)
 144
 145 puts html.result(binding)
 146
 147 __END__