3 # Copyright (C) 2012 Martin A. Hansen (mail@maasha.dk).
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
# Thousands-separator formatting: the receiver is rendered with to_s and a
# comma is appended to (a) the leading, optionally signed, digit run that is
# followed by one or more complete groups of three digits, and (b) every
# following group of three digits that is itself followed by another digit.
# \G ties alternative (b) to the end of the previous match, so digits after a
# decimal point appear to be left untouched.
# NOTE(review): the enclosing method header is not part of this excerpt;
# presumably this is the body of `commify`, which is called on to_i / to_f
# results further down - confirm.
28 self.to_s.gsub(/(^[-+]?\d+?(?=(?>(?:\d{3})+)(?!\d))|\G\d{3}(?=\d))/, '\1,')
# Reads a "KEY: value" text file and formats its numeric values for display.
#
# file - path of the file to read (opened read-only).
#
# Each line is chomped and split on ': ' into a key and a value; the value is
# then converted with to_i or to_f and passed through commify.
# The loop over the lines, the conditions that choose between the two
# conversions and the return statement fall in lines missing from this excerpt.
# NOTE(review): callers index the result with string keys such as 'COUNT' and
# 'MEAN', so this presumably returns a Hash of key => formatted string - confirm.
32 def parse_analysis(file)
35 File.open(file, 'r') do |ios|
37 key, val = line.chomp.split(': ')
39 val = val.to_i.commify
42 val = val.to_f.commify
57 File.open(file, "r") do |ios|
61 "data:image/png;base64," + Base64.encode64(png)
# Usage hint written to standard error; the check that triggers it is not
# part of this excerpt.
65 $stderr.puts "Usage: QA_Illumina_report.rb <FASTQ file> > <HTML file>"
72 analyze_vals_file = File.join(tmpdir, 'analyze_vals.txt')
73 analyze_vals_trim_file = File.join(tmpdir, 'analyze_vals_trim_noadapt.txt')
74 analyze_vals_trim_noadapt_file = File.join(tmpdir, 'analyze_vals_trim.txt')
75 lendist_file = File.join(tmpdir, 'lendist.png')
76 scores_file = File.join(tmpdir, 'scores.png')
77 nucdist_file = File.join(tmpdir, 'nucdist.png')
78 lendist_bin_file = File.join(tmpdir, 'lendist_bin.png')
79 scores_bin_file = File.join(tmpdir, 'scores_bin.png')
# Run the analysis pipeline. The quoted text below is one command string made
# of '|'-separated steps: read_fastq reads seq_file (with -e base_33),
# analyze_vals writes SEQ statistics to the three summary files at successive
# stages (at the start, after the SEQ_LEN > 20 filter, and after find_adaptor
# plus the SEQ_LEN > 0 filter), and the plot_* / bin_vals steps write the PNG
# files. Lines are missing from this excerpt inside the string, so further
# steps may exist; the call that executes the string is not visible either.
# NOTE(review): seq_file is interpolated unquoted; if the string is handed to
# a shell, a path containing spaces or metacharacters would break it - confirm.
81 STDERR.puts "Analyzing sequences ... "
84 "read_fastq -e base_33 -i #{seq_file} |
86 analyze_vals -k SEQ -o #{analyze_vals_file} |
88 grab -e 'SEQ_LEN > 20' |
89 analyze_vals -k SEQ -o #{analyze_vals_trim_file} |
90 find_adaptor -l 6 -L 6 -f ACACGACGCTCTTCCGATCT -r AGATCGGAAGAGCACACGTC |
92 grab -e 'SEQ_LEN > 0' |
93 analyze_vals -k SEQ -o #{analyze_vals_trim_noadapt_file} |
94 plot_distribution -k SEQ_LEN -T 'Sequence length distribution' -X 'Sequence length' -t png -o #{lendist_file} |
95 plot_scores -c -t png -o #{scores_file} |
96 plot_nucleotide_distribution -c -t png -o #{nucdist_file} |
97 bin_vals -k SEQ_LEN -b 25 |
98 plot_distribution -T '25 bases bin sequence length distribution' -X 'Sequence length' -k SEQ_LEN_BIN -t png -o #{lendist_bin_file} |
100 bin_vals -k SCORES_MEAN -b 5 |
101 plot_distribution -k SCORES_MEAN_BIN -T '5 bin mean score distribution' -X 'Mean scores' -t png -o #{scores_bin_file} -x"
104 STDERR.puts "done.\n"
# Load the three summaries; the report template reads them by key
# ('COUNT', 'SUM', 'MIN', 'MAX', 'MEAN'), one summary per table column.
106 analysis1 = parse_analysis(analyze_vals_file)
107 analysis2 = parse_analysis(analyze_vals_trim_file)
108 analysis3 = parse_analysis(analyze_vals_trim_noadapt_file)
113 <title>QA Illumina Report</title>
116 <h1>QA Illumina Report</h1>
117 <p>Date: #{Time.now}</p>
118 <p>File: <%= seq_file %></p>
119 <h2>Sequence composition</h2>
121 <tr><td></td><td>Before trimming</td><td>After trimming</td><td>After adaptor removal</td></tr>
122 <tr><td>Number of sequences</td><td align='right'><%= analysis1['COUNT'] %></td><td align='right'><%= analysis2['COUNT'] %></td><td align='right'><%= analysis3['COUNT'] %></td></tr>
123 <tr><td>Number of bases</td><td align='right'><%= analysis1['SUM'] %></td><td align='right'><%= analysis2['SUM'] %></td><td align='right'><%= analysis3['SUM'] %></td></tr>
124 <tr><td>Min sequence length</td><td align='right'><%= analysis1['MIN'] %></td><td align='right'><%= analysis2['MIN'] %></td><td align='right'><%= analysis3['MIN'] %></td></tr>
125 <tr><td>Max sequence length</td><td align='right'><%= analysis1['MAX'] %></td><td align='right'><%= analysis2['MAX'] %></td><td align='right'><%= analysis3['MAX'] %></td></tr>
126 <tr><td>Mean sequence length</td><td align='right'><%= analysis1['MEAN'] %></td><td align='right'><%= analysis2['MEAN'] %></td><td align='right'><%= analysis3['MEAN'] %></td></tr>
128 <p>Sequence trimming was performed by removing from the ends all residues until 3 consecutive</p>
129 <p>residues with quality score larger than or equal to 25.</p>
130 <p>All plots are after sequence trimming and adaptor removal.</p>
131 <h2>Sequence length distribution</h2>
132 <p><img src="<%= png2base64(lendist_file) %>" width="600" /></p>
133 <p><img src="<%= png2base64(lendist_bin_file) %>" width="600" /></p>
134 <h2>Sequence quality scores</h2>
135 <p><img src="<%= png2base64(scores_file) %>" width="600" /></p>
136 <p><img src="<%= png2base64(scores_bin_file) %>" width="600" /></p>
137 <h2>Sequence nucleotide distribution</h2>
138 <p><img src="<%= png2base64(nucdist_file) %>" width="600" /></p>
# Render the template with ERB. Passing `binding` exposes the script's local
# variables (seq_file, analysis1..3, the *_file paths) and its methods to the
# <%= %> tags; the resulting HTML is written to standard output.
143 html = ERB.new(template)
145 puts html.result(binding)