X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=bp_scripts%2FQA_Illumina_report.rb;h=cc6269f1d79280aaf06ba76a20a40d6af204bffb;hb=5de6112b70b59420b245ce636a8b2e3c90acbe00;hp=c7061a527ab2ad3f28402f2c813fd0ef89397eda;hpb=2c9aea983d8a632d93316c073031c797747514e5;p=biopieces.git diff --git a/bp_scripts/QA_Illumina_report.rb b/bp_scripts/QA_Illumina_report.rb index c7061a5..cc6269f 100755 --- a/bp_scripts/QA_Illumina_report.rb +++ b/bp_scripts/QA_Illumina_report.rb @@ -23,13 +23,28 @@ require 'tmpdir' require 'base64' require 'erb' +class Numeric + def commify + self.to_s.gsub(/(^[-+]?\d+?(?=(?>(?:\d{3})+)(?!\d))|\G\d{3}(?=\d))/, '\1,') + end +end + def parse_analysis(file) data = {} File.open(file, 'r') do |ios| ios.each do |line| - key, val = line.chomp.split(' ') - data[key] = val.to_i; + key, val = line.chomp.split(': ') + begin Integer(val) + val = val.to_i.commify + rescue + begin Float(val) + val = val.to_f.commify + rescue + end + end + + data[key] = val end end @@ -54,28 +69,33 @@ end tmpdir = Dir.mktmpdir seq_file = ARGV.shift -analyze_vals_file = File.join(tmpdir, 'analyze_vals.txt') -analyze_vals_trim_file = File.join(tmpdir, 'analyze_vals_trim.txt') -lendist_file = File.join(tmpdir, 'lendist.png') -scores_file = File.join(tmpdir, 'scores.png') -nucdist_file = File.join(tmpdir, 'nucdist.png') -lendist_bin_file = File.join(tmpdir, 'lendist_bin.png') -scores_bin_file = File.join(tmpdir, 'scores_bin.png') +analyze_vals_file = File.join(tmpdir, 'analyze_vals.txt') +analyze_vals_trim_file = File.join(tmpdir, 'analyze_vals_trim_noadapt.txt') +analyze_vals_trim_noadapt_file = File.join(tmpdir, 'analyze_vals_trim.txt') +lendist_file = File.join(tmpdir, 'lendist.png') +scores_file = File.join(tmpdir, 'scores.png') +nucdist_file = File.join(tmpdir, 'nucdist.png') +lendist_bin_file = File.join(tmpdir, 'lendist_bin.png') +scores_bin_file = File.join(tmpdir, 'scores_bin.png') STDERR.puts "Analyzing sequences ... " system( - "read_fastq -i #{seq_file} | + "read_fastq -e base_33 -i #{seq_file} | progress_meter | analyze_vals -k SEQ -o #{analyze_vals_file} | trim_seq -l 3 -m 25 | - grab -e 'SEQ_LEN > 0' | + grab -e 'SEQ_LEN > 20' | analyze_vals -k SEQ -o #{analyze_vals_trim_file} | + find_adaptor -l 6 -L 6 -f ACACGACGCTCTTCCGATCT -r AGATCGGAAGAGCACACGTC | + clip_adaptor | + grab -e 'SEQ_LEN > 0' | + analyze_vals -k SEQ -o #{analyze_vals_trim_noadapt_file} | plot_distribution -k SEQ_LEN -T 'Sequence length distribution' -X 'Sequence length' -t png -o #{lendist_file} | plot_scores -c -t png -o #{scores_file} | - plot_nucleotide_distribution -t png -o #{nucdist_file} | - bin_vals -k SEQ_LEN -b 50 | - plot_distribution -T '50 bases bin sequence length distribution' -X 'Sequence length' -k SEQ_LEN_BIN -t png -o #{lendist_bin_file} | + plot_nucleotide_distribution -c -t png -o #{nucdist_file} | + bin_vals -k SEQ_LEN -b 25 | + plot_distribution -T '25 bases bin sequence length distribution' -X 'Sequence length' -k SEQ_LEN_BIN -t png -o #{lendist_bin_file} | mean_scores | bin_vals -k SCORES_MEAN -b 5 | plot_distribution -k SCORES_MEAN_BIN -T '5 bin mean score distribution' -X 'Mean scores' -t png -o #{scores_bin_file} -x" @@ -85,6 +105,7 @@ STDERR.puts "done.\n" analysis1 = parse_analysis(analyze_vals_file) analysis2 = parse_analysis(analyze_vals_trim_file) +analysis3 = parse_analysis(analyze_vals_trim_noadapt_file) template = %{ @@ -97,16 +118,16 @@ template = %{

File: <%= seq_file %>

Sequence composition

- - - - - - + + + + + +
Before trimmingAfter trimming
Number of sequences<%= analysis1['COUNT'] %><%= analysis2['COUNT'] %>
Number of bases<%= analysis1['SUM'] %><%= analysis2['SUM'] %>
Min sequence length<%= analysis1['MIN'] %><%= analysis2['MIN'] %>
Max sequence length<%= analysis1['MAX'] %><%= analysis2['MAX'] %>
Max sequence length<%= analysis1['MEAN'] %><%= analysis2['MEAN'] %>
Before trimmingAfter trimmingAfter adaptor removal
Number of sequences<%= analysis1['COUNT'] %><%= analysis2['COUNT'] %><%= analysis3['COUNT'] %>
Number of bases<%= analysis1['SUM'] %><%= analysis2['SUM'] %><%= analysis3['SUM'] %>
Min sequence length<%= analysis1['MIN'] %><%= analysis2['MIN'] %><%= analysis3['MIN'] %>
Max sequence length<%= analysis1['MAX'] %><%= analysis2['MAX'] %><%= analysis3['MAX'] %>
Mean sequence length<%= analysis1['MEAN'] %><%= analysis2['MEAN'] %><%= analysis3['MEAN'] %>

Sequence trimming was performed by removing from the ends all residues until 3 consecutive

residues with quality score larger than or equal to 25.

-

All plots are after sequence trimming.

+

All plots are after sequence trimming and adaptor removal.

Sequence length distribution