X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=bp_scripts%2FQA_Illumina_report.rb;h=6e0ba0a4c10c9ec695daf0c81dd1fb0c5f591c6c;hb=af282a65d141826c15944437b07a0353dd14e79c;hp=7b54e4df88ac875912e37bd15ca8434c786dbce7;hpb=725eb685e891858a443645f0bddc4eee3ac86a66;p=biopieces.git diff --git a/bp_scripts/QA_Illumina_report.rb b/bp_scripts/QA_Illumina_report.rb index 7b54e4d..6e0ba0a 100755 --- a/bp_scripts/QA_Illumina_report.rb +++ b/bp_scripts/QA_Illumina_report.rb @@ -29,7 +29,7 @@ def parse_analysis(file) File.open(file, 'r') do |ios| ios.each do |line| key, val = line.chomp.split(' ') - data[key] = val.to_i; + data[key] = val.reverse.gsub(/(\d{3})(?=\d)/, '\\1,').reverse end end @@ -54,28 +54,33 @@ end tmpdir = Dir.mktmpdir seq_file = ARGV.shift -analyze_vals_file = File.join(tmpdir, 'analyze_vals.txt') -analyze_vals_trim_file = File.join(tmpdir, 'analyze_vals_trim.txt') -lendist_file = File.join(tmpdir, 'lendist.png') -scores_file = File.join(tmpdir, 'scores.png') -nucdist_file = File.join(tmpdir, 'nucdist.png') -lendist_bin_file = File.join(tmpdir, 'lendist_bin.png') -scores_bin_file = File.join(tmpdir, 'scores_bin.png') +analyze_vals_file = File.join(tmpdir, 'analyze_vals.txt') +analyze_vals_trim_file = File.join(tmpdir, 'analyze_vals_trim_noadapt.txt') +analyze_vals_trim_noadapt_file = File.join(tmpdir, 'analyze_vals_trim.txt') +lendist_file = File.join(tmpdir, 'lendist.png') +scores_file = File.join(tmpdir, 'scores.png') +nucdist_file = File.join(tmpdir, 'nucdist.png') +lendist_bin_file = File.join(tmpdir, 'lendist_bin.png') +scores_bin_file = File.join(tmpdir, 'scores_bin.png') STDERR.puts "Analyzing sequences ... " system( - "read_fastq -i #{seq_file} | + "read_fastq -e base_33 -i #{seq_file} | progress_meter | analyze_vals -k SEQ -o #{analyze_vals_file} | trim_seq -l 3 -m 25 | - grab -e 'SEQ_LEN > 0' | + grab -e 'SEQ_LEN > 20' | analyze_vals -k SEQ -o #{analyze_vals_trim_file} | + find_adaptor -l 6 -L 6 -f ACACGACGCTCTTCCGATCT -r AGATCGGAAGAGCACACGTC | + clip_adaptor | + grab -e 'SEQ_LEN > 0' | + analyze_vals -k SEQ -o #{analyze_vals_trim_noadapt_file} | plot_distribution -k SEQ_LEN -T 'Sequence length distribution' -X 'Sequence length' -t png -o #{lendist_file} | plot_scores -c -t png -o #{scores_file} | - plot_nucleotide_distribution -t png -o #{nucdist_file} | - bin_vals -k SEQ_LEN -b 50 | - plot_distribution -T '50 bases bin sequence length distribution' -X 'Sequence length' -k SEQ_LEN_BIN -t png -o #{lendist_bin_file} | + plot_nucleotide_distribution -c -t png -o #{nucdist_file} | + bin_vals -k SEQ_LEN -b 25 | + plot_distribution -T '25 bases bin sequence length distribution' -X 'Sequence length' -k SEQ_LEN_BIN -t png -o #{lendist_bin_file} | mean_scores | bin_vals -k SCORES_MEAN -b 5 | plot_distribution -k SCORES_MEAN_BIN -T '5 bin mean score distribution' -X 'Mean scores' -t png -o #{scores_bin_file} -x" @@ -85,6 +90,7 @@ STDERR.puts "done.\n" analysis1 = parse_analysis(analyze_vals_file) analysis2 = parse_analysis(analyze_vals_trim_file) +analysis3 = parse_analysis(analyze_vals_trim_noadapt_file) template = %{ @@ -97,16 +103,16 @@ template = %{

File: <%= seq_file %>

Sequence composition

- - - - - - + + + + + +
Before trimmingAfter trimming
Number of sequences<%= analysis1['COUNT'] %><%= analysis2['COUNT'] %>
Number of bases<%= analysis1['SUM'] %><%= analysis2['SUM'] %>
Min sequence length<%= analysis1['MIN'] %><%= analysis2['MIN'] %>
Max sequence length<%= analysis1['MAX'] %><%= analysis2['MAX'] %>
Mean sequence length<%= analysis1['MEAN'] %><%= analysis2['MEAN'] %>
Before trimmingAfter trimmingAfter adaptor removal
Number of sequences<%= analysis1['COUNT'] %><%= analysis2['COUNT'] %><%= analysis3['COUNT'] %>
Number of bases<%= analysis1['SUM'] %><%= analysis2['SUM'] %><%= analysis3['SUM'] %>
Min sequence length<%= analysis1['MIN'] %><%= analysis2['MIN'] %><%= analysis3['MIN'] %>
Max sequence length<%= analysis1['MAX'] %><%= analysis2['MAX'] %><%= analysis3['MAX'] %>
Mean sequence length<%= analysis1['MEAN'] %><%= analysis2['MEAN'] %><%= analysis3['MEAN'] %>

Sequence trimming was performed by removing from the ends all residues until 3 consecutive

residues with quality score larger than or equal to 25.

-

All plots are after sequence trimming.

+

All plots are after sequence trimming and adaptor removal.

Sequence length distribution