X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=bp_bin%2Fplot_scores;h=8cc276d2f0e81055e2641bda2c0f683e183b4f94;hb=a2f82dca77855723ddc50dfb6a763da8558b7610;hp=bfe87ccacd6fba8da2f24436665ca4d7382d4b75;hpb=f72fb3b8d2883afdb21efd27d16412b7211883d9;p=biopieces.git diff --git a/bp_bin/plot_scores b/bp_bin/plot_scores index bfe87cc..8cc276d 100755 --- a/bp_bin/plot_scores +++ b/bp_bin/plot_scores @@ -1,6 +1,6 @@ -#!/usr/bin/env perl +#!/usr/bin/env ruby -# Copyright (C) 2007-2010 Martin A. Hansen. +# Copyright (C) 2007-2012 Martin A. Hansen. # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License @@ -18,112 +18,93 @@ # http://www.gnu.org/copyleft/gpl.html - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - -# Create a lineplot from quality scores in the stream. - # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - -use warnings; -use strict; -use Maasha::Biopieces; -use Maasha::Fastq; -use Maasha::Plot; -use Maasha::Calc; - +# Plot a histogram of mean sequence quality scores. # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - -my ( $options, $in, $out, $default, $terminals, $formats, $record, %count_hash, %sum_hash, $i, @scores, @data_list, $result, $fh, $tmp_dir ); - -$default = "Quality Scores"; -$terminals = "dumb,x11,aqua,post,svg"; -$formats = "solexa,phred,decimal"; - -$options = Maasha::Biopieces::parse_options( - [ - { long => 'no_stream', short => 'x', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, - { long => 'data_out', short => 'o', type => 'file', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, - { long => 'format' , short => 'f', type => 'string', mandatory => 'no', default => "solexa", allowed => $formats, disallowed => undef }, - { long => 'terminal', short => 't', type => 'string', mandatory => 'no', default => 'dumb', allowed => $terminals, disallowed => undef }, - { long => 'title', short => 'T', type => 'string', mandatory => 'no', default => $default, allowed => undef, disallowed => undef }, - { long => 'xlabel', short => 'X', type => 'string', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, - { long => 'ylabel', short => 'Y', type => 'string', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, - ] -); - -$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } ); -$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } ); - -while ( $record = Maasha::Biopieces::get_record( $in ) ) -{ - if ( $record->{ 'SCORES' } ) - { - if ( $options->{ 'format' } eq "solexa" ) - { - for ( $i = 0; $i < length $record->{ 'SCORES' }; $i++ ) - { - $count_hash{ $i }++; - $sum_hash{ $i } += Maasha::Fastq::solexa2dec( substr $record->{ "SCORES" }, $i, 1 ); - } - } - elsif ( $options->{ 'format' } eq "phred" ) - { - for ( $i = 0; $i < length $record->{ 'SCORES' }; $i++ ) - { - $count_hash{ $i }++; - $sum_hash{ $i } += Maasha::Fastq::phred2dec( substr $record->{ "SCORES" }, $i, 1 ); - } - } - else - { - @scores = split ";", $record->{ 'SCORES' }; - - for ( $i = 0; $i < @scores; $i++ ) - { - $count_hash{ $i }++; - $sum_hash{ $i } += $scores[ $i ]; - } - } - } - - Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" }; -} - -for ( $i = 0; $i < keys %count_hash; $i++ ) { - push @data_list, [ $sum_hash{ $i } / $count_hash{ $i } ]; -} - -$tmp_dir = Maasha::Biopieces::get_tmpdir(); - -$result = Maasha::Plot::lineplot_simple( \@data_list, $options, $tmp_dir ); - -$fh = Maasha::Biopieces::write_stream( $options->{ "data_out" } ); - -print $fh "$_\n" foreach @{ $result }; - -close $fh; - -Maasha::Biopieces::close_stream( $in ); -Maasha::Biopieces::close_stream( $out ); - - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - -BEGIN -{ - Maasha::Biopieces::status_set(); -} - - -END -{ - Maasha::Biopieces::status_log(); -} +require 'maasha/biopieces' +require 'maasha/seq' +require 'gnuplot' +require 'narray' + +terminals = "dumb,x11,aqua,post,pdf,png,svg" +title = "Mean Quality Scores" +xlabel = "Sequence position" +ylabel = "Mean score" + +casts = [] +casts << {:long=>'no_stream', :short=>'x', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'count', :short=>'c', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'data_out', :short=>'o', :type=>'file', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'terminal', :short=>'t', :type=>'string', :mandatory=>false, :default=>'dumb', :allowed=>terminals, :disallowed=>nil} +casts << {:long=>'title', :short=>'T', :type=>'string', :mandatory=>false, :default=>title, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'xlabel', :short=>'X', :type=>'string', :mandatory=>false, :default=>xlabel, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'ylabel', :short=>'Y', :type=>'string', :mandatory=>false, :default=>ylabel, :allowed=>nil, :disallowed=>nil} + +options = Biopieces.options_parse(ARGV, casts) + +SCORES_MAX = 100_000 + +scores_vec = NArray.int(SCORES_MAX) +count_vec = NArray.int(SCORES_MAX) +max = 0 + +Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| + input.each_record do |record| + if record[:SCORES] + scores = record[:SCORES] + + if scores.length > 0 + raise BiopiecesError, "score string too long: #{scores.length} > #{SCORES_MAX}" if scores.length > SCORES_MAX + + scores_vec[0 ... scores.length] += NArray.to_na(scores, "byte") - Seq::SCORE_BASE + count_vec[0 ... scores.length] += 1 + + max = scores.length if scores.length > max + end + end + + output.puts record unless options[:no_stream] + end +end + +mean_vec = NArray.sfloat(max) +mean_vec = scores_vec[0 ... max].to_f / count_vec[0 ... max] +count_vec = count_vec[0 ... max].to_f +count_vec *= (Seq::SCORE_MAX / count_vec.max(0).to_f) + +x = (1 .. max).to_a +y1 = mean_vec.to_a +y2 = count_vec.to_a + +Gnuplot.open do |gp| + Gnuplot::Plot.new(gp) do |plot| + plot.terminal options[:terminal] + plot.title options[:title] + plot.xlabel options[:xlabel] + plot.ylabel options[:ylabel] + plot.output options[:data_out] if options[:data_out] + plot.xrange "[#{x.min - 1}:#{x.max + 1}]" + plot.yrange "[#{Seq::SCORE_MIN}:#{Seq::SCORE_MAX}]" + plot.style "fill solid 0.5 border" + plot.xtics "out" + plot.ytics "out" + + plot.data << Gnuplot::DataSet.new([x, y1]) do |ds| + ds.with = "boxes" + ds.title = "mean score" + end + + if options[:count] + plot.data << Gnuplot::DataSet.new([x, y2]) do |ds| + ds.with = "lines lt rgb \"black\"" + ds.title = "relative count" + end + end + end +end # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<