X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=bp_bin%2Fanalyze_vals;h=e7e8249a8faf73c53bce080d333e56a73a2b7aa1;hb=9e506c77bf020025b9788beac9ece546e0dd93ee;hp=d1d9ede3af31a59045be0d51fe0f21fe57a46efe;hpb=ee3bad350dd7f0b98ef97086e0ed78c343020e4e;p=biopieces.git diff --git a/bp_bin/analyze_vals b/bp_bin/analyze_vals index d1d9ede..e7e8249 100755 --- a/bp_bin/analyze_vals +++ b/bp_bin/analyze_vals @@ -1,6 +1,6 @@ -#!/usr/bin/env perl +#!/usr/bin/env ruby -# Copyright (C) 2007-2009 Martin A. Hansen. +# Copyright (C) 2007-2013 Martin A. Hansen. # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License @@ -18,135 +18,95 @@ # http://www.gnu.org/copyleft/gpl.html - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - -# Determine type, count, min, max, sum and mean for values in stream. - # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +# This program is part of the Biopieces framework (www.biopieces.org). -use warnings; -use strict; -use Maasha::Biopieces; -use Maasha::Calc; -use Data::Dumper; +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +# Determine basic stats for records in the stream. # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - -my ( $options, $in, $out, $record, $analysis, $key, $len, %skip_hash, - %key_hash, $skip, $keys, $types, $counts, $mins, $maxs, $sums, $means ); - -$options = Maasha::Biopieces::parse_options( - [ - { long => 'no_stream', short => 'x', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, - { long => 'keys', short => 'k', type => 'list', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, - { long => 'no_keys', short => 'K', type => 'list', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, - ] -); - -$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } ); -$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } ); - -map { $skip_hash{ $_ } = 1 } @{ $options->{ "no_keys" } }; -map { $key_hash{ $_ } = 1; $skip = 1 } @{ $options->{ "keys" } }; - -while ( $record = Maasha::Biopieces::get_record( $in ) ) -{ - foreach $key ( keys %{ $record } ) - { - next if $skip and not exists $key_hash{ $key }; - next if $skip_hash{ $key }; - - if ( Maasha::Calc::is_a_number( $record->{ $key } ) ) - { - if ( not exists $analysis->{ $key } ) - { - $analysis->{ $key }->{ "MIN" } = $record->{ $key }; - $analysis->{ $key }->{ "MAX" } = $record->{ $key }; - } - - $analysis->{ $key }->{ "MAX" } = Maasha::Calc::max( $analysis->{ $key }->{ "MAX" }, $record->{ $key } ); - $analysis->{ $key }->{ "MIN" } = Maasha::Calc::min( $analysis->{ $key }->{ "MIN" }, $record->{ $key } ); - - $analysis->{ $key }->{ "TYPE" } = "num"; - $analysis->{ $key }->{ "SUM" } += $record->{ $key }; - } - else - { - $len = length $record->{ $key }; - - if ( not exists $analysis->{ $key } ) - { - $analysis->{ $key }->{ "MIN" } = $len; - $analysis->{ $key }->{ "MAX" } = $len; - } - - $analysis->{ $key }->{ "MAX" } = Maasha::Calc::max( $analysis->{ $key }->{ "MAX" }, $len ); - $analysis->{ $key }->{ "MIN" } = Maasha::Calc::min( $analysis->{ $key }->{ "MIN" }, $len ); - - $analysis->{ $key }->{ "TYPE" } = "alph"; - $analysis->{ $key }->{ "SUM" } += $len; - } - - $analysis->{ $key }->{ "COUNT" }++; +require 'maasha/biopieces' + +casts = [] +casts << {long: 'keys', short: 'k', type: 'list', mandatory: false, default: nil, allowed: nil, disallowed: nil} +casts << {long: 'no_keys', short: 'K', type: 'list', mandatory: false, default: nil, allowed: nil, disallowed: nil} + +options = Biopieces.options_parse(ARGV, casts) + +stats = Hash.new { |h, k| h[k] = {} } + +def to_number(value) + begin num = Integer(value) + return num + rescue + begin num = Float(value) + return num + rescue + end + end +end + +def min(value1, value2) + return value2 unless value1 + [value1, value2].min +end + +def max(value1, value2) + return value2 unless value1 + [value1, value2].max +end + +keys = options[:keys].map { |k| k.to_sym } if options[:keys] +no_keys = options[:no_keys].each_with_object({}) { |i, h| h[i.to_sym] = true } if options[:no_keys] +get_keys = nil + +Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| + input.each do |record| + unless get_keys + get_keys = keys || record.keys + get_keys.reject! { |k| no_keys[k] } if no_keys + end + + get_keys.each do |key| + key = key.to_sym + value = record[key] + + stats[key][:sum] ||= 0 + stats[key][:count] ||= 0 + + if num = to_number(value) + stats[key][:min] = min(stats[key][:min], num) + stats[key][:max] = max(stats[key][:max], num) + stats[key][:type] = :numeric + stats[key][:sum] += num + else + stats[key][:min] = min(stats[key][:min], value.length) + stats[key][:max] = max(stats[key][:max], value.length) + stats[key][:type] = :alphabetic + stats[key][:sum] += value.length + end + + stats[key][:count] += 1 + end + end + + stats.each do |key, value| + stat_record = { + :KEY => key, + :TYPE => value[:type].to_s.capitalize, + :COUNT => value[:count], + :MIN => value[:min].is_a?(Float) ? "%0.2f" % value[:min] : value[:min], + :MAX => value[:max].is_a?(Float) ? "%0.2f" % value[:max] : value[:max], + :SUM => value[:sum].is_a?(Float) ? "%0.2f" % value[:sum] : value[:sum], + :MEAN => "%0.2f" % (value[:sum] / value[:count].to_f) } - Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" }; -} - -foreach $key ( keys %{ $analysis } ) -{ - $analysis->{ $key }->{ "MEAN" } = sprintf "%.2f", $analysis->{ $key }->{ "SUM" } / $analysis->{ $key }->{ "COUNT" }; - $analysis->{ $key }->{ "SUM" } = sprintf "%.2f", $analysis->{ $key }->{ "SUM" }; -} - -$keys = "KEY "; -$types = "TYPE "; -$counts = "COUNT"; -$mins = "MIN "; -$maxs = "MAX "; -$sums = "SUM "; -$means = "MEAN "; - -foreach $key ( sort keys %{ $analysis } ) -{ - $keys .= sprintf "% 15s", $key; - $types .= sprintf "% 15s", $analysis->{ $key }->{ "TYPE" }; - $counts .= sprintf "% 15s", $analysis->{ $key }->{ "COUNT" }; - $mins .= sprintf "% 15s", $analysis->{ $key }->{ "MIN" }; - $maxs .= sprintf "% 15s", $analysis->{ $key }->{ "MAX" }; - $sums .= sprintf "% 15s", $analysis->{ $key }->{ "SUM" }; - $means .= sprintf "% 15s", $analysis->{ $key }->{ "MEAN" }; -} - -print "$keys\n"; -print "$types\n"; -print "$counts\n"; -print "$mins\n"; -print "$maxs\n"; -print "$sums\n"; -print "$means\n"; - -Maasha::Biopieces::close_stream( $in ); -Maasha::Biopieces::close_stream( $out ); - - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - -BEGIN -{ - Maasha::Biopieces::status_set(); -} - - -END -{ - Maasha::Biopieces::status_log(); -} + output.puts stat_record + end +end # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<