From 4c574917dc12688ccd8899c48f0635a82e6617d1 Mon Sep 17 00:00:00 2001 From: martinahansen Date: Tue, 15 Oct 2013 20:06:45 +0000 Subject: [PATCH] rewrote analyze_vals git-svn-id: http://biopieces.googlecode.com/svn/trunk@2234 74ccb610-7750-0410-82ae-013aeee3265d --- bp_bin/analyze_vals | 210 +++++++++++++-------------------- bp_test/out/analyze_vals.out.1 | 23 ++-- bp_test/out/analyze_vals.out.2 | 15 +-- bp_test/out/analyze_vals.out.3 | 15 +-- bp_test/test/test_analyze_vals | 6 +- 5 files changed, 117 insertions(+), 152 deletions(-) diff --git a/bp_bin/analyze_vals b/bp_bin/analyze_vals index 726c91c..aa9d12a 100755 --- a/bp_bin/analyze_vals +++ b/bp_bin/analyze_vals @@ -1,6 +1,6 @@ -#!/usr/bin/env perl +#!/usr/bin/env ruby -# Copyright (C) 2007-2009 Martin A. Hansen. +# Copyright (C) 2007-2013 Martin A. Hansen. # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License @@ -18,141 +18,95 @@ # http://www.gnu.org/copyleft/gpl.html - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - -# Determine type, count, min, max, sum and mean for values in stream. - # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +# This program is part of the Biopieces framework (www.biopieces.org). -use warnings; -use strict; -use Maasha::Biopieces; -use Maasha::Calc; -use Data::Dumper; +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +# Determine basic stats for records in the stream. # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - -my ( $options, $in, $out, $record, $analysis, $key, $len, %skip_hash, $data_out, - %key_hash, $skip, $keys, $types, $counts, $mins, $maxs, $sums, $means ); - -$options = Maasha::Biopieces::parse_options( - [ - { long => 'no_stream', short => 'x', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, - { long => 'data_out', short => 'o', type => 'file', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, - { long => 'keys', short => 'k', type => 'list', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, - { long => 'no_keys', short => 'K', type => 'list', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, - ] -); - -$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } ); -$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } ); - -$data_out = Maasha::Biopieces::write_stream( $options->{ "data_out" }, $options->{ "compress" } ); -$data_out ||= \*STDOUT; - -map { $skip_hash{ $_ } = 1 } @{ $options->{ "no_keys" } }; -map { $key_hash{ $_ } = 1; $skip = 1 } @{ $options->{ "keys" } }; - -while ( $record = Maasha::Biopieces::get_record( $in ) ) -{ - foreach $key ( keys %{ $record } ) - { - next if $skip and not exists $key_hash{ $key }; - next if $skip_hash{ $key }; - - if ( Maasha::Calc::is_a_number( $record->{ $key } ) ) - { - if ( not exists $analysis->{ $key } ) - { - $analysis->{ $key }->{ "MIN" } = $record->{ $key }; - $analysis->{ $key }->{ "MAX" } = $record->{ $key }; - } - - $analysis->{ $key }->{ "MAX" } = Maasha::Calc::max( $analysis->{ $key }->{ "MAX" }, $record->{ $key } ); - $analysis->{ $key }->{ "MIN" } = Maasha::Calc::min( $analysis->{ $key }->{ "MIN" }, $record->{ $key } ); - - $analysis->{ $key }->{ "TYPE" } = "num"; - $analysis->{ $key }->{ "SUM" } += $record->{ $key }; - } - else - { - $len = length $record->{ $key }; - - if ( not exists $analysis->{ $key } ) - { - $analysis->{ $key }->{ "MIN" } = $len; - $analysis->{ $key }->{ "MAX" } = $len; - } - - $analysis->{ $key }->{ "MAX" } = Maasha::Calc::max( $analysis->{ $key }->{ "MAX" }, $len ); - $analysis->{ $key }->{ "MIN" } = Maasha::Calc::min( $analysis->{ $key }->{ "MIN" }, $len ); - - $analysis->{ $key }->{ "TYPE" } = "alph"; - $analysis->{ $key }->{ "SUM" } += $len; - } - - $analysis->{ $key }->{ "COUNT" }++; +require 'maasha/biopieces' + +casts = [] +casts << {long: 'keys', short: 'k', type: 'list', mandatory: false, default: nil, allowed: nil, disallowed: nil} +casts << {long: 'no_keys', short: 'K', type: 'list', mandatory: false, default: nil, allowed: nil, disallowed: nil} + +options = Biopieces.options_parse(ARGV, casts) + +stats = Hash.new { |h, k| h[k] = {} } + +def to_number(value) + begin num = Integer(value) + return num + rescue + begin num = Float(value) + return num + rescue + end + end +end + +def min(value1, value2) + return value2 unless value1 + [value1, value2].min +end + +def max(value1, value2) + return value2 unless value1 + [value1, value2].max +end + +keys = options[:keys].map { |k| k.to_sym } if options[:keys] +no_keys = options[:no_keys].each_with_object({}) { |i, h| h[i.to_sym] = true } if options[:no_keys] +get_keys = nil + +Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| + input.each do |record| + unless get_keys + get_keys = keys || record.keys + get_keys.reject! { |k| no_keys[k] } if no_keys + end + + get_keys.each do |key| + key = key.to_sym + value = record[key] + + stats[key][:sum] ||= 0 + stats[key][:count] ||= 0 + + if num = to_number(value) + stats[key][:min] = min(stats[key][:min], num) + stats[key][:max] = max(stats[key][:max], num) + stats[key][:type] = :numeric + stats[key][:sum] += num + else + stats[key][:min] = min(stats[key][:min], value.length) + stats[key][:max] = max(stats[key][:max], value.length) + stats[key][:type] = :alphabetic + stats[key][:sum] += value.length + end + + stats[key][:count] += 1 + end + end + + stats.each do |key, value| + stat_record = { + :KEY => key, + :TYPE => value[:type].to_s.capitalize, + :COUNT => value[:count], + :MIN => value[:min], + :MAX => value[:max], + :SUM => value[:sum], + :MEAN => (value[:sum] / value[:count].to_f).round(1) } - Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" }; -} - -foreach $key ( keys %{ $analysis } ) -{ - $analysis->{ $key }->{ "MEAN" } = sprintf "%.2f", $analysis->{ $key }->{ "SUM" } / $analysis->{ $key }->{ "COUNT" }; - $analysis->{ $key }->{ "SUM" } = sprintf "%.2f", $analysis->{ $key }->{ "SUM" }; -} - -$keys = "KEY "; -$types = "TYPE "; -$counts = "COUNT"; -$mins = "MIN "; -$maxs = "MAX "; -$sums = "SUM "; -$means = "MEAN "; - -foreach $key ( sort keys %{ $analysis } ) -{ - $keys .= sprintf "% 15s", $key; - $types .= sprintf "% 15s", $analysis->{ $key }->{ "TYPE" }; - $counts .= sprintf "% 15s", $analysis->{ $key }->{ "COUNT" }; - $mins .= sprintf "% 15s", $analysis->{ $key }->{ "MIN" }; - $maxs .= sprintf "% 15s", $analysis->{ $key }->{ "MAX" }; - $sums .= sprintf "% 15s", $analysis->{ $key }->{ "SUM" }; - $means .= sprintf "% 15s", $analysis->{ $key }->{ "MEAN" }; -} - -print $data_out "$keys\n"; -print $data_out "$types\n"; -print $data_out "$counts\n"; -print $data_out "$mins\n"; -print $data_out "$maxs\n"; -print $data_out "$sums\n"; -print $data_out "$means\n"; - -close $data_out; - -Maasha::Biopieces::close_stream( $in ); -Maasha::Biopieces::close_stream( $out ); - - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - -BEGIN -{ - Maasha::Biopieces::status_set(); -} - - -END -{ - Maasha::Biopieces::status_log(); -} + output.puts stat_record + end +end # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< diff --git a/bp_test/out/analyze_vals.out.1 b/bp_test/out/analyze_vals.out.1 index ea45064..313969c 100644 --- a/bp_test/out/analyze_vals.out.1 +++ b/bp_test/out/analyze_vals.out.1 @@ -1,7 +1,16 @@ -KEY V0 V1 -TYPE alph num -COUNT 3 3 -MIN 3 6 -MAX 5 123 -SUM 13.00 174.00 -MEAN 4.33 58.00 +KEY: V0 +TYPE: Alphabetic +COUNT: 3 +MIN: 3 +MAX: 5 +SUM: 13 +MEAN: 4.3 +--- +KEY: V1 +TYPE: Numeric +COUNT: 3 +MIN: 6 +MAX: 123 +SUM: 174 +MEAN: 58.0 +--- diff --git a/bp_test/out/analyze_vals.out.2 b/bp_test/out/analyze_vals.out.2 index 7065732..90c8c4e 100644 --- a/bp_test/out/analyze_vals.out.2 +++ b/bp_test/out/analyze_vals.out.2 @@ -1,7 +1,8 @@ -KEY V0 -TYPE alph -COUNT 3 -MIN 3 -MAX 5 -SUM 13.00 -MEAN 4.33 +KEY: V0 +TYPE: Alphabetic +COUNT: 3 +MIN: 3 +MAX: 5 +SUM: 13 +MEAN: 4.3 +--- diff --git a/bp_test/out/analyze_vals.out.3 b/bp_test/out/analyze_vals.out.3 index 6726a07..345c5dc 100644 --- a/bp_test/out/analyze_vals.out.3 +++ b/bp_test/out/analyze_vals.out.3 @@ -1,7 +1,8 @@ -KEY V1 -TYPE num -COUNT 3 -MIN 6 -MAX 123 -SUM 174.00 -MEAN 58.00 +KEY: V1 +TYPE: Numeric +COUNT: 3 +MIN: 6 +MAX: 123 +SUM: 174 +MEAN: 58.0 +--- diff --git a/bp_test/test/test_analyze_vals b/bp_test/test/test_analyze_vals index 1e4b180..918b3fc 100755 --- a/bp_test/test/test_analyze_vals +++ b/bp_test/test/test_analyze_vals @@ -2,14 +2,14 @@ source "$BP_DIR/bp_test/lib/test.sh" -run "$bp -I $in -o $tmp -x" +run "$bp -I $in -O $tmp" assert_no_diff $tmp $out.1 clean -run "$bp -I $in -k V0 -o $tmp -x" +run "$bp -I $in -k V0 -O $tmp" assert_no_diff $tmp $out.2 clean -run "$bp -I $in -K V0 -o $tmp -x" +run "$bp -I $in -K V0 -O $tmp" assert_no_diff $tmp $out.3 clean -- 2.39.2