-#!/usr/bin/env perl
+#!/usr/bin/env ruby
-# Copyright (C) 2007-2009 Martin A. Hansen.
+# Copyright (C) 2007-2013 Martin A. Hansen.
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# http://www.gnu.org/copyleft/gpl.html
-
-# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-
-# Determine type, count, min, max, sum and mean for values in stream.
-
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+# This program is part of the Biopieces framework (www.biopieces.org).
-use warnings;
-use strict;
-use Maasha::Biopieces;
-use Maasha::Calc;
-use Data::Dumper;
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+# Determine type, count, min, max, sum and mean for the values of each key in the stream.
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-
-my ( $options, $in, $out, $record, $analysis, $key, $len, %skip_hash, $data_out,
- %key_hash, $skip, $keys, $types, $counts, $mins, $maxs, $sums, $means );
-
-$options = Maasha::Biopieces::parse_options(
- [
- { long => 'no_stream', short => 'x', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
- { long => 'data_out', short => 'o', type => 'file', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
- { long => 'keys', short => 'k', type => 'list', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
- { long => 'no_keys', short => 'K', type => 'list', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
- ]
-);
-
-$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } );
-$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } );
-
-$data_out = Maasha::Biopieces::write_stream( $options->{ "data_out" }, $options->{ "compress" } );
-$data_out ||= \*STDOUT;
-
-map { $skip_hash{ $_ } = 1 } @{ $options->{ "no_keys" } };
-map { $key_hash{ $_ } = 1; $skip = 1 } @{ $options->{ "keys" } };
-
-while ( $record = Maasha::Biopieces::get_record( $in ) )
-{
- foreach $key ( keys %{ $record } )
- {
- next if $skip and not exists $key_hash{ $key };
- next if $skip_hash{ $key };
-
- if ( Maasha::Calc::is_a_number( $record->{ $key } ) )
- {
- if ( not exists $analysis->{ $key } )
- {
- $analysis->{ $key }->{ "MIN" } = $record->{ $key };
- $analysis->{ $key }->{ "MAX" } = $record->{ $key };
- }
-
- $analysis->{ $key }->{ "MAX" } = Maasha::Calc::max( $analysis->{ $key }->{ "MAX" }, $record->{ $key } );
- $analysis->{ $key }->{ "MIN" } = Maasha::Calc::min( $analysis->{ $key }->{ "MIN" }, $record->{ $key } );
-
- $analysis->{ $key }->{ "TYPE" } = "num";
- $analysis->{ $key }->{ "SUM" } += $record->{ $key };
- }
+require 'maasha/biopieces'
+
+# Specifications of the command line options accepted by this script,
+# handed to Biopieces.options_parse together with ARGV.
+casts = [
+  {long: 'keys',      short: 'k', type: 'list', mandatory: false, default: nil, allowed: nil, disallowed: nil},
+  {long: 'no_keys',   short: 'K', type: 'list', mandatory: false, default: nil, allowed: nil, disallowed: nil},
+  {long: 'no_stream', short: 'x', type: 'flag', mandatory: false, default: nil, allowed: nil, disallowed: nil},
+  {long: 'data_out',  short: 'o', type: 'file', mandatory: false, default: nil, allowed: nil, disallowed: nil}
+]
+
+options = Biopieces.options_parse(ARGV, casts)
+
+# Per-key accumulator: the first access of an unknown key stores and
+# returns a fresh empty Hash for that key.
+stats = Hash.new { |table, name| table[name] = {} }
+
+# Convert +value+ to a number.
+#
+# Strings are parsed as base 10 integers first, so zero-padded values such
+# as "010" yield 10 instead of being read as octal, and "08" yields the
+# Integer 8 instead of falling through to Float. Non-string values go
+# through the plain Integer() conversion. If the integer conversion fails
+# Float() is tried.
+#
+# Returns an Integer, a Float, or nil when neither conversion succeeds.
+def to_number(value)
+  # A base may only be passed to Integer() together with a String.
+  value.is_a?(String) ? Integer(value, 10) : Integer(value)
+rescue
+  begin
+    Float(value)
+  rescue
+    nil
+  end
+end
+
+# Return the smaller of +value1+ and +value2+.
+#
+# When +value1+ is nil or false (no minimum recorded yet) +value2+ is
+# returned unchanged.
+def min(value1, value2)
+  value1 ? [value1, value2].min : value2
+end
+
+# Return the larger of +value1+ and +value2+.
+#
+# When +value1+ is nil or false (no maximum recorded yet) +value2+ is
+# returned unchanged.
+def max(value1, value2)
+  value1 ? [value1, value2].max : value2
+end
+
+# Keys requested through the keys option, as symbols; stays nil when the
+# option is not given.
+keys = options[:keys].map(&:to_sym) if options[:keys]
+
+# Lookup table (symbol => true) of the keys named in the no_keys option;
+# stays nil when the option is not given.
+if options[:no_keys]
+  no_keys = {}
+  options[:no_keys].each { |name| no_keys[name.to_sym] = true }
+end
+
+# Effective list of keys to analyse; filled in when the first record is read.
+get_keys = nil
+
+# Main pass: read every record from the input stream, accumulate per-key
+# statistics in stats, re-emit the record unless no_stream is set, and
+# finally write one stats record per analysed key - to the data_out file
+# when that option is given, otherwise to the output stream.
+Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output|
+ input.each do |record|
+ # The key list is resolved once - from the keys option or else from the
+ # keys of the first record, minus any no_keys - and reused afterwards.
+ unless get_keys
+ get_keys = keys || record.keys
+ get_keys.reject! { |k| no_keys[k] } if no_keys
+ end
+
+ get_keys.each do |key|
+ key = key.to_sym
+ value = record[key]
+
+ # Records that lack the key (nil value) do not contribute to its stats.
+ if value
+ stats[key][:sum] ||= 0
+ stats[key][:count] ||= 0
+
+ # Numeric values are accumulated by value, everything else by length.
+ if num = to_number(value)
+ stats[key][:min] = min(stats[key][:min], num)
+ stats[key][:max] = max(stats[key][:max], num)
+ stats[key][:type] = :numeric
+ stats[key][:sum] += num
else
- {
- $len = length $record->{ $key };
-
- if ( not exists $analysis->{ $key } )
- {
- $analysis->{ $key }->{ "MIN" } = $len;
- $analysis->{ $key }->{ "MAX" } = $len;
- }
-
- $analysis->{ $key }->{ "MAX" } = Maasha::Calc::max( $analysis->{ $key }->{ "MAX" }, $len );
- $analysis->{ $key }->{ "MIN" } = Maasha::Calc::min( $analysis->{ $key }->{ "MIN" }, $len );
-
- $analysis->{ $key }->{ "TYPE" } = "alph";
- $analysis->{ $key }->{ "SUM" } += $len;
- }
-
- $analysis->{ $key }->{ "COUNT" }++;
+ stats[key][:min] = min(stats[key][:min], value.length)
+ stats[key][:max] = max(stats[key][:max], value.length)
+ stats[key][:type] = :alphabetic
+ stats[key][:sum] += value.length
+ end
+
+ stats[key][:count] += 1
+ end
+ end
+
+ output.puts record unless options[:no_stream]
+ end
+
+ if options[:data_out]
+ data_out = File.open(options[:data_out], 'w')
+ end
+
+ # One stats record per key; Float results are formatted with two decimals.
+ stats.each do |key, value|
+ stat_record = {
+ :KEY => key,
+ :TYPE => value[:type].to_s.capitalize,
+ :COUNT => value[:count],
+ :MIN => value[:min].is_a?(Float) ? "%0.2f" % value[:min] : value[:min],
+ :MAX => value[:max].is_a?(Float) ? "%0.2f" % value[:max] : value[:max],
+ :SUM => value[:sum].is_a?(Float) ? "%0.2f" % value[:sum] : value[:sum],
+ :MEAN => "%0.2f" % (value[:sum] / value[:count].to_f)
}
- Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
-}
-
-foreach $key ( keys %{ $analysis } )
-{
- $analysis->{ $key }->{ "MEAN" } = sprintf "%.2f", $analysis->{ $key }->{ "SUM" } / $analysis->{ $key }->{ "COUNT" };
- $analysis->{ $key }->{ "SUM" } = sprintf "%.2f", $analysis->{ $key }->{ "SUM" };
-}
-
-$keys = "KEY ";
-$types = "TYPE ";
-$counts = "COUNT";
-$mins = "MIN ";
-$maxs = "MAX ";
-$sums = "SUM ";
-$means = "MEAN ";
-
-foreach $key ( sort keys %{ $analysis } )
-{
- $keys .= sprintf "% 15s", $key;
- $types .= sprintf "% 15s", $analysis->{ $key }->{ "TYPE" };
- $counts .= sprintf "% 15s", $analysis->{ $key }->{ "COUNT" };
- $mins .= sprintf "% 15s", $analysis->{ $key }->{ "MIN" };
- $maxs .= sprintf "% 15s", $analysis->{ $key }->{ "MAX" };
- $sums .= sprintf "% 15s", $analysis->{ $key }->{ "SUM" };
- $means .= sprintf "% 15s", $analysis->{ $key }->{ "MEAN" };
-}
-
-print $data_out "$keys\n";
-print $data_out "$types\n";
-print $data_out "$counts\n";
-print $data_out "$mins\n";
-print $data_out "$maxs\n";
-print $data_out "$sums\n";
-print $data_out "$means\n";
-
-close $data_out;
-
-Maasha::Biopieces::close_stream( $in );
-Maasha::Biopieces::close_stream( $out );
-
-
-# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-
-
-BEGIN
-{
- Maasha::Biopieces::status_set();
-}
-
-
-END
-{
- Maasha::Biopieces::status_log();
-}
+ if options[:data_out]
+ data_out.puts stat_record
+ else
+ output.puts stat_record
+ end
+ end
+
+ # Close the stats file explicitly so it is flushed and released here
+ # instead of being left open; data_out is nil when the option is absent.
+ data_out.close if data_out
+end
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<