From 52cd51e0b58ed81c412770c2e458838603c2d2ad Mon Sep 17 00:00:00 2001 From: martinahansen Date: Sun, 24 May 2009 15:48:07 +0000 Subject: [PATCH] migrated analyze biopieces git-svn-id: http://biopieces.googlecode.com/svn/trunk@395 74ccb610-7750-0410-82ae-013aeee3265d --- bp_bin/align_seq | 2 + bp_bin/analyze_bed | 78 +++++++++++++++++++++- bp_bin/analyze_seq | 84 +++++++++++++++++++++++- bp_bin/analyze_tags | 104 +++++++++++++++++++++++++++++- bp_bin/analyze_vals | 154 +++++++++++++++++++++++++++++++++++++++++++- 5 files changed, 410 insertions(+), 12 deletions(-) diff --git a/bp_bin/align_seq b/bp_bin/align_seq index 33930cb..e34759a 100755 --- a/bp_bin/align_seq +++ b/bp_bin/align_seq @@ -41,6 +41,8 @@ use constant { my ( $run_time_beg, $run_time_end, $options, $in, $out, $record, @entries, $entry ); +$options = Maasha::Biopieces::parse_options(); + $in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } ); $out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } ); diff --git a/bp_bin/analyze_bed b/bp_bin/analyze_bed index fdf5bd2..47f2102 100755 --- a/bp_bin/analyze_bed +++ b/bp_bin/analyze_bed @@ -1,6 +1,78 @@ -#!/usr/bin/env perl +#!/usr/bin/env perl -w + +# Copyright (C) 2007-2009 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ +# http://www.gnu.org/copyleft/gpl.html + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# Analyze BED entries in the stream. + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + -use warnings; use strict; +use Maasha::Biopieces; +use Maasha::UCSC; + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +my ( $run_time_beg, $run_time_end, $options, $in, $out, $record ); + +$options = Maasha::Biopieces::parse_options(); + +$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } ); +$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } ); + +while ( $record = Maasha::Biopieces::get_record( $in ) ) +{ + $record = Maasha::UCSC::bed_analyze( $record ) if $record->{ "REC_TYPE" } eq "BED"; + + Maasha::Biopieces::put_record( $record, $out ); +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +BEGIN +{ + $run_time_beg = Maasha::Biopieces::run_time(); + + Maasha::Biopieces::log_biopiece(); +} + + +END +{ + Maasha::Biopieces::close_stream( $in ); + Maasha::Biopieces::close_stream( $out ); + + $run_time_end = Maasha::Biopieces::run_time(); + + Maasha::Biopieces::run_time_print( $run_time_beg, $run_time_end, $options ); +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +__END__ -use Maasha::BioRun; diff --git a/bp_bin/analyze_seq b/bp_bin/analyze_seq index fdf5bd2..1af08dc 100755 --- a/bp_bin/analyze_seq +++ b/bp_bin/analyze_seq @@ -1,6 +1,84 @@ -#!/usr/bin/env perl +#!/usr/bin/env perl -w + +# Copyright (C) 2007-2009 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. 
+ +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +# Analyze sequences in the stream. + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + -use warnings; use strict; +use Maasha::Biopieces; +use Maasha::Seq; + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +my ( $run_time_beg, $run_time_end, $options, $in, $out, $record, $analysis ); + +$options = Maasha::Biopieces::parse_options(); + +$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } ); +$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } ); + +while ( $record = Maasha::Biopieces::get_record( $in ) ) +{ + if ( $record->{ "SEQ" } ) + { + $analysis = Maasha::Seq::seq_analyze( $record->{ "SEQ" } ); + + map { $record->{ $_ } = $analysis->{ $_ } } keys %{ $analysis }; + } + + Maasha::Biopieces::put_record( $record, $out ); +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +BEGIN +{ + $run_time_beg = Maasha::Biopieces::run_time(); + + Maasha::Biopieces::log_biopiece(); +} + + +END +{ + Maasha::Biopieces::close_stream( $in ); + Maasha::Biopieces::close_stream( $out ); + + $run_time_end = Maasha::Biopieces::run_time(); + + Maasha::Biopieces::run_time_print( $run_time_beg, $run_time_end, $options ); +} + + +# 
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +__END__ -use Maasha::BioRun; diff --git a/bp_bin/analyze_tags b/bp_bin/analyze_tags index fdf5bd2..f21478d 100755 --- a/bp_bin/analyze_tags +++ b/bp_bin/analyze_tags @@ -1,6 +1,104 @@ -#!/usr/bin/env perl +#!/usr/bin/env perl -w + +# Copyright (C) 2007-2009 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# Analyze sequence tags in sequence or BED records from the stream. 
+ +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + -use warnings; use strict; +use Maasha::Biopieces; +use Maasha::Seq; + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +my ( $run_time_beg, $run_time_end, $options, $in, $out, $record, %len_hash, %clone_hash, $clones, $key, $tag_record ); + +$options = Maasha::Biopieces::parse_options(); + +$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } ); +$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } ); + +while ( $record = Maasha::Biopieces::get_record( $in ) ) +{ + if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) + { + if ( $record->{ "SEQ_NAME" } =~ /_(\d+)$/ ) + { + $clones = $1; + + $len_hash{ length( $record->{ "SEQ" } ) }++; + $clone_hash{ length( $record->{ "SEQ" } ) } += $clones; + } + } + elsif ( $record->{ "Q_ID" } and $record->{ "BED_LEN" } ) + { + if ( $record->{ "Q_ID" } =~ /_(\d+)$/ ) + { + $clones = $1; + + $len_hash{ $record->{ "BED_LEN" } }++; + $clone_hash{ $record->{ "BED_LEN" } } += $clones; + } + } +} + +foreach $key ( sort { $a <=> $b } keys %len_hash ) +{ + $tag_record->{ "TAG_LEN" } = $key; + $tag_record->{ "TAG_COUNT" } = $len_hash{ $key }; + $tag_record->{ "TAG_CLONES" } = $clone_hash{ $key }; + + Maasha::Biopieces::put_record( $tag_record, $out ); +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +BEGIN +{ + $run_time_beg = Maasha::Biopieces::run_time(); + + Maasha::Biopieces::log_biopiece(); +} + + +END +{ + Maasha::Biopieces::close_stream( $in ); + Maasha::Biopieces::close_stream( $out ); + + $run_time_end = Maasha::Biopieces::run_time(); + + Maasha::Biopieces::run_time_print( $run_time_beg, $run_time_end, $options ); +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +__END__ -use Maasha::BioRun; diff --git a/bp_bin/analyze_vals b/bp_bin/analyze_vals index 
fdf5bd2..1ddd36f 100755 --- a/bp_bin/analyze_vals +++ b/bp_bin/analyze_vals @@ -1,6 +1,154 @@ -#!/usr/bin/env perl +#!/usr/bin/env perl -w + +# Copyright (C) 2007-2009 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# Determine type, count, min, max, sum and mean for values in stream. 
+ +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + -use warnings; use strict; +use Maasha::Biopieces; +use Maasha::Calc; +use Data::Dumper; + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +my ( $run_time_beg, $run_time_end, $options, $in, $out, $record, $analysis, $key, $len, + %key_hash, $skip, $keys, $types, $counts, $mins, $maxs, $sums, $means ); + +$options = Maasha::Biopieces::parse_options( + [ + { long => 'no_stream', short => 'x', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'keys', short => 'k', type => 'list', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + ] +); + +$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } ); +$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } ); + +map { $key_hash{ $_ } = 1; $skip = 1 } @{ $options->{ "keys" } }; + +while ( $record = Maasha::Biopieces::get_record( $in ) ) +{ + foreach $key ( keys %{ $record } ) + { + next if $skip and not exists $key_hash{ $key }; + + if ( Maasha::Calc::is_a_number( $record->{ $key } ) ) + { + if ( not exists $analysis->{ $key } ) + { + $analysis->{ $key }->{ "MIN" } = $record->{ $key }; + $analysis->{ $key }->{ "MAX" } = $record->{ $key }; + } + + $analysis->{ $key }->{ "MAX" } = Maasha::Calc::max( $analysis->{ $key }->{ "MAX" }, $record->{ $key } ); + $analysis->{ $key }->{ "MIN" } = Maasha::Calc::min( $analysis->{ $key }->{ "MIN" }, $record->{ $key } ); + + $analysis->{ $key }->{ "TYPE" } = "num"; + $analysis->{ $key }->{ "SUM" } += $record->{ $key }; + } + else + { + $len = length $record->{ $key }; + + if ( not exists $analysis->{ $key } ) + { + $analysis->{ $key }->{ "MIN" } = $len; + $analysis->{ $key }->{ "MAX" } = $len; + } + + $analysis->{ $key }->{ "MAX" } = Maasha::Calc::max( $analysis->{ $key }->{ "MAX" }, $len ); + $analysis->{ $key }->{ "MIN" } = Maasha::Calc::min( 
$analysis->{ $key }->{ "MIN" }, $len ); + + $analysis->{ $key }->{ "TYPE" } = "alph"; + $analysis->{ $key }->{ "SUM" } += $len; + } + + $analysis->{ $key }->{ "COUNT" }++; + } + + Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" }; +} + +foreach $key ( keys %{ $analysis } ) +{ + $analysis->{ $key }->{ "MEAN" } = sprintf "%.2f", $analysis->{ $key }->{ "SUM" } / $analysis->{ $key }->{ "COUNT" }; + $analysis->{ $key }->{ "SUM" } = sprintf "%.2f", $analysis->{ $key }->{ "SUM" }; +} + +$keys = "KEY "; +$types = "TYPE "; +$counts = "COUNT"; +$mins = "MIN "; +$maxs = "MAX "; +$sums = "SUM "; +$means = "MEAN "; + +foreach $key ( keys %{ $analysis } ) +{ + $keys .= sprintf "% 15s", $key; + $types .= sprintf "% 15s", $analysis->{ $key }->{ "TYPE" }; + $counts .= sprintf "% 15s", $analysis->{ $key }->{ "COUNT" }; + $mins .= sprintf "% 15s", $analysis->{ $key }->{ "MIN" }; + $maxs .= sprintf "% 15s", $analysis->{ $key }->{ "MAX" }; + $sums .= sprintf "% 15s", $analysis->{ $key }->{ "SUM" }; + $means .= sprintf "% 15s", $analysis->{ $key }->{ "MEAN" }; +} + +print "$keys\n"; +print "$types\n"; +print "$counts\n"; +print "$mins\n"; +print "$maxs\n"; +print "$sums\n"; +print "$means\n"; + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +BEGIN +{ + $run_time_beg = Maasha::Biopieces::run_time(); + + Maasha::Biopieces::log_biopiece(); +} + +END +{ + Maasha::Biopieces::close_stream( $in ); + Maasha::Biopieces::close_stream( $out ); + + $run_time_end = Maasha::Biopieces::run_time(); + + Maasha::Biopieces::run_time_print( $run_time_beg, $run_time_end, $options ); +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + -use Maasha::BioRun; +__END__ -- 2.39.5