X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=bp_bin%2Fanalyze_seq;h=14188894cdbd07a88da5282a3b12b108518b923a;hb=2ea9743691410c29f94d49c03e0ea7617bc9e372;hp=1af08dcc47ee40f1988c69f42c97c2d688000078;hpb=52cd51e0b58ed81c412770c2e458838603c2d2ad;p=biopieces.git diff --git a/bp_bin/analyze_seq b/bp_bin/analyze_seq index 1af08dc..1418889 100755 --- a/bp_bin/analyze_seq +++ b/bp_bin/analyze_seq @@ -1,4 +1,4 @@ -#!/usr/bin/env perl -w +#!/usr/bin/env perl # Copyright (C) 2007-2009 Martin A. Hansen. @@ -22,20 +22,23 @@ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -# Analyze BED entries in the stream. +# Analyze sequences in the stream. + # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +use warnings; use strict; use Maasha::Biopieces; +use Maasha::Common; use Maasha::Seq; # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -my ( $run_time_beg, $run_time_end, $options, $in, $out, $record, $analysis ); +my ( $options, $in, $out, $record, $analysis ); $options = Maasha::Biopieces::parse_options(); @@ -44,14 +47,72 @@ $out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } ); while ( $record = Maasha::Biopieces::get_record( $in ) ) { - if ( $record->{ "SEQ" } ) + seq_analyze( $record ) if $record->{ "SEQ" }; + + Maasha::Biopieces::put_record( $record, $out ); +} + +Maasha::Biopieces::close_stream( $in ); +Maasha::Biopieces::close_stream( $out ); + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +sub seq_analyze +{ + # Martin A. Hansen, July 2009. + + # Analyzes the composition of the string in the record and appends + # the analysis to the record. + + my ( $record, # Biopiece record with a SEQ entry. + ) = @_; + + # Returns nothing. + + my ( %char_hash, @indels, @alph, $char, $gc, $at, $lc, $max, $indels ); + + %char_hash = Maasha::Common::str_analyze( $record->{ 'SEQ' } ); + + $record->{ 'SEQ_TYPE' } = Maasha::Seq::seq_guess_type( $record->{ 'SEQ' } ); + $record->{ 'SEQ_LEN' } = length $record->{ 'SEQ' }; + + @alph = Maasha::Seq::seq_alph( $record->{ 'SEQ_TYPE' } . "_AMBI" ); + @indels = qw( - ~ . _ ); + + $max = 0; + + foreach $char ( @alph ) { - $analysis = Maasha::Seq::seq_analyze( $record->{ "SEQ" } ); + $char_hash{ $char } += $char_hash{ lc $char } || 0; + + $record->{ "RES[$char]" } = $char_hash{ $char }; + + $max = $char_hash{ $char } if $char_hash{ $char } > $max; - map { $record->{ $_ } = $analysis->{ $_ } } keys %{ $analysis }; + $record->{ "RES_SUM" } += $char_hash{ $char }; } - Maasha::Biopieces::put_record( $record, $out ); + $indels = 0; + + map { $record->{ "RES[$_]" } = $char_hash{ $_ }; $indels += $char_hash{ $_ } } @indels; + + if ( $record->{ "SEQ_TYPE" } =~ /DNA|RNA/i ) + { + $gc = $char_hash{ "G" } + $char_hash{ "C" }; + $at = $char_hash{ "A" } + $char_hash{ "T" } + $char_hash{ "U" }; + + $lc = 0; + + map { $lc += $char_hash{ lc $_ } || 0 } @alph; + + $record->{ "MIX_INDEX" } = sprintf( "%.2f", $max / ( $record->{ "SEQ_LEN" } - $indels ) ); + $record->{ "GC%" } = sprintf( "%.2f", 100 * $gc / ( $record->{ "SEQ_LEN" } - $indels ) ); + $record->{ "SOFT_MASK%" } = sprintf( "%.2f", 100 * $lc / ( $record->{ "SEQ_LEN" } - $indels ) ); + $record->{ "HARD_MASK%" } = sprintf( "%.2f", 100 * ( $char_hash{ "n" } + $char_hash{ "N" } ) / ( $record->{ "SEQ_LEN" } - $indels ) ); + $record->{ "MELT_TEMP" } = sprintf( "%.2f", 4 * $gc + 2 * $at ); + } } @@ -60,20 +121,13 @@ while ( $record = Maasha::Biopieces::get_record( $in ) ) BEGIN { - $run_time_beg = Maasha::Biopieces::run_time(); - - Maasha::Biopieces::log_biopiece(); + Maasha::Biopieces::status_set(); } END { - Maasha::Biopieces::close_stream( $in ); - Maasha::Biopieces::close_stream( $out ); - - $run_time_end = Maasha::Biopieces::run_time(); - - Maasha::Biopieces::run_time_print( $run_time_beg, $run_time_end, $options ); + Maasha::Biopieces::status_log(); } @@ -81,4 +135,3 @@ END __END__ -