From ad46292d0fd8f1d757ae876d5c9771964bce27e7 Mon Sep 17 00:00:00 2001
From: martinahansen
Date: Fri, 10 Jul 2009 18:38:43 +0000
Subject: [PATCH] speedup of analyze_seq 3 fold

git-svn-id: http://biopieces.googlecode.com/svn/trunk@564 74ccb610-7750-0410-82ae-013aeee3265d
---
Review notes (this section is ignored when the patch is applied):

 * seq_alph: added PROTEIN_AMBI, because seq_analyze asks for the
   "<TYPE>_AMBI" alphabet and died with "Unknown alphabet type" on every
   protein record.
 * seq_alph: the lookup is case insensitive again and still accepts any
   name starting with "prot", like the if/elsif chain it replaces.
 * seq_analyze: HARD_MASK% no longer counts lower case n twice.
 * seq_analyze: no division by zero for sequences holding indels only.
 * seq_analyze: RES_SUM starts from zero; absent indel counts become 0.
 * Whitespace of the context lines was reconstructed and the index lines
   are those of the original patch; apply with whitespace tolerance.

 bp_bin/analyze_seq      | 92 +++++++++++++++++++++++++++++++++++++++++---
 code_perl/Maasha/Seq.pm | 36 ++++++++++-------
 2 files changed, 109 insertions(+), 19 deletions(-)

diff --git a/bp_bin/analyze_seq b/bp_bin/analyze_seq
index b6f7d8c..1418889 100755
--- a/bp_bin/analyze_seq
+++ b/bp_bin/analyze_seq
@@ -22,7 +22,8 @@
 
 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 
-# Analyze BED entries in the stream.
+# Analyze sequences in the stream.
+
 
 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 
@@ -30,6 +31,7 @@
 use warnings;
 use strict;
 use Maasha::Biopieces;
+use Maasha::Common;
 use Maasha::Seq;
 
 
@@ -45,12 +47,7 @@ $out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } );
 
 while ( $record = Maasha::Biopieces::get_record( $in ) ) 
 {
-    if ( $record->{ "SEQ" } )
-    {
-        $analysis = Maasha::Seq::seq_analyze( $record->{ "SEQ" } );
-
-        map { $record->{ $_ } = $analysis->{ $_ } } keys %{ $analysis };
-    }
+    seq_analyze( $record ) if $record->{ "SEQ" };
 
     Maasha::Biopieces::put_record( $record, $out );
 }
@@ -62,6 +59,87 @@ Maasha::Biopieces::close_stream( $out );
 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 
 
+sub seq_analyze
+{
+    # Martin A. Hansen, July 2009.
+
+    # Analyzes the composition of the string in the record and appends
+    # the analysis to the record: SEQ_TYPE, SEQ_LEN, RES_SUM and one
+    # RES[<char>] count per alphabet and indel character. DNA and RNA
+    # records also get MIX_INDEX, GC%, SOFT_MASK%, HARD_MASK% and MELT_TEMP.
+
+    my ( $record,   # Biopiece record with a SEQ entry.
+       ) = @_;
+
+    # Returns nothing.
+
+    my ( %char_hash, @indels, @alph, $char, $gc, $at, $lc, $max, $indels, $residues );
+
+    %char_hash = Maasha::Common::str_analyze( $record->{ 'SEQ' } );
+
+    $record->{ 'SEQ_TYPE' } = Maasha::Seq::seq_guess_type( $record->{ 'SEQ' } );
+    $record->{ 'SEQ_LEN' }  = length $record->{ 'SEQ' };
+
+    @alph   = Maasha::Seq::seq_alph( $record->{ 'SEQ_TYPE' } . "_AMBI" );
+    @indels = qw( - ~ . _ );
+
+    $max = 0;
+    $lc  = 0;
+
+    # Reset the sum so re-analyzing a record does not add to a stale RES_SUM.
+    $record->{ "RES_SUM" } = 0;
+
+    foreach $char ( @alph )
+    {
+        # Lower case residues are soft masked; tally them before folding.
+        $lc += $char_hash{ lc $char } || 0;
+
+        # Fold the lower case count into the upper case residue count.
+        $char_hash{ $char } += $char_hash{ lc $char } || 0;
+
+        $record->{ "RES[$char]" } = $char_hash{ $char };
+
+        $max = $char_hash{ $char } if $char_hash{ $char } > $max;
+
+        $record->{ "RES_SUM" } += $char_hash{ $char };
+    }
+
+    $indels = 0;
+
+    foreach $char ( @indels )
+    {
+        # Treat an indel character without a count as zero occurrences.
+        $char_hash{ $char } ||= 0;
+
+        $record->{ "RES[$char]" } = $char_hash{ $char };
+
+        $indels += $char_hash{ $char };
+    }
+
+    if ( $record->{ "SEQ_TYPE" } =~ /DNA|RNA/i )
+    {
+        $gc = $char_hash{ "G" } + $char_hash{ "C" };
+        $at = $char_hash{ "A" } + $char_hash{ "T" } + $char_hash{ "U" };
+
+        # A sequence consisting of indels only has no residues; report 0
+        # for the ratios instead of dying on a division by zero.
+        $residues = $record->{ "SEQ_LEN" } - $indels;
+
+        $record->{ "MIX_INDEX" }  = sprintf( "%.2f", $residues ? $max / $residues : 0 );
+        $record->{ "GC%" }        = sprintf( "%.2f", $residues ? 100 * $gc / $residues : 0 );
+        $record->{ "SOFT_MASK%" } = sprintf( "%.2f", $residues ? 100 * $lc / $residues : 0 );
+
+        # The N count already includes lower case n after the fold above,
+        # so adding the n count again would count soft masked n twice.
+        $record->{ "HARD_MASK%" } = sprintf( "%.2f", $residues ? 100 * $char_hash{ "N" } / $residues : 0 );
+        $record->{ "MELT_TEMP" }  = sprintf( "%.2f", 4 * $gc + 2 * $at );
+    }
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
 BEGIN
 {
     Maasha::Biopieces::status_set();
diff --git a/code_perl/Maasha/Seq.pm b/code_perl/Maasha/Seq.pm
index cfedd89..2c64f15 100644
--- a/code_perl/Maasha/Seq.pm
+++ b/code_perl/Maasha/Seq.pm
@@ -65,11 +65,11 @@ sub seq_guess_type
     }
 
     if ( $count = $check_seq =~ tr/FLPQIEflpqie// and $count > 0 ) {
-        return "protein";
+        return "PROTEIN";
     } elsif ( $count = $check_seq =~ tr/Uu// and $count > 0 ) {
-        return "rna";
+        return "RNA";
     } else {
-        return "dna";
+        return "DNA";
     }
 }
 
@@ -914,26 +914,38 @@ sub seq_alph
 {
     # Martin A. Hansen, May 2007.
 
-    # returns a requested alphabet
+    # Returns a requested sequence alphabet.
 
     my ( $type,   # alphabet type
        ) = @_;
 
     # returns list
 
-    my ( @alph );
+    my ( %alph_hash, $key, $alph );
 
-    if ( $type =~ /^dna$/i ) {
-        @alph = qw( A T C G );
-    } elsif ( $type =~ /^rna$/i ) {
-        @alph = qw( A U C G );
-    } elsif ( $type =~ /^prot/i ) {
-        @alph = qw( F L S Y C W P H Q R I M T N K V A D E G );
+    %alph_hash = (
+        DNA          => [ qw( A T C G ) ],
+        DNA_AMBI     => [ qw( A G C U T R Y W S M K H D V B N ) ],
+        RNA          => [ qw( A U C G ) ],
+        RNA_AMBI     => [ qw( A G C U T R Y W S M K H D V B N ) ],
+        PROTEIN      => [ qw( F L S Y C W P H Q R I M T N K V A D E G ) ],
+        # No protein ambiguity codes are defined; the key exists so that a
+        # "<TYPE>_AMBI" lookup also resolves for protein sequences.
+        PROTEIN_AMBI => [ qw( F L S Y C W P H Q R I M T N K V A D E G ) ],
+    );
+
+    # Alphabet names are matched case insensitively and any name starting
+    # with "prot" selects the protein alphabet, as the if/elsif chain did.
+    $key = uc $type;
+    $key = "PROTEIN" if $key =~ /^PROT/ and not exists $alph_hash{ $key };
+
+    if ( exists $alph_hash{ $key } ) {
+        $alph = $alph_hash{ $key };
     } else {
         die qq(ERROR: Unknown alphabet type: "$type"\n);
     }
 
-    return wantarray ? @alph : \@alph;
+    return wantarray ? @{ $alph } : $alph;
 }
 
 
-- 
2.39.5