X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=code_perl%2FMaasha%2FSeq.pm;h=4d6c7997665eb2ff19d4e2e44b6c035de841dbd4;hb=381ac0da9f2a23e39e5fe3708b120cea3530b8d6;hp=c224808f2ad109bf31140d947130ab0e79bf57cb;hpb=7c9609e8a165301350118cde21387c8a2e63dd42;p=biopieces.git diff --git a/code_perl/Maasha/Seq.pm b/code_perl/Maasha/Seq.pm index c224808..4d6c799 100644 --- a/code_perl/Maasha/Seq.pm +++ b/code_perl/Maasha/Seq.pm @@ -28,7 +28,10 @@ package Maasha::Seq; # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +use warnings; use strict; +use POSIX qw( islower ); +use Maasha::Common; use Data::Dumper; use IPC::Open2; use List::Util qw( shuffle ); @@ -62,11 +65,11 @@ sub seq_guess_type } if ( $count = $check_seq =~ tr/FLPQIEflpqie// and $count > 0 ) { - return "protein"; + return "PROTEIN"; } elsif ( $count = $check_seq =~ tr/Uu// and $count > 0 ) { - return "rna"; + return "RNA"; } else { - return "dna"; + return "DNA"; } } @@ -636,7 +639,7 @@ sub generate_dna_oligos # Generates all possible DNA oligos of a given wordsize. - # alternative way: perl -MData::Dumper -e '@CONV = glob( "{T,C,A,G}" x 4 ); print Dumper( \@CONV )' + # alternative way: perl -MData::Dumper -e '@CONV = glob( "{A,T,C,G}" x 4 ); print Dumper( \@CONV )' my ( $wordsize, # size of DNA oligos @@ -763,7 +766,7 @@ sub seq_generate $alph, # sequence alphabet ) = @_; - # returns string + # Returns a string my ( $alph_len, $i, $seq ); @@ -777,6 +780,117 @@ sub seq_generate } +sub seq_insert +{ + # Martin A. Hansen, June 2009. + + # Randomly duplicates a given number of residues in a given sequence. + + my ( $seq, # sequence to mutate + $insertions, # number of residues to insert + ) = @_; + + # Returns a string. + + my ( $i, $pos ); + + for ( $i = 0; $i < $insertions; $i++ ) + { + $pos = int( rand( length $seq ) ); + + substr $seq, $pos, 0, substr $seq , $pos, 1; + } + + return $seq; +} + + +sub seq_delete +{ + # Martin A. Hansen, June 2009. + + # Randomly deletes a given number of residues from a given sequence. + + my ( $seq, # sequence to mutate + $deletions, # number of residues to delete + ) = @_; + + # Returns a string. + + my ( $i, $pos ); + + for ( $i = 0; $i < $deletions; $i++ ) + { + $pos = int( rand( length $seq ) ); + + substr $seq, $pos, 1, ''; + } + + return $seq; +} + + +sub seq_mutate +{ + # Martin A. Hansen, June 2009. + + # Introduces a given number of random mutations in a + # given sequence of a specified alphabet. + + my ( $seq, # sequence to mutate + $mutations, # number of mutations + $alph, # alphabet of sequence + ) = @_; + + # Returns a string. + + my ( $i, $pos, %lookup_hash ); + + $i = 0; + + while ( $i < $mutations ) + { + $pos = int( rand( length $seq ) ); + + if ( not exists $lookup_hash{ $pos } ) + { + substr $seq, $pos, 1, res_mutate( substr( $seq , $pos, 1 ), $alph ); + + $lookup_hash{ $pos } = 1; + + $i++; + } + } + + return $seq; +} + + +sub res_mutate +{ + # Martin A. Hansen, June 2009. + + # Mutates a given residue to another from a given alphabet. + + my ( $res, # residue to mutate + $alph, # alphabet + ) = @_; + + # Returns a char. + + my ( $alph_len, $new ); + + $alph_len = scalar @{ $alph }; + $new = $res; + + while ( uc $new eq uc $res ) { + $new = $alph->[ int( rand( $alph_len ) ) ]; + } + + return POSIX::islower( $res ) ? lc $new : uc $new; +} + + sub seq_shuffle { # Martin A. Hansen, December 2007. @@ -786,7 +900,7 @@ sub seq_shuffle my ( $seq, # sequence string ) = @_; - # Returns string. + # Returns a string. my ( @list ); @@ -800,26 +914,30 @@ sub seq_alph { # Martin A. Hansen, May 2007. - # returns a requested alphabet + # Returns a requested sequence alphabet. my ( $type, # alphabet type ) = @_; # returns list - my ( @alph ); + my ( %alph_hash, $alph ); - if ( $type =~ /^dna$/i ) { - @alph = qw( A T C G ); - } elsif ( $type =~ /^rna$/i ) { - @alph = qw( A U C G ); - } elsif ( $type =~ /^prot/i ) { - @alph = qw( F L S Y C W P H Q R I M T N K V A D E G ); + %alph_hash = ( + DNA => [ qw(A T C G) ], + DNA_AMBI => [ qw(A G C U T R Y W S M K H D V B N) ], + RNA => [ qw(A U C G) ], + RNA_AMBI => [ qw(A G C U T R Y W S M K H D V B N) ], + PROTEIN => [ qw(F L S Y C W P H Q R I M T N K V A D E G) ], + ); + + if ( exists $alph_hash{ uc $type } ) { + $alph = $alph_hash{ uc $type }; } else { die qq(ERROR: Unknown alphabet type: "$type"\n); } - return wantarray ? @alph : \@alph; + return wantarray ? @{ $alph } : $alph; } @@ -836,7 +954,7 @@ sub seq_analyze my ( %analysis, @chars, @chars_lc, $char, %char_hash, $gc, $at, $lc, $max, $res_sum, @indels, %indel_hash ); - $analysis{ "SEQ_TYPE" } = uc Maasha::Seq::seq_guess_type( $seq ); + $analysis{ "SEQ_TYPE" } = Maasha::Seq::seq_guess_type( $seq ); $analysis{ "SEQ_LEN" } = length $seq; @indels = qw( - ~ . _ ); @@ -881,17 +999,17 @@ sub seq_analyze $char_hash{ $char } += $char_hash{ lc $char }; - $analysis{ "RES:$char" } = $char_hash{ $char }; + $analysis{ "RES[$char]" } = $char_hash{ $char }; $max = $char_hash{ $char } if $char_hash{ $char } > $max; $analysis{ "RES_SUM" } += $char_hash{ $char }; } - map { $analysis{ "RES:$_" } = $indel_hash{ $_ } } @indels; + map { $analysis{ "RES[$_]" } = $indel_hash{ $_ } } @indels; $analysis{ "MIX_INDEX" } = sprintf( "%.2f", $max / $analysis{ "SEQ_LEN" } ); - $analysis{ "MELT_TEMP" } = sprintf( "%.2f", 4 * $gc + 2 * $at ); + #$analysis{ "MELT_TEMP" } = sprintf( "%.2f", 4 * $gc + 2 * $at ); return wantarray ? %analysis : \%analysis; } @@ -1389,6 +1507,10 @@ sub seq_word_unpack # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> ADAPTOR LOCATING <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +############################### REDUNDANT ############################## + +# these functions have been replaced by index_m and match_m in Common.pm + sub find_adaptor { # Martin A. Hansen & Selene Fernandez, August 2008