fixed seq qual length check

[biopieces.git] / code_perl / Maasha / Seq.pm
diff --git a/code_perl/Maasha/Seq.pm b/code_perl/Maasha/Seq.pm

index f2b8089c1535b1249d0732e494c2f959970dc998..4d6c7997665eb2ff19d4e2e44b6c035de841dbd4 100644 (file)
--- a/code_perl/Maasha/Seq.pm
+++ b/code_perl/Maasha/Seq.pm
@@ -28,7 +28,10 @@ package Maasha::Seq;
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  
  
+use warnings;
  use strict;
+use POSIX qw( islower );
+use Maasha::Common;
  use Data::Dumper;
  use IPC::Open2;
  use List::Util qw( shuffle );
@@ -62,11 +65,11 @@ sub seq_guess_type
      }
  
      if ( $count = $check_seq =~ tr/FLPQIEflpqie// and $count > 0 ) {
-        return "protein";
+        return "PROTEIN";
      } elsif ( $count = $check_seq =~ tr/Uu// and $count > 0 ) {
-        return "rna";
+        return "RNA";
      } else {
-        return "dna";
+        return "DNA";
      }
  }
  
@@ -450,7 +453,7 @@ sub translate
      if ( $frame =~ /-?[1-3]/ )
      {
          if ( $frame < 0 ) {
-            $dna = &Maasha::Seq::dna_revcomp( $dna );
+            $dna = Maasha::Seq::dna_revcomp( $dna );
          }
  
          $frame = abs( $frame ) - 1;
@@ -459,7 +462,7 @@ sub translate
      }
      else
      {
-        &Maasha::Common::error( qq(Badly formated frame "$frame") );
+        Maasha::Common::error( qq(Badly formated frame "$frame") );
      }
  
      $pos = 0;
@@ -468,7 +471,7 @@ sub translate
      {
          last if not length $codon == 3;
  
-        $pep .= &codon2aa( $codon );
+        $pep .= codon2aa( $codon );
      
          $pos += 3;
      }
@@ -495,7 +498,7 @@ sub fold_struct_rnafold
  
      $pid = open2( $fh_out, $fh_in, "RNAfold -noPS" );
  
-    &Maasha::Fasta::put_entry( [ "RNAfold", $seq ], $fh_in );
+    Maasha::Fasta::put_entry( [ "RNAfold", $seq ], $fh_in );
  
      close $fh_in;
  
@@ -537,13 +540,13 @@ sub fold_struct_contrastruct
      $out_file1 = "$tmp_dir/fold.out1";
      $out_file2 = "$tmp_dir/fold.out2";
  
-    &Maasha::Fasta::put_entries( [ [ "fold", $seq ] ], $tmp_file );
+    Maasha::Fasta::put_entries( [ [ "fold", $seq ] ], $tmp_file );
  
-    &Maasha::Common::run( "contrafold", "predict --parens $out_file1 --bpseq $out_file2 $tmp_file" );
+    Maasha::Common::run( "contrafold", "predict --parens $out_file1 --bpseq $out_file2 $tmp_file" );
  
      unlink $tmp_file;
  
-    $fh = &Maasha::Common::read_open( $out_file1 );
+    $fh = Maasha::Common::read_open( $out_file1 );
  
      while ( $line = <$fh> )
      {
@@ -556,7 +559,7 @@ sub fold_struct_contrastruct
  
      unlink $out_file1;
  
-    $fh = &Maasha::Common::read_open( $out_file2 );
+    $fh = Maasha::Common::read_open( $out_file2 );
  
      while ( $line = <$fh> )
      {
@@ -575,7 +578,7 @@ sub fold_struct_contrastruct
          {
              last if $AoA[ $i ]->[ 0 ] > $AoA[ $i ]->[ 2 ];
  
-            $temp += &base_pair_melting_temp( $AoA[ $i ]->[ 1 ] . $AoA[ $AoA[ $i ]->[ 2 ] - 1 ]->[ 1 ] );
+            $temp += base_pair_melting_temp( $AoA[ $i ]->[ 1 ] . $AoA[ $AoA[ $i ]->[ 2 ] - 1 ]->[ 1 ] );
          }
      }
  
@@ -636,7 +639,7 @@ sub generate_dna_oligos
  
      # Generates all possible DNA oligos of a given wordsize.
  
-    # alternative way: perl -MData::Dumper -e '@CONV = glob( "{T,C,A,G}" x 4 ); print Dumper( \@CONV )'
+    # alternative way: perl -MData::Dumper -e '@CONV = glob( "{A,T,C,G}" x 4 ); print Dumper( \@CONV )'
  
  
      my ( $wordsize,   # size of DNA oligos
@@ -763,7 +766,7 @@ sub seq_generate
           $alph,   # sequence alphabet
         ) = @_;
  
-    # returns string
+    # Returns a string
  
      my ( $alph_len, $i, $seq );
      
@@ -777,6 +780,117 @@ sub seq_generate
  }
  
  
+sub seq_insert
+{
+    # Martin A. Hansen, June 2009.
+
+    # Randomly duplicates a given number of residues in a given sequence.
+
+    my ( $seq,          # sequence to mutate
+         $insertions,   # number of residues to insert
+       ) = @_;
+
+    # Returns a string.
+
+    my ( $i, $pos );
+
+    for ( $i = 0; $i < $insertions; $i++ )
+    {
+        $pos = int( rand( length $seq ) );
+
+        substr $seq, $pos, 0, substr $seq , $pos, 1;
+    }
+
+    return $seq;
+}
+
+
+sub seq_delete
+{
+    # Martin A. Hansen, June 2009.
+
+    # Randomly deletes a given number of residues from a given sequence.
+
+    my ( $seq,         # sequence to mutate
+         $deletions,   # number of residues to delete
+       ) = @_;
+
+    # Returns a string.
+
+    my ( $i, $pos );
+
+    for ( $i = 0; $i < $deletions; $i++ )
+    {
+        $pos = int( rand( length $seq ) );
+
+        substr $seq, $pos, 1, '';
+    }
+
+    return $seq;
+}
+
+
+sub seq_mutate
+{
+    # Martin A. Hansen, June 2009.
+
+    # Introduces a given number of random mutations in a 
+    # given sequence of a specified alphabet.
+
+    my ( $seq,         # sequence to mutate
+         $mutations,   # number of mutations
+         $alph,        # alphabet of sequence
+       ) = @_;
+
+    # Returns a string.
+
+    my ( $i, $pos, %lookup_hash );
+
+    $i = 0;
+
+    while ( $i < $mutations )
+    {
+        $pos = int( rand( length $seq ) );
+
+        if ( not exists $lookup_hash{ $pos } )
+        {
+            substr $seq, $pos, 1, res_mutate( substr( $seq , $pos, 1 ), $alph );
+
+            $lookup_hash{ $pos } = 1;
+        
+            $i++;
+        }
+    }
+
+    return $seq;
+}
+
+
+sub res_mutate
+{
+    # Martin A. Hansen, June 2009.
+
+    # Mutates a given residue to another from a given alphabet.
+
+    my ( $res,    # residue to mutate
+         $alph,   # alphabet
+       ) = @_;
+
+    # Returns a char.
+
+    my ( $alph_len, $new );
+
+    $alph_len = scalar @{ $alph };
+    $new      = $res;
+
+    while ( uc $new eq uc $res ) {
+        $new = $alph->[ int( rand( $alph_len ) ) ];
+    }
+
+    return POSIX::islower( $res ) ? lc $new : uc $new;
+}
+
+
  sub seq_shuffle
  {
      # Martin A. Hansen, December 2007.
@@ -786,7 +900,7 @@ sub seq_shuffle
      my ( $seq,   # sequence string
         ) = @_;
  
-    # Returns string.
+    # Returns a string.
  
      my ( @list );
  
@@ -800,26 +914,30 @@ sub seq_alph
  {
      # Martin A. Hansen, May 2007.
  
-    # returns a requested alphabet
+    # Returns a requested sequence alphabet.
  
      my ( $type,   # alphabet type
         ) = @_;
  
      # returns list
  
-    my ( @alph );
+    my ( %alph_hash, $alph );
+
+    %alph_hash = (
+        DNA      => [ qw(A T C G) ],
+        DNA_AMBI => [ qw(A G C U T R Y W S M K H D V B N) ],
+        RNA      => [ qw(A U C G) ],
+        RNA_AMBI => [ qw(A G C U T R Y W S M K H D V B N) ],
+        PROTEIN  => [ qw(F L S Y C W P H Q R I M T N K V A D E G) ],
+    );
  
-    if ( $type =~ /^dna$/i ) {
-        @alph = qw( A T C G );
-    } elsif ( $type =~ /^rna$/i ) {
-        @alph = qw( A U C G );
-    } elsif ( $type =~ /^prot/i ) {
-        @alph = qw( F L S Y C W P H Q R I M T N K V A D E G );
+    if ( exists $alph_hash{ uc $type } ) {
+        $alph = $alph_hash{ uc $type };
      } else {
          die qq(ERROR: Unknown alphabet type: "$type"\n);
      }
  
-    return wantarray ? @alph : \@alph;
+    return wantarray ? @{ $alph } : $alph;
  }
  
  
@@ -836,7 +954,7 @@ sub seq_analyze
  
      my ( %analysis, @chars, @chars_lc, $char, %char_hash, $gc, $at, $lc, $max, $res_sum, @indels, %indel_hash );
  
-    $analysis{ "SEQ_TYPE" } = uc &Maasha::Seq::seq_guess_type( $seq );
+    $analysis{ "SEQ_TYPE" } = Maasha::Seq::seq_guess_type( $seq );
      $analysis{ "SEQ_LEN" }  = length $seq;
  
      @indels = qw( - ~ . _ );
@@ -881,17 +999,17 @@ sub seq_analyze
  
          $char_hash{ $char } += $char_hash{ lc $char };
  
-        $analysis{ "RES:$char" } = $char_hash{ $char };
+        $analysis{ "RES[$char]" } = $char_hash{ $char };
  
          $max = $char_hash{ $char } if $char_hash{ $char } > $max;
  
          $analysis{ "RES_SUM" } += $char_hash{ $char };
      }
  
-    map { $analysis{ "RES:$_" } = $indel_hash{ $_ } } @indels;
+    map { $analysis{ "RES[$_]" } = $indel_hash{ $_ } } @indels;
  
      $analysis{ "MIX_INDEX" } = sprintf( "%.2f", $max / $analysis{ "SEQ_LEN" } );
-    $analysis{ "MELT_TEMP" } = sprintf( "%.2f", 4 * $gc + 2 * $at );
+    #$analysis{ "MELT_TEMP" } = sprintf( "%.2f", 4 * $gc + 2 * $at );
  
      return wantarray ? %analysis : \%analysis;
  }
@@ -965,11 +1083,11 @@ sub seqlogo_calc
          delete $char_hash{ "~" };
          delete $char_hash{ "." };
  
-        $bit_height = &seqlogo_calc_bit_height( \%char_hash, $char_tot );
+        $bit_height = seqlogo_calc_bit_height( \%char_hash, $char_tot );
  
          $bit_diff = $bit_max - $bit_height;
  
-        $char_heights = &seqlogo_calc_char_heights( \%char_hash, $char_tot, $bit_diff );
+        $char_heights = seqlogo_calc_char_heights( \%char_hash, $char_tot, $bit_diff );
  
          push @logo, $char_heights;
      }
@@ -1385,6 +1503,103 @@ sub seq_word_unpack
      return $word;
  }
  
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> ADAPTOR LOCATING <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+###############################    REDUNDANT   ##############################
+
+# these functions have been replaced by index_m and match_m in Common.pm
+
+sub find_adaptor
+{
+    # Martin A. Hansen & Selene Fernandez, August 2008
+
+    # Locates an adaptor sequence in a given sequence
+    # starting from a given offset position allowing a given
+    # number of mismatches (no indels).
+
+    my ( $adaptor,       # list of chars
+         $tag,           # list of chars
+         $adaptor_len,   # length of adaptor sequence
+         $tag_len,       # length of tag sequence
+         $tag_offset,    # offset position
+         $max_match,     # number of matches indicating success
+         $max_mismatch,  # number of mismatches
+       ) = @_;
+
+    # Returns an integer.
+
+    my ( $i, $j, $match, $mismatch );
+
+    $i = $tag_offset;
+
+    while ( $i < $tag_len - ( $max_match + $max_mismatch ) + 1 )
+    {
+        $j = 0;
+        
+        while ( $j < $adaptor_len - ( $max_match + $max_mismatch ) + 1 )
+        {
+            return $i if check_diag( $adaptor, $tag, $adaptor_len, $tag_len, $j, $i, $max_match, $max_mismatch );
+
+            $j++
+        }
+    
+        $i++;
+    }
+
+    return -1;
+}
+
+
+sub check_diag
+{
+    # Martin A. Hansen & Selene Fernandez, August 2008
+
+    # Checks the diagonal starting at a given coordinate 
+    # of a search space constituted by an adaptor and a tag sequence.
+    # Residues in the diagonal are compared between the sequences allowing
+    # for a given number of mismatches. We terminate when search space is
+    # exhausted or if max matches or mismatches is reached.
+
+    my ( $adaptor,       # list of chars
+         $tag,           # list of chars
+         $adaptor_len,   # length of adaptor sequence
+         $tag_len,       # length of tag sequence
+         $adaptor_beg,   # adaptor begin coordinate
+         $tag_beg,       # tag begin coordinate
+         $max_match,     # number of matches indicating success
+         $max_mismatch,  # number of mismatches
+       ) = @_;
+
+    # Returns boolean.
+
+    my ( $match, $mismatch );
+
+    $match    = 0;
+    $mismatch = 0;
+
+    while ( $adaptor_beg <= $adaptor_len and $tag_beg <= $tag_len )
+    {
+        if ( $adaptor->[ $adaptor_beg ] eq $tag->[ $tag_beg ] )
+        {
+            $match++;
+
+            return 1 if $match >= $max_match;
+        }
+        else
+        {
+            $mismatch++;
+
+            return 0 if $mismatch > $max_mismatch;
+        }
+    
+        $adaptor_beg++;
+        $tag_beg++;
+    }
+
+    return 0;
+}
          
  
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<