code_perl/Maasha/Seq.pm

   1 package Maasha::Seq;
   2
   3 # Copyright (C) 2007 Martin A. Hansen.
   4
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; either version 2
   8 # of the License, or (at your option) any later version.
   9
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18
  19 # http://www.gnu.org/copyleft/gpl.html
  20
  21
  22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  23
  24
  25 # yak yak yak
  26
  27
  28 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  29
  30
  31 use strict;
  32 use Data::Dumper;
  33 use IPC::Open2;
  34 use List::Util qw( shuffle );
  35 use Time::HiRes qw( gettimeofday );
  36
  37 use vars qw ( @ISA @EXPORT );
  38
  39 @ISA = qw( Exporter );
  40
  41
  42 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  43
  44
  45 sub seq_guess_type
  46 {
  47     # Martin A. Hansen, May 2007.
  48
  49     # Makes a qualified guess on the type of a given squence.
  50
  51     my ( $seq,   # sequence to check
  52        ) = @_;
  53
  54     # returns string.
  55
  56     my ( $check_seq, $count );
  57
  58     if ( length $seq > 100 ) {
  59         $check_seq = substr $seq, 0, 100;
  60     } else {
  61         $check_seq = $seq;
  62     }
  63
  64     if ( $count = $check_seq =~ tr/FLPQIEflpqie// and $count > 0 ) {
  65         return "protein";
  66     } elsif ( $count = $check_seq =~ tr/Uu// and $count > 0 ) {
  67         return "rna";
  68     } else {
  69         return "dna";
  70     }
  71 }
  72
  73
  74 sub wrap
  75 {
  76     # Martin A. Hansen, July 2007.
  77
  78     # Wraps a given string reference accoring to given width.
  79
  80     my ( $strref,   # ref to sting to wrap
  81          $wrap,     # wrap width
  82        ) = @_;
  83
  84     # Returns nothing.
  85
  86     ${ $strref } =~ s/(.{$wrap})/$1\n/g;
  87
  88     chomp ${ $strref };
  89 }
  90
  91
  92 sub dna_revcomp
  93 {
  94     # Niels Larsen
  95     # modified Martin A. Hansen, March 2005.
  96
  97     # Returns the reverse complement of a dna sequence with preservation of case
  98     # according to this mapping,
  99     #
 100     # AGCUTRYWSMKHDVBNagcutrywsmkhdvbn
 101     # TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn
 102
 103     my ( $seq,   # seq
 104        ) = @_;
 105
 106     # returns string
 107
 108     $seq = reverse $seq;
 109
 110     $seq =~ tr/AGCUTRYWSMKHDVBNagcutrywsmkhdvbn/TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn/;
 111
 112     return $seq;
 113 }
 114
 115
 116 sub rna_revcomp
 117 {
 118     # Niels Larsen
 119     # modified Martin A. Hansen, March 2005.
 120
 121     # Returns the complement of a rna sequence with preservation of case
 122     # according to this mapping,
 123     #
 124     # AGCUTRYWSMKHDVBNagcutrywsmkhdvbn
 125     # UCGAAYRWSKMDHBVNucgaayrwskmdhbvn
 126
 127     my ( $seq,   # seq
 128        ) = @_;
 129
 130     $seq = reverse $seq;
 131
 132     $seq =~ tr/AGCUTRYWSMKHDVBNagcutrywsmkhdvbn/UCGAAYRWSKMDHBVNucgaayrwskmdhbvn/;
 133
 134     return $seq;
 135 }
 136
 137
 138 sub dna_comp
 139 {
 140     # Niels Larsen
 141     # modified Martin A. Hansen, March 2005.
 142
 143     # Returns the reverse complement of a dna sequence with preservation of case
 144     # according to this mapping,
 145     #
 146     # AGCUTRYWSMKHDVBNagcutrywsmkhdvbn
 147     # TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn
 148
 149     my ( $seqref,   # seqref
 150        ) = @_;
 151
 152     # Returns nothing.
 153
 154     ${ $seqref } =~ tr/AGCUTRYWSMKHDVBNagcutrywsmkhdvbn/TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn/;
 155 }
 156
 157
 158 sub rna_comp
 159 {
 160     # Niels Larsen
 161     # modified Martin A. Hansen, March 2005.
 162
 163     # Returns the complement of a rna sequence with preservation of case
 164     # according to this mapping,
 165     #
 166     # AGCUTRYWSMKHDVBNagcutrywsmkhdvbn
 167     # UCGAAYRWSKMDHBVNucgaayrwskmdhbvn
 168
 169     my ( $seqref,   # seqref
 170        ) = @_;
 171
 172     # Returns nothing.
 173
 174     ${ $seqref } =~ tr/AGCUTRYWSMKHDVBNagcutrywsmkhdvbn/UCGAAYRWSKMDHBVNucgaayrwskmdhbvn/;
 175 }
 176
 177
 178 sub dna2rna
 179 {
 180     # Martin A. Hansen, March 2007
 181
 182     # Converts DNA sequence to RNA
 183
 184     my ( $seq,   # nucleotide sequence
 185        ) = @_;
 186
 187     # returns string
 188
 189     $seq =~ tr/Tt/Uu/;
 190
 191     return $seq;
 192 }
 193
 194
 195 sub rna2dna
 196 {
 197     # Martin A. Hansen, March 2007
 198
 199     # Converts RNA sequence to DNA
 200
 201     my ( $seq,   # nucleotide sequence
 202        ) = @_;
 203
 204     # returns string
 205
 206     $seq =~ tr/Uu/Tt/;
 207
 208     return $seq;
 209 }
 210
 211
 212 sub nuc2ambiguity
 213 {
 214     # Martin A. Hansen, March 2005.
 215
 216     # given a string of nucleotides
 217     # returns the corresponding ambiguity code
 218
 219     my ( $str,
 220          $type,    # DNA or RNA - DEFAULT DNA
 221        ) = @_;
 222
 223     my ( %hash, @nts, $key, $code, %nt_hash );
 224
 225     $str = uc $str;
 226
 227     if ( not $type or $type =~ /dna/i )
 228     {
 229         $str =~ s/N/ACGT/g;
 230         $str =~ s/B/CGT/g;
 231         $str =~ s/D/AGT/g;
 232         $str =~ s/H/ACT/g;
 233         $str =~ s/V/ACG/g;
 234         $str =~ s/K/GT/g;
 235         $str =~ s/Y/CT/g;
 236         $str =~ s/S/CG/g;
 237         $str =~ s/W/AT/g;
 238         $str =~ s/R/AG/g;
 239         $str =~ s/M/AC/g;
 240     }
 241     else
 242     {
 243         $str =~ s/N/ACGU/g;
 244         $str =~ s/B/CGU/g;
 245         $str =~ s/D/AGU/g;
 246         $str =~ s/H/ACU/g;
 247         $str =~ s/V/ACG/g;
 248         $str =~ s/K/GU/g;
 249         $str =~ s/Y/CU/g;
 250         $str =~ s/S/CG/g;
 251         $str =~ s/W/AU/g;
 252         $str =~ s/R/AG/g;
 253         $str =~ s/M/AC/g;
 254     }
 255
 256     @nts = split //, $str;
 257
 258     %nt_hash = map { $_ => 1 } @nts;
 259
 260     @nts = sort keys %nt_hash;
 261
 262     $key = join "", @nts;
 263
 264     %hash = (
 265         'A'    => 'A',
 266         'C'    => 'C',
 267         'G'    => 'G',
 268         'T'    => 'T',
 269         'U'    => 'U',
 270         'AC'   => 'M',
 271         'AG'   => 'R',
 272         'AT'   => 'W',
 273         'AU'   => 'W',
 274         'CG'   => 'S',
 275         'CT'   => 'Y',
 276         'CU'   => 'Y',
 277         'GT'   => 'K',
 278         'GU'   => 'K',
 279         'ACG'  => 'V',
 280         'ACT'  => 'H',
 281         'ACU'  => 'H',
 282         'AGT'  => 'D',
 283         'AGU'  => 'D',
 284         'CGT'  => 'B',
 285         'CGU'  => 'B',
 286         'ACGT' => 'N',
 287         'ACGU' => 'N',
 288     );
 289
 290     $code = $hash{ $key };
 291
 292     warn qq(WARNING: No ambiguity code for key->$key\n) if not $code;
 293
 294     return $code;
 295 }
 296
 297
 298 sub aa2codons
 299 {
 300     # Martin A. Hansen, March 2005.
 301
 302     # given an amino acid, returns a list of corresponding codons
 303
 304     my ( $aa,   # amino acid to translate
 305        ) = @_;
 306
 307     # returns list
 308
 309     my ( %hash, $codons );
 310
 311     $aa = uc $aa;
 312
 313     %hash = (
 314         'F' => [ 'TTT', 'TTC' ],                               # Phe
 315         'L' => [ 'TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG' ],   # Leu
 316         'S' => [ 'TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC' ],   # Ser
 317         'Y' => [ 'TAT', 'TAC' ],                               # Tyr
 318         '*' => [ 'TAA', 'TAG', 'TGA' ],                        # Stop
 319         'X' => [ 'TAA', 'TAG', 'TGA' ],                        # Stop
 320         'C' => [ 'TGT', 'TGC' ],                               # Cys
 321         'W' => [ 'TGG' ],                                      # Trp
 322         'P' => [ 'CCT', 'CCC', 'CCA', 'CCG' ],                 # Pro
 323         'H' => [ 'CAT', 'CAC' ],                               # His
 324         'Q' => [ 'CAA', 'CAG' ],                               # Gln
 325         'R' => [ 'CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG' ],   # Arg
 326         'I' => [ 'ATT', 'ATC', 'ATA' ],                        # Ile
 327         'M' => [ 'ATG' ],                                      # Met
 328         'T' => [ 'ACT', 'ACC', 'ACA', 'ACG' ],                 # Thr
 329         'N' => [ 'AAT', 'AAC' ],                               # Asn
 330         'K' => [ 'AAA', 'AAG' ],                               # Lys
 331         'V' => [ 'GTT', 'GTC', 'GTA', 'GTG' ],                 # Val
 332         'A' => [ 'GCT', 'GCC', 'GCA', 'GCG' ],                 # Ala
 333         'D' => [ 'GAT', 'GAC' ],                               # Asp
 334         'E' => [ 'GAA', 'GAG' ],                               # Glu
 335         'G' => [ 'GGT', 'GGC', 'GGA', 'GGG' ],                 # Gly
 336     );
 337
 338     $codons = $hash{ $aa };
 339
 340     return wantarray ? @{ $codons } : $codons;
 341 }
 342
 343
 344 sub codon2aa
 345 {
 346     # Martin A. Hansen, March 2005.
 347
 348     # given a codon, returns the correponding
 349     # vertebrate amino acid.
 350
 351     my ( $codon,   # codon to translate
 352        ) = @_;
 353
 354     # returns string
 355
 356     my ( %hash, $aa );
 357
 358     die qq(ERROR: Bad codon: "$codon"\n) if not $codon =~ /[ATCGatcg]{3}/;
 359
 360     %hash = (
 361         'TTT' => 'F',   # Phe
 362         'TTC' => 'F',   # Phe
 363         'TTA' => 'L',   # Leu
 364         'TTG' => 'L',   # Leu
 365         'TCT' => 'S',   # Ser
 366         'TCC' => 'S',   # Ser
 367         'TCA' => 'S',   # Ser
 368         'TCG' => 'S',   # Ser
 369         'TAT' => 'Y',   # Tyr
 370         'TAC' => 'Y',   # Tyr
 371         'TAA' => '*',   # Stop
 372         'TAG' => '*',   # Stop
 373         'TGT' => 'C',   # Cys
 374         'TGC' => 'C',   # Cys
 375         'TGA' => '*',   # Stop
 376         'TGG' => 'W',   # Trp
 377         'CTT' => 'L',   # Leu
 378         'CTC' => 'L',   # Leu
 379         'CTA' => 'L',   # Leu
 380         'CTG' => 'L',   # Leu
 381         'CCT' => 'P',   # Pro
 382         'CCC' => 'P',   # Pro
 383         'CCA' => 'P',   # Pro
 384         'CCG' => 'P',   # Pro
 385         'CAT' => 'H',   # His
 386         'CAC' => 'H',   # His
 387         'CAA' => 'Q',   # Gln
 388         'CAG' => 'Q',   # Gln
 389         'CGT' => 'R',   # Arg
 390         'CGC' => 'R',   # Arg
 391         'CGA' => 'R',   # Arg
 392         'CGG' => 'R',   # Arg
 393         'ATT' => 'I',   # Ile
 394         'ATC' => 'I',   # Ile
 395         'ATA' => 'I',   # Ile
 396         'ATG' => 'M',   # Met
 397         'ACT' => 'T',   # Thr
 398         'ACC' => 'T',   # Thr
 399         'ACA' => 'T',   # Thr
 400         'ACG' => 'T',   # Thr
 401         'AAT' => 'N',   # Asn
 402         'AAC' => 'N',   # Asn
 403         'AAA' => 'K',   # Lys
 404         'AAG' => 'K',   # Lys
 405         'AGT' => 'S',   # Ser
 406         'AGC' => 'S',   # Ser
 407         'AGA' => 'R',   # Arg
 408         'AGG' => 'R',   # Arg
 409         'GTT' => 'V',   # Val
 410         'GTC' => 'V',   # Val
 411         'GTA' => 'V',   # Val
 412         'GTG' => 'V',   # Val
 413         'GCT' => 'A',   # Ala
 414         'GCC' => 'A',   # Ala
 415         'GCA' => 'A',   # Ala
 416         'GCG' => 'A',   # Ala
 417         'GAT' => 'D',   # Asp
 418         'GAC' => 'D',   # Asp
 419         'GAA' => 'E',   # Glu
 420         'GAG' => 'E',   # Glu
 421         'GGT' => 'G',   # Gly
 422         'GGC' => 'G',   # Gly
 423         'GGA' => 'G',   # Gly
 424         'GGG' => 'G',   # Gly
 425     );
 426
 427     $aa = $hash{ uc $codon };
 428
 429     return $aa;
 430 }
 431
 432
 433 sub translate
 434 {
 435     # Martin A. Hansen, June 2005.
 436
 437     # translates a dna sequence to protein according to a optional given
 438     # frame.
 439
 440     my ( $dna,     # dna sequence
 441          $frame,   # frame of translation - OPTIONAL
 442        ) = @_;
 443
 444     # returns string
 445
 446     my ( $codon, $pos, $pep );
 447
 448     $frame ||= 1;
 449
 450     if ( $frame =~ /-?[1-3]/ )
 451     {
 452         if ( $frame < 0 ) {
 453             $dna = Maasha::Seq::dna_revcomp( $dna );
 454         }
 455
 456         $frame = abs( $frame ) - 1;
 457
 458         $dna =~ s/^.{${frame}}//;
 459     }
 460     else
 461     {
 462         Maasha::Common::error( qq(Badly formated frame "$frame") );
 463     }
 464
 465     $pos = 0;
 466
 467     while ( $codon = substr $dna, $pos, 3 )
 468     {
 469         last if not length $codon == 3;
 470
 471         $pep .= codon2aa( $codon );
 472
 473         $pos += 3;
 474     }
 475
 476     return $pep;
 477 }
 478
 479
 480 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> RNA FOLDING <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 481
 482
 483 sub fold_struct_rnafold
 484 {
 485     # Martin A. Hansen, February 2008.
 486
 487     # Given a squence fold this using RNAfold.
 488
 489     my ( $seq,   # sequence to fold
 490        ) = @_;
 491
 492     # Returns a tuple of fold string and free energy.
 493
 494     my ( $pid, $fh_out, $fh_in, @lines, $struct, $energy );
 495
 496     $pid = open2( $fh_out, $fh_in, "RNAfold -noPS" );
 497
 498     Maasha::Fasta::put_entry( [ "RNAfold", $seq ], $fh_in );
 499
 500     close $fh_in;
 501
 502     @lines = <$fh_out>;
 503
 504     close $fh_out;
 505
 506     waitpid $pid, 0;
 507
 508     chomp @lines;
 509
 510     if ( $lines[ - 1 ] =~ /^([^ ]+) \((.+)\)$/ )
 511     {
 512         $struct = $1;
 513         $energy = $2;
 514     }
 515
 516     return wantarray ? ( $struct, $energy ) : [ $struct, $energy ];
 517 }
 518
 519
 520 sub fold_struct_contrastruct
 521 {
 522     # Martin A. Hansen, February 2008.
 523
 524     # Given a sequence fold this using Contrafold.
 525
 526     my ( $seq,       # sequence to fold
 527          $tmp_dir,   # temporary directory - OPTIONAL
 528        ) = @_;
 529
 530     # Returns a tuple of fold string and temp index.
 531
 532     my ( $tmp_file, $out_file1, $out_file2, $fh, $line, $struct, @AoA, $i, $temp, $index );
 533
 534     $tmp_dir ||= $ENV{ 'BP_TMP' };
 535
 536     $tmp_file  = "$tmp_dir/fold.fna";
 537     $out_file1 = "$tmp_dir/fold.out1";
 538     $out_file2 = "$tmp_dir/fold.out2";
 539
 540     Maasha::Fasta::put_entries( [ [ "fold", $seq ] ], $tmp_file );
 541
 542     Maasha::Common::run( "contrafold", "predict --parens $out_file1 --bpseq $out_file2 $tmp_file" );
 543
 544     unlink $tmp_file;
 545
 546     $fh = Maasha::Common::read_open( $out_file1 );
 547
 548     while ( $line = <$fh> )
 549     {
 550         chomp $line;
 551
 552         $struct = $line;
 553     }
 554
 555     close $fh;
 556
 557     unlink $out_file1;
 558
 559     $fh = Maasha::Common::read_open( $out_file2 );
 560
 561     while ( $line = <$fh> )
 562     {
 563         chomp $line;
 564
 565         push @AoA, [ split " ", $line ];
 566     }
 567
 568     close $fh;
 569
 570     unlink $out_file2;
 571
 572     for ( $i = 0; $i < @AoA; $i++ )
 573     {
 574         if ( $AoA[ $i ]->[ 2 ] != 0 )
 575         {
 576             last if $AoA[ $i ]->[ 0 ] > $AoA[ $i ]->[ 2 ];
 577
 578             $temp += base_pair_melting_temp( $AoA[ $i ]->[ 1 ] . $AoA[ $AoA[ $i ]->[ 2 ] - 1 ]->[ 1 ] );
 579         }
 580     }
 581
 582     $index = sprintf( "%.2f", $temp / length $seq );
 583
 584     return wantarray ? ( $struct, $index ) : [ $struct, $index ];
 585 }
 586
 587
 588 sub base_pair_melting_temp
 589 {
 590     # Martin A. Hansen, February 2008.
 591
 592     # Given a basepair, returns the melting temperature.
 593
 594     my ( $bp,   # basepair string
 595        ) = @_;
 596
 597     # Returns integer
 598
 599     my ( %melt_hash );
 600
 601     %melt_hash = (
 602         AA => 0,
 603         AT => 2,
 604         AC => 0,
 605         AG => 0,
 606         AU => 2,
 607         TA => 2,
 608         TT => 0,
 609         TC => 0,
 610         TG => 1, #
 611         TU => 0,
 612         CA => 0,
 613         CT => 0,
 614         CC => 0,
 615         CG => 4,
 616         CU => 0,
 617         GA => 0,
 618         GT => 1, #
 619         GC => 4,
 620         GG => 0,
 621         GU => 1, #
 622         UA => 2,
 623         UT => 0,
 624         UC => 0,
 625         UG => 1, #
 626         UU => 0,
 627     );
 628
 629     return $melt_hash{ uc $bp };
 630 }
 631
 632
 633 sub generate_dna_oligos
 634 {
 635     # Martin A. Hansen, April 2007.
 636
 637     # Generates all possible DNA oligos of a given wordsize.
 638
 639     # alternative way: perl -MData::Dumper -e '@CONV = glob( "{T,C,A,G}" x 4 ); print Dumper( \@CONV )'
 640
 641
 642     my ( $wordsize,   # size of DNA oligos
 643        ) = @_;
 644
 645     # Returns list
 646
 647     my ( @alph, @oligos, $oligo, $char, @list );
 648
 649     @alph   = ( qw( A T C G N ) );
 650     @oligos = ( '' );
 651
 652     for ( 1 .. $wordsize )
 653     {
 654         foreach $oligo ( @oligos )
 655         {
 656             foreach $char ( @alph ) {
 657                 push @list, $oligo . $char;
 658             }
 659         }
 660
 661         @oligos = @list;
 662
 663         undef @list;
 664     }
 665
 666     return wantarray ? @oligos : \@oligos;
 667 }
 668
 669
 670 sub seq2oligos
 671 {
 672     # Martin A. Hansen, April 2007
 673
 674     # Given a sequence and a wordsize,
 675     # breaks the sequence into overlapping
 676     # oligos of that wordsize.
 677
 678     my ( $seq,       # sequence reference
 679          $wordsize,  # wordsize
 680        ) = @_;
 681
 682     # returns list
 683
 684     my ( $i, $oligo, @oligos );
 685
 686     for ( $i = 0; $i < length( ${ $seq } ) - $wordsize + 1; $i++ )
 687     {
 688         $oligo = substr ${ $seq }, $i, $wordsize;
 689
 690         push @oligos, $oligo;
 691     }
 692
 693     return wantarray ? @oligos : \@oligos;
 694 }
 695
 696
 697 sub seq2oligos_uniq
 698 {
 699     # Martin A. Hansen, April 2007
 700
 701     # Given a sequence and a wordsize,
 702     # breaks the sequence into overlapping
 703     # oligos of that wordsize and return
 704     # only unique words.
 705
 706     my ( $seq,       # sequence reference
 707          $wordsize,  # wordsize
 708        ) = @_;
 709
 710     # returns list
 711
 712     my ( $i, $oligo, %lookup, @oligos );
 713
 714     for ( $i = 0; $i < length( ${ $seq } ) - $wordsize + 1; $i++ )
 715     {
 716         $oligo = substr ${ $seq }, $i, $wordsize;
 717
 718         if ( not exists $lookup{ $oligo } )
 719         {
 720             push @oligos, $oligo;
 721             $lookup{ $oligo } = 1;
 722         }
 723     }
 724
 725     return wantarray ? @oligos : \@oligos;
 726 }
 727
 728
 729 sub oligo_freq
 730 {
 731     # Martin A. Hansen, August 2007.
 732
 733     # Given a hashref with oligo=>count, calculates
 734     # a frequency table. Returns a list of hashes
 735
 736     my ( $oligo_freq,   # hashref
 737        ) = @_;
 738
 739     # Returns data structure
 740
 741     my ( @freq_table, $total );
 742
 743     $total = 0;
 744
 745     map { push @freq_table, { OLIGO => $_, COUNT => $oligo_freq->{ $_ } }; $total += $oligo_freq->{ $_ } } keys %{ $oligo_freq };
 746
 747     @freq_table = sort { $b->{ "COUNT" } <=> $a->{ "COUNT" } or $a->{ "OLIGO" } cmp $b->{ "OLIGO" } } @freq_table;
 748
 749     map { $_->{ "FREQ" } = sprintf( "%.4f", $_->{ "COUNT" } / $total ) } @freq_table;
 750
 751     return wantarray ? return @freq_table : \@freq_table;
 752 }
 753
 754
 755 sub seq_generate
 756 {
 757     # Martin A. Hansen, May 2007
 758
 759     # Generates a random sequence given a sequence length
 760     # and a alphabet.
 761
 762     my ( $len,    # sequence length
 763          $alph,   # sequence alphabet
 764        ) = @_;
 765
 766     # returns string
 767
 768     my ( $alph_len, $i, $seq );
 769
 770     $alph_len = scalar @{ $alph };
 771
 772     for ( $i = 0; $i < $len; $i++ ) {
 773         $seq .= $alph->[ int( rand( $alph_len ) ) ];
 774     }
 775
 776     return $seq;
 777 }
 778
 779
 780 sub seq_shuffle
 781 {
 782     # Martin A. Hansen, December 2007.
 783
 784     # Shuffles sequence of a given string.
 785
 786     my ( $seq,   # sequence string
 787        ) = @_;
 788
 789     # Returns string.
 790
 791     my ( @list );
 792
 793     @list = split "", $seq;
 794
 795     return join "", shuffle( @list );
 796 }
 797
 798
 799 sub seq_alph
 800 {
 801     # Martin A. Hansen, May 2007.
 802
 803     # returns a requested alphabet
 804
 805     my ( $type,   # alphabet type
 806        ) = @_;
 807
 808     # returns list
 809
 810     my ( @alph );
 811
 812     if ( $type =~ /^dna$/i ) {
 813         @alph = qw( A T C G );
 814     } elsif ( $type =~ /^rna$/i ) {
 815         @alph = qw( A U C G );
 816     } elsif ( $type =~ /^prot/i ) {
 817         @alph = qw( F L S Y C W P H Q R I M T N K V A D E G );
 818     } else {
 819         die qq(ERROR: Unknown alphabet type: "$type"\n);
 820     }
 821
 822     return wantarray ? @alph : \@alph;
 823 }
 824
 825
 826 sub seq_analyze
 827 {
 828     # Martin A. Hansen, August 2007.
 829
 830     # Analyses the sequence composition of a given sequence.
 831
 832     my ( $seq,   # sequence to analyze
 833        ) = @_;
 834
 835     # Returns hash
 836
 837     my ( %analysis, @chars, @chars_lc, $char, %char_hash, $gc, $at, $lc, $max, $res_sum, @indels, %indel_hash );
 838
 839     $analysis{ "SEQ_TYPE" } = uc Maasha::Seq::seq_guess_type( $seq );
 840     $analysis{ "SEQ_LEN" }  = length $seq;
 841
 842     @indels = qw( - ~ . _ );
 843
 844     if ( $analysis{ "SEQ_TYPE" } eq "DNA" )
 845     {
 846         @chars    = split //, "AGCUTRYWSMKHDVBNagcutrywsmkhdvbn";
 847         @chars_lc = split //, "agcutrywsmkhdvbn";
 848     }
 849     elsif ( $analysis{ "SEQ_TYPE" } eq "RNA" )
 850     {
 851         @chars    = split //, "AGCUTRYWSMKHDVBNagcutrywsmkhdvbn";
 852         @chars_lc = split //, "agcutrywsmkhdvbn";
 853     }
 854     else
 855     {
 856         @chars    = split //, "FLSYCWPHQRIMTNKVADEGflsycwphqrimtnkvadeg";
 857         @chars_lc = split //, "flsycwphqrimtnkvadeg";
 858     }
 859
 860     @char_hash{ @chars }   = map { eval "scalar \$seq =~ tr/$_//" } @chars;
 861     @indel_hash{ @indels } = map { eval "scalar \$seq =~ tr/$_//" } @indels;
 862
 863     if ( $analysis{ "SEQ_TYPE" } =~ /DNA|RNA/ )
 864     {
 865         $gc = $char_hash{ "g" } + $char_hash{ "G" } + $char_hash{ "c" } + $char_hash{ "C" };
 866         $at = $char_hash{ "a" } + $char_hash{ "A" } + $char_hash{ "t" } + $char_hash{ "T" } + $char_hash{ "u" } + $char_hash{ "U" };
 867
 868         $analysis{ "GC%" } = sprintf( "%.2f", 100 * $gc / $analysis{ "SEQ_LEN" } );
 869
 870         map { $lc += $char_hash{ lc $_ } } @chars_lc;
 871
 872         $analysis{ "SOFT_MASK%" } = sprintf( "%.2f", 100 * $lc / $analysis{ "SEQ_LEN" } );
 873         $analysis{ "HARD_MASK%" } = sprintf( "%.2f", 100 * ( $char_hash{ "n" } + $char_hash{ "N" } ) / $analysis{ "SEQ_LEN" } );
 874     }
 875
 876     $max = 0;
 877
 878     foreach $char ( @chars_lc )
 879     {
 880         $char = uc $char;
 881
 882         $char_hash{ $char } += $char_hash{ lc $char };
 883
 884         $analysis{ "RES:$char" } = $char_hash{ $char };
 885
 886         $max = $char_hash{ $char } if $char_hash{ $char } > $max;
 887
 888         $analysis{ "RES_SUM" } += $char_hash{ $char };
 889     }
 890
 891     map { $analysis{ "RES:$_" } = $indel_hash{ $_ } } @indels;
 892
 893     $analysis{ "MIX_INDEX" } = sprintf( "%.2f", $max / $analysis{ "SEQ_LEN" } );
 894     $analysis{ "MELT_TEMP" } = sprintf( "%.2f", 4 * $gc + 2 * $at );
 895
 896     return wantarray ? %analysis : \%analysis;
 897 }
 898
 899
 900 sub seq_complexity
 901 {
 902     # Martin A. Hansen, May 2008.
 903
 904     # Given a sequence computes a complexity index
 905     # as the most common di-residue over
 906     # the sequence length. Return ~1 if the entire
 907     # sequence is homopolymeric. Above 0.4 indicates
 908     # low complexity sequence.
 909
 910     my ( $seq,   # sequence
 911        ) = @_;
 912
 913     # Returns float.
 914
 915     my ( $len, $i, $max, $di, %hash );
 916
 917     $seq = uc $seq;
 918     $len = length $seq;
 919     $max = 0;
 920
 921     for ( $i = 0; $i < $len - 1; $i++ ) {
 922         $hash{ substr $seq, $i, 2  }++;
 923     }
 924
 925     foreach $di ( keys %hash ) {
 926         $max = $hash{ $di } if $hash{ $di } > $max;
 927     }
 928
 929     return $max / $len;
 930 }
 931
 932
 933 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SEQLOGO <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 934
 935
 936 sub seqlogo_calc
 937 {
 938     # Martin A. Hansen, January 2007.
 939
 940     # given max bit size and a list of aligned entries
 941     # in FASTA format, calculates for each sequence position
 942     # the height of the letters in bits.
 943     # returns a data structure with [ letter, height ] tuples
 944     # for all letters at each position.
 945
 946     my ( $bit_max,   # maximum bit height
 947          $entries,   # FASTA entries
 948        ) = @_;
 949
 950     # returns data structure
 951
 952     my ( $logo_len, $char_tot, $i, %char_hash, $bit_height, $bit_diff, $char_heights, @logo );
 953
 954     $logo_len = length $entries->[ 0 ]->[ 1 ];
 955     $char_tot = scalar @{ $entries };
 956
 957     for ( $i = 0; $i < $logo_len; $i++ )
 958     {
 959         undef %char_hash;
 960
 961         map { $char_hash{ uc substr( $_->[ 1 ], $i, 1 ) }++ } @{ $entries };
 962
 963         delete $char_hash{ "-" };
 964         delete $char_hash{ "_" };
 965         delete $char_hash{ "~" };
 966         delete $char_hash{ "." };
 967
 968         $bit_height = seqlogo_calc_bit_height( \%char_hash, $char_tot );
 969
 970         $bit_diff = $bit_max - $bit_height;
 971
 972         $char_heights = seqlogo_calc_char_heights( \%char_hash, $char_tot, $bit_diff );
 973
 974         push @logo, $char_heights;
 975     }
 976
 977     return wantarray ? @logo : \@logo;
 978 }
 979
 980
 981 sub seqlogo_calc_bit_height
 982 {
 983     # Martin A. Hansen, January 2007.
 984
 985     # calculates the bit height using Shannon's famous
 986     # general formula for uncertainty as documentet:
 987     # http://www.ccrnp.ncifcrf.gov/~toms/paper/hawaii/latex/node5.html
 988
 989     my ( $char_hash,   # hashref with chars and frequencies
 990          $tot,         # total number of chars
 991        ) = @_;
 992
 993     # returns float
 994
 995     my ( $char, $freq, $bit_height );
 996
 997     foreach $char ( keys %{ $char_hash } )
 998     {
 999         $freq = $char_hash->{ $char } / $tot;
1000
1001         $bit_height += $freq * ( log( $freq ) / log( 2 ) );
1002     }
1003
1004     $bit_height *= -1;
1005
1006     return $bit_height;
1007 }
1008
1009
1010 sub seqlogo_calc_char_heights
1011 {
1012     # Martin A. Hansen, January 2007.
1013
1014     # calculates the hight of each char in bits, and sorts
1015     # according to height.
1016
1017     my ( $char_hash,   # hashref with chars and frequencies
1018          $tot,         # tot number of chars
1019          $bit_diff,    # information gained from uncertainties
1020        ) = @_;
1021
1022     # returns list of tuples
1023
1024     my ( $char, $freq, $char_height, @char_heights );
1025
1026     foreach $char ( keys %{ $char_hash } )
1027     {
1028         $freq = $char_hash->{ $char } / $tot;
1029
1030         $char_height = $freq * $bit_diff;   # char height in bits
1031
1032         push @char_heights, [ $char, $char_height ];
1033     }
1034
1035     @char_heights = sort { $a->[ 1 ] <=> $b->[ 1 ] } @char_heights;
1036
1037     return wantarray ? @char_heights : \@char_heights;
1038 }
1039
1040
1041 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> RESIDUE COLORS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1042
1043
1044 sub color_pep
1045 {
1046     # Martin A. Hansen, October 2005.
1047
1048     # color scheme for proteins as defined in Mview.
1049     # given a char returns the appropriate color.
1050     # The amino acids are colored according to physicochemical properties:
1051     # bright green = hydrophobic; dark green = large hydrophobic;
1052     # bright blue = negative charge; red = positive charge;
1053     # dull blue = small alcohol; purple = polar; yellow = cysteine.
1054
1055     my ( $char,   # char to decorate
1056        ) = @_;
1057
1058     # returns string
1059
1060     my ( %hash, $color_set );
1061
1062     %hash = (
1063         K   => "bright-red",
1064         R   => "bright-red",
1065         H   => "dark-green",
1066         D   => "bright-blue",
1067         E   => "bright-blue",
1068         S   => "dull-blue",
1069         T   => "dull-blue",
1070         N   => "purple",
1071         Q   => "purple",
1072         A   => "bright-green",
1073         V   => "bright-green",
1074         I   => "bright-green",
1075         L   => "bright-green",
1076         M   => "bright-green",
1077         F   => "dark-green",
1078         Y   => "dark-green",
1079         W   => "dark-green",
1080         C   => "yellow",
1081         G   => "bright-green",
1082         P   => "bright-green",
1083         Z   => "dark-gray",
1084         B   => "dark-gray",
1085         "?" => "light-gray",
1086         "~" => "light-gray",
1087         "*" => "dark-gray",
1088     );
1089
1090     if ( exists $hash{ uc $char } ) {
1091         $color_set = $hash{ uc $char };
1092     } else {
1093         $color_set = "black";
1094     }
1095
1096     return $color_set;
1097 }
1098
1099
1100 sub color_nuc
1101 {
1102     # Martin A. Hansen, October 2005.
1103
1104     # color scheme for nucleotides as defined in Mview.
1105     # given a char returns the appropriate color
1106     # according to physical/chemical proterties.
1107
1108     my ( $char,   # char to decorate
1109        ) = @_;
1110
1111     # returns string
1112
1113     my ( %hash, $color_set );
1114
1115     %hash = (
1116         A => "bright-red",
1117         G => "yellow",
1118         C => "blue",
1119         T => "green",
1120         U => "green",
1121     );
1122
1123     if ( exists $hash{ uc $char } ) {
1124         $color_set = $hash{ uc $char };
1125     } else {
1126         $color_set = "black";
1127     }
1128
1129     return $color_set;
1130 }
1131
1132
1133 sub color_palette
1134 {
1135     # Martin A. Hansen, October 2005.
1136
1137     # hash table with color-names and color-hex.
1138
1139     my ( $color,   # common color name
1140        ) = @_;
1141
1142     # returns string
1143
1144     my ( %hash );
1145
1146     %hash = (
1147         "black"                => "#000000",
1148         "white"                => "#ffffff",
1149         "red"                  => "#ff0000",
1150         "green"                => "#00ff00",
1151         "blue"                 => "#0000ff",
1152         "cyan"                 => "#00ffff",
1153         "magenta"              => "#ff00ff",
1154 #        "yellow"               => "#ffff00",
1155         "yellow"               => "#ffc800",
1156         "purple"               => "#6600cc",
1157         "dull-blue"            => "#0099ff",
1158         "dark-green-blue"      => "#33cccc",
1159         "medium-green-blue"    => "#00ffcc",
1160         "bright-blue"          => "#0033ff",
1161         "dark-green"           => "#009900",
1162         "bright-green"         => "#33cc00",
1163         "orange"               => "#ff3333",
1164         "orange-brown"         => "#cc6600",
1165         "bright-red"           => "#cc0000",
1166         "light-gray"           => "#999999",
1167         "dark-gray"            => "#666666",
1168         "gray0"                => "#ffffff",
1169         "gray1"                => "#eeeeee",
1170         "gray2"                => "#dddddd",
1171         "gray3"                => "#cccccc",
1172         "gray4"                => "#bbbbbb",
1173         "gray5"                => "#aaaaaa",
1174         "gray6"                => "#999999",
1175         "gray7"                => "#888888",
1176         "gray8"                => "#777777",
1177         "gray9"                => "#666666",
1178         "gray10"               => "#555555",
1179         "gray11"               => "#444444",
1180         "gray12"               => "#333333",
1181         "gray13"               => "#222222",
1182         "gray14"               => "#111111",
1183         "gray15"               => "#000000",
1184         "clustal-red"          => "#ff1111",
1185         "clustal-blue"         => "#1155ff",
1186         "clustal-green"        => "#11dd11",
1187         "clustal-cyan"         => "#11ffff",
1188         "clustal-yellow"       => "#ffff11",
1189         "clustal-orange"       => "#ff7f11",
1190         "clustal-pink"         => "#ff11ff",
1191         "clustal-purple"       => "#6611cc",
1192         "clustal-dull-blue"    => "#197fe5",
1193         "clustal-dark-gray"    => "#666666",
1194         "clustal-light-gray"   => "#999999",
1195         "lin-A"                => "#90fe23",
1196         "lin-R"                => "#fe5e2d",
1197         "lin-N"                => "#2e3d2d",
1198         "lin-D"                => "#00903b",
1199         "lin-C"                => "#004baa",
1200         "lin-Q"                => "#864b00",
1201         "lin-E"                => "#3fa201",
1202         "lin-G"                => "#10fe68",
1203         "lin-H"                => "#b2063b",
1204         "lin-I"                => "#04ced9",
1205         "lin-L"                => "#4972fe",
1206         "lin-K"                => "#c4a100",
1207         "lin-M"                => "#2a84dd",
1208         "lin-F"                => "#a60ade",
1209         "lin-P"                => "#fe61fe",
1210         "lin-S"                => "#f7e847",
1211         "lin-T"                => "#fefeb3",
1212         "lin-W"                => "#4a007f",
1213         "lin-Y"                => "#e903a8",
1214         "lin-V"                => "#5bfdfd",
1215     );
1216
1217     if ( exists $hash{ $color } ) {
1218         return $hash{ $color };
1219     } else {
1220         print STDERR qq(WARNING: color "$color" not found in palette!\n);
1221     }
1222 }
1223
1224
1225 sub color_contrast
1226 {
1227     # Martin A. Hansen, October 2005.
1228
1229     # Hash table with contrast colors to be used for frontground
1230     # text on a given background color.
1231
1232     my ( $color,    # background color
1233        ) = @_;
1234
1235     # returns string
1236
1237     my ( %hash );
1238
1239     %hash = (
1240         "black"                => "white",
1241         "white"                => "black",
1242         "red"                  => "white",
1243         "green"                => "white",
1244         "blue"                 => "white",
1245         "cyan"                 => "white",
1246         "magenta"              => "white",
1247         "yellow"               => "black",
1248         "purple"               => "white",
1249         "dull-blue"            => "white",
1250         "dark-green-blue"      => "white",
1251         "medium-green-blue"    => "white",
1252         "bright-blue"          => "white",
1253         "dark-green"           => "white",
1254         "bright-green"         => "black",
1255         "orange"               => "",
1256         "orange-brown"         => "",
1257         "bright-red"           => "white",
1258         "light-gray"           => "black",
1259         "dark-gray"            => "white",
1260         "gray0"                => "",
1261         "gray1"                => "",
1262         "gray2"                => "",
1263         "gray3"                => "",
1264         "gray4"                => "",
1265         "gray5"                => "",
1266         "gray6"                => "",
1267         "gray7"                => "",
1268         "gray8"                => "",
1269         "gray9"                => "",
1270         "gray10"               => "",
1271         "gray11"               => "",
1272         "gray12"               => "",
1273         "gray13"               => "",
1274         "gray14"               => "",
1275         "gray15"               => "",
1276         "clustal-red"          => "black",
1277         "clustal-blue"         => "black",
1278         "clustal-green"        => "black",
1279         "clustal-cyan"         => "black",
1280         "clustal-yellow"       => "black",
1281         "clustal-orange"       => "black",
1282         "clustal-pink"         => "black",
1283         "clustal-purple"       => "black",
1284         "clustal-dull-blue"    => "black",
1285         "clustal-dark-gray"    => "black",
1286         "clustal-light-gray"   => "black",
1287         "lin-A"                => "",
1288         "lin-R"                => "",
1289         "lin-N"                => "",
1290         "lin-D"                => "",
1291         "lin-C"                => "",
1292         "lin-Q"                => "",
1293         "lin-E"                => "",
1294         "lin-G"                => "",
1295         "lin-H"                => "",
1296         "lin-I"                => "",
1297         "lin-L"                => "",
1298         "lin-K"                => "",
1299         "lin-M"                => "",
1300         "lin-F"                => "",
1301         "lin-P"                => "",
1302         "lin-S"                => "",
1303         "lin-T"                => "",
1304         "lin-W"                => "",
1305         "lin-Y"                => "",
1306         "lin-V"                => "",
1307     );
1308
1309     if ( exists $hash{ $color } ) {
1310         return $hash{ $color };
1311     } else {
1312         print STDERR qq(WARNING: color "$color" not found in palette!\n);
1313     }
1314 }
1315
1316
1317 sub seq_word_pack
1318 {
1319     # Martin A. Hansen, April 2008.
1320
1321     # Packs a sequence word into a binary number.
1322
1323     my ( $word,        # Word to be packed
1324        ) = @_;
1325
1326     # Returns integer.
1327
1328     my ( %hash, $bin, $word_size, $pad );
1329
1330     %hash = (
1331         'A' => '000',
1332         'T' => '001',
1333         'C' => '010',
1334         'G' => '100',
1335         'N' => '011',
1336         '-' => '101',
1337         '.' => '110',
1338         '~' => '111',
1339     );
1340
1341     map { $bin .= pack "B3", $hash{ $_ } } split //, $word;
1342
1343     $word_size = length $word;
1344
1345     $pad = ( 3 * $word_size ) / 8;
1346
1347     if ( $pad =~ /\./ )
1348     {
1349         $pad = ( ( int $pad + 1 ) * 8 ) - 3 * $word_size;
1350
1351         $bin .= pack "B$pad", 0 x $pad;
1352     }
1353
1354     return $bin;
1355 }
1356
1357
1358 sub seq_word_unpack
1359 {
1360     # Martin A. Hansen, April 2008.
1361
1362     # Unpacks a binary sequence word to ASCII.
1363
1364     my ( $bin,         # Binary sequence word
1365          $word_size,   # Size of word
1366        ) = @_;
1367
1368     # Returns string.
1369
1370     my ( %hash, $word );
1371
1372     %hash = (
1373         '000' => 'A',
1374         '001' => 'T',
1375         '010' => 'C',
1376         '100' => 'G',
1377         '011' => 'N',
1378         '101' => '-',
1379         '110' => '.',
1380         '111' => '~',
1381     );
1382
1383     map { $word .= $hash{ $_ } } unpack "(B3)$word_size", $bin;
1384
1385     return $word;
1386 }
1387
1388
1389 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> ADAPTOR LOCATING <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1390
1391
1392 ###############################    REDUNDANT   ##############################
1393
1394 # these functions have been replaced by index_m and match_m in Common.pm
1395
1396 sub find_adaptor
1397 {
1398     # Martin A. Hansen & Selene Fernandez, August 2008
1399
1400     # Locates an adaptor sequence in a given sequence
1401     # starting from a given offset position allowing a given
1402     # number of mismatches (no indels).
1403
1404     my ( $adaptor,       # list of chars
1405          $tag,           # list of chars
1406          $adaptor_len,   # length of adaptor sequence
1407          $tag_len,       # length of tag sequence
1408          $tag_offset,    # offset position
1409          $max_match,     # number of matches indicating success
1410          $max_mismatch,  # number of mismatches
1411        ) = @_;
1412
1413     # Returns an integer.
1414
1415     my ( $i, $j, $match, $mismatch );
1416
1417     $i = $tag_offset;
1418
1419     while ( $i < $tag_len - ( $max_match + $max_mismatch ) + 1 )
1420     {
1421         $j = 0;
1422
1423         while ( $j < $adaptor_len - ( $max_match + $max_mismatch ) + 1 )
1424         {
1425             return $i if check_diag( $adaptor, $tag, $adaptor_len, $tag_len, $j, $i, $max_match, $max_mismatch );
1426
1427             $j++
1428         }
1429
1430         $i++;
1431     }
1432
1433     return -1;
1434 }
1435
1436
1437 sub check_diag
1438 {
1439     # Martin A. Hansen & Selene Fernandez, August 2008
1440
1441     # Checks the diagonal starting at a given coordinate
1442     # of a search space constituted by an adaptor and a tag sequence.
1443     # Residues in the diagonal are compared between the sequences allowing
1444     # for a given number of mismatches. We terminate when search space is
1445     # exhausted or if max matches or mismatches is reached.
1446
1447     my ( $adaptor,       # list of chars
1448          $tag,           # list of chars
1449          $adaptor_len,   # length of adaptor sequence
1450          $tag_len,       # length of tag sequence
1451          $adaptor_beg,   # adaptor begin coordinate
1452          $tag_beg,       # tag begin coordinate
1453          $max_match,     # number of matches indicating success
1454          $max_mismatch,  # number of mismatches
1455        ) = @_;
1456
1457     # Returns boolean.
1458
1459     my ( $match, $mismatch );
1460
1461     $match    = 0;
1462     $mismatch = 0;
1463
1464     while ( $adaptor_beg <= $adaptor_len and $tag_beg <= $tag_len )
1465     {
1466         if ( $adaptor->[ $adaptor_beg ] eq $tag->[ $tag_beg ] )
1467         {
1468             $match++;
1469
1470             return 1 if $match >= $max_match;
1471         }
1472         else
1473         {
1474             $mismatch++;
1475
1476             return 0 if $mismatch > $max_mismatch;
1477         }
1478
1479         $adaptor_beg++;
1480         $tag_beg++;
1481     }
1482
1483     return 0;
1484 }
1485
1486
1487 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<