code_perl/Maasha/Seq.pm

   1 package Maasha::Seq;
   2
   3 # Copyright (C) 2007 Martin A. Hansen.
   4
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; either version 2
   8 # of the License, or (at your option) any later version.
   9
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18
  19 # http://www.gnu.org/copyleft/gpl.html
  20
  21
  22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  23
  24
  25 # yak yak yak
  26
  27
  28 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  29
  30
  31 use strict;
  32 use Data::Dumper;
  33 use IPC::Open2;
  34 use List::Util qw( shuffle );
  35 use Time::HiRes qw( gettimeofday );
  36
  37 use vars qw ( @ISA @EXPORT );
  38
  39 @ISA = qw( Exporter );
  40
  41 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  42
  43
  44 sub seq_guess_type
  45 {
  46     # Martin A. Hansen, May 2007.
  47
  48     # Makes a qualified guess on the type of a given squence.
  49
  50     my ( $seq,   # sequence to check
  51        ) = @_;
  52
  53     # returns string.
  54
  55     my ( $check_seq, $count );
  56
  57     if ( length $seq > 100 ) {
  58         $check_seq = substr $seq, 0, 100;
  59     } else {
  60         $check_seq = $seq;
  61     }
  62
  63     if ( $count = $check_seq =~ tr/FLPQIEflpqie// and $count > 0 ) {
  64         return "protein";
  65     } elsif ( $count = $check_seq =~ tr/Uu// and $count > 0 ) {
  66         return "rna";
  67     } else {
  68         return "dna";
  69     }
  70 }
  71
  72
  73 sub wrap
  74 {
  75     # Martin A. Hansen, July 2007.
  76
  77     # Wraps a given string reference accoring to given width.
  78
  79     my ( $strref,   # ref to sting to wrap
  80          $wrap,     # wrap width
  81        ) = @_;
  82
  83     # Returns nothing.
  84
  85     ${ $strref } =~ s/(.{$wrap})/$1\n/g;
  86
  87     chomp ${ $strref };
  88 }
  89
  90
  91 sub dna_revcomp
  92 {
  93     # Niels Larsen
  94     # modified Martin A. Hansen, March 2005.
  95
  96     # Returns the reverse complement of a dna sequence with preservation of case
  97     # according to this mapping,
  98     #
  99     # AGCUTRYWSMKHDVBNagcutrywsmkhdvbn
 100     # TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn
 101
 102     my ( $seq,   # seq
 103        ) = @_;
 104
 105     # returns string
 106
 107     $seq = reverse $seq;
 108
 109     $seq =~ tr/AGCUTRYWSMKHDVBNagcutrywsmkhdvbn/TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn/;
 110
 111     return $seq;
 112 }
 113
 114
 115 sub rna_revcomp
 116 {
 117     # Niels Larsen
 118     # modified Martin A. Hansen, March 2005.
 119
 120     # Returns the complement of a rna sequence with preservation of case
 121     # according to this mapping,
 122     #
 123     # AGCUTRYWSMKHDVBNagcutrywsmkhdvbn
 124     # UCGAAYRWSKMDHBVNucgaayrwskmdhbvn
 125
 126     my ( $seq,   # seq
 127        ) = @_;
 128
 129     $seq = reverse $seq;
 130
 131     $seq =~ tr/AGCUTRYWSMKHDVBNagcutrywsmkhdvbn/UCGAAYRWSKMDHBVNucgaayrwskmdhbvn/;
 132
 133     return $seq;
 134 }
 135
 136
 137 sub dna_comp
 138 {
 139     # Niels Larsen
 140     # modified Martin A. Hansen, March 2005.
 141
 142     # Returns the reverse complement of a dna sequence with preservation of case
 143     # according to this mapping,
 144     #
 145     # AGCUTRYWSMKHDVBNagcutrywsmkhdvbn
 146     # TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn
 147
 148     my ( $seqref,   # seqref
 149        ) = @_;
 150
 151     # Returns nothing.
 152
 153     ${ $seqref } =~ tr/AGCUTRYWSMKHDVBNagcutrywsmkhdvbn/TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn/;
 154 }
 155
 156
 157 sub rna_comp
 158 {
 159     # Niels Larsen
 160     # modified Martin A. Hansen, March 2005.
 161
 162     # Returns the complement of a rna sequence with preservation of case
 163     # according to this mapping,
 164     #
 165     # AGCUTRYWSMKHDVBNagcutrywsmkhdvbn
 166     # UCGAAYRWSKMDHBVNucgaayrwskmdhbvn
 167
 168     my ( $seqref,   # seqref
 169        ) = @_;
 170
 171     # Returns nothing.
 172
 173     ${ $seqref } =~ tr/AGCUTRYWSMKHDVBNagcutrywsmkhdvbn/UCGAAYRWSKMDHBVNucgaayrwskmdhbvn/;
 174 }
 175
 176
 177 sub dna2rna
 178 {
 179     # Martin A. Hansen, March 2007
 180
 181     # Converts DNA sequence to RNA
 182
 183     my ( $seq,   # nucleotide sequence
 184        ) = @_;
 185
 186     # returns string
 187
 188     $seq =~ tr/Tt/Uu/;
 189
 190     return $seq;
 191 }
 192
 193
 194 sub rna2dna
 195 {
 196     # Martin A. Hansen, March 2007
 197
 198     # Converts RNA sequence to DNA
 199
 200     my ( $seq,   # nucleotide sequence
 201        ) = @_;
 202
 203     # returns string
 204
 205     $seq =~ tr/Uu/Tt/;
 206
 207     return $seq;
 208 }
 209
 210
 211 sub nuc2ambiguity
 212 {
 213     # Martin A. Hansen, March 2005.
 214
 215     # given a string of nucleotides
 216     # returns the corresponding ambiguity code
 217
 218     my ( $str,
 219          $type,    # DNA or RNA - DEFAULT DNA
 220        ) = @_;
 221
 222     my ( %hash, @nts, $key, $code, %nt_hash );
 223
 224     $str = uc $str;
 225
 226     if ( not $type or $type =~ /dna/i )
 227     {
 228         $str =~ s/N/ACGT/g;
 229         $str =~ s/B/CGT/g;
 230         $str =~ s/D/AGT/g;
 231         $str =~ s/H/ACT/g;
 232         $str =~ s/V/ACG/g;
 233         $str =~ s/K/GT/g;
 234         $str =~ s/Y/CT/g;
 235         $str =~ s/S/CG/g;
 236         $str =~ s/W/AT/g;
 237         $str =~ s/R/AG/g;
 238         $str =~ s/M/AC/g;
 239     }
 240     else
 241     {
 242         $str =~ s/N/ACGU/g;
 243         $str =~ s/B/CGU/g;
 244         $str =~ s/D/AGU/g;
 245         $str =~ s/H/ACU/g;
 246         $str =~ s/V/ACG/g;
 247         $str =~ s/K/GU/g;
 248         $str =~ s/Y/CU/g;
 249         $str =~ s/S/CG/g;
 250         $str =~ s/W/AU/g;
 251         $str =~ s/R/AG/g;
 252         $str =~ s/M/AC/g;
 253     }
 254
 255     @nts = split //, $str;
 256
 257     %nt_hash = map { $_ => 1 } @nts;
 258
 259     @nts = sort keys %nt_hash;
 260
 261     $key = join "", @nts;
 262
 263     %hash = (
 264         'A'    => 'A',
 265         'C'    => 'C',
 266         'G'    => 'G',
 267         'T'    => 'T',
 268         'U'    => 'U',
 269         'AC'   => 'M',
 270         'AG'   => 'R',
 271         'AT'   => 'W',
 272         'AU'   => 'W',
 273         'CG'   => 'S',
 274         'CT'   => 'Y',
 275         'CU'   => 'Y',
 276         'GT'   => 'K',
 277         'GU'   => 'K',
 278         'ACG'  => 'V',
 279         'ACT'  => 'H',
 280         'ACU'  => 'H',
 281         'AGT'  => 'D',
 282         'AGU'  => 'D',
 283         'CGT'  => 'B',
 284         'CGU'  => 'B',
 285         'ACGT' => 'N',
 286         'ACGU' => 'N',
 287     );
 288
 289     $code = $hash{ $key };
 290
 291     warn qq(WARNING: No ambiguity code for key->$key\n) if not $code;
 292
 293     return $code;
 294 }
 295
 296
 297 sub aa2codons
 298 {
 299     # Martin A. Hansen, March 2005.
 300
 301     # given an amino acid, returns a list of corresponding codons
 302
 303     my ( $aa,   # amino acid to translate
 304        ) = @_;
 305
 306     # returns list
 307
 308     my ( %hash, $codons );
 309
 310     $aa = uc $aa;
 311
 312     %hash = (
 313         'F' => [ 'TTT', 'TTC' ],                               # Phe
 314         'L' => [ 'TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG' ],   # Leu
 315         'S' => [ 'TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC' ],   # Ser
 316         'Y' => [ 'TAT', 'TAC' ],                               # Tyr
 317         '*' => [ 'TAA', 'TAG', 'TGA' ],                        # Stop
 318         'X' => [ 'TAA', 'TAG', 'TGA' ],                        # Stop
 319         'C' => [ 'TGT', 'TGC' ],                               # Cys
 320         'W' => [ 'TGG' ],                                      # Trp
 321         'P' => [ 'CCT', 'CCC', 'CCA', 'CCG' ],                 # Pro
 322         'H' => [ 'CAT', 'CAC' ],                               # His
 323         'Q' => [ 'CAA', 'CAG' ],                               # Gln
 324         'R' => [ 'CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG' ],   # Arg
 325         'I' => [ 'ATT', 'ATC', 'ATA' ],                        # Ile
 326         'M' => [ 'ATG' ],                                      # Met
 327         'T' => [ 'ACT', 'ACC', 'ACA', 'ACG' ],                 # Thr
 328         'N' => [ 'AAT', 'AAC' ],                               # Asn
 329         'K' => [ 'AAA', 'AAG' ],                               # Lys
 330         'V' => [ 'GTT', 'GTC', 'GTA', 'GTG' ],                 # Val
 331         'A' => [ 'GCT', 'GCC', 'GCA', 'GCG' ],                 # Ala
 332         'D' => [ 'GAT', 'GAC' ],                               # Asp
 333         'E' => [ 'GAA', 'GAG' ],                               # Glu
 334         'G' => [ 'GGT', 'GGC', 'GGA', 'GGG' ],                 # Gly
 335     );
 336
 337     $codons = $hash{ $aa };
 338
 339     return wantarray ? @{ $codons } : $codons;
 340 }
 341
 342
 343 sub codon2aa
 344 {
 345     # Martin A. Hansen, March 2005.
 346
 347     # given a codon, returns the correponding
 348     # vertebrate amino acid.
 349
 350     my ( $codon,   # codon to translate
 351        ) = @_;
 352
 353     # returns string
 354
 355     my ( %hash, $aa );
 356
 357     die qq(ERROR: Bad codon: "$codon"\n) if not $codon =~ /[ATCGatcg]{3}/;
 358
 359     %hash = (
 360         'TTT' => 'F',   # Phe
 361         'TTC' => 'F',   # Phe
 362         'TTA' => 'L',   # Leu
 363         'TTG' => 'L',   # Leu
 364         'TCT' => 'S',   # Ser
 365         'TCC' => 'S',   # Ser
 366         'TCA' => 'S',   # Ser
 367         'TCG' => 'S',   # Ser
 368         'TAT' => 'Y',   # Tyr
 369         'TAC' => 'Y',   # Tyr
 370         'TAA' => '*',   # Stop
 371         'TAG' => '*',   # Stop
 372         'TGT' => 'C',   # Cys
 373         'TGC' => 'C',   # Cys
 374         'TGA' => '*',   # Stop
 375         'TGG' => 'W',   # Trp
 376         'CTT' => 'L',   # Leu
 377         'CTC' => 'L',   # Leu
 378         'CTA' => 'L',   # Leu
 379         'CTG' => 'L',   # Leu
 380         'CCT' => 'P',   # Pro
 381         'CCC' => 'P',   # Pro
 382         'CCA' => 'P',   # Pro
 383         'CCG' => 'P',   # Pro
 384         'CAT' => 'H',   # His
 385         'CAC' => 'H',   # His
 386         'CAA' => 'Q',   # Gln
 387         'CAG' => 'Q',   # Gln
 388         'CGT' => 'R',   # Arg
 389         'CGC' => 'R',   # Arg
 390         'CGA' => 'R',   # Arg
 391         'CGG' => 'R',   # Arg
 392         'ATT' => 'I',   # Ile
 393         'ATC' => 'I',   # Ile
 394         'ATA' => 'I',   # Ile
 395         'ATG' => 'M',   # Met
 396         'ACT' => 'T',   # Thr
 397         'ACC' => 'T',   # Thr
 398         'ACA' => 'T',   # Thr
 399         'ACG' => 'T',   # Thr
 400         'AAT' => 'N',   # Asn
 401         'AAC' => 'N',   # Asn
 402         'AAA' => 'K',   # Lys
 403         'AAG' => 'K',   # Lys
 404         'AGT' => 'S',   # Ser
 405         'AGC' => 'S',   # Ser
 406         'AGA' => 'R',   # Arg
 407         'AGG' => 'R',   # Arg
 408         'GTT' => 'V',   # Val
 409         'GTC' => 'V',   # Val
 410         'GTA' => 'V',   # Val
 411         'GTG' => 'V',   # Val
 412         'GCT' => 'A',   # Ala
 413         'GCC' => 'A',   # Ala
 414         'GCA' => 'A',   # Ala
 415         'GCG' => 'A',   # Ala
 416         'GAT' => 'D',   # Asp
 417         'GAC' => 'D',   # Asp
 418         'GAA' => 'E',   # Glu
 419         'GAG' => 'E',   # Glu
 420         'GGT' => 'G',   # Gly
 421         'GGC' => 'G',   # Gly
 422         'GGA' => 'G',   # Gly
 423         'GGG' => 'G',   # Gly
 424     );
 425
 426     $aa = $hash{ uc $codon };
 427
 428     return $aa;
 429 }
 430
 431
 432 sub translate
 433 {
 434     # Martin A. Hansen, June 2005.
 435
 436     # translates a dna sequence to protein according to a optional given
 437     # frame.
 438
 439     my ( $dna,     # dna sequence
 440          $frame,   # frame of translation - OPTIONAL
 441        ) = @_;
 442
 443     # returns string
 444
 445     my ( $codon, $pos, $pep );
 446
 447     $frame ||= 1;
 448
 449     if ( $frame =~ /-?[1-3]/ )
 450     {
 451         if ( $frame < 0 ) {
 452             $dna = Maasha::Seq::dna_revcomp( $dna );
 453         }
 454
 455         $frame = abs( $frame ) - 1;
 456
 457         $dna =~ s/^.{${frame}}//;
 458     }
 459     else
 460     {
 461         Maasha::Common::error( qq(Badly formated frame "$frame") );
 462     }
 463
 464     $pos = 0;
 465
 466     while ( $codon = substr $dna, $pos, 3 )
 467     {
 468         last if not length $codon == 3;
 469
 470         $pep .= codon2aa( $codon );
 471
 472         $pos += 3;
 473     }
 474
 475     return $pep;
 476 }
 477
 478
 479 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> RNA FOLDING <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 480
 481
 482 sub fold_struct_rnafold
 483 {
 484     # Martin A. Hansen, February 2008.
 485
 486     # Given a squence fold this using RNAfold.
 487
 488     my ( $seq,   # sequence to fold
 489        ) = @_;
 490
 491     # Returns a tuple of fold string and free energy.
 492
 493     my ( $pid, $fh_out, $fh_in, @lines, $struct, $energy );
 494
 495     $pid = open2( $fh_out, $fh_in, "RNAfold -noPS" );
 496
 497     Maasha::Fasta::put_entry( [ "RNAfold", $seq ], $fh_in );
 498
 499     close $fh_in;
 500
 501     @lines = <$fh_out>;
 502
 503     close $fh_out;
 504
 505     waitpid $pid, 0;
 506
 507     chomp @lines;
 508
 509     if ( $lines[ - 1 ] =~ /^([^ ]+) \((.+)\)$/ )
 510     {
 511         $struct = $1;
 512         $energy = $2;
 513     }
 514
 515     return wantarray ? ( $struct, $energy ) : [ $struct, $energy ];
 516 }
 517
 518
 519 sub fold_struct_contrastruct
 520 {
 521     # Martin A. Hansen, February 2008.
 522
 523     # Given a sequence fold this using Contrafold.
 524
 525     my ( $seq,       # sequence to fold
 526          $tmp_dir,   # temporary directory - OPTIONAL
 527        ) = @_;
 528
 529     # Returns a tuple of fold string and temp index.
 530
 531     my ( $tmp_file, $out_file1, $out_file2, $fh, $line, $struct, @AoA, $i, $temp, $index );
 532
 533     $tmp_dir ||= $ENV{ 'BP_TMP' };
 534
 535     $tmp_file  = "$tmp_dir/fold.fna";
 536     $out_file1 = "$tmp_dir/fold.out1";
 537     $out_file2 = "$tmp_dir/fold.out2";
 538
 539     Maasha::Fasta::put_entries( [ [ "fold", $seq ] ], $tmp_file );
 540
 541     Maasha::Common::run( "contrafold", "predict --parens $out_file1 --bpseq $out_file2 $tmp_file" );
 542
 543     unlink $tmp_file;
 544
 545     $fh = Maasha::Common::read_open( $out_file1 );
 546
 547     while ( $line = <$fh> )
 548     {
 549         chomp $line;
 550
 551         $struct = $line;
 552     }
 553
 554     close $fh;
 555
 556     unlink $out_file1;
 557
 558     $fh = Maasha::Common::read_open( $out_file2 );
 559
 560     while ( $line = <$fh> )
 561     {
 562         chomp $line;
 563
 564         push @AoA, [ split " ", $line ];
 565     }
 566
 567     close $fh;
 568
 569     unlink $out_file2;
 570
 571     for ( $i = 0; $i < @AoA; $i++ )
 572     {
 573         if ( $AoA[ $i ]->[ 2 ] != 0 )
 574         {
 575             last if $AoA[ $i ]->[ 0 ] > $AoA[ $i ]->[ 2 ];
 576
 577             $temp += base_pair_melting_temp( $AoA[ $i ]->[ 1 ] . $AoA[ $AoA[ $i ]->[ 2 ] - 1 ]->[ 1 ] );
 578         }
 579     }
 580
 581     $index = sprintf( "%.2f", $temp / length $seq );
 582
 583     return wantarray ? ( $struct, $index ) : [ $struct, $index ];
 584 }
 585
 586
 587 sub base_pair_melting_temp
 588 {
 589     # Martin A. Hansen, February 2008.
 590
 591     # Given a basepair, returns the melting temperature.
 592
 593     my ( $bp,   # basepair string
 594        ) = @_;
 595
 596     # Returns integer
 597
 598     my ( %melt_hash );
 599
 600     %melt_hash = (
 601         AA => 0,
 602         AT => 2,
 603         AC => 0,
 604         AG => 0,
 605         AU => 2,
 606         TA => 2,
 607         TT => 0,
 608         TC => 0,
 609         TG => 1, #
 610         TU => 0,
 611         CA => 0,
 612         CT => 0,
 613         CC => 0,
 614         CG => 4,
 615         CU => 0,
 616         GA => 0,
 617         GT => 1, #
 618         GC => 4,
 619         GG => 0,
 620         GU => 1, #
 621         UA => 2,
 622         UT => 0,
 623         UC => 0,
 624         UG => 1, #
 625         UU => 0,
 626     );
 627
 628     return $melt_hash{ uc $bp };
 629 }
 630
 631
 632 sub generate_dna_oligos
 633 {
 634     # Martin A. Hansen, April 2007.
 635
 636     # Generates all possible DNA oligos of a given wordsize.
 637
 638     # alternative way: perl -MData::Dumper -e '@CONV = glob( "{T,C,A,G}" x 4 ); print Dumper( \@CONV )'
 639
 640
 641     my ( $wordsize,   # size of DNA oligos
 642        ) = @_;
 643
 644     # Returns list
 645
 646     my ( @alph, @oligos, $oligo, $char, @list );
 647
 648     @alph   = ( qw( A T C G N ) );
 649     @oligos = ( '' );
 650
 651     for ( 1 .. $wordsize )
 652     {
 653         foreach $oligo ( @oligos )
 654         {
 655             foreach $char ( @alph ) {
 656                 push @list, $oligo . $char;
 657             }
 658         }
 659
 660         @oligos = @list;
 661
 662         undef @list;
 663     }
 664
 665     return wantarray ? @oligos : \@oligos;
 666 }
 667
 668
 669 sub seq2oligos
 670 {
 671     # Martin A. Hansen, April 2007
 672
 673     # Given a sequence and a wordsize,
 674     # breaks the sequence into overlapping
 675     # oligos of that wordsize.
 676
 677     my ( $seq,       # sequence reference
 678          $wordsize,  # wordsize
 679        ) = @_;
 680
 681     # returns list
 682
 683     my ( $i, $oligo, @oligos );
 684
 685     for ( $i = 0; $i < length( ${ $seq } ) - $wordsize + 1; $i++ )
 686     {
 687         $oligo = substr ${ $seq }, $i, $wordsize;
 688
 689         push @oligos, $oligo;
 690     }
 691
 692     return wantarray ? @oligos : \@oligos;
 693 }
 694
 695
 696 sub seq2oligos_uniq
 697 {
 698     # Martin A. Hansen, April 2007
 699
 700     # Given a sequence and a wordsize,
 701     # breaks the sequence into overlapping
 702     # oligos of that wordsize and return
 703     # only unique words.
 704
 705     my ( $seq,       # sequence reference
 706          $wordsize,  # wordsize
 707        ) = @_;
 708
 709     # returns list
 710
 711     my ( $i, $oligo, %lookup, @oligos );
 712
 713     for ( $i = 0; $i < length( ${ $seq } ) - $wordsize + 1; $i++ )
 714     {
 715         $oligo = substr ${ $seq }, $i, $wordsize;
 716
 717         if ( not exists $lookup{ $oligo } )
 718         {
 719             push @oligos, $oligo;
 720             $lookup{ $oligo } = 1;
 721         }
 722     }
 723
 724     return wantarray ? @oligos : \@oligos;
 725 }
 726
 727
 728 sub oligo_freq
 729 {
 730     # Martin A. Hansen, August 2007.
 731
 732     # Given a hashref with oligo=>count, calculates
 733     # a frequency table. Returns a list of hashes
 734
 735     my ( $oligo_freq,   # hashref
 736        ) = @_;
 737
 738     # Returns data structure
 739
 740     my ( @freq_table, $total );
 741
 742     $total = 0;
 743
 744     map { push @freq_table, { OLIGO => $_, COUNT => $oligo_freq->{ $_ } }; $total += $oligo_freq->{ $_ } } keys %{ $oligo_freq };
 745
 746     @freq_table = sort { $b->{ "COUNT" } <=> $a->{ "COUNT" } or $a->{ "OLIGO" } cmp $b->{ "OLIGO" } } @freq_table;
 747
 748     map { $_->{ "FREQ" } = sprintf( "%.4f", $_->{ "COUNT" } / $total ) } @freq_table;
 749
 750     return wantarray ? return @freq_table : \@freq_table;
 751 }
 752
 753
 754 sub seq_generate
 755 {
 756     # Martin A. Hansen, May 2007
 757
 758     # Generates a random sequence given a sequence length
 759     # and a alphabet.
 760
 761     my ( $len,    # sequence length
 762          $alph,   # sequence alphabet
 763        ) = @_;
 764
 765     # returns string
 766
 767     my ( $alph_len, $i, $seq );
 768
 769     $alph_len = scalar @{ $alph };
 770
 771     for ( $i = 0; $i < $len; $i++ ) {
 772         $seq .= $alph->[ int( rand( $alph_len ) ) ];
 773     }
 774
 775     return $seq;
 776 }
 777
 778
 779 sub seq_shuffle
 780 {
 781     # Martin A. Hansen, December 2007.
 782
 783     # Shuffles sequence of a given string.
 784
 785     my ( $seq,   # sequence string
 786        ) = @_;
 787
 788     # Returns string.
 789
 790     my ( @list );
 791
 792     @list = split "", $seq;
 793
 794     return join "", shuffle( @list );
 795 }
 796
 797
 798 sub seq_alph
 799 {
 800     # Martin A. Hansen, May 2007.
 801
 802     # returns a requested alphabet
 803
 804     my ( $type,   # alphabet type
 805        ) = @_;
 806
 807     # returns list
 808
 809     my ( @alph );
 810
 811     if ( $type =~ /^dna$/i ) {
 812         @alph = qw( A T C G );
 813     } elsif ( $type =~ /^rna$/i ) {
 814         @alph = qw( A U C G );
 815     } elsif ( $type =~ /^prot/i ) {
 816         @alph = qw( F L S Y C W P H Q R I M T N K V A D E G );
 817     } else {
 818         die qq(ERROR: Unknown alphabet type: "$type"\n);
 819     }
 820
 821     return wantarray ? @alph : \@alph;
 822 }
 823
 824
 825 sub seq_analyze
 826 {
 827     # Martin A. Hansen, August 2007.
 828
 829     # Analyses the sequence composition of a given sequence.
 830
 831     my ( $seq,   # sequence to analyze
 832        ) = @_;
 833
 834     # Returns hash
 835
 836     my ( %analysis, @chars, @chars_lc, $char, %char_hash, $gc, $at, $lc, $max, $res_sum, @indels, %indel_hash );
 837
 838     $analysis{ "SEQ_TYPE" } = uc Maasha::Seq::seq_guess_type( $seq );
 839     $analysis{ "SEQ_LEN" }  = length $seq;
 840
 841     @indels = qw( - ~ . _ );
 842
 843     if ( $analysis{ "SEQ_TYPE" } eq "DNA" )
 844     {
 845         @chars    = split //, "AGCUTRYWSMKHDVBNagcutrywsmkhdvbn";
 846         @chars_lc = split //, "agcutrywsmkhdvbn";
 847     }
 848     elsif ( $analysis{ "SEQ_TYPE" } eq "RNA" )
 849     {
 850         @chars    = split //, "AGCUTRYWSMKHDVBNagcutrywsmkhdvbn";
 851         @chars_lc = split //, "agcutrywsmkhdvbn";
 852     }
 853     else
 854     {
 855         @chars    = split //, "FLSYCWPHQRIMTNKVADEGflsycwphqrimtnkvadeg";
 856         @chars_lc = split //, "flsycwphqrimtnkvadeg";
 857     }
 858
 859     @char_hash{ @chars }   = map { eval "scalar \$seq =~ tr/$_//" } @chars;
 860     @indel_hash{ @indels } = map { eval "scalar \$seq =~ tr/$_//" } @indels;
 861
 862     if ( $analysis{ "SEQ_TYPE" } =~ /DNA|RNA/ )
 863     {
 864         $gc = $char_hash{ "g" } + $char_hash{ "G" } + $char_hash{ "c" } + $char_hash{ "C" };
 865         $at = $char_hash{ "a" } + $char_hash{ "A" } + $char_hash{ "t" } + $char_hash{ "T" } + $char_hash{ "u" } + $char_hash{ "U" };
 866
 867         $analysis{ "GC%" } = sprintf( "%.2f", 100 * $gc / $analysis{ "SEQ_LEN" } );
 868
 869         map { $lc += $char_hash{ lc $_ } } @chars_lc;
 870
 871         $analysis{ "SOFT_MASK%" } = sprintf( "%.2f", 100 * $lc / $analysis{ "SEQ_LEN" } );
 872         $analysis{ "HARD_MASK%" } = sprintf( "%.2f", 100 * ( $char_hash{ "n" } + $char_hash{ "N" } ) / $analysis{ "SEQ_LEN" } );
 873     }
 874
 875     $max = 0;
 876
 877     foreach $char ( @chars_lc )
 878     {
 879         $char = uc $char;
 880
 881         $char_hash{ $char } += $char_hash{ lc $char };
 882
 883         $analysis{ "RES:$char" } = $char_hash{ $char };
 884
 885         $max = $char_hash{ $char } if $char_hash{ $char } > $max;
 886
 887         $analysis{ "RES_SUM" } += $char_hash{ $char };
 888     }
 889
 890     map { $analysis{ "RES:$_" } = $indel_hash{ $_ } } @indels;
 891
 892     $analysis{ "MIX_INDEX" } = sprintf( "%.2f", $max / $analysis{ "SEQ_LEN" } );
 893     $analysis{ "MELT_TEMP" } = sprintf( "%.2f", 4 * $gc + 2 * $at );
 894
 895     return wantarray ? %analysis : \%analysis;
 896 }
 897
 898
 899 sub seq_complexity
 900 {
 901     # Martin A. Hansen, May 2008.
 902
 903     # Given a sequence computes a complexity index
 904     # as the most common di-residue over
 905     # the sequence length. Return ~1 if the entire
 906     # sequence is homopolymeric. Above 0.4 indicates
 907     # low complexity sequence.
 908
 909     my ( $seq,   # sequence
 910        ) = @_;
 911
 912     # Returns float.
 913
 914     my ( $len, $i, $max, $di, %hash );
 915
 916     $seq = uc $seq;
 917     $len = length $seq;
 918     $max = 0;
 919
 920     for ( $i = 0; $i < $len - 1; $i++ ) {
 921         $hash{ substr $seq, $i, 2  }++;
 922     }
 923
 924     foreach $di ( keys %hash ) {
 925         $max = $hash{ $di } if $hash{ $di } > $max;
 926     }
 927
 928     return $max / $len;
 929 }
 930
 931
 932 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SEQLOGO <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 933
 934
 935 sub seqlogo_calc
 936 {
 937     # Martin A. Hansen, January 2007.
 938
 939     # given max bit size and a list of aligned entries
 940     # in FASTA format, calculates for each sequence position
 941     # the height of the letters in bits.
 942     # returns a data structure with [ letter, height ] tuples
 943     # for all letters at each position.
 944
 945     my ( $bit_max,   # maximum bit height
 946          $entries,   # FASTA entries
 947        ) = @_;
 948
 949     # returns data structure
 950
 951     my ( $logo_len, $char_tot, $i, %char_hash, $bit_height, $bit_diff, $char_heights, @logo );
 952
 953     $logo_len = length $entries->[ 0 ]->[ 1 ];
 954     $char_tot = scalar @{ $entries };
 955
 956     for ( $i = 0; $i < $logo_len; $i++ )
 957     {
 958         undef %char_hash;
 959
 960         map { $char_hash{ uc substr( $_->[ 1 ], $i, 1 ) }++ } @{ $entries };
 961
 962         delete $char_hash{ "-" };
 963         delete $char_hash{ "_" };
 964         delete $char_hash{ "~" };
 965         delete $char_hash{ "." };
 966
 967         $bit_height = seqlogo_calc_bit_height( \%char_hash, $char_tot );
 968
 969         $bit_diff = $bit_max - $bit_height;
 970
 971         $char_heights = seqlogo_calc_char_heights( \%char_hash, $char_tot, $bit_diff );
 972
 973         push @logo, $char_heights;
 974     }
 975
 976     return wantarray ? @logo : \@logo;
 977 }
 978
 979
 980 sub seqlogo_calc_bit_height
 981 {
 982     # Martin A. Hansen, January 2007.
 983
 984     # calculates the bit height using Shannon's famous
 985     # general formula for uncertainty as documentet:
 986     # http://www.ccrnp.ncifcrf.gov/~toms/paper/hawaii/latex/node5.html
 987
 988     my ( $char_hash,   # hashref with chars and frequencies
 989          $tot,         # total number of chars
 990        ) = @_;
 991
 992     # returns float
 993
 994     my ( $char, $freq, $bit_height );
 995
 996     foreach $char ( keys %{ $char_hash } )
 997     {
 998         $freq = $char_hash->{ $char } / $tot;
 999
1000         $bit_height += $freq * ( log( $freq ) / log( 2 ) );
1001     }
1002
1003     $bit_height *= -1;
1004
1005     return $bit_height;
1006 }
1007
1008
1009 sub seqlogo_calc_char_heights
1010 {
1011     # Martin A. Hansen, January 2007.
1012
1013     # calculates the hight of each char in bits, and sorts
1014     # according to height.
1015
1016     my ( $char_hash,   # hashref with chars and frequencies
1017          $tot,         # tot number of chars
1018          $bit_diff,    # information gained from uncertainties
1019        ) = @_;
1020
1021     # returns list of tuples
1022
1023     my ( $char, $freq, $char_height, @char_heights );
1024
1025     foreach $char ( keys %{ $char_hash } )
1026     {
1027         $freq = $char_hash->{ $char } / $tot;
1028
1029         $char_height = $freq * $bit_diff;   # char height in bits
1030
1031         push @char_heights, [ $char, $char_height ];
1032     }
1033
1034     @char_heights = sort { $a->[ 1 ] <=> $b->[ 1 ] } @char_heights;
1035
1036     return wantarray ? @char_heights : \@char_heights;
1037 }
1038
1039
1040 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> RESIDUE COLORS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1041
1042
1043 sub color_pep
1044 {
1045     # Martin A. Hansen, October 2005.
1046
1047     # color scheme for proteins as defined in Mview.
1048     # given a char returns the appropriate color.
1049     # The amino acids are colored according to physicochemical properties:
1050     # bright green = hydrophobic; dark green = large hydrophobic;
1051     # bright blue = negative charge; red = positive charge;
1052     # dull blue = small alcohol; purple = polar; yellow = cysteine.
1053
1054     my ( $char,   # char to decorate
1055        ) = @_;
1056
1057     # returns string
1058
1059     my ( %hash, $color_set );
1060
1061     %hash = (
1062         K   => "bright-red",
1063         R   => "bright-red",
1064         H   => "dark-green",
1065         D   => "bright-blue",
1066         E   => "bright-blue",
1067         S   => "dull-blue",
1068         T   => "dull-blue",
1069         N   => "purple",
1070         Q   => "purple",
1071         A   => "bright-green",
1072         V   => "bright-green",
1073         I   => "bright-green",
1074         L   => "bright-green",
1075         M   => "bright-green",
1076         F   => "dark-green",
1077         Y   => "dark-green",
1078         W   => "dark-green",
1079         C   => "yellow",
1080         G   => "bright-green",
1081         P   => "bright-green",
1082         Z   => "dark-gray",
1083         B   => "dark-gray",
1084         "?" => "light-gray",
1085         "~" => "light-gray",
1086         "*" => "dark-gray",
1087     );
1088
1089     if ( exists $hash{ uc $char } ) {
1090         $color_set = $hash{ uc $char };
1091     } else {
1092         $color_set = "black";
1093     }
1094
1095     return $color_set;
1096 }
1097
1098
1099 sub color_nuc
1100 {
1101     # Martin A. Hansen, October 2005.
1102
1103     # color scheme for nucleotides as defined in Mview.
1104     # given a char returns the appropriate color
1105     # according to physical/chemical proterties.
1106
1107     my ( $char,   # char to decorate
1108        ) = @_;
1109
1110     # returns string
1111
1112     my ( %hash, $color_set );
1113
1114     %hash = (
1115         A => "bright-red",
1116         G => "yellow",
1117         C => "blue",
1118         T => "green",
1119         U => "green",
1120     );
1121
1122     if ( exists $hash{ uc $char } ) {
1123         $color_set = $hash{ uc $char };
1124     } else {
1125         $color_set = "black";
1126     }
1127
1128     return $color_set;
1129 }
1130
1131
1132 sub color_palette
1133 {
1134     # Martin A. Hansen, October 2005.
1135
1136     # hash table with color-names and color-hex.
1137
1138     my ( $color,   # common color name
1139        ) = @_;
1140
1141     # returns string
1142
1143     my ( %hash );
1144
1145     %hash = (
1146         "black"                => "#000000",
1147         "white"                => "#ffffff",
1148         "red"                  => "#ff0000",
1149         "green"                => "#00ff00",
1150         "blue"                 => "#0000ff",
1151         "cyan"                 => "#00ffff",
1152         "magenta"              => "#ff00ff",
1153 #        "yellow"               => "#ffff00",
1154         "yellow"               => "#ffc800",
1155         "purple"               => "#6600cc",
1156         "dull-blue"            => "#0099ff",
1157         "dark-green-blue"      => "#33cccc",
1158         "medium-green-blue"    => "#00ffcc",
1159         "bright-blue"          => "#0033ff",
1160         "dark-green"           => "#009900",
1161         "bright-green"         => "#33cc00",
1162         "orange"               => "#ff3333",
1163         "orange-brown"         => "#cc6600",
1164         "bright-red"           => "#cc0000",
1165         "light-gray"           => "#999999",
1166         "dark-gray"            => "#666666",
1167         "gray0"                => "#ffffff",
1168         "gray1"                => "#eeeeee",
1169         "gray2"                => "#dddddd",
1170         "gray3"                => "#cccccc",
1171         "gray4"                => "#bbbbbb",
1172         "gray5"                => "#aaaaaa",
1173         "gray6"                => "#999999",
1174         "gray7"                => "#888888",
1175         "gray8"                => "#777777",
1176         "gray9"                => "#666666",
1177         "gray10"               => "#555555",
1178         "gray11"               => "#444444",
1179         "gray12"               => "#333333",
1180         "gray13"               => "#222222",
1181         "gray14"               => "#111111",
1182         "gray15"               => "#000000",
1183         "clustal-red"          => "#ff1111",
1184         "clustal-blue"         => "#1155ff",
1185         "clustal-green"        => "#11dd11",
1186         "clustal-cyan"         => "#11ffff",
1187         "clustal-yellow"       => "#ffff11",
1188         "clustal-orange"       => "#ff7f11",
1189         "clustal-pink"         => "#ff11ff",
1190         "clustal-purple"       => "#6611cc",
1191         "clustal-dull-blue"    => "#197fe5",
1192         "clustal-dark-gray"    => "#666666",
1193         "clustal-light-gray"   => "#999999",
1194         "lin-A"                => "#90fe23",
1195         "lin-R"                => "#fe5e2d",
1196         "lin-N"                => "#2e3d2d",
1197         "lin-D"                => "#00903b",
1198         "lin-C"                => "#004baa",
1199         "lin-Q"                => "#864b00",
1200         "lin-E"                => "#3fa201",
1201         "lin-G"                => "#10fe68",
1202         "lin-H"                => "#b2063b",
1203         "lin-I"                => "#04ced9",
1204         "lin-L"                => "#4972fe",
1205         "lin-K"                => "#c4a100",
1206         "lin-M"                => "#2a84dd",
1207         "lin-F"                => "#a60ade",
1208         "lin-P"                => "#fe61fe",
1209         "lin-S"                => "#f7e847",
1210         "lin-T"                => "#fefeb3",
1211         "lin-W"                => "#4a007f",
1212         "lin-Y"                => "#e903a8",
1213         "lin-V"                => "#5bfdfd",
1214     );
1215
1216     if ( exists $hash{ $color } ) {
1217         return $hash{ $color };
1218     } else {
1219         print STDERR qq(WARNING: color "$color" not found in palette!\n);
1220     }
1221 }
1222
1223
1224 sub color_contrast
1225 {
1226     # Martin A. Hansen, October 2005.
1227
1228     # Hash table with contrast colors to be used for frontground
1229     # text on a given background color.
1230
1231     my ( $color,    # background color
1232        ) = @_;
1233
1234     # returns string
1235
1236     my ( %hash );
1237
1238     %hash = (
1239         "black"                => "white",
1240         "white"                => "black",
1241         "red"                  => "white",
1242         "green"                => "white",
1243         "blue"                 => "white",
1244         "cyan"                 => "white",
1245         "magenta"              => "white",
1246         "yellow"               => "black",
1247         "purple"               => "white",
1248         "dull-blue"            => "white",
1249         "dark-green-blue"      => "white",
1250         "medium-green-blue"    => "white",
1251         "bright-blue"          => "white",
1252         "dark-green"           => "white",
1253         "bright-green"         => "black",
1254         "orange"               => "",
1255         "orange-brown"         => "",
1256         "bright-red"           => "white",
1257         "light-gray"           => "black",
1258         "dark-gray"            => "white",
1259         "gray0"                => "",
1260         "gray1"                => "",
1261         "gray2"                => "",
1262         "gray3"                => "",
1263         "gray4"                => "",
1264         "gray5"                => "",
1265         "gray6"                => "",
1266         "gray7"                => "",
1267         "gray8"                => "",
1268         "gray9"                => "",
1269         "gray10"               => "",
1270         "gray11"               => "",
1271         "gray12"               => "",
1272         "gray13"               => "",
1273         "gray14"               => "",
1274         "gray15"               => "",
1275         "clustal-red"          => "black",
1276         "clustal-blue"         => "black",
1277         "clustal-green"        => "black",
1278         "clustal-cyan"         => "black",
1279         "clustal-yellow"       => "black",
1280         "clustal-orange"       => "black",
1281         "clustal-pink"         => "black",
1282         "clustal-purple"       => "black",
1283         "clustal-dull-blue"    => "black",
1284         "clustal-dark-gray"    => "black",
1285         "clustal-light-gray"   => "black",
1286         "lin-A"                => "",
1287         "lin-R"                => "",
1288         "lin-N"                => "",
1289         "lin-D"                => "",
1290         "lin-C"                => "",
1291         "lin-Q"                => "",
1292         "lin-E"                => "",
1293         "lin-G"                => "",
1294         "lin-H"                => "",
1295         "lin-I"                => "",
1296         "lin-L"                => "",
1297         "lin-K"                => "",
1298         "lin-M"                => "",
1299         "lin-F"                => "",
1300         "lin-P"                => "",
1301         "lin-S"                => "",
1302         "lin-T"                => "",
1303         "lin-W"                => "",
1304         "lin-Y"                => "",
1305         "lin-V"                => "",
1306     );
1307
1308     if ( exists $hash{ $color } ) {
1309         return $hash{ $color };
1310     } else {
1311         print STDERR qq(WARNING: color "$color" not found in palette!\n);
1312     }
1313 }
1314
1315
1316 sub seq_word_pack
1317 {
1318     # Martin A. Hansen, April 2008.
1319
1320     # Packs a sequence word into a binary number.
1321
1322     my ( $word,        # Word to be packed
1323        ) = @_;
1324
1325     # Returns integer.
1326
1327     my ( %hash, $bin, $word_size, $pad );
1328
1329     %hash = (
1330         'A' => '000',
1331         'T' => '001',
1332         'C' => '010',
1333         'G' => '100',
1334         'N' => '011',
1335         '-' => '101',
1336         '.' => '110',
1337         '~' => '111',
1338     );
1339
1340     map { $bin .= pack "B3", $hash{ $_ } } split //, $word;
1341
1342     $word_size = length $word;
1343
1344     $pad = ( 3 * $word_size ) / 8;
1345
1346     if ( $pad =~ /\./ )
1347     {
1348         $pad = ( ( int $pad + 1 ) * 8 ) - 3 * $word_size;
1349
1350         $bin .= pack "B$pad", 0 x $pad;
1351     }
1352
1353     return $bin;
1354 }
1355
1356
1357 sub seq_word_unpack
1358 {
1359     # Martin A. Hansen, April 2008.
1360
1361     # Unpacks a binary sequence word to ASCII.
1362
1363     my ( $bin,         # Binary sequence word
1364          $word_size,   # Size of word
1365        ) = @_;
1366
1367     # Returns string.
1368
1369     my ( %hash, $word );
1370
1371     %hash = (
1372         '000' => 'A',
1373         '001' => 'T',
1374         '010' => 'C',
1375         '100' => 'G',
1376         '011' => 'N',
1377         '101' => '-',
1378         '110' => '.',
1379         '111' => '~',
1380     );
1381
1382     map { $word .= $hash{ $_ } } unpack "(B3)$word_size", $bin;
1383
1384     return $word;
1385 }
1386
1387
1388 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> ADAPTOR LOCATING <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1389
1390
1391 ###############################    REDUNDANT   ##############################
1392
1393 # these functions have been replaced by index_m and match_m in Common.pm
1394
1395 sub find_adaptor
1396 {
1397     # Martin A. Hansen & Selene Fernandez, August 2008
1398
1399     # Locates an adaptor sequence in a given sequence
1400     # starting from a given offset position allowing a given
1401     # number of mismatches (no indels).
1402
1403     my ( $adaptor,       # list of chars
1404          $tag,           # list of chars
1405          $adaptor_len,   # length of adaptor sequence
1406          $tag_len,       # length of tag sequence
1407          $tag_offset,    # offset position
1408          $max_match,     # number of matches indicating success
1409          $max_mismatch,  # number of mismatches
1410        ) = @_;
1411
1412     # Returns an integer.
1413
1414     my ( $i, $j, $match, $mismatch );
1415
1416     $i = $tag_offset;
1417
1418     while ( $i < $tag_len - ( $max_match + $max_mismatch ) + 1 )
1419     {
1420         $j = 0;
1421
1422         while ( $j < $adaptor_len - ( $max_match + $max_mismatch ) + 1 )
1423         {
1424             return $i if check_diag( $adaptor, $tag, $adaptor_len, $tag_len, $j, $i, $max_match, $max_mismatch );
1425
1426             $j++
1427         }
1428
1429         $i++;
1430     }
1431
1432     return -1;
1433 }
1434
1435
1436 sub check_diag
1437 {
1438     # Martin A. Hansen & Selene Fernandez, August 2008
1439
1440     # Checks the diagonal starting at a given coordinate
1441     # of a search space constituted by an adaptor and a tag sequence.
1442     # Residues in the diagonal are compared between the sequences allowing
1443     # for a given number of mismatches. We terminate when search space is
1444     # exhausted or if max matches or mismatches is reached.
1445
1446     my ( $adaptor,       # list of chars
1447          $tag,           # list of chars
1448          $adaptor_len,   # length of adaptor sequence
1449          $tag_len,       # length of tag sequence
1450          $adaptor_beg,   # adaptor begin coordinate
1451          $tag_beg,       # tag begin coordinate
1452          $max_match,     # number of matches indicating success
1453          $max_mismatch,  # number of mismatches
1454        ) = @_;
1455
1456     # Returns boolean.
1457
1458     my ( $match, $mismatch );
1459
1460     $match    = 0;
1461     $mismatch = 0;
1462
1463     while ( $adaptor_beg <= $adaptor_len and $tag_beg <= $tag_len )
1464     {
1465         if ( $adaptor->[ $adaptor_beg ] eq $tag->[ $tag_beg ] )
1466         {
1467             $match++;
1468
1469             return 1 if $match >= $max_match;
1470         }
1471         else
1472         {
1473             $mismatch++;
1474
1475             return 0 if $mismatch > $max_mismatch;
1476         }
1477
1478         $adaptor_beg++;
1479         $tag_beg++;
1480     }
1481
1482     return 0;
1483 }
1484
1485
1486 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<