code_perl/Maasha/Seq.pm

   1 package Maasha::Seq;
   2
   3 # Copyright (C) 2007 Martin A. Hansen.
   4
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; either version 2
   8 # of the License, or (at your option) any later version.
   9
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18
  19 # http://www.gnu.org/copyleft/gpl.html
  20
  21
  22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  23
  24
  25 # yak yak yak
  26
  27
  28 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  29
  30
  31 use warnings;
  32 use strict;
  33 use POSIX qw( islower );
  34 use Maasha::Common;
  35 use Data::Dumper;
  36 use IPC::Open2;
  37 use List::Util qw( shuffle );
  38 use Time::HiRes qw( gettimeofday );
  39
  40 use vars qw ( @ISA @EXPORT );
  41
  42 @ISA = qw( Exporter );
  43
  44
  45 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  46
  47
  48 sub seq_guess_type
  49 {
  50     # Martin A. Hansen, May 2007.
  51
  52     # Makes a qualified guess on the type of a given squence.
  53
  54     my ( $seq,   # sequence to check
  55        ) = @_;
  56
  57     # returns string.
  58
  59     my ( $check_seq, $count );
  60
  61     if ( length $seq > 100 ) {
  62         $check_seq = substr $seq, 0, 100;
  63     } else {
  64         $check_seq = $seq;
  65     }
  66
  67     if ( $count = $check_seq =~ tr/FLPQIEflpqie// and $count > 0 ) {
  68         return "PROTEIN";
  69     } elsif ( $count = $check_seq =~ tr/Uu// and $count > 0 ) {
  70         return "RNA";
  71     } else {
  72         return "DNA";
  73     }
  74 }
  75
  76
  77 sub wrap
  78 {
  79     # Martin A. Hansen, July 2007.
  80
  81     # Wraps a given string reference accoring to given width.
  82
  83     my ( $strref,   # ref to sting to wrap
  84          $wrap,     # wrap width
  85        ) = @_;
  86
  87     # Returns nothing.
  88
  89     ${ $strref } =~ s/(.{$wrap})/$1\n/g;
  90
  91     chomp ${ $strref };
  92 }
  93
  94
  95 sub dna_revcomp
  96 {
  97     # Niels Larsen
  98     # modified Martin A. Hansen, March 2005.
  99
 100     # Returns the reverse complement of a dna sequence with preservation of case
 101     # according to this mapping,
 102     #
 103     # AGCUTRYWSMKHDVBNagcutrywsmkhdvbn
 104     # TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn
 105
 106     my ( $seq,   # seq
 107        ) = @_;
 108
 109     # returns string
 110
 111     $seq = reverse $seq;
 112
 113     $seq =~ tr/AGCUTRYWSMKHDVBNagcutrywsmkhdvbn/TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn/;
 114
 115     return $seq;
 116 }
 117
 118
 119 sub rna_revcomp
 120 {
 121     # Niels Larsen
 122     # modified Martin A. Hansen, March 2005.
 123
 124     # Returns the complement of a rna sequence with preservation of case
 125     # according to this mapping,
 126     #
 127     # AGCUTRYWSMKHDVBNagcutrywsmkhdvbn
 128     # UCGAAYRWSKMDHBVNucgaayrwskmdhbvn
 129
 130     my ( $seq,   # seq
 131        ) = @_;
 132
 133     $seq = reverse $seq;
 134
 135     $seq =~ tr/AGCUTRYWSMKHDVBNagcutrywsmkhdvbn/UCGAAYRWSKMDHBVNucgaayrwskmdhbvn/;
 136
 137     return $seq;
 138 }
 139
 140
 141 sub dna_comp
 142 {
 143     # Niels Larsen
 144     # modified Martin A. Hansen, March 2005.
 145
 146     # Returns the reverse complement of a dna sequence with preservation of case
 147     # according to this mapping,
 148     #
 149     # AGCUTRYWSMKHDVBNagcutrywsmkhdvbn
 150     # TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn
 151
 152     my ( $seqref,   # seqref
 153        ) = @_;
 154
 155     # Returns nothing.
 156
 157     ${ $seqref } =~ tr/AGCUTRYWSMKHDVBNagcutrywsmkhdvbn/TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn/;
 158 }
 159
 160
 161 sub rna_comp
 162 {
 163     # Niels Larsen
 164     # modified Martin A. Hansen, March 2005.
 165
 166     # Returns the complement of a rna sequence with preservation of case
 167     # according to this mapping,
 168     #
 169     # AGCUTRYWSMKHDVBNagcutrywsmkhdvbn
 170     # UCGAAYRWSKMDHBVNucgaayrwskmdhbvn
 171
 172     my ( $seqref,   # seqref
 173        ) = @_;
 174
 175     # Returns nothing.
 176
 177     ${ $seqref } =~ tr/AGCUTRYWSMKHDVBNagcutrywsmkhdvbn/UCGAAYRWSKMDHBVNucgaayrwskmdhbvn/;
 178 }
 179
 180
 181 sub dna2rna
 182 {
 183     # Martin A. Hansen, March 2007
 184
 185     # Converts DNA sequence to RNA
 186
 187     my ( $seq,   # nucleotide sequence
 188        ) = @_;
 189
 190     # returns string
 191
 192     $seq =~ tr/Tt/Uu/;
 193
 194     return $seq;
 195 }
 196
 197
 198 sub rna2dna
 199 {
 200     # Martin A. Hansen, March 2007
 201
 202     # Converts RNA sequence to DNA
 203
 204     my ( $seq,   # nucleotide sequence
 205        ) = @_;
 206
 207     # returns string
 208
 209     $seq =~ tr/Uu/Tt/;
 210
 211     return $seq;
 212 }
 213
 214
 215 sub nuc2ambiguity
 216 {
 217     # Martin A. Hansen, March 2005.
 218
 219     # given a string of nucleotides
 220     # returns the corresponding ambiguity code
 221
 222     my ( $str,
 223          $type,    # DNA or RNA - DEFAULT DNA
 224        ) = @_;
 225
 226     my ( %hash, @nts, $key, $code, %nt_hash );
 227
 228     $str = uc $str;
 229
 230     if ( not $type or $type =~ /dna/i )
 231     {
 232         $str =~ s/N/ACGT/g;
 233         $str =~ s/B/CGT/g;
 234         $str =~ s/D/AGT/g;
 235         $str =~ s/H/ACT/g;
 236         $str =~ s/V/ACG/g;
 237         $str =~ s/K/GT/g;
 238         $str =~ s/Y/CT/g;
 239         $str =~ s/S/CG/g;
 240         $str =~ s/W/AT/g;
 241         $str =~ s/R/AG/g;
 242         $str =~ s/M/AC/g;
 243     }
 244     else
 245     {
 246         $str =~ s/N/ACGU/g;
 247         $str =~ s/B/CGU/g;
 248         $str =~ s/D/AGU/g;
 249         $str =~ s/H/ACU/g;
 250         $str =~ s/V/ACG/g;
 251         $str =~ s/K/GU/g;
 252         $str =~ s/Y/CU/g;
 253         $str =~ s/S/CG/g;
 254         $str =~ s/W/AU/g;
 255         $str =~ s/R/AG/g;
 256         $str =~ s/M/AC/g;
 257     }
 258
 259     @nts = split //, $str;
 260
 261     %nt_hash = map { $_ => 1 } @nts;
 262
 263     @nts = sort keys %nt_hash;
 264
 265     $key = join "", @nts;
 266
 267     %hash = (
 268         'A'    => 'A',
 269         'C'    => 'C',
 270         'G'    => 'G',
 271         'T'    => 'T',
 272         'U'    => 'U',
 273         'AC'   => 'M',
 274         'AG'   => 'R',
 275         'AT'   => 'W',
 276         'AU'   => 'W',
 277         'CG'   => 'S',
 278         'CT'   => 'Y',
 279         'CU'   => 'Y',
 280         'GT'   => 'K',
 281         'GU'   => 'K',
 282         'ACG'  => 'V',
 283         'ACT'  => 'H',
 284         'ACU'  => 'H',
 285         'AGT'  => 'D',
 286         'AGU'  => 'D',
 287         'CGT'  => 'B',
 288         'CGU'  => 'B',
 289         'ACGT' => 'N',
 290         'ACGU' => 'N',
 291     );
 292
 293     $code = $hash{ $key };
 294
 295     warn qq(WARNING: No ambiguity code for key->$key\n) if not $code;
 296
 297     return $code;
 298 }
 299
 300
 301 sub aa2codons
 302 {
 303     # Martin A. Hansen, March 2005.
 304
 305     # given an amino acid, returns a list of corresponding codons
 306
 307     my ( $aa,   # amino acid to translate
 308        ) = @_;
 309
 310     # returns list
 311
 312     my ( %hash, $codons );
 313
 314     $aa = uc $aa;
 315
 316     %hash = (
 317         'F' => [ 'TTT', 'TTC' ],                               # Phe
 318         'L' => [ 'TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG' ],   # Leu
 319         'S' => [ 'TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC' ],   # Ser
 320         'Y' => [ 'TAT', 'TAC' ],                               # Tyr
 321         '*' => [ 'TAA', 'TAG', 'TGA' ],                        # Stop
 322         'X' => [ 'TAA', 'TAG', 'TGA' ],                        # Stop
 323         'C' => [ 'TGT', 'TGC' ],                               # Cys
 324         'W' => [ 'TGG' ],                                      # Trp
 325         'P' => [ 'CCT', 'CCC', 'CCA', 'CCG' ],                 # Pro
 326         'H' => [ 'CAT', 'CAC' ],                               # His
 327         'Q' => [ 'CAA', 'CAG' ],                               # Gln
 328         'R' => [ 'CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG' ],   # Arg
 329         'I' => [ 'ATT', 'ATC', 'ATA' ],                        # Ile
 330         'M' => [ 'ATG' ],                                      # Met
 331         'T' => [ 'ACT', 'ACC', 'ACA', 'ACG' ],                 # Thr
 332         'N' => [ 'AAT', 'AAC' ],                               # Asn
 333         'K' => [ 'AAA', 'AAG' ],                               # Lys
 334         'V' => [ 'GTT', 'GTC', 'GTA', 'GTG' ],                 # Val
 335         'A' => [ 'GCT', 'GCC', 'GCA', 'GCG' ],                 # Ala
 336         'D' => [ 'GAT', 'GAC' ],                               # Asp
 337         'E' => [ 'GAA', 'GAG' ],                               # Glu
 338         'G' => [ 'GGT', 'GGC', 'GGA', 'GGG' ],                 # Gly
 339     );
 340
 341     $codons = $hash{ $aa };
 342
 343     return wantarray ? @{ $codons } : $codons;
 344 }
 345
 346
 347 sub codon2aa
 348 {
 349     # Martin A. Hansen, March 2005.
 350
 351     # given a codon, returns the correponding
 352     # vertebrate amino acid.
 353
 354     my ( $codon,   # codon to translate
 355        ) = @_;
 356
 357     # returns string
 358
 359     my ( %hash, $aa );
 360
 361     die qq(ERROR: Bad codon: "$codon"\n) if not $codon =~ /[ATCGatcg]{3}/;
 362
 363     %hash = (
 364         'TTT' => 'F',   # Phe
 365         'TTC' => 'F',   # Phe
 366         'TTA' => 'L',   # Leu
 367         'TTG' => 'L',   # Leu
 368         'TCT' => 'S',   # Ser
 369         'TCC' => 'S',   # Ser
 370         'TCA' => 'S',   # Ser
 371         'TCG' => 'S',   # Ser
 372         'TAT' => 'Y',   # Tyr
 373         'TAC' => 'Y',   # Tyr
 374         'TAA' => '*',   # Stop
 375         'TAG' => '*',   # Stop
 376         'TGT' => 'C',   # Cys
 377         'TGC' => 'C',   # Cys
 378         'TGA' => '*',   # Stop
 379         'TGG' => 'W',   # Trp
 380         'CTT' => 'L',   # Leu
 381         'CTC' => 'L',   # Leu
 382         'CTA' => 'L',   # Leu
 383         'CTG' => 'L',   # Leu
 384         'CCT' => 'P',   # Pro
 385         'CCC' => 'P',   # Pro
 386         'CCA' => 'P',   # Pro
 387         'CCG' => 'P',   # Pro
 388         'CAT' => 'H',   # His
 389         'CAC' => 'H',   # His
 390         'CAA' => 'Q',   # Gln
 391         'CAG' => 'Q',   # Gln
 392         'CGT' => 'R',   # Arg
 393         'CGC' => 'R',   # Arg
 394         'CGA' => 'R',   # Arg
 395         'CGG' => 'R',   # Arg
 396         'ATT' => 'I',   # Ile
 397         'ATC' => 'I',   # Ile
 398         'ATA' => 'I',   # Ile
 399         'ATG' => 'M',   # Met
 400         'ACT' => 'T',   # Thr
 401         'ACC' => 'T',   # Thr
 402         'ACA' => 'T',   # Thr
 403         'ACG' => 'T',   # Thr
 404         'AAT' => 'N',   # Asn
 405         'AAC' => 'N',   # Asn
 406         'AAA' => 'K',   # Lys
 407         'AAG' => 'K',   # Lys
 408         'AGT' => 'S',   # Ser
 409         'AGC' => 'S',   # Ser
 410         'AGA' => 'R',   # Arg
 411         'AGG' => 'R',   # Arg
 412         'GTT' => 'V',   # Val
 413         'GTC' => 'V',   # Val
 414         'GTA' => 'V',   # Val
 415         'GTG' => 'V',   # Val
 416         'GCT' => 'A',   # Ala
 417         'GCC' => 'A',   # Ala
 418         'GCA' => 'A',   # Ala
 419         'GCG' => 'A',   # Ala
 420         'GAT' => 'D',   # Asp
 421         'GAC' => 'D',   # Asp
 422         'GAA' => 'E',   # Glu
 423         'GAG' => 'E',   # Glu
 424         'GGT' => 'G',   # Gly
 425         'GGC' => 'G',   # Gly
 426         'GGA' => 'G',   # Gly
 427         'GGG' => 'G',   # Gly
 428     );
 429
 430     $aa = $hash{ uc $codon };
 431
 432     return $aa;
 433 }
 434
 435
 436 sub translate
 437 {
 438     # Martin A. Hansen, June 2005.
 439
 440     # translates a dna sequence to protein according to a optional given
 441     # frame.
 442
 443     my ( $dna,     # dna sequence
 444          $frame,   # frame of translation - OPTIONAL
 445        ) = @_;
 446
 447     # returns string
 448
 449     my ( $codon, $pos, $pep );
 450
 451     $frame ||= 1;
 452
 453     if ( $frame =~ /-?[1-3]/ )
 454     {
 455         if ( $frame < 0 ) {
 456             $dna = Maasha::Seq::dna_revcomp( $dna );
 457         }
 458
 459         $frame = abs( $frame ) - 1;
 460
 461         $dna =~ s/^.{${frame}}//;
 462     }
 463     else
 464     {
 465         Maasha::Common::error( qq(Badly formated frame "$frame") );
 466     }
 467
 468     $pos = 0;
 469
 470     while ( $codon = substr $dna, $pos, 3 )
 471     {
 472         last if not length $codon == 3;
 473
 474         $pep .= codon2aa( $codon );
 475
 476         $pos += 3;
 477     }
 478
 479     return $pep;
 480 }
 481
 482
 483 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> RNA FOLDING <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 484
 485
 486 sub fold_struct_rnafold
 487 {
 488     # Martin A. Hansen, February 2008.
 489
 490     # Given a squence fold this using RNAfold.
 491
 492     my ( $seq,   # sequence to fold
 493        ) = @_;
 494
 495     # Returns a tuple of fold string and free energy.
 496
 497     my ( $pid, $fh_out, $fh_in, @lines, $struct, $energy );
 498
 499     $pid = open2( $fh_out, $fh_in, "RNAfold -noPS" );
 500
 501     Maasha::Fasta::put_entry( [ "RNAfold", $seq ], $fh_in );
 502
 503     close $fh_in;
 504
 505     @lines = <$fh_out>;
 506
 507     close $fh_out;
 508
 509     waitpid $pid, 0;
 510
 511     chomp @lines;
 512
 513     if ( $lines[ - 1 ] =~ /^([^ ]+) \((.+)\)$/ )
 514     {
 515         $struct = $1;
 516         $energy = $2;
 517     }
 518
 519     return wantarray ? ( $struct, $energy ) : [ $struct, $energy ];
 520 }
 521
 522
 523 sub fold_struct_contrastruct
 524 {
 525     # Martin A. Hansen, February 2008.
 526
 527     # Given a sequence fold this using Contrafold.
 528
 529     my ( $seq,       # sequence to fold
 530          $tmp_dir,   # temporary directory - OPTIONAL
 531        ) = @_;
 532
 533     # Returns a tuple of fold string and temp index.
 534
 535     my ( $tmp_file, $out_file1, $out_file2, $fh, $line, $struct, @AoA, $i, $temp, $index );
 536
 537     $tmp_dir ||= $ENV{ 'BP_TMP' };
 538
 539     $tmp_file  = "$tmp_dir/fold.fna";
 540     $out_file1 = "$tmp_dir/fold.out1";
 541     $out_file2 = "$tmp_dir/fold.out2";
 542
 543     Maasha::Fasta::put_entries( [ [ "fold", $seq ] ], $tmp_file );
 544
 545     Maasha::Common::run( "contrafold", "predict --parens $out_file1 --bpseq $out_file2 $tmp_file" );
 546
 547     unlink $tmp_file;
 548
 549     $fh = Maasha::Common::read_open( $out_file1 );
 550
 551     while ( $line = <$fh> )
 552     {
 553         chomp $line;
 554
 555         $struct = $line;
 556     }
 557
 558     close $fh;
 559
 560     unlink $out_file1;
 561
 562     $fh = Maasha::Common::read_open( $out_file2 );
 563
 564     while ( $line = <$fh> )
 565     {
 566         chomp $line;
 567
 568         push @AoA, [ split " ", $line ];
 569     }
 570
 571     close $fh;
 572
 573     unlink $out_file2;
 574
 575     for ( $i = 0; $i < @AoA; $i++ )
 576     {
 577         if ( $AoA[ $i ]->[ 2 ] != 0 )
 578         {
 579             last if $AoA[ $i ]->[ 0 ] > $AoA[ $i ]->[ 2 ];
 580
 581             $temp += base_pair_melting_temp( $AoA[ $i ]->[ 1 ] . $AoA[ $AoA[ $i ]->[ 2 ] - 1 ]->[ 1 ] );
 582         }
 583     }
 584
 585     $index = sprintf( "%.2f", $temp / length $seq );
 586
 587     return wantarray ? ( $struct, $index ) : [ $struct, $index ];
 588 }
 589
 590
 591 sub base_pair_melting_temp
 592 {
 593     # Martin A. Hansen, February 2008.
 594
 595     # Given a basepair, returns the melting temperature.
 596
 597     my ( $bp,   # basepair string
 598        ) = @_;
 599
 600     # Returns integer
 601
 602     my ( %melt_hash );
 603
 604     %melt_hash = (
 605         AA => 0,
 606         AT => 2,
 607         AC => 0,
 608         AG => 0,
 609         AU => 2,
 610         TA => 2,
 611         TT => 0,
 612         TC => 0,
 613         TG => 1, #
 614         TU => 0,
 615         CA => 0,
 616         CT => 0,
 617         CC => 0,
 618         CG => 4,
 619         CU => 0,
 620         GA => 0,
 621         GT => 1, #
 622         GC => 4,
 623         GG => 0,
 624         GU => 1, #
 625         UA => 2,
 626         UT => 0,
 627         UC => 0,
 628         UG => 1, #
 629         UU => 0,
 630     );
 631
 632     return $melt_hash{ uc $bp };
 633 }
 634
 635
 636 sub generate_dna_oligos
 637 {
 638     # Martin A. Hansen, April 2007.
 639
 640     # Generates all possible DNA oligos of a given wordsize.
 641
 642     # alternative way: perl -MData::Dumper -e '@CONV = glob( "{A,T,C,G}" x 4 ); print Dumper( \@CONV )'
 643
 644
 645     my ( $wordsize,   # size of DNA oligos
 646        ) = @_;
 647
 648     # Returns list
 649
 650     my ( @alph, @oligos, $oligo, $char, @list );
 651
 652     @alph   = ( qw( A T C G N ) );
 653     @oligos = ( '' );
 654
 655     for ( 1 .. $wordsize )
 656     {
 657         foreach $oligo ( @oligos )
 658         {
 659             foreach $char ( @alph ) {
 660                 push @list, $oligo . $char;
 661             }
 662         }
 663
 664         @oligos = @list;
 665
 666         undef @list;
 667     }
 668
 669     return wantarray ? @oligos : \@oligos;
 670 }
 671
 672
 673 sub seq2oligos
 674 {
 675     # Martin A. Hansen, April 2007
 676
 677     # Given a sequence and a wordsize,
 678     # breaks the sequence into overlapping
 679     # oligos of that wordsize.
 680
 681     my ( $seq,       # sequence reference
 682          $wordsize,  # wordsize
 683        ) = @_;
 684
 685     # returns list
 686
 687     my ( $i, $oligo, @oligos );
 688
 689     for ( $i = 0; $i < length( ${ $seq } ) - $wordsize + 1; $i++ )
 690     {
 691         $oligo = substr ${ $seq }, $i, $wordsize;
 692
 693         push @oligos, $oligo;
 694     }
 695
 696     return wantarray ? @oligos : \@oligos;
 697 }
 698
 699
 700 sub seq2oligos_uniq
 701 {
 702     # Martin A. Hansen, April 2007
 703
 704     # Given a sequence and a wordsize,
 705     # breaks the sequence into overlapping
 706     # oligos of that wordsize and return
 707     # only unique words.
 708
 709     my ( $seq,       # sequence reference
 710          $wordsize,  # wordsize
 711        ) = @_;
 712
 713     # returns list
 714
 715     my ( $i, $oligo, %lookup, @oligos );
 716
 717     for ( $i = 0; $i < length( ${ $seq } ) - $wordsize + 1; $i++ )
 718     {
 719         $oligo = substr ${ $seq }, $i, $wordsize;
 720
 721         if ( not exists $lookup{ $oligo } )
 722         {
 723             push @oligos, $oligo;
 724             $lookup{ $oligo } = 1;
 725         }
 726     }
 727
 728     return wantarray ? @oligos : \@oligos;
 729 }
 730
 731
 732 sub oligo_freq
 733 {
 734     # Martin A. Hansen, August 2007.
 735
 736     # Given a hashref with oligo=>count, calculates
 737     # a frequency table. Returns a list of hashes
 738
 739     my ( $oligo_freq,   # hashref
 740        ) = @_;
 741
 742     # Returns data structure
 743
 744     my ( @freq_table, $total );
 745
 746     $total = 0;
 747
 748     map { push @freq_table, { OLIGO => $_, COUNT => $oligo_freq->{ $_ } }; $total += $oligo_freq->{ $_ } } keys %{ $oligo_freq };
 749
 750     @freq_table = sort { $b->{ "COUNT" } <=> $a->{ "COUNT" } or $a->{ "OLIGO" } cmp $b->{ "OLIGO" } } @freq_table;
 751
 752     map { $_->{ "FREQ" } = sprintf( "%.4f", $_->{ "COUNT" } / $total ) } @freq_table;
 753
 754     return wantarray ? return @freq_table : \@freq_table;
 755 }
 756
 757
 758 sub seq_generate
 759 {
 760     # Martin A. Hansen, May 2007
 761
 762     # Generates a random sequence given a sequence length
 763     # and a alphabet.
 764
 765     my ( $len,    # sequence length
 766          $alph,   # sequence alphabet
 767        ) = @_;
 768
 769     # Returns a string
 770
 771     my ( $alph_len, $i, $seq );
 772
 773     $alph_len = scalar @{ $alph };
 774
 775     for ( $i = 0; $i < $len; $i++ ) {
 776         $seq .= $alph->[ int( rand( $alph_len ) ) ];
 777     }
 778
 779     return $seq;
 780 }
 781
 782
 783 sub seq_insert
 784 {
 785     # Martin A. Hansen, June 2009.
 786
 787     # Randomly duplicates a given number of residues in a given sequence.
 788
 789     my ( $seq,          # sequence to mutate
 790          $insertions,   # number of residues to insert
 791        ) = @_;
 792
 793     # Returns a string.
 794
 795     my ( $i, $pos );
 796
 797     for ( $i = 0; $i < $insertions; $i++ )
 798     {
 799         $pos = int( rand( length $seq ) );
 800
 801         substr $seq, $pos, 0, substr $seq , $pos, 1;
 802     }
 803
 804     return $seq;
 805 }
 806
 807
 808 sub seq_delete
 809 {
 810     # Martin A. Hansen, June 2009.
 811
 812     # Randomly deletes a given number of residues from a given sequence.
 813
 814     my ( $seq,         # sequence to mutate
 815          $deletions,   # number of residues to delete
 816        ) = @_;
 817
 818     # Returns a string.
 819
 820     my ( $i, $pos );
 821
 822     for ( $i = 0; $i < $deletions; $i++ )
 823     {
 824         $pos = int( rand( length $seq ) );
 825
 826         substr $seq, $pos, 1, '';
 827     }
 828
 829     return $seq;
 830 }
 831
 832
 833 sub seq_mutate
 834 {
 835     # Martin A. Hansen, June 2009.
 836
 837     # Introduces a given number of random mutations in a
 838     # given sequence of a specified alphabet.
 839
 840     my ( $seq,         # sequence to mutate
 841          $mutations,   # number of mutations
 842          $alph,        # alphabet of sequence
 843        ) = @_;
 844
 845     # Returns a string.
 846
 847     my ( $i, $pos, %lookup_hash );
 848
 849     $i = 0;
 850
 851     while ( $i < $mutations )
 852     {
 853         $pos = int( rand( length $seq ) );
 854
 855         if ( not exists $lookup_hash{ $pos } )
 856         {
 857             substr $seq, $pos, 1, res_mutate( substr( $seq , $pos, 1 ), $alph );
 858
 859             $lookup_hash{ $pos } = 1;
 860
 861             $i++;
 862         }
 863     }
 864
 865     return $seq;
 866 }
 867
 868
 869 sub res_mutate
 870 {
 871     # Martin A. Hansen, June 2009.
 872
 873     # Mutates a given residue to another from a given alphabet.
 874
 875     my ( $res,    # residue to mutate
 876          $alph,   # alphabet
 877        ) = @_;
 878
 879     # Returns a char.
 880
 881     my ( $alph_len, $new );
 882
 883     $alph_len = scalar @{ $alph };
 884     $new      = $res;
 885
 886     while ( uc $new eq uc $res ) {
 887         $new = $alph->[ int( rand( $alph_len ) ) ];
 888     }
 889
 890     return POSIX::islower( $res ) ? lc $new : uc $new;
 891 }
 892
 893
 894 sub seq_shuffle
 895 {
 896     # Martin A. Hansen, December 2007.
 897
 898     # Shuffles sequence of a given string.
 899
 900     my ( $seq,   # sequence string
 901        ) = @_;
 902
 903     # Returns a string.
 904
 905     my ( @list );
 906
 907     @list = split "", $seq;
 908
 909     return join "", shuffle( @list );
 910 }
 911
 912
 913 sub seq_alph
 914 {
 915     # Martin A. Hansen, May 2007.
 916
 917     # Returns a requested sequence alphabet.
 918
 919     my ( $type,   # alphabet type
 920        ) = @_;
 921
 922     # returns list
 923
 924     my ( %alph_hash, $alph );
 925
 926     %alph_hash = (
 927         DNA      => [ qw(A T C G) ],
 928         DNA_AMBI => [ qw(A G C U T R Y W S M K H D V B N) ],
 929         RNA      => [ qw(A U C G) ],
 930         RNA_AMBI => [ qw(A G C U T R Y W S M K H D V B N) ],
 931         PROTEIN  => [ qw(F L S Y C W P H Q R I M T N K V A D E G) ],
 932     );
 933
 934     if ( exists $alph_hash{ uc $type } ) {
 935         $alph = $alph_hash{ uc $type };
 936     } else {
 937         die qq(ERROR: Unknown alphabet type: "$type"\n);
 938     }
 939
 940     return wantarray ? @{ $alph } : $alph;
 941 }
 942
 943
 944 sub seq_analyze
 945 {
 946     # Martin A. Hansen, August 2007.
 947
 948     # Analyses the sequence composition of a given sequence.
 949
 950     my ( $seq,   # sequence to analyze
 951        ) = @_;
 952
 953     # Returns hash
 954
 955     my ( %analysis, @chars, @chars_lc, $char, %char_hash, $gc, $at, $lc, $max, $res_sum, @indels, %indel_hash );
 956
 957     $analysis{ "SEQ_TYPE" } = Maasha::Seq::seq_guess_type( $seq );
 958     $analysis{ "SEQ_LEN" }  = length $seq;
 959
 960     @indels = qw( - ~ . _ );
 961
 962     if ( $analysis{ "SEQ_TYPE" } eq "DNA" )
 963     {
 964         @chars    = split //, "AGCUTRYWSMKHDVBNagcutrywsmkhdvbn";
 965         @chars_lc = split //, "agcutrywsmkhdvbn";
 966     }
 967     elsif ( $analysis{ "SEQ_TYPE" } eq "RNA" )
 968     {
 969         @chars    = split //, "AGCUTRYWSMKHDVBNagcutrywsmkhdvbn";
 970         @chars_lc = split //, "agcutrywsmkhdvbn";
 971     }
 972     else
 973     {
 974         @chars    = split //, "FLSYCWPHQRIMTNKVADEGflsycwphqrimtnkvadeg";
 975         @chars_lc = split //, "flsycwphqrimtnkvadeg";
 976     }
 977
 978     @char_hash{ @chars }   = map { eval "scalar \$seq =~ tr/$_//" } @chars;
 979     @indel_hash{ @indels } = map { eval "scalar \$seq =~ tr/$_//" } @indels;
 980
 981     if ( $analysis{ "SEQ_TYPE" } =~ /DNA|RNA/ )
 982     {
 983         $gc = $char_hash{ "g" } + $char_hash{ "G" } + $char_hash{ "c" } + $char_hash{ "C" };
 984         $at = $char_hash{ "a" } + $char_hash{ "A" } + $char_hash{ "t" } + $char_hash{ "T" } + $char_hash{ "u" } + $char_hash{ "U" };
 985
 986         $analysis{ "GC%" } = sprintf( "%.2f", 100 * $gc / $analysis{ "SEQ_LEN" } );
 987
 988         map { $lc += $char_hash{ lc $_ } } @chars_lc;
 989
 990         $analysis{ "SOFT_MASK%" } = sprintf( "%.2f", 100 * $lc / $analysis{ "SEQ_LEN" } );
 991         $analysis{ "HARD_MASK%" } = sprintf( "%.2f", 100 * ( $char_hash{ "n" } + $char_hash{ "N" } ) / $analysis{ "SEQ_LEN" } );
 992     }
 993
 994     $max = 0;
 995
 996     foreach $char ( @chars_lc )
 997     {
 998         $char = uc $char;
 999
1000         $char_hash{ $char } += $char_hash{ lc $char };
1001
1002         $analysis{ "RES[$char]" } = $char_hash{ $char };
1003
1004         $max = $char_hash{ $char } if $char_hash{ $char } > $max;
1005
1006         $analysis{ "RES_SUM" } += $char_hash{ $char };
1007     }
1008
1009     map { $analysis{ "RES[$_]" } = $indel_hash{ $_ } } @indels;
1010
1011     $analysis{ "MIX_INDEX" } = sprintf( "%.2f", $max / $analysis{ "SEQ_LEN" } );
1012     #$analysis{ "MELT_TEMP" } = sprintf( "%.2f", 4 * $gc + 2 * $at );
1013
1014     return wantarray ? %analysis : \%analysis;
1015 }
1016
1017
1018 sub seq_complexity
1019 {
1020     # Martin A. Hansen, May 2008.
1021
1022     # Given a sequence computes a complexity index
1023     # as the most common di-residue over
1024     # the sequence length. Return ~1 if the entire
1025     # sequence is homopolymeric. Above 0.4 indicates
1026     # low complexity sequence.
1027
1028     my ( $seq,   # sequence
1029        ) = @_;
1030
1031     # Returns float.
1032
1033     my ( $len, $i, $max, $di, %hash );
1034
1035     $seq = uc $seq;
1036     $len = length $seq;
1037     $max = 0;
1038
1039     for ( $i = 0; $i < $len - 1; $i++ ) {
1040         $hash{ substr $seq, $i, 2  }++;
1041     }
1042
1043     foreach $di ( keys %hash ) {
1044         $max = $hash{ $di } if $hash{ $di } > $max;
1045     }
1046
1047     return $max / $len;
1048 }
1049
1050
1051 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SEQLOGO <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1052
1053
1054 sub seqlogo_calc
1055 {
1056     # Martin A. Hansen, January 2007.
1057
1058     # given max bit size and a list of aligned entries
1059     # in FASTA format, calculates for each sequence position
1060     # the height of the letters in bits.
1061     # returns a data structure with [ letter, height ] tuples
1062     # for all letters at each position.
1063
1064     my ( $bit_max,   # maximum bit height
1065          $entries,   # FASTA entries
1066        ) = @_;
1067
1068     # returns data structure
1069
1070     my ( $logo_len, $char_tot, $i, %char_hash, $bit_height, $bit_diff, $char_heights, @logo );
1071
1072     $logo_len = length $entries->[ 0 ]->[ 1 ];
1073     $char_tot = scalar @{ $entries };
1074
1075     for ( $i = 0; $i < $logo_len; $i++ )
1076     {
1077         undef %char_hash;
1078
1079         map { $char_hash{ uc substr( $_->[ 1 ], $i, 1 ) }++ } @{ $entries };
1080
1081         delete $char_hash{ "-" };
1082         delete $char_hash{ "_" };
1083         delete $char_hash{ "~" };
1084         delete $char_hash{ "." };
1085
1086         $bit_height = seqlogo_calc_bit_height( \%char_hash, $char_tot );
1087
1088         $bit_diff = $bit_max - $bit_height;
1089
1090         $char_heights = seqlogo_calc_char_heights( \%char_hash, $char_tot, $bit_diff );
1091
1092         push @logo, $char_heights;
1093     }
1094
1095     return wantarray ? @logo : \@logo;
1096 }
1097
1098
1099 sub seqlogo_calc_bit_height
1100 {
1101     # Martin A. Hansen, January 2007.
1102
1103     # calculates the bit height using Shannon's famous
1104     # general formula for uncertainty as documentet:
1105     # http://www.ccrnp.ncifcrf.gov/~toms/paper/hawaii/latex/node5.html
1106
1107     my ( $char_hash,   # hashref with chars and frequencies
1108          $tot,         # total number of chars
1109        ) = @_;
1110
1111     # returns float
1112
1113     my ( $char, $freq, $bit_height );
1114
1115     foreach $char ( keys %{ $char_hash } )
1116     {
1117         $freq = $char_hash->{ $char } / $tot;
1118
1119         $bit_height += $freq * ( log( $freq ) / log( 2 ) );
1120     }
1121
1122     $bit_height *= -1;
1123
1124     return $bit_height;
1125 }
1126
1127
1128 sub seqlogo_calc_char_heights
1129 {
1130     # Martin A. Hansen, January 2007.
1131
1132     # calculates the hight of each char in bits, and sorts
1133     # according to height.
1134
1135     my ( $char_hash,   # hashref with chars and frequencies
1136          $tot,         # tot number of chars
1137          $bit_diff,    # information gained from uncertainties
1138        ) = @_;
1139
1140     # returns list of tuples
1141
1142     my ( $char, $freq, $char_height, @char_heights );
1143
1144     foreach $char ( keys %{ $char_hash } )
1145     {
1146         $freq = $char_hash->{ $char } / $tot;
1147
1148         $char_height = $freq * $bit_diff;   # char height in bits
1149
1150         push @char_heights, [ $char, $char_height ];
1151     }
1152
1153     @char_heights = sort { $a->[ 1 ] <=> $b->[ 1 ] } @char_heights;
1154
1155     return wantarray ? @char_heights : \@char_heights;
1156 }
1157
1158
1159 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> RESIDUE COLORS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1160
1161
1162 sub color_pep
1163 {
1164     # Martin A. Hansen, October 2005.
1165
1166     # color scheme for proteins as defined in Mview.
1167     # given a char returns the appropriate color.
1168     # The amino acids are colored according to physicochemical properties:
1169     # bright green = hydrophobic; dark green = large hydrophobic;
1170     # bright blue = negative charge; red = positive charge;
1171     # dull blue = small alcohol; purple = polar; yellow = cysteine.
1172
1173     my ( $char,   # char to decorate
1174        ) = @_;
1175
1176     # returns string
1177
1178     my ( %hash, $color_set );
1179
1180     %hash = (
1181         K   => "bright-red",
1182         R   => "bright-red",
1183         H   => "dark-green",
1184         D   => "bright-blue",
1185         E   => "bright-blue",
1186         S   => "dull-blue",
1187         T   => "dull-blue",
1188         N   => "purple",
1189         Q   => "purple",
1190         A   => "bright-green",
1191         V   => "bright-green",
1192         I   => "bright-green",
1193         L   => "bright-green",
1194         M   => "bright-green",
1195         F   => "dark-green",
1196         Y   => "dark-green",
1197         W   => "dark-green",
1198         C   => "yellow",
1199         G   => "bright-green",
1200         P   => "bright-green",
1201         Z   => "dark-gray",
1202         B   => "dark-gray",
1203         "?" => "light-gray",
1204         "~" => "light-gray",
1205         "*" => "dark-gray",
1206     );
1207
1208     if ( exists $hash{ uc $char } ) {
1209         $color_set = $hash{ uc $char };
1210     } else {
1211         $color_set = "black";
1212     }
1213
1214     return $color_set;
1215 }
1216
1217
1218 sub color_nuc
1219 {
1220     # Martin A. Hansen, October 2005.
1221
1222     # color scheme for nucleotides as defined in Mview.
1223     # given a char returns the appropriate color
1224     # according to physical/chemical proterties.
1225
1226     my ( $char,   # char to decorate
1227        ) = @_;
1228
1229     # returns string
1230
1231     my ( %hash, $color_set );
1232
1233     %hash = (
1234         A => "bright-red",
1235         G => "yellow",
1236         C => "blue",
1237         T => "green",
1238         U => "green",
1239     );
1240
1241     if ( exists $hash{ uc $char } ) {
1242         $color_set = $hash{ uc $char };
1243     } else {
1244         $color_set = "black";
1245     }
1246
1247     return $color_set;
1248 }
1249
1250
1251 sub color_palette
1252 {
1253     # Martin A. Hansen, October 2005.
1254
1255     # hash table with color-names and color-hex.
1256
1257     my ( $color,   # common color name
1258        ) = @_;
1259
1260     # returns string
1261
1262     my ( %hash );
1263
1264     %hash = (
1265         "black"                => "#000000",
1266         "white"                => "#ffffff",
1267         "red"                  => "#ff0000",
1268         "green"                => "#00ff00",
1269         "blue"                 => "#0000ff",
1270         "cyan"                 => "#00ffff",
1271         "magenta"              => "#ff00ff",
1272 #        "yellow"               => "#ffff00",
1273         "yellow"               => "#ffc800",
1274         "purple"               => "#6600cc",
1275         "dull-blue"            => "#0099ff",
1276         "dark-green-blue"      => "#33cccc",
1277         "medium-green-blue"    => "#00ffcc",
1278         "bright-blue"          => "#0033ff",
1279         "dark-green"           => "#009900",
1280         "bright-green"         => "#33cc00",
1281         "orange"               => "#ff3333",
1282         "orange-brown"         => "#cc6600",
1283         "bright-red"           => "#cc0000",
1284         "light-gray"           => "#999999",
1285         "dark-gray"            => "#666666",
1286         "gray0"                => "#ffffff",
1287         "gray1"                => "#eeeeee",
1288         "gray2"                => "#dddddd",
1289         "gray3"                => "#cccccc",
1290         "gray4"                => "#bbbbbb",
1291         "gray5"                => "#aaaaaa",
1292         "gray6"                => "#999999",
1293         "gray7"                => "#888888",
1294         "gray8"                => "#777777",
1295         "gray9"                => "#666666",
1296         "gray10"               => "#555555",
1297         "gray11"               => "#444444",
1298         "gray12"               => "#333333",
1299         "gray13"               => "#222222",
1300         "gray14"               => "#111111",
1301         "gray15"               => "#000000",
1302         "clustal-red"          => "#ff1111",
1303         "clustal-blue"         => "#1155ff",
1304         "clustal-green"        => "#11dd11",
1305         "clustal-cyan"         => "#11ffff",
1306         "clustal-yellow"       => "#ffff11",
1307         "clustal-orange"       => "#ff7f11",
1308         "clustal-pink"         => "#ff11ff",
1309         "clustal-purple"       => "#6611cc",
1310         "clustal-dull-blue"    => "#197fe5",
1311         "clustal-dark-gray"    => "#666666",
1312         "clustal-light-gray"   => "#999999",
1313         "lin-A"                => "#90fe23",
1314         "lin-R"                => "#fe5e2d",
1315         "lin-N"                => "#2e3d2d",
1316         "lin-D"                => "#00903b",
1317         "lin-C"                => "#004baa",
1318         "lin-Q"                => "#864b00",
1319         "lin-E"                => "#3fa201",
1320         "lin-G"                => "#10fe68",
1321         "lin-H"                => "#b2063b",
1322         "lin-I"                => "#04ced9",
1323         "lin-L"                => "#4972fe",
1324         "lin-K"                => "#c4a100",
1325         "lin-M"                => "#2a84dd",
1326         "lin-F"                => "#a60ade",
1327         "lin-P"                => "#fe61fe",
1328         "lin-S"                => "#f7e847",
1329         "lin-T"                => "#fefeb3",
1330         "lin-W"                => "#4a007f",
1331         "lin-Y"                => "#e903a8",
1332         "lin-V"                => "#5bfdfd",
1333     );
1334
1335     if ( exists $hash{ $color } ) {
1336         return $hash{ $color };
1337     } else {
1338         print STDERR qq(WARNING: color "$color" not found in palette!\n);
1339     }
1340 }
1341
1342
1343 sub color_contrast
1344 {
1345     # Martin A. Hansen, October 2005.
1346
1347     # Hash table with contrast colors to be used for frontground
1348     # text on a given background color.
1349
1350     my ( $color,    # background color
1351        ) = @_;
1352
1353     # returns string
1354
1355     my ( %hash );
1356
1357     %hash = (
1358         "black"                => "white",
1359         "white"                => "black",
1360         "red"                  => "white",
1361         "green"                => "white",
1362         "blue"                 => "white",
1363         "cyan"                 => "white",
1364         "magenta"              => "white",
1365         "yellow"               => "black",
1366         "purple"               => "white",
1367         "dull-blue"            => "white",
1368         "dark-green-blue"      => "white",
1369         "medium-green-blue"    => "white",
1370         "bright-blue"          => "white",
1371         "dark-green"           => "white",
1372         "bright-green"         => "black",
1373         "orange"               => "",
1374         "orange-brown"         => "",
1375         "bright-red"           => "white",
1376         "light-gray"           => "black",
1377         "dark-gray"            => "white",
1378         "gray0"                => "",
1379         "gray1"                => "",
1380         "gray2"                => "",
1381         "gray3"                => "",
1382         "gray4"                => "",
1383         "gray5"                => "",
1384         "gray6"                => "",
1385         "gray7"                => "",
1386         "gray8"                => "",
1387         "gray9"                => "",
1388         "gray10"               => "",
1389         "gray11"               => "",
1390         "gray12"               => "",
1391         "gray13"               => "",
1392         "gray14"               => "",
1393         "gray15"               => "",
1394         "clustal-red"          => "black",
1395         "clustal-blue"         => "black",
1396         "clustal-green"        => "black",
1397         "clustal-cyan"         => "black",
1398         "clustal-yellow"       => "black",
1399         "clustal-orange"       => "black",
1400         "clustal-pink"         => "black",
1401         "clustal-purple"       => "black",
1402         "clustal-dull-blue"    => "black",
1403         "clustal-dark-gray"    => "black",
1404         "clustal-light-gray"   => "black",
1405         "lin-A"                => "",
1406         "lin-R"                => "",
1407         "lin-N"                => "",
1408         "lin-D"                => "",
1409         "lin-C"                => "",
1410         "lin-Q"                => "",
1411         "lin-E"                => "",
1412         "lin-G"                => "",
1413         "lin-H"                => "",
1414         "lin-I"                => "",
1415         "lin-L"                => "",
1416         "lin-K"                => "",
1417         "lin-M"                => "",
1418         "lin-F"                => "",
1419         "lin-P"                => "",
1420         "lin-S"                => "",
1421         "lin-T"                => "",
1422         "lin-W"                => "",
1423         "lin-Y"                => "",
1424         "lin-V"                => "",
1425     );
1426
1427     if ( exists $hash{ $color } ) {
1428         return $hash{ $color };
1429     } else {
1430         print STDERR qq(WARNING: color "$color" not found in palette!\n);
1431     }
1432 }
1433
1434
1435 sub seq_word_pack
1436 {
1437     # Martin A. Hansen, April 2008.
1438
1439     # Packs a sequence word into a binary number.
1440
1441     my ( $word,        # Word to be packed
1442        ) = @_;
1443
1444     # Returns integer.
1445
1446     my ( %hash, $bin, $word_size, $pad );
1447
1448     %hash = (
1449         'A' => '000',
1450         'T' => '001',
1451         'C' => '010',
1452         'G' => '100',
1453         'N' => '011',
1454         '-' => '101',
1455         '.' => '110',
1456         '~' => '111',
1457     );
1458
1459     map { $bin .= pack "B3", $hash{ $_ } } split //, $word;
1460
1461     $word_size = length $word;
1462
1463     $pad = ( 3 * $word_size ) / 8;
1464
1465     if ( $pad =~ /\./ )
1466     {
1467         $pad = ( ( int $pad + 1 ) * 8 ) - 3 * $word_size;
1468
1469         $bin .= pack "B$pad", 0 x $pad;
1470     }
1471
1472     return $bin;
1473 }
1474
1475
1476 sub seq_word_unpack
1477 {
1478     # Martin A. Hansen, April 2008.
1479
1480     # Unpacks a binary sequence word to ASCII.
1481
1482     my ( $bin,         # Binary sequence word
1483          $word_size,   # Size of word
1484        ) = @_;
1485
1486     # Returns string.
1487
1488     my ( %hash, $word );
1489
1490     %hash = (
1491         '000' => 'A',
1492         '001' => 'T',
1493         '010' => 'C',
1494         '100' => 'G',
1495         '011' => 'N',
1496         '101' => '-',
1497         '110' => '.',
1498         '111' => '~',
1499     );
1500
1501     map { $word .= $hash{ $_ } } unpack "(B3)$word_size", $bin;
1502
1503     return $word;
1504 }
1505
1506
1507 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> ADAPTOR LOCATING <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1508
1509
1510 ###############################    REDUNDANT   ##############################
1511
1512 # these functions have been replaced by index_m and match_m in Common.pm
1513
1514 sub find_adaptor
1515 {
1516     # Martin A. Hansen & Selene Fernandez, August 2008
1517
1518     # Locates an adaptor sequence in a given sequence
1519     # starting from a given offset position allowing a given
1520     # number of mismatches (no indels).
1521
1522     my ( $adaptor,       # list of chars
1523          $tag,           # list of chars
1524          $adaptor_len,   # length of adaptor sequence
1525          $tag_len,       # length of tag sequence
1526          $tag_offset,    # offset position
1527          $max_match,     # number of matches indicating success
1528          $max_mismatch,  # number of mismatches
1529        ) = @_;
1530
1531     # Returns an integer.
1532
1533     my ( $i, $j, $match, $mismatch );
1534
1535     $i = $tag_offset;
1536
1537     while ( $i < $tag_len - ( $max_match + $max_mismatch ) + 1 )
1538     {
1539         $j = 0;
1540
1541         while ( $j < $adaptor_len - ( $max_match + $max_mismatch ) + 1 )
1542         {
1543             return $i if check_diag( $adaptor, $tag, $adaptor_len, $tag_len, $j, $i, $max_match, $max_mismatch );
1544
1545             $j++
1546         }
1547
1548         $i++;
1549     }
1550
1551     return -1;
1552 }
1553
1554
1555 sub check_diag
1556 {
1557     # Martin A. Hansen & Selene Fernandez, August 2008
1558
1559     # Checks the diagonal starting at a given coordinate
1560     # of a search space constituted by an adaptor and a tag sequence.
1561     # Residues in the diagonal are compared between the sequences allowing
1562     # for a given number of mismatches. We terminate when search space is
1563     # exhausted or if max matches or mismatches is reached.
1564
1565     my ( $adaptor,       # list of chars
1566          $tag,           # list of chars
1567          $adaptor_len,   # length of adaptor sequence
1568          $tag_len,       # length of tag sequence
1569          $adaptor_beg,   # adaptor begin coordinate
1570          $tag_beg,       # tag begin coordinate
1571          $max_match,     # number of matches indicating success
1572          $max_mismatch,  # number of mismatches
1573        ) = @_;
1574
1575     # Returns boolean.
1576
1577     my ( $match, $mismatch );
1578
1579     $match    = 0;
1580     $mismatch = 0;
1581
1582     while ( $adaptor_beg <= $adaptor_len and $tag_beg <= $tag_len )
1583     {
1584         if ( $adaptor->[ $adaptor_beg ] eq $tag->[ $tag_beg ] )
1585         {
1586             $match++;
1587
1588             return 1 if $match >= $max_match;
1589         }
1590         else
1591         {
1592             $mismatch++;
1593
1594             return 0 if $mismatch > $max_mismatch;
1595         }
1596
1597         $adaptor_beg++;
1598         $tag_beg++;
1599     }
1600
1601     return 0;
1602 }
1603
1604
1605 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<