3 # Copyright (C) 2007 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
28 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
34 use List::Util qw( shuffle );
35 use Time::HiRes qw( gettimeofday );
37 use vars qw ( @ISA @EXPORT );
39 @ISA = qw( Exporter );
41 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
46 # Martin A. Hansen, May 2007.
48 # Makes a qualified guess on the type of a given squence.
50 my ( $seq, # sequence to check
55 my ( $check_seq, $count );
57 if ( length $seq > 100 ) {
58 $check_seq = substr $seq, 0, 100;
63 if ( $count = $check_seq =~ tr/FLPQIEflpqie// and $count > 0 ) {
65 } elsif ( $count = $check_seq =~ tr/Uu// and $count > 0 ) {
75 # Martin A. Hansen, July 2007.
77 # Wraps a given string reference accoring to given width.
79 my ( $strref, # ref to sting to wrap
85 ${ $strref } =~ s/(.{$wrap})/$1\n/g;
94 # modified Martin A. Hansen, March 2005.
96 # Returns the reverse complement of a dna sequence with preservation of case
97 # according to this mapping,
99 # AGCUTRYWSMKHDVBNagcutrywsmkhdvbn
100 # TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn
109 $seq =~ tr/AGCUTRYWSMKHDVBNagcutrywsmkhdvbn/TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn/;
118 # modified Martin A. Hansen, March 2005.
120 # Returns the complement of a rna sequence with preservation of case
121 # according to this mapping,
123 # AGCUTRYWSMKHDVBNagcutrywsmkhdvbn
124 # UCGAAYRWSKMDHBVNucgaayrwskmdhbvn
131 $seq =~ tr/AGCUTRYWSMKHDVBNagcutrywsmkhdvbn/UCGAAYRWSKMDHBVNucgaayrwskmdhbvn/;
140 # modified Martin A. Hansen, March 2005.
142 # Returns the reverse complement of a dna sequence with preservation of case
143 # according to this mapping,
145 # AGCUTRYWSMKHDVBNagcutrywsmkhdvbn
146 # TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn
148 my ( $seqref, # seqref
153 ${ $seqref } =~ tr/AGCUTRYWSMKHDVBNagcutrywsmkhdvbn/TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn/;
160 # modified Martin A. Hansen, March 2005.
162 # Returns the complement of a rna sequence with preservation of case
163 # according to this mapping,
165 # AGCUTRYWSMKHDVBNagcutrywsmkhdvbn
166 # UCGAAYRWSKMDHBVNucgaayrwskmdhbvn
168 my ( $seqref, # seqref
173 ${ $seqref } =~ tr/AGCUTRYWSMKHDVBNagcutrywsmkhdvbn/UCGAAYRWSKMDHBVNucgaayrwskmdhbvn/;
179 # Martin A. Hansen, March 2007
181 # Converts DNA sequence to RNA
183 my ( $seq, # nucleotide sequence
196 # Martin A. Hansen, March 2007
198 # Converts RNA sequence to DNA
200 my ( $seq, # nucleotide sequence
213 # Martin A. Hansen, March 2005.
215 # given a string of nucleotides
216 # returns the corresponding ambiguity code
219 $type, # DNA or RNA - DEFAULT DNA
222 my ( %hash, @nts, $key, $code, %nt_hash );
226 if ( not $type or $type =~ /dna/i )
255 @nts = split //, $str;
257 %nt_hash = map { $_ => 1 } @nts;
259 @nts = sort keys %nt_hash;
261 $key = join "", @nts;
289 $code = $hash{ $key };
291 warn qq(WARNING: No ambiguity code for key->$key\n) if not $code;
299 # Martin A. Hansen, March 2005.
301 # given an amino acid, returns a list of corresponding codons
303 my ( $aa, # amino acid to translate
308 my ( %hash, $codons );
313 'F' => [ 'TTT', 'TTC' ], # Phe
314 'L' => [ 'TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG' ], # Leu
315 'S' => [ 'TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC' ], # Ser
316 'Y' => [ 'TAT', 'TAC' ], # Tyr
317 '*' => [ 'TAA', 'TAG', 'TGA' ], # Stop
318 'X' => [ 'TAA', 'TAG', 'TGA' ], # Stop
319 'C' => [ 'TGT', 'TGC' ], # Cys
320 'W' => [ 'TGG' ], # Trp
321 'P' => [ 'CCT', 'CCC', 'CCA', 'CCG' ], # Pro
322 'H' => [ 'CAT', 'CAC' ], # His
323 'Q' => [ 'CAA', 'CAG' ], # Gln
324 'R' => [ 'CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG' ], # Arg
325 'I' => [ 'ATT', 'ATC', 'ATA' ], # Ile
326 'M' => [ 'ATG' ], # Met
327 'T' => [ 'ACT', 'ACC', 'ACA', 'ACG' ], # Thr
328 'N' => [ 'AAT', 'AAC' ], # Asn
329 'K' => [ 'AAA', 'AAG' ], # Lys
330 'V' => [ 'GTT', 'GTC', 'GTA', 'GTG' ], # Val
331 'A' => [ 'GCT', 'GCC', 'GCA', 'GCG' ], # Ala
332 'D' => [ 'GAT', 'GAC' ], # Asp
333 'E' => [ 'GAA', 'GAG' ], # Glu
334 'G' => [ 'GGT', 'GGC', 'GGA', 'GGG' ], # Gly
337 $codons = $hash{ $aa };
339 return wantarray ? @{ $codons } : $codons;
345 # Martin A. Hansen, March 2005.
347 # given a codon, returns the correponding
348 # vertebrate amino acid.
350 my ( $codon, # codon to translate
357 die qq(ERROR: Bad codon: "$codon"\n) if not $codon =~ /[ATCGatcg]{3}/;
426 $aa = $hash{ uc $codon };
434 # Martin A. Hansen, June 2005.
436 # translates a dna sequence to protein according to a optional given
439 my ( $dna, # dna sequence
440 $frame, # frame of translation - OPTIONAL
445 my ( $codon, $pos, $pep );
449 if ( $frame =~ /-?[1-3]/ )
452 $dna = Maasha::Seq::dna_revcomp( $dna );
455 $frame = abs( $frame ) - 1;
457 $dna =~ s/^.{${frame}}//;
461 Maasha::Common::error( qq(Badly formated frame "$frame") );
466 while ( $codon = substr $dna, $pos, 3 )
468 last if not length $codon == 3;
470 $pep .= codon2aa( $codon );
479 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> RNA FOLDING <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
482 sub fold_struct_rnafold
484 # Martin A. Hansen, February 2008.
486 # Given a squence fold this using RNAfold.
488 my ( $seq, # sequence to fold
491 # Returns a tuple of fold string and free energy.
493 my ( $pid, $fh_out, $fh_in, @lines, $struct, $energy );
495 $pid = open2( $fh_out, $fh_in, "RNAfold -noPS" );
497 Maasha::Fasta::put_entry( [ "RNAfold", $seq ], $fh_in );
509 if ( $lines[ - 1 ] =~ /^([^ ]+) \((.+)\)$/ )
515 return wantarray ? ( $struct, $energy ) : [ $struct, $energy ];
519 sub fold_struct_contrastruct
521 # Martin A. Hansen, February 2008.
523 # Given a sequence fold this using Contrafold.
525 my ( $seq, # sequence to fold
526 $tmp_dir, # temporary directory - OPTIONAL
529 # Returns a tuple of fold string and temp index.
531 my ( $tmp_file, $out_file1, $out_file2, $fh, $line, $struct, @AoA, $i, $temp, $index );
533 $tmp_dir ||= $ENV{ 'BP_TMP' };
535 $tmp_file = "$tmp_dir/fold.fna";
536 $out_file1 = "$tmp_dir/fold.out1";
537 $out_file2 = "$tmp_dir/fold.out2";
539 Maasha::Fasta::put_entries( [ [ "fold", $seq ] ], $tmp_file );
541 Maasha::Common::run( "contrafold", "predict --parens $out_file1 --bpseq $out_file2 $tmp_file" );
545 $fh = Maasha::Common::read_open( $out_file1 );
547 while ( $line = <$fh> )
558 $fh = Maasha::Common::read_open( $out_file2 );
560 while ( $line = <$fh> )
564 push @AoA, [ split " ", $line ];
571 for ( $i = 0; $i < @AoA; $i++ )
573 if ( $AoA[ $i ]->[ 2 ] != 0 )
575 last if $AoA[ $i ]->[ 0 ] > $AoA[ $i ]->[ 2 ];
577 $temp += base_pair_melting_temp( $AoA[ $i ]->[ 1 ] . $AoA[ $AoA[ $i ]->[ 2 ] - 1 ]->[ 1 ] );
581 $index = sprintf( "%.2f", $temp / length $seq );
583 return wantarray ? ( $struct, $index ) : [ $struct, $index ];
587 sub base_pair_melting_temp
589 # Martin A. Hansen, February 2008.
591 # Given a basepair, returns the melting temperature.
593 my ( $bp, # basepair string
628 return $melt_hash{ uc $bp };
632 sub generate_dna_oligos
634 # Martin A. Hansen, April 2007.
636 # Generates all possible DNA oligos of a given wordsize.
638 # alternative way: perl -MData::Dumper -e '@CONV = glob( "{T,C,A,G}" x 4 ); print Dumper( \@CONV )'
641 my ( $wordsize, # size of DNA oligos
646 my ( @alph, @oligos, $oligo, $char, @list );
648 @alph = ( qw( A T C G N ) );
651 for ( 1 .. $wordsize )
653 foreach $oligo ( @oligos )
655 foreach $char ( @alph ) {
656 push @list, $oligo . $char;
665 return wantarray ? @oligos : \@oligos;
671 # Martin A. Hansen, April 2007
673 # Given a sequence and a wordsize,
674 # breaks the sequence into overlapping
675 # oligos of that wordsize.
677 my ( $seq, # sequence reference
678 $wordsize, # wordsize
683 my ( $i, $oligo, @oligos );
685 for ( $i = 0; $i < length( ${ $seq } ) - $wordsize + 1; $i++ )
687 $oligo = substr ${ $seq }, $i, $wordsize;
689 push @oligos, $oligo;
692 return wantarray ? @oligos : \@oligos;
698 # Martin A. Hansen, April 2007
700 # Given a sequence and a wordsize,
701 # breaks the sequence into overlapping
702 # oligos of that wordsize and return
705 my ( $seq, # sequence reference
706 $wordsize, # wordsize
711 my ( $i, $oligo, %lookup, @oligos );
713 for ( $i = 0; $i < length( ${ $seq } ) - $wordsize + 1; $i++ )
715 $oligo = substr ${ $seq }, $i, $wordsize;
717 if ( not exists $lookup{ $oligo } )
719 push @oligos, $oligo;
720 $lookup{ $oligo } = 1;
724 return wantarray ? @oligos : \@oligos;
730 # Martin A. Hansen, August 2007.
732 # Given a hashref with oligo=>count, calculates
733 # a frequency table. Returns a list of hashes
735 my ( $oligo_freq, # hashref
738 # Returns data structure
740 my ( @freq_table, $total );
744 map { push @freq_table, { OLIGO => $_, COUNT => $oligo_freq->{ $_ } }; $total += $oligo_freq->{ $_ } } keys %{ $oligo_freq };
746 @freq_table = sort { $b->{ "COUNT" } <=> $a->{ "COUNT" } or $a->{ "OLIGO" } cmp $b->{ "OLIGO" } } @freq_table;
748 map { $_->{ "FREQ" } = sprintf( "%.4f", $_->{ "COUNT" } / $total ) } @freq_table;
750 return wantarray ? return @freq_table : \@freq_table;
756 # Martin A. Hansen, May 2007
758 # Generates a random sequence given a sequence length
761 my ( $len, # sequence length
762 $alph, # sequence alphabet
767 my ( $alph_len, $i, $seq );
769 $alph_len = scalar @{ $alph };
771 for ( $i = 0; $i < $len; $i++ ) {
772 $seq .= $alph->[ int( rand( $alph_len ) ) ];
781 # Martin A. Hansen, December 2007.
783 # Shuffles sequence of a given string.
785 my ( $seq, # sequence string
792 @list = split "", $seq;
794 return join "", shuffle( @list );
800 # Martin A. Hansen, May 2007.
802 # returns a requested alphabet
804 my ( $type, # alphabet type
811 if ( $type =~ /^dna$/i ) {
812 @alph = qw( A T C G );
813 } elsif ( $type =~ /^rna$/i ) {
814 @alph = qw( A U C G );
815 } elsif ( $type =~ /^prot/i ) {
816 @alph = qw( F L S Y C W P H Q R I M T N K V A D E G );
818 die qq(ERROR: Unknown alphabet type: "$type"\n);
821 return wantarray ? @alph : \@alph;
827 # Martin A. Hansen, August 2007.
829 # Analyses the sequence composition of a given sequence.
831 my ( $seq, # sequence to analyze
836 my ( %analysis, @chars, @chars_lc, $char, %char_hash, $gc, $at, $lc, $max, $res_sum, @indels, %indel_hash );
838 $analysis{ "SEQ_TYPE" } = uc Maasha::Seq::seq_guess_type( $seq );
839 $analysis{ "SEQ_LEN" } = length $seq;
841 @indels = qw( - ~ . _ );
843 if ( $analysis{ "SEQ_TYPE" } eq "DNA" )
845 @chars = split //, "AGCUTRYWSMKHDVBNagcutrywsmkhdvbn";
846 @chars_lc = split //, "agcutrywsmkhdvbn";
848 elsif ( $analysis{ "SEQ_TYPE" } eq "RNA" )
850 @chars = split //, "AGCUTRYWSMKHDVBNagcutrywsmkhdvbn";
851 @chars_lc = split //, "agcutrywsmkhdvbn";
855 @chars = split //, "FLSYCWPHQRIMTNKVADEGflsycwphqrimtnkvadeg";
856 @chars_lc = split //, "flsycwphqrimtnkvadeg";
859 @char_hash{ @chars } = map { eval "scalar \$seq =~ tr/$_//" } @chars;
860 @indel_hash{ @indels } = map { eval "scalar \$seq =~ tr/$_//" } @indels;
862 if ( $analysis{ "SEQ_TYPE" } =~ /DNA|RNA/ )
864 $gc = $char_hash{ "g" } + $char_hash{ "G" } + $char_hash{ "c" } + $char_hash{ "C" };
865 $at = $char_hash{ "a" } + $char_hash{ "A" } + $char_hash{ "t" } + $char_hash{ "T" } + $char_hash{ "u" } + $char_hash{ "U" };
867 $analysis{ "GC%" } = sprintf( "%.2f", 100 * $gc / $analysis{ "SEQ_LEN" } );
869 map { $lc += $char_hash{ lc $_ } } @chars_lc;
871 $analysis{ "SOFT_MASK%" } = sprintf( "%.2f", 100 * $lc / $analysis{ "SEQ_LEN" } );
872 $analysis{ "HARD_MASK%" } = sprintf( "%.2f", 100 * ( $char_hash{ "n" } + $char_hash{ "N" } ) / $analysis{ "SEQ_LEN" } );
877 foreach $char ( @chars_lc )
881 $char_hash{ $char } += $char_hash{ lc $char };
883 $analysis{ "RES:$char" } = $char_hash{ $char };
885 $max = $char_hash{ $char } if $char_hash{ $char } > $max;
887 $analysis{ "RES_SUM" } += $char_hash{ $char };
890 map { $analysis{ "RES:$_" } = $indel_hash{ $_ } } @indels;
892 $analysis{ "MIX_INDEX" } = sprintf( "%.2f", $max / $analysis{ "SEQ_LEN" } );
893 $analysis{ "MELT_TEMP" } = sprintf( "%.2f", 4 * $gc + 2 * $at );
895 return wantarray ? %analysis : \%analysis;
901 # Martin A. Hansen, May 2008.
903 # Given a sequence computes a complexity index
904 # as the most common di-residue over
905 # the sequence length. Return ~1 if the entire
906 # sequence is homopolymeric. Above 0.4 indicates
907 # low complexity sequence.
909 my ( $seq, # sequence
914 my ( $len, $i, $max, $di, %hash );
920 for ( $i = 0; $i < $len - 1; $i++ ) {
921 $hash{ substr $seq, $i, 2 }++;
924 foreach $di ( keys %hash ) {
925 $max = $hash{ $di } if $hash{ $di } > $max;
932 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SEQLOGO <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
937 # Martin A. Hansen, January 2007.
939 # given max bit size and a list of aligned entries
940 # in FASTA format, calculates for each sequence position
941 # the height of the letters in bits.
942 # returns a data structure with [ letter, height ] tuples
943 # for all letters at each position.
945 my ( $bit_max, # maximum bit height
946 $entries, # FASTA entries
949 # returns data structure
951 my ( $logo_len, $char_tot, $i, %char_hash, $bit_height, $bit_diff, $char_heights, @logo );
953 $logo_len = length $entries->[ 0 ]->[ 1 ];
954 $char_tot = scalar @{ $entries };
956 for ( $i = 0; $i < $logo_len; $i++ )
960 map { $char_hash{ uc substr( $_->[ 1 ], $i, 1 ) }++ } @{ $entries };
962 delete $char_hash{ "-" };
963 delete $char_hash{ "_" };
964 delete $char_hash{ "~" };
965 delete $char_hash{ "." };
967 $bit_height = seqlogo_calc_bit_height( \%char_hash, $char_tot );
969 $bit_diff = $bit_max - $bit_height;
971 $char_heights = seqlogo_calc_char_heights( \%char_hash, $char_tot, $bit_diff );
973 push @logo, $char_heights;
976 return wantarray ? @logo : \@logo;
980 sub seqlogo_calc_bit_height
982 # Martin A. Hansen, January 2007.
984 # calculates the bit height using Shannon's famous
985 # general formula for uncertainty as documentet:
986 # http://www.ccrnp.ncifcrf.gov/~toms/paper/hawaii/latex/node5.html
988 my ( $char_hash, # hashref with chars and frequencies
989 $tot, # total number of chars
994 my ( $char, $freq, $bit_height );
996 foreach $char ( keys %{ $char_hash } )
998 $freq = $char_hash->{ $char } / $tot;
1000 $bit_height += $freq * ( log( $freq ) / log( 2 ) );
1009 sub seqlogo_calc_char_heights
1011 # Martin A. Hansen, January 2007.
1013 # calculates the hight of each char in bits, and sorts
1014 # according to height.
1016 my ( $char_hash, # hashref with chars and frequencies
1017 $tot, # tot number of chars
1018 $bit_diff, # information gained from uncertainties
1021 # returns list of tuples
1023 my ( $char, $freq, $char_height, @char_heights );
1025 foreach $char ( keys %{ $char_hash } )
1027 $freq = $char_hash->{ $char } / $tot;
1029 $char_height = $freq * $bit_diff; # char height in bits
1031 push @char_heights, [ $char, $char_height ];
1034 @char_heights = sort { $a->[ 1 ] <=> $b->[ 1 ] } @char_heights;
1036 return wantarray ? @char_heights : \@char_heights;
1040 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> RESIDUE COLORS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1045 # Martin A. Hansen, October 2005.
1047 # color scheme for proteins as defined in Mview.
1048 # given a char returns the appropriate color.
1049 # The amino acids are colored according to physicochemical properties:
1050 # bright green = hydrophobic; dark green = large hydrophobic;
1051 # bright blue = negative charge; red = positive charge;
1052 # dull blue = small alcohol; purple = polar; yellow = cysteine.
1054 my ( $char, # char to decorate
1059 my ( %hash, $color_set );
1071 A => "bright-green",
1072 V => "bright-green",
1073 I => "bright-green",
1074 L => "bright-green",
1075 M => "bright-green",
1080 G => "bright-green",
1081 P => "bright-green",
1084 "?" => "light-gray",
1085 "~" => "light-gray",
1089 if ( exists $hash{ uc $char } ) {
1090 $color_set = $hash{ uc $char };
1092 $color_set = "black";
1101 # Martin A. Hansen, October 2005.
1103 # color scheme for nucleotides as defined in Mview.
1104 # given a char returns the appropriate color
1105 # according to physical/chemical proterties.
1107 my ( $char, # char to decorate
1112 my ( %hash, $color_set );
1122 if ( exists $hash{ uc $char } ) {
1123 $color_set = $hash{ uc $char };
1125 $color_set = "black";
1134 # Martin A. Hansen, October 2005.
1136 # hash table with color-names and color-hex.
1138 my ( $color, # common color name
1146 "black" => "#000000",
1147 "white" => "#ffffff",
1149 "green" => "#00ff00",
1150 "blue" => "#0000ff",
1151 "cyan" => "#00ffff",
1152 "magenta" => "#ff00ff",
1153 # "yellow" => "#ffff00",
1154 "yellow" => "#ffc800",
1155 "purple" => "#6600cc",
1156 "dull-blue" => "#0099ff",
1157 "dark-green-blue" => "#33cccc",
1158 "medium-green-blue" => "#00ffcc",
1159 "bright-blue" => "#0033ff",
1160 "dark-green" => "#009900",
1161 "bright-green" => "#33cc00",
1162 "orange" => "#ff3333",
1163 "orange-brown" => "#cc6600",
1164 "bright-red" => "#cc0000",
1165 "light-gray" => "#999999",
1166 "dark-gray" => "#666666",
1167 "gray0" => "#ffffff",
1168 "gray1" => "#eeeeee",
1169 "gray2" => "#dddddd",
1170 "gray3" => "#cccccc",
1171 "gray4" => "#bbbbbb",
1172 "gray5" => "#aaaaaa",
1173 "gray6" => "#999999",
1174 "gray7" => "#888888",
1175 "gray8" => "#777777",
1176 "gray9" => "#666666",
1177 "gray10" => "#555555",
1178 "gray11" => "#444444",
1179 "gray12" => "#333333",
1180 "gray13" => "#222222",
1181 "gray14" => "#111111",
1182 "gray15" => "#000000",
1183 "clustal-red" => "#ff1111",
1184 "clustal-blue" => "#1155ff",
1185 "clustal-green" => "#11dd11",
1186 "clustal-cyan" => "#11ffff",
1187 "clustal-yellow" => "#ffff11",
1188 "clustal-orange" => "#ff7f11",
1189 "clustal-pink" => "#ff11ff",
1190 "clustal-purple" => "#6611cc",
1191 "clustal-dull-blue" => "#197fe5",
1192 "clustal-dark-gray" => "#666666",
1193 "clustal-light-gray" => "#999999",
1194 "lin-A" => "#90fe23",
1195 "lin-R" => "#fe5e2d",
1196 "lin-N" => "#2e3d2d",
1197 "lin-D" => "#00903b",
1198 "lin-C" => "#004baa",
1199 "lin-Q" => "#864b00",
1200 "lin-E" => "#3fa201",
1201 "lin-G" => "#10fe68",
1202 "lin-H" => "#b2063b",
1203 "lin-I" => "#04ced9",
1204 "lin-L" => "#4972fe",
1205 "lin-K" => "#c4a100",
1206 "lin-M" => "#2a84dd",
1207 "lin-F" => "#a60ade",
1208 "lin-P" => "#fe61fe",
1209 "lin-S" => "#f7e847",
1210 "lin-T" => "#fefeb3",
1211 "lin-W" => "#4a007f",
1212 "lin-Y" => "#e903a8",
1213 "lin-V" => "#5bfdfd",
1216 if ( exists $hash{ $color } ) {
1217 return $hash{ $color };
1219 print STDERR qq(WARNING: color "$color" not found in palette!\n);
1226 # Martin A. Hansen, October 2005.
1228 # Hash table with contrast colors to be used for frontground
1229 # text on a given background color.
1231 my ( $color, # background color
1245 "magenta" => "white",
1246 "yellow" => "black",
1247 "purple" => "white",
1248 "dull-blue" => "white",
1249 "dark-green-blue" => "white",
1250 "medium-green-blue" => "white",
1251 "bright-blue" => "white",
1252 "dark-green" => "white",
1253 "bright-green" => "black",
1255 "orange-brown" => "",
1256 "bright-red" => "white",
1257 "light-gray" => "black",
1258 "dark-gray" => "white",
1275 "clustal-red" => "black",
1276 "clustal-blue" => "black",
1277 "clustal-green" => "black",
1278 "clustal-cyan" => "black",
1279 "clustal-yellow" => "black",
1280 "clustal-orange" => "black",
1281 "clustal-pink" => "black",
1282 "clustal-purple" => "black",
1283 "clustal-dull-blue" => "black",
1284 "clustal-dark-gray" => "black",
1285 "clustal-light-gray" => "black",
1308 if ( exists $hash{ $color } ) {
1309 return $hash{ $color };
1311 print STDERR qq(WARNING: color "$color" not found in palette!\n);
1318 # Martin A. Hansen, April 2008.
1320 # Packs a sequence word into a binary number.
1322 my ( $word, # Word to be packed
1327 my ( %hash, $bin, $word_size, $pad );
1340 map { $bin .= pack "B3", $hash{ $_ } } split //, $word;
1342 $word_size = length $word;
1344 $pad = ( 3 * $word_size ) / 8;
1348 $pad = ( ( int $pad + 1 ) * 8 ) - 3 * $word_size;
1350 $bin .= pack "B$pad", 0 x $pad;
1359 # Martin A. Hansen, April 2008.
1361 # Unpacks a binary sequence word to ASCII.
1363 my ( $bin, # Binary sequence word
1364 $word_size, # Size of word
1369 my ( %hash, $word );
1382 map { $word .= $hash{ $_ } } unpack "(B3)$word_size", $bin;
1388 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> ADAPTOR LOCATING <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1391 ############################### REDUNDANT ##############################
1393 # these functions have been replaced by index_m and match_m in Common.pm
1397 # Martin A. Hansen & Selene Fernandez, August 2008
1399 # Locates an adaptor sequence in a given sequence
1400 # starting from a given offset position allowing a given
1401 # number of mismatches (no indels).
1403 my ( $adaptor, # list of chars
1404 $tag, # list of chars
1405 $adaptor_len, # length of adaptor sequence
1406 $tag_len, # length of tag sequence
1407 $tag_offset, # offset position
1408 $max_match, # number of matches indicating success
1409 $max_mismatch, # number of mismatches
1412 # Returns an integer.
1414 my ( $i, $j, $match, $mismatch );
1418 while ( $i < $tag_len - ( $max_match + $max_mismatch ) + 1 )
1422 while ( $j < $adaptor_len - ( $max_match + $max_mismatch ) + 1 )
1424 return $i if check_diag( $adaptor, $tag, $adaptor_len, $tag_len, $j, $i, $max_match, $max_mismatch );
1438 # Martin A. Hansen & Selene Fernandez, August 2008
1440 # Checks the diagonal starting at a given coordinate
1441 # of a search space constituted by an adaptor and a tag sequence.
1442 # Residues in the diagonal are compared between the sequences allowing
1443 # for a given number of mismatches. We terminate when search space is
1444 # exhausted or if max matches or mismatches is reached.
1446 my ( $adaptor, # list of chars
1447 $tag, # list of chars
1448 $adaptor_len, # length of adaptor sequence
1449 $tag_len, # length of tag sequence
1450 $adaptor_beg, # adaptor begin coordinate
1451 $tag_beg, # tag begin coordinate
1452 $max_match, # number of matches indicating success
1453 $max_mismatch, # number of mismatches
1458 my ( $match, $mismatch );
1463 while ( $adaptor_beg <= $adaptor_len and $tag_beg <= $tag_len )
1465 if ( $adaptor->[ $adaptor_beg ] eq $tag->[ $tag_beg ] )
1469 return 1 if $match >= $max_match;
1475 return 0 if $mismatch > $max_mismatch;
1486 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<