3 # Copyright (C) 2007 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
28 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
34 use List::Util qw( shuffle );
35 use Time::HiRes qw( gettimeofday );
37 use vars qw ( @ISA @EXPORT );
39 @ISA = qw( Exporter );
42 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
47 # Martin A. Hansen, May 2007.
49 # Makes a qualified guess on the type of a given squence.
51 my ( $seq, # sequence to check
56 my ( $check_seq, $count );
58 if ( length $seq > 100 ) {
59 $check_seq = substr $seq, 0, 100;
64 if ( $count = $check_seq =~ tr/FLPQIEflpqie// and $count > 0 ) {
66 } elsif ( $count = $check_seq =~ tr/Uu// and $count > 0 ) {
76 # Martin A. Hansen, July 2007.
78 # Wraps a given string reference accoring to given width.
80 my ( $strref, # ref to sting to wrap
86 ${ $strref } =~ s/(.{$wrap})/$1\n/g;
95 # modified Martin A. Hansen, March 2005.
97 # Returns the reverse complement of a dna sequence with preservation of case
98 # according to this mapping,
100 # AGCUTRYWSMKHDVBNagcutrywsmkhdvbn
101 # TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn
110 $seq =~ tr/AGCUTRYWSMKHDVBNagcutrywsmkhdvbn/TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn/;
119 # modified Martin A. Hansen, March 2005.
121 # Returns the complement of a rna sequence with preservation of case
122 # according to this mapping,
124 # AGCUTRYWSMKHDVBNagcutrywsmkhdvbn
125 # UCGAAYRWSKMDHBVNucgaayrwskmdhbvn
132 $seq =~ tr/AGCUTRYWSMKHDVBNagcutrywsmkhdvbn/UCGAAYRWSKMDHBVNucgaayrwskmdhbvn/;
141 # modified Martin A. Hansen, March 2005.
143 # Returns the reverse complement of a dna sequence with preservation of case
144 # according to this mapping,
146 # AGCUTRYWSMKHDVBNagcutrywsmkhdvbn
147 # TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn
149 my ( $seqref, # seqref
154 ${ $seqref } =~ tr/AGCUTRYWSMKHDVBNagcutrywsmkhdvbn/TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn/;
161 # modified Martin A. Hansen, March 2005.
163 # Returns the complement of a rna sequence with preservation of case
164 # according to this mapping,
166 # AGCUTRYWSMKHDVBNagcutrywsmkhdvbn
167 # UCGAAYRWSKMDHBVNucgaayrwskmdhbvn
169 my ( $seqref, # seqref
174 ${ $seqref } =~ tr/AGCUTRYWSMKHDVBNagcutrywsmkhdvbn/UCGAAYRWSKMDHBVNucgaayrwskmdhbvn/;
180 # Martin A. Hansen, March 2007
182 # Converts DNA sequence to RNA
184 my ( $seq, # nucleotide sequence
197 # Martin A. Hansen, March 2007
199 # Converts RNA sequence to DNA
201 my ( $seq, # nucleotide sequence
214 # Martin A. Hansen, March 2005.
216 # given a string of nucleotides
217 # returns the corresponding ambiguity code
220 $type, # DNA or RNA - DEFAULT DNA
223 my ( %hash, @nts, $key, $code, %nt_hash );
227 if ( not $type or $type =~ /dna/i )
256 @nts = split //, $str;
258 %nt_hash = map { $_ => 1 } @nts;
260 @nts = sort keys %nt_hash;
262 $key = join "", @nts;
290 $code = $hash{ $key };
292 warn qq(WARNING: No ambiguity code for key->$key\n) if not $code;
300 # Martin A. Hansen, March 2005.
302 # given an amino acid, returns a list of corresponding codons
304 my ( $aa, # amino acid to translate
309 my ( %hash, $codons );
314 'F' => [ 'TTT', 'TTC' ], # Phe
315 'L' => [ 'TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG' ], # Leu
316 'S' => [ 'TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC' ], # Ser
317 'Y' => [ 'TAT', 'TAC' ], # Tyr
318 '*' => [ 'TAA', 'TAG', 'TGA' ], # Stop
319 'X' => [ 'TAA', 'TAG', 'TGA' ], # Stop
320 'C' => [ 'TGT', 'TGC' ], # Cys
321 'W' => [ 'TGG' ], # Trp
322 'P' => [ 'CCT', 'CCC', 'CCA', 'CCG' ], # Pro
323 'H' => [ 'CAT', 'CAC' ], # His
324 'Q' => [ 'CAA', 'CAG' ], # Gln
325 'R' => [ 'CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG' ], # Arg
326 'I' => [ 'ATT', 'ATC', 'ATA' ], # Ile
327 'M' => [ 'ATG' ], # Met
328 'T' => [ 'ACT', 'ACC', 'ACA', 'ACG' ], # Thr
329 'N' => [ 'AAT', 'AAC' ], # Asn
330 'K' => [ 'AAA', 'AAG' ], # Lys
331 'V' => [ 'GTT', 'GTC', 'GTA', 'GTG' ], # Val
332 'A' => [ 'GCT', 'GCC', 'GCA', 'GCG' ], # Ala
333 'D' => [ 'GAT', 'GAC' ], # Asp
334 'E' => [ 'GAA', 'GAG' ], # Glu
335 'G' => [ 'GGT', 'GGC', 'GGA', 'GGG' ], # Gly
338 $codons = $hash{ $aa };
340 return wantarray ? @{ $codons } : $codons;
346 # Martin A. Hansen, March 2005.
348 # given a codon, returns the correponding
349 # vertebrate amino acid.
351 my ( $codon, # codon to translate
358 die qq(ERROR: Bad codon: "$codon"\n) if not $codon =~ /[ATCGatcg]{3}/;
427 $aa = $hash{ uc $codon };
435 # Martin A. Hansen, June 2005.
437 # translates a dna sequence to protein according to a optional given
440 my ( $dna, # dna sequence
441 $frame, # frame of translation - OPTIONAL
446 my ( $codon, $pos, $pep );
450 if ( $frame =~ /-?[1-3]/ )
453 $dna = Maasha::Seq::dna_revcomp( $dna );
456 $frame = abs( $frame ) - 1;
458 $dna =~ s/^.{${frame}}//;
462 Maasha::Common::error( qq(Badly formated frame "$frame") );
467 while ( $codon = substr $dna, $pos, 3 )
469 last if not length $codon == 3;
471 $pep .= codon2aa( $codon );
480 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> RNA FOLDING <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
483 sub fold_struct_rnafold
485 # Martin A. Hansen, February 2008.
487 # Given a squence fold this using RNAfold.
489 my ( $seq, # sequence to fold
492 # Returns a tuple of fold string and free energy.
494 my ( $pid, $fh_out, $fh_in, @lines, $struct, $energy );
496 $pid = open2( $fh_out, $fh_in, "RNAfold -noPS" );
498 Maasha::Fasta::put_entry( [ "RNAfold", $seq ], $fh_in );
510 if ( $lines[ - 1 ] =~ /^([^ ]+) \((.+)\)$/ )
516 return wantarray ? ( $struct, $energy ) : [ $struct, $energy ];
520 sub fold_struct_contrastruct
522 # Martin A. Hansen, February 2008.
524 # Given a sequence fold this using Contrafold.
526 my ( $seq, # sequence to fold
527 $tmp_dir, # temporary directory - OPTIONAL
530 # Returns a tuple of fold string and temp index.
532 my ( $tmp_file, $out_file1, $out_file2, $fh, $line, $struct, @AoA, $i, $temp, $index );
534 $tmp_dir ||= $ENV{ 'BP_TMP' };
536 $tmp_file = "$tmp_dir/fold.fna";
537 $out_file1 = "$tmp_dir/fold.out1";
538 $out_file2 = "$tmp_dir/fold.out2";
540 Maasha::Fasta::put_entries( [ [ "fold", $seq ] ], $tmp_file );
542 Maasha::Common::run( "contrafold", "predict --parens $out_file1 --bpseq $out_file2 $tmp_file" );
546 $fh = Maasha::Common::read_open( $out_file1 );
548 while ( $line = <$fh> )
559 $fh = Maasha::Common::read_open( $out_file2 );
561 while ( $line = <$fh> )
565 push @AoA, [ split " ", $line ];
572 for ( $i = 0; $i < @AoA; $i++ )
574 if ( $AoA[ $i ]->[ 2 ] != 0 )
576 last if $AoA[ $i ]->[ 0 ] > $AoA[ $i ]->[ 2 ];
578 $temp += base_pair_melting_temp( $AoA[ $i ]->[ 1 ] . $AoA[ $AoA[ $i ]->[ 2 ] - 1 ]->[ 1 ] );
582 $index = sprintf( "%.2f", $temp / length $seq );
584 return wantarray ? ( $struct, $index ) : [ $struct, $index ];
588 sub base_pair_melting_temp
590 # Martin A. Hansen, February 2008.
592 # Given a basepair, returns the melting temperature.
594 my ( $bp, # basepair string
629 return $melt_hash{ uc $bp };
633 sub generate_dna_oligos
635 # Martin A. Hansen, April 2007.
637 # Generates all possible DNA oligos of a given wordsize.
639 # alternative way: perl -MData::Dumper -e '@CONV = glob( "{T,C,A,G}" x 4 ); print Dumper( \@CONV )'
642 my ( $wordsize, # size of DNA oligos
647 my ( @alph, @oligos, $oligo, $char, @list );
649 @alph = ( qw( A T C G N ) );
652 for ( 1 .. $wordsize )
654 foreach $oligo ( @oligos )
656 foreach $char ( @alph ) {
657 push @list, $oligo . $char;
666 return wantarray ? @oligos : \@oligos;
672 # Martin A. Hansen, April 2007
674 # Given a sequence and a wordsize,
675 # breaks the sequence into overlapping
676 # oligos of that wordsize.
678 my ( $seq, # sequence reference
679 $wordsize, # wordsize
684 my ( $i, $oligo, @oligos );
686 for ( $i = 0; $i < length( ${ $seq } ) - $wordsize + 1; $i++ )
688 $oligo = substr ${ $seq }, $i, $wordsize;
690 push @oligos, $oligo;
693 return wantarray ? @oligos : \@oligos;
699 # Martin A. Hansen, April 2007
701 # Given a sequence and a wordsize,
702 # breaks the sequence into overlapping
703 # oligos of that wordsize and return
706 my ( $seq, # sequence reference
707 $wordsize, # wordsize
712 my ( $i, $oligo, %lookup, @oligos );
714 for ( $i = 0; $i < length( ${ $seq } ) - $wordsize + 1; $i++ )
716 $oligo = substr ${ $seq }, $i, $wordsize;
718 if ( not exists $lookup{ $oligo } )
720 push @oligos, $oligo;
721 $lookup{ $oligo } = 1;
725 return wantarray ? @oligos : \@oligos;
731 # Martin A. Hansen, August 2007.
733 # Given a hashref with oligo=>count, calculates
734 # a frequency table. Returns a list of hashes
736 my ( $oligo_freq, # hashref
739 # Returns data structure
741 my ( @freq_table, $total );
745 map { push @freq_table, { OLIGO => $_, COUNT => $oligo_freq->{ $_ } }; $total += $oligo_freq->{ $_ } } keys %{ $oligo_freq };
747 @freq_table = sort { $b->{ "COUNT" } <=> $a->{ "COUNT" } or $a->{ "OLIGO" } cmp $b->{ "OLIGO" } } @freq_table;
749 map { $_->{ "FREQ" } = sprintf( "%.4f", $_->{ "COUNT" } / $total ) } @freq_table;
751 return wantarray ? return @freq_table : \@freq_table;
757 # Martin A. Hansen, May 2007
759 # Generates a random sequence given a sequence length
762 my ( $len, # sequence length
763 $alph, # sequence alphabet
768 my ( $alph_len, $i, $seq );
770 $alph_len = scalar @{ $alph };
772 for ( $i = 0; $i < $len; $i++ ) {
773 $seq .= $alph->[ int( rand( $alph_len ) ) ];
782 # Martin A. Hansen, December 2007.
784 # Shuffles sequence of a given string.
786 my ( $seq, # sequence string
793 @list = split "", $seq;
795 return join "", shuffle( @list );
801 # Martin A. Hansen, May 2007.
803 # returns a requested alphabet
805 my ( $type, # alphabet type
812 if ( $type =~ /^dna$/i ) {
813 @alph = qw( A T C G );
814 } elsif ( $type =~ /^rna$/i ) {
815 @alph = qw( A U C G );
816 } elsif ( $type =~ /^prot/i ) {
817 @alph = qw( F L S Y C W P H Q R I M T N K V A D E G );
819 die qq(ERROR: Unknown alphabet type: "$type"\n);
822 return wantarray ? @alph : \@alph;
828 # Martin A. Hansen, August 2007.
830 # Analyses the sequence composition of a given sequence.
832 my ( $seq, # sequence to analyze
837 my ( %analysis, @chars, @chars_lc, $char, %char_hash, $gc, $at, $lc, $max, $res_sum, @indels, %indel_hash );
839 $analysis{ "SEQ_TYPE" } = uc Maasha::Seq::seq_guess_type( $seq );
840 $analysis{ "SEQ_LEN" } = length $seq;
842 @indels = qw( - ~ . _ );
844 if ( $analysis{ "SEQ_TYPE" } eq "DNA" )
846 @chars = split //, "AGCUTRYWSMKHDVBNagcutrywsmkhdvbn";
847 @chars_lc = split //, "agcutrywsmkhdvbn";
849 elsif ( $analysis{ "SEQ_TYPE" } eq "RNA" )
851 @chars = split //, "AGCUTRYWSMKHDVBNagcutrywsmkhdvbn";
852 @chars_lc = split //, "agcutrywsmkhdvbn";
856 @chars = split //, "FLSYCWPHQRIMTNKVADEGflsycwphqrimtnkvadeg";
857 @chars_lc = split //, "flsycwphqrimtnkvadeg";
860 @char_hash{ @chars } = map { eval "scalar \$seq =~ tr/$_//" } @chars;
861 @indel_hash{ @indels } = map { eval "scalar \$seq =~ tr/$_//" } @indels;
863 if ( $analysis{ "SEQ_TYPE" } =~ /DNA|RNA/ )
865 $gc = $char_hash{ "g" } + $char_hash{ "G" } + $char_hash{ "c" } + $char_hash{ "C" };
866 $at = $char_hash{ "a" } + $char_hash{ "A" } + $char_hash{ "t" } + $char_hash{ "T" } + $char_hash{ "u" } + $char_hash{ "U" };
868 $analysis{ "GC%" } = sprintf( "%.2f", 100 * $gc / $analysis{ "SEQ_LEN" } );
870 map { $lc += $char_hash{ lc $_ } } @chars_lc;
872 $analysis{ "SOFT_MASK%" } = sprintf( "%.2f", 100 * $lc / $analysis{ "SEQ_LEN" } );
873 $analysis{ "HARD_MASK%" } = sprintf( "%.2f", 100 * ( $char_hash{ "n" } + $char_hash{ "N" } ) / $analysis{ "SEQ_LEN" } );
878 foreach $char ( @chars_lc )
882 $char_hash{ $char } += $char_hash{ lc $char };
884 $analysis{ "RES:$char" } = $char_hash{ $char };
886 $max = $char_hash{ $char } if $char_hash{ $char } > $max;
888 $analysis{ "RES_SUM" } += $char_hash{ $char };
891 map { $analysis{ "RES:$_" } = $indel_hash{ $_ } } @indels;
893 $analysis{ "MIX_INDEX" } = sprintf( "%.2f", $max / $analysis{ "SEQ_LEN" } );
894 $analysis{ "MELT_TEMP" } = sprintf( "%.2f", 4 * $gc + 2 * $at );
896 return wantarray ? %analysis : \%analysis;
902 # Martin A. Hansen, May 2008.
904 # Given a sequence computes a complexity index
905 # as the most common di-residue over
906 # the sequence length. Return ~1 if the entire
907 # sequence is homopolymeric. Above 0.4 indicates
908 # low complexity sequence.
910 my ( $seq, # sequence
915 my ( $len, $i, $max, $di, %hash );
921 for ( $i = 0; $i < $len - 1; $i++ ) {
922 $hash{ substr $seq, $i, 2 }++;
925 foreach $di ( keys %hash ) {
926 $max = $hash{ $di } if $hash{ $di } > $max;
933 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SEQLOGO <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
938 # Martin A. Hansen, January 2007.
940 # given max bit size and a list of aligned entries
941 # in FASTA format, calculates for each sequence position
942 # the height of the letters in bits.
943 # returns a data structure with [ letter, height ] tuples
944 # for all letters at each position.
946 my ( $bit_max, # maximum bit height
947 $entries, # FASTA entries
950 # returns data structure
952 my ( $logo_len, $char_tot, $i, %char_hash, $bit_height, $bit_diff, $char_heights, @logo );
954 $logo_len = length $entries->[ 0 ]->[ 1 ];
955 $char_tot = scalar @{ $entries };
957 for ( $i = 0; $i < $logo_len; $i++ )
961 map { $char_hash{ uc substr( $_->[ 1 ], $i, 1 ) }++ } @{ $entries };
963 delete $char_hash{ "-" };
964 delete $char_hash{ "_" };
965 delete $char_hash{ "~" };
966 delete $char_hash{ "." };
968 $bit_height = seqlogo_calc_bit_height( \%char_hash, $char_tot );
970 $bit_diff = $bit_max - $bit_height;
972 $char_heights = seqlogo_calc_char_heights( \%char_hash, $char_tot, $bit_diff );
974 push @logo, $char_heights;
977 return wantarray ? @logo : \@logo;
981 sub seqlogo_calc_bit_height
983 # Martin A. Hansen, January 2007.
985 # calculates the bit height using Shannon's famous
986 # general formula for uncertainty as documentet:
987 # http://www.ccrnp.ncifcrf.gov/~toms/paper/hawaii/latex/node5.html
989 my ( $char_hash, # hashref with chars and frequencies
990 $tot, # total number of chars
995 my ( $char, $freq, $bit_height );
997 foreach $char ( keys %{ $char_hash } )
999 $freq = $char_hash->{ $char } / $tot;
1001 $bit_height += $freq * ( log( $freq ) / log( 2 ) );
1010 sub seqlogo_calc_char_heights
1012 # Martin A. Hansen, January 2007.
1014 # calculates the hight of each char in bits, and sorts
1015 # according to height.
1017 my ( $char_hash, # hashref with chars and frequencies
1018 $tot, # tot number of chars
1019 $bit_diff, # information gained from uncertainties
1022 # returns list of tuples
1024 my ( $char, $freq, $char_height, @char_heights );
1026 foreach $char ( keys %{ $char_hash } )
1028 $freq = $char_hash->{ $char } / $tot;
1030 $char_height = $freq * $bit_diff; # char height in bits
1032 push @char_heights, [ $char, $char_height ];
1035 @char_heights = sort { $a->[ 1 ] <=> $b->[ 1 ] } @char_heights;
1037 return wantarray ? @char_heights : \@char_heights;
1041 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> RESIDUE COLORS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1046 # Martin A. Hansen, October 2005.
1048 # color scheme for proteins as defined in Mview.
1049 # given a char returns the appropriate color.
1050 # The amino acids are colored according to physicochemical properties:
1051 # bright green = hydrophobic; dark green = large hydrophobic;
1052 # bright blue = negative charge; red = positive charge;
1053 # dull blue = small alcohol; purple = polar; yellow = cysteine.
1055 my ( $char, # char to decorate
1060 my ( %hash, $color_set );
1072 A => "bright-green",
1073 V => "bright-green",
1074 I => "bright-green",
1075 L => "bright-green",
1076 M => "bright-green",
1081 G => "bright-green",
1082 P => "bright-green",
1085 "?" => "light-gray",
1086 "~" => "light-gray",
1090 if ( exists $hash{ uc $char } ) {
1091 $color_set = $hash{ uc $char };
1093 $color_set = "black";
1102 # Martin A. Hansen, October 2005.
1104 # color scheme for nucleotides as defined in Mview.
1105 # given a char returns the appropriate color
1106 # according to physical/chemical proterties.
1108 my ( $char, # char to decorate
1113 my ( %hash, $color_set );
1123 if ( exists $hash{ uc $char } ) {
1124 $color_set = $hash{ uc $char };
1126 $color_set = "black";
1135 # Martin A. Hansen, October 2005.
1137 # hash table with color-names and color-hex.
1139 my ( $color, # common color name
1147 "black" => "#000000",
1148 "white" => "#ffffff",
1150 "green" => "#00ff00",
1151 "blue" => "#0000ff",
1152 "cyan" => "#00ffff",
1153 "magenta" => "#ff00ff",
1154 # "yellow" => "#ffff00",
1155 "yellow" => "#ffc800",
1156 "purple" => "#6600cc",
1157 "dull-blue" => "#0099ff",
1158 "dark-green-blue" => "#33cccc",
1159 "medium-green-blue" => "#00ffcc",
1160 "bright-blue" => "#0033ff",
1161 "dark-green" => "#009900",
1162 "bright-green" => "#33cc00",
1163 "orange" => "#ff3333",
1164 "orange-brown" => "#cc6600",
1165 "bright-red" => "#cc0000",
1166 "light-gray" => "#999999",
1167 "dark-gray" => "#666666",
1168 "gray0" => "#ffffff",
1169 "gray1" => "#eeeeee",
1170 "gray2" => "#dddddd",
1171 "gray3" => "#cccccc",
1172 "gray4" => "#bbbbbb",
1173 "gray5" => "#aaaaaa",
1174 "gray6" => "#999999",
1175 "gray7" => "#888888",
1176 "gray8" => "#777777",
1177 "gray9" => "#666666",
1178 "gray10" => "#555555",
1179 "gray11" => "#444444",
1180 "gray12" => "#333333",
1181 "gray13" => "#222222",
1182 "gray14" => "#111111",
1183 "gray15" => "#000000",
1184 "clustal-red" => "#ff1111",
1185 "clustal-blue" => "#1155ff",
1186 "clustal-green" => "#11dd11",
1187 "clustal-cyan" => "#11ffff",
1188 "clustal-yellow" => "#ffff11",
1189 "clustal-orange" => "#ff7f11",
1190 "clustal-pink" => "#ff11ff",
1191 "clustal-purple" => "#6611cc",
1192 "clustal-dull-blue" => "#197fe5",
1193 "clustal-dark-gray" => "#666666",
1194 "clustal-light-gray" => "#999999",
1195 "lin-A" => "#90fe23",
1196 "lin-R" => "#fe5e2d",
1197 "lin-N" => "#2e3d2d",
1198 "lin-D" => "#00903b",
1199 "lin-C" => "#004baa",
1200 "lin-Q" => "#864b00",
1201 "lin-E" => "#3fa201",
1202 "lin-G" => "#10fe68",
1203 "lin-H" => "#b2063b",
1204 "lin-I" => "#04ced9",
1205 "lin-L" => "#4972fe",
1206 "lin-K" => "#c4a100",
1207 "lin-M" => "#2a84dd",
1208 "lin-F" => "#a60ade",
1209 "lin-P" => "#fe61fe",
1210 "lin-S" => "#f7e847",
1211 "lin-T" => "#fefeb3",
1212 "lin-W" => "#4a007f",
1213 "lin-Y" => "#e903a8",
1214 "lin-V" => "#5bfdfd",
1217 if ( exists $hash{ $color } ) {
1218 return $hash{ $color };
1220 print STDERR qq(WARNING: color "$color" not found in palette!\n);
1227 # Martin A. Hansen, October 2005.
1229 # Hash table with contrast colors to be used for frontground
1230 # text on a given background color.
1232 my ( $color, # background color
1246 "magenta" => "white",
1247 "yellow" => "black",
1248 "purple" => "white",
1249 "dull-blue" => "white",
1250 "dark-green-blue" => "white",
1251 "medium-green-blue" => "white",
1252 "bright-blue" => "white",
1253 "dark-green" => "white",
1254 "bright-green" => "black",
1256 "orange-brown" => "",
1257 "bright-red" => "white",
1258 "light-gray" => "black",
1259 "dark-gray" => "white",
1276 "clustal-red" => "black",
1277 "clustal-blue" => "black",
1278 "clustal-green" => "black",
1279 "clustal-cyan" => "black",
1280 "clustal-yellow" => "black",
1281 "clustal-orange" => "black",
1282 "clustal-pink" => "black",
1283 "clustal-purple" => "black",
1284 "clustal-dull-blue" => "black",
1285 "clustal-dark-gray" => "black",
1286 "clustal-light-gray" => "black",
1309 if ( exists $hash{ $color } ) {
1310 return $hash{ $color };
1312 print STDERR qq(WARNING: color "$color" not found in palette!\n);
1319 # Martin A. Hansen, April 2008.
1321 # Packs a sequence word into a binary number.
1323 my ( $word, # Word to be packed
1328 my ( %hash, $bin, $word_size, $pad );
1341 map { $bin .= pack "B3", $hash{ $_ } } split //, $word;
1343 $word_size = length $word;
1345 $pad = ( 3 * $word_size ) / 8;
1349 $pad = ( ( int $pad + 1 ) * 8 ) - 3 * $word_size;
1351 $bin .= pack "B$pad", 0 x $pad;
1360 # Martin A. Hansen, April 2008.
1362 # Unpacks a binary sequence word to ASCII.
1364 my ( $bin, # Binary sequence word
1365 $word_size, # Size of word
1370 my ( %hash, $word );
1383 map { $word .= $hash{ $_ } } unpack "(B3)$word_size", $bin;
1389 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> ADAPTOR LOCATING <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1392 ############################### REDUNDANT ##############################
1394 # these functions have been replaced by index_m and match_m in Common.pm
1398 # Martin A. Hansen & Selene Fernandez, August 2008
1400 # Locates an adaptor sequence in a given sequence
1401 # starting from a given offset position allowing a given
1402 # number of mismatches (no indels).
1404 my ( $adaptor, # list of chars
1405 $tag, # list of chars
1406 $adaptor_len, # length of adaptor sequence
1407 $tag_len, # length of tag sequence
1408 $tag_offset, # offset position
1409 $max_match, # number of matches indicating success
1410 $max_mismatch, # number of mismatches
1413 # Returns an integer.
1415 my ( $i, $j, $match, $mismatch );
1419 while ( $i < $tag_len - ( $max_match + $max_mismatch ) + 1 )
1423 while ( $j < $adaptor_len - ( $max_match + $max_mismatch ) + 1 )
1425 return $i if check_diag( $adaptor, $tag, $adaptor_len, $tag_len, $j, $i, $max_match, $max_mismatch );
1439 # Martin A. Hansen & Selene Fernandez, August 2008
1441 # Checks the diagonal starting at a given coordinate
1442 # of a search space constituted by an adaptor and a tag sequence.
1443 # Residues in the diagonal are compared between the sequences allowing
1444 # for a given number of mismatches. We terminate when search space is
1445 # exhausted or if max matches or mismatches is reached.
1447 my ( $adaptor, # list of chars
1448 $tag, # list of chars
1449 $adaptor_len, # length of adaptor sequence
1450 $tag_len, # length of tag sequence
1451 $adaptor_beg, # adaptor begin coordinate
1452 $tag_beg, # tag begin coordinate
1453 $max_match, # number of matches indicating success
1454 $max_mismatch, # number of mismatches
1459 my ( $match, $mismatch );
1464 while ( $adaptor_beg <= $adaptor_len and $tag_beg <= $tag_len )
1466 if ( $adaptor->[ $adaptor_beg ] eq $tag->[ $tag_beg ] )
1470 return 1 if $match >= $max_match;
1476 return 0 if $mismatch > $max_mismatch;
1487 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<