3 # Copyright (C) 2007 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
28 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
33 use POSIX qw( islower );
37 use List::Util qw( shuffle );
38 use Time::HiRes qw( gettimeofday );
40 use vars qw ( @ISA @EXPORT );
42 @ISA = qw( Exporter );
45 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
50 # Martin A. Hansen, May 2007.
52 # Makes a qualified guess on the type of a given squence.
54 my ( $seq, # sequence to check
59 my ( $check_seq, $count );
61 if ( length $seq > 100 ) {
62 $check_seq = substr $seq, 0, 100;
67 if ( $count = $check_seq =~ tr/FLPQIEflpqie// and $count > 0 ) {
69 } elsif ( $count = $check_seq =~ tr/Uu// and $count > 0 ) {
79 # Martin A. Hansen, July 2007.
81 # Wraps a given string reference accoring to given width.
83 my ( $strref, # ref to sting to wrap
89 ${ $strref } =~ s/(.{$wrap})/$1\n/g;
98 # modified Martin A. Hansen, March 2005.
100 # Returns the reverse complement of a dna sequence with preservation of case
101 # according to this mapping,
103 # AGCUTRYWSMKHDVBNagcutrywsmkhdvbn
104 # TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn
113 $seq =~ tr/AGCUTRYWSMKHDVBNagcutrywsmkhdvbn/TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn/;
122 # modified Martin A. Hansen, March 2005.
124 # Returns the complement of a rna sequence with preservation of case
125 # according to this mapping,
127 # AGCUTRYWSMKHDVBNagcutrywsmkhdvbn
128 # UCGAAYRWSKMDHBVNucgaayrwskmdhbvn
135 $seq =~ tr/AGCUTRYWSMKHDVBNagcutrywsmkhdvbn/UCGAAYRWSKMDHBVNucgaayrwskmdhbvn/;
144 # modified Martin A. Hansen, March 2005.
146 # Returns the reverse complement of a dna sequence with preservation of case
147 # according to this mapping,
149 # AGCUTRYWSMKHDVBNagcutrywsmkhdvbn
150 # TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn
152 my ( $seqref, # seqref
157 ${ $seqref } =~ tr/AGCUTRYWSMKHDVBNagcutrywsmkhdvbn/TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn/;
164 # modified Martin A. Hansen, March 2005.
166 # Returns the complement of a rna sequence with preservation of case
167 # according to this mapping,
169 # AGCUTRYWSMKHDVBNagcutrywsmkhdvbn
170 # UCGAAYRWSKMDHBVNucgaayrwskmdhbvn
172 my ( $seqref, # seqref
177 ${ $seqref } =~ tr/AGCUTRYWSMKHDVBNagcutrywsmkhdvbn/UCGAAYRWSKMDHBVNucgaayrwskmdhbvn/;
183 # Martin A. Hansen, March 2007
185 # Converts DNA sequence to RNA
187 my ( $seq, # nucleotide sequence
200 # Martin A. Hansen, March 2007
202 # Converts RNA sequence to DNA
204 my ( $seq, # nucleotide sequence
217 # Martin A. Hansen, March 2005.
219 # given a string of nucleotides
220 # returns the corresponding ambiguity code
223 $type, # DNA or RNA - DEFAULT DNA
226 my ( %hash, @nts, $key, $code, %nt_hash );
230 if ( not $type or $type =~ /dna/i )
259 @nts = split //, $str;
261 %nt_hash = map { $_ => 1 } @nts;
263 @nts = sort keys %nt_hash;
265 $key = join "", @nts;
293 $code = $hash{ $key };
295 warn qq(WARNING: No ambiguity code for key->$key\n) if not $code;
303 # Martin A. Hansen, March 2005.
305 # given an amino acid, returns a list of corresponding codons
307 my ( $aa, # amino acid to translate
312 my ( %hash, $codons );
317 'F' => [ 'TTT', 'TTC' ], # Phe
318 'L' => [ 'TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG' ], # Leu
319 'S' => [ 'TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC' ], # Ser
320 'Y' => [ 'TAT', 'TAC' ], # Tyr
321 '*' => [ 'TAA', 'TAG', 'TGA' ], # Stop
322 'X' => [ 'TAA', 'TAG', 'TGA' ], # Stop
323 'C' => [ 'TGT', 'TGC' ], # Cys
324 'W' => [ 'TGG' ], # Trp
325 'P' => [ 'CCT', 'CCC', 'CCA', 'CCG' ], # Pro
326 'H' => [ 'CAT', 'CAC' ], # His
327 'Q' => [ 'CAA', 'CAG' ], # Gln
328 'R' => [ 'CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG' ], # Arg
329 'I' => [ 'ATT', 'ATC', 'ATA' ], # Ile
330 'M' => [ 'ATG' ], # Met
331 'T' => [ 'ACT', 'ACC', 'ACA', 'ACG' ], # Thr
332 'N' => [ 'AAT', 'AAC' ], # Asn
333 'K' => [ 'AAA', 'AAG' ], # Lys
334 'V' => [ 'GTT', 'GTC', 'GTA', 'GTG' ], # Val
335 'A' => [ 'GCT', 'GCC', 'GCA', 'GCG' ], # Ala
336 'D' => [ 'GAT', 'GAC' ], # Asp
337 'E' => [ 'GAA', 'GAG' ], # Glu
338 'G' => [ 'GGT', 'GGC', 'GGA', 'GGG' ], # Gly
341 $codons = $hash{ $aa };
343 return wantarray ? @{ $codons } : $codons;
349 # Martin A. Hansen, March 2005.
351 # given a codon, returns the correponding
352 # vertebrate amino acid.
354 my ( $codon, # codon to translate
361 die qq(ERROR: Bad codon: "$codon"\n) if not $codon =~ /[ATCGatcg]{3}/;
430 $aa = $hash{ uc $codon };
438 # Martin A. Hansen, June 2005.
440 # translates a dna sequence to protein according to a optional given
443 my ( $dna, # dna sequence
444 $frame, # frame of translation - OPTIONAL
449 my ( $codon, $pos, $pep );
453 if ( $frame =~ /-?[1-3]/ )
456 $dna = Maasha::Seq::dna_revcomp( $dna );
459 $frame = abs( $frame ) - 1;
461 $dna =~ s/^.{${frame}}//;
465 Maasha::Common::error( qq(Badly formated frame "$frame") );
470 while ( $codon = substr $dna, $pos, 3 )
472 last if not length $codon == 3;
474 $pep .= codon2aa( $codon );
483 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> RNA FOLDING <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
486 sub fold_struct_rnafold
488 # Martin A. Hansen, February 2008.
490 # Given a squence fold this using RNAfold.
492 my ( $seq, # sequence to fold
495 # Returns a tuple of fold string and free energy.
497 my ( $pid, $fh_out, $fh_in, @lines, $struct, $energy );
499 $pid = open2( $fh_out, $fh_in, "RNAfold -noPS" );
501 Maasha::Fasta::put_entry( [ "RNAfold", $seq ], $fh_in );
513 if ( $lines[ - 1 ] =~ /^([^ ]+) \((.+)\)$/ )
519 return wantarray ? ( $struct, $energy ) : [ $struct, $energy ];
523 sub fold_struct_contrastruct
525 # Martin A. Hansen, February 2008.
527 # Given a sequence fold this using Contrafold.
529 my ( $seq, # sequence to fold
530 $tmp_dir, # temporary directory - OPTIONAL
533 # Returns a tuple of fold string and temp index.
535 my ( $tmp_file, $out_file1, $out_file2, $fh, $line, $struct, @AoA, $i, $temp, $index );
537 $tmp_dir ||= $ENV{ 'BP_TMP' };
539 $tmp_file = "$tmp_dir/fold.fna";
540 $out_file1 = "$tmp_dir/fold.out1";
541 $out_file2 = "$tmp_dir/fold.out2";
543 Maasha::Fasta::put_entries( [ [ "fold", $seq ] ], $tmp_file );
545 Maasha::Common::run( "contrafold", "predict --parens $out_file1 --bpseq $out_file2 $tmp_file" );
549 $fh = Maasha::Common::read_open( $out_file1 );
551 while ( $line = <$fh> )
562 $fh = Maasha::Common::read_open( $out_file2 );
564 while ( $line = <$fh> )
568 push @AoA, [ split " ", $line ];
575 for ( $i = 0; $i < @AoA; $i++ )
577 if ( $AoA[ $i ]->[ 2 ] != 0 )
579 last if $AoA[ $i ]->[ 0 ] > $AoA[ $i ]->[ 2 ];
581 $temp += base_pair_melting_temp( $AoA[ $i ]->[ 1 ] . $AoA[ $AoA[ $i ]->[ 2 ] - 1 ]->[ 1 ] );
585 $index = sprintf( "%.2f", $temp / length $seq );
587 return wantarray ? ( $struct, $index ) : [ $struct, $index ];
591 sub base_pair_melting_temp
593 # Martin A. Hansen, February 2008.
595 # Given a basepair, returns the melting temperature.
597 my ( $bp, # basepair string
632 return $melt_hash{ uc $bp };
636 sub generate_dna_oligos
638 # Martin A. Hansen, April 2007.
640 # Generates all possible DNA oligos of a given wordsize.
642 # alternative way: perl -MData::Dumper -e '@CONV = glob( "{A,T,C,G}" x 4 ); print Dumper( \@CONV )'
645 my ( $wordsize, # size of DNA oligos
650 my ( @alph, @oligos, $oligo, $char, @list );
652 @alph = ( qw( A T C G N ) );
655 for ( 1 .. $wordsize )
657 foreach $oligo ( @oligos )
659 foreach $char ( @alph ) {
660 push @list, $oligo . $char;
669 return wantarray ? @oligos : \@oligos;
675 # Martin A. Hansen, April 2007
677 # Given a sequence and a wordsize,
678 # breaks the sequence into overlapping
679 # oligos of that wordsize.
681 my ( $seq, # sequence reference
682 $wordsize, # wordsize
687 my ( $i, $oligo, @oligos );
689 for ( $i = 0; $i < length( ${ $seq } ) - $wordsize + 1; $i++ )
691 $oligo = substr ${ $seq }, $i, $wordsize;
693 push @oligos, $oligo;
696 return wantarray ? @oligos : \@oligos;
702 # Martin A. Hansen, April 2007
704 # Given a sequence and a wordsize,
705 # breaks the sequence into overlapping
706 # oligos of that wordsize and return
709 my ( $seq, # sequence reference
710 $wordsize, # wordsize
715 my ( $i, $oligo, %lookup, @oligos );
717 for ( $i = 0; $i < length( ${ $seq } ) - $wordsize + 1; $i++ )
719 $oligo = substr ${ $seq }, $i, $wordsize;
721 if ( not exists $lookup{ $oligo } )
723 push @oligos, $oligo;
724 $lookup{ $oligo } = 1;
728 return wantarray ? @oligos : \@oligos;
734 # Martin A. Hansen, August 2007.
736 # Given a hashref with oligo=>count, calculates
737 # a frequency table. Returns a list of hashes
739 my ( $oligo_freq, # hashref
742 # Returns data structure
744 my ( @freq_table, $total );
748 map { push @freq_table, { OLIGO => $_, COUNT => $oligo_freq->{ $_ } }; $total += $oligo_freq->{ $_ } } keys %{ $oligo_freq };
750 @freq_table = sort { $b->{ "COUNT" } <=> $a->{ "COUNT" } or $a->{ "OLIGO" } cmp $b->{ "OLIGO" } } @freq_table;
752 map { $_->{ "FREQ" } = sprintf( "%.4f", $_->{ "COUNT" } / $total ) } @freq_table;
754 return wantarray ? return @freq_table : \@freq_table;
760 # Martin A. Hansen, May 2007
762 # Generates a random sequence given a sequence length
765 my ( $len, # sequence length
766 $alph, # sequence alphabet
771 my ( $alph_len, $i, $seq );
773 $alph_len = scalar @{ $alph };
775 for ( $i = 0; $i < $len; $i++ ) {
776 $seq .= $alph->[ int( rand( $alph_len ) ) ];
785 # Martin A. Hansen, June 2009.
787 # Randomly duplicates a given number of residues in a given sequence.
789 my ( $seq, # sequence to mutate
790 $insertions, # number of residues to insert
797 for ( $i = 0; $i < $insertions; $i++ )
799 $pos = int( rand( length $seq ) );
801 substr $seq, $pos, 0, substr $seq , $pos, 1;
810 # Martin A. Hansen, June 2009.
812 # Randomly deletes a given number of residues from a given sequence.
814 my ( $seq, # sequence to mutate
815 $deletions, # number of residues to delete
822 for ( $i = 0; $i < $deletions; $i++ )
824 $pos = int( rand( length $seq ) );
826 substr $seq, $pos, 1, '';
835 # Martin A. Hansen, June 2009.
837 # Introduces a given number of random mutations in a
838 # given sequence of a specified alphabet.
840 my ( $seq, # sequence to mutate
841 $mutations, # number of mutations
842 $alph, # alphabet of sequence
847 my ( $i, $pos, %lookup_hash );
851 while ( $i < $mutations )
853 $pos = int( rand( length $seq ) );
855 if ( not exists $lookup_hash{ $pos } )
857 substr $seq, $pos, 1, res_mutate( substr( $seq , $pos, 1 ), $alph );
859 $lookup_hash{ $pos } = 1;
871 # Martin A. Hansen, June 2009.
873 # Mutates a given residue to another from a given alphabet.
875 my ( $res, # residue to mutate
881 my ( $alph_len, $new );
883 $alph_len = scalar @{ $alph };
886 while ( uc $new eq uc $res ) {
887 $new = $alph->[ int( rand( $alph_len ) ) ];
890 return POSIX::islower( $res ) ? lc $new : uc $new;
896 # Martin A. Hansen, December 2007.
898 # Shuffles sequence of a given string.
900 my ( $seq, # sequence string
907 @list = split "", $seq;
909 return join "", shuffle( @list );
915 # Martin A. Hansen, May 2007.
917 # Returns a requested sequence alphabet.
919 my ( $type, # alphabet type
924 my ( %alph_hash, $alph );
927 DNA => [ qw(A T C G) ],
928 DNA_AMBI => [ qw(A G C U T R Y W S M K H D V B N) ],
929 RNA => [ qw(A U C G) ],
930 RNA_AMBI => [ qw(A G C U T R Y W S M K H D V B N) ],
931 PROTEIN => [ qw(F L S Y C W P H Q R I M T N K V A D E G) ],
934 if ( exists $alph_hash{ uc $type } ) {
935 $alph = $alph_hash{ uc $type };
937 die qq(ERROR: Unknown alphabet type: "$type"\n);
940 return wantarray ? @{ $alph } : $alph;
946 # Martin A. Hansen, August 2007.
948 # Analyses the sequence composition of a given sequence.
950 my ( $seq, # sequence to analyze
955 my ( %analysis, @chars, @chars_lc, $char, %char_hash, $gc, $at, $lc, $max, $res_sum, @indels, %indel_hash );
957 $analysis{ "SEQ_TYPE" } = Maasha::Seq::seq_guess_type( $seq );
958 $analysis{ "SEQ_LEN" } = length $seq;
960 @indels = qw( - ~ . _ );
962 if ( $analysis{ "SEQ_TYPE" } eq "DNA" )
964 @chars = split //, "AGCUTRYWSMKHDVBNagcutrywsmkhdvbn";
965 @chars_lc = split //, "agcutrywsmkhdvbn";
967 elsif ( $analysis{ "SEQ_TYPE" } eq "RNA" )
969 @chars = split //, "AGCUTRYWSMKHDVBNagcutrywsmkhdvbn";
970 @chars_lc = split //, "agcutrywsmkhdvbn";
974 @chars = split //, "FLSYCWPHQRIMTNKVADEGflsycwphqrimtnkvadeg";
975 @chars_lc = split //, "flsycwphqrimtnkvadeg";
978 @char_hash{ @chars } = map { eval "scalar \$seq =~ tr/$_//" } @chars;
979 @indel_hash{ @indels } = map { eval "scalar \$seq =~ tr/$_//" } @indels;
981 if ( $analysis{ "SEQ_TYPE" } =~ /DNA|RNA/ )
983 $gc = $char_hash{ "g" } + $char_hash{ "G" } + $char_hash{ "c" } + $char_hash{ "C" };
984 $at = $char_hash{ "a" } + $char_hash{ "A" } + $char_hash{ "t" } + $char_hash{ "T" } + $char_hash{ "u" } + $char_hash{ "U" };
986 $analysis{ "GC%" } = sprintf( "%.2f", 100 * $gc / $analysis{ "SEQ_LEN" } );
988 map { $lc += $char_hash{ lc $_ } } @chars_lc;
990 $analysis{ "SOFT_MASK%" } = sprintf( "%.2f", 100 * $lc / $analysis{ "SEQ_LEN" } );
991 $analysis{ "HARD_MASK%" } = sprintf( "%.2f", 100 * ( $char_hash{ "n" } + $char_hash{ "N" } ) / $analysis{ "SEQ_LEN" } );
996 foreach $char ( @chars_lc )
1000 $char_hash{ $char } += $char_hash{ lc $char };
1002 $analysis{ "RES[$char]" } = $char_hash{ $char };
1004 $max = $char_hash{ $char } if $char_hash{ $char } > $max;
1006 $analysis{ "RES_SUM" } += $char_hash{ $char };
1009 map { $analysis{ "RES[$_]" } = $indel_hash{ $_ } } @indels;
1011 $analysis{ "MIX_INDEX" } = sprintf( "%.2f", $max / $analysis{ "SEQ_LEN" } );
1012 #$analysis{ "MELT_TEMP" } = sprintf( "%.2f", 4 * $gc + 2 * $at );
1014 return wantarray ? %analysis : \%analysis;
1020 # Martin A. Hansen, May 2008.
1022 # Given a sequence computes a complexity index
1023 # as the most common di-residue over
1024 # the sequence length. Return ~1 if the entire
1025 # sequence is homopolymeric. Above 0.4 indicates
1026 # low complexity sequence.
1028 my ( $seq, # sequence
1033 my ( $len, $i, $max, $di, %hash );
1039 for ( $i = 0; $i < $len - 1; $i++ ) {
1040 $hash{ substr $seq, $i, 2 }++;
1043 foreach $di ( keys %hash ) {
1044 $max = $hash{ $di } if $hash{ $di } > $max;
1051 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SEQLOGO <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1056 # Martin A. Hansen, January 2007.
1058 # given max bit size and a list of aligned entries
1059 # in FASTA format, calculates for each sequence position
1060 # the height of the letters in bits.
1061 # returns a data structure with [ letter, height ] tuples
1062 # for all letters at each position.
1064 my ( $bit_max, # maximum bit height
1065 $entries, # FASTA entries
1068 # returns data structure
1070 my ( $logo_len, $char_tot, $i, %char_hash, $bit_height, $bit_diff, $char_heights, @logo );
1072 $logo_len = length $entries->[ 0 ]->[ 1 ];
1073 $char_tot = scalar @{ $entries };
1075 for ( $i = 0; $i < $logo_len; $i++ )
1079 map { $char_hash{ uc substr( $_->[ 1 ], $i, 1 ) }++ } @{ $entries };
1081 delete $char_hash{ "-" };
1082 delete $char_hash{ "_" };
1083 delete $char_hash{ "~" };
1084 delete $char_hash{ "." };
1086 $bit_height = seqlogo_calc_bit_height( \%char_hash, $char_tot );
1088 $bit_diff = $bit_max - $bit_height;
1090 $char_heights = seqlogo_calc_char_heights( \%char_hash, $char_tot, $bit_diff );
1092 push @logo, $char_heights;
1095 return wantarray ? @logo : \@logo;
1099 sub seqlogo_calc_bit_height
1101 # Martin A. Hansen, January 2007.
1103 # calculates the bit height using Shannon's famous
1104 # general formula for uncertainty as documentet:
1105 # http://www.ccrnp.ncifcrf.gov/~toms/paper/hawaii/latex/node5.html
1107 my ( $char_hash, # hashref with chars and frequencies
1108 $tot, # total number of chars
1113 my ( $char, $freq, $bit_height );
1115 foreach $char ( keys %{ $char_hash } )
1117 $freq = $char_hash->{ $char } / $tot;
1119 $bit_height += $freq * ( log( $freq ) / log( 2 ) );
1128 sub seqlogo_calc_char_heights
1130 # Martin A. Hansen, January 2007.
1132 # calculates the hight of each char in bits, and sorts
1133 # according to height.
1135 my ( $char_hash, # hashref with chars and frequencies
1136 $tot, # tot number of chars
1137 $bit_diff, # information gained from uncertainties
1140 # returns list of tuples
1142 my ( $char, $freq, $char_height, @char_heights );
1144 foreach $char ( keys %{ $char_hash } )
1146 $freq = $char_hash->{ $char } / $tot;
1148 $char_height = $freq * $bit_diff; # char height in bits
1150 push @char_heights, [ $char, $char_height ];
1153 @char_heights = sort { $a->[ 1 ] <=> $b->[ 1 ] } @char_heights;
1155 return wantarray ? @char_heights : \@char_heights;
1159 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> RESIDUE COLORS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1164 # Martin A. Hansen, October 2005.
1166 # color scheme for proteins as defined in Mview.
1167 # given a char returns the appropriate color.
1168 # The amino acids are colored according to physicochemical properties:
1169 # bright green = hydrophobic; dark green = large hydrophobic;
1170 # bright blue = negative charge; red = positive charge;
1171 # dull blue = small alcohol; purple = polar; yellow = cysteine.
1173 my ( $char, # char to decorate
1178 my ( %hash, $color_set );
1190 A => "bright-green",
1191 V => "bright-green",
1192 I => "bright-green",
1193 L => "bright-green",
1194 M => "bright-green",
1199 G => "bright-green",
1200 P => "bright-green",
1203 "?" => "light-gray",
1204 "~" => "light-gray",
1208 if ( exists $hash{ uc $char } ) {
1209 $color_set = $hash{ uc $char };
1211 $color_set = "black";
1220 # Martin A. Hansen, October 2005.
1222 # color scheme for nucleotides as defined in Mview.
1223 # given a char returns the appropriate color
1224 # according to physical/chemical proterties.
1226 my ( $char, # char to decorate
1231 my ( %hash, $color_set );
1241 if ( exists $hash{ uc $char } ) {
1242 $color_set = $hash{ uc $char };
1244 $color_set = "black";
1253 # Martin A. Hansen, October 2005.
1255 # hash table with color-names and color-hex.
1257 my ( $color, # common color name
1265 "black" => "#000000",
1266 "white" => "#ffffff",
1268 "green" => "#00ff00",
1269 "blue" => "#0000ff",
1270 "cyan" => "#00ffff",
1271 "magenta" => "#ff00ff",
1272 # "yellow" => "#ffff00",
1273 "yellow" => "#ffc800",
1274 "purple" => "#6600cc",
1275 "dull-blue" => "#0099ff",
1276 "dark-green-blue" => "#33cccc",
1277 "medium-green-blue" => "#00ffcc",
1278 "bright-blue" => "#0033ff",
1279 "dark-green" => "#009900",
1280 "bright-green" => "#33cc00",
1281 "orange" => "#ff3333",
1282 "orange-brown" => "#cc6600",
1283 "bright-red" => "#cc0000",
1284 "light-gray" => "#999999",
1285 "dark-gray" => "#666666",
1286 "gray0" => "#ffffff",
1287 "gray1" => "#eeeeee",
1288 "gray2" => "#dddddd",
1289 "gray3" => "#cccccc",
1290 "gray4" => "#bbbbbb",
1291 "gray5" => "#aaaaaa",
1292 "gray6" => "#999999",
1293 "gray7" => "#888888",
1294 "gray8" => "#777777",
1295 "gray9" => "#666666",
1296 "gray10" => "#555555",
1297 "gray11" => "#444444",
1298 "gray12" => "#333333",
1299 "gray13" => "#222222",
1300 "gray14" => "#111111",
1301 "gray15" => "#000000",
1302 "clustal-red" => "#ff1111",
1303 "clustal-blue" => "#1155ff",
1304 "clustal-green" => "#11dd11",
1305 "clustal-cyan" => "#11ffff",
1306 "clustal-yellow" => "#ffff11",
1307 "clustal-orange" => "#ff7f11",
1308 "clustal-pink" => "#ff11ff",
1309 "clustal-purple" => "#6611cc",
1310 "clustal-dull-blue" => "#197fe5",
1311 "clustal-dark-gray" => "#666666",
1312 "clustal-light-gray" => "#999999",
1313 "lin-A" => "#90fe23",
1314 "lin-R" => "#fe5e2d",
1315 "lin-N" => "#2e3d2d",
1316 "lin-D" => "#00903b",
1317 "lin-C" => "#004baa",
1318 "lin-Q" => "#864b00",
1319 "lin-E" => "#3fa201",
1320 "lin-G" => "#10fe68",
1321 "lin-H" => "#b2063b",
1322 "lin-I" => "#04ced9",
1323 "lin-L" => "#4972fe",
1324 "lin-K" => "#c4a100",
1325 "lin-M" => "#2a84dd",
1326 "lin-F" => "#a60ade",
1327 "lin-P" => "#fe61fe",
1328 "lin-S" => "#f7e847",
1329 "lin-T" => "#fefeb3",
1330 "lin-W" => "#4a007f",
1331 "lin-Y" => "#e903a8",
1332 "lin-V" => "#5bfdfd",
1335 if ( exists $hash{ $color } ) {
1336 return $hash{ $color };
1338 print STDERR qq(WARNING: color "$color" not found in palette!\n);
1345 # Martin A. Hansen, October 2005.
1347 # Hash table with contrast colors to be used for frontground
1348 # text on a given background color.
1350 my ( $color, # background color
1364 "magenta" => "white",
1365 "yellow" => "black",
1366 "purple" => "white",
1367 "dull-blue" => "white",
1368 "dark-green-blue" => "white",
1369 "medium-green-blue" => "white",
1370 "bright-blue" => "white",
1371 "dark-green" => "white",
1372 "bright-green" => "black",
1374 "orange-brown" => "",
1375 "bright-red" => "white",
1376 "light-gray" => "black",
1377 "dark-gray" => "white",
1394 "clustal-red" => "black",
1395 "clustal-blue" => "black",
1396 "clustal-green" => "black",
1397 "clustal-cyan" => "black",
1398 "clustal-yellow" => "black",
1399 "clustal-orange" => "black",
1400 "clustal-pink" => "black",
1401 "clustal-purple" => "black",
1402 "clustal-dull-blue" => "black",
1403 "clustal-dark-gray" => "black",
1404 "clustal-light-gray" => "black",
1427 if ( exists $hash{ $color } ) {
1428 return $hash{ $color };
1430 print STDERR qq(WARNING: color "$color" not found in palette!\n);
1437 # Martin A. Hansen, April 2008.
1439 # Packs a sequence word into a binary number.
1441 my ( $word, # Word to be packed
1446 my ( %hash, $bin, $word_size, $pad );
1459 map { $bin .= pack "B3", $hash{ $_ } } split //, $word;
1461 $word_size = length $word;
1463 $pad = ( 3 * $word_size ) / 8;
1467 $pad = ( ( int $pad + 1 ) * 8 ) - 3 * $word_size;
1469 $bin .= pack "B$pad", 0 x $pad;
1478 # Martin A. Hansen, April 2008.
1480 # Unpacks a binary sequence word to ASCII.
1482 my ( $bin, # Binary sequence word
1483 $word_size, # Size of word
1488 my ( %hash, $word );
1501 map { $word .= $hash{ $_ } } unpack "(B3)$word_size", $bin;
1507 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> ADAPTOR LOCATING <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1510 ############################### REDUNDANT ##############################
1512 # these functions have been replaced by index_m and match_m in Common.pm
1516 # Martin A. Hansen & Selene Fernandez, August 2008
1518 # Locates an adaptor sequence in a given sequence
1519 # starting from a given offset position allowing a given
1520 # number of mismatches (no indels).
1522 my ( $adaptor, # list of chars
1523 $tag, # list of chars
1524 $adaptor_len, # length of adaptor sequence
1525 $tag_len, # length of tag sequence
1526 $tag_offset, # offset position
1527 $max_match, # number of matches indicating success
1528 $max_mismatch, # number of mismatches
1531 # Returns an integer.
1533 my ( $i, $j, $match, $mismatch );
1537 while ( $i < $tag_len - ( $max_match + $max_mismatch ) + 1 )
1541 while ( $j < $adaptor_len - ( $max_match + $max_mismatch ) + 1 )
1543 return $i if check_diag( $adaptor, $tag, $adaptor_len, $tag_len, $j, $i, $max_match, $max_mismatch );
1557 # Martin A. Hansen & Selene Fernandez, August 2008
1559 # Checks the diagonal starting at a given coordinate
1560 # of a search space constituted by an adaptor and a tag sequence.
1561 # Residues in the diagonal are compared between the sequences allowing
1562 # for a given number of mismatches. We terminate when search space is
1563 # exhausted or if max matches or mismatches is reached.
1565 my ( $adaptor, # list of chars
1566 $tag, # list of chars
1567 $adaptor_len, # length of adaptor sequence
1568 $tag_len, # length of tag sequence
1569 $adaptor_beg, # adaptor begin coordinate
1570 $tag_beg, # tag begin coordinate
1571 $max_match, # number of matches indicating success
1572 $max_mismatch, # number of mismatches
1577 my ( $match, $mismatch );
1582 while ( $adaptor_beg <= $adaptor_len and $tag_beg <= $tag_len )
1584 if ( $adaptor->[ $adaptor_beg ] eq $tag->[ $tag_beg ] )
1588 return 1 if $match >= $max_match;
1594 return 0 if $mismatch > $max_mismatch;
1605 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<