3 # Copyright (C) 2007 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 # Routines to perform and print pairwise and multiple alignments
28 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
39 use vars qw ( @ISA @EXPORT );
46 @ISA = qw( Exporter );
49 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
54 # Martin A. Hansen, August 2007.
56 # Aligns a given list of FASTA entries and returns a
57 # list of aligned sequences as FASTA entries.
58 # (currently uses Muscle, but other align engines can
59 # be used with a bit of tweaking).
61 my ( $entries, # Fasta entries
62 $args, # additional alignment program specific arguments - OPTIONAL
67 my ( @aligned_entries, $muscle_args );
69 $muscle_args = "-quiet";
70 $muscle_args .= $args if $args;
72 @aligned_entries = align_muscle( $entries, $muscle_args );
74 return wantarray ? @aligned_entries : \@aligned_entries;
80 # Martin A. Hansen, June 2007.
82 # Aligns a given list of FASTA entries using Muscle.
83 # Returns a list of aligned sequences as FASTA entries.
85 my ( $entries, # FASTA entries
86 $args, # additional Muscle arguments - OPTIONAL
91 my ( $pid, $fh_in, $fh_out, $cmd, $entry, @aligned_entries );
94 $cmd .= " " . $args if $args;
96 $pid = open2( $fh_out, $fh_in, $cmd );
98 map { Maasha::Fasta::put_entry( $_, $fh_in ) } @{ $entries };
102 while ( $entry = Maasha::Fasta::get_entry( $fh_out ) ) {
103 push @aligned_entries, $entry;
110 return wantarray ? @aligned_entries : \@aligned_entries;
114 sub align_print_pairwise
116 # Martin A. Hansen, June 2007.
118 # Prints a given pairwise alignment in FASTA format.
120 my ( $entry1, # first entry
121 $entry2, # second entry
122 $fh, # output filehandle - OPTIONAL
123 $wrap, # wrap width - OPTIONAL
128 my ( @entries, $ruler1, $ruler2, $pins );
130 $ruler1 = align_ruler( $entry1, 1 );
131 $ruler2 = align_ruler( $entry2, 1 );
132 $pins = align_pins( $entry1, $entry2 );
134 push @entries, $ruler1, $entry1, $pins, $entry2, $ruler2;
136 align_print( \@entries, $fh, $wrap );
140 sub align_print_multi
142 # Martin A. Hansen, June 2007.
144 # Prints a given multiple alignment in FASTA format.
146 my ( $entries, # list of aligned FASTA entries
147 $fh, # output filehandle - OPTIONAL
148 $wrap, # wrap width - OPTIONAL
149 $no_ruler, # omit ruler flag - OPTIONAL
150 $no_cons, # omit consensus flag - OPTIONAL
155 my ( @entries, $ruler, $consensus );
157 $ruler = align_ruler( $entries->[ 0 ] );
158 $consensus = align_consensus( $entries ) if not $no_cons;
160 unshift @{ $entries }, $ruler if not $no_ruler;
161 push @{ $entries }, $consensus if not $no_cons;
163 align_print( $entries, $fh, $wrap );
169 # Martin A. Hansen, June 2007.
171 # Prints an alignment.
173 my ( $entries, # Alignment as FASTA entries
174 $fh, # output filehandle - OPTIONAL
175 $wrap, # wrap alignment - OPTIONAL
180 my ( $max, $blocks, $block, $entry );
184 map { $max = length $_->[ SEQ_NAME ] if length $_->[ SEQ_NAME ] > $max } @{ $entries };
186 $blocks = align_wrap( $entries, $wrap );
188 foreach $block ( @{ $blocks } )
190 foreach $entry ( @{ $block } )
192 $entry->[ SEQ_NAME ] =~ s/stats|ruler|consensus//;
195 print $fh $entry->[ SEQ_NAME ], " " x ( $max + 3 - length $entry->[ SEQ_NAME ] ), $entry->[ SEQ ], "\n";
197 print $entry->[ SEQ_NAME ], " " x ( $max + 3 - length $entry->[ SEQ_NAME ] ), $entry->[ SEQ ], "\n";
206 # Martin A. Hansen, October 2005.
208 # Given a set of fasta entries wraps these
209 # according to a given width.
211 my ( $entries, # list of fasta_entries
212 $wrap, # wrap width - OPTIONAL
217 my ( $ruler, $i, $c, @lines, @blocks );
223 while ( $i < length $entries->[ 0 ]->[ SEQ ] )
227 for ( $c = 0; $c < @{ $entries }; $c++ )
229 if ( $entries->[ $c ]->[ SEQ_NAME ] eq "ruler" )
231 $ruler = substr $entries->[ $c ]->[ SEQ ], $i, $wrap;
233 if ( $ruler =~ /^(\d+)/ ) {
234 $ruler =~ s/^($1)/' 'x(length $1)/e;
237 if ( $ruler =~ /(\d+)$/ ) {
238 $ruler =~ s/($1)$/' 'x(length $1)/e;
241 push @lines, [ "ruler", $ruler ];
245 push @lines, [ $entries->[ $c ]->[ SEQ_NAME ], substr $entries->[ $c ]->[ SEQ ], $i, $wrap ];
249 push @blocks, [ @lines ];
254 return wantarray ? @blocks: \@blocks;
260 # Martin A. Hansen, June 2007.
262 # Given two aligned FASTA entries, generates an entry with pins.
264 my ( $entry1, # first entry
265 $entry2, # second entry
266 $type, # residue type - OPTIONAL
271 my ( $blosum, $i, $char1, $char2, $pins );
273 $type ||= Maasha::Seq::seq_guess_type( $entry1->[ SEQ ] );
275 $blosum = blosum_read() if $type =~ "PROTEIN";
277 for ( $i = 0; $i < length $entry1->[ SEQ ]; $i++ )
279 $char1 = uc substr $entry1->[ SEQ ], $i, 1;
280 $char2 = uc substr $entry2->[ SEQ ], $i, 1;
282 if ( $blosum and $char1 eq $char2 ) {
284 } elsif ( $char1 eq $char2 ) {
286 } elsif ( $blosum and defined $blosum->{ $char1 }->{ $char2 } and $blosum->{ $char1 }->{ $char2 } > 0 ) {
293 return wantarray ? ( "consensus", $pins ) : [ "consensus", $pins ];
299 # Martin A. Hansen, February 2007;
301 # Gererates a ruler for a given FASTA entry (with indels).
303 my ( $entry, # FASTA entry
304 $count_gaps, # flag for counting indels in pairwise alignments.
309 my ( $i, $char, $skip, $count, $gap, $tics );
315 while ( $i <= length $entry->[ SEQ ] )
317 $char = substr( $entry->[ SEQ ], $i - 1, 1 ) if $count_gaps;
319 $gap++ if $char eq "-";
328 $count = 1 if $char eq "-";
330 if ( $count % 100 == 0 )
332 if ( $count + length( $count ) >= length $entry->[ SEQ ] )
338 $tics .= "|" . $count;
339 $skip = length $count;
342 elsif ( $count % 50 == 0 ) {
344 } elsif ( $count % 10 == 0 ) {
354 return wantarray ? ( "ruler", $tics ) : [ "ruler", $tics ];
360 # Martin A. Hansen, June 2006.
362 # Given an alignment as a list of FASTA entries,
363 # generates a consensus sequences based on the
364 # entropies for each column similar to the way
365 # a sequence logo i calculated. Returns the
366 # consensus sequence as a FASTA entry.
368 my ( $entries, # list of aligned FASTA entries
369 $type, # residue type - OPTIONAL
370 $min_sim, # minimum similarity - OPTIONAL
375 my ( $bit_max, $data, $pos, $char, $score, $entry );
377 $type ||= Maasha::Seq::seq_guess_type( $entries->[ 0 ]->[ SEQ ] );
380 if ( $type =~ /protein/ ) {
386 $data = Maasha::Seq::seqlogo_calc( $bit_max, $entries );
388 foreach $pos ( @{ $data } )
392 ( $char, $score ) = @{ $pos->[ -1 ] };
394 if ( ( $score / $bit_max ) * 100 >= $min_sim ) {
395 $entry->[ SEQ ] .= $char;
397 $entry->[ SEQ ] .= "-";
402 $entry->[ SEQ ] .= "-";
406 $entry->[ SEQ_NAME ] = "Consensus: $min_sim%";
408 return wantarray ? @{ $entry } : $entry;
414 # Martin A. Hansen, June 2007.
416 # Calculate the global similarity of two aligned entries
417 # The similarity is calculated as the number of matching
418 # residues divided by the length of the shortest sequence.
420 my ( $entry1, # first aligned entry
421 $entry2, # second aligned entry
426 my ( $seq1, $seq2, $len1, $len2, $i, $match_tot, $min, $sim );
428 $seq1 = $entry1->[ SEQ ];
429 $seq2 = $entry2->[ SEQ ];
439 $len1 = length $seq1;
440 $len2 = length $seq2;
442 return 0 if $len1 == 0 or $len2 == 0;
446 for ( $i = 0; $i < $len1; $i++ ) {
447 $match_tot++ if substr( $entry1->[ SEQ ], $i, 1 ) eq substr( $entry2->[ SEQ ], $i, 1 );
450 $min = Maasha::Calc::min( $len1, $len2 );
452 $sim = sprintf( "%.2f", ( $match_tot / $min ) * 100 );
460 # Martin A. Hansen, February 2008.
462 # Tile a list of query sequences agains a reference sequence,
463 # using pairwise alignments. The result is returned as a list of
464 # aligned FASTA entries.
466 my ( $ref_entry, # reference entry as [ SEQ_NAME, SEQ ] tuple
467 $q_entries, # list of [ SEQ_NAME, SEQ ] tuples
468 $args, # argument hash
473 my ( $entry, $seq1, $seq2, $type, $align1, $align2, $sim1, $sim2, $gaps, @entries );
475 $args->{ "identity" } ||= 70;
477 foreach $entry ( @{ $q_entries } )
479 $seq1 = $entry->[ SEQ ];
481 $type = Maasha::Seq::seq_guess_type( $seq1 );
483 if ( $type eq "RNA" ) {
484 $seq2 = Maasha::Seq::rna_revcomp( $seq1 );
485 } elsif ( $type eq "DNA" ) {
486 $seq2 = Maasha::Seq::dna_revcomp( $seq1 );
488 Maasha::Common::error( qq(Bad sequence type->$type) );
491 $align1 = Maasha::Align::align_muscle( [ $ref_entry, [ $entry->[ SEQ_NAME ] . "_+", $seq1 ] ], "-quiet -maxiters 1" );
492 $align2 = Maasha::Align::align_muscle( [ $ref_entry, [ $entry->[ SEQ_NAME ] . "_-", $seq2 ] ], "-quiet -maxiters 1" );
494 if ( $args->{ "supress_indels" } )
496 align_supress_indels( $align1 );
497 align_supress_indels( $align2 );
500 $sim1 = Maasha::Align::align_sim_global( $align1->[ 0 ], $align1->[ 1 ] );
501 $sim2 = Maasha::Align::align_sim_global( $align2->[ 0 ], $align2->[ 1 ] );
503 if ( $sim1 < $args->{ "identity" } and $sim2 < $args->{ "identity" } )
507 elsif ( $sim1 > $sim2 )
509 $gaps = $align1->[ 0 ]->[ SEQ ] =~ tr/-//;
511 $align1->[ 1 ]->[ SEQ ] =~ s/-{$gaps}$// if $gaps;
513 $entry->[ SEQ_NAME ] = "$align1->[ 1 ]->[ SEQ_NAME ]_$sim1";
514 $entry->[ SEQ ] = $align1->[ 1 ]->[ SEQ ];
516 push @entries, $entry;
520 $gaps = $align2->[ 0 ]->[ SEQ ] =~ tr/-//;
522 $align2->[ 1 ]->[ SEQ ] =~ s/-{$gaps}$// if $gaps;
524 $entry->[ SEQ_NAME ] = "$align2->[ 1 ]->[ SEQ_NAME ]_$sim2";
525 $entry->[ SEQ ] = $align2->[ 1 ]->[ SEQ ];
527 push @entries, $entry;
531 @entries = sort { $b->[ SEQ ] cmp $a->[ SEQ ] } @entries;
533 unshift @entries, $ref_entry;
535 return wantarray ? @entries : \@entries;
539 sub align_supress_indels
541 # Martin A. Hansen, June 2008.
543 # Given a pairwise alignment, removes
544 # indels in the first sequence AND corresponding
545 # sequence in the second.
547 my ( $align, # pairwise alignment
552 my ( $count, $seq, $i );
554 $count = $align->[ 0 ]->[ SEQ ] =~ tr/-//;
558 for ( $i = 0; $i < length $align->[ 0 ]->[ SEQ ]; $i++ )
560 if ( substr( $align->[ 0 ]->[ SEQ ], $i, 1 ) ne '-' ) {
561 $seq .= substr( $align->[ 1 ]->[ SEQ ], $i, 1 );
566 $align->[ 0 ]->[ SEQ ] =~ tr/-//d;
567 $align->[ 1 ]->[ SEQ ] = $seq;
574 # Martin A. Hansen, February 2008.
576 # Invert an alignment in such a way that only
577 # residues differing from the first sequence (the reference sequence)
578 # are shown. The matching sequence can either be lowercased (soft) or replaced
581 my ( $entries, # list of FASTA entries.
587 my ( $i, $c, $char1, $char2 );
589 map { $_->[ SEQ ] =~ tr/-/_/ } @{ $entries };
591 for ( $i = 0; $i < length $entries->[ 0 ]->[ SEQ ]; $i++ )
593 $char1 = uc substr $entries->[ 0 ]->[ SEQ ], $i, 1;
595 for ( $c = 1; $c < @{ $entries }; $c++ )
597 $char2 = uc substr $entries->[ $c ]->[ SEQ ], $i, 1;
599 if ( $char1 eq $char2 )
602 substr $entries->[ $c ]->[ SEQ ], $i, 1, lc $char2;
604 substr $entries->[ $c ]->[ SEQ ], $i, 1, "-";
614 # Martin A. Hansen, January 2006.
616 # this routine parses the BLOSUM62 matrix,
617 # which is located in the __DATA__ section
621 my ( @lines, @chars, $i, $c, @list, $HoH );
624 @lines = grep { $_ !~ /^$|^#/ } @lines;
626 @chars = split /\s+/, $lines[ 0 ];
630 while( $lines[ $i ] )
632 last if $lines[ $i ] =~ /^__END__/;
634 @list = split /\s+/, $lines[ $i ];
636 for ( $c = 1; $c < @list; $c++ ) {
637 $HoH->{ $list[ 0 ] }->{ $chars[ $c ] } = $list[ $c ];
643 return wantarray ? %{ $HoH } : $HoH;
647 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
653 # Matrix made by matblas from blosum62.iij
654 # * column uses minimum score
655 # BLOSUM Clustered Scoring Matrix in 1/2 Bit Units
656 # Blocks Database = /data/blocks_5.0/blocks.dat
657 # Cluster Percentage: >= 62
658 # Entropy = 0.6979, Expected = -0.5209
659 A R N D C Q E G H I L K M F P S T W Y V B Z X *
660 A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0 -4
661 R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1 -4
662 N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 3 0 -1 -4
663 D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 1 -1 -4
664 C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4
665 Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 3 -1 -4
666 E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4
667 G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -2 -1 -4
668 H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 0 -1 -4
669 I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 -3 -1 -4
670 L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 -3 -1 -4
671 K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 1 -1 -4
672 M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 -1 -1 -4
673 F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 -3 -1 -4
674 P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -1 -2 -4
675 S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 0 0 -4
676 T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 0 -4
677 W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -3 -2 -4
678 Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -2 -1 -4
679 V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 -2 -1 -4
680 B -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1 -4
681 Z -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4
682 X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1 -1 -4
683 * -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1