3 # Copyright (C) 2007 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 # Routines to perform and print pairwise and multiple alignments
28 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
38 use vars qw ( @ISA @EXPORT );
45 @ISA = qw( Exporter );
48 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
53 # Martin A. Hansen, August 2007.
55 # Aligns a given list of FASTA entries and returns a
56 # list of aligned sequences as FASTA entries.
57 # (currently uses Muscle, but other align engines can
58 # be used with a bit of tweaking).
60 my ( $entries, # Fasta entries
61 $args, # additional alignment program specific arguments - OPTIONAL
66 my ( @aligned_entries, $muscle_args );
68 $muscle_args = "-quiet";
69 $muscle_args .= $args if $args;
71 @aligned_entries = align_muscle( $entries, $muscle_args );
73 return wantarray ? @aligned_entries : \@aligned_entries;
79 # Martin A. Hansen, June 2007.
81 # Aligns a given list of FASTA entries using Muscle.
82 # Returns a list of aligned sequences as FASTA entries.
84 my ( $entries, # FASTA entries
85 $args, # additional Muscle arguments - OPTIONAL
90 my ( $pid, $fh_in, $fh_out, $cmd, $entry, @aligned_entries );
93 $cmd .= " " . $args if $args;
95 $pid = open2( $fh_out, $fh_in, $cmd );
97 map { Maasha::Fasta::put_entry( $_, $fh_in ) } @{ $entries };
101 while ( $entry = Maasha::Fasta::get_entry( $fh_out ) ) {
102 push @aligned_entries, $entry;
109 return wantarray ? @aligned_entries : \@aligned_entries;
113 sub align_print_pairwise
115 # Martin A. Hansen, June 2007.
117 # Prints a given pairwise alignment in FASTA format.
119 my ( $entry1, # first entry
120 $entry2, # second entry
121 $fh, # output filehandle - OPTIONAL
122 $wrap, # wrap width - OPTIONAL
127 my ( @entries, $ruler1, $ruler2, $pins );
129 $ruler1 = align_ruler( $entry1, 1 );
130 $ruler2 = align_ruler( $entry2, 1 );
131 $pins = align_pins( $entry1, $entry2 );
133 push @entries, $ruler1, $entry1, $pins, $entry2, $ruler2;
135 align_print( \@entries, $fh, $wrap );
139 sub align_print_multi
141 # Martin A. Hansen, June 2007.
143 # Prints a given multiple alignment in FASTA format.
145 my ( $entries, # list of aligned FASTA entries
146 $fh, # output filehandle - OPTIONAL
147 $wrap, # wrap width - OPTIONAL
148 $no_ruler, # omit ruler flag - OPTIONAL
149 $no_cons, # omit consensus flag - OPTIONAL
154 my ( @entries, $ruler, $consensus );
156 $ruler = align_ruler( $entries->[ 0 ] );
157 $consensus = align_consensus( $entries ) if not $no_cons;
159 unshift @{ $entries }, $ruler if not $no_ruler;
160 push @{ $entries }, $consensus if not $no_cons;
162 align_print( $entries, $fh, $wrap );
168 # Martin A. Hansen, June 2007.
170 # Prints an alignment.
172 my ( $entries, # Alignment as FASTA entries
173 $fh, # output filehandle - OPTIONAL
174 $wrap, # wrap alignment - OPTIONAL
179 my ( $max, $blocks, $block, $entry );
183 map { $max = length $_->[ SEQ_NAME ] if length $_->[ SEQ_NAME ] > $max } @{ $entries };
185 $blocks = align_wrap( $entries, $wrap );
187 foreach $block ( @{ $blocks } )
189 foreach $entry ( @{ $block } )
191 $entry->[ SEQ_NAME ] =~ s/stats|ruler|consensus//;
194 print $fh $entry->[ SEQ_NAME ], " " x ( $max + 3 - length $entry->[ SEQ_NAME ] ), $entry->[ SEQ ], "\n";
196 print $entry->[ SEQ_NAME ], " " x ( $max + 3 - length $entry->[ SEQ_NAME ] ), $entry->[ SEQ ], "\n";
205 # Martin A. Hansen, October 2005.
207 # Given a set of fasta entries wraps these
208 # according to a given width.
210 my ( $entries, # list of fasta_entries
211 $wrap, # wrap width - OPTIONAL
216 my ( $ruler, $i, $c, @lines, @blocks );
222 while ( $i < length $entries->[ 0 ]->[ SEQ ] )
226 for ( $c = 0; $c < @{ $entries }; $c++ )
228 if ( $entries->[ $c ]->[ SEQ_NAME ] eq "ruler" )
230 $ruler = substr $entries->[ $c ]->[ SEQ ], $i, $wrap;
232 if ( $ruler =~ /^(\d+)/ ) {
233 $ruler =~ s/^($1)/' 'x(length $1)/e;
236 if ( $ruler =~ /(\d+)$/ ) {
237 $ruler =~ s/($1)$/' 'x(length $1)/e;
240 push @lines, [ "ruler", $ruler ];
244 push @lines, [ $entries->[ $c ]->[ SEQ_NAME ], substr $entries->[ $c ]->[ SEQ ], $i, $wrap ];
248 push @blocks, [ @lines ];
253 return wantarray ? @blocks: \@blocks;
259 # Martin A. Hansen, June 2007.
261 # Given two aligned FASTA entries, generates an entry with pins.
263 my ( $entry1, # first entry
264 $entry2, # second entry
265 $type, # residue type - OPTIONAL
270 my ( $blosum, $i, $char1, $char2, $pins );
272 $type ||= Maasha::Seq::seq_guess_type( $entry1->[ SEQ ] );
274 $blosum = blosum_read() if $type =~ /protein/;
276 for ( $i = 0; $i < length $entry1->[ SEQ ]; $i++ )
278 $char1 = substr $entry1->[ SEQ ], $i, 1;
279 $char2 = substr $entry2->[ SEQ ], $i, 1;
281 if ( $blosum and $char1 eq $char2 ) {
283 } elsif ( uc $char1 eq uc $char2 ) {
285 } elsif ( $blosum and $blosum->{ $char1 }->{ $char2 } > 0 ) {
292 return wantarray ? ( "consensus", $pins ) : [ "consensus", $pins ];
298 # Martin A. Hansen, February 2007;
300 # Gererates a ruler for a given FASTA entry (with indels).
302 my ( $entry, # FASTA entry
303 $count_gaps, # flag for counting indels in pairwise alignments.
308 my ( $i, $char, $skip, $count, $gap, $tics );
314 while ( $i <= length $entry->[ SEQ ] )
316 $char = substr( $entry->[ SEQ ], $i - 1, 1 ) if $count_gaps;
318 $gap++ if $char eq "-";
327 $count = 1 if $char eq "-";
329 if ( $count % 100 == 0 )
331 if ( $count + length( $count ) >= length $entry->[ SEQ ] )
337 $tics .= "|" . $count;
338 $skip = length $count;
341 elsif ( $count % 50 == 0 ) {
343 } elsif ( $count % 10 == 0 ) {
353 return wantarray ? ( "ruler", $tics ) : [ "ruler", $tics ];
359 # Martin A. Hansen, June 2006.
361 # Given an alignment as a list of FASTA entries,
362 # generates a consensus sequences based on the
363 # entropies for each column similar to the way
364 # a sequence logo i calculated. Returns the
365 # consensus sequence as a FASTA entry.
367 my ( $entries, # list of aligned FASTA entries
368 $type, # residue type - OPTIONAL
369 $min_sim, # minimum similarity - OPTIONAL
374 my ( $bit_max, $data, $pos, $char, $score, $entry );
376 $type ||= Maasha::Seq::seq_guess_type( $entries->[ 0 ]->[ SEQ ] );
379 if ( $type =~ /protein/ ) {
385 $data = Maasha::Seq::seqlogo_calc( $bit_max, $entries );
387 foreach $pos ( @{ $data } )
391 ( $char, $score ) = @{ $pos->[ -1 ] };
393 if ( ( $score / $bit_max ) * 100 >= $min_sim ) {
394 $entry->[ SEQ ] .= $char;
396 $entry->[ SEQ ] .= "-";
401 $entry->[ SEQ ] .= "-";
405 $entry->[ SEQ_NAME ] = "Consensus: $min_sim%";
407 return wantarray ? @{ $entry } : $entry;
413 # Martin A. Hansen, June 2007.
415 # Calculate the global similarity of two aligned entries
416 # The similarity is calculated as the number of matching
417 # residues divided by the length of the shortest sequence.
419 my ( $entry1, # first aligned entry
420 $entry2, # second aligned entry
425 my ( $seq1, $seq2, $len1, $len2, $i, $match_tot, $min, $sim );
427 $seq1 = $entry1->[ SEQ ];
428 $seq2 = $entry2->[ SEQ ];
438 $len1 = length $seq1;
439 $len2 = length $seq2;
441 return 0 if $len1 == 0 or $len2 == 0;
445 for ( $i = 0; $i < $len1; $i++ ) {
446 $match_tot++ if substr( $entry1->[ SEQ ], $i, 1 ) eq substr( $entry2->[ SEQ ], $i, 1 );
449 $min = Maasha::Calc::min( $len1, $len2 );
451 $sim = sprintf( "%.2f", ( $match_tot / $min ) * 100 );
459 # Martin A. Hansen, February 2008.
461 # Tile a list of query sequences agains a reference sequence,
462 # using pairwise alignments. The result is returned as a list of
463 # aligned FASTA entries.
465 my ( $ref_entry, # reference entry as [ SEQ_NAME, SEQ ] tuple
466 $q_entries, # list of [ SEQ_NAME, SEQ ] tuples
467 $args, # argument hash
472 my ( $entry, $seq1, $seq2, $type, $align1, $align2, $sim1, $sim2, $gaps, @entries );
474 $args->{ "identity" } ||= 70;
476 foreach $entry ( @{ $q_entries } )
478 $seq1 = $entry->[ SEQ ];
480 $type = Maasha::Seq::seq_guess_type( $seq1 );
482 if ( $type eq "rna" ) {
483 $seq2 = Maasha::Seq::rna_revcomp( $seq1 );
484 } elsif ( $type eq "dna" ) {
485 $seq2 = Maasha::Seq::dna_revcomp( $seq1 );
487 Maasha::Common::error( qq(Bad sequence type->$type) );
490 $align1 = Maasha::Align::align_muscle( [ $ref_entry, [ $entry->[ SEQ_NAME ] . "_+", $seq1 ] ], "-quiet -maxiters 1" );
491 $align2 = Maasha::Align::align_muscle( [ $ref_entry, [ $entry->[ SEQ_NAME ] . "_-", $seq2 ] ], "-quiet -maxiters 1" );
493 if ( $args->{ "supress_indels" } )
495 align_supress_indels( $align1 );
496 align_supress_indels( $align2 );
499 $sim1 = Maasha::Align::align_sim_global( $align1->[ 0 ], $align1->[ 1 ] );
500 $sim2 = Maasha::Align::align_sim_global( $align2->[ 0 ], $align2->[ 1 ] );
502 if ( $sim1 < $args->{ "identity" } and $sim2 < $args->{ "identity" } )
506 elsif ( $sim1 > $sim2 )
508 $gaps = $align1->[ 0 ]->[ SEQ ] =~ tr/-//;
510 $align1->[ 1 ]->[ SEQ ] =~ s/-{$gaps}$// if $gaps;
512 $entry->[ SEQ_NAME ] = "$align1->[ 1 ]->[ SEQ_NAME ]_$sim1";
513 $entry->[ SEQ ] = $align1->[ 1 ]->[ SEQ ];
515 push @entries, $entry;
519 $gaps = $align2->[ 0 ]->[ SEQ ] =~ tr/-//;
521 $align2->[ 1 ]->[ SEQ ] =~ s/-{$gaps}$// if $gaps;
523 $entry->[ SEQ_NAME ] = "$align2->[ 1 ]->[ SEQ_NAME ]_$sim2";
524 $entry->[ SEQ ] = $align2->[ 1 ]->[ SEQ ];
526 push @entries, $entry;
530 @entries = sort { $b->[ SEQ ] cmp $a->[ SEQ ] } @entries;
532 unshift @entries, $ref_entry;
534 return wantarray ? @entries : \@entries;
538 sub align_supress_indels
540 # Martin A. Hansen, June 2008.
542 # Given a pairwise alignment, removes
543 # indels in the first sequence AND corresponding
544 # sequence in the second.
546 my ( $align, # pairwise alignment
551 my ( $count, $seq, $i );
553 $count = $align->[ 0 ]->[ SEQ ] =~ tr/-//;
557 for ( $i = 0; $i < length $align->[ 0 ]->[ SEQ ]; $i++ )
559 if ( substr( $align->[ 0 ]->[ SEQ ], $i, 1 ) ne '-' ) {
560 $seq .= substr( $align->[ 1 ]->[ SEQ ], $i, 1 );
565 $align->[ 0 ]->[ SEQ ] =~ tr/-//d;
566 $align->[ 1 ]->[ SEQ ] = $seq;
573 # Martin A. Hansen, February 2008.
575 # Invert an alignment in such a way that only
576 # residues differing from the first sequence (the reference sequence)
577 # are shown. The matching sequence can either be lowercased (soft) or replaced
580 my ( $entries, # list of FASTA entries.
586 my ( $i, $c, $char1, $char2 );
588 map { $_->[ SEQ ] =~ tr/-/_/ } @{ $entries };
590 for ( $i = 0; $i < length $entries->[ 0 ]->[ SEQ ]; $i++ )
592 $char1 = uc substr $entries->[ 0 ]->[ SEQ ], $i, 1;
594 for ( $c = 1; $c < @{ $entries }; $c++ )
596 $char2 = uc substr $entries->[ $c ]->[ SEQ ], $i, 1;
598 if ( $char1 eq $char2 )
601 substr $entries->[ $c ]->[ SEQ ], $i, 1, lc $char2;
603 substr $entries->[ $c ]->[ SEQ ], $i, 1, "-";
613 # Martin A. Hansen, January 2006.
615 # this routine parses the BLOSUM62 matrix,
616 # which is located in the __DATA__ section
620 my ( @lines, @chars, $i, $c, @list, $HoH );
623 @lines = grep { $_ !~ /^$|^#/ } @lines;
625 @chars = split /\s+/, $lines[ 0 ];
629 while( $lines[ $i ] )
631 last if $lines[ $i ] =~ /^__END__/;
633 @list = split /\s+/, $lines[ $i ];
635 for ( $c = 1; $c < @list; $c++ ) {
636 $HoH->{ $list[ 0 ] }->{ $chars[ $c ] } = $list[ $c ];
642 return wantarray ? %{ $HoH } : $HoH;
646 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
652 # Matrix made by matblas from blosum62.iij
653 # * column uses minimum score
654 # BLOSUM Clustered Scoring Matrix in 1/2 Bit Units
655 # Blocks Database = /data/blocks_5.0/blocks.dat
656 # Cluster Percentage: >= 62
657 # Entropy = 0.6979, Expected = -0.5209
658 A R N D C Q E G H I L K M F P S T W Y V B Z X *
659 A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0 -4
660 R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1 -4
661 N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 3 0 -1 -4
662 D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 1 -1 -4
663 C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4
664 Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 3 -1 -4
665 E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4
666 G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -2 -1 -4
667 H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 0 -1 -4
668 I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 -3 -1 -4
669 L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 -3 -1 -4
670 K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 1 -1 -4
671 M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 -1 -1 -4
672 F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 -3 -1 -4
673 P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -1 -2 -4
674 S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 0 0 -4
675 T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 0 -4
676 W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -3 -2 -4
677 Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -2 -1 -4
678 V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 -2 -1 -4
679 B -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1 -4
680 Z -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4
681 X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1 -1 -4
682 * -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1