3 # Copyright (C) 2009 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 # Routines to handle SAM and BAM format.
26 # http://samtools.sourceforge.net/
29 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
38 use vars qw( @ISA @EXPORT );
40 @ISA = qw( Exporter );
57 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
62 # Martin A. Hansen, September 2009.
64 # Parses a SAM entry from a given file handle.
66 my ( $fh, # file handle
71 my ( $line, @fields );
73 while ( $line = <$fh> )
77 next if substr( $line, 0, 1 ) eq '@';
79 @fields = split "\t", $line;
81 return wantarray ? @fields : \@fields;
96 # Martin A. Hansen, September 2009.
98 # Converts a SAM entry to a Biopiece record.
100 my ( $entry, # SAM entry
105 my ( $record, $i, $tag, $vtype, $value );
107 return if $entry->[ RNAME ] eq '*';
109 $record->{ 'REC_TYPE' } = 'SAM';
110 $record->{ 'Q_ID' } = $entry->[ QNAME ];
111 $record->{ 'FLAG' } = $entry->[ FLAG ];
112 $record->{ 'S_ID' } = $entry->[ RNAME ];
113 $record->{ 'S_BEG' } = $entry->[ POS ];
114 $record->{ 'MAPQ' } = $entry->[ MAPQ ];
115 $record->{ 'CIGAR' } = $entry->[ CIGAR ];
116 $record->{ 'MRNM' } = $entry->[ MRNM ];
117 $record->{ 'S_BEG2' } = $entry->[ MPOS ];
118 $record->{ 'ISIZE' } = $entry->[ ISIZE ];
119 $record->{ 'SEQ' } = $entry->[ SEQ ];
120 $record->{ 'SCORES' } = $entry->[ QUAL ];
121 $record->{ 'SCORES' } =~ s/(.)/chr( ( ord( $1 ) - 33 ) + 64 )/ge; # convert phred-33 to phred-64 scores
122 $record->{ 'SCORE' } = sprintf( "%.2f", Maasha::Fastq::phred_str_mean( $entry->[ QUAL ] ) );
124 $record->{ 'S_BEG' } -= 1 if $record->{ 'S_BEG' } != 0;
125 $record->{ 'S_BEG' } -= 1 if $record->{ 'S_BEG2' } != 0;
127 $record->{ 'S_END' } = $record->{ 'S_BEG' } + length( $record->{ 'SEQ' } ) - 1;
129 $record->{ 'READ_PAIRED' } = $record->{ 'FLAG' } & 0x0001;
130 $record->{ 'READ_MAPPED' } = $record->{ 'FLAG' } & 0x0002;
131 $record->{ 'Q_SEQ_UNMAPPED' } = $record->{ 'FLAG' } & 0x0004;
132 $record->{ 'MATE_UNMAPPED' } = $record->{ 'FLAG' } & 0x0008;
134 if ( $record->{ 'FLAG' } & 0x0010 ) { # 0 for forward, 1 for reverse
135 $record->{ 'STRAND' } = '-';
137 $record->{ 'STRAND' } = '+';
140 $record->{ 'MATE_STRAND' } = $record->{ 'FLAG' } & 0x0020;
141 $record->{ '1ST_READ' } = $record->{ 'FLAG' } & 0x0040;
142 $record->{ '2ND_READ' } = $record->{ 'FLAG' } & 0x0080;
143 $record->{ 'ALIGN_PRIMARY' } = $record->{ 'FLAG' } & 0x0100;
144 $record->{ 'READ_FAIL' } = $record->{ 'FLAG' } & 0x0200;
145 $record->{ 'READ_BAD' } = $record->{ 'FLAG' } & 0x0400;
147 if ( $record->{ 'READ_PAIRED' } ) {
148 $record->{ 'BLOCK_COUNT' } = 2;
150 $record->{ 'BLOCK_COUNT' } = 1;
153 # extra fields - (hate hate hate SAM!!!)
155 for ( $i = QUAL + 1; $i < @{ $entry }; $i++ )
157 ( $tag, $vtype, $value ) = split /:/, $entry->[ $i ];
159 if ( $tag eq "MD" or $tag eq "NM" ) {
160 $record->{ $tag } = $value;
164 if ( defined $record->{ "NM" } and $record->{ "NM" } > 0 ) {
165 $record->{ "ALIGN" } = sam2align( $record->{ 'SEQ' }, $record->{ 'CIGAR' }, $record->{ 'NM' }, $record->{ 'MD' } );
167 $record->{ "ALIGN" } = ".";
170 return wantarray ? %{ $record } : $record;
184 my ( $offset, $len, $op, $i, $nt, $nt2, @nts, $insertions, @align, $aln_str );
189 while ( length( $cigar ) > 0 )
191 if ( $cigar =~ s/^(\d+)(\w)// )
198 for ( $i = 0; $i < $len; $i++ )
200 $nt = substr $seq, $offset + $i, 1;
202 push @align, [ $offset + $i, "->$nt" ];
216 if ( $md =~ s/^(\d+)// ) # match
220 elsif( $md =~ s/^\^([ATCGN]+)// ) # insertions
226 $nt2 = substr $seq, $offset, 1;
228 push @align, [ $offset, "$nt2>-" ];
237 elsif ( $md =~ s/^([ATCGN]+)// ) # mismatch
243 $nt2 = substr $seq, $offset - $insertions, 1;
245 push @align, [ $offset, "$nt>$nt2" ];
254 Maasha::Common::error( qq(Bad SAM field -> MD: "$md") );
260 @align = sort { $a->[ 0 ] <=> $b->[ 0 ] } @align;
262 map { $aln_str .= "$_->[ 0 ]:$_->[ 1 ]," } @align;
277 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<