From: martinahansen Date: Mon, 9 Nov 2009 08:43:40 +0000 (+0000) Subject: improved SAM parser X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=6d0ad02102c50fbb115cbfe1b1c0dbebac1951d0;p=biopieces.git improved SAM parser git-svn-id: http://biopieces.googlecode.com/svn/trunk@752 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/code_perl/Maasha/Fastq.pm b/code_perl/Maasha/Fastq.pm index 4027209..80fea57 100644 --- a/code_perl/Maasha/Fastq.pm +++ b/code_perl/Maasha/Fastq.pm @@ -195,7 +195,7 @@ void solexa_str2phred_str( char *scores ) int i = 0; - for ( i + 0; i < strlen( scores ); i++ ) { + for ( i = 0; i < strlen( scores ); i++ ) { scores[ i ] = solexa2phred( scores[ i ] ); } } @@ -209,12 +209,35 @@ void phred_str2solexa_str( char *scores ) int i = 0; - for ( i + 0; i < strlen( scores ); i++ ) { + for ( i = 0; i < strlen( scores ); i++ ) { scores[ i ] = phred2solexa( scores[ i ] ); } } +double phred_str_mean( char *scores ) +{ + /* Martin A. Hansen, November 2009 */ + + /* Calculates the mean score as a float which is retuned. */ + + int len = 0; + int i = 0; + int sum = 0; + double mean = 0.0; + + len = strlen( scores ); + + for ( i = 0; i < len; i++ ) { + sum += phred2dec( scores[ i ] ); + } + + mean = ( double ) sum / ( double ) len; + + return mean; +} + + void softmask_solexa_str( char *seq, char *scores, int threshold ) { /* Martin A. Hansen, July 2009 */ diff --git a/code_perl/Maasha/SAM.pm b/code_perl/Maasha/SAM.pm index 6683973..e794d75 100644 --- a/code_perl/Maasha/SAM.pm +++ b/code_perl/Maasha/SAM.pm @@ -32,9 +32,10 @@ package Maasha::SAM; use warnings; use strict; use Maasha::Common; +use Maasha::Fastq; use Data::Dumper; -use vars qw ( @ISA @EXPORT ); +use vars qw( @ISA @EXPORT ); @ISA = qw( Exporter ); @@ -50,9 +51,6 @@ use constant { ISIZE => 8, SEQ => 9, QUAL => 10, - TAG => 11, - VTYPE => 12, - VALUE => 13, }; @@ -104,7 +102,9 @@ sub sam2biopiece # Returns a hashref. - my ( $record ); + my ( $record, $i, $tag, $vtype, $value ); + + return if $entry->[ RNAME ] eq '*'; $record->{ 'REC_TYPE' } = 'SAM'; $record->{ 'Q_ID' } = $entry->[ QNAME ]; @@ -118,9 +118,7 @@ sub sam2biopiece $record->{ 'ISIZE' } = $entry->[ ISIZE ]; $record->{ 'SEQ' } = $entry->[ SEQ ]; $record->{ 'SCORES' } = $entry->[ QUAL ]; - $record->{ 'TAG' } = $entry->[ TAG ]; - $record->{ 'VTYPE' } = $entry->[ VTYPE ]; - $record->{ 'VALUE' } = $entry->[ VALUE ]; + $record->{ 'SCORE' } = sprintf( "%.2f", Maasha::Fastq::phred_str_mean( $entry->[ QUAL ] ) ); $record->{ 'S_BEG' } -= 1 if $record->{ 'S_BEG' } != 0; $record->{ 'S_BEG' } -= 1 if $record->{ 'S_BEG2' } != 0; @@ -145,10 +143,114 @@ sub sam2biopiece $record->{ 'READ_FAIL' } = $record->{ 'FLAG' } & 0x0200; $record->{ 'READ_BAD' } = $record->{ 'FLAG' } & 0x0400; + + # extra fields - (hate hate hate SAM!!!) + + for ( $i = QUAL + 1; $i < @{ $entry }; $i++ ) + { + ( $tag, $vtype, $value ) = split /:/, $entry->[ $i ]; + + if ( $tag eq "MD" or $tag eq "NM" ) { + $record->{ $tag } = $value; + } + } + + if ( defined $record->{ "NM" } and $record->{ "NM" } > 0 ) { + $record->{ "ALIGN" } = sam2align( $record->{ 'SEQ' }, $record->{ 'CIGAR' }, $record->{ 'NM' }, $record->{ 'MD' } ); + } else { + $record->{ "ALIGN" } = "."; + } + return wantarray ? %{ $record } : $record; } +sub sam2align +{ + my ( $seq, + $cigar, + $nm, + $md, + ) = @_; + + # Returns a string. + + my ( $offset, $len, $op, $i, $nt, $nt2, @align, $aln_str ); + + $offset = 0; + + while ( length( $cigar ) > 0 ) + { + if ( $cigar =~ s/^(\d+)(\w)// ) + { + $len = $1; + $op = $2; + + if ( $op eq 'I' ) + { + for ( $i = 0; $i < $len; $i++ ) + { + $nt = substr $seq, $offset + $i, 1; + + push @align, [ $offset + $i, "->$nt" ]; + + $nm--; + } + } + + $offset += $len; + } + } + + $offset = 0; + + while ( $nm > 0 ) + { + if ( $md =~ s/^(\d+)// ) + { + $offset += $1; + } + elsif ( $md =~ s/^(\w)// ) + { + $nt = $1; + + $nt2 = substr $seq, $offset, 1; + + push @align, [ $offset, "$nt>$nt2" ]; + + $offset++; + + $nm--; + } + elsif( $md =~ s/^\^// ) + { + $nt = substr $seq, $offset, 1; + + push @align, [ $offset, "$nt>-" ]; + + $offset++; + + $nm--; + } + else + { + Maasha::Common::error( qq(Bad SAM field -> MD: "$md") ); + } + } + + + $aln_str = ""; + + @align = sort { $a->[ 0 ] <=> $b->[ 0 ] } @align; + + map { $aln_str .= "$_->[ 0 ]:$_->[ 1 ]," } @align; + + $aln_str =~ s/,$//; + + return $aln_str; +} + + sub biopiece2sam {