# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+use warnings;
use strict;
use Data::Dumper;
use Storable qw( dclone );
my ( $entry );
- $/ = "//\n";
+ local $/ = "//\n";
$entry = <$fh>;
# returns data structure
- my ( @lines, $line, %hash, $ft, $seq, $key );
+ my ( @lines, $i, %hash, $ft, $seq, $key, $val, $ref );
@lines = split "\n", $entry;
- foreach $line ( @lines )
+ $i = 0;
+
+ while ( $i < @lines )
{
if ( exists $args->{ "keys" } )
{
- if ( $line =~ /^(\w{2})\s+(.*)/ and exists $args->{ "keys" }->{ $1 } )
+ $args->{ "keys" }->{ "RN" } = 1 if grep { $_ =~ /^R/ } keys %{ $args->{ "keys" } };
+
+ if ( $lines[ $i ] =~ /^(\w{2})\s+(.*)/ and exists $args->{ "keys" }->{ $1 } )
{
- if ( exists $hash{ $1 } and $1 eq "FT" ) {
- $hash{ $1 } .= "\n" . $2;
+ $key = $1;
+ $val = $2;
+
+ if ( $key =~ /RN|RX|RP|RG|RA|RT|RL/ ) {
+ add_ref( \%hash, \@lines, $i, $args->{ "keys" } ) if $key eq "RN";
+ } elsif ( exists $hash{ $key } and $key eq "FT" ) {
+ $hash{ $key } .= "\n" . $val;
} elsif ( exists $hash{ $1 } ) {
- $hash{ $1 } .= " " . $2;
+ $hash{ $key } .= " " . $val;
} else {
- $hash{ $1 } = $2;
+ $hash{ $key } = $val;
}
}
- elsif ( $line =~ /^\s+(.*)\s+\d+$/ and exists $args->{ "keys" }->{ "SEQ" } )
+ elsif ( $lines[ $i ] =~ /^\s+(.*)\s+\d+$/ and exists $args->{ "keys" }->{ "SEQ" } )
{
$seq .= $1;
}
}
else
{
- if ( $line =~ /^(\w{2})\s+(.*)/ )
+ if ( $lines[ $i ] =~ /^(\w{2})\s+(.*)/ )
{
- if ( exists $hash{ $1 } and $1 eq "FT" ) {
- $hash{ $1 } .= "\n" . $2;
- } elsif ( exists $hash{ $1 } ) {
- $hash{ $1 } .= " " . $2;
+ $key = $1;
+ $val = $2;
+
+ if ( $key =~ /RN|RX|RP|RG|RA|RT|RL/ ) {
+ add_ref( \%hash, \@lines, $i ) if $key eq "RN";
+ } elsif ( exists $hash{ $1 } and $key eq "FT" ) {
+ $hash{ $key } .= "\n" . $val;
+ } elsif ( exists $hash{ $key } ) {
+ $hash{ $key } .= " " . $val;
} else {
- $hash{ $1 } = $2;
+ $hash{ $key } = $val;
}
}
- elsif ( $line =~ /^\s+(.*)\s+\d+$/ )
+ elsif ( $lines[ $i ] =~ /^\s+(.*)\s+\d+$/ )
{
$seq .= $1;
}
}
+
+ $i++;
}
if ( $seq )
$hash{ "SEQ" } = $seq;
}
-# foreach $key ( keys %hash )
-# {
-# next if $key =~ /^(SEQ|SEQ_FT|FT)/;
-#
-# if ( not $hash{ $key } =~ /$args->{ $key }/i ) {
-# return wantarray ? () : {} ;
-# }
-# }
-
if ( exists $hash{ "FT" } )
{
- $seq =~ tr/ //d;
- $ft = &parse_feature_table( $hash{ "FT" }, $seq, $args );
+ $ft = parse_feature_table( $hash{ "FT" }, $seq, $args );
$hash{ "FT" } = $ft;
}
}
+sub add_ref
+{
+ # Martin A. Hansen, August 2009.
+
+ # Add a EMBL reference.
+
+ my ( $hash, # Parsed EMBL data
+ $lines, # EMBL entry lines
+ $i, # line index
+ $args, # hashref with keys to save - OPTIONAL
+ ) = @_;
+
+ # Returns nothing.
+
+ my ( %ref, $key, $val );
+
+ if ( $args )
+ {
+ while ( $lines->[ $i ] =~ /^(\w{2})\s+(.*)/ )
+ {
+ $key = $1;
+ $val = $2;
+
+ last if $key eq "XX";
+
+ if ( exists $args->{ $key } )
+ {
+ if ( exists $ref{ $key } ) {
+ $ref{ $key } .= " " . $val;
+ } else {
+ $ref{ $key } = $val;
+ }
+ }
+
+ $i++;
+ }
+ }
+ else
+ {
+ while ( $lines->[ $i ] =~ /^(\w{2})\s+(.*)/ and $1 ne 'XX' )
+ {
+ if ( exists $ref{ $1 } ) {
+ $ref{ $1 } .= " " . $2;
+ } else {
+ $ref{ $1 } = $2;
+ }
+
+ $i++;
+ }
+ }
+
+ push @{ $hash->{ 'REF' } }, \%ref;
+}
+
+
sub parse_feature_table
{
# Martin A. Hansen, June 2006.
# argument hash are returned.
my ( $ft, # feature table
- $seq, # entry sequnce
+ $seq, # entry sequence
$args, # argument hash
) = @_;
$p = 1;
- if ( not &balance_params( $locator ) )
+ if ( not balance_params( $locator ) )
{
- while ( not &balance_params( $locator ) )
+ while ( not balance_params( $locator ) )
{
$locator .= $lines[ $i + $p ];
$p++;
push @{ $qual_hash{ "_locator" } }, $locator;
- # ---- getting subsequence
+ if ( $seq )
+ {
+ # ---- getting subsequence
- $subseq = &parse_locator( $locator, $seq );
+ $subseq = parse_locator( $locator, $seq );
- push @{ $qual_hash{ "_seq" } }, $subseq;
+ push @{ $qual_hash{ "_seq" } }, $subseq;
+ }
# ----- getting qualifiers
$q = 1;
- if ( not &balance_quotes( $qual_val ) )
+ if ( not balance_quotes( $qual_val ) )
{
- while ( not &balance_quotes( $qual_val ) )
+ while ( not balance_quotes( $qual_val ) )
{
$qual_val .= " " . $lines[ $i + $p + $q ];
$q++;
@intervals = split ",", $locator;
- if ( not &balance_params( $intervals[ 0 ] ) ) # locator includes a join/comp/order of several ranges
+ if ( not balance_params( $intervals[ 0 ] ) ) # locator includes a join/comp/order of several ranges
{
if ( $locator =~ /^join\((.*)\)$/ )
{
$join = 1;
- $subseq = &parse_locator( $1, $seq, $subseq, $join, $comp, $order );
+ $subseq = parse_locator( $1, $seq, $subseq, $join, $comp, $order );
}
elsif ( $locator =~ /^complement\((.*)\)$/ )
{
$comp = 1;
- $subseq = &parse_locator( $1, $seq, $subseq, $join, $comp, $order );
+ $subseq = parse_locator( $1, $seq, $subseq, $join, $comp, $order );
}
elsif ( $locator =~ /^order\((.*)\)$/ )
{
$order = 1;
- $subseq = &parse_locator( $1, $seq, $subseq, $join, $comp, $order );
+ $subseq = parse_locator( $1, $seq, $subseq, $join, $comp, $order );
}
}
else
if ( $interval =~ /^join\((.*)\)$/ )
{
$join = 1;
- $subseq = &parse_locator( $1, $seq, $subseq, $join, $comp, $order );
+ $subseq = parse_locator( $1, $seq, $subseq, $join, $comp, $order );
}
elsif ( $interval =~ /^complement\((.*)\)$/ )
{
$comp = 1;
- $subseq = &parse_locator( $1, $seq, $subseq, $join, $comp, $order );
+ $subseq = parse_locator( $1, $seq, $subseq, $join, $comp, $order );
}
elsif ( $interval =~ /^order\((.*)\)$/ )
{
$order = 1;
- $subseq = &parse_locator( $1, $seq, $subseq, $join, $comp, $order );
+ $subseq = parse_locator( $1, $seq, $subseq, $join, $comp, $order );
}
elsif ( $interval =~ /^[<>]?(\d+)[^\d]+(\d+)$/ )
{
$newseq = substr $seq, $beg - 1, $end - $beg + 1;
- $newseq = &Maasha::Seq::dna_revcomp( $newseq ) if $comp;
+ $newseq = Maasha::Seq::dna_revcomp( $newseq ) if $comp;
if ( $order ) {
$subseq .= " " . $newseq;
$newseq = substr $seq, $beg - 1, 1 ;
- $newseq = &Maasha::Seq::dna_revcomp( $newseq ) if $comp;
+ $newseq = Maasha::Seq::dna_revcomp( $newseq ) if $comp;
if ( $order ) {
$subseq .= " " . $newseq;
}
+sub embl2biopieces
+{
+ # Martin A. Hansen, July 2009.
+
+ # Given a complete EMBL entry and an option hash,
+ # configure the arguments for the EMBL parser so
+ # only wanted information is retrieved and returned
+ # as a Biopiece record.
+
+ my ( $entry, # EMBL entry to parse
+ $options, # Biopiece options
+ ) = @_;
+
+ # Returns a hashref.
+
+ my ( %args, $data, $record, $key_record, $key, $feat, $feat2, $qual, $qual_val, @records, $no_seq );
+
+ local $Data::Dumper::Terse = 1;
+ local $Data::Dumper::Indent = 0;
+
+ map { $args{ 'keys' }{ $_ } = 1 } @{ $options->{ 'keys' } };
+ map { $args{ 'feats' }{ $_ } = 1 } @{ $options->{ 'features' } };
+ map { $args{ 'quals' }{ $_ } = 1 } @{ $options->{ 'qualifiers' } };
+
+ if ( @{ $options->{ 'features' } } > 0 or @{ $options->{ 'qualifiers' } } > 0 ) {
+ $args{ 'keys' }{ 'FT' } = 1;
+ }
+
+ if ( ( $args{ 'feats' } and $args{ 'feats' }{ 'SEQ' } ) or ( $args{ 'quals' } and $args{ 'quals' }{ 'SEQ' } ) )
+ {
+ $no_seq = 1 if not $args{ 'keys' }{ 'SEQ' };
+
+ $args{ 'keys' }{ 'SEQ' } = 1;
+ delete $args{ 'feats' }{ 'SEQ' } if $args{ 'feats' };
+ delete $args{ 'quals' }{ 'SEQ' } if $args{ 'quals' };
+ }
+
+ $data = parse_embl_entry( $entry, \%args );
+
+ # print Dumper( $data );
+
+ foreach $key ( keys %{ $data } )
+ {
+ if ( $key eq 'SEQ' and $no_seq ) {
+ next;
+ }
+
+ if ( $key eq 'REF' ) {
+ $key_record->{ $key } = Dumper( $data->{ $key } );
+ }
+
+ if ( $key ne 'FT' and $key ne 'REF' ) {
+ $key_record->{ $key } = $data->{ $key };
+ }
+ }
+
+ if ( exists $data->{ 'FT' } )
+ {
+ $record->{ 'FT' } = Dumper( $data->{ 'FT' } );
+
+ map { $record->{ $_ } = $key_record->{ $_ } } keys %{ $key_record };
+
+ push @records, $record;
+ }
+ else
+ {
+ push @records, $key_record;
+ }
+
+ return wantarray ? @records : \@records;
+}
+
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<