3 # Copyright (C) 2007 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 # Routines to parse EMBL records.
28 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
34 use Storable qw( dclone );
39 use vars qw ( @ISA @EXPORT );
42 @ISA = qw( Exporter );
45 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
50 # Martin A. Hansen, June 2006.
52 # Given a filehandle to an embl file,
53 # fetches the next embl entry, and returns
54 # this as a string with multiple lines.
56 my ( $fh, # filehandle to embl file
73 # Martin A. Hansen, June 2006.
75 # given an embl entry extracts the keys
76 # given in an argument hash. Special care
77 # is taken to parse the feature table if
80 my ( $entry, # embl entry
81 $args, # argument hash
84 # returns data structure
86 my ( @lines, $i, %hash, $ft, $seq, $key, $val, $ref );
88 @lines = split "\n", $entry;
94 if ( exists $args->{ "keys" } )
96 $args->{ "keys" }->{ "RN" } = 1 if grep { $_ =~ /^R/ } keys %{ $args->{ "keys" } };
98 if ( $lines[ $i ] =~ /^(\w{2})\s+(.*)/ and exists $args->{ "keys" }->{ $1 } )
103 if ( $key =~ /RN|RX|RP|RG|RA|RT|RL/ ) {
104 add_ref( \%hash, \@lines, $i, $args->{ "keys" } ) if $key eq "RN";
105 } elsif ( exists $hash{ $key } and $key eq "FT" ) {
106 $hash{ $key } .= "\n" . $val;
107 } elsif ( exists $hash{ $1 } ) {
108 $hash{ $key } .= " " . $val;
110 $hash{ $key } = $val;
113 elsif ( $lines[ $i ] =~ /^\s+(.*)\s+\d+$/ and exists $args->{ "keys" }->{ "SEQ" } )
120 if ( $lines[ $i ] =~ /^(\w{2})\s+(.*)/ )
125 if ( $key =~ /RN|RX|RP|RG|RA|RT|RL/ ) {
126 add_ref( \%hash, \@lines, $i ) if $key eq "RN";
127 } elsif ( exists $hash{ $1 } and $key eq "FT" ) {
128 $hash{ $key } .= "\n" . $val;
129 } elsif ( exists $hash{ $key } ) {
130 $hash{ $key } .= " " . $val;
132 $hash{ $key } = $val;
135 elsif ( $lines[ $i ] =~ /^\s+(.*)\s+\d+$/ )
147 $hash{ "SEQ" } = $seq;
150 if ( exists $hash{ "FT" } )
152 $ft = parse_feature_table( $hash{ "FT" }, $seq, $args );
156 return wantarray ? %hash : \%hash;
162 # Martin A. Hansen, August 2009.
164 # Add a EMBL reference.
166 my ( $hash, # Parsed EMBL data
167 $lines, # EMBL entry lines
169 $args, # hashref with keys to save - OPTIONAL
174 my ( %ref, $key, $val );
178 while ( $lines->[ $i ] =~ /^(\w{2})\s+(.*)/ )
183 last if $key eq "XX";
185 if ( exists $args->{ $key } )
187 if ( exists $ref{ $key } ) {
188 $ref{ $key } .= " " . $val;
199 while ( $lines->[ $i ] =~ /^(\w{2})\s+(.*)/ and $1 ne 'XX' )
201 if ( exists $ref{ $1 } ) {
202 $ref{ $1 } .= " " . $2;
211 push @{ $hash->{ 'REF' } }, \%ref;
215 sub parse_feature_table
217 # Martin A. Hansen, June 2006.
219 # parses the feature table of a EMBL/GenBank/DDBJ entry.
220 # parsing takes place in 4 steps. 1) the feature key is
221 # located. 2) the locator is located taking into # consideration
222 # that it may be split over multiple lines, which is dealt with
223 # by counting the params that always are present in multiline
224 # locators. 3) the locator is used to fetch the corresponding
225 # sequence. 4) qualifier key/value pars are located again taking
226 # into consideration multiline values, which are dealt with by
227 # keeping track of the "-count (value-less qualifers are also
228 # included). only feature keys and qualifers defined in the
229 # argument hash are returned.
231 my ( $ft, # feature table
232 $seq, # entry sequence
233 $args, # argument hash
236 # returns data structure
238 my ( @lines, $key_regex, $i, $p, $q, %key_hash, $key, $locator, %qual_hash, $qual_name, $qual_val, $subseq );
240 @lines = split "\n", $ft;
242 $key_regex = "[A-Za-z0-9_']+"; # this regex should match every possible feature key (gene, misc_feature, 5'UTR ...)
246 while ( $lines[ $i ] )
248 if ( $lines[ $i ] =~ /^($key_regex)\s+(.+)/ )
255 # ---- getting locator
259 if ( not balance_params( $locator ) )
261 while ( not balance_params( $locator ) )
263 $locator .= $lines[ $i + $p ];
268 push @{ $qual_hash{ "_locator" } }, $locator;
272 # ---- getting subsequence
274 $subseq = parse_locator( $locator, $seq );
276 push @{ $qual_hash{ "_seq" } }, $subseq;
279 # ----- getting qualifiers
281 while ( defined( $lines[ $i + $p ] ) and not $lines[ $i + $p ] =~ /^$key_regex/ )
283 if ( $lines[ $i + $p ] =~ /^\// )
285 if ( $lines[ $i + $p ] =~ /^\/([^=]+)=(.*)$/ )
290 elsif ( $lines[ $i + $p ] =~ /^\/(.*)$/ )
296 # ----- getting qualifier value
300 if ( not balance_quotes( $qual_val ) )
302 while ( not balance_quotes( $qual_val ) )
304 $qual_val .= " " . $lines[ $i + $p + $q ];
309 $qual_val =~ s/^"(.*)"$/$1/;
310 $qual_val =~ tr/ //d if $qual_name =~ /translation/i;
312 if ( exists $args->{ "quals" } ) {
313 push @{ $qual_hash{ $qual_name } }, $qual_val if exists $args->{ "quals" }->{ $qual_name };
315 push @{ $qual_hash{ $qual_name } }, $qual_val;
322 if ( scalar keys %qual_hash > 0 )
324 if ( exists $args->{ "feats" } ) {
325 push @{ $key_hash{ $key } }, dclone \%qual_hash if exists $args->{ "feats" }->{ $key };
327 push @{ $key_hash{ $key } }, dclone \%qual_hash;
335 return wantarray ? %key_hash : \%key_hash;
341 # Martin A. Hansen, June 2006.
343 # uses recursion to parse a locator string from a feature
344 # table and fetches the appropriate subsequence. the operators
345 # join(), complement(), and order() are handled.
346 # the locator string is broken into a comma separated lists, and
347 # modified if the params donnot balance. otherwise the comma separated
348 # list of ranges are stripped from operators, and the subsequence are
349 # fetched and handled according to the operators.
350 # SNP locators are also dealt with (single positions).
352 my ( $locator, # locator string
353 $seq, # nucleotide sequence
354 $subseq, # subsequence being joined
355 $join, # join sequences
356 $comp, # complement sequence or not
357 $order, # order sequences
362 my ( @intervals, $interval, $beg, $end, $newseq );
364 @intervals = split ",", $locator;
366 if ( not balance_params( $intervals[ 0 ] ) ) # locator includes a join/comp/order of several ranges
368 if ( $locator =~ /^join\((.*)\)$/ )
371 $subseq = parse_locator( $1, $seq, $subseq, $join, $comp, $order );
373 elsif ( $locator =~ /^complement\((.*)\)$/ )
376 $subseq = parse_locator( $1, $seq, $subseq, $join, $comp, $order );
379 elsif ( $locator =~ /^order\((.*)\)$/ )
382 $subseq = parse_locator( $1, $seq, $subseq, $join, $comp, $order );
387 foreach $interval ( @intervals )
389 if ( $interval =~ /^join\((.*)\)$/ )
392 $subseq = parse_locator( $1, $seq, $subseq, $join, $comp, $order );
394 elsif ( $interval =~ /^complement\((.*)\)$/ )
397 $subseq = parse_locator( $1, $seq, $subseq, $join, $comp, $order );
400 elsif ( $interval =~ /^order\((.*)\)$/ )
403 $subseq = parse_locator( $1, $seq, $subseq, $join, $comp, $order );
405 elsif ( $interval =~ /^[<>]?(\d+)[^\d]+(\d+)$/ )
410 $newseq = substr $seq, $beg - 1, $end - $beg + 1;
412 $newseq = Maasha::Seq::dna_revcomp( $newseq ) if $comp;
415 $subseq .= " " . $newseq;
420 elsif ( $interval =~ /^(\d+)$/ )
424 $newseq = substr $seq, $beg - 1, 1 ;
426 $newseq = Maasha::Seq::dna_revcomp( $newseq ) if $comp;
429 $subseq .= " " . $newseq;
436 warn qq(WARNING: Could not match locator -> $locator\n);
437 # die qq(ERROR: Could not match locator -> $locator\n);
449 # Martin A. Hansen, June 2006.
451 # given a string checks if left and right params
452 # balances. returns 1 if balanced, else 0.
454 my ( $string, # string to check
462 $param_count += $string =~ tr/(//;
463 $param_count -= $string =~ tr/)//;
465 if ( $param_count == 0 ) {
475 # Martin A. Hansen, June 2006.
477 # given a string checks if the number of double quotes
478 # balances. returns 1 if balanced, else 0.
480 my ( $string, # string to check
487 $quote_count = $string =~ tr/"//;
489 if ( $quote_count % 2 == 0 ) {
499 # Martin A. Hansen, July 2009.
501 # Given a complete EMBL entry and an option hash,
502 # configure the arguments for the EMBL parser so
503 # only wanted information is retrieved and returned
504 # as a Biopiece record.
506 my ( $entry, # EMBL entry to parse
507 $options, # Biopiece options
512 my ( %args, $data, $record, $key_record, $key, $feat, $feat2, $qual, $qual_val, @records, $no_seq );
514 local $Data::Dumper::Terse = 1;
515 local $Data::Dumper::Indent = 0;
517 map { $args{ 'keys' }{ $_ } = 1 } @{ $options->{ 'keys' } };
518 map { $args{ 'feats' }{ $_ } = 1 } @{ $options->{ 'features' } };
519 map { $args{ 'quals' }{ $_ } = 1 } @{ $options->{ 'qualifiers' } };
521 if ( @{ $options->{ 'features' } } > 0 or @{ $options->{ 'qualifiers' } } > 0 ) {
522 $args{ 'keys' }{ 'FT' } = 1;
525 if ( ( $args{ 'feats' } and $args{ 'feats' }{ 'SEQ' } ) or ( $args{ 'quals' } and $args{ 'quals' }{ 'SEQ' } ) )
527 $no_seq = 1 if not $args{ 'keys' }{ 'SEQ' };
529 $args{ 'keys' }{ 'SEQ' } = 1;
530 delete $args{ 'feats' }{ 'SEQ' } if $args{ 'feats' };
531 delete $args{ 'quals' }{ 'SEQ' } if $args{ 'quals' };
534 $data = parse_embl_entry( $entry, \%args );
536 # print Dumper( $data );
538 foreach $key ( keys %{ $data } )
540 if ( $key eq 'SEQ' and $no_seq ) {
544 if ( $key eq 'REF' ) {
545 $key_record->{ $key } = Dumper( $data->{ $key } );
548 if ( $key ne 'FT' and $key ne 'REF' ) {
549 $key_record->{ $key } = $data->{ $key };
553 if ( exists $data->{ 'FT' } )
555 $record->{ 'FT' } = Dumper( $data->{ 'FT' } );
557 map { $record->{ $_ } = $key_record->{ $_ } } keys %{ $key_record };
559 push @records, $record;
563 push @records, $key_record;
566 return wantarray ? @records : \@records;
570 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<