3 # Copyright (C) 2007 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 # Routines to parse EMBL records.
28 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
34 use Storable qw( dclone );
39 use vars qw ( @ISA @EXPORT );
42 @ISA = qw( Exporter );
45 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
50 # Martin A. Hansen, June 2006.
52 # Given a filehandle to an embl file,
53 # fetches the next embl entry, and returns
54 # this as a string with multiple lines.
56 my ( $fh, # filehandle to embl file
73 # Martin A. Hansen, June 2006.
75 # given an embl entry extracts the keys
76 # given in an argument hash. Special care
77 # is taken to parse the feature table if
80 my ( $entry, # embl entry
81 $args, # argument hash
84 # returns data structure
86 my ( @lines, $i, %hash, $ft, $seq, $key, $val );
88 @lines = split "\n", $entry;
94 if ( exists $args->{ "keys" } )
96 $args->{ "keys" }->{ "RN" } = 1 if grep { $_ =~ /^R/ } keys %{ $args->{ "keys" } };
98 if ( $lines[ $i ] =~ /^(\w{2})\s+(.*)/ and exists $args->{ "keys" }->{ $1 } )
103 if ( $key =~ /RN|RX|RP|RG|RA|RT|RL/ ) {
104 add_ref( \%hash, \@lines, $i, $args->{ "keys" } ) if $key eq "RN";
105 } elsif ( exists $hash{ $key } and $key eq "FT" ) {
106 $hash{ $key } .= "\n" . $val;
107 } elsif ( exists $hash{ $1 } ) {
108 $hash{ $key } .= " " . $val;
110 $hash{ $key } = $val;
113 elsif ( $lines[ $i ] =~ /^\s+(.*)\s+\d+$/ and exists $args->{ "keys" }->{ "SEQ" } )
120 if ( $lines[ $i ] =~ /^(\w{2})\s+(.*)/ )
125 if ( $key =~ /RN|RX|RP|RG|RA|RT|RL/ ) {
126 add_ref( \%hash, \@lines, $i ) if $key eq "RN";
127 } elsif ( exists $hash{ $1 } and $key eq "FT" ) {
128 $hash{ $key } .= "\n" . $val;
129 } elsif ( exists $hash{ $key } ) {
130 $hash{ $key } .= " " . $val;
132 $hash{ $key } = $val;
135 elsif ( $lines[ $i ] =~ /^\s+(.*)\s+\d+$/ )
147 $hash{ "SEQ" } = $seq;
150 if ( exists $hash{ "FT" } )
152 $ft = parse_feature_table( $hash{ "FT" }, $seq, $args );
156 return wantarray ? %hash : \%hash;
162 # Martin A. Hansen, August 2009.
164 # Add a EMBL reference.
166 my ( $hash, # Parsed EMBL data
167 $lines, # EMBL entry lines
169 $args, # hashref with keys to save - OPTIONAL
178 while ( $lines->[ $i ] =~ /^(\w{2})\s+(.*)/ and $1 ne 'XX' and exists $args->{ $1 } )
180 if ( exists $ref{ $1 } ) {
181 $ref{ $1 } .= " " . $2;
191 while ( $lines->[ $i ] =~ /^(\w{2})\s+(.*)/ and $1 ne 'XX' )
193 if ( exists $ref{ $1 } ) {
194 $ref{ $1 } .= " " . $2;
203 push @{ $hash->{ 'REF' } }, \%ref;
207 sub parse_feature_table
209 # Martin A. Hansen, June 2006.
211 # parses the feature table of a EMBL/GenBank/DDBJ entry.
212 # parsing takes place in 4 steps. 1) the feature key is
213 # located. 2) the locator is located taking into # consideration
214 # that it may be split over multiple lines, which is dealt with
215 # by counting the params that always are present in multiline
216 # locators. 3) the locator is used to fetch the corresponding
217 # sequence. 4) qualifier key/value pars are located again taking
218 # into consideration multiline values, which are dealt with by
219 # keeping track of the "-count (value-less qualifers are also
220 # included). only feature keys and qualifers defined in the
221 # argument hash are returned.
223 my ( $ft, # feature table
224 $seq, # entry sequence
225 $args, # argument hash
228 # returns data structure
230 my ( @lines, $key_regex, $i, $p, $q, %key_hash, $key, $locator, %qual_hash, $qual_name, $qual_val, $subseq );
232 @lines = split "\n", $ft;
234 $key_regex = "[A-Za-z0-9_']+"; # this regex should match every possible feature key (gene, misc_feature, 5'UTR ...)
238 while ( $lines[ $i ] )
240 if ( $lines[ $i ] =~ /^($key_regex)\s+(.+)/ )
247 # ---- getting locator
251 if ( not balance_params( $locator ) )
253 while ( not balance_params( $locator ) )
255 $locator .= $lines[ $i + $p ];
260 push @{ $qual_hash{ "_locator" } }, $locator;
264 # ---- getting subsequence
266 $subseq = parse_locator( $locator, $seq );
268 push @{ $qual_hash{ "_seq" } }, $subseq;
271 # ----- getting qualifiers
273 while ( defined( $lines[ $i + $p ] ) and not $lines[ $i + $p ] =~ /^$key_regex/ )
275 if ( $lines[ $i + $p ] =~ /^\// )
277 if ( $lines[ $i + $p ] =~ /^\/([^=]+)=(.*)$/ )
282 elsif ( $lines[ $i + $p ] =~ /^\/(.*)$/ )
288 # ----- getting qualifier value
292 if ( not balance_quotes( $qual_val ) )
294 while ( not balance_quotes( $qual_val ) )
296 $qual_val .= " " . $lines[ $i + $p + $q ];
301 $qual_val =~ s/^"(.*)"$/$1/;
302 $qual_val =~ tr/ //d if $qual_name =~ /translation/i;
304 if ( exists $args->{ "quals" } ) {
305 push @{ $qual_hash{ $qual_name } }, $qual_val if exists $args->{ "quals" }->{ $qual_name };
307 push @{ $qual_hash{ $qual_name } }, $qual_val;
314 if ( scalar keys %qual_hash > 0 )
316 if ( exists $args->{ "feats" } ) {
317 push @{ $key_hash{ $key } }, dclone \%qual_hash if exists $args->{ "feats" }->{ $key };
319 push @{ $key_hash{ $key } }, dclone \%qual_hash;
327 return wantarray ? %key_hash : \%key_hash;
333 # Martin A. Hansen, June 2006.
335 # uses recursion to parse a locator string from a feature
336 # table and fetches the appropriate subsequence. the operators
337 # join(), complement(), and order() are handled.
338 # the locator string is broken into a comma separated lists, and
339 # modified if the params donnot balance. otherwise the comma separated
340 # list of ranges are stripped from operators, and the subsequence are
341 # fetched and handled according to the operators.
342 # SNP locators are also dealt with (single positions).
344 my ( $locator, # locator string
345 $seq, # nucleotide sequence
346 $subseq, # subsequence being joined
347 $join, # join sequences
348 $comp, # complement sequence or not
349 $order, # order sequences
354 my ( @intervals, $interval, $beg, $end, $newseq );
356 @intervals = split ",", $locator;
358 if ( not balance_params( $intervals[ 0 ] ) ) # locator includes a join/comp/order of several ranges
360 if ( $locator =~ /^join\((.*)\)$/ )
363 $subseq = parse_locator( $1, $seq, $subseq, $join, $comp, $order );
365 elsif ( $locator =~ /^complement\((.*)\)$/ )
368 $subseq = parse_locator( $1, $seq, $subseq, $join, $comp, $order );
371 elsif ( $locator =~ /^order\((.*)\)$/ )
374 $subseq = parse_locator( $1, $seq, $subseq, $join, $comp, $order );
379 foreach $interval ( @intervals )
381 if ( $interval =~ /^join\((.*)\)$/ )
384 $subseq = parse_locator( $1, $seq, $subseq, $join, $comp, $order );
386 elsif ( $interval =~ /^complement\((.*)\)$/ )
389 $subseq = parse_locator( $1, $seq, $subseq, $join, $comp, $order );
392 elsif ( $interval =~ /^order\((.*)\)$/ )
395 $subseq = parse_locator( $1, $seq, $subseq, $join, $comp, $order );
397 elsif ( $interval =~ /^[<>]?(\d+)[^\d]+(\d+)$/ )
402 $newseq = substr $seq, $beg - 1, $end - $beg + 1;
404 $newseq = Maasha::Seq::dna_revcomp( $newseq ) if $comp;
407 $subseq .= " " . $newseq;
412 elsif ( $interval =~ /^(\d+)$/ )
416 $newseq = substr $seq, $beg - 1, 1 ;
418 $newseq = Maasha::Seq::dna_revcomp( $newseq ) if $comp;
421 $subseq .= " " . $newseq;
428 warn qq(WARNING: Could not match locator -> $locator\n);
429 # die qq(ERROR: Could not match locator -> $locator\n);
441 # Martin A. Hansen, June 2006.
443 # given a string checks if left and right params
444 # balances. returns 1 if balanced, else 0.
446 my ( $string, # string to check
454 $param_count += $string =~ tr/(//;
455 $param_count -= $string =~ tr/)//;
457 if ( $param_count == 0 ) {
467 # Martin A. Hansen, June 2006.
469 # given a string checks if the number of double quotes
470 # balances. returns 1 if balanced, else 0.
472 my ( $string, # string to check
479 $quote_count = $string =~ tr/"//;
481 if ( $quote_count % 2 == 0 ) {
491 # Martin A. Hansen, July 2009.
493 # Given a complete EMBL entry and an option hash,
494 # configure the arguments for the EMBL parser so
495 # only wanted information is retrieved and returned
496 # as a Biopiece record.
498 my ( $entry, # EMBL entry to parse
499 $options, # Biopiece options
504 my ( %args, $data, $record, $key_record, $key, $feat, $feat2, $qual, $qual_val, @records, $no_seq );
506 local $Data::Dumper::Terse = 1;
507 local $Data::Dumper::Indent = 0;
509 map { $args{ 'keys' }{ $_ } = 1 } @{ $options->{ 'keys' } };
510 map { $args{ 'feats' }{ $_ } = 1 } @{ $options->{ 'features' } };
511 map { $args{ 'quals' }{ $_ } = 1 } @{ $options->{ 'qualifiers' } };
513 if ( @{ $options->{ 'features' } } > 0 or @{ $options->{ 'qualifiers' } } > 0 ) {
514 $args{ 'keys' }{ 'FT' } = 1;
517 if ( ( $args{ 'feats' } and $args{ 'feats' }{ 'SEQ' } ) or ( $args{ 'quals' } and $args{ 'quals' }{ 'SEQ' } ) )
519 $no_seq = 1 if not $args{ 'keys' }{ 'SEQ' };
521 $args{ 'keys' }{ 'SEQ' } = 1;
522 delete $args{ 'feats' }{ 'SEQ' } if $args{ 'feats' };
523 delete $args{ 'quals' }{ 'SEQ' } if $args{ 'quals' };
526 $data = parse_embl_entry( $entry, \%args );
528 # print Dumper( $data );
530 foreach $key ( keys %{ $data } )
532 if ( $key eq 'SEQ' and $no_seq ) {
536 if ( $key eq 'REF' ) {
537 $key_record->{ $key } = Dumper( $data->{ $key } );
540 if ( $key ne 'FT' and $key ne 'REF' ) {
541 $key_record->{ $key } = $data->{ $key };
545 if ( exists $data->{ 'FT' } )
547 $record->{ 'FT' } = Dumper( $data->{ 'FT' } );
549 map { $record->{ $_ } = $key_record->{ $_ } } keys %{ $key_record };
551 push @records, $record;
555 push @records, $key_record;
558 return wantarray ? @records : \@records;
562 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<