3 # Copyright (C) 2007 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 # Routines to parse EMBL records.
28 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
33 use Storable qw( dclone );
38 use vars qw ( @ISA @EXPORT );
41 @ISA = qw( Exporter );
44 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
49 # Martin A. Hansen, June 2006.
51 # Given a filehandle to an embl file,
52 # fetches the next embl entry, and returns
53 # this as a string with multiple lines.
55 my ( $fh, # filehandle to embl file
72 # Martin A. Hansen, June 2006.
74 # given an embl entry extracts the keys
75 # given in an argument hash. Special care
76 # is taken to parse the feature table if
79 my ( $entry, # embl entry
80 $args, # argument hash
83 # returns data structure
85 my ( @lines, $line, %hash, $ft, $seq, $key );
87 @lines = split "\n", $entry;
89 foreach $line ( @lines )
91 if ( exists $args->{ "keys" } )
93 if ( $line =~ /^(\w{2})\s+(.*)/ and exists $args->{ "keys" }->{ $1 } )
95 if ( exists $hash{ $1 } and $1 eq "FT" ) {
96 $hash{ $1 } .= "\n" . $2;
97 } elsif ( exists $hash{ $1 } ) {
98 $hash{ $1 } .= " " . $2;
103 elsif ( $line =~ /^\s+(.*)\s+\d+$/ and exists $args->{ "keys" }->{ "SEQ" } )
110 if ( $line =~ /^(\w{2})\s+(.*)/ )
112 if ( exists $hash{ $1 } and $1 eq "FT" ) {
113 $hash{ $1 } .= "\n" . $2;
114 } elsif ( exists $hash{ $1 } ) {
115 $hash{ $1 } .= " " . $2;
120 elsif ( $line =~ /^\s+(.*)\s+\d+$/ )
130 $hash{ "SEQ" } = $seq;
133 # foreach $key ( keys %hash )
135 # next if $key =~ /^(SEQ|SEQ_FT|FT)/;
137 # if ( not $hash{ $key } =~ /$args->{ $key }/i ) {
138 # return wantarray ? () : {} ;
142 if ( exists $hash{ "FT" } )
145 $ft = parse_feature_table( $hash{ "FT" }, $seq, $args );
149 return wantarray ? %hash : \%hash;
153 sub parse_feature_table
155 # Martin A. Hansen, June 2006.
157 # parses the feature table of a EMBL/GenBank/DDBJ entry.
158 # parsing takes place in 4 steps. 1) the feature key is
159 # located. 2) the locator is located taking into # consideration
160 # that it may be split over multiple lines, which is dealt with
161 # by counting the params that always are present in multiline
162 # locators. 3) the locator is used to fetch the corresponding
163 # sequence. 4) qualifier key/value pars are located again taking
164 # into consideration multiline values, which are dealt with by
165 # keeping track of the "-count (value-less qualifers are also
166 # included). only feature keys and qualifers defined in the
167 # argument hash are returned.
169 my ( $ft, # feature table
170 $seq, # entry sequnce
171 $args, # argument hash
174 # returns data structure
176 my ( @lines, $key_regex, $i, $p, $q, %key_hash, $key, $locator, %qual_hash, $qual_name, $qual_val, $subseq );
178 @lines = split "\n", $ft;
180 $key_regex = "[A-Za-z0-9_']+"; # this regex should match every possible feature key (gene, misc_feature, 5'UTR ...)
184 while ( $lines[ $i ] )
186 if ( $lines[ $i ] =~ /^($key_regex)\s+(.+)/ )
193 # ---- getting locator
197 if ( not balance_params( $locator ) )
199 while ( not balance_params( $locator ) )
201 $locator .= $lines[ $i + $p ];
206 push @{ $qual_hash{ "_locator" } }, $locator;
208 # ---- getting subsequence
210 $subseq = parse_locator( $locator, $seq );
212 push @{ $qual_hash{ "_seq" } }, $subseq;
214 # ----- getting qualifiers
216 while ( defined( $lines[ $i + $p ] ) and not $lines[ $i + $p ] =~ /^$key_regex/ )
218 if ( $lines[ $i + $p ] =~ /^\// )
220 if ( $lines[ $i + $p ] =~ /^\/([^=]+)=(.*)$/ )
225 elsif ( $lines[ $i + $p ] =~ /^\/(.*)$/ )
231 # ----- getting qualifier value
235 if ( not balance_quotes( $qual_val ) )
237 while ( not balance_quotes( $qual_val ) )
239 $qual_val .= " " . $lines[ $i + $p + $q ];
244 $qual_val =~ s/^"(.*)"$/$1/;
245 $qual_val =~ tr/ //d if $qual_name =~ /translation/i;
247 if ( exists $args->{ "quals" } ) {
248 push @{ $qual_hash{ $qual_name } }, $qual_val if exists $args->{ "quals" }->{ $qual_name };
250 push @{ $qual_hash{ $qual_name } }, $qual_val;
257 if ( scalar keys %qual_hash > 0 )
259 if ( exists $args->{ "feats" } ) {
260 push @{ $key_hash{ $key } }, dclone \%qual_hash if exists $args->{ "feats" }->{ $key };
262 push @{ $key_hash{ $key } }, dclone \%qual_hash;
270 return wantarray ? %key_hash : \%key_hash;
276 # Martin A. Hansen, June 2006.
278 # uses recursion to parse a locator string from a feature
279 # table and fetches the appropriate subsequence. the operators
280 # join(), complement(), and order() are handled.
281 # the locator string is broken into a comma separated lists, and
282 # modified if the params donnot balance. otherwise the comma separated
283 # list of ranges are stripped from operators, and the subsequence are
284 # fetched and handled according to the operators.
285 # SNP locators are also dealt with (single positions).
287 my ( $locator, # locator string
288 $seq, # nucleotide sequence
289 $subseq, # subsequence being joined
290 $join, # join sequences
291 $comp, # complement sequence or not
292 $order, # order sequences
297 my ( @intervals, $interval, $beg, $end, $newseq );
299 @intervals = split ",", $locator;
301 if ( not balance_params( $intervals[ 0 ] ) ) # locator includes a join/comp/order of several ranges
303 if ( $locator =~ /^join\((.*)\)$/ )
306 $subseq = parse_locator( $1, $seq, $subseq, $join, $comp, $order );
308 elsif ( $locator =~ /^complement\((.*)\)$/ )
311 $subseq = parse_locator( $1, $seq, $subseq, $join, $comp, $order );
314 elsif ( $locator =~ /^order\((.*)\)$/ )
317 $subseq = parse_locator( $1, $seq, $subseq, $join, $comp, $order );
322 foreach $interval ( @intervals )
324 if ( $interval =~ /^join\((.*)\)$/ )
327 $subseq = parse_locator( $1, $seq, $subseq, $join, $comp, $order );
329 elsif ( $interval =~ /^complement\((.*)\)$/ )
332 $subseq = parse_locator( $1, $seq, $subseq, $join, $comp, $order );
335 elsif ( $interval =~ /^order\((.*)\)$/ )
338 $subseq = parse_locator( $1, $seq, $subseq, $join, $comp, $order );
340 elsif ( $interval =~ /^[<>]?(\d+)[^\d]+(\d+)$/ )
345 $newseq = substr $seq, $beg - 1, $end - $beg + 1;
347 $newseq = Maasha::Seq::dna_revcomp( $newseq ) if $comp;
350 $subseq .= " " . $newseq;
355 elsif ( $interval =~ /^(\d+)$/ )
359 $newseq = substr $seq, $beg - 1, 1 ;
361 $newseq = Maasha::Seq::dna_revcomp( $newseq ) if $comp;
364 $subseq .= " " . $newseq;
371 warn qq(WARNING: Could not match locator -> $locator\n);
372 # die qq(ERROR: Could not match locator -> $locator\n);
384 # Martin A. Hansen, June 2006.
386 # given a string checks if left and right params
387 # balances. returns 1 if balanced, else 0.
389 my ( $string, # string to check
397 $param_count += $string =~ tr/(//;
398 $param_count -= $string =~ tr/)//;
400 if ( $param_count == 0 ) {
410 # Martin A. Hansen, June 2006.
412 # given a string checks if the number of double quotes
413 # balances. returns 1 if balanced, else 0.
415 my ( $string, # string to check
422 $quote_count = $string =~ tr/"//;
424 if ( $quote_count % 2 == 0 ) {
432 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<