code_perl/Maasha/EMBL.pm

   1 package Maasha::EMBL;
   2
   3 # Copyright (C) 2007 Martin A. Hansen.
   4
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; either version 2
   8 # of the License, or (at your option) any later version.
   9
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18
  19 # http://www.gnu.org/copyleft/gpl.html
  20
  21
  22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  23
  24
  25 # Routines to parse EMBL records.
  26
  27
  28 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  29
  30
  31 use strict;
  32 use Data::Dumper;
  33 use Storable qw( dclone );
  34 use Maasha::Common;
  35 use Maasha::Fasta;
  36 use Maasha::Calc;
  37 use Maasha::Seq;
  38 use vars qw ( @ISA @EXPORT );
  39
  40
  41 @ISA = qw( Exporter );
  42
  43
  44 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  45
  46
  47 sub get_embl_entry
  48 {
  49     # Martin A. Hansen, June 2006.
  50
  51     # Given a filehandle to an embl file,
  52     # fetches the next embl entry, and returns
  53     # this as a string with multiple lines.
  54
  55     my ( $fh,    # filehandle to embl file
  56        ) = @_;
  57
  58     # returns string
  59
  60     my ( $entry );
  61
  62     $/ = "//\n";
  63
  64     $entry = <$fh>;
  65
  66     return $entry;
  67 }
  68
  69
  70 sub parse_embl_entry
  71 {
  72     # Martin A. Hansen, June 2006.
  73
  74     # given an embl entry extracts the keys
  75     # given in an argument hash. Special care
  76     # is taken to parse the feature table if
  77     # requested.
  78
  79     my ( $entry,   # embl entry
  80          $args,    # argument hash
  81        ) = @_;
  82
  83     # returns data structure
  84
  85     my ( @lines, $line, %hash, $ft, $seq, $key );
  86
  87     @lines = split "\n", $entry;
  88
  89     foreach $line ( @lines )
  90     {
  91         if ( exists $args->{ "keys" } )
  92         {
  93             if ( $line =~ /^(\w{2})\s+(.*)/ and exists $args->{ "keys" }->{ $1 } )
  94             {
  95                 if ( exists $hash{ $1 } and $1 eq "FT" ) {
  96                     $hash{ $1 } .= "\n" . $2;
  97                 } elsif ( exists $hash{ $1 } ) {
  98                     $hash{ $1 } .= " " . $2;
  99                 } else {
 100                     $hash{ $1 } = $2;
 101                 }
 102             }
 103             elsif ( $line =~ /^\s+(.*)\s+\d+$/ and exists $args->{ "keys" }->{ "SEQ" } )
 104             {
 105                 $seq .= $1;
 106             }
 107         }
 108         else
 109         {
 110             if ( $line =~ /^(\w{2})\s+(.*)/ )
 111             {
 112                 if ( exists $hash{ $1 } and $1 eq "FT" ) {
 113                     $hash{ $1 } .= "\n" . $2;
 114                 } elsif ( exists $hash{ $1 } ) {
 115                     $hash{ $1 } .= " " . $2;
 116                 } else {
 117                     $hash{ $1 } = $2;
 118                 }
 119             }
 120             elsif ( $line =~ /^\s+(.*)\s+\d+$/ )
 121             {
 122                 $seq .= $1;
 123             }
 124         }
 125     }
 126
 127     if ( $seq )
 128     {
 129         $seq =~ tr/ //d;
 130         $hash{ "SEQ" } = $seq;
 131     }
 132
 133 #    foreach $key ( keys %hash )
 134 #    {
 135 #        next if $key =~ /^(SEQ|SEQ_FT|FT)/;
 136 #
 137 #        if ( not $hash{ $key } =~ /$args->{ $key }/i ) {
 138 #            return wantarray ? () : {} ;
 139 #        }
 140 #    }
 141
 142     if ( exists $hash{ "FT" } )
 143     {
 144         $seq =~ tr/ //d;
 145         $ft = parse_feature_table( $hash{ "FT" }, $seq, $args );
 146         $hash{ "FT" } = $ft;
 147     }
 148
 149     return wantarray ? %hash : \%hash;
 150 }
 151
 152
 153 sub parse_feature_table
 154 {
 155     # Martin A. Hansen, June 2006.
 156
 157     # parses the feature table of a EMBL/GenBank/DDBJ entry.
 158     # parsing takes place in 4 steps. 1) the feature key is
 159     # located. 2) the locator is located taking into # consideration
 160     # that it may be split over multiple lines, which is dealt with
 161     # by counting the params that always are present in multiline
 162     # locators. 3) the locator is used to fetch the corresponding
 163     # sequence. 4) qualifier key/value pars are located again taking
 164     # into consideration multiline values, which are dealt with by
 165     # keeping track of the "-count (value-less qualifers are also
 166     # included). only feature keys and qualifers defined in the
 167     # argument hash are returned.
 168
 169     my ( $ft,     # feature table
 170          $seq,    # entry sequnce
 171          $args,   # argument hash
 172        ) = @_;
 173
 174     # returns data structure
 175
 176     my ( @lines, $key_regex, $i, $p, $q, %key_hash, $key, $locator, %qual_hash, $qual_name, $qual_val, $subseq );
 177
 178     @lines = split "\n", $ft;
 179
 180     $key_regex = "[A-Za-z0-9_']+";   # this regex should match every possible feature key (gene, misc_feature, 5'UTR ...)
 181
 182     $i = 0;
 183
 184     while ( $lines[ $i ] )
 185     {
 186         if ( $lines[ $i ] =~ /^($key_regex)\s+(.+)/ )
 187         {
 188             $key     = $1;
 189             $locator = $2;
 190
 191             undef %qual_hash;
 192
 193             # ---- getting locator
 194
 195             $p = 1;
 196
 197             if ( not balance_params( $locator ) )
 198             {
 199                 while ( not balance_params( $locator ) )
 200                 {
 201                     $locator .= $lines[ $i + $p ];
 202                     $p++;
 203                 }
 204             }
 205
 206             push @{ $qual_hash{ "_locator" } }, $locator;
 207
 208             # ---- getting subsequence
 209
 210             $subseq = parse_locator( $locator, $seq );
 211
 212             push @{ $qual_hash{ "_seq" } }, $subseq;
 213
 214             # ----- getting qualifiers
 215
 216             while ( defined( $lines[ $i + $p ] ) and not $lines[ $i + $p ] =~ /^$key_regex/ )
 217             {
 218                 if ( $lines[ $i + $p ] =~ /^\// )
 219                 {
 220                     if ( $lines[ $i + $p ] =~ /^\/([^=]+)=(.*)$/ )
 221                     {
 222                         $qual_name = $1;
 223                         $qual_val  = $2;
 224                     }
 225                     elsif ( $lines[ $i + $p ] =~ /^\/(.*)$/ )
 226                     {
 227                         $qual_name = $1;
 228                         $qual_val  = "";
 229                     }
 230
 231                     # ----- getting qualifier value
 232
 233                     $q = 1;
 234
 235                     if ( not balance_quotes( $qual_val ) )
 236                     {
 237                         while ( not balance_quotes( $qual_val ) )
 238                         {
 239                             $qual_val .= " " . $lines[ $i + $p + $q ];
 240                             $q++;
 241                         }
 242                     }
 243
 244                     $qual_val =~ s/^"(.*)"$/$1/;
 245                     $qual_val =~ tr/ //d if $qual_name =~ /translation/i;
 246
 247                     if ( exists $args->{ "quals" } ) {
 248                         push @{ $qual_hash{ $qual_name } }, $qual_val if exists $args->{ "quals" }->{ $qual_name };
 249                     } else {
 250                         push @{ $qual_hash{ $qual_name } }, $qual_val;
 251                     }
 252                 }
 253
 254                 $p += $q;
 255             }
 256
 257             if ( scalar keys %qual_hash > 0 )
 258             {
 259                 if ( exists $args->{ "feats" } ) {
 260                     push @{ $key_hash{ $key } }, dclone \%qual_hash if exists $args->{ "feats" }->{ $key };
 261                 } else {
 262                     push @{ $key_hash{ $key } }, dclone \%qual_hash;
 263                 }
 264             }
 265         }
 266
 267         $i += $p;
 268     }
 269
 270     return wantarray ? %key_hash : \%key_hash;
 271 }
 272
 273
 274 sub parse_locator
 275 {
 276     # Martin A. Hansen, June 2006.
 277
 278     # uses recursion to parse a locator string from a feature
 279     # table and fetches the appropriate subsequence. the operators
 280     # join(), complement(), and order() are handled.
 281     # the locator string is broken into a comma separated lists, and
 282     # modified if the params donnot balance. otherwise the comma separated
 283     # list of ranges are stripped from operators, and the subsequence are
 284     # fetched and handled according to the operators.
 285     # SNP locators are also dealt with (single positions).
 286
 287     my ( $locator,   # locator string
 288          $seq,       # nucleotide sequence
 289          $subseq,    # subsequence being joined
 290          $join,      # join sequences
 291          $comp,      # complement sequence or not
 292          $order,     # order sequences
 293        ) = @_;
 294
 295     # returns string
 296
 297     my ( @intervals, $interval, $beg, $end, $newseq );
 298
 299     @intervals = split ",", $locator;
 300
 301     if ( not balance_params( $intervals[ 0 ] ) )                          # locator includes a join/comp/order of several ranges
 302     {
 303         if ( $locator =~ /^join\((.*)\)$/ )
 304         {
 305             $join = 1;
 306             $subseq = parse_locator( $1, $seq, $subseq, $join, $comp, $order );
 307         }
 308         elsif ( $locator =~ /^complement\((.*)\)$/ )
 309         {
 310             $comp = 1;
 311             $subseq = parse_locator( $1, $seq, $subseq, $join, $comp, $order );
 312
 313         }
 314         elsif ( $locator =~ /^order\((.*)\)$/ )
 315         {
 316             $order = 1;
 317             $subseq = parse_locator( $1, $seq, $subseq, $join, $comp, $order );
 318         }
 319     }
 320     else
 321     {
 322         foreach $interval ( @intervals )
 323         {
 324             if ( $interval =~ /^join\((.*)\)$/ )
 325             {
 326                 $join = 1;
 327                 $subseq = parse_locator( $1, $seq, $subseq, $join, $comp, $order );
 328             }
 329             elsif ( $interval =~ /^complement\((.*)\)$/ )
 330             {
 331                 $comp = 1;
 332                 $subseq = parse_locator( $1, $seq, $subseq, $join, $comp, $order );
 333
 334             }
 335             elsif ( $interval =~ /^order\((.*)\)$/ )
 336             {
 337                 $order = 1;
 338                 $subseq = parse_locator( $1, $seq, $subseq, $join, $comp, $order );
 339             }
 340             elsif ( $interval =~ /^[<>]?(\d+)[^\d]+(\d+)$/ )
 341             {
 342                 $beg = $1;
 343                 $end = $2;
 344
 345                 $newseq = substr $seq, $beg - 1, $end - $beg + 1;
 346
 347                 $newseq = Maasha::Seq::dna_revcomp( $newseq ) if $comp;
 348
 349                 if ( $order ) {
 350                     $subseq .= " " . $newseq;
 351                 } else {
 352                     $subseq .= $newseq;
 353                 }
 354             }
 355             elsif ( $interval =~ /^(\d+)$/ )
 356             {
 357                 $beg = $1;
 358
 359                 $newseq = substr $seq, $beg - 1, 1 ;
 360
 361                 $newseq = Maasha::Seq::dna_revcomp( $newseq ) if $comp;
 362
 363                 if ( $order ) {
 364                     $subseq .= " " . $newseq;
 365                 } else {
 366                     $subseq .= $newseq;
 367                 }
 368             }
 369             else
 370             {
 371                 warn qq(WARNING: Could not match locator -> $locator\n);
 372                 # die qq(ERROR: Could not match locator -> $locator\n);
 373                 $subseq .= "";
 374             }
 375         }
 376     }
 377
 378     return $subseq;
 379 }
 380
 381
 382 sub balance_params
 383 {
 384     # Martin A. Hansen, June 2006.
 385
 386     # given a string checks if left and right params
 387     # balances. returns 1 if balanced, else 0.
 388
 389     my ( $string,   # string to check
 390        ) = @_;
 391
 392     # returns boolean
 393
 394     my ( $param_count );
 395
 396     $param_count  = 0;
 397     $param_count += $string =~ tr/(//;
 398     $param_count -= $string =~ tr/)//;
 399
 400     if ( $param_count == 0 ) {
 401         return 1;
 402     } else {
 403         return 0;
 404     }
 405 }
 406
 407
 408 sub balance_quotes
 409 {
 410     # Martin A. Hansen, June 2006.
 411
 412     # given a string checks if the number of double quotes
 413     # balances. returns 1 if balanced, else 0.
 414
 415     my ( $string,   # string to check
 416        ) = @_;
 417
 418     # returns boolean
 419
 420     my ( $quote_count );
 421
 422     $quote_count = $string =~ tr/"//;
 423
 424     if ( $quote_count % 2 == 0 ) {
 425         return 1;
 426     } else {
 427         return 0;
 428     }
 429 }
 430
 431
 432 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<