From 2361b25cd0deb0071782aad0cbe2a7c77a8fb621 Mon Sep 17 00:00:00 2001 From: martinahansen Date: Sun, 2 Aug 2009 08:32:24 +0000 Subject: [PATCH] added get_embl_entry git-svn-id: http://biopieces.googlecode.com/svn/trunk@615 74ccb610-7750-0410-82ae-013aeee3265d --- bp_bin/get_embl_entry | 129 +++++++++++++++++++++ bp_bin/read_embl | 48 ++------ code_perl/Maasha/Biopieces.pm | 14 ++- code_perl/Maasha/EMBL.pm | 207 ++++++++++++++++++++++++++++++---- 4 files changed, 330 insertions(+), 68 deletions(-) create mode 100755 bp_bin/get_embl_entry diff --git a/bp_bin/get_embl_entry b/bp_bin/get_embl_entry new file mode 100755 index 0000000..d21fbdd --- /dev/null +++ b/bp_bin/get_embl_entry @@ -0,0 +1,129 @@ +#!/usr/bin/env perl + +# Copyright (C) 2007-2009 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# Get an EMBL entry from the MySQL database and flatfiles. + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +use warnings; +use strict; +use Data::Dumper; +use Maasha::SQL; +use Maasha::Biopieces; +use Maasha::Filesys; +use Maasha::EMBL; + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +my ( $options, $user, $password, $database, $table, $dbh, $in, $out, $record, $entry, @ids, $id, $file, $offset, $len, $results, $result, $fh, %fh_hash ); + +$user = Maasha::Biopieces::biopiecesrc( "MYSQL_USER" ); +$password = Maasha::Biopieces::biopiecesrc( "MYSQL_PASSWORD" ); +$database = Maasha::Biopieces::biopiecesrc( "EMBL_DATABASE" ); +$table = Maasha::Biopieces::biopiecesrc( "EMBL_TABLE" ); + +$options = Maasha::Biopieces::parse_options( + [ + { long => 'user', short => 'u', type => 'string', mandatory => 'no', default => $user, allowed => undef, disallowed => undef }, + { long => 'password', short => 'p', type => 'string', mandatory => 'no', default => $password, allowed => undef, disallowed => undef }, + { long => 'database', short => 'd', type => 'string', mandatory => 'no', default => $database, allowed => undef, disallowed => undef }, + { long => 'table', short => 't', type => 'string', mandatory => 'no', default => $table, allowed => undef, disallowed => undef }, + { long => 'ids', short => 'i', type => 'list', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'keys', short => 'k', type => 'list', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'features', short => 'f', type => 'list', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'qualifiers', short => 'q', type => 'list', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + ] +); + +$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } ); +$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } ); + +if ( not Maasha::SQL::database_exists( $options->{ 'database' }, $options->{ 'user' }, $options->{ 'password' } ) ) { + Maasha::Common::error( qq(Database "$options->{ 'database' }" don't exists) ); +} + +$dbh = Maasha::SQL::connect( $options->{ 'database' }, $options->{ 'user' }, $options->{ 'password' } ); + +if ( not Maasha::SQL::table_exists( $dbh, $options->{ 'table' } ) ) { + Maasha::Common::error( qq(Table "$options->{ 'table' }" don't exists) ); +} + +@ids = @{ $options->{ 'ids' } } if defined $options->{ 'ids' }; + +while ( $record = Maasha::Biopieces::get_record( $in ) ) +{ + push @ids, $record->{ 'ID' } if exists $record->{ 'ID' }; + + Maasha::Biopieces::put_record( $record, $out ); +} + +foreach $id ( @ids ) +{ + $results = Maasha::SQL::query_array( $dbh, qq(SELECT FILE,OFFSET,LEN from $options->{ 'table' } WHERE ID="$id") ); + + foreach $result ( @{ $results } ) + { + ( $file, $offset, $len ) = @{ $result }; + + if ( not exists $fh_hash{ $file } ) + { + $fh = Maasha::Filesys::file_read_open( $file ); + + $fh_hash{ $file } = $fh; + } + + $entry = Maasha::Filesys::file_read( $fh_hash{ $file }, $offset, $len ); + + map { Maasha::Biopieces::put_record( $_, $out ) } Maasha::EMBL::embl2biopieces( $entry, $options ); + } +} + +map { close $fh_hash{ $_ } } keys %fh_hash; + +Maasha::Biopieces::close_stream( $in ); +Maasha::Biopieces::close_stream( $out ); + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +BEGIN +{ + Maasha::Biopieces::status_set(); +} + + +END +{ + Maasha::SQL::disconnect( $dbh ) if $dbh; + Maasha::Biopieces::status_log(); +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +__END__ diff --git a/bp_bin/read_embl b/bp_bin/read_embl index 780d90a..15ebea4 100755 --- a/bp_bin/read_embl +++ b/bp_bin/read_embl @@ -37,25 +37,21 @@ use Maasha::EMBL; # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -my ( $options, $in, $out, %options2, $file, $data_in, $num, $entry, $record ); +my ( $options, $in, $out, $data_in, $num, $entry, $record ); $options = Maasha::Biopieces::parse_options( [ - { long => 'data_in', short => 'i', type => 'files!', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, - { long => 'num', short => 'n', type => 'uint', mandatory => 'no', default => undef, allowed => undef, disallowed => '0' }, - { long => 'keys', short => 'k', type => 'list', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, - { long => 'feats', short => 'f', type => 'list', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, - { long => 'quals', short => 'q', type => 'list', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'data_in', short => 'i', type => 'files!', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'num', short => 'n', type => 'uint', mandatory => 'no', default => undef, allowed => undef, disallowed => '0' }, + { long => 'keys', short => 'k', type => 'list', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'features', short => 'f', type => 'list', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'qualifiers', short => 'q', type => 'list', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, ] ); $in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } ); $out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } ); -map { $options2{ "keys" }{ $_ } = 1 } @{ $options->{ "keys" } }; -map { $options2{ "feats" }{ $_ } = 1 } @{ $options->{ "feats" } }; -map { $options2{ "quals" }{ $_ } = 1 } @{ $options->{ "quals" } }; - while ( $record = Maasha::Biopieces::get_record( $in ) ) { Maasha::Biopieces::put_record( $record, $out ); } @@ -68,37 +64,7 @@ if ( $options->{ 'data_in' } ) while ( $entry = Maasha::EMBL::get_embl_entry( $data_in ) ) { - $record = Maasha::EMBL::parse_embl_entry( $entry, \%options2 ); - - my ( $feat, $feat2, $qual, $qual_val, $record_copy ); - - $record_copy = dclone $record; - - delete $record_copy->{ "FT" }; - - Maasha::Biopieces::put_record( $record_copy, $out ); - - delete $record_copy->{ "SEQ" }; - - foreach $feat ( keys %{ $record->{ "FT" } } ) - { - $record_copy->{ "FEAT_TYPE" } = $feat; - - foreach $feat2 ( @{ $record->{ "FT" }->{ $feat } } ) - { - foreach $qual ( keys %{ $feat2 } ) - { - $qual_val = join "; ", @{ $feat2->{ $qual } }; - - $qual =~ s/^_//; - $qual = uc $qual; - - $record_copy->{ $qual } = $qual_val; - } - - Maasha::Biopieces::put_record( $record_copy, $out ); - } - } + map { Maasha::Biopieces::put_record( $_, $out ) } Maasha::EMBL::embl2biopieces( $entry, $options ); last if $options->{ "num" } and $num == $options->{ "num" }; diff --git a/code_perl/Maasha/Biopieces.pm b/code_perl/Maasha/Biopieces.pm index 566664c..1865cca 100644 --- a/code_perl/Maasha/Biopieces.pm +++ b/code_perl/Maasha/Biopieces.pm @@ -397,21 +397,25 @@ sub check_print_usage # Returns nothing. - my ( $script, $wiki ); + my ( %options, $help, $script, $wiki ); + + %options = %{ $options }; + $help = $options{ 'help' }; + delete $options{ 'help' }; $script = Maasha::Common::get_scriptname(); if ( $script ne 'print_wiki' ) { - if ( exists $options->{ 'help' } or -t STDIN ) + if ( $help or -t STDIN ) { - if ( not ( exists $options->{ 'stream_in' } or $options->{ 'data_in' } ) ) + if ( not ( exists $options{ 'stream_in' } or $options{ 'data_in' } ) ) { - if ( scalar keys %{ $options } <= 1 ) + if ( scalar keys %options == 0 ) { $wiki = $ENV{ 'BP_DIR' } . "/bp_usage/$script.wiki"; - if ( exists $options->{ 'help' } ) { + if ( $help ) { `print_wiki --data_in=$wiki --help`; } elsif ( $script =~ /^(list_biopieces|list_genomes|list_mysql_databases|biostat)$/ ) { return; diff --git a/code_perl/Maasha/EMBL.pm b/code_perl/Maasha/EMBL.pm index acd7070..730bda0 100644 --- a/code_perl/Maasha/EMBL.pm +++ b/code_perl/Maasha/EMBL.pm @@ -83,17 +83,21 @@ sub parse_embl_entry # returns data structure - my ( @lines, $line, %hash, $ft, $seq, $key ); + my ( @lines, $i, %hash, $ft, $seq ); @lines = split "\n", $entry; - foreach $line ( @lines ) + $i = 0; + + while ( $i < @lines ) { if ( exists $args->{ "keys" } ) { - if ( $line =~ /^(\w{2})\s+(.*)/ and exists $args->{ "keys" }->{ $1 } ) + if ( $lines[ $i ] =~ /^(\w{2})\s+(.*)/ and exists $args->{ "keys" }->{ $1 } ) { - if ( exists $hash{ $1 } and $1 eq "FT" ) { + if ( $1 =~ /RN|RP|RG|RA|RT|RL/ ) { + add_ref( \%hash, \@lines, $i, $args->{ "keys" } ); + } elsif ( exists $hash{ $1 } and $1 eq "FT" ) { $hash{ $1 } .= "\n" . $2; } elsif ( exists $hash{ $1 } ) { $hash{ $1 } .= " " . $2; @@ -101,16 +105,18 @@ sub parse_embl_entry $hash{ $1 } = $2; } } - elsif ( $line =~ /^\s+(.*)\s+\d+$/ and exists $args->{ "keys" }->{ "SEQ" } ) + elsif ( $lines[ $i ] =~ /^\s+(.*)\s+\d+$/ and exists $args->{ "keys" }->{ "SEQ" } ) { $seq .= $1; } } else { - if ( $line =~ /^(\w{2})\s+(.*)/ ) + if ( $lines[ $i ] =~ /^(\w{2})\s+(.*)/ ) { - if ( exists $hash{ $1 } and $1 eq "FT" ) { + if ( $1 eq "RN" ) { + add_ref( \%hash, \@lines, $i ); + } elsif ( exists $hash{ $1 } and $1 eq "FT" ) { $hash{ $1 } .= "\n" . $2; } elsif ( exists $hash{ $1 } ) { $hash{ $1 } .= " " . $2; @@ -118,11 +124,13 @@ sub parse_embl_entry $hash{ $1 } = $2; } } - elsif ( $line =~ /^\s+(.*)\s+\d+$/ ) + elsif ( $lines[ $i ] =~ /^\s+(.*)\s+\d+$/ ) { $seq .= $1; } } + + $i++; } if ( $seq ) @@ -131,18 +139,8 @@ sub parse_embl_entry $hash{ "SEQ" } = $seq; } -# foreach $key ( keys %hash ) -# { -# next if $key =~ /^(SEQ|SEQ_FT|FT)/; -# -# if ( not $hash{ $key } =~ /$args->{ $key }/i ) { -# return wantarray ? () : {} ; -# } -# } - if ( exists $hash{ "FT" } ) { - $seq =~ tr/ //d; $ft = parse_feature_table( $hash{ "FT" }, $seq, $args ); $hash{ "FT" } = $ft; } @@ -151,6 +149,53 @@ sub parse_embl_entry } +sub add_ref +{ + # Martin A. Hansen, August 2009. + + # Add a EMBL reference. + + my ( $hash, # Parsed EMBL data + $lines, # EMBL entry lines + $i, # line index + $args, # hashref with keys to save - OPTIONAL + ) = @_; + + # Returns nothing. + + my ( %ref ); + + if ( $args ) + { + while ( $lines->[ $i ] =~ /^(\w{2})\s+(.*)/ and $1 ne 'XX' and exists $args->{ $1 } ) + { + if ( exists $ref{ $1 } ) { + $ref{ $1 } .= " " . $2; + } else { + $ref{ $1 } = $2; + } + + $i++; + } + } + else + { + while ( $lines->[ $i ] =~ /^(\w{2})\s+(.*)/ and $1 ne 'XX' ) + { + if ( exists $ref{ $1 } ) { + $ref{ $1 } .= " " . $2; + } else { + $ref{ $1 } = $2; + } + + $i++; + } + } + + push @{ $hash->{ 'REF' } }, \%ref; +} + + sub parse_feature_table { # Martin A. Hansen, June 2006. @@ -168,7 +213,7 @@ sub parse_feature_table # argument hash are returned. my ( $ft, # feature table - $seq, # entry sequnce + $seq, # entry sequence $args, # argument hash ) = @_; @@ -206,11 +251,14 @@ sub parse_feature_table push @{ $qual_hash{ "_locator" } }, $locator; - # ---- getting subsequence + if ( $seq ) + { + # ---- getting subsequence - $subseq = parse_locator( $locator, $seq ); + $subseq = parse_locator( $locator, $seq ); - push @{ $qual_hash{ "_seq" } }, $subseq; + push @{ $qual_hash{ "_seq" } }, $subseq; + } # ----- getting qualifiers @@ -430,4 +478,119 @@ sub balance_quotes } +sub embl2biopieces +{ + # Martin A. Hansen, July 2009. + + # Given a complete EMBL entry and an option hash, + # configure the arguments for the EMBL parser so + # only wanted information is retrieved and returned + # as a Biopiece record. + + my ( $entry, # EMBL entry to parse + $options, # Biopiece options + ) = @_; + + # Returns a hashref. + + my ( %args, $data, $record, $key_record, $key, $feat, $feat2, $qual, $qual_val, @records, $no_seq ); + + map { $args{ 'keys' }{ $_ } = 1 } @{ $options->{ 'keys' } }; + map { $args{ 'feats' }{ $_ } = 1 } @{ $options->{ 'features' } }; + map { $args{ 'quals' }{ $_ } = 1 } @{ $options->{ 'qualifiers' } }; + + if ( @{ $options->{ 'features' } } > 0 or @{ $options->{ 'qualifiers' } } > 0 ) { + $args{ 'keys' }{ 'FT' } = 1; + } + + if ( ( $args{ 'feats' } and $args{ 'feats' }{ 'SEQ' } ) or ( $args{ 'quals' } and $args{ 'quals' }{ 'SEQ' } ) ) + { + $no_seq = 1 if not $args{ 'keys' }{ 'SEQ' }; + + $args{ 'keys' }{ 'SEQ' } = 1; + delete $args{ 'feats' }{ 'SEQ' } if $args{ 'feats' }; + delete $args{ 'quals' }{ 'SEQ' } if $args{ 'quals' }; + } + + $data = parse_embl_entry( $entry, \%args ); + + foreach $key ( keys %{ $data } ) + { + if ( $key eq 'SEQ' and $no_seq ) { + next; + } + + if ( $key eq 'REF' ) { + $key_record->{ $key } = inline_ref( $data->{ $key } ); + } + + if ( $key ne 'FT' and $key ne 'REF' ) { + $key_record->{ $key } = $data->{ $key }; + } + } + + if ( exists $data->{ 'FT' } ) + { + foreach $feat ( keys %{ $data->{ 'FT' } } ) + { + $record = {}; + $record->{ "FEATURE" } = $feat; + + foreach $feat2 ( @{ $data->{ 'FT' }->{ $feat } } ) + { + foreach $qual ( keys %{ $feat2 } ) + { + $qual_val = join "; ", @{ $feat2->{ $qual } }; + + $qual = "SEQ" if $qual eq "_seq"; + $qual = "LOCATOR" if $qual eq "_locator"; + + $record->{ $qual } = $qual_val; + } + } + + map { $record->{ $_ } = $key_record->{ $_ } } keys %{ $key_record }; + + push @records, $record; + } + } + else + { + push @records, $key_record; + } + + return wantarray ? @records : \@records; +} + + +sub inline_ref +{ + # Martin A. Hansen, August 2009. + + # Flattens a data structure with EMBL reference info to a string. + + my ( $data, # list of hashrefs + ) = @_; + + # Returns a string. + + my ( $ref, $key, $str, @keys, @refs ); + + foreach $ref ( @{ $data } ) + { + undef @keys; + + foreach $key ( qw( RN RP RG RA RT RL ) ) { + push @keys, "$key => '$ref->{ $key }'" if exists $ref->{ $key }; + } + + push @refs, "{ " . join( ", ", @keys ) . " }"; + } + + $str = '[ ' . join( ', ', @refs ) . ' ]'; + + return $str; +} + + # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -- 2.39.5