From 24d3b192da2b4949e44971f01abcb0831d49b6e2 Mon Sep 17 00:00:00 2001 From: martinahansen Date: Sun, 18 Oct 2009 18:40:59 +0000 Subject: [PATCH] added write_gff git-svn-id: http://biopieces.googlecode.com/svn/trunk@696 74ccb610-7750-0410-82ae-013aeee3265d --- bp_bin/format_genome | 10 ++- bp_bin/read_gff | 6 +- bp_bin/write_gff | 93 +++++++++++++++++++++ code_perl/Maasha/GFF.pm | 179 ++++++++++++++++++++++++++++------------ 4 files changed, 227 insertions(+), 61 deletions(-) create mode 100755 bp_bin/write_gff diff --git a/bp_bin/format_genome b/bp_bin/format_genome index 2907aa4..36cc541 100755 --- a/bp_bin/format_genome +++ b/bp_bin/format_genome @@ -35,6 +35,7 @@ use Maasha::BWA; use Maasha::NCBI; use Maasha::Match; use Maasha::UCSC; +use Maasha::Jbrowse; # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< @@ -45,7 +46,7 @@ my ( $default, $formats, $options, $in, $out, $record, $data_out, $entry, $tmp_dir = Maasha::Biopieces::get_tmpdir(); $default = $ENV{ 'BP_DATA' }; -$formats = 'fasta,blast,vmatch,bowtie,bwa,phastcons'; +$formats = 'fasta,blast,vmatch,bowtie,bwa,phastcons,jbrowse'; $options = Maasha::Biopieces::parse_options( [ @@ -65,7 +66,7 @@ $genome = $options->{ 'genome' }; Maasha::Filesys::dir_create_if_not_exists( "$dir/genomes" ); Maasha::Filesys::dir_create_if_not_exists( "$dir/genomes/$genome" ); -if ( grep { $_ =~ /fasta|blast|vmatch|bowtie|bwa/i } @{ $options->{ "formats" } } ) +if ( grep { $_ =~ /fasta|blast|vmatch|bowtie|bwa|jbrowse/i } @{ $options->{ "formats" } } ) { if ( -f "$dir/genomes/$genome/fasta/$genome.fna" ) { @@ -95,7 +96,7 @@ while ( $record = Maasha::Biopieces::get_record( $in ) ) { Maasha::Fasta::put_entry( $entry, $fh_out ); } - elsif ( $fh_out and $record->{ "CHR" } and $record->{ "CHR_BEG" } and $record->{ "STEP" } and $record->{ "VALS" } ) + elsif ( $fh_out and $record->{ "CHR" } and $record->{ "CHR_BEG" } and $record->{ "STEP" } and $record->{ "VALS" } ) # TODO: clean this! { print $fh_out "fixedStep chrom=$record->{ 'CHR' } start=$record->{ 'CHR_BEG' } step=$record->{ 'STEP' }\n"; @@ -117,11 +118,12 @@ foreach $format ( @{ $options->{ 'formats' } } ) if ( $format =~ /^fasta$/i ) { Maasha::Fasta::fasta_index( "$fasta_dir/$genome.fna", $fasta_dir, "$genome.index" ) } elsif ( $format =~ /^blast$/i ) { Maasha::NCBI::blast_index( "$genome.fna", $fasta_dir, "$dir/genomes/$genome/blast", "dna", $genome ) } - elsif ( $format =~ /^blat$/i ) { warn "BLAT FORMAT NOT IMPLEMENTED" } + elsif ( $format =~ /^blat$/i ) { warn "BLAT FORMAT NOT IMPLEMENTED\n" } elsif ( $format =~ /^vmatch$/i ) { Maasha::Match::vmatch_index( "$genome.fna", $fasta_dir, "$dir/genomes/$genome/vmatch", $tmp_dir ) } elsif ( $format =~ /^bowtie$/i ) { Maasha::Bowtie::bowtie_index( "$fasta_dir/$genome.fna", "$dir/genomes/$genome/bowtie", $genome, $options->{ 'verbose' } ) } elsif ( $format =~ /^bwa$/i ) { Maasha::BWA::bwa_index( "$fasta_dir/$genome.fna", "$dir/genomes/$genome/bwa", $genome, $options->{ 'verbose' } ) } elsif ( $format =~ /^phastcons$/i ) { Maasha::UCSC::phastcons_index( "$genome.pp", $phastcons_dir ) } + elsif ( $format =~ /^jbrowse$/i ) { Maasha::Jbrowse::install_seq( "$fasta_dir/$genome.fna", "$dir/genomes/$genome/jbrowse", $genome ) } print STDERR qq(done.\n) if $options->{ 'verbose' }; } diff --git a/bp_bin/read_gff b/bp_bin/read_gff index b3d36db..1403219 100755 --- a/bp_bin/read_gff +++ b/bp_bin/read_gff @@ -58,9 +58,11 @@ if ( $options->{ 'data_in' } ) $num = 1; - while ( $entry = Maasha::GFF::get_entry( $data_in ) ) + while ( $entry = Maasha::GFF::gff_entry_get( $data_in ) ) { - Maasha::Biopieces::put_record( $entry, $out ); + if ( $record = Maasha::GFF::gff2biopiece( $entry ) ) { + Maasha::Biopieces::put_record( $record, $out ); + } last if $options->{ "num" } and $num == $options->{ "num" }; diff --git a/bp_bin/write_gff b/bp_bin/write_gff new file mode 100755 index 0000000..4f438f8 --- /dev/null +++ b/bp_bin/write_gff @@ -0,0 +1,93 @@ +#!/usr/bin/env perl + +# Copyright (C) 2007-2009 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# Write features from the stream in GFF format. + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +use warnings; +use strict; +use Maasha::GFF; +use Maasha::Biopieces; + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +my ( $options, $in, $out, $record, $data_out, $entry, $pragmas ); + +$options = Maasha::Biopieces::parse_options( + [ + { long => 'no_stream', short => 'x', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'data_out', short => 'o', type => 'file', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'compress', short => 'Z', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + ] +); + +$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } ); +$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } ); + +$data_out = Maasha::Biopieces::write_stream( $options->{ "data_out" }, $options->{ "compress" } ); + +$pragmas = [ '##gff-version 3' ]; + +while ( $record = Maasha::Biopieces::get_record( $in ) ) +{ + if ( $entry = Maasha::GFF::biopiece2gff( $record ) ) + { + Maasha::GFF::gff_pragma_put( $pragmas, $data_out ) if $pragmas; + + undef $pragmas; + + Maasha::GFF::gff_entry_put( $entry, $data_out ); + } + + Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" }; +} + +close $data_out; + +Maasha::Biopieces::close_stream( $in ); +Maasha::Biopieces::close_stream( $out ); + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +BEGIN +{ + Maasha::Biopieces::status_set(); +} + + +END +{ + Maasha::Biopieces::status_log(); +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +__END__ diff --git a/code_perl/Maasha/GFF.pm b/code_perl/Maasha/GFF.pm index dec68bd..67b0623 100644 --- a/code_perl/Maasha/GFF.pm +++ b/code_perl/Maasha/GFF.pm @@ -23,7 +23,9 @@ package Maasha::GFF; # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -# Routines for manipulation 'Generic Feature Format' - GFF. +# Routines for manipulation 'Generic Feature Format' - GFF version 3. +# Read more here: +# http://www.sequenceontology.org/resources/gff3.html # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< @@ -40,99 +42,166 @@ require Exporter; @ISA = qw( Exporter ); +use constant { + seqid => 0, + source => 1, + type => 2, + start => 3, + end => 4, + score => 5, + strand => 6, + phase => 7, + attributes => 8, +}; + # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -sub get_entry -{ - # Martin A. Hansen, February 2008. +my %ATTRIBUTES = ( + id => 'ID', + name => 'Name', + alias => 'Alias', + parent => 'Parent', + target => 'Target', + gap => 'Gap', + derives_from => 'Derives_from', + note => 'Note', + dbxref => 'Dbxref', + ontology_term => 'Ontology_term', +); - # Reads a single entry from a filehandle to a GFF file. - my ( $fh, # file handle +sub gff_entry_get +{ + # Martin A. Hansen, October 2009 + + my ( $fh, # file handle ) = @_; - # Returns hashref. + # Returns a - my ( $line, @fields, %entry, $q_beg, $q_end, @atts, $att, $key, $val ); + my ( $line, @fields ); - while ( $line = <$fh> ) + while ( $line = <$fh>) { chomp $line; - @fields = split "\t", $line; - - if ( @fields == 9 ) - { - $q_beg = $fields[ 3 ] - 1; - $q_end = $fields[ 4 ] - 1; - - ( $q_beg, $q_end ) = ( $q_end, $q_beg ) if $q_beg > $q_end; - - %entry = ( - Q_ID => $fields[ 0 ], - SOURCE => $fields[ 1 ], - TYPE => $fields[ 2 ], - Q_BEG => $q_beg, - Q_END => $q_end, - SCORE => $fields[ 5 ], - STRAND => $fields[ 6 ], - PHASE => $fields[ 7 ], - ATT => $fields[ 8 ], - ); - - @atts = split ";", $fields[ 8 ]; - - foreach $att ( @atts ) - { - ( $key, $val ) = split "=", $att; - - $entry{ "ATT_" . uc $key } = $val; - } + next if $line =~ /^$|^#/; # skip empty lines and lines starting with # - return wantarray ? %entry : \%entry; - } + @fields = split /\t/, $line; + + return wantarray ? @fields : \@fields; } } -sub get_entries +sub gff_entry_put { - # Martin A. Hansen, February 2008. - - # Reads GFF file and returns a list of entries. + # Martin A. Hansen, October 2009 - my ( $path, # full path to GFF file. + my ( $entry, # GFF entry + $fh, # file handle ) = @_; - # Returns a list. + $fh ||= \*STDOUT; - my ( $fh, $entry, @entries ); + print $fh join( "\t", @{ $entry } ), "\n"; +} - $fh = Maasha::Common::read_open( $path ); - while ( $entry = get_entry( $fh ) ) { - push @entries, $entry; - } +sub gff_pragma_put +{ + # Martin A. Hansen, October 2009 + + my ( $pragmas, # list of GFF pragma lines + $fh, # file handle + ) = @_; + + my ( $pragma ); - close $fh; + $fh ||= \*STDOUT; - return wantarray ? @entries : \@entries; + foreach $pragma ( @{ $pragmas } ) { + print $fh "$pragma\n"; + } } -sub put_entry +sub gff2biopiece { + # Martin A. Hansen, October 2009 + my ( $entry, # GFF entry + ) = @_; + # Returns a hashref. + + my ( %record, @atts, $att, $key, $val ); + + %record = ( + 'Q_ID' => $entry->[ seqid ], + 'SOURCE' => $entry->[ source ], + 'TYPE' => $entry->[ type ], + 'Q_BEG' => $entry->[ start ], + 'Q_END' => $entry->[ end ], + 'SCORE' => $entry->[ score ], + 'STRAND' => $entry->[ strand ], + 'PHASE' => $entry->[ phase ], + ); + + @atts = split /;/, $entry->[ attributes ]; + + foreach $att ( @atts ) + { + ( $key, $val ) = split /=/, $att; + + $record{ 'ATT_' . uc $key } = $val; + } + + return wantarray ? %record : \%record; } -sub put_entries +sub biopiece2gff { + # Martin A. Hansen, October 2009. + + # Converts a Biopiece record to a GFF entry (a list). + + my ( $record, # Biopiece record + ) = @_; + + # Returns a list. + + my ( @entry, $key, $tag, @atts ); + + $entry[ seqid ] = $record->{ 'Q_ID' }; + $entry[ source ] = $record->{ 'SOURCE' }; + $entry[ type ] = $record->{ 'TYPE' }; + $entry[ start ] = $record->{ 'Q_BEG' }; + $entry[ end ] = $record->{ 'Q_END' }; + $entry[ score ] = $record->{ 'SCORE' }; + $entry[ strand ] = $record->{ 'STRAND' }; + $entry[ phase ] = $record->{ 'PHASE' }; + + foreach $key ( %{ $record } ) + { + if ( $key =~ /ATT_(.+)/ ) + { + $tag = lc $1; + + if ( exists $ATTRIBUTES{ $tag } ) { + $tag = $ATTRIBUTES{ $tag }; + } + + push @atts, "$tag=" . $record->{ $key }; + } + } + $entry[ attributes ] = join ";", @atts; + return wantarray ? @entry : \@entry; } -- 2.39.5