From 112de27df38163c34f7463c8797dae5eb7dc358b Mon Sep 17 00:00:00 2001 From: martinahansen Date: Wed, 9 Sep 2009 06:48:29 +0000 Subject: [PATCH] added BED entry output to check_bed git-svn-id: http://biopieces.googlecode.com/svn/trunk@664 74ccb610-7750-0410-82ae-013aeee3265d --- bp_bin/create_seq_index | 95 +++++++++++ bp_bin/get_genome_seq | 5 +- code_perl/Maasha/Fasta.pm | 305 ++++++----------------------------- code_perl/Maasha/UCSC/BED.pm | 53 +++--- 4 files changed, 171 insertions(+), 287 deletions(-) create mode 100755 bp_bin/create_seq_index diff --git a/bp_bin/create_seq_index b/bp_bin/create_seq_index new file mode 100755 index 0000000..6a8a2d3 --- /dev/null +++ b/bp_bin/create_seq_index @@ -0,0 +1,95 @@ +#!/usr/bin/env perl + +# Copyright (C) 2007-2009 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# Create a FASTA sequence index from sequences in stream for use with [get_seq]. + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +use warnings; +use strict; +use Maasha::Common; +use Maasha::Biopieces; +use Maasha::Filesys; +use Maasha::Fasta; + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +my ( $default, $formats, $options, $in, $out, $record, $tmp_dir, $file_out, $fh_out, $entry ); + +$options = Maasha::Biopieces::parse_options( + [ + { long => 'no_stream', short => 'x', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'directory', short => 'd', type => 'dir', mandatory => 'yes', default => undef, allowed => undef, disallowed => undef }, + { long => 'index_name', short => 'i', type => 'string', mandatory => 'yes', default => undef, allowed => undef, disallowed => undef }, + ] +); + +Maasha::Common::error( qq(Directory already exists: "$options->{ 'directory' }") ) if -d $options->{ 'directory' }; + +Maasha::Filesys::dir_create_if_not_exists( $options->{ 'directory' } ); + +$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } ); +$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } ); + +$file_out = "$options->{ 'directory' }/$options->{ 'index_name' }.seq"; +$fh_out = Maasha::Filesys::file_write_open( $file_out ); + +while ( $record = Maasha::Biopieces::get_record( $in ) ) +{ + if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) { + Maasha::Fasta::put_entry( $entry, $fh_out ); + } + + Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" }; +} + +close $fh_out; + +Maasha::Fasta::fasta_index( $file_out, $options->{ 'directory' }, $options->{ 'index_name' } . ".index" ); + +Maasha::Biopieces::close_stream( $in ); +Maasha::Biopieces::close_stream( $out ); + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +BEGIN +{ + Maasha::Biopieces::status_set(); +} + + +END +{ + Maasha::Biopieces::status_log(); +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +__END__ diff --git a/bp_bin/get_genome_seq b/bp_bin/get_genome_seq index dbbc9d4..0e8d1f7 100755 --- a/bp_bin/get_genome_seq +++ b/bp_bin/get_genome_seq @@ -30,6 +30,7 @@ use warnings; use strict; use Maasha::Biopieces; use Maasha::Filesys; +use Maasha::Fasta; use Maasha::Seq; @@ -90,7 +91,9 @@ if ( $options->{ "genome" } ) $beg = $index_beg; } - $len = $index_beg + $index_len - $beg if $beg + $len > $index_beg + $index_len; + if ( $beg + $len > $index_beg + $index_len ) { + $len = $index_beg + $index_len - $beg; + } next if $beg > $index_beg + $index_len; diff --git a/code_perl/Maasha/Fasta.pm b/code_perl/Maasha/Fasta.pm index 795efa6..3110e8d 100644 --- a/code_perl/Maasha/Fasta.pm +++ b/code_perl/Maasha/Fasta.pm @@ -47,109 +47,6 @@ use constant { # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -sub fasta_format_ok -{ - # Martin A. Hansen, March 2007. - - # Checks if a given FASTA file is formatted with - # one header per line and one sequence per line. - # returns 1 if so, else 0. - - my ( $path, # full path to FASTA file - ) = @_; - - # Returns boolean - - my ( $fh, $line, $count ); - - $fh = Maasha::Common::read_open( $path ); - - $count = 0; - - while ( $line = <$fh> ) - { - if ( not $count % 2 and substr( $line, 0, 1 ) ne ">" ) { - return 0; - } - - $count++; - } - - close $fh; - - return 1; -} - - -sub get_entries -{ - # Martin A. Hansen, December 2006. - - # Parses a fasta file and returns a list of headers and sequence tuples. - - my ( $path, # full path to FASTA file - $count, # number of sequences to read - OPTIONAL - ) = @_; - - # returns list of tuples - - my ( $fh, $entry, @entries ); - - $fh = Maasha::Common::read_open( $path ); - - while ( $entry = get_entry( $fh ) ) - { - push @entries, $entry; - - if ( $count and $count == @entries ) { - last; - } - } - - close $fh; - - return wantarray ? @entries : \@entries; -} - - -sub put_entries -{ - # Martin A. Hansen, March 2004. - - # writes fasta sequences to STDOUT or file - - my ( $entries, # list of fasta entries - $path, # full path to file - OPTIONAL - $wrap, # line width - OPTIONAL - ) = @_; - - my ( $fh ); - - $fh = Maasha::Common::write_open( $path ) if $path; - - map { put_entry( $_, $fh, $wrap ) } @{ $entries }; - - close $fh if defined; -} - - -sub wrap -{ - # Martin A. Hansen, June 2007 - - # Wraps the sequence of a given FASTA entry - # to a given length. - - my ( $entry, # FASTA entry - $wrap, # wrap length - ) = @_; - - # Returns nothing. - - Maasha::Seq::wrap( \$entry->[ SEQ ], $wrap ); -} - - sub get_entry { # Martin A. Hansen, January 2007. @@ -222,209 +119,99 @@ sub put_entry } -sub find_shortest -{ - # Martin A. Hansen, June 2007. - - # Given a stack of FASTA entries, find and return - # the shortest entry. - - my ( $entries, # list of FASTA entries - ) = @_; - - # returns tuple - - my ( $min, $entry, $min_entry ); - - $min = 99999999999; - - foreach $entry ( @{ $entries } ) - { - if ( length( $entry->[ SEQ ] ) < $min ) - { - $min_entry = $entry; - $min = length $entry->[ SEQ ]; - } - } - - return wantarray ? @{ $min_entry } : $min_entry; -} - - -sub find_longest -{ - # Martin A. Hansen, June 2007. - - # Given a stack of FASTA entries, find and return - # the longest entry. - - my ( $entries, # list of FASTA entries - ) = @_; - - # returns tuple - - my ( $max, $entry, $max_entry ); - - $max = 0; - - foreach $entry ( @{ $entries } ) - { - if ( length( $entry->[ SEQ ] ) > $max ) - { - $max_entry = $entry; - $max = length $entry->[ SEQ ]; - } - } - - return wantarray ? @{ $max_entry } : $max_entry; -} - - -sub fasta_get_headers -{ - # Martin A. Hansen, May 2007. - - # Gets the header names of a FASTA file, - # and returns these in a list. - - my ( $path, # full path to FASTA file - ) = @_; - - # returns list - - my ( $fh, $entry, @list ); - - $fh = Maasha::Common::read_open( $path ); - - while ( $entry = get_entry( $fh ) ) { - push @list, $entry->[ SEQ_NAME ]; - } - - close $fh; - - return wantarray ? @list : \@list; -} - - -sub fasta_reformat +sub fasta_index { - # Martin A. Hansen, December 2004. + # Martin A. Hansen, July 2008. - # Given a file of one or more FASTA entries, reformats these so - # each entry consits of one line with header and one line with sequence. + # Given a path to a FASTA file create a FASTA index + # and save to the specified path. - my ( $path, # full path to file with multiple FASTA entries + my ( $fasta_path, # path to FASTA file + $index_dir, # directory + $index_name, # path to index file ) = @_; - my ( $fh_in, $fh_out, $entry ); + # Returns nothing. - $fh_in = Maasha::Common::read_open( $path ); - $fh_out = Maasha::Common::write_open( "$path.temp" ); + my ( $index ); - while ( $entry = get_entry( $fh_in ) ) { - put_entry( $entry, $fh_out ); - } + $index = index_create( $fasta_path ); - close $fh_in; - close $fh_out; + Maasha::Filesys::dir_create_if_not_exists( $index_dir ); - rename( "$path.temp", $path ); + Maasha::Fasta::index_store( "$index_dir/$index_name", $index ); } -sub fasta_index +sub index_check { - # Martin A. Hansen, July 2008. + # Martin A. Hansen, September 2009. - # Given a path to a FASTA file create a FASTA index - # and save to the specified path. + # Given a path to a FASTA file and a FASTA index + # check if the FILESIZE matches. my ( $fasta_path, # path to FASTA file $index_path, # path to index file ) = @_; - # Returns nothing. + # Returns boolean. my ( $index ); - $index = index_create( $fasta_path ); + $index = Maasha::Fasta::index_retrieve( $index_path ); - Maasha::Fasta::index_store( $index_path, $index ); + if ( Maasha::Filesys::file_size( $fasta_path ) == $index->{ 'FILE_SIZE' } ) { + return 1; + } + + return 0; } + sub index_create { # Matin A. Hansen, December 2004. # Given a FASTA file formatted with one line of header and one line of sequence, - # returns a list of header, seq beg and seq length (first nucleotide is 0). Also, + # returns a list of header, seq offset and seq length (first nucleotide is 0). Also, # the file size of the indexed file is written to the index for checking purposes. my ( $path, # full path to file with multiple FASTA entries ) = @_; - # returns a hashref + # Returns a hashref. - my ( $file_size, $fh, $entry, $beg, $len, %hash, @index ); + my ( $fh, $entry, $offset, $len, $tot, %index ); - $file_size = Maasha::Filesys::file_size( $path ); + $index{ 'FILE_SIZE' } = Maasha::Filesys::file_size( $path ); - push @index, "FILE_SIZE=$file_size"; + $offset = 0; + $len = 0; + $tot = 0; $fh = Maasha::Filesys::file_read_open( $path ); - $beg = 0; - $len = 0; - while ( $entry = get_entry( $fh ) ) { - warn qq(WARNING: header->$entry->[ SEQ_NAME ] alread exists in index) if exists $hash{ $entry->[ SEQ_NAME ] }; + warn qq(WARNING: header->$entry->[ SEQ_NAME ] alread exists in index) if exists $index{ $entry->[ SEQ_NAME ] }; - $beg += $len + 2 + length $entry->[ SEQ_NAME ]; - $len = length $entry->[ SEQ ]; + $offset += $len + 2 + length $entry->[ SEQ_NAME ]; + $len = length $entry->[ SEQ ]; - push @index, [ $entry->[ SEQ_NAME ], $beg, $len ]; + $index{ $entry->[ SEQ_NAME ] } = [ $offset, $len ]; - $hash{ $entry->[ SEQ_NAME ] } = 1; + $offset++; - $beg++; + $tot += length( $entry->[ SEQ_NAME ] ) + length( $entry->[ SEQ ] ) + 3; } close $fh; - return wantarray ? @index : \@index; -} - - -sub index_search -{ - # Martin A. Hansen, December 2004. - - # Searches the index for matching entries. - - my ( $index, # index list - $regex, # regex to match FASTA headers [OPTIONAL] - $invert, # invert matching - ) = @_; - - # returns list - - my ( @results ); - - if ( not $regex ) - { - @results = @{ $index }; - } - else - { - if ( $invert ) { - @results = grep { $_->[ 0 ] !~ /$regex/ } @{ $index }; - } else { - @results = grep { $_->[ 0 ] =~ /$regex/ } @{ $index }; - } + if ( $index{ 'FILE_SIZE' } != $tot ) { + Maasha::Common::error( qq(Filesize "$index{ 'FILE_SIZE' }" does not match content "$tot" -> malformatted FASTA file) ); } - return wantarray ? @results : \@results; + return wantarray ? %index : \%index; } @@ -432,22 +219,20 @@ sub index_lookup { # Martin A. Hansen, July 2007. - # Lookup a list of exact matches in the index and returns these + # Lookup a list of exact matches in the index and returns these. my ( $index, # index list $headers, # headers to lookup ) = @_; - # returns a list - - my ( %hash, $head, @results ); + # Returns a list. - map { $hash{ $_->[ 0 ] } = [ $_->[ 1 ], $_->[ 2 ] ] } @{ $index }; + my ( $head, @results ); foreach $head ( @{ $headers } ) { - if ( exists $hash{ $head } ) { - push @results, [ $head, $hash{ $head }->[ 0 ], $hash{ $head }->[ 1 ] ]; + if ( exists $index->{ $head } ) { + push @results, [ $head, $index->{ $head }->[ 0 ], $index->{ $head }->[ 1 ] ]; } } diff --git a/code_perl/Maasha/UCSC/BED.pm b/code_perl/Maasha/UCSC/BED.pm index 6bfa590..3d873e6 100644 --- a/code_perl/Maasha/UCSC/BED.pm +++ b/code_perl/Maasha/UCSC/BED.pm @@ -188,121 +188,122 @@ sub bed_entry_check # Returns nothing. - my ( $cols, @block_sizes, @block_starts ); + my ( $entry, $cols, @block_sizes, @block_starts ); - $cols = scalar @{ $bed }; + $entry = join "\t", @{ $bed }; + $cols = scalar @{ $bed }; if ( $cols < 3 ) { - Maasha::Common::error( qq(Bad BED entry - must contain at least 3 columns - not $cols) ); + Maasha::Common::error( qq(Bad BED entry "$entry" - must contain at least 3 columns - not $cols) ); } if ( $cols > 12 ) { - Maasha::Common::error( qq(Bad BED entry - must contains no more than 12 columns - not $cols) ); + Maasha::Common::error( qq(Bad BED entry "$entry" - must contains no more than 12 columns - not $cols) ); } if ( $bed->[ chrom ] =~ /\s/ ) { - Maasha::Common::error( qq(Bad BED entry - no white space allowed in chrom field: "$bed->[ chrom ]") ); + Maasha::Common::error( qq(Bad BED entry "$entry" - no white space allowed in chrom field: "$bed->[ chrom ]") ); } if ( $bed->[ chromStart ] =~ /\D/ ) { - Maasha::Common::error( qq(Bad BED entry - chromStart must be a whole number - not "$bed->[ chromStart ]") ); + Maasha::Common::error( qq(Bad BED entry "$entry" - chromStart must be a whole number - not "$bed->[ chromStart ]") ); } if ( $bed->[ chromEnd ] =~ /\D/ ) { - Maasha::Common::error( qq(Bad BED entry - chromEnd must be a whole number - not "$bed->[ chromEnd ]") ); + Maasha::Common::error( qq(Bad BED entry "$entry" - chromEnd must be a whole number - not "$bed->[ chromEnd ]") ); } if ( $bed->[ chromEnd ] < $bed->[ chromStart ] ) { - Maasha::Common::error( qq(Bad BED entry - chromEnd must be greater than chromStart - not "$bed->[ chromStart ] > $bed->[ chromEnd ]") ); + Maasha::Common::error( qq(Bad BED entry "$entry" - chromEnd must be greater than chromStart - not "$bed->[ chromStart ] > $bed->[ chromEnd ]") ); } return if $cols == 3; if ( $bed->[ name ] =~ /\s/ ) { - Maasha::Common::error( qq(Bad BED entry - no white space allowed in name field: "$bed->[ name ]") ); + Maasha::Common::error( qq(Bad BED entry "$entry" - no white space allowed in name field: "$bed->[ name ]") ); } return if $cols == 4; if ( $bed->[ score ] =~ /\D/ ) { - Maasha::Common::error( qq(Bad BED entry - score must be a whole number - not "$bed->[ score ]") ); + Maasha::Common::error( qq(Bad BED entry "$entry" - score must be a whole number - not "$bed->[ score ]") ); } # if ( $bed->[ score ] < 0 or $bed->[ score ] > 1000 ) { # disabled - too restrictive ! if ( $bed->[ score ] < 0 ) { - Maasha::Common::error( qq(Bad BED entry - score must be between 0 and 1000 - not "$bed->[ score ]") ); + Maasha::Common::error( qq(Bad BED entry "$entry" - score must be between 0 and 1000 - not "$bed->[ score ]") ); } return if $cols == 5; if ( $bed->[ strand ] ne '+' and $bed->[ strand ] ne '-' ) { - Maasha::Common::error( qq(Bad BED entry - strand must be + or - not "$bed->[ strand ]") ); + Maasha::Common::error( qq(Bad BED entry "$entry" - strand must be + or - not "$bed->[ strand ]") ); } return if $cols == 6; if ( $bed->[ thickStart ] =~ /\D/ ) { - Maasha::Common::error( qq(Bad BED entry - thickStart must be a whole number - not "$bed->[ thickStart ]") ); + Maasha::Common::error( qq(Bad BED entry "$entry" - thickStart must be a whole number - not "$bed->[ thickStart ]") ); } if ( $bed->[ thickEnd ] =~ /\D/ ) { - Maasha::Common::error( qq(Bad BED entry - thickEnd must be a whole number - not "$bed->[ thickEnd ]") ); + Maasha::Common::error( qq(Bad BED entry "$entry" - thickEnd must be a whole number - not "$bed->[ thickEnd ]") ); } if ( $bed->[ thickEnd ] < $bed->[ thickStart ] ) { - Maasha::Common::error( qq(Bad BED entry - thickEnd must be greater than thickStart - not "$bed->[ thickStart ] > $bed->[ thickEnd ]") ); + Maasha::Common::error( qq(Bad BED entry "$entry" - thickEnd must be greater than thickStart - not "$bed->[ thickStart ] > $bed->[ thickEnd ]") ); } if ( $bed->[ thickStart ] < $bed->[ chromStart ] ) { - Maasha::Common::error( qq(Bad BED entry - thickStart must be greater than chromStart - not "$bed->[ thickStart ] < $bed->[ chromStart ]") ); + Maasha::Common::error( qq(Bad BED entry "$entry" - thickStart must be greater than chromStart - not "$bed->[ thickStart ] < $bed->[ chromStart ]") ); } if ( $bed->[ thickStart ] > $bed->[ chromEnd ] ) { - Maasha::Common::error( qq(Bad BED entry - thickStart must be less than chromEnd - not "$bed->[ thickStart ] > $bed->[ chromEnd ]") ); + Maasha::Common::error( qq(Bad BED entry "$entry" - thickStart must be less than chromEnd - not "$bed->[ thickStart ] > $bed->[ chromEnd ]") ); } if ( $bed->[ thickEnd ] < $bed->[ chromStart ] ) { - Maasha::Common::error( qq(Bad BED entry - thickEnd must be greater than chromStart - not "$bed->[ thickEnd ] < $bed->[ chromStart ]") ); + Maasha::Common::error( qq(Bad BED entry "$entry" - thickEnd must be greater than chromStart - not "$bed->[ thickEnd ] < $bed->[ chromStart ]") ); } if ( $bed->[ thickEnd ] > $bed->[ chromEnd ] ) { - Maasha::Common::error( qq(Bad BED entry - thickEnd must be less than chromEnd - not "$bed->[ thickEnd ] > $bed->[ chromEnd ]") ); + Maasha::Common::error( qq(Bad BED entry "$entry" - thickEnd must be less than chromEnd - not "$bed->[ thickEnd ] > $bed->[ chromEnd ]") ); } return if $cols == 8; if ( $bed->[ itemRgb ] !~ /^(0|\d{1,3},\d{1,3},\d{1,3},?)$/ ) { - Maasha::Common::error( qq(Bad BED entry - itemRgb must be 0 or in the form of 255,0,0 - not "$bed->[ itemRgb ]") ); + Maasha::Common::error( qq(Bad BED entry "$entry" - itemRgb must be 0 or in the form of 255,0,0 - not "$bed->[ itemRgb ]") ); } return if $cols == 9; if ( $bed->[ blockCount ] =~ /\D/ ) { - Maasha::Common::error( qq(Bad BED entry - blockCount must be a whole number - not "$bed->[ blockCount ]") ); + Maasha::Common::error( qq(Bad BED entry "$entry" - blockCount must be a whole number - not "$bed->[ blockCount ]") ); } @block_sizes = split ",", $bed->[ blockSizes ]; if ( grep /\D/, @block_sizes ) { - Maasha::Common::error( qq(Bad BED entry - blockSizes must be whole numbers - not "$bed->[ blockSizes ]") ); + Maasha::Common::error( qq(Bad BED entry "$entry" - blockSizes must be whole numbers - not "$bed->[ blockSizes ]") ); } if ( $bed->[ blockCount ] != scalar @block_sizes ) { - Maasha::Common::error( qq(Bad BED entry - blockSizes "$bed->[ blockSizes ]" must match blockCount "$bed->[ blockCount ]") ); + Maasha::Common::error( qq(Bad BED entry "$entry" - blockSizes "$bed->[ blockSizes ]" must match blockCount "$bed->[ blockCount ]") ); } @block_starts = split ",", $bed->[ blockStarts ]; if ( grep /\D/, @block_starts ) { - Maasha::Common::error( qq(Bad BED entry - blockStarts must be whole numbers - not "$bed->[ blockStarts ]") ); + Maasha::Common::error( qq(Bad BED entry "$entry" - blockStarts must be whole numbers - not "$bed->[ blockStarts ]") ); } if ( $bed->[ blockCount ] != scalar @block_starts ) { - Maasha::Common::error( qq(Bad BED entry - blockStarts "$bed->[ blockStarts ]" must match blockCount "$bed->[ blockCount ]") ); + Maasha::Common::error( qq(Bad BED entry "$entry" - blockStarts "$bed->[ blockStarts ]" must match blockCount "$bed->[ blockCount ]") ); } if ( $bed->[ chromStart ] + $block_starts[ -1 ] + $block_sizes[ -1 ] != $bed->[ chromEnd ] ) { - Maasha::Common::error( qq(Bad BED entry - chromStart + blockStarts[last] + blockSizes[last] must equal chromEnd: ) . + Maasha::Common::error( qq(Bad BED entry "$entry" - chromStart + blockStarts[last] + blockSizes[last] must equal chromEnd: ) . qq($bed->[ chromStart ] + $block_starts[ -1 ] + $block_sizes[ -1 ] != $bed->[ chromEnd ]) ); } } -- 2.39.5