--- /dev/null
+#!/usr/bin/env perl
+
+# Copyright (C) 2007-2009 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# Create a FASTA sequence index from sequences in stream for use with [get_seq].
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+use warnings;
+use strict;
+use Maasha::Common;
+use Maasha::Biopieces;
+use Maasha::Filesys;
+use Maasha::Fasta;
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+my ( $default, $formats, $options, $in, $out, $record, $tmp_dir, $file_out, $fh_out, $entry );
+
+$options = Maasha::Biopieces::parse_options(
+ [
+ { long => 'no_stream', short => 'x', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+ { long => 'directory', short => 'd', type => 'dir', mandatory => 'yes', default => undef, allowed => undef, disallowed => undef },
+ { long => 'index_name', short => 'i', type => 'string', mandatory => 'yes', default => undef, allowed => undef, disallowed => undef },
+ ]
+);
+
+Maasha::Common::error( qq(Directory already exists: "$options->{ 'directory' }") ) if -d $options->{ 'directory' };
+
+Maasha::Filesys::dir_create_if_not_exists( $options->{ 'directory' } );
+
+$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } );
+$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } );
+
+$file_out = "$options->{ 'directory' }/$options->{ 'index_name' }.seq";
+$fh_out = Maasha::Filesys::file_write_open( $file_out );
+
+while ( $record = Maasha::Biopieces::get_record( $in ) )
+{
+ if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) {
+ Maasha::Fasta::put_entry( $entry, $fh_out );
+ }
+
+ Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
+}
+
+close $fh_out;
+
+Maasha::Fasta::fasta_index( $file_out, $options->{ 'directory' }, $options->{ 'index_name' } . ".index" );
+
+Maasha::Biopieces::close_stream( $in );
+Maasha::Biopieces::close_stream( $out );
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+BEGIN
+{
+ Maasha::Biopieces::status_set();
+}
+
+
+END
+{
+ Maasha::Biopieces::status_log();
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+__END__
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-sub fasta_format_ok
-{
- # Martin A. Hansen, March 2007.
-
- # Checks if a given FASTA file is formatted with
- # one header per line and one sequence per line.
- # returns 1 if so, else 0.
-
- my ( $path, # full path to FASTA file
- ) = @_;
-
- # Returns boolean
-
- my ( $fh, $line, $count );
-
- $fh = Maasha::Common::read_open( $path );
-
- $count = 0;
-
- while ( $line = <$fh> )
- {
- if ( not $count % 2 and substr( $line, 0, 1 ) ne ">" ) {
- return 0;
- }
-
- $count++;
- }
-
- close $fh;
-
- return 1;
-}
-
-
-sub get_entries
-{
- # Martin A. Hansen, December 2006.
-
- # Parses a fasta file and returns a list of headers and sequence tuples.
-
- my ( $path, # full path to FASTA file
- $count, # number of sequences to read - OPTIONAL
- ) = @_;
-
- # returns list of tuples
-
- my ( $fh, $entry, @entries );
-
- $fh = Maasha::Common::read_open( $path );
-
- while ( $entry = get_entry( $fh ) )
- {
- push @entries, $entry;
-
- if ( $count and $count == @entries ) {
- last;
- }
- }
-
- close $fh;
-
- return wantarray ? @entries : \@entries;
-}
-
-
-sub put_entries
-{
- # Martin A. Hansen, March 2004.
-
- # writes fasta sequences to STDOUT or file
-
- my ( $entries, # list of fasta entries
- $path, # full path to file - OPTIONAL
- $wrap, # line width - OPTIONAL
- ) = @_;
-
- my ( $fh );
-
- $fh = Maasha::Common::write_open( $path ) if $path;
-
- map { put_entry( $_, $fh, $wrap ) } @{ $entries };
-
- close $fh if defined;
-}
-
-
-sub wrap
-{
- # Martin A. Hansen, June 2007
-
- # Wraps the sequence of a given FASTA entry
- # to a given length.
-
- my ( $entry, # FASTA entry
- $wrap, # wrap length
- ) = @_;
-
- # Returns nothing.
-
- Maasha::Seq::wrap( \$entry->[ SEQ ], $wrap );
-}
-
-
sub get_entry
{
# Martin A. Hansen, January 2007.
}
-sub find_shortest
-{
- # Martin A. Hansen, June 2007.
-
- # Given a stack of FASTA entries, find and return
- # the shortest entry.
-
- my ( $entries, # list of FASTA entries
- ) = @_;
-
- # returns tuple
-
- my ( $min, $entry, $min_entry );
-
- $min = 99999999999;
-
- foreach $entry ( @{ $entries } )
- {
- if ( length( $entry->[ SEQ ] ) < $min )
- {
- $min_entry = $entry;
- $min = length $entry->[ SEQ ];
- }
- }
-
- return wantarray ? @{ $min_entry } : $min_entry;
-}
-
-
-sub find_longest
-{
- # Martin A. Hansen, June 2007.
-
- # Given a stack of FASTA entries, find and return
- # the longest entry.
-
- my ( $entries, # list of FASTA entries
- ) = @_;
-
- # returns tuple
-
- my ( $max, $entry, $max_entry );
-
- $max = 0;
-
- foreach $entry ( @{ $entries } )
- {
- if ( length( $entry->[ SEQ ] ) > $max )
- {
- $max_entry = $entry;
- $max = length $entry->[ SEQ ];
- }
- }
-
- return wantarray ? @{ $max_entry } : $max_entry;
-}
-
-
-sub fasta_get_headers
-{
- # Martin A. Hansen, May 2007.
-
- # Gets the header names of a FASTA file,
- # and returns these in a list.
-
- my ( $path, # full path to FASTA file
- ) = @_;
-
- # returns list
-
- my ( $fh, $entry, @list );
-
- $fh = Maasha::Common::read_open( $path );
-
- while ( $entry = get_entry( $fh ) ) {
- push @list, $entry->[ SEQ_NAME ];
- }
-
- close $fh;
-
- return wantarray ? @list : \@list;
-}
-
-
-sub fasta_reformat
+sub fasta_index
{
- # Martin A. Hansen, December 2004.
+ # Martin A. Hansen, July 2008.
- # Given a file of one or more FASTA entries, reformats these so
- # each entry consits of one line with header and one line with sequence.
+ # Given a path to a FASTA file create a FASTA index
+ # and save to the specified path.
- my ( $path, # full path to file with multiple FASTA entries
+ my ( $fasta_path, # path to FASTA file
+ $index_dir, # directory
+ $index_name, # path to index file
) = @_;
- my ( $fh_in, $fh_out, $entry );
+ # Returns nothing.
- $fh_in = Maasha::Common::read_open( $path );
- $fh_out = Maasha::Common::write_open( "$path.temp" );
+ my ( $index );
- while ( $entry = get_entry( $fh_in ) ) {
- put_entry( $entry, $fh_out );
- }
+ $index = index_create( $fasta_path );
- close $fh_in;
- close $fh_out;
+ Maasha::Filesys::dir_create_if_not_exists( $index_dir );
- rename( "$path.temp", $path );
+ Maasha::Fasta::index_store( "$index_dir/$index_name", $index );
}
-sub fasta_index
+sub index_check
{
- # Martin A. Hansen, July 2008.
+ # Martin A. Hansen, September 2009.
- # Given a path to a FASTA file create a FASTA index
- # and save to the specified path.
+ # Given a path to a FASTA file and a FASTA index
+ # check if the FILESIZE matches.
my ( $fasta_path, # path to FASTA file
$index_path, # path to index file
) = @_;
- # Returns nothing.
+ # Returns boolean.
my ( $index );
- $index = index_create( $fasta_path );
+ $index = Maasha::Fasta::index_retrieve( $index_path );
- Maasha::Fasta::index_store( $index_path, $index );
+ if ( Maasha::Filesys::file_size( $fasta_path ) == $index->{ 'FILE_SIZE' } ) {
+ return 1;
+ }
+
+ return 0;
}
+
sub index_create
{
# Matin A. Hansen, December 2004.
# Given a FASTA file formatted with one line of header and one line of sequence,
- # returns a list of header, seq beg and seq length (first nucleotide is 0). Also,
+ # returns a list of header, seq offset and seq length (first nucleotide is 0). Also,
# the file size of the indexed file is written to the index for checking purposes.
my ( $path, # full path to file with multiple FASTA entries
) = @_;
- # returns a hashref
+ # Returns a hashref.
- my ( $file_size, $fh, $entry, $beg, $len, %hash, @index );
+ my ( $fh, $entry, $offset, $len, $tot, %index );
- $file_size = Maasha::Filesys::file_size( $path );
+ $index{ 'FILE_SIZE' } = Maasha::Filesys::file_size( $path );
- push @index, "FILE_SIZE=$file_size";
+ $offset = 0;
+ $len = 0;
+ $tot = 0;
$fh = Maasha::Filesys::file_read_open( $path );
- $beg = 0;
- $len = 0;
-
while ( $entry = get_entry( $fh ) )
{
- warn qq(WARNING: header->$entry->[ SEQ_NAME ] alread exists in index) if exists $hash{ $entry->[ SEQ_NAME ] };
+ warn qq(WARNING: header->$entry->[ SEQ_NAME ] alread exists in index) if exists $index{ $entry->[ SEQ_NAME ] };
- $beg += $len + 2 + length $entry->[ SEQ_NAME ];
- $len = length $entry->[ SEQ ];
+ $offset += $len + 2 + length $entry->[ SEQ_NAME ];
+ $len = length $entry->[ SEQ ];
- push @index, [ $entry->[ SEQ_NAME ], $beg, $len ];
+ $index{ $entry->[ SEQ_NAME ] } = [ $offset, $len ];
- $hash{ $entry->[ SEQ_NAME ] } = 1;
+ $offset++;
- $beg++;
+ $tot += length( $entry->[ SEQ_NAME ] ) + length( $entry->[ SEQ ] ) + 3;
}
close $fh;
- return wantarray ? @index : \@index;
-}
-
-
-sub index_search
-{
- # Martin A. Hansen, December 2004.
-
- # Searches the index for matching entries.
-
- my ( $index, # index list
- $regex, # regex to match FASTA headers [OPTIONAL]
- $invert, # invert matching
- ) = @_;
-
- # returns list
-
- my ( @results );
-
- if ( not $regex )
- {
- @results = @{ $index };
- }
- else
- {
- if ( $invert ) {
- @results = grep { $_->[ 0 ] !~ /$regex/ } @{ $index };
- } else {
- @results = grep { $_->[ 0 ] =~ /$regex/ } @{ $index };
- }
+ if ( $index{ 'FILE_SIZE' } != $tot ) {
+ Maasha::Common::error( qq(Filesize "$index{ 'FILE_SIZE' }" does not match content "$tot" -> malformatted FASTA file) );
}
- return wantarray ? @results : \@results;
+ return wantarray ? %index : \%index;
}
{
# Martin A. Hansen, July 2007.
- # Lookup a list of exact matches in the index and returns these
+ # Lookup a list of exact matches in the index and returns these.
my ( $index, # index list
$headers, # headers to lookup
) = @_;
- # returns a list
-
- my ( %hash, $head, @results );
+ # Returns a list.
- map { $hash{ $_->[ 0 ] } = [ $_->[ 1 ], $_->[ 2 ] ] } @{ $index };
+ my ( $head, @results );
foreach $head ( @{ $headers } )
{
- if ( exists $hash{ $head } ) {
- push @results, [ $head, $hash{ $head }->[ 0 ], $hash{ $head }->[ 1 ] ];
+ if ( exists $index->{ $head } ) {
+ push @results, [ $head, $index->{ $head }->[ 0 ], $index->{ $head }->[ 1 ] ];
}
}
# Returns nothing.
- my ( $cols, @block_sizes, @block_starts );
+ my ( $entry, $cols, @block_sizes, @block_starts );
- $cols = scalar @{ $bed };
+ $entry = join "\t", @{ $bed };
+ $cols = scalar @{ $bed };
if ( $cols < 3 ) {
- Maasha::Common::error( qq(Bad BED entry - must contain at least 3 columns - not $cols) );
+ Maasha::Common::error( qq(Bad BED entry "$entry" - must contain at least 3 columns - not $cols) );
}
if ( $cols > 12 ) {
- Maasha::Common::error( qq(Bad BED entry - must contains no more than 12 columns - not $cols) );
+ Maasha::Common::error( qq(Bad BED entry "$entry" - must contains no more than 12 columns - not $cols) );
}
if ( $bed->[ chrom ] =~ /\s/ ) {
- Maasha::Common::error( qq(Bad BED entry - no white space allowed in chrom field: "$bed->[ chrom ]") );
+ Maasha::Common::error( qq(Bad BED entry "$entry" - no white space allowed in chrom field: "$bed->[ chrom ]") );
}
if ( $bed->[ chromStart ] =~ /\D/ ) {
- Maasha::Common::error( qq(Bad BED entry - chromStart must be a whole number - not "$bed->[ chromStart ]") );
+ Maasha::Common::error( qq(Bad BED entry "$entry" - chromStart must be a whole number - not "$bed->[ chromStart ]") );
}
if ( $bed->[ chromEnd ] =~ /\D/ ) {
- Maasha::Common::error( qq(Bad BED entry - chromEnd must be a whole number - not "$bed->[ chromEnd ]") );
+ Maasha::Common::error( qq(Bad BED entry "$entry" - chromEnd must be a whole number - not "$bed->[ chromEnd ]") );
}
if ( $bed->[ chromEnd ] < $bed->[ chromStart ] ) {
- Maasha::Common::error( qq(Bad BED entry - chromEnd must be greater than chromStart - not "$bed->[ chromStart ] > $bed->[ chromEnd ]") );
+ Maasha::Common::error( qq(Bad BED entry "$entry" - chromEnd must be greater than chromStart - not "$bed->[ chromStart ] > $bed->[ chromEnd ]") );
}
return if $cols == 3;
if ( $bed->[ name ] =~ /\s/ ) {
- Maasha::Common::error( qq(Bad BED entry - no white space allowed in name field: "$bed->[ name ]") );
+ Maasha::Common::error( qq(Bad BED entry "$entry" - no white space allowed in name field: "$bed->[ name ]") );
}
return if $cols == 4;
if ( $bed->[ score ] =~ /\D/ ) {
- Maasha::Common::error( qq(Bad BED entry - score must be a whole number - not "$bed->[ score ]") );
+ Maasha::Common::error( qq(Bad BED entry "$entry" - score must be a whole number - not "$bed->[ score ]") );
}
# if ( $bed->[ score ] < 0 or $bed->[ score ] > 1000 ) { # disabled - too restrictive !
if ( $bed->[ score ] < 0 ) {
- Maasha::Common::error( qq(Bad BED entry - score must be between 0 and 1000 - not "$bed->[ score ]") );
+ Maasha::Common::error( qq(Bad BED entry "$entry" - score must be between 0 and 1000 - not "$bed->[ score ]") );
}
return if $cols == 5;
if ( $bed->[ strand ] ne '+' and $bed->[ strand ] ne '-' ) {
- Maasha::Common::error( qq(Bad BED entry - strand must be + or - not "$bed->[ strand ]") );
+ Maasha::Common::error( qq(Bad BED entry "$entry" - strand must be + or - not "$bed->[ strand ]") );
}
return if $cols == 6;
if ( $bed->[ thickStart ] =~ /\D/ ) {
- Maasha::Common::error( qq(Bad BED entry - thickStart must be a whole number - not "$bed->[ thickStart ]") );
+ Maasha::Common::error( qq(Bad BED entry "$entry" - thickStart must be a whole number - not "$bed->[ thickStart ]") );
}
if ( $bed->[ thickEnd ] =~ /\D/ ) {
- Maasha::Common::error( qq(Bad BED entry - thickEnd must be a whole number - not "$bed->[ thickEnd ]") );
+ Maasha::Common::error( qq(Bad BED entry "$entry" - thickEnd must be a whole number - not "$bed->[ thickEnd ]") );
}
if ( $bed->[ thickEnd ] < $bed->[ thickStart ] ) {
- Maasha::Common::error( qq(Bad BED entry - thickEnd must be greater than thickStart - not "$bed->[ thickStart ] > $bed->[ thickEnd ]") );
+ Maasha::Common::error( qq(Bad BED entry "$entry" - thickEnd must be greater than thickStart - not "$bed->[ thickStart ] > $bed->[ thickEnd ]") );
}
if ( $bed->[ thickStart ] < $bed->[ chromStart ] ) {
- Maasha::Common::error( qq(Bad BED entry - thickStart must be greater than chromStart - not "$bed->[ thickStart ] < $bed->[ chromStart ]") );
+ Maasha::Common::error( qq(Bad BED entry "$entry" - thickStart must be greater than chromStart - not "$bed->[ thickStart ] < $bed->[ chromStart ]") );
}
if ( $bed->[ thickStart ] > $bed->[ chromEnd ] ) {
- Maasha::Common::error( qq(Bad BED entry - thickStart must be less than chromEnd - not "$bed->[ thickStart ] > $bed->[ chromEnd ]") );
+ Maasha::Common::error( qq(Bad BED entry "$entry" - thickStart must be less than chromEnd - not "$bed->[ thickStart ] > $bed->[ chromEnd ]") );
}
if ( $bed->[ thickEnd ] < $bed->[ chromStart ] ) {
- Maasha::Common::error( qq(Bad BED entry - thickEnd must be greater than chromStart - not "$bed->[ thickEnd ] < $bed->[ chromStart ]") );
+ Maasha::Common::error( qq(Bad BED entry "$entry" - thickEnd must be greater than chromStart - not "$bed->[ thickEnd ] < $bed->[ chromStart ]") );
}
if ( $bed->[ thickEnd ] > $bed->[ chromEnd ] ) {
- Maasha::Common::error( qq(Bad BED entry - thickEnd must be less than chromEnd - not "$bed->[ thickEnd ] > $bed->[ chromEnd ]") );
+ Maasha::Common::error( qq(Bad BED entry "$entry" - thickEnd must be less than chromEnd - not "$bed->[ thickEnd ] > $bed->[ chromEnd ]") );
}
return if $cols == 8;
if ( $bed->[ itemRgb ] !~ /^(0|\d{1,3},\d{1,3},\d{1,3},?)$/ ) {
- Maasha::Common::error( qq(Bad BED entry - itemRgb must be 0 or in the form of 255,0,0 - not "$bed->[ itemRgb ]") );
+ Maasha::Common::error( qq(Bad BED entry "$entry" - itemRgb must be 0 or in the form of 255,0,0 - not "$bed->[ itemRgb ]") );
}
return if $cols == 9;
if ( $bed->[ blockCount ] =~ /\D/ ) {
- Maasha::Common::error( qq(Bad BED entry - blockCount must be a whole number - not "$bed->[ blockCount ]") );
+ Maasha::Common::error( qq(Bad BED entry "$entry" - blockCount must be a whole number - not "$bed->[ blockCount ]") );
}
@block_sizes = split ",", $bed->[ blockSizes ];
if ( grep /\D/, @block_sizes ) {
- Maasha::Common::error( qq(Bad BED entry - blockSizes must be whole numbers - not "$bed->[ blockSizes ]") );
+ Maasha::Common::error( qq(Bad BED entry "$entry" - blockSizes must be whole numbers - not "$bed->[ blockSizes ]") );
}
if ( $bed->[ blockCount ] != scalar @block_sizes ) {
- Maasha::Common::error( qq(Bad BED entry - blockSizes "$bed->[ blockSizes ]" must match blockCount "$bed->[ blockCount ]") );
+ Maasha::Common::error( qq(Bad BED entry "$entry" - blockSizes "$bed->[ blockSizes ]" must match blockCount "$bed->[ blockCount ]") );
}
@block_starts = split ",", $bed->[ blockStarts ];
if ( grep /\D/, @block_starts ) {
- Maasha::Common::error( qq(Bad BED entry - blockStarts must be whole numbers - not "$bed->[ blockStarts ]") );
+ Maasha::Common::error( qq(Bad BED entry "$entry" - blockStarts must be whole numbers - not "$bed->[ blockStarts ]") );
}
if ( $bed->[ blockCount ] != scalar @block_starts ) {
- Maasha::Common::error( qq(Bad BED entry - blockStarts "$bed->[ blockStarts ]" must match blockCount "$bed->[ blockCount ]") );
+ Maasha::Common::error( qq(Bad BED entry "$entry" - blockStarts "$bed->[ blockStarts ]" must match blockCount "$bed->[ blockCount ]") );
}
if ( $bed->[ chromStart ] + $block_starts[ -1 ] + $block_sizes[ -1 ] != $bed->[ chromEnd ] ) {
- Maasha::Common::error( qq(Bad BED entry - chromStart + blockStarts[last] + blockSizes[last] must equal chromEnd: ) .
+ Maasha::Common::error( qq(Bad BED entry "$entry" - chromStart + blockStarts[last] + blockSizes[last] must equal chromEnd: ) .
qq($bed->[ chromStart ] + $block_starts[ -1 ] + $block_sizes[ -1 ] != $bed->[ chromEnd ]) );
}
}