# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-sub fasta_format_ok
-{
- # Martin A. Hansen, March 2007.
-
- # Checks if a given FASTA file is formatted with
- # one header per line and one sequence per line.
- # returns 1 if so, else 0.
-
- my ( $path, # full path to FASTA file
- ) = @_;
-
- # Returns boolean
-
- my ( $fh, $line, $count );
-
- $fh = Maasha::Common::read_open( $path );
-
- $count = 0;
-
- while ( $line = <$fh> )
- {
- if ( not $count % 2 and substr( $line, 0, 1 ) ne ">" ) {
- return 0;
- }
-
- $count++;
- }
-
- close $fh;
-
- return 1;
-}
-
-
-sub get_entries
-{
- # Martin A. Hansen, December 2006.
-
- # Parses a fasta file and returns a list of headers and sequence tuples.
-
- my ( $path, # full path to FASTA file
- $count, # number of sequences to read - OPTIONAL
- ) = @_;
-
- # returns list of tuples
-
- my ( $fh, $entry, @entries );
-
- $fh = Maasha::Common::read_open( $path );
-
- while ( $entry = get_entry( $fh ) )
- {
- push @entries, $entry;
-
- if ( $count and $count == @entries ) {
- last;
- }
- }
-
- close $fh;
-
- return wantarray ? @entries : \@entries;
-}
-
-
-sub put_entries
-{
- # Martin A. Hansen, March 2004.
-
- # writes fasta sequences to STDOUT or file
-
- my ( $entries, # list of fasta entries
- $path, # full path to file - OPTIONAL
- $wrap, # line width - OPTIONAL
- ) = @_;
-
- my ( $fh );
-
- $fh = Maasha::Common::write_open( $path ) if $path;
-
- map { put_entry( $_, $fh, $wrap ) } @{ $entries };
-
- close $fh if defined;
-}
-
-
-sub wrap
-{
- # Martin A. Hansen, June 2007
-
- # Wraps the sequence of a given FASTA entry
- # to a given length.
-
- my ( $entry, # FASTA entry
- $wrap, # wrap length
- ) = @_;
-
- # Returns nothing.
-
- Maasha::Seq::wrap( \$entry->[ SEQ ], $wrap );
-}
-
-
sub get_entry
{
# Martin A. Hansen, January 2007.
}
-sub find_shortest
-{
- # Martin A. Hansen, June 2007.
-
- # Given a stack of FASTA entries, find and return
- # the shortest entry.
-
- my ( $entries, # list of FASTA entries
- ) = @_;
-
- # returns tuple
-
- my ( $min, $entry, $min_entry );
-
- $min = 99999999999;
-
- foreach $entry ( @{ $entries } )
- {
- if ( length( $entry->[ SEQ ] ) < $min )
- {
- $min_entry = $entry;
- $min = length $entry->[ SEQ ];
- }
- }
-
- return wantarray ? @{ $min_entry } : $min_entry;
-}
-
-
-sub find_longest
-{
- # Martin A. Hansen, June 2007.
-
- # Given a stack of FASTA entries, find and return
- # the longest entry.
-
- my ( $entries, # list of FASTA entries
- ) = @_;
-
- # returns tuple
-
- my ( $max, $entry, $max_entry );
-
- $max = 0;
-
- foreach $entry ( @{ $entries } )
- {
- if ( length( $entry->[ SEQ ] ) > $max )
- {
- $max_entry = $entry;
- $max = length $entry->[ SEQ ];
- }
- }
-
- return wantarray ? @{ $max_entry } : $max_entry;
-}
-
-
-sub fasta_get_headers
-{
- # Martin A. Hansen, May 2007.
-
- # Gets the header names of a FASTA file,
- # and returns these in a list.
-
- my ( $path, # full path to FASTA file
- ) = @_;
-
- # returns list
-
- my ( $fh, $entry, @list );
-
- $fh = Maasha::Common::read_open( $path );
-
- while ( $entry = get_entry( $fh ) ) {
- push @list, $entry->[ SEQ_NAME ];
- }
-
- close $fh;
-
- return wantarray ? @list : \@list;
-}
-
-
-sub fasta_reformat
+sub fasta_index
{
- # Martin A. Hansen, December 2004.
+ # Martin A. Hansen, July 2008.
- # Given a file of one or more FASTA entries, reformats these so
- # each entry consits of one line with header and one line with sequence.
+ # Given a path to a FASTA file create a FASTA index
+ # and save to the specified path.
- my ( $path, # full path to file with multiple FASTA entries
+ my ( $fasta_path, # path to FASTA file
+ $index_dir, # directory
+ $index_name, # path to index file
) = @_;
- my ( $fh_in, $fh_out, $entry );
+ # Returns nothing.
- $fh_in = Maasha::Common::read_open( $path );
- $fh_out = Maasha::Common::write_open( "$path.temp" );
+ my ( $index );
- while ( $entry = get_entry( $fh_in ) ) {
- put_entry( $entry, $fh_out );
- }
+ $index = index_create( $fasta_path );
- close $fh_in;
- close $fh_out;
+ Maasha::Filesys::dir_create_if_not_exists( $index_dir );
- rename( "$path.temp", $path );
+ Maasha::Fasta::index_store( "$index_dir/$index_name", $index );
}
-sub fasta_index
+sub index_check
{
- # Martin A. Hansen, July 2008.
+ # Martin A. Hansen, September 2009.
- # Given a path to a FASTA file create a FASTA index
- # and save to the specified path.
+ # Given a path to a FASTA file and a FASTA index
+ # check if the FILESIZE matches.
my ( $fasta_path, # path to FASTA file
$index_path, # path to index file
) = @_;
- # Returns nothing.
+ # Returns boolean.
my ( $index );
- $index = index_create( $fasta_path );
+ $index = Maasha::Fasta::index_retrieve( $index_path );
- Maasha::Fasta::index_store( $index_path, $index );
+ if ( Maasha::Filesys::file_size( $fasta_path ) == $index->{ 'FILE_SIZE' } ) {
+ return 1;
+ }
+
+ return 0;
}
+
sub index_create
{
# Matin A. Hansen, December 2004.
# Given a FASTA file formatted with one line of header and one line of sequence,
- # returns a list of header, seq beg and seq length (first nucleotide is 0). Also,
+ # returns a list of header, seq offset and seq length (first nucleotide is 0). Also,
# the file size of the indexed file is written to the index for checking purposes.
my ( $path, # full path to file with multiple FASTA entries
) = @_;
- # returns a hashref
+ # Returns a hashref.
- my ( $file_size, $fh, $entry, $beg, $len, %hash, @index );
+ my ( $fh, $entry, $offset, $len, $tot, %index );
- $file_size = Maasha::Filesys::file_size( $path );
+ $index{ 'FILE_SIZE' } = Maasha::Filesys::file_size( $path );
- push @index, "FILE_SIZE=$file_size";
+ $offset = 0;
+ $len = 0;
+ $tot = 0;
$fh = Maasha::Filesys::file_read_open( $path );
- $beg = 0;
- $len = 0;
-
while ( $entry = get_entry( $fh ) )
{
- warn qq(WARNING: header->$entry->[ SEQ_NAME ] alread exists in index) if exists $hash{ $entry->[ SEQ_NAME ] };
+ warn qq(WARNING: header->$entry->[ SEQ_NAME ] alread exists in index) if exists $index{ $entry->[ SEQ_NAME ] };
- $beg += $len + 2 + length $entry->[ SEQ_NAME ];
- $len = length $entry->[ SEQ ];
+ $offset += $len + 2 + length $entry->[ SEQ_NAME ];
+ $len = length $entry->[ SEQ ];
- push @index, [ $entry->[ SEQ_NAME ], $beg, $len ];
+ $index{ $entry->[ SEQ_NAME ] } = [ $offset, $len ];
- $hash{ $entry->[ SEQ_NAME ] } = 1;
+ $offset++;
- $beg++;
+ $tot += length( $entry->[ SEQ_NAME ] ) + length( $entry->[ SEQ ] ) + 3;
}
close $fh;
- return wantarray ? @index : \@index;
-}
-
-
-sub index_search
-{
- # Martin A. Hansen, December 2004.
-
- # Searches the index for matching entries.
-
- my ( $index, # index list
- $regex, # regex to match FASTA headers [OPTIONAL]
- $invert, # invert matching
- ) = @_;
-
- # returns list
-
- my ( @results );
-
- if ( not $regex )
- {
- @results = @{ $index };
- }
- else
- {
- if ( $invert ) {
- @results = grep { $_->[ 0 ] !~ /$regex/ } @{ $index };
- } else {
- @results = grep { $_->[ 0 ] =~ /$regex/ } @{ $index };
- }
+ if ( $index{ 'FILE_SIZE' } != $tot ) {
+ Maasha::Common::error( qq(Filesize "$index{ 'FILE_SIZE' }" does not match content "$tot" -> malformatted FASTA file) );
}
- return wantarray ? @results : \@results;
+ return wantarray ? %index : \%index;
}
{
# Martin A. Hansen, July 2007.
- # Lookup a list of exact matches in the index and returns these
+ # Lookup a list of exact matches in the index and returns these.
my ( $index, # index list
$headers, # headers to lookup
) = @_;
- # returns a list
-
- my ( %hash, $head, @results );
+ # Returns a list.
- map { $hash{ $_->[ 0 ] } = [ $_->[ 1 ], $_->[ 2 ] ] } @{ $index };
+ my ( $head, @results );
foreach $head ( @{ $headers } )
{
- if ( exists $hash{ $head } ) {
- push @results, [ $head, $hash{ $head }->[ 0 ], $hash{ $head }->[ 1 ] ];
+ if ( exists $index->{ $head } ) {
+ push @results, [ $head, $index->{ $head }->[ 0 ], $index->{ $head }->[ 1 ] ];
}
}