X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=code_perl%2FMaasha%2FFasta.pm;h=20661555ae220302dd382d3ce9ebb3f0f8f6274d;hb=dac27dcade24185a98c6b56f33f308589546a86a;hp=82307f9911ef1286768b5b194c5c84e310aba702;hpb=5cbfb1e379cfbe79bd871359fd3455b590c9b396;p=biopieces.git diff --git a/code_perl/Maasha/Fasta.pm b/code_perl/Maasha/Fasta.pm index 82307f9..2066155 100644 --- a/code_perl/Maasha/Fasta.pm +++ b/code_perl/Maasha/Fasta.pm @@ -1,6 +1,6 @@ package Maasha::Fasta; -# Copyright (C) 2006 Martin A. Hansen. +# Copyright (C) 2006-2009 Martin A. Hansen. # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License @@ -28,16 +28,18 @@ package Maasha::Fasta; # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +use warnings; use strict; use Data::Dumper; use Maasha::Common; +use Maasha::Filesys; use Maasha::Seq; use vars qw ( @ISA @EXPORT ); @ISA = qw( Exporter ); use constant { - HEAD => 0, + SEQ_NAME => 0, SEQ => 1, }; @@ -45,109 +47,6 @@ use constant { # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -sub fasta_format_ok -{ - # Martin A. Hansen, March 2007. - - # Checks if a given FASTA file is formatted with - # one header per line and one sequence per line. - # returns 1 if so, else 0. - - my ( $path, # full path to FASTA file - ) = @_; - - # Returns boolean - - my ( $fh, $line, $count ); - - $fh = Maasha::Common::read_open( $path ); - - $count = 0; - - while ( $line = <$fh> ) - { - if ( not $count % 2 and substr( $line, 0, 1 ) ne ">" ) { - return 0; - } - - $count++; - } - - close $fh; - - return 1; -} - - -sub get_entries -{ - # Martin A. Hansen, December 2006. - - # Parses a fasta file and returns a list of headers and sequence tuples. - - my ( $path, # full path to FASTA file - $count, # number of sequences to read - OPTIONAL - ) = @_; - - # returns list of tuples - - my ( $fh, $entry, @entries ); - - $fh = Maasha::Common::read_open( $path ); - - while ( $entry = get_entry( $fh ) ) - { - push @entries, $entry; - - if ( $count and $count == @entries ) { - last; - } - } - - close $fh; - - return wantarray ? @entries : \@entries; -} - - -sub put_entries -{ - # Martin A. Hansen, March 2004. - - # writes fasta sequences to STDOUT or file - - my ( $entries, # list of fasta entries - $path, # full path to file - OPTIONAL - $wrap, # line width - OPTIONAL - ) = @_; - - my ( $fh ); - - $fh = Maasha::Common::write_open( $path ) if $path; - - map { put_entry( $_, $fh, $wrap ) } @{ $entries }; - - close $fh if defined; -} - - -sub wrap -{ - # Martin A. Hansen, June 2007 - - # Wraps the sequence of a given FASTA entry - # to a given length. - - my ( $entry, # FASTA entry - $wrap, # wrap length - ) = @_; - - # Returns nothing. - - Maasha::Seq::wrap( \$entry->[ SEQ ], $wrap ); -} - - sub get_entry { # Martin A. Hansen, January 2007. @@ -174,7 +73,8 @@ sub get_entry return if not defined $block; - $block =~ />?([^\n]+)\n/m; + # $block =~ />?([^\n]+)\n/m; + $block =~ /^>?(.+)\n/m; $seq_name = $1; $seq = $'; @@ -182,7 +82,8 @@ sub get_entry chomp $seq; - $seq =~ tr/ \t\n//d; + $seq_name =~ tr/\r//d; + $seq =~ tr/ \t\n\r//d; $entry = [ $seq_name, $seq ]; @@ -194,7 +95,7 @@ sub put_entry { # Martin A. Hansen, January 2007. - # Writes FASTA entries to STDOUT or file. + # Writes a FASTA entry to a file handle. my ( $entry, # a FASTA entries $fh, # file handle to output file - OPTIONAL @@ -203,7 +104,7 @@ sub put_entry # Returns nothing. - Maasha::Common::error( qq(FASTA entry has no header) ) if not defined $entry->[ HEAD ]; + Maasha::Common::error( qq(FASTA entry has no header) ) if not defined $entry->[ SEQ_NAME ]; Maasha::Common::error( qq(FASTA entry has no sequence) ) if not defined $entry->[ SEQ ]; if ( $wrap ) { @@ -211,216 +112,146 @@ sub put_entry } if ( defined $fh ) { - print $fh ">$entry->[ HEAD ]\n$entry->[ SEQ ]\n"; + print $fh ">$entry->[ SEQ_NAME ]\n$entry->[ SEQ ]\n"; } else { - print ">$entry->[ HEAD ]\n$entry->[ SEQ ]\n"; + print ">$entry->[ SEQ_NAME ]\n$entry->[ SEQ ]\n"; } } -sub find_shortest -{ - # Martin A. Hansen, June 2007. - - # Given a stack of FASTA entries, find and return - # the shortest entry. - - my ( $entries, # list of FASTA entries - ) = @_; - - # returns tuple - - my ( $min, $entry, $min_entry ); - - $min = 99999999999; - - foreach $entry ( @{ $entries } ) - { - if ( length( $entry->[ SEQ ] ) < $min ) - { - $min_entry = $entry; - $min = length $entry->[ SEQ ]; - } - } - - return wantarray ? @{ $min_entry } : $min_entry; -} - - -sub find_longest +sub put_entries { - # Martin A. Hansen, June 2007. + # Martin A. Hansen, September 2009 - # Given a stack of FASTA entries, find and return - # the longest entry. + # Write FASTA entries to a file. my ( $entries, # list of FASTA entries + $file, # output file + $wrap, # line width - OPTIONAL ) = @_; - # returns tuple + # Returns nothing - my ( $max, $entry, $max_entry ); + my ( $fh ); - $max = 0; + $fh = Maasha::Filesys::file_write_open( $file ); - foreach $entry ( @{ $entries } ) - { - if ( length( $entry->[ SEQ ] ) > $max ) - { - $max_entry = $entry; - $max = length $entry->[ SEQ ]; - } - } + map { put_entry( $_, $fh, $wrap ) } @{ $entries }; - return wantarray ? @{ $max_entry } : $max_entry; + close $fh; } -sub fasta_get_headers +sub wrap { - # Martin A. Hansen, May 2007. + # Martin A. Hansen, June 2007 - # Gets the header names of a FASTA file, - # and returns these in a list. + # Wraps the sequence of a given FASTA entry + # to a given length. - my ( $path, # full path to FASTA file + my ( $entry, # FASTA entry + $wrap, # wrap length ) = @_; - # returns list - - my ( $fh, $entry, @list ); - - $fh = Maasha::Common::read_open( $path ); - - while ( $entry = get_entry( $fh ) ) { - push @list, $entry->[ HEAD ]; - } - - close $fh; + # Returns nothing. - return wantarray ? @list : \@list; + Maasha::Seq::wrap( \$entry->[ SEQ ], $wrap ); } -sub fasta_reformat +sub fasta_index { - # Martin A. Hansen, December 2004. + # Martin A. Hansen, July 2008. - # Given a file of one or more FASTA entries, reformats these so - # each entry consits of one line with header and one line with sequence. + # Given a path to a FASTA file create a FASTA index + # and save to the specified path. - my ( $path, # full path to file with multiple FASTA entries + my ( $fasta_path, # path to FASTA file + $index_dir, # directory + $index_name, # path to index file ) = @_; - my ( $fh_in, $fh_out, $entry ); + # Returns nothing. - $fh_in = Maasha::Common::read_open( $path ); - $fh_out = Maasha::Common::write_open( "$path.temp" ); + my ( $index ); - while ( $entry = get_entry( $fh_in ) ) { - put_entry( $entry, $fh_out ); - } + $index = index_create( $fasta_path ); - close $fh_in; - close $fh_out; + Maasha::Filesys::dir_create_if_not_exists( $index_dir ); - rename( "$path.temp", $path ); + Maasha::Fasta::index_store( "$index_dir/$index_name", $index ); } -sub fasta_index +sub index_check { - # Martin A. Hansen, July 2008. + # Martin A. Hansen, September 2009. - # Given a path to a FASTA file create a FASTA index - # and save to the specified path. + # Given a path to a FASTA file and a FASTA index + # check if the FILESIZE matches. my ( $fasta_path, # path to FASTA file $index_path, # path to index file ) = @_; - # Returns nothing. + # Returns boolean. my ( $index ); - $index = index_create( $fasta_path ); + $index = Maasha::Fasta::index_retrieve( $index_path ); + + if ( Maasha::Filesys::file_size( $fasta_path ) == $index->{ 'FILE_SIZE' } ) { + return 1; + } - Maasha::Fasta::index_store( $index_path, $index ); + return 0; } + sub index_create { # Matin A. Hansen, December 2004. # Given a FASTA file formatted with one line of header and one line of sequence, - # returns a list of header, seq beg and seq length (first nucleotide is 0). Also, + # returns a list of header, seq offset and seq length (first nucleotide is 0). Also, # the file size of the indexed file is written to the index for checking purposes. my ( $path, # full path to file with multiple FASTA entries ) = @_; - # returns a hashref - - my ( $file_size, $fh, $entry, $beg, $len, %hash, @index ); + # Returns a hashref. - $file_size = Maasha::Common::file_size( $path ); + my ( $fh, $entry, $offset, $len, $tot, %index ); - push @index, "FILE_SIZE=$file_size"; + $index{ 'FILE_SIZE' } = Maasha::Filesys::file_size( $path ); - $fh = Maasha::Common::read_open( $path ); + $offset = 0; + $len = 0; + $tot = 0; - $beg = 0; - $len = 0; + $fh = Maasha::Filesys::file_read_open( $path ); while ( $entry = get_entry( $fh ) ) { - warn qq(WARNING: header->$entry->[ HEAD ] alread exists in index) if exists $hash{ $entry->[ HEAD ] }; + warn qq(WARNING: header->$entry->[ SEQ_NAME ] alread exists in index) if exists $index{ $entry->[ SEQ_NAME ] }; - $beg += $len + 2 + length $entry->[ HEAD ]; - $len = length $entry->[ SEQ ]; + $offset += $len + 2 + length $entry->[ SEQ_NAME ]; + $len = length $entry->[ SEQ ]; - push @index, [ $entry->[ HEAD ], $beg, $len ]; + $index{ $entry->[ SEQ_NAME ] } = [ $offset, $len ]; - $hash{ $entry->[ HEAD ] } = 1; + $offset++; - $beg++; + $tot += length( $entry->[ SEQ_NAME ] ) + length( $entry->[ SEQ ] ) + 3; } close $fh; - return wantarray ? @index : \@index; -} - - -sub index_search -{ - # Martin A. Hansen, December 2004. - - # Searches the index for matching entries. - - my ( $index, # index list - $regex, # regex to match FASTA headers [OPTIONAL] - $invert, # invert matching - ) = @_; - - # returns list - - my ( @results ); - - if ( not $regex ) - { - @results = @{ $index }; - } - else - { - if ( $invert ) { - @results = grep { $_->[ 0 ] !~ /$regex/ } @{ $index }; - } else { - @results = grep { $_->[ 0 ] =~ /$regex/ } @{ $index }; - } + if ( $index{ 'FILE_SIZE' } != $tot ) { + Maasha::Common::error( qq(Filesize "$index{ 'FILE_SIZE' }" does not match content "$tot" -> malformatted FASTA file) ); } - return wantarray ? @results : \@results; + return wantarray ? %index : \%index; } @@ -428,22 +259,20 @@ sub index_lookup { # Martin A. Hansen, July 2007. - # Lookup a list of exact matches in the index and returns these + # Lookup a list of exact matches in the index and returns these. my ( $index, # index list $headers, # headers to lookup ) = @_; - # returns a list - - my ( %hash, $head, @results ); + # Returns a list. - map { $hash{ $_->[ 0 ] } = [ $_->[ 1 ], $_->[ 2 ] ] } @{ $index }; + my ( $head, @results ); foreach $head ( @{ $headers } ) { - if ( exists $hash{ $head } ) { - push @results, [ $head, $hash{ $head }->[ 0 ], $hash{ $head }->[ 1 ] ]; + if ( exists $index->{ $head } ) { + push @results, [ $head, $index->{ $head }->[ 0 ], $index->{ $head }->[ 1 ] ]; } } @@ -463,7 +292,7 @@ sub index_store # returns nothing - Maasha::Common::file_store( $path, $index ); + Maasha::Filesys::file_store( $path, $index ); } @@ -480,10 +309,63 @@ sub index_retrieve my $index; - $index = Maasha::Common::file_retrieve( $path ); + $index = Maasha::Filesys::file_retrieve( $path ); return wantarray ? @{ $index } : $index; } +sub biopiece2fasta +{ + # Martin A. Hansen, July 2008. + + # Given a biopiece record converts it to a FASTA record. + # If no generic SEQ or SEQ_NAME is found, the Q_* and S_* are + # tried in that order. + + my ( $record, # record + ) = @_; + + # Returns a tuple. + + my ( $seq_name, $seq ); + + $seq_name = $record->{ "SEQ_NAME" } || $record->{ "Q_ID" } || $record->{ "S_ID" }; + $seq = $record->{ "SEQ" } || $record->{ "Q_SEQ" } || $record->{ "S_SEQ" }; + + if ( defined $seq_name and defined $seq ) { + return wantarray ? ( $seq_name, $seq ) : [ $seq_name, $seq ]; + } else { + return; + } +} + + +sub fasta2biopiece +{ + # Martin A. Hansen, May 2009 + + # Convert a FASTA record to a Biopiece record. + + my ( $entry, # FASTA entry + ) = @_; + + # Returns a hashref. + + my ( $record ); + + if ( defined $entry->[ SEQ_NAME ] and $entry->[ SEQ ] ) + { + $record = { + SEQ_NAME => $entry->[ SEQ_NAME ], + SEQ => $entry->[ SEQ ], + SEQ_LEN => length $entry->[ SEQ ], + }; + } + + return wantarray ? %{ $record } : $record; +} + + + # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<