3 # Copyright (C) 2006 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 # Routines for manipulation of FASTA files and FASTA entries.
28 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
35 use vars qw ( @ISA @EXPORT );
37 @ISA = qw( Exporter );
45 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
50 # Martin A. Hansen, March 2007.
52 # Checks if a given FASTA file is formatted with
53 # one header per line and one sequence per line.
54 # returns 1 if so, else 0.
56 my ( $path, # full path to FASTA file
61 my ( $fh, $line, $count );
63 $fh = &Maasha::Common::read_open( $path );
67 while ( $line = <$fh> )
69 if ( not $count % 2 and substr( $line, 0, 1 ) ne ">" ) {
84 # Martin A. Hansen, December 2006.
86 # Parses a fasta file and returns a list of headers and sequence tuples.
88 my ( $path, # full path to FASTA file
89 $count, # number of sequences to read - OPTIONAL
92 # returns list of tuples
94 my ( $fh, $entry, @entries );
96 $fh = &Maasha::Common::read_open( $path );
98 while ( $entry = &get_entry( $fh ) )
100 push @entries, $entry;
102 if ( $count and $count == @entries ) {
109 return wantarray ? @entries : \@entries;
115 # Martin A. Hansen, March 2004.
117 # writes fasta sequences to STDOUT or file
119 my ( $entries, # list of fasta entries
120 $path, # full path to file - OPTIONAL
121 $wrap, # line width - OPTIONAL
126 $fh = &Maasha::Common::write_open( $path ) if $path;
128 map { &put_entry( $_, $fh, $wrap ) } @{ $entries };
130 close $fh if defined;
136 # Martin A. Hansen, June 2007
138 # Wraps the sequence of a given FASTA entry
141 my ( $entry, # FASTA entry
147 &Maasha::Seq::wrap( \$entry->[ SEQ ], $wrap );
153 # Martin A. Hansen, January 2007.
155 # Given a filehandle to an FASTA file,
156 # fetches the next FASTA entry, and returns
157 # this as a tuple of [ header, sequence ].
159 my ( $fh, # filehandle to FASTA file
164 my ( $block, @lines, $seq_name, $seq, $entry );
168 while ( $block = <$fh> )
172 last if $block !~ /^\s+$/;
175 return if not defined $block;
177 $block =~ />?([^\n]+)\n/m;
187 $entry = [ $seq_name, $seq ];
189 return wantarray ? @{ $entry } : $entry;
195 # Martin A. Hansen, January 2007.
197 # Writes FASTA entries to STDOUT or file.
199 my ( $entry, # a FASTA entries
200 $fh, # file handle to output file - OPTIONAL
201 $wrap, # line width - OPTIONAL
206 &Maasha::Common::error( qq(FASTA entry has no header) ) if not defined $entry->[ HEAD ];
207 &Maasha::Common::error( qq(FASTA entry has no sequence) ) if not defined $entry->[ SEQ ];
210 &Maasha::Fasta::wrap( $entry, $wrap );
214 print $fh ">$entry->[ HEAD ]\n$entry->[ SEQ ]\n";
216 print ">$entry->[ HEAD ]\n$entry->[ SEQ ]\n";
223 # Martin A. Hansen, June 2007.
225 # Given a stack of FASTA entries, find and return
226 # the shortest entry.
228 my ( $entries, # list of FASTA entries
233 my ( $min, $entry, $min_entry );
237 foreach $entry ( @{ $entries } )
239 if ( length( $entry->[ SEQ ] ) < $min )
242 $min = length $entry->[ SEQ ];
246 return wantarray ? @{ $min_entry } : $min_entry;
252 # Martin A. Hansen, June 2007.
254 # Given a stack of FASTA entries, find and return
257 my ( $entries, # list of FASTA entries
262 my ( $max, $entry, $max_entry );
266 foreach $entry ( @{ $entries } )
268 if ( length( $entry->[ SEQ ] ) > $max )
271 $max = length $entry->[ SEQ ];
275 return wantarray ? @{ $max_entry } : $max_entry;
279 sub fasta_get_headers
281 # Martin A. Hansen, May 2007.
283 # Gets the header names of a FASTA file,
284 # and returns these in a list.
286 my ( $path, # full path to FASTA file
291 my ( $fh, $entry, @list );
293 $fh = &Maasha::Common::read_open( $path );
295 while ( $entry = &get_entry( $fh ) ) {
296 push @list, $entry->[ HEAD ];
301 return wantarray ? @list : \@list;
307 # Martin A. Hansen, December 2004.
309 # Given a file of one or more FASTA entries, reformats these so
310 # each entry consits of one line with header and one line with sequence.
312 my ( $path, # full path to file with multiple FASTA entries
315 my ( $fh_in, $fh_out, $entry );
317 $fh_in = &Maasha::Common::read_open( $path );
318 $fh_out = &Maasha::Common::write_open( "$path.temp" );
320 while ( $entry = &get_entry( $fh_in ) ) {
321 &put_entry( $entry, $fh_out );
327 rename( "$path.temp", $path );
333 # Matin A. Hansen, December 2004.
335 # Given a FASTA file formatted with one line of header and one line of sequence,
336 # returns a list of header, seq beg and seq length (first nucleotide is 0). Also,
337 # the file size of the indexed file is written to the index for checking purposes.
339 my ( $path, # full path to file with multiple FASTA entries
344 my ( $file_size, $fh, $entry, $beg, $len, %hash, @index );
346 $file_size = &Maasha::Common::file_size( $path );
348 push @index, "FILE_SIZE=$file_size";
350 $fh = &Maasha::Common::read_open( $path );
355 while ( $entry = &get_entry( $fh ) )
357 warn qq(WARNING: header->$entry->[ HEAD ] alread exists in index) if exists $hash{ $entry->[ HEAD ] };
359 $beg += $len + 2 + length $entry->[ HEAD ];
360 $len = length $entry->[ SEQ ];
362 push @index, [ $entry->[ HEAD ], $beg, $len ];
364 $hash{ $entry->[ HEAD ] } = 1;
371 return wantarray ? @index : \@index;
377 # Martin A. Hansen, December 2004.
379 # Searches the index for matching entries.
381 my ( $index, # index list
382 $regex, # regex to match FASTA headers [OPTIONAL]
383 $invert, # invert matching
392 @results = @{ $index };
397 @results = grep { $_->[ 0 ] !~ /$regex/ } @{ $index };
399 @results = grep { $_->[ 0 ] =~ /$regex/ } @{ $index };
403 return wantarray ? @results : \@results;
409 # Martin A. Hansen, July 2007.
411 # Lookup a list of exact matches in the index and returns these
413 my ( $index, # index list
414 $headers, # headers to lookup
419 my ( %hash, $head, @results );
421 map { $hash{ $_->[ 0 ] } = [ $_->[ 1 ], $_->[ 2 ] ] } @{ $index };
423 foreach $head ( @{ $headers } )
425 if ( exists $hash{ $head } ) {
426 push @results, [ $head, $hash{ $head }->[ 0 ], $hash{ $head }->[ 1 ] ];
430 return wantarray ? @results : \@results;
436 # Martin A. Hansen, May 2007.
438 # Stores a FASTA index to binary file.
440 my ( $path, # full path to file
441 $index, # list with index
446 &Maasha::Common::file_store( $path, $index );
452 # Martin A. Hansen, May 2007.
454 # Retrieves a FASTA index from binary file.
456 my ( $path, # full path to file
463 $index = &Maasha::Common::file_retrieve( $path );
465 return wantarray ? @{ $index } : $index;
469 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<