3 # Copyright (C) 2006 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 # Routines for manipulation of FASTA files and FASTA entries.
28 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
35 use vars qw ( @ISA @EXPORT );
37 @ISA = qw( Exporter );
45 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
50 # Martin A. Hansen, March 2007.
52 # Checks if a given FASTA file is formatted with
53 # one header per line and one sequence per line.
54 # returns 1 if so, else 0.
56 my ( $path, # full path to FASTA file
61 my ( $fh, $line, $count );
63 $fh = Maasha::Common::read_open( $path );
67 while ( $line = <$fh> )
69 if ( not $count % 2 and substr( $line, 0, 1 ) ne ">" ) {
84 # Martin A. Hansen, December 2006.
86 # Parses a fasta file and returns a list of headers and sequence tuples.
88 my ( $path, # full path to FASTA file
89 $count, # number of sequences to read - OPTIONAL
92 # returns list of tuples
94 my ( $fh, $entry, @entries );
96 $fh = Maasha::Common::read_open( $path );
98 while ( $entry = get_entry( $fh ) )
100 push @entries, $entry;
102 if ( $count and $count == @entries ) {
109 return wantarray ? @entries : \@entries;
115 # Martin A. Hansen, March 2004.
117 # writes fasta sequences to STDOUT or file
119 my ( $entries, # list of fasta entries
120 $path, # full path to file - OPTIONAL
121 $wrap, # line width - OPTIONAL
126 $fh = Maasha::Common::write_open( $path ) if $path;
128 map { put_entry( $_, $fh, $wrap ) } @{ $entries };
130 close $fh if defined;
136 # Martin A. Hansen, June 2007
138 # Wraps the sequence of a given FASTA entry
141 my ( $entry, # FASTA entry
147 Maasha::Seq::wrap( \$entry->[ SEQ ], $wrap );
153 # Martin A. Hansen, January 2007.
155 # Given a filehandle to an FASTA file,
156 # fetches the next FASTA entry, and returns
157 # this as a tuple of [ header, sequence ].
159 my ( $fh, # filehandle to FASTA file
164 my ( $block, @lines, $seq_name, $seq, $entry );
168 while ( $block = <$fh> )
172 last if $block !~ /^\s+$/;
175 return if not defined $block;
177 # $block =~ />?([^\n]+)\n/m;
178 $block =~ /^>?(.+)\n/m;
186 $seq_name =~ tr/\r//d;
187 $seq =~ tr/ \t\n\r//d;
189 $entry = [ $seq_name, $seq ];
191 return wantarray ? @{ $entry } : $entry;
197 # Martin A. Hansen, January 2007.
199 # Writes FASTA entries to STDOUT or file.
201 my ( $entry, # a FASTA entries
202 $fh, # file handle to output file - OPTIONAL
203 $wrap, # line width - OPTIONAL
208 Maasha::Common::error( qq(FASTA entry has no header) ) if not defined $entry->[ HEAD ];
209 Maasha::Common::error( qq(FASTA entry has no sequence) ) if not defined $entry->[ SEQ ];
212 Maasha::Fasta::wrap( $entry, $wrap );
216 print $fh ">$entry->[ HEAD ]\n$entry->[ SEQ ]\n";
218 print ">$entry->[ HEAD ]\n$entry->[ SEQ ]\n";
225 # Martin A. Hansen, June 2007.
227 # Given a stack of FASTA entries, find and return
228 # the shortest entry.
230 my ( $entries, # list of FASTA entries
235 my ( $min, $entry, $min_entry );
239 foreach $entry ( @{ $entries } )
241 if ( length( $entry->[ SEQ ] ) < $min )
244 $min = length $entry->[ SEQ ];
248 return wantarray ? @{ $min_entry } : $min_entry;
254 # Martin A. Hansen, June 2007.
256 # Given a stack of FASTA entries, find and return
259 my ( $entries, # list of FASTA entries
264 my ( $max, $entry, $max_entry );
268 foreach $entry ( @{ $entries } )
270 if ( length( $entry->[ SEQ ] ) > $max )
273 $max = length $entry->[ SEQ ];
277 return wantarray ? @{ $max_entry } : $max_entry;
281 sub fasta_get_headers
283 # Martin A. Hansen, May 2007.
285 # Gets the header names of a FASTA file,
286 # and returns these in a list.
288 my ( $path, # full path to FASTA file
293 my ( $fh, $entry, @list );
295 $fh = Maasha::Common::read_open( $path );
297 while ( $entry = get_entry( $fh ) ) {
298 push @list, $entry->[ HEAD ];
303 return wantarray ? @list : \@list;
309 # Martin A. Hansen, December 2004.
311 # Given a file of one or more FASTA entries, reformats these so
312 # each entry consits of one line with header and one line with sequence.
314 my ( $path, # full path to file with multiple FASTA entries
317 my ( $fh_in, $fh_out, $entry );
319 $fh_in = Maasha::Common::read_open( $path );
320 $fh_out = Maasha::Common::write_open( "$path.temp" );
322 while ( $entry = get_entry( $fh_in ) ) {
323 put_entry( $entry, $fh_out );
329 rename( "$path.temp", $path );
335 # Martin A. Hansen, July 2008.
337 # Given a path to a FASTA file create a FASTA index
338 # and save to the specified path.
340 my ( $fasta_path, # path to FASTA file
341 $index_path, # path to index file
348 $index = index_create( $fasta_path );
350 Maasha::Fasta::index_store( $index_path, $index );
355 # Matin A. Hansen, December 2004.
357 # Given a FASTA file formatted with one line of header and one line of sequence,
358 # returns a list of header, seq beg and seq length (first nucleotide is 0). Also,
359 # the file size of the indexed file is written to the index for checking purposes.
361 my ( $path, # full path to file with multiple FASTA entries
366 my ( $file_size, $fh, $entry, $beg, $len, %hash, @index );
368 $file_size = Maasha::Common::file_size( $path );
370 push @index, "FILE_SIZE=$file_size";
372 $fh = Maasha::Common::read_open( $path );
377 while ( $entry = get_entry( $fh ) )
379 warn qq(WARNING: header->$entry->[ HEAD ] alread exists in index) if exists $hash{ $entry->[ HEAD ] };
381 $beg += $len + 2 + length $entry->[ HEAD ];
382 $len = length $entry->[ SEQ ];
384 push @index, [ $entry->[ HEAD ], $beg, $len ];
386 $hash{ $entry->[ HEAD ] } = 1;
393 return wantarray ? @index : \@index;
399 # Martin A. Hansen, December 2004.
401 # Searches the index for matching entries.
403 my ( $index, # index list
404 $regex, # regex to match FASTA headers [OPTIONAL]
405 $invert, # invert matching
414 @results = @{ $index };
419 @results = grep { $_->[ 0 ] !~ /$regex/ } @{ $index };
421 @results = grep { $_->[ 0 ] =~ /$regex/ } @{ $index };
425 return wantarray ? @results : \@results;
431 # Martin A. Hansen, July 2007.
433 # Lookup a list of exact matches in the index and returns these
435 my ( $index, # index list
436 $headers, # headers to lookup
441 my ( %hash, $head, @results );
443 map { $hash{ $_->[ 0 ] } = [ $_->[ 1 ], $_->[ 2 ] ] } @{ $index };
445 foreach $head ( @{ $headers } )
447 if ( exists $hash{ $head } ) {
448 push @results, [ $head, $hash{ $head }->[ 0 ], $hash{ $head }->[ 1 ] ];
452 return wantarray ? @results : \@results;
458 # Martin A. Hansen, May 2007.
460 # Stores a FASTA index to binary file.
462 my ( $path, # full path to file
463 $index, # list with index
468 Maasha::Common::file_store( $path, $index );
474 # Martin A. Hansen, May 2007.
476 # Retrieves a FASTA index from binary file.
478 my ( $path, # full path to file
485 $index = Maasha::Common::file_retrieve( $path );
487 return wantarray ? @{ $index } : $index;
493 # Martin A. Hansen, July 2008.
495 # Given a biopiece record converts it to a FASTA record.
496 # If no generic SEQ or SEQ_NAME is found, the Q_* and S_* are
497 # tried in that order.
499 my ( $record, # record
504 my ( $seq_name, $seq );
506 $seq_name = $record->{ "SEQ_NAME" } || $record->{ "Q_ID" } || $record->{ "S_ID" };
507 $seq = $record->{ "SEQ" } || $record->{ "Q_SEQ" } || $record->{ "S_SEQ" };
509 if ( defined $seq_name and defined $seq ) {
510 return wantarray ? ( $seq_name, $seq ) : [ $seq_name, $seq ];
517 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<