3 # Copyright (C) 2006-2009 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 # Routines for manipulation of FASTA files and FASTA entries.
28 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
36 use vars qw ( @ISA @EXPORT );
38 @ISA = qw( Exporter );
46 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
51 # Martin A. Hansen, March 2007.
53 # Checks if a given FASTA file is formatted with
54 # one header per line and one sequence per line.
55 # returns 1 if so, else 0.
57 my ( $path, # full path to FASTA file
62 my ( $fh, $line, $count );
64 $fh = Maasha::Common::read_open( $path );
68 while ( $line = <$fh> )
70 if ( not $count % 2 and substr( $line, 0, 1 ) ne ">" ) {
85 # Martin A. Hansen, December 2006.
87 # Parses a fasta file and returns a list of headers and sequence tuples.
89 my ( $path, # full path to FASTA file
90 $count, # number of sequences to read - OPTIONAL
93 # returns list of tuples
95 my ( $fh, $entry, @entries );
97 $fh = Maasha::Common::read_open( $path );
99 while ( $entry = get_entry( $fh ) )
101 push @entries, $entry;
103 if ( $count and $count == @entries ) {
110 return wantarray ? @entries : \@entries;
116 # Martin A. Hansen, March 2004.
118 # writes fasta sequences to STDOUT or file
120 my ( $entries, # list of fasta entries
121 $path, # full path to file - OPTIONAL
122 $wrap, # line width - OPTIONAL
127 $fh = Maasha::Common::write_open( $path ) if $path;
129 map { put_entry( $_, $fh, $wrap ) } @{ $entries };
131 close $fh if defined;
137 # Martin A. Hansen, June 2007
139 # Wraps the sequence of a given FASTA entry
142 my ( $entry, # FASTA entry
148 Maasha::Seq::wrap( \$entry->[ SEQ ], $wrap );
154 # Martin A. Hansen, January 2007.
156 # Given a filehandle to an FASTA file,
157 # fetches the next FASTA entry, and returns
158 # this as a tuple of [ header, sequence ].
160 my ( $fh, # filehandle to FASTA file
165 my ( $block, @lines, $seq_name, $seq, $entry );
169 while ( $block = <$fh> )
173 last if $block !~ /^\s+$/;
176 return if not defined $block;
178 # $block =~ />?([^\n]+)\n/m;
179 $block =~ /^>?(.+)\n/m;
187 $seq_name =~ tr/\r//d;
188 $seq =~ tr/ \t\n\r//d;
190 $entry = [ $seq_name, $seq ];
192 return wantarray ? @{ $entry } : $entry;
198 # Martin A. Hansen, January 2007.
200 # Writes FASTA entries to STDOUT or file.
202 my ( $entry, # a FASTA entries
203 $fh, # file handle to output file - OPTIONAL
204 $wrap, # line width - OPTIONAL
209 Maasha::Common::error( qq(FASTA entry has no header) ) if not defined $entry->[ SEQ_NAME ];
210 Maasha::Common::error( qq(FASTA entry has no sequence) ) if not defined $entry->[ SEQ ];
213 Maasha::Fasta::wrap( $entry, $wrap );
217 print $fh ">$entry->[ SEQ_NAME ]\n$entry->[ SEQ ]\n";
219 print ">$entry->[ SEQ_NAME ]\n$entry->[ SEQ ]\n";
226 # Martin A. Hansen, June 2007.
228 # Given a stack of FASTA entries, find and return
229 # the shortest entry.
231 my ( $entries, # list of FASTA entries
236 my ( $min, $entry, $min_entry );
240 foreach $entry ( @{ $entries } )
242 if ( length( $entry->[ SEQ ] ) < $min )
245 $min = length $entry->[ SEQ ];
249 return wantarray ? @{ $min_entry } : $min_entry;
255 # Martin A. Hansen, June 2007.
257 # Given a stack of FASTA entries, find and return
260 my ( $entries, # list of FASTA entries
265 my ( $max, $entry, $max_entry );
269 foreach $entry ( @{ $entries } )
271 if ( length( $entry->[ SEQ ] ) > $max )
274 $max = length $entry->[ SEQ ];
278 return wantarray ? @{ $max_entry } : $max_entry;
282 sub fasta_get_headers
284 # Martin A. Hansen, May 2007.
286 # Gets the header names of a FASTA file,
287 # and returns these in a list.
289 my ( $path, # full path to FASTA file
294 my ( $fh, $entry, @list );
296 $fh = Maasha::Common::read_open( $path );
298 while ( $entry = get_entry( $fh ) ) {
299 push @list, $entry->[ SEQ_NAME ];
304 return wantarray ? @list : \@list;
310 # Martin A. Hansen, December 2004.
312 # Given a file of one or more FASTA entries, reformats these so
313 # each entry consits of one line with header and one line with sequence.
315 my ( $path, # full path to file with multiple FASTA entries
318 my ( $fh_in, $fh_out, $entry );
320 $fh_in = Maasha::Common::read_open( $path );
321 $fh_out = Maasha::Common::write_open( "$path.temp" );
323 while ( $entry = get_entry( $fh_in ) ) {
324 put_entry( $entry, $fh_out );
330 rename( "$path.temp", $path );
336 # Martin A. Hansen, July 2008.
338 # Given a path to a FASTA file create a FASTA index
339 # and save to the specified path.
341 my ( $fasta_path, # path to FASTA file
342 $index_path, # path to index file
349 $index = index_create( $fasta_path );
351 Maasha::Fasta::index_store( $index_path, $index );
356 # Matin A. Hansen, December 2004.
358 # Given a FASTA file formatted with one line of header and one line of sequence,
359 # returns a list of header, seq beg and seq length (first nucleotide is 0). Also,
360 # the file size of the indexed file is written to the index for checking purposes.
362 my ( $path, # full path to file with multiple FASTA entries
367 my ( $file_size, $fh, $entry, $beg, $len, %hash, @index );
369 $file_size = Maasha::Filesys::file_size( $path );
371 push @index, "FILE_SIZE=$file_size";
373 $fh = Maasha::Filesys::file_read_open( $path );
378 while ( $entry = get_entry( $fh ) )
380 warn qq(WARNING: header->$entry->[ SEQ_NAME ] alread exists in index) if exists $hash{ $entry->[ SEQ_NAME ] };
382 $beg += $len + 2 + length $entry->[ SEQ_NAME ];
383 $len = length $entry->[ SEQ ];
385 push @index, [ $entry->[ SEQ_NAME ], $beg, $len ];
387 $hash{ $entry->[ SEQ_NAME ] } = 1;
394 return wantarray ? @index : \@index;
400 # Martin A. Hansen, December 2004.
402 # Searches the index for matching entries.
404 my ( $index, # index list
405 $regex, # regex to match FASTA headers [OPTIONAL]
406 $invert, # invert matching
415 @results = @{ $index };
420 @results = grep { $_->[ 0 ] !~ /$regex/ } @{ $index };
422 @results = grep { $_->[ 0 ] =~ /$regex/ } @{ $index };
426 return wantarray ? @results : \@results;
432 # Martin A. Hansen, July 2007.
434 # Lookup a list of exact matches in the index and returns these
436 my ( $index, # index list
437 $headers, # headers to lookup
442 my ( %hash, $head, @results );
444 map { $hash{ $_->[ 0 ] } = [ $_->[ 1 ], $_->[ 2 ] ] } @{ $index };
446 foreach $head ( @{ $headers } )
448 if ( exists $hash{ $head } ) {
449 push @results, [ $head, $hash{ $head }->[ 0 ], $hash{ $head }->[ 1 ] ];
453 return wantarray ? @results : \@results;
459 # Martin A. Hansen, May 2007.
461 # Stores a FASTA index to binary file.
463 my ( $path, # full path to file
464 $index, # list with index
469 Maasha::Filesys::file_store( $path, $index );
475 # Martin A. Hansen, May 2007.
477 # Retrieves a FASTA index from binary file.
479 my ( $path, # full path to file
486 $index = Maasha::Filesys::file_retrieve( $path );
488 return wantarray ? @{ $index } : $index;
494 # Martin A. Hansen, July 2008.
496 # Given a biopiece record converts it to a FASTA record.
497 # If no generic SEQ or SEQ_NAME is found, the Q_* and S_* are
498 # tried in that order.
500 my ( $record, # record
505 my ( $seq_name, $seq );
507 $seq_name = $record->{ "SEQ_NAME" } || $record->{ "Q_ID" } || $record->{ "S_ID" };
508 $seq = $record->{ "SEQ" } || $record->{ "Q_SEQ" } || $record->{ "S_SEQ" };
510 if ( defined $seq_name and defined $seq ) {
511 return wantarray ? ( $seq_name, $seq ) : [ $seq_name, $seq ];
520 # Martin A. Hansen, May 2009
522 # Convert a FASTA record to a Biopiece record.
524 my ( $entry, # FASTA entry
531 if ( defined $entry->[ SEQ_NAME ] and $entry->[ SEQ ] )
534 SEQ_NAME => $entry->[ SEQ_NAME ],
535 SEQ => $entry->[ SEQ ],
536 SEQ_LEN => length $entry->[ SEQ ],
540 return wantarray ? %{ $record } : $record;
545 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<