3 # Copyright (C) 2006-2009 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 # Routines for manipulation of FASTA files and FASTA entries.
28 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
38 use vars qw ( @ISA @EXPORT );
40 @ISA = qw( Exporter );
48 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
53 # Martin A. Hansen, January 2007.
55 # Given a filehandle to an FASTA file,
56 # fetches the next FASTA entry, and returns
57 # this as a tuple of [ header, sequence ].
59 my ( $fh, # filehandle to FASTA file
64 my ( $block, @lines, $seq_name, $seq, $entry );
68 while ( $block = <$fh> )
72 last if $block !~ /^\s+$/;
75 return if not defined $block;
77 # $block =~ />?([^\n]+)\n/m;
78 $block =~ /^>?(.+)\n/m;
86 $seq_name =~ tr/\r//d;
87 $seq =~ tr/ \t\n\r//d;
89 $entry = [ $seq_name, $seq ];
91 return wantarray ? @{ $entry } : $entry;
97 # Martin A. Hansen, January 2007.
99 # Writes a FASTA entry to a file handle.
101 my ( $entry, # a FASTA entries
102 $fh, # file handle to output file - OPTIONAL
103 $wrap, # line width - OPTIONAL
108 Maasha::Common::error( qq(FASTA entry has no header) ) if not defined $entry->[ SEQ_NAME ];
109 Maasha::Common::error( qq(FASTA entry has no sequence) ) if not defined $entry->[ SEQ ];
112 Maasha::Fasta::wrap( $entry, $wrap );
116 print $fh ">$entry->[ SEQ_NAME ]\n$entry->[ SEQ ]\n";
118 print ">$entry->[ SEQ_NAME ]\n$entry->[ SEQ ]\n";
125 # Martin A. Hansen, September 2009
127 # Write FASTA entries to a file.
129 my ( $entries, # list of FASTA entries
131 $wrap, # line width - OPTIONAL
138 $fh = Maasha::Filesys::file_write_open( $file );
140 map { put_entry( $_, $fh, $wrap ) } @{ $entries };
148 # Martin A. Hansen, June 2007
150 # Wraps the sequence of a given FASTA entry
153 my ( $entry, # FASTA entry
159 Maasha::Seq::wrap( \$entry->[ SEQ ], $wrap );
165 # Martin A. Hansen, July 2008.
167 # Given a path to a FASTA file create a FASTA index
168 # and save to the specified path.
170 my ( $fasta_path, # path to FASTA file
171 $index_dir, # directory
172 $index_name, # path to index file
179 $index = index_create( $fasta_path );
181 Maasha::Filesys::dir_create_if_not_exists( $index_dir );
183 Maasha::Fasta::index_store( "$index_dir/$index_name", $index );
189 # Martin A. Hansen, September 2009.
191 # Given a path to a FASTA file and a FASTA index
192 # check if the FILESIZE matches.
194 my ( $fasta_path, # path to FASTA file
195 $index_path, # path to index file
202 $index = Maasha::Fasta::index_retrieve( $index_path );
204 if ( Maasha::Filesys::file_size( $fasta_path ) == $index->{ 'FILE_SIZE' } ) {
214 # Matin A. Hansen, December 2004.
216 # Given a FASTA file formatted with one line of header and one line of sequence,
217 # returns a list of header, seq offset and seq length (first nucleotide is 0). Also,
218 # the file size of the indexed file is written to the index for checking purposes.
220 my ( $path, # full path to file with multiple FASTA entries
225 my ( $fh, $entry, $offset, $len, $tot, %index );
227 $index{ 'FILE_SIZE' } = Maasha::Filesys::file_size( $path );
233 $fh = Maasha::Filesys::file_read_open( $path );
235 while ( $entry = get_entry( $fh ) )
237 warn qq(WARNING: header->$entry->[ SEQ_NAME ] alread exists in index) if exists $index{ $entry->[ SEQ_NAME ] };
239 $offset += $len + 2 + length $entry->[ SEQ_NAME ];
240 $len = length $entry->[ SEQ ];
242 $index{ $entry->[ SEQ_NAME ] } = [ $offset, $len ];
246 $tot += length( $entry->[ SEQ_NAME ] ) + length( $entry->[ SEQ ] ) + 3;
251 if ( $index{ 'FILE_SIZE' } != $tot ) {
252 Maasha::Common::error( qq(Filesize "$index{ 'FILE_SIZE' }" does not match content "$tot" -> malformatted FASTA file) );
255 return wantarray ? %index : \%index;
261 # Martin A. Hansen, July 2007.
263 # Lookup a list of exact matches in the index and returns these.
265 my ( $index, # index list
266 $headers, # headers to lookup
271 my ( $head, @results );
273 foreach $head ( @{ $headers } )
275 if ( exists $index->{ $head } ) {
276 push @results, [ $head, $index->{ $head }->[ 0 ], $index->{ $head }->[ 1 ] ];
280 return wantarray ? @results : \@results;
286 # Martin A. Hansen, May 2007.
288 # Stores a FASTA index to binary file.
290 my ( $path, # full path to file
291 $index, # list with index
296 Maasha::Filesys::file_store( $path, $index );
302 # Martin A. Hansen, May 2007.
304 # Retrieves a FASTA index from binary file.
306 my ( $path, # full path to file
313 $index = Maasha::Filesys::file_retrieve( $path );
315 return wantarray ? @{ $index } : $index;
321 # Martin A. Hansen, July 2008.
323 # Given a biopiece record converts it to a FASTA record.
324 # If no generic SEQ or SEQ_NAME is found, the Q_* and S_* are
325 # tried in that order.
327 my ( $record, # record
332 my ( $seq_name, $seq );
334 $seq_name = $record->{ "SEQ_NAME" } || $record->{ "Q_ID" } || $record->{ "S_ID" };
335 $seq = $record->{ "SEQ" } || $record->{ "Q_SEQ" } || $record->{ "S_SEQ" };
337 if ( defined $seq_name and defined $seq ) {
338 return wantarray ? ( $seq_name, $seq ) : [ $seq_name, $seq ];
347 # Martin A. Hansen, May 2009
349 # Convert a FASTA record to a Biopiece record.
351 my ( $entry, # FASTA entry
358 if ( defined $entry->[ SEQ_NAME ] and $entry->[ SEQ ] )
361 SEQ_NAME => $entry->[ SEQ_NAME ],
362 SEQ => $entry->[ SEQ ],
363 SEQ_LEN => length $entry->[ SEQ ],
367 return wantarray ? %{ $record } : $record;
372 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<