3 # Copyright (C) 2009 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 # Routines for parsing and emitting KISS records.
27 # http://code.google.com/p/biopieces/wiki/KissFormat
30 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
39 use vars qw( @ISA @EXPORT );
41 @ISA = qw( Exporter );
56 INDEX_BLOCK_SIZE => 100,
57 INDEX_LEVEL => 100_000_000,
66 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
71 # Martin A. Hansen, November 2009.
73 # Gets the next KISS entry from a file handle.
75 my ( $fh, # file handle
80 my ( $line, @fields, %entry );
82 while ( $line = <$fh> )
86 next if $line =~ /^$|^#/;
88 @fields = split /\t/, $line;
90 Maasha::Common::error( qq(BAD kiss entry: $line) ) if not @fields == 12;
92 $entry{ 'S_ID' } = $fields[ S_ID ];
93 $entry{ 'S_BEG' } = $fields[ S_BEG ];
94 $entry{ 'S_END' } = $fields[ S_END ];
95 $entry{ 'Q_ID' } = $fields[ Q_ID ];
96 $entry{ 'SCORE' } = $fields[ SCORE ];
97 $entry{ 'STRAND' } = $fields[ STRAND ];
98 $entry{ 'HITS' } = $fields[ HITS ];
99 $entry{ 'ALIGN' } = $fields[ ALIGN ];
100 $entry{ 'BLOCK_COUNT' } = $fields[ BLOCK_COUNT ];
101 $entry{ 'BLOCK_BEGS' } = $fields[ BLOCK_BEGS ];
102 $entry{ 'BLOCK_LENS' } = $fields[ BLOCK_LENS ];
103 $entry{ 'BLOCK_TYPE' } = $fields[ BLOCK_TYPE ];
105 return wantarray ? %entry : \%entry;
112 # Martin A. Hansen, December 2009.
114 # Parses a line with a KISS entry.
116 my ( $line, # KISS line to parse
121 my ( @fields, %entry );
123 @fields = split /\t/, $line;
125 Maasha::Common::error( qq(BAD kiss entry: $line) ) if not @fields == 12;
127 $entry{ 'S_ID' } = $fields[ S_ID ];
128 $entry{ 'S_BEG' } = $fields[ S_BEG ];
129 $entry{ 'S_END' } = $fields[ S_END ];
130 $entry{ 'Q_ID' } = $fields[ Q_ID ];
131 $entry{ 'SCORE' } = $fields[ SCORE ];
132 $entry{ 'STRAND' } = $fields[ STRAND ];
133 $entry{ 'HITS' } = $fields[ HITS ];
134 $entry{ 'ALIGN' } = $fields[ ALIGN ];
135 $entry{ 'BLOCK_COUNT' } = $fields[ BLOCK_COUNT ];
136 $entry{ 'BLOCK_BEGS' } = $fields[ BLOCK_BEGS ];
137 $entry{ 'BLOCK_LENS' } = $fields[ BLOCK_LENS ];
138 $entry{ 'BLOCK_TYPE' } = $fields[ BLOCK_TYPE ];
140 return wantarray ? %entry : \%entry;
146 # Martin A. Hansen, November 2009.
148 # Outputs a KISS record to a given filehandle
149 # or STDOUT if no filehandle is given.
151 my ( $entry, # KISS entry to output
152 $fh, # file handle - OPTIONAL
159 if ( defined $entry->{ 'S_ID' } and
160 defined $entry->{ 'S_BEG' } and
161 defined $entry->{ 'S_END' }
164 Maasha::Common::error( qq(Bad S_BEG value: $entry->{ 'S_BEG' } < 0 ) ) if $entry->{ 'S_BEG' } < 0;
165 Maasha::Common::error( qq(Bad S_END value: $entry->{ 'S_END' } < $entry->{ 'S_BEG' }) ) if $entry->{ 'S_END' } < $entry->{ 'S_BEG' };
169 $fields[ S_ID ] = $entry->{ 'S_ID' };
170 $fields[ S_BEG ] = $entry->{ 'S_BEG' };
171 $fields[ S_END ] = $entry->{ 'S_END' };
172 $fields[ Q_ID ] = $entry->{ 'Q_ID' } || ".";
173 $fields[ SCORE ] = $entry->{ 'SCORE' } || ".";
174 $fields[ STRAND ] = $entry->{ 'STRAND' } || ".";
175 $fields[ HITS ] = $entry->{ 'HITS' } || ".";
176 $fields[ ALIGN ] = $entry->{ 'ALIGN' } || ".";
177 $fields[ BLOCK_COUNT ] = $entry->{ 'BLOCK_COUNT' } || ".";
178 $fields[ BLOCK_BEGS ] = $entry->{ 'BLOCK_BEGS' } || ".";
179 $fields[ BLOCK_LENS ] = $entry->{ 'BLOCK_LENS' } || ".";
180 $fields[ BLOCK_TYPE ] = $entry->{ 'BLOCK_TYPE' } || ".";
182 print $fh join( "\t", @fields ), "\n";
189 # Martin A. Hansen, November 2009.
191 # Sorts a KISS file on S_BEG and S_END
193 my ( $file, # KISS file
198 `sort -k 2,2n -k 3,3n $file > $file.sort`;
200 rename "$file.sort", $file;
206 # Martin A. Hansen, December 2009.
208 # Creates a lookup index of a sorted KISS file.
210 my ( $file, # path to KISS file
215 my ( $fh, $offset, $line, $s_beg, $bucket, $index );
217 $fh = Maasha::Filesys::file_read_open( $file );
221 while ( $line = <$fh> )
223 ( undef, $s_beg ) = split "\t", $line, 3;
225 $bucket = int( $s_beg / BUCKET_SIZE );
227 $index->[ $bucket ]->[ COUNT ]++;
228 $index->[ $bucket ]->[ OFFSET ] = $offset if not defined $index->[ $bucket ]->[ OFFSET ];
230 $offset += length $line;
235 Maasha::KISS::kiss_index_store( "$file.index", $index );
239 sub kiss_index_offset
241 # Martin A. Hansen, December 2009.
243 # Given a KISS index and a begin position,
244 # locate the offset closest to the begin position,
247 my ( $index, # KISS index
248 $beg, # begin position
255 Maasha::Common::error( qq(Negative begin position: "$beg") ) if $beg < 0;
257 $bucket = int( $beg / BUCKET_SIZE );
259 $bucket = scalar @{ $index } if $bucket > scalar @{ $index };
261 while ( $bucket >= 0 )
263 return $index->[ $bucket ]->[ OFFSET ] if defined $index->[ $bucket ];
272 # Martin A. Hansen, December 2009.
274 # Given a KISS index and a begin/end interval
275 # sum the number of counts in that interval,
278 my ( $index, # KISS index
279 $beg, # Begin position
285 my ( $bucket_beg, $bucket_end, $count, $i );
287 Maasha::Common::error( qq(Negative begin position: "$beg") ) if $beg < 0;
289 $bucket_beg = int( $beg / BUCKET_SIZE );
290 $bucket_end = int( $end / BUCKET_SIZE );
292 $bucket_end = scalar @{ $index } if $bucket_end > scalar @{ $index };
296 for ( $i = $bucket_beg; $i <= $bucket_end; $i++ ) {
297 $count += $index->[ $i ]->[ COUNT ] if defined $index->[ $i ];
304 sub kiss_index_get_entries
306 # Martin A. Hansen, November 2009.
308 # Given a path to a KISS file and a KISS index
309 # along with a beg/end interval, locate all entries
310 # in that interval and return those.
312 my ( $file, # path to KISS file
314 $beg, # interval begin
320 my ( $offset, $fh, $entry, @entries );
322 $offset = kiss_index_offset( $index, $beg );
324 $fh = Maasha::Filesys::file_read_open( $file );
326 sysseek( $fh, $offset, 0 );
328 while ( $entry = Maasha::KISS::kiss_entry_get( $fh ) )
330 push @entries, $entry if $entry->{ 'S_END' } > $beg;
332 last if $entry->{ 'S_BEG' } > $end;
337 return wantarray ? @entries : \@entries;
341 sub kiss_index_get_blocks
343 # Martin A. Hansen, December 2009.
345 # Given a KISS index returns blocks from this in a
346 # given position interval. Blocks consisting of
347 # hashes of BEG/END/COUNT.
349 my ( $index, # KISS index node
350 $beg, # interval begin
356 my ( $bucket_beg, $bucket_end, $i, @blocks );
358 Maasha::Common::error( qq(Negative begin position: "$beg") ) if $beg < 0;
360 $bucket_beg = int( $beg / BUCKET_SIZE );
361 $bucket_end = int( $end / BUCKET_SIZE );
363 $bucket_end = scalar @{ $index } if $bucket_end > scalar @{ $index };
365 for ( $i = $bucket_beg; $i <= $bucket_end; $i++ )
367 if ( defined $index->[ $i ] )
370 BEG => BUCKET_SIZE * $i,
371 END => BUCKET_SIZE * ( $i + 1 ),
372 COUNT => $index->[ $i ]->[ COUNT ],
377 return wantarray ? @blocks : \@blocks;
383 # Martin A. Hansen, December 2009.
385 # Given filehandles to two different unsorted KISS files
386 # intersect the entries so that all entries from file1 that
387 # overlap entries in file2 are returned - unless the inverse flag
388 # is given in which case entreis from file1 that does not
389 # overlap any entries in file2 are returned.
391 my ( $fh1, # filehandle to file1
392 $fh2, # filehandle to file2
393 $inverse, # flag indicating inverse matching - OPTIONAL
398 my ( $entry, %lookup, $pos, $overlap, @entries );
400 while ( $entry = kiss_entry_get( $fh2 ) ) {
401 map { $lookup{ $_ } = 1 } ( $entry->{ 'S_BEG' } .. $entry->{ 'S_END' } );
404 while ( $entry = kiss_entry_get( $fh1 ) )
408 foreach $pos ( $entry->{ 'S_BEG' } .. $entry->{ 'S_END' } )
410 if ( exists $lookup{ $pos } )
418 if ( $overlap and not $inverse ) {
419 push @entries, $entry;
420 } elsif ( not $overlap and $inverse ) {
421 push @entries, $entry;
425 return wantarray ? @entries : \@entries;
431 # Martin A. Hansen, November 2009.
433 # Stores a KISS index to a file.
435 my ( $path, # path to KISS index
441 Maasha::Filesys::file_store( $path, $index );
445 sub kiss_index_retrieve
447 # Martin A. Hansen, November 2009.
449 # Retrieves a KISS index from a file.
451 my ( $path, # Path to KISS index
454 # Returns a data structure.
458 $index = Maasha::Filesys::file_retrieve( $path );
460 return wantarray ? @{ $index } : $index;
466 # Martin A. Hansen, November 2009.
468 # Encodes alignment descriptors for two
471 my ( $s_seq, # Subject sequence reference
472 $q_seq, # Query sequence reference
473 $offset, # alternate offset - OPTIONAL
478 my ( $i, $s, $q, @align );
480 # Maasha::Common::error( "Sequence lengths don't match" ) if length ${ $s_seq } != length ${ $q_seq };
482 if ( length ${ $s_seq } != length ${ $q_seq } ) { # for unknown reasons this situation may occur - TODO FIXME
483 return wantarray ? () : [];
488 for ( $i = 0; $i < length ${ $s_seq }; $i++ )
490 $s = uc substr ${ $s_seq }, $i, 1;
491 $q = uc substr ${ $q_seq }, $i, 1;
493 if ( $s eq '-' and $q eq '-' )
503 push @align, "$offset:$s>$q";
505 $offset++ if not $s eq '-';
509 return wantarray ? @align : \@align;
515 # Martin A. Hansen, November 2009.
517 # Test routine of correct resolve of ALIGN descriptors.
519 # S.aur_COL 41 75 5_vnlh2ywXsN1/1 1 - . 17:A>T,31:A>N . . . .
521 my ( $s_seqref, # subject sequence reference
527 my ( $align, $pos, $s_symbol, $q_symbol, $s_seq, $q_seq, $insertions );
529 $s_seq = substr ${ $s_seqref }, $entry->{ 'S_BEG' }, $entry->{ 'S_END' } - $entry->{ 'S_BEG' } + 1;
534 foreach $align ( split /,/, $entry->{ 'ALIGN' } )
536 if ( $align =~ /(\d+):(.)>(.)/ )
542 if ( $s_symbol eq '-' ) # insertion
544 substr $s_seq, $pos + $insertions, 0, $s_symbol;
545 substr $q_seq, $pos + $insertions, 0, $q_symbol;
549 elsif ( $q_symbol eq '-' ) # deletion
551 substr $q_seq, $pos + $insertions, 1, $q_symbol;
555 substr $q_seq, $pos + $insertions, 1, $q_symbol;
560 Maasha::Common::error( qq(Bad alignment descriptor: "$align") );
564 Maasha::Align::align_print_pairwise( [ $entry->{ 'S_ID' }, $s_seq ], [ $entry->{ 'Q_ID' }, $q_seq ] );
570 # Martin A. Hansen, November 2009.
572 # Converts a KISS entry to a Biopiece record.
573 # TODO: Consistency checking
575 my ( $entry, # KISS entry
580 return wantarray ? %{ $entry } : $entry;
586 # Martin A. Hansen, November 2009.
588 # Converts a Biopiece record to a KISS entry.
590 my ( $record, # Biopiece record
595 if ( not defined $record->{ 'S_ID' } and
596 not defined $record->{ 'S_BEG' } and
597 not defined $record->{ 'S_END' } )
602 $record->{ 'SCORE' } ||= $record->{ 'E_VAL' } || ".";
603 $record->{ 'HITS' } ||= ".";
604 $record->{ 'BLOCK_COUNT' } ||= ".";
605 $record->{ 'BLOCK_BEGS' } ||= ".";
606 $record->{ 'BLOCK_LENS' } ||= ".";
607 $record->{ 'BLOCK_TYPE' } ||= ".";
608 $record->{ 'ALIGN' } ||= $record->{ 'DESCRIPTOR' } || ".";
610 return wantarray ? %{ $record } : $record;
614 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<