1 package Maasha::KISS::IO;
3 # Copyright (C) 2009 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 # Routines for parsing and emitting KISS records.
27 # http://code.google.com/p/biopieces/wiki/KissFormat
30 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
40 use vars qw( @ISA @EXPORT );
42 @ISA = qw( Exporter );
57 INDEX_BLOCK_SIZE => 100,
58 INDEX_LEVEL => 100_000_000,
63 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
68 my ( $fh, # file handle
73 my ( $line, @fields, %entry );
75 while ( $line = <$fh> )
79 next if $line =~ /^$|^#/;
81 @fields = split /\t/, $line;
83 Maasha::Common::error( qq(BAD kiss entry: $line) ) if not @fields == 12;
85 $entry{ 'S_ID' } = $fields[ S_ID ];
86 $entry{ 'S_BEG' } = $fields[ S_BEG ];
87 $entry{ 'S_END' } = $fields[ S_END ];
88 $entry{ 'Q_ID' } = $fields[ Q_ID ];
89 $entry{ 'SCORE' } = $fields[ SCORE ];
90 $entry{ 'STRAND' } = $fields[ STRAND ];
91 $entry{ 'HITS' } = $fields[ HITS ];
92 $entry{ 'ALIGN' } = $fields[ ALIGN ];
93 $entry{ 'BLOCK_COUNT' } = $fields[ BLOCK_COUNT ];
94 $entry{ 'BLOCK_BEGS' } = $fields[ BLOCK_BEGS ];
95 $entry{ 'BLOCK_LENS' } = $fields[ BLOCK_LENS ];
96 $entry{ 'BLOCK_TYPE' } = $fields[ BLOCK_TYPE ];
98 return wantarray ? %entry : \%entry;
105 my ( $entry, # KISS entry to output
106 $fh, # file handle - OPTIONAL
113 if ( defined $entry->{ 'S_ID' } and
114 defined $entry->{ 'S_BEG' } and
115 defined $entry->{ 'S_END' }
120 $fields[ S_ID ] = $entry->{ 'S_ID' };
121 $fields[ S_BEG ] = $entry->{ 'S_BEG' };
122 $fields[ S_END ] = $entry->{ 'S_END' };
123 $fields[ Q_ID ] = $entry->{ 'Q_ID' };
124 $fields[ SCORE ] = $entry->{ 'SCORE' };
125 $fields[ STRAND ] = $entry->{ 'STRAND' };
126 $fields[ HITS ] = $entry->{ 'HITS' };
127 $fields[ ALIGN ] = $entry->{ 'ALIGN' };
128 $fields[ BLOCK_COUNT ] = $entry->{ 'BLOCK_COUNT' };
129 $fields[ BLOCK_BEGS ] = $entry->{ 'BLOCK_BEGS' };
130 $fields[ BLOCK_LENS ] = $entry->{ 'BLOCK_LENS' };
131 $fields[ BLOCK_TYPE ] = $entry->{ 'BLOCK_TYPE' };
133 print $fh join( "\t", @fields ), "\n";
140 my ( $dbh, # Database handle
142 $s_beg, # Subject begin
143 $s_end, # Subject end
146 my ( $sql, $entries );
148 # $sql = "SELECT * FROM $table WHERE S_BEG >= $s_beg AND S_END <= $s_end ORDER BY S_BEG,S_END";
149 $sql = "SELECT S_BEG,S_END,Q_ID,ALIGN FROM $table WHERE S_BEG >= $s_beg AND S_END <= $s_end";
151 $entries = Maasha::SQL::query_hashref_list( $dbh, $sql );
153 return wantarray ? @{ $entries } : $entries;
159 # Martin A. Hansen, November 2009.
161 # Sorts a KISS file on S_BEG and S_END
163 my ( $file, # KISS file
168 `sort -k 2,2n -k 3,3n $file > $file.sort`;
170 rename "$file.sort", $file;
176 # Martin A, Hansen, November 2009.
178 # Creates an index of a sorted KISS file.
180 my ( $file, # KISS file
185 my ( $tree, $offset, $fh, $line, $beg );
190 $fh = Maasha::Filesys::file_read_open( $file );
192 while ( $line = <$fh> )
194 ( undef, $beg ) = split "\t", $line, 3;
196 kiss_index_node_add( $tree, INDEX_LEVEL, INDEX_FACTOR, $beg, $offset );
198 $offset += length $line;
203 kiss_index_store( "$file.index", $tree );
207 sub kiss_index_node_add
209 # Martin A, Hansen, November 2009.
211 # Recursive routine to add nodes to a tree.
224 $bucket = int( $beg / $level );
226 if ( $level >= $factor )
228 $sum += $bucket * $level;
229 $beg -= $bucket * $level;
231 $node->{ 'CHILDREN' }->[ $bucket ]->{ 'COUNT' }++;
232 # $node->{ 'CHILDREN' }->[ $bucket ]->{ 'LEVEL' } = $level;
233 # $node->{ 'CHILDREN' }->[ $bucket ]->{ 'BUCKET' } = $bucket;
234 $node->{ 'CHILDREN' }->[ $bucket ]->{ 'BEG' } = $sum;
235 $node->{ 'CHILDREN' }->[ $bucket ]->{ 'END' } = $sum + $level - 1;
236 $node->{ 'CHILDREN' }->[ $bucket ]->{ 'OFFSET' } = $offset if not defined $node->{ 'CHILDREN' }->[ $bucket ]->{ 'OFFSET' };
238 kiss_index_node_add( $node->{ 'CHILDREN' }->[ $bucket ], $level / $factor, $factor, $beg, $offset, $sum );
243 sub kiss_index_offset
245 # Martin A. Hansen, November 2009.
247 # Given a KISS index and a begin position,
248 # locate the offset closest to the begin position,
251 my ( $index, # KISS index
252 $beg, # begin position
253 $level, # index level - OPTIONAL
254 $factor, # index factor - OPTIONAL
259 my ( $child, $offset );
261 $level ||= INDEX_LEVEL;
262 $factor ||= INDEX_FACTOR;
264 foreach $child ( @{ $index->{ 'CHILDREN' } } )
266 next if not defined $child;
268 if ( $child->{ 'BEG' } <= $beg and $beg <= $child->{ 'END' } )
270 if ( $level == $factor ) {
271 $offset = $child->{ 'OFFSET' };
273 $offset = kiss_index_offset( $child, $beg, $level / $factor, $factor );
284 # Martin A. Hansen, November 2009.
286 # Given a KISS index and a begin/end interval
287 # sum the number of counts in that interval,
290 my ( $index, # KISS index
291 $beg, # Begin position
293 $level, # index level - OPTIONAL
294 $factor, # index factor - OPTIONAL
299 my ( $count, $child );
301 $level ||= INDEX_LEVEL;
302 $factor ||= INDEX_FACTOR;
305 foreach $child ( @{ $index->{ 'CHILDREN' } } )
307 next if not defined $child;
309 if ( $level >= $factor )
311 if ( Maasha::Calc::overlap( $beg, $end, $child->{ 'BEG' }, $child->{ 'END' } ) )
313 $count += $child->{ 'COUNT' } if $level == $factor;
314 $count += kiss_index_count( $child, $beg, $end, $level / $factor, $factor );
329 Maasha::Filesys::file_store( $path, $index );
333 sub kiss_index_retrieve
340 $index = Maasha::Filesys::file_retrieve( $path );
342 return wantarray ? @{ $index } : $index;
346 sub kiss_index_get_entries
354 my ( $offset, $fh, $entry, @entries );
356 $offset = kiss_index_offset( $index, $beg );
358 $fh = Maasha::Filesys::file_read_open( $file );
360 sysseek( $fh, $offset, 0 );
362 while ( $entry = kiss_entry_get( $fh ) )
364 push @entries, $entry if $entry->{ 'S_END' } > $beg;
366 last if $entry->{ 'S_BEG' } > $end;
371 return wantarray ? @entries : \@entries;
375 sub kiss_index_get_blocks
380 $level, # index level - OPTIONAL
381 $factor, # index factor - OPTIONAL
387 my ( $len, @blocks, $child );
389 $level ||= INDEX_LEVEL;
390 $factor ||= INDEX_FACTOR;
392 $size ||= 100; # TODO: lazy list loading?
394 # if ( not defined $size )
396 # $len = $end - $beg + 1;
398 # if ( $len > 100_000_000 ) {
400 # } elsif ( $len > 1_000_000 ) {
407 if ( $level >= $size )
409 foreach $child ( @{ $index->{ 'CHILDREN' } } )
411 next if not defined $child;
413 if ( Maasha::Calc::overlap( $beg, $end, $child->{ 'BEG' }, $child->{ 'END' } ) )
415 if ( $level == $size )
418 BEG => $child->{ 'BEG' },
419 END => $child->{ 'END' },
420 COUNT => $child->{ 'COUNT' },
424 push @blocks, kiss_index_get_blocks( $child, $beg, $end, $level / $factor, $factor, $size );
429 return wantarray ? @blocks : \@blocks;
435 # S.aur_COL 41 75 5_vnlh2ywXsN1/1 1 - . 17:A>T,31:A>N . . . .
437 my ( $s_seqref, # subject sequence reference
443 my ( $align, $pos, $s_symbol, $q_symbol, $s_seq, $q_seq, $insertions );
445 $s_seq = substr ${ $s_seqref }, $entry->{ 'S_BEG' }, $entry->{ 'S_END' } - $entry->{ 'S_BEG' } + 1;
450 foreach $align ( split /,/, $entry->{ 'ALIGN' } )
452 if ( $align =~ /(\d+):(.)>(.)/ )
458 if ( $s_symbol eq '-' ) # insertion
460 substr $s_seq, $pos + $insertions, 0, $s_symbol;
461 substr $q_seq, $pos + $insertions, 0, $q_symbol;
465 elsif ( $q_symbol eq '-' ) # deletion
467 substr $q_seq, $pos + $insertions, 1, $q_symbol;
471 substr $q_seq, $pos + $insertions, 1, $q_symbol;
476 Maasha::Common::error( qq(Bad alignment descriptor: "$align") );
480 Maasha::Align::align_print_pairwise( [ $entry->{ 'S_ID' }, $s_seq ], [ $entry->{ 'Q_ID' }, $q_seq ] );
486 my ( $entry, # KISS entry
489 return wantarray ? %{ $entry } : $entry;
495 my ( $record, # Biopiece record
498 $record->{ 'SCORE' } ||= $record->{ 'E_VAL' } || ".";
499 $record->{ 'HITS' } ||= ".";
500 $record->{ 'BLOCK_COUNT' } ||= ".";
501 $record->{ 'BLOCK_BEGS' } ||= ".";
502 $record->{ 'BLOCK_LENS' } ||= ".";
503 $record->{ 'BLOCK_TYPE' } ||= ".";
504 $record->{ 'ALIGN' } ||= $record->{ 'DESCRIPTOR' } || ".";
506 return wantarray ? %{ $record } : $record;
510 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
518 # Martin A, Hansen, October 2009.
520 # Creates an index of a sorted KISS file that
521 # allowing the location of the byte position
522 # from where records can be read given a
523 # specific S_BEG position. The index consists of
524 # triples: [ beg, end, bytepos ], where beg and
525 # end denotes the interval where the next KISS
526 # record begins at bytepos.
528 my ( $fh, # filehandle to KISS file
533 my ( $line, @fields, $beg, $end, $pos, @index );
538 while ( $line = <$fh> )
542 @fields = split /\t/, $line, 3;
544 $end = $fields[ S_BEG ];
548 push @index, [ $beg, $end, $pos ];
551 elsif ( $end > $beg )
553 push @index, [ $beg, $end - 1, $pos ];
556 elsif ( $end < $beg )
558 Maasha::Common::error( qq(KISS file not sorted: beg > end -> $beg > $end) );
561 $pos += 1 + length $line;
564 return wantarray ? @index : \@index;
569 sub kiss_index_search
577 my ( $high, $low, $try );
580 $high = scalar @{ $index };
582 while ( $low <= $high )
584 $try = int( ( $high + $low ) / 2 );
586 if ( $num < $index->[ $try ]->[ 0 ] ) {
588 } elsif ( $num > $index->[ $try ]->[ 1 ] ) {
591 return $index->[ $try ]->[ 2 ];
595 Maasha::Common::error( "Could not find number->$num in index" );