3 # Copyright (C) 2009 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 # Routines for parsing and emitting KISS records.
27 # http://code.google.com/p/biopieces/wiki/KissFormat
30 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
39 use vars qw( @ISA @EXPORT );
41 @ISA = qw( Exporter );
56 INDEX_BLOCK_SIZE => 100,
57 INDEX_LEVEL => 100_000_000,
62 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
67 # Martin A. Hansen, November 2009.
69 # Gets the next KISS entry from a file handle.
71 my ( $fh, # file handle
76 my ( $line, @fields, %entry );
78 while ( $line = <$fh> )
82 next if $line =~ /^$|^#/;
84 @fields = split /\t/, $line;
86 Maasha::Common::error( qq(BAD kiss entry: $line) ) if not @fields == 12;
88 $entry{ 'S_ID' } = $fields[ S_ID ];
89 $entry{ 'S_BEG' } = $fields[ S_BEG ];
90 $entry{ 'S_END' } = $fields[ S_END ];
91 $entry{ 'Q_ID' } = $fields[ Q_ID ];
92 $entry{ 'SCORE' } = $fields[ SCORE ];
93 $entry{ 'STRAND' } = $fields[ STRAND ];
94 $entry{ 'HITS' } = $fields[ HITS ];
95 $entry{ 'ALIGN' } = $fields[ ALIGN ];
96 $entry{ 'BLOCK_COUNT' } = $fields[ BLOCK_COUNT ];
97 $entry{ 'BLOCK_BEGS' } = $fields[ BLOCK_BEGS ];
98 $entry{ 'BLOCK_LENS' } = $fields[ BLOCK_LENS ];
99 $entry{ 'BLOCK_TYPE' } = $fields[ BLOCK_TYPE ];
101 return wantarray ? %entry : \%entry;
108 # Martin A. Hansen, December 2009.
110 # Parses a line with a KISS entry.
112 my ( $line, # KISS line to parse
117 my ( @fields, %entry );
119 @fields = split /\t/, $line;
121 Maasha::Common::error( qq(BAD kiss entry: $line) ) if not @fields == 12;
123 $entry{ 'S_ID' } = $fields[ S_ID ];
124 $entry{ 'S_BEG' } = $fields[ S_BEG ];
125 $entry{ 'S_END' } = $fields[ S_END ];
126 $entry{ 'Q_ID' } = $fields[ Q_ID ];
127 $entry{ 'SCORE' } = $fields[ SCORE ];
128 $entry{ 'STRAND' } = $fields[ STRAND ];
129 $entry{ 'HITS' } = $fields[ HITS ];
130 $entry{ 'ALIGN' } = $fields[ ALIGN ];
131 $entry{ 'BLOCK_COUNT' } = $fields[ BLOCK_COUNT ];
132 $entry{ 'BLOCK_BEGS' } = $fields[ BLOCK_BEGS ];
133 $entry{ 'BLOCK_LENS' } = $fields[ BLOCK_LENS ];
134 $entry{ 'BLOCK_TYPE' } = $fields[ BLOCK_TYPE ];
136 return wantarray ? %entry : \%entry;
142 # Martin A. Hansen, November 2009.
144 # Outputs a KISS record to a given filehandle
145 # or STDOUT if no filehandle is given.
147 my ( $entry, # KISS entry to output
148 $fh, # file handle - OPTIONAL
155 if ( defined $entry->{ 'S_ID' } and
156 defined $entry->{ 'S_BEG' } and
157 defined $entry->{ 'S_END' }
160 Maasha::Common::error( qq(Bad S_BEG value: $entry->{ 'S_BEG' } < 0 ) ) if $entry->{ 'S_BEG' } < 0;
161 Maasha::Common::error( qq(Bad S_END value: $entry->{ 'S_END' } < $entry->{ 'S_BEG' }) ) if $entry->{ 'S_END' } < $entry->{ 'S_BEG' };
165 $fields[ S_ID ] = $entry->{ 'S_ID' };
166 $fields[ S_BEG ] = $entry->{ 'S_BEG' };
167 $fields[ S_END ] = $entry->{ 'S_END' };
168 $fields[ Q_ID ] = $entry->{ 'Q_ID' } || ".";
169 $fields[ SCORE ] = $entry->{ 'SCORE' } || ".";
170 $fields[ STRAND ] = $entry->{ 'STRAND' } || ".";
171 $fields[ HITS ] = $entry->{ 'HITS' } || ".";
172 $fields[ ALIGN ] = $entry->{ 'ALIGN' } || ".";
173 $fields[ BLOCK_COUNT ] = $entry->{ 'BLOCK_COUNT' } || ".";
174 $fields[ BLOCK_BEGS ] = $entry->{ 'BLOCK_BEGS' } || ".";
175 $fields[ BLOCK_LENS ] = $entry->{ 'BLOCK_LENS' } || ".";
176 $fields[ BLOCK_TYPE ] = $entry->{ 'BLOCK_TYPE' } || ".";
178 print $fh join( "\t", @fields ), "\n";
185 # Martin A. Hansen, November 2009.
187 # Sorts a KISS file on S_BEG and S_END
189 my ( $file, # KISS file
194 `sort -k 2,2n -k 3,3n $file > $file.sort`;
196 rename "$file.sort", $file;
202 # Martin A. Hansen, December 2009.
204 # Given filehandles to two different unsorted KISS files
205 # intersect the entries so that all entries from file1 that
206 # overlap entries in file2 are returned - unless the inverse flag
207 # is given in which case entreis from file1 that does not
208 # overlap any entries in file2 are returned.
210 my ( $fh1, # filehandle to file1
211 $fh2, # filehandle to file2
212 $inverse, # flag indicating inverse matching - OPTIONAL
217 my ( $entry, %lookup, $pos, $overlap, @entries );
219 while ( $entry = kiss_entry_get( $fh2 ) ) {
220 map { $lookup{ $_ } = 1 } ( $entry->{ 'S_BEG' } .. $entry->{ 'S_END' } );
223 while ( $entry = kiss_entry_get( $fh1 ) )
227 foreach $pos ( $entry->{ 'S_BEG' } .. $entry->{ 'S_END' } )
229 if ( exists $lookup{ $pos } )
237 if ( $overlap and not $inverse ) {
238 push @entries, $entry;
239 } elsif ( not $overlap and $inverse ) {
240 push @entries, $entry;
244 return wantarray ? @entries : \@entries;
250 # Martin A, Hansen, November 2009.
252 # Creates an index of a sorted KISS file.
254 my ( $file, # KISS file
259 my ( $tree, $offset, $fh, $line, $beg );
264 $fh = Maasha::Filesys::file_read_open( $file );
266 while ( $line = <$fh> )
268 ( undef, $beg ) = split "\t", $line, 3;
270 kiss_index_node_add( $tree, INDEX_LEVEL, INDEX_FACTOR, $beg, $offset );
272 $offset += length $line;
277 kiss_index_store( "$file.index", $tree );
281 sub kiss_index_node_add
283 # Martin A, Hansen, November 2009.
285 # Recursive routine to add nodes to a tree.
298 $bucket = int( $beg / $level );
300 if ( $level >= $factor )
302 $sum += $bucket * $level;
303 $beg -= $bucket * $level;
305 $node->{ 'CHILDREN' }->[ $bucket ]->{ 'COUNT' }++;
306 # $node->{ 'CHILDREN' }->[ $bucket ]->{ 'LEVEL' } = $level;
307 # $node->{ 'CHILDREN' }->[ $bucket ]->{ 'BUCKET' } = $bucket;
308 $node->{ 'CHILDREN' }->[ $bucket ]->{ 'BEG' } = $sum;
309 $node->{ 'CHILDREN' }->[ $bucket ]->{ 'END' } = $sum + $level - 1;
310 $node->{ 'CHILDREN' }->[ $bucket ]->{ 'OFFSET' } = $offset if not defined $node->{ 'CHILDREN' }->[ $bucket ]->{ 'OFFSET' };
312 kiss_index_node_add( $node->{ 'CHILDREN' }->[ $bucket ], $level / $factor, $factor, $beg, $offset, $sum );
317 sub kiss_index_offset
319 # Martin A. Hansen, November 2009.
321 # Given a KISS index and a begin position,
322 # locate the offset closest to the begin position,
325 my ( $index, # KISS index
326 $beg, # begin position
327 $level, # index level - OPTIONAL
328 $factor, # index factor - OPTIONAL
333 my ( $child, $offset );
335 $level ||= INDEX_LEVEL;
336 $factor ||= INDEX_FACTOR;
338 foreach $child ( @{ $index->{ 'CHILDREN' } } )
340 next if not defined $child;
342 if ( $child->{ 'BEG' } <= $beg and $beg <= $child->{ 'END' } )
344 if ( $level == $factor ) {
345 $offset = $child->{ 'OFFSET' };
347 $offset = kiss_index_offset( $child, $beg, $level / $factor, $factor );
352 # Maasha::Common::error( "No offset" ) if not defined $offset; # FIXME
360 # Martin A. Hansen, November 2009.
362 # Given a KISS index and a begin/end interval
363 # sum the number of counts in that interval,
366 my ( $index, # KISS index
367 $beg, # Begin position
369 $level, # index level - OPTIONAL
370 $factor, # index factor - OPTIONAL
375 my ( $count, $child );
377 $level ||= INDEX_LEVEL;
378 $factor ||= INDEX_FACTOR;
381 foreach $child ( @{ $index->{ 'CHILDREN' } } )
383 next if not defined $child;
385 if ( $level >= $factor )
387 if ( Maasha::Calc::overlap( $beg, $end, $child->{ 'BEG' }, $child->{ 'END' } ) )
389 $count += $child->{ 'COUNT' } if $level == $factor;
390 $count += kiss_index_count( $child, $beg, $end, $level / $factor, $factor );
401 # Martin A. Hansen, November 2009.
403 # Stores a KISS index to a file.
405 my ( $path, # path to KISS index
411 Maasha::Filesys::file_store( $path, $index );
415 sub kiss_index_retrieve
417 # Martin A. Hansen, November 2009.
419 # Retrieves a KISS index from a file.
421 my ( $path, # Path to KISS index
424 # Returns a data structure.
428 $index = Maasha::Filesys::file_retrieve( $path );
430 return wantarray ? @{ $index } : $index;
434 sub kiss_index_get_entries
436 # Martin A. Hansen, November 2009.
438 # Given a path to a KISS file and a KISS index
439 # along with a beg/end interval, locate all entries
440 # in that interval and return those.
442 my ( $file, # path to KISS file
444 $beg, # interval begin
450 my ( $offset, $fh, $entry, @entries );
452 $offset = kiss_index_offset( $index, $beg );
454 $fh = Maasha::Filesys::file_read_open( $file );
456 sysseek( $fh, $offset, 0 );
458 while ( $entry = kiss_entry_get( $fh ) )
460 push @entries, $entry if $entry->{ 'S_END' } > $beg;
462 last if $entry->{ 'S_BEG' } > $end;
467 return wantarray ? @entries : \@entries;
471 sub kiss_index_get_blocks
473 # Martin A. Hansen, November 2009.
475 # Given a KISS index recursively traverse
476 # this into the appropriate node size determined
477 # by the size of the given beg/end interval.
478 # Blocks consisting of hashes of BEG/END/COUNT
479 # are returned from the requested node size.
481 my ( $index, # KISS index node
482 $beg, # interval begin
484 $level, # index level - OPTIONAL
485 $factor, # index factor - OPTIONAL
486 $size, # requested node size
491 my ( $len, @blocks, $child );
493 $level ||= INDEX_LEVEL;
494 $factor ||= INDEX_FACTOR;
496 $size ||= 100; # TODO: lazy list loading?
498 # if ( not defined $size )
500 # $len = $end - $beg + 1;
502 # if ( $len > 100_000_000 ) {
504 # } elsif ( $len > 1_000_000 ) {
511 if ( $level >= $size )
513 foreach $child ( @{ $index->{ 'CHILDREN' } } )
515 next if not defined $child;
517 if ( Maasha::Calc::overlap( $beg, $end, $child->{ 'BEG' }, $child->{ 'END' } ) )
519 if ( $level == $size )
522 BEG => $child->{ 'BEG' },
523 END => $child->{ 'END' },
524 COUNT => $child->{ 'COUNT' },
528 push @blocks, kiss_index_get_blocks( $child, $beg, $end, $level / $factor, $factor, $size );
533 return wantarray ? @blocks : \@blocks;
539 # Martin A. Hansen, November 2009.
541 # Encodes alignment descriptors for two
544 my ( $s_seq, # Subject sequence reference
545 $q_seq, # Query sequence reference
546 $offset, # alternate offset - OPTIONAL
551 my ( $i, $s, $q, @align );
553 Maasha::Common::error( "Sequence lengths don't match" ) if length ${ $s_seq } != length ${ $q_seq };
557 for ( $i = 0; $i < length ${ $s_seq }; $i++ )
559 $s = uc substr ${ $s_seq }, $i, 1;
560 $q = uc substr ${ $q_seq }, $i, 1;
562 if ( $s eq '-' and $q eq '-' )
572 push @align, "$offset:$s>$q";
574 $offset++ if not $s eq '-';
578 return wantarray ? @align : \@align;
584 # Martin A. Hansen, November 2009.
586 # Test routine of correct resolve of ALIGN descriptors.
588 # S.aur_COL 41 75 5_vnlh2ywXsN1/1 1 - . 17:A>T,31:A>N . . . .
590 my ( $s_seqref, # subject sequence reference
596 my ( $align, $pos, $s_symbol, $q_symbol, $s_seq, $q_seq, $insertions );
598 $s_seq = substr ${ $s_seqref }, $entry->{ 'S_BEG' }, $entry->{ 'S_END' } - $entry->{ 'S_BEG' } + 1;
603 foreach $align ( split /,/, $entry->{ 'ALIGN' } )
605 if ( $align =~ /(\d+):(.)>(.)/ )
611 if ( $s_symbol eq '-' ) # insertion
613 substr $s_seq, $pos + $insertions, 0, $s_symbol;
614 substr $q_seq, $pos + $insertions, 0, $q_symbol;
618 elsif ( $q_symbol eq '-' ) # deletion
620 substr $q_seq, $pos + $insertions, 1, $q_symbol;
624 substr $q_seq, $pos + $insertions, 1, $q_symbol;
629 Maasha::Common::error( qq(Bad alignment descriptor: "$align") );
633 Maasha::Align::align_print_pairwise( [ $entry->{ 'S_ID' }, $s_seq ], [ $entry->{ 'Q_ID' }, $q_seq ] );
639 # Martin A. Hansen, November 2009.
641 # Converts a KISS entry to a Biopiece record.
642 # TODO: Consistency checking
644 my ( $entry, # KISS entry
649 return wantarray ? %{ $entry } : $entry;
655 # Martin A. Hansen, November 2009.
657 # Converts a Biopiece record to a KISS entry.
659 my ( $record, # Biopiece record
664 $record->{ 'SCORE' } ||= $record->{ 'E_VAL' } || ".";
665 $record->{ 'HITS' } ||= ".";
666 $record->{ 'BLOCK_COUNT' } ||= ".";
667 $record->{ 'BLOCK_BEGS' } ||= ".";
668 $record->{ 'BLOCK_LENS' } ||= ".";
669 $record->{ 'BLOCK_TYPE' } ||= ".";
670 $record->{ 'ALIGN' } ||= $record->{ 'DESCRIPTOR' } || ".";
672 return wantarray ? %{ $record } : $record;
676 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<