3 # Copyright (C) 2009 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 # Routines for parsing and emitting KISS records.
27 # http://code.google.com/p/biopieces/wiki/KissFormat
30 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
41 use vars qw( @ISA @EXPORT );
43 @ISA = qw( Exporter );
58 INDEX_BLOCK_SIZE => 100,
59 INDEX_LEVEL => 100_000_000,
72 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
77 # Martin A. Hansen, November 2009.
79 # Gets the next KISS entry from a file handle.
81 my ( $fh, # file handle
86 my ( $line, @fields );
88 while ( $line = <$fh> )
92 next if $line =~ /^$|^#/;
94 @fields = split /\t/, $line;
96 Maasha::Common::error( qq(BAD kiss entry: $line) ) if not @fields == 12;
98 return wantarray ? @fields : \@fields;
105 # Martin A. Hansen, February 2010.
107 # Retrieves KISS entries from a given sorted KISS file
108 # within an optional interval.
110 my ( $file, # path to KISS file
111 $beg, # interval begin - OPTIONAL
112 $end, # interval end - OPTIONAL
117 my ( $fh, $entry, @entries );
122 $fh = Maasha::Filesys::file_read_open( $file );
124 while ( $entry = kiss_entry_get( $fh ) )
126 last if $entry->[ S_BEG ] > $end;
127 push @entries, $entry if $entry->[ S_END ] > $beg;
132 return wantarray ? @entries : \@entries;
138 # Martin A. Hansen, December 2009.
140 # Parses a line with a KISS entry.
142 my ( $line, # KISS line to parse
147 my ( @fields, %entry );
149 @fields = split /\t/, $line;
151 Maasha::Common::error( qq(BAD kiss entry: $line) ) if not @fields == 12;
153 return wantarray ? @fields : \@fields;
159 # Martin A. Hansen, November 2009.
161 # Outputs a KISS record to a given filehandle
162 # or STDOUT if no filehandle is given.
164 my ( $entry, # KISS entry to output
165 $fh, # file handle - OPTIONAL
170 Maasha::Common::error( qq(BAD kiss entry) ) if not scalar @{ $entry } == 12;
172 if ( defined $entry->[ S_ID ] and
173 defined $entry->[ S_BEG ] and
174 defined $entry->[ S_END ]
177 Maasha::Common::error( qq(Bad S_BEG value: $entry->[ S_BEG ] < 0 ) ) if $entry->[ S_BEG ] < 0;
178 Maasha::Common::error( qq(Bad S_END value: $entry->[ S_END ] < $entry->[ S_BEG ] ) ) if $entry->[ S_END ] < $entry->[ S_BEG ];
182 $entry->[ Q_ID ] = "." if not defined $entry->[ Q_ID ];
183 $entry->[ SCORE ] = "." if not defined $entry->[ SCORE ];
184 $entry->[ STRAND ] = "." if not defined $entry->[ STRAND ];
185 $entry->[ HITS ] = "." if not defined $entry->[ HITS ];
186 $entry->[ ALIGN ] = "." if not defined $entry->[ ALIGN ];
187 $entry->[ BLOCK_COUNT ] = "." if not defined $entry->[ BLOCK_COUNT ];
188 $entry->[ BLOCK_BEGS ] = "." if not defined $entry->[ BLOCK_BEGS ];
189 $entry->[ BLOCK_LENS ] = "." if not defined $entry->[ BLOCK_LENS ];
190 $entry->[ BLOCK_TYPE ] = "." if not defined $entry->[ BLOCK_TYPE ];
192 print $fh join( "\t", @{ $entry } ), "\n";
199 # Martin A. Hansen, November 2009.
201 # Sorts a KISS file on S_BEG and S_END
203 my ( $file, # KISS file
208 `sort -k 2,2n -k 3,3nr $file > $file.sort`;
210 rename "$file.sort", $file;
216 # Martin A. Hansen, December 2009.
218 # Creates a lookup index of a sorted KISS file.
220 my ( $file, # path to KISS file
225 my ( $fh, $offset, $line, $s_beg, $bucket, $index );
227 $fh = Maasha::Filesys::file_read_open( $file );
231 while ( $line = <$fh> )
233 ( undef, $s_beg ) = split "\t", $line, 3;
235 $bucket = int( $s_beg / BUCKET_SIZE );
237 $index->[ $bucket ]->[ COUNT ]++;
238 $index->[ $bucket ]->[ OFFSET ] = $offset if not defined $index->[ $bucket ]->[ OFFSET ];
240 $offset += length $line;
245 Maasha::KISS::kiss_index_store( "$file.index", $index );
249 sub kiss_index_offset
251 # Martin A. Hansen, December 2009.
253 # Given a KISS index and a begin position,
254 # locate the offset closest to the begin position,
257 my ( $index, # KISS index
258 $beg, # begin position
265 Maasha::Common::error( qq(Negative begin position: "$beg") ) if $beg < 0;
267 $bucket = int( $beg / BUCKET_SIZE );
269 $bucket = scalar @{ $index } if $bucket > scalar @{ $index };
271 while ( $bucket >= 0 )
273 return $index->[ $bucket ]->[ OFFSET ] if defined $index->[ $bucket ];
282 # Martin A. Hansen, December 2009.
284 # Given a KISS index and a begin/end interval
285 # sum the number of counts in that interval,
288 my ( $index, # KISS index
289 $beg, # Begin position
295 my ( $bucket_beg, $bucket_end, $count, $i );
297 Maasha::Common::error( qq(Negative begin position: "$beg") ) if $beg < 0;
299 $bucket_beg = int( $beg / BUCKET_SIZE );
300 $bucket_end = int( $end / BUCKET_SIZE );
302 $bucket_end = scalar @{ $index } if $bucket_end > scalar @{ $index };
306 for ( $i = $bucket_beg; $i <= $bucket_end; $i++ ) {
307 $count += $index->[ $i ]->[ COUNT ] if defined $index->[ $i ];
314 sub kiss_index_count_nc
316 # Martin A. Hansen, December 2009.
318 # Given a KISS index and a begin/end interval
319 # sum the number of counts in that interval,
322 my ( $index, # KISS index
323 $beg, # Begin position
331 Maasha::Common::error( qq(Negative begin position: "$beg") ) if $beg < 0;
333 $count = Maasha::NClist::nc_list_count_interval( $index, $beg, $end, INDEX_BEG, INDEX_END, INDEX );
339 sub kiss_index_get_entries
341 # Martin A. Hansen, November 2009.
343 # Given a path to a KISS file and a KISS index
344 # along with a beg/end interval, locate all entries
345 # in that interval and return those.
347 my ( $file, # path to KISS file
349 $beg, # interval begin
355 my ( $offset, $fh, $entry, @entries );
357 # $offset = kiss_index_offset( $index, $beg );
359 $fh = Maasha::Filesys::file_read_open( $file );
361 # sysseek( $fh, $offset, 0 );
363 while ( $entry = Maasha::KISS::kiss_entry_get( $fh ) )
365 push @entries, $entry if $entry->[ S_END ] > $beg;
367 last if $entry->[ S_BEG ] > $end;
372 return wantarray ? @entries : \@entries;
376 sub kiss_index_get_entries_OLD
378 # Martin A. Hansen, November 2009.
380 # Given a path to a KISS file and a KISS index
381 # along with a beg/end interval, locate all entries
382 # in that interval and return those.
384 my ( $file, # path to KISS file
386 $beg, # interval begin
392 my ( $offset, $fh, $entry, @entries );
394 $offset = kiss_index_offset( $index, $beg );
396 $fh = Maasha::Filesys::file_read_open( $file );
398 sysseek( $fh, $offset, 0 );
400 while ( $entry = Maasha::KISS::kiss_entry_get( $fh ) )
402 push @entries, $entry if $entry->[ S_END ] > $beg;
404 last if $entry->[ S_BEG ] > $end;
409 return wantarray ? @entries : \@entries;
413 sub kiss_index_get_blocks
415 # Martin A. Hansen, December 2009.
417 # Given a KISS index returns blocks from this in a
418 # given position interval. Blocks consisting of
419 # hashes of BEG/END/COUNT.
421 my ( $index, # KISS index node
422 $beg, # interval begin
428 my ( $bucket_beg, $bucket_end, $i, @blocks );
430 Maasha::Common::error( qq(Negative begin position: "$beg") ) if $beg < 0;
432 $bucket_beg = int( $beg / BUCKET_SIZE );
433 $bucket_end = int( $end / BUCKET_SIZE );
435 $bucket_end = scalar @{ $index } if $bucket_end > scalar @{ $index };
437 for ( $i = $bucket_beg; $i <= $bucket_end; $i++ )
439 if ( defined $index->[ $i ] )
442 BEG => BUCKET_SIZE * $i,
443 END => BUCKET_SIZE * ( $i + 1 ),
444 COUNT => $index->[ $i ]->[ COUNT ],
449 return wantarray ? @blocks : \@blocks;
455 # Martin A. Hansen, December 2009.
457 # Given filehandles to two different unsorted KISS files
458 # intersect the entries so that all entries from file1 that
459 # overlap entries in file2 are returned - unless the inverse flag
460 # is given in which case entreis from file1 that does not
461 # overlap any entries in file2 are returned.
463 my ( $fh1, # filehandle to file1
464 $fh2, # filehandle to file2
465 $inverse, # flag indicating inverse matching - OPTIONAL
470 my ( $entry, %lookup, $pos, $overlap, @entries );
472 while ( $entry = kiss_entry_get( $fh2 ) ) {
473 map { $lookup{ $_ } = 1 } ( $entry->[ S_BEG ] .. $entry->[ S_END ] );
476 while ( $entry = kiss_entry_get( $fh1 ) )
480 foreach $pos ( $entry->[ S_BEG ] .. $entry->[ S_END ] )
482 if ( exists $lookup{ $pos } )
490 if ( $overlap and not $inverse ) {
491 push @entries, $entry;
492 } elsif ( not $overlap and $inverse ) {
493 push @entries, $entry;
497 return wantarray ? @entries : \@entries;
503 # Martin A. Hansen, November 2009.
505 # Stores a KISS index to a file.
507 my ( $path, # path to KISS index
515 $json = JSON::XS::encode_json( $index );
517 $fh = Maasha::Filesys::file_write_open( $path );
525 sub kiss_index_retrieve
527 # Martin A. Hansen, November 2009.
529 # Retrieves a KISS index from a file.
531 my ( $path, # Path to KISS index
534 # Returns a data structure.
536 my ( $fh, $json, $index );
540 $fh = Maasha::Filesys::file_read_open( $path );
546 $index = JSON::XS::decode_json( $json );
548 return wantarray ? @{ $index } : $index;
554 # Martin A. Hansen, November 2009.
556 # Encodes alignment descriptors for two
559 my ( $s_seq, # Subject sequence reference
560 $q_seq, # Query sequence reference
561 $offset, # alternate offset - OPTIONAL
566 my ( $i, $s, $q, @align );
568 # Maasha::Common::error( "Sequence lengths don't match" ) if length ${ $s_seq } != length ${ $q_seq };
570 if ( length ${ $s_seq } != length ${ $q_seq } ) { # for unknown reasons this situation may occur - TODO FIXME
571 return wantarray ? () : [];
576 for ( $i = 0; $i < length ${ $s_seq }; $i++ )
578 $s = uc substr ${ $s_seq }, $i, 1;
579 $q = uc substr ${ $q_seq }, $i, 1;
581 if ( $s eq '-' and $q eq '-' )
591 push @align, "$offset:$s>$q";
593 $offset++ if not $s eq '-';
597 return wantarray ? @align : \@align;
603 # Martin A. Hansen, November 2009.
605 # Test routine of correct resolve of ALIGN descriptors.
607 # S.aur_COL 41 75 5_vnlh2ywXsN1/1 1 - . 17:A>T,31:A>N . . . .
609 my ( $s_seqref, # subject sequence reference
615 my ( $align, $pos, $s_symbol, $q_symbol, $s_seq, $q_seq, $insertions );
617 $s_seq = substr ${ $s_seqref }, $entry->{ 'S_BEG' }, $entry->{ 'S_END' } - $entry->{ 'S_BEG' } + 1;
622 foreach $align ( split /,/, $entry->{ 'ALIGN' } )
624 if ( $align =~ /(\d+):(.)>(.)/ )
630 if ( $s_symbol eq '-' ) # insertion
632 substr $s_seq, $pos + $insertions, 0, $s_symbol;
633 substr $q_seq, $pos + $insertions, 0, $q_symbol;
637 elsif ( $q_symbol eq '-' ) # deletion
639 substr $q_seq, $pos + $insertions, 1, $q_symbol;
643 substr $q_seq, $pos + $insertions, 1, $q_symbol;
648 Maasha::Common::error( qq(Bad alignment descriptor: "$align") );
652 Maasha::Align::align_print_pairwise( [ $entry->{ 'S_ID' }, $s_seq ], [ $entry->{ 'Q_ID' }, $q_seq ] );
658 # Martin A. Hansen, November 2009.
660 # Converts a KISS entry to a Biopiece record.
662 my ( $entry, # KISS entry
669 Maasha::Common::error( qq(BAD kiss entry) ) if not scalar @{ $entry } == 12;
671 $record{ 'S_ID' } = $entry->[ S_ID ];
672 $record{ 'S_BEG' } = $entry->[ S_BEG ];
673 $record{ 'S_END' } = $entry->[ S_END ];
674 $record{ 'Q_ID' } = $entry->[ Q_ID ];
675 $record{ 'SCORE' } = $entry->[ SCORE ];
676 $record{ 'STRAND' } = $entry->[ STRAND ];
677 $record{ 'HITS' } = $entry->[ HITS ];
678 $record{ 'ALIGN' } = $entry->[ ALIGN ];
679 $record{ 'BLOCK_COUNT' } = $entry->[ BLOCK_COUNT ];
680 $record{ 'BLOCK_BEGS' } = $entry->[ BLOCK_BEGS ];
681 $record{ 'BLOCK_LENS' } = $entry->[ BLOCK_LENS ];
682 $record{ 'BLOCK_TYPE' } = $entry->[ BLOCK_TYPE ];
684 return wantarray ? %record : \%record;
690 # Martin A. Hansen, November 2009.
692 # Converts a Biopiece record to a KISS entry.
694 my ( $record, # Biopiece record
701 if ( not defined $record->{ 'S_ID' } and
702 not defined $record->{ 'S_BEG' } and
703 not defined $record->{ 'S_END' } )
708 $entry->[ S_ID ] = $record->{ 'S_ID' };
709 $entry->[ S_BEG ] = $record->{ 'S_BEG' };
710 $entry->[ S_END ] = $record->{ 'S_END' };
711 $entry->[ Q_ID ] = $record->{ 'Q_ID' } || ".";
712 $entry->[ SCORE ] = $record->{ 'SCORE' } || $record->{ 'BIT_SCORE' } || $record->{ 'ID' } || ".";
713 $entry->[ STRAND ] = $record->{ 'STRAND' } || ".";
714 $entry->[ HITS ] = $record->{ 'HITS' } || ".";
715 $entry->[ ALIGN ] = $record->{ 'ALIGN' } || $record->{ 'DESCRIPTOR' } || ".";
716 $entry->[ BLOCK_COUNT ] = $record->{ 'BLOCK_COUNT' } || ".";
717 $entry->[ BLOCK_BEGS ] = $record->{ 'BLOCK_BEGS' } || $record->{ 'Q_BEGS' } || ".";
718 $entry->[ BLOCK_LENS ] = $record->{ 'BLOCK_LENS' } || ".";
719 $entry->[ BLOCK_TYPE ] = $record->{ 'BLOCK_TYPE' } || ".";
721 return wantarray ? @{ $entry } : $entry;
725 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
733 # Martin A. Hansen, February 2010.
735 # Creates a NC list index of a sorted KISS file.
737 my ( $file, # path to KISS file
742 my ( $fh, $line, @fields, $nc_list );
744 $fh = Maasha::Filesys::file_read_open( $file );
746 while ( $line = <$fh> )
750 @fields = split "\t", $line;
752 if ( not defined $nc_list ) {
753 $nc_list = [ [ @fields ] ];
755 Maasha::NClist::nc_list_add( $nc_list, [ @fields ], INDEX_END, INDEX );
761 Maasha::NClist::nc_list_store( $nc_list, "$file.json" );
765 sub kiss_index_get_entries_nc
767 # Martin A. Hansen, November 2009.
769 # Given a path to a KISS file and a KISS index
770 # along with a beg/end interval, locate all entries
771 # in that interval and return those.
773 my ( $index, # KISS index
774 $beg, # interval begin
782 $features = Maasha::NClist::nc_list_get_interval( $index, $beg, $end, INDEX_BEG, INDEX_END, INDEX );
784 return wantarray ? @{ $features } : $features;
788 sub kiss_index_retrieve_nc
790 # Martin A. Hansen, November 2009.
792 # Retrieves a KISS index from a file.
794 my ( $path, # Path to KISS index
797 # Returns a data structure.
801 $index = Maasha::NClist::nc_list_retrieve( $path );
803 return wantarray ? @{ $index } : $index;