3 # Copyright (C) 2009 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 # Routines for parsing and emitting KISS records.
27 # http://code.google.com/p/biopieces/wiki/KissFormat
30 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
41 use vars qw( @ISA @EXPORT );
43 @ISA = qw( Exporter );
58 INDEX_BLOCK_SIZE => 100,
59 INDEX_LEVEL => 100_000_000,
72 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
77 # Martin A. Hansen, November 2009.
79 # Gets the next KISS entry from a file handle.
81 my ( $fh, # file handle
86 my ( $line, @fields );
88 while ( $line = <$fh> )
92 next if $line =~ /^$|^#/;
94 @fields = split /\t/, $line;
96 Maasha::Common::error( qq(BAD kiss entry: $line) ) if not @fields == 12;
98 return wantarray ? @fields : \@fields;
105 # Martin A. Hansen, December 2009.
107 # Parses a line with a KISS entry.
109 my ( $line, # KISS line to parse
114 my ( @fields, %entry );
116 @fields = split /\t/, $line;
118 Maasha::Common::error( qq(BAD kiss entry: $line) ) if not @fields == 12;
120 return wantarray ? @fields : \@fields;
126 # Martin A. Hansen, November 2009.
128 # Outputs a KISS record to a given filehandle
129 # or STDOUT if no filehandle is given.
131 my ( $entry, # KISS entry to output
132 $fh, # file handle - OPTIONAL
137 Maasha::Common::error( qq(BAD kiss entry) ) if not scalar @{ $entry } == 12;
139 if ( defined $entry->[ S_ID ] and
140 defined $entry->[ S_BEG ] and
141 defined $entry->[ S_END ]
144 Maasha::Common::error( qq(Bad S_BEG value: $entry->[ S_BEG ] < 0 ) ) if $entry->[ S_BEG ] < 0;
145 Maasha::Common::error( qq(Bad S_END value: $entry->[ S_END ] < $entry->[ S_BEG ] ) ) if $entry->[ S_END ] < $entry->[ S_BEG ];
149 $entry->[ Q_ID ] = "." if not defined $entry->[ Q_ID ];
150 $entry->[ SCORE ] = "." if not defined $entry->[ SCORE ];
151 $entry->[ STRAND ] = "." if not defined $entry->[ STRAND ];
152 $entry->[ HITS ] = "." if not defined $entry->[ HITS ];
153 $entry->[ ALIGN ] = "." if not defined $entry->[ ALIGN ];
154 $entry->[ BLOCK_COUNT ] = "." if not defined $entry->[ BLOCK_COUNT ];
155 $entry->[ BLOCK_BEGS ] = "." if not defined $entry->[ BLOCK_BEGS ];
156 $entry->[ BLOCK_LENS ] = "." if not defined $entry->[ BLOCK_LENS ];
157 $entry->[ BLOCK_TYPE ] = "." if not defined $entry->[ BLOCK_TYPE ];
159 print $fh join( "\t", @{ $entry } ), "\n";
166 # Martin A. Hansen, November 2009.
168 # Sorts a KISS file on S_BEG and S_END
170 my ( $file, # KISS file
175 `sort -k 2,2n -k 3,3nr $file > $file.sort`;
177 rename "$file.sort", $file;
183 # Martin A. Hansen, December 2009.
185 # Creates a lookup index of a sorted KISS file.
187 my ( $file, # path to KISS file
192 my ( $fh, $offset, $line, $s_beg, $bucket, $index );
194 $fh = Maasha::Filesys::file_read_open( $file );
198 while ( $line = <$fh> )
200 ( undef, $s_beg ) = split "\t", $line, 3;
202 $bucket = int( $s_beg / BUCKET_SIZE );
204 $index->[ $bucket ]->[ COUNT ]++;
205 $index->[ $bucket ]->[ OFFSET ] = $offset if not defined $index->[ $bucket ]->[ OFFSET ];
207 $offset += length $line;
212 Maasha::KISS::kiss_index_store( "$file.index", $index );
216 sub kiss_index_offset
218 # Martin A. Hansen, December 2009.
220 # Given a KISS index and a begin position,
221 # locate the offset closest to the begin position,
224 my ( $index, # KISS index
225 $beg, # begin position
232 Maasha::Common::error( qq(Negative begin position: "$beg") ) if $beg < 0;
234 $bucket = int( $beg / BUCKET_SIZE );
236 $bucket = scalar @{ $index } if $bucket > scalar @{ $index };
238 while ( $bucket >= 0 )
240 return $index->[ $bucket ]->[ OFFSET ] if defined $index->[ $bucket ];
249 # Martin A. Hansen, December 2009.
251 # Given a KISS index and a begin/end interval
252 # sum the number of counts in that interval,
255 my ( $index, # KISS index
256 $beg, # Begin position
262 my ( $bucket_beg, $bucket_end, $count, $i );
264 Maasha::Common::error( qq(Negative begin position: "$beg") ) if $beg < 0;
266 $bucket_beg = int( $beg / BUCKET_SIZE );
267 $bucket_end = int( $end / BUCKET_SIZE );
269 $bucket_end = scalar @{ $index } if $bucket_end > scalar @{ $index };
273 for ( $i = $bucket_beg; $i <= $bucket_end; $i++ ) {
274 $count += $index->[ $i ]->[ COUNT ] if defined $index->[ $i ];
281 sub kiss_index_count_nc
283 # Martin A. Hansen, December 2009.
285 # Given a KISS index and a begin/end interval
286 # sum the number of counts in that interval,
289 my ( $index, # KISS index
290 $beg, # Begin position
298 Maasha::Common::error( qq(Negative begin position: "$beg") ) if $beg < 0;
300 $count = Maasha::NClist::nc_list_count_interval( $index, $beg, $end, INDEX_BEG, INDEX_END, INDEX );
306 sub kiss_index_get_entries
308 # Martin A. Hansen, November 2009.
310 # Given a path to a KISS file and a KISS index
311 # along with a beg/end interval, locate all entries
312 # in that interval and return those.
314 my ( $file, # path to KISS file
316 $beg, # interval begin
322 my ( $offset, $fh, $entry, @entries );
324 # $offset = kiss_index_offset( $index, $beg );
326 $fh = Maasha::Filesys::file_read_open( $file );
328 # sysseek( $fh, $offset, 0 );
330 while ( $entry = Maasha::KISS::kiss_entry_get( $fh ) )
332 push @entries, $entry if $entry->[ S_END ] > $beg;
334 last if $entry->[ S_BEG ] > $end;
339 return wantarray ? @entries : \@entries;
343 sub kiss_index_get_entries_OLD
345 # Martin A. Hansen, November 2009.
347 # Given a path to a KISS file and a KISS index
348 # along with a beg/end interval, locate all entries
349 # in that interval and return those.
351 my ( $file, # path to KISS file
353 $beg, # interval begin
359 my ( $offset, $fh, $entry, @entries );
361 $offset = kiss_index_offset( $index, $beg );
363 $fh = Maasha::Filesys::file_read_open( $file );
365 sysseek( $fh, $offset, 0 );
367 while ( $entry = Maasha::KISS::kiss_entry_get( $fh ) )
369 push @entries, $entry if $entry->[ S_END ] > $beg;
371 last if $entry->[ S_BEG ] > $end;
376 return wantarray ? @entries : \@entries;
380 sub kiss_index_get_blocks
382 # Martin A. Hansen, December 2009.
384 # Given a KISS index returns blocks from this in a
385 # given position interval. Blocks consisting of
386 # hashes of BEG/END/COUNT.
388 my ( $index, # KISS index node
389 $beg, # interval begin
395 my ( $bucket_beg, $bucket_end, $i, @blocks );
397 Maasha::Common::error( qq(Negative begin position: "$beg") ) if $beg < 0;
399 $bucket_beg = int( $beg / BUCKET_SIZE );
400 $bucket_end = int( $end / BUCKET_SIZE );
402 $bucket_end = scalar @{ $index } if $bucket_end > scalar @{ $index };
404 for ( $i = $bucket_beg; $i <= $bucket_end; $i++ )
406 if ( defined $index->[ $i ] )
409 BEG => BUCKET_SIZE * $i,
410 END => BUCKET_SIZE * ( $i + 1 ),
411 COUNT => $index->[ $i ]->[ COUNT ],
416 return wantarray ? @blocks : \@blocks;
422 # Martin A. Hansen, December 2009.
424 # Given filehandles to two different unsorted KISS files
425 # intersect the entries so that all entries from file1 that
426 # overlap entries in file2 are returned - unless the inverse flag
427 # is given in which case entreis from file1 that does not
428 # overlap any entries in file2 are returned.
430 my ( $fh1, # filehandle to file1
431 $fh2, # filehandle to file2
432 $inverse, # flag indicating inverse matching - OPTIONAL
437 my ( $entry, %lookup, $pos, $overlap, @entries );
439 while ( $entry = kiss_entry_get( $fh2 ) ) {
440 map { $lookup{ $_ } = 1 } ( $entry->[ S_BEG ] .. $entry->[ S_END ] );
443 while ( $entry = kiss_entry_get( $fh1 ) )
447 foreach $pos ( $entry->[ S_BEG ] .. $entry->[ S_END ] )
449 if ( exists $lookup{ $pos } )
457 if ( $overlap and not $inverse ) {
458 push @entries, $entry;
459 } elsif ( not $overlap and $inverse ) {
460 push @entries, $entry;
464 return wantarray ? @entries : \@entries;
470 # Martin A. Hansen, November 2009.
472 # Stores a KISS index to a file.
474 my ( $path, # path to KISS index
482 $json = JSON::XS::encode_json( $index );
484 $fh = Maasha::Filesys::file_write_open( $path );
492 sub kiss_index_retrieve
494 # Martin A. Hansen, November 2009.
496 # Retrieves a KISS index from a file.
498 my ( $path, # Path to KISS index
501 # Returns a data structure.
503 my ( $fh, $json, $index );
507 $fh = Maasha::Filesys::file_read_open( $path );
513 $index = JSON::XS::decode_json( $json );
515 return wantarray ? @{ $index } : $index;
521 # Martin A. Hansen, November 2009.
523 # Encodes alignment descriptors for two
526 my ( $s_seq, # Subject sequence reference
527 $q_seq, # Query sequence reference
528 $offset, # alternate offset - OPTIONAL
533 my ( $i, $s, $q, @align );
535 # Maasha::Common::error( "Sequence lengths don't match" ) if length ${ $s_seq } != length ${ $q_seq };
537 if ( length ${ $s_seq } != length ${ $q_seq } ) { # for unknown reasons this situation may occur - TODO FIXME
538 return wantarray ? () : [];
543 for ( $i = 0; $i < length ${ $s_seq }; $i++ )
545 $s = uc substr ${ $s_seq }, $i, 1;
546 $q = uc substr ${ $q_seq }, $i, 1;
548 if ( $s eq '-' and $q eq '-' )
558 push @align, "$offset:$s>$q";
560 $offset++ if not $s eq '-';
564 return wantarray ? @align : \@align;
570 # Martin A. Hansen, November 2009.
572 # Test routine of correct resolve of ALIGN descriptors.
574 # S.aur_COL 41 75 5_vnlh2ywXsN1/1 1 - . 17:A>T,31:A>N . . . .
576 my ( $s_seqref, # subject sequence reference
582 my ( $align, $pos, $s_symbol, $q_symbol, $s_seq, $q_seq, $insertions );
584 $s_seq = substr ${ $s_seqref }, $entry->{ 'S_BEG' }, $entry->{ 'S_END' } - $entry->{ 'S_BEG' } + 1;
589 foreach $align ( split /,/, $entry->{ 'ALIGN' } )
591 if ( $align =~ /(\d+):(.)>(.)/ )
597 if ( $s_symbol eq '-' ) # insertion
599 substr $s_seq, $pos + $insertions, 0, $s_symbol;
600 substr $q_seq, $pos + $insertions, 0, $q_symbol;
604 elsif ( $q_symbol eq '-' ) # deletion
606 substr $q_seq, $pos + $insertions, 1, $q_symbol;
610 substr $q_seq, $pos + $insertions, 1, $q_symbol;
615 Maasha::Common::error( qq(Bad alignment descriptor: "$align") );
619 Maasha::Align::align_print_pairwise( [ $entry->{ 'S_ID' }, $s_seq ], [ $entry->{ 'Q_ID' }, $q_seq ] );
625 # Martin A. Hansen, November 2009.
627 # Converts a KISS entry to a Biopiece record.
629 my ( $entry, # KISS entry
636 Maasha::Common::error( qq(BAD kiss entry) ) if not scalar @{ $entry } == 12;
638 $record{ 'S_ID' } = $entry->[ S_ID ];
639 $record{ 'S_BEG' } = $entry->[ S_BEG ];
640 $record{ 'S_END' } = $entry->[ S_END ];
641 $record{ 'Q_ID' } = $entry->[ Q_ID ];
642 $record{ 'SCORE' } = $entry->[ SCORE ];
643 $record{ 'STRAND' } = $entry->[ STRAND ];
644 $record{ 'HITS' } = $entry->[ HITS ];
645 $record{ 'ALIGN' } = $entry->[ ALIGN ];
646 $record{ 'BLOCK_COUNT' } = $entry->[ BLOCK_COUNT ];
647 $record{ 'BLOCK_BEGS' } = $entry->[ BLOCK_BEGS ];
648 $record{ 'BLOCK_LENS' } = $entry->[ BLOCK_LENS ];
649 $record{ 'BLOCK_TYPE' } = $entry->[ BLOCK_TYPE ];
651 return wantarray ? %record : \%record;
657 # Martin A. Hansen, November 2009.
659 # Converts a Biopiece record to a KISS entry.
661 my ( $record, # Biopiece record
668 if ( not defined $record->{ 'S_ID' } and
669 not defined $record->{ 'S_BEG' } and
670 not defined $record->{ 'S_END' } )
675 $entry->[ S_ID ] = $record->{ 'S_ID' };
676 $entry->[ S_BEG ] = $record->{ 'S_BEG' };
677 $entry->[ S_END ] = $record->{ 'S_END' };
678 $entry->[ Q_ID ] = $record->{ 'Q_ID' } || ".";
679 $entry->[ SCORE ] = $record->{ 'SCORE' } || $record->{ 'BIT_SCORE' } || ".";
680 $entry->[ STRAND ] = $record->{ 'STRAND' } || ".";
681 $entry->[ HITS ] = $record->{ 'HITS' } || ".";
682 $entry->[ ALIGN ] = $record->{ 'ALIGN' } || $record->{ 'DESCRIPTOR' } || ".";
683 $entry->[ BLOCK_COUNT ] = $record->{ 'BLOCK_COUNT' } || ".";
684 $entry->[ BLOCK_BEGS ] = $record->{ 'BLOCK_BEGS' } || ".";
685 $entry->[ BLOCK_LENS ] = $record->{ 'BLOCK_LENS' } || ".";
686 $entry->[ BLOCK_TYPE ] = $record->{ 'BLOCK_TYPE' } || ".";
688 return wantarray ? @{ $entry } : $entry;
692 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
700 # Martin A. Hansen, February 2010.
702 # Creates a NC list index of a sorted KISS file.
704 my ( $file, # path to KISS file
709 my ( $fh, $line, @fields, $nc_list );
711 $fh = Maasha::Filesys::file_read_open( $file );
713 while ( $line = <$fh> )
717 @fields = split "\t", $line;
719 if ( not defined $nc_list ) {
720 $nc_list = [ [ @fields ] ];
722 Maasha::NClist::nc_list_add( $nc_list, [ @fields ], INDEX_END, INDEX );
728 Maasha::NClist::nc_list_store( $nc_list, "$file.json" );
732 sub kiss_index_get_entries_nc
734 # Martin A. Hansen, November 2009.
736 # Given a path to a KISS file and a KISS index
737 # along with a beg/end interval, locate all entries
738 # in that interval and return those.
740 my ( $index, # KISS index
741 $beg, # interval begin
749 $features = Maasha::NClist::nc_list_get_interval( $index, $beg, $end, INDEX_BEG, INDEX_END, INDEX );
751 return wantarray ? @{ $features } : $features;
755 sub kiss_index_retrieve_nc
757 # Martin A. Hansen, November 2009.
759 # Retrieves a KISS index from a file.
761 my ( $path, # Path to KISS index
764 # Returns a data structure.
768 $index = Maasha::NClist::nc_list_retrieve( $path );
770 return wantarray ? @{ $index } : $index;