3 # Copyright (C) 2009 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 # Routines for parsing and emitting KISS records.
27 # http://code.google.com/p/biopieces/wiki/KissFormat
30 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
41 use vars qw( @ISA @EXPORT );
43 @ISA = qw( Exporter );
58 INDEX_BLOCK_SIZE => 100,
59 INDEX_LEVEL => 100_000_000,
72 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
77 # Martin A. Hansen, November 2009.
79 # Gets the next KISS entry from a file handle.
81 my ( $fh, # file handle
86 my ( $line, @fields );
88 while ( $line = <$fh> )
92 next if $line =~ /^$|^#/;
94 @fields = split /\t/, $line;
96 Maasha::Common::error( qq(BAD kiss entry: $line) ) if not @fields == 12;
98 return wantarray ? @fields : \@fields;
105 # Martin A. Hansen, December 2009.
107 # TODO find out what uses this and kill it!
109 # Parses a line with a KISS entry.
111 my ( $line, # KISS line to parse
116 my ( @fields, %entry );
118 @fields = split /\t/, $line;
120 Maasha::Common::error( qq(BAD kiss entry: $line) ) if not @fields == 12;
122 $entry{ 'S_ID' } = $fields[ S_ID ];
123 $entry{ 'S_BEG' } = $fields[ S_BEG ];
124 $entry{ 'S_END' } = $fields[ S_END ];
125 $entry{ 'Q_ID' } = $fields[ Q_ID ];
126 $entry{ 'SCORE' } = $fields[ SCORE ];
127 $entry{ 'STRAND' } = $fields[ STRAND ];
128 $entry{ 'HITS' } = $fields[ HITS ];
129 $entry{ 'ALIGN' } = $fields[ ALIGN ];
130 $entry{ 'BLOCK_COUNT' } = $fields[ BLOCK_COUNT ];
131 $entry{ 'BLOCK_BEGS' } = $fields[ BLOCK_BEGS ];
132 $entry{ 'BLOCK_LENS' } = $fields[ BLOCK_LENS ];
133 $entry{ 'BLOCK_TYPE' } = $fields[ BLOCK_TYPE ];
135 return wantarray ? %entry : \%entry;
141 # Martin A. Hansen, November 2009.
143 # Outputs a KISS record to a given filehandle
144 # or STDOUT if no filehandle is given.
146 my ( $entry, # KISS entry to output
147 $fh, # file handle - OPTIONAL
152 Maasha::Common::error( qq(BAD kiss entry) ) if not scalar @{ $entry } == 12;
154 if ( defined $entry->[ S_ID ] and
155 defined $entry->[ S_BEG ] and
156 defined $entry->[ S_END ]
159 Maasha::Common::error( qq(Bad S_BEG value: $entry->[ S_BEG ] < 0 ) ) if $entry->[ S_BEG ] < 0;
160 Maasha::Common::error( qq(Bad S_END value: $entry->[ S_END ] < $entry->[ S_BEG ] ) ) if $entry->[ S_END ] < $entry->[ S_BEG ];
164 $entry->[ Q_ID ] = "." if not defined $entry->[ Q_ID ];
165 $entry->[ SCORE ] = "." if not defined $entry->[ SCORE ];
166 $entry->[ STRAND ] = "." if not defined $entry->[ STRAND ];
167 $entry->[ HITS ] = "." if not defined $entry->[ HITS ];
168 $entry->[ ALIGN ] = "." if not defined $entry->[ ALIGN ];
169 $entry->[ BLOCK_COUNT ] = "." if not defined $entry->[ BLOCK_COUNT ];
170 $entry->[ BLOCK_BEGS ] = "." if not defined $entry->[ BLOCK_BEGS ];
171 $entry->[ BLOCK_LENS ] = "." if not defined $entry->[ BLOCK_LENS ];
172 $entry->[ BLOCK_TYPE ] = "." if not defined $entry->[ BLOCK_TYPE ];
174 print $fh join( "\t", @{ $entry } ), "\n";
181 # Martin A. Hansen, November 2009.
183 # Sorts a KISS file on S_BEG and S_END
185 my ( $file, # KISS file
190 `sort -k 2,2n -k 3,3nr $file > $file.sort`;
192 rename "$file.sort", $file;
198 # Martin A. Hansen, December 2009.
200 # Creates a lookup index of a sorted KISS file.
202 my ( $file, # path to KISS file
207 my ( $fh, $offset, $line, $s_beg, $bucket, $index );
209 $fh = Maasha::Filesys::file_read_open( $file );
213 while ( $line = <$fh> )
215 ( undef, $s_beg ) = split "\t", $line, 3;
217 $bucket = int( $s_beg / BUCKET_SIZE );
219 $index->[ $bucket ]->[ COUNT ]++;
220 $index->[ $bucket ]->[ OFFSET ] = $offset if not defined $index->[ $bucket ]->[ OFFSET ];
222 $offset += length $line;
227 Maasha::KISS::kiss_index_store( "$file.index", $index );
233 # Martin A. Hansen, February 2010.
235 # Creates a NC list index of a sorted KISS file.
237 my ( $file, # path to KISS file
242 my ( $fh, $line, @fields, $nc_list );
244 $fh = Maasha::Filesys::file_read_open( $file );
246 while ( $line = <$fh> )
250 @fields = split "\t", $line;
252 if ( not defined $nc_list ) {
253 $nc_list = [ [ @fields ] ];
255 Maasha::NClist::nc_list_add( $nc_list, [ @fields ], INDEX_END, INDEX );
261 Maasha::NClist::nc_list_store( $nc_list, "$file.json" );
265 sub kiss_index_offset
267 # Martin A. Hansen, December 2009.
269 # Given a KISS index and a begin position,
270 # locate the offset closest to the begin position,
273 my ( $index, # KISS index
274 $beg, # begin position
281 Maasha::Common::error( qq(Negative begin position: "$beg") ) if $beg < 0;
283 $bucket = int( $beg / BUCKET_SIZE );
285 $bucket = scalar @{ $index } if $bucket > scalar @{ $index };
287 while ( $bucket >= 0 )
289 return $index->[ $bucket ]->[ OFFSET ] if defined $index->[ $bucket ];
298 # Martin A. Hansen, December 2009.
300 # Given a KISS index and a begin/end interval
301 # sum the number of counts in that interval,
304 my ( $index, # KISS index
305 $beg, # Begin position
313 Maasha::Common::error( qq(Negative begin position: "$beg") ) if $beg < 0;
315 $count = Maasha::NClist::nc_list_count_interval( $index, $beg, $end, INDEX_BEG, INDEX_END, INDEX );
321 sub kiss_index_get_entries
323 # Martin A. Hansen, November 2009.
325 # Given a path to a KISS file and a KISS index
326 # along with a beg/end interval, locate all entries
327 # in that interval and return those.
329 my ( $index, # KISS index
330 $beg, # interval begin
338 $features = Maasha::NClist::nc_list_get_interval( $index, $beg, $end, INDEX_BEG, INDEX_END, INDEX );
340 return wantarray ? @{ $features } : $features;
344 sub kiss_index_get_blocks
346 # Martin A. Hansen, December 2009.
348 # Given a KISS index returns blocks from this in a
349 # given position interval. Blocks consisting of
350 # hashes of BEG/END/COUNT.
352 my ( $index, # KISS index node
353 $beg, # interval begin
359 my ( $bucket_beg, $bucket_end, $i, @blocks );
361 Maasha::Common::error( qq(Negative begin position: "$beg") ) if $beg < 0;
363 $bucket_beg = int( $beg / BUCKET_SIZE );
364 $bucket_end = int( $end / BUCKET_SIZE );
366 $bucket_end = scalar @{ $index } if $bucket_end > scalar @{ $index };
368 for ( $i = $bucket_beg; $i <= $bucket_end; $i++ )
370 if ( defined $index->[ $i ] )
373 BEG => BUCKET_SIZE * $i,
374 END => BUCKET_SIZE * ( $i + 1 ),
375 COUNT => $index->[ $i ]->[ COUNT ],
380 return wantarray ? @blocks : \@blocks;
386 # Martin A. Hansen, December 2009.
388 # Given filehandles to two different unsorted KISS files
389 # intersect the entries so that all entries from file1 that
390 # overlap entries in file2 are returned - unless the inverse flag
391 # is given in which case entreis from file1 that does not
392 # overlap any entries in file2 are returned.
394 my ( $fh1, # filehandle to file1
395 $fh2, # filehandle to file2
396 $inverse, # flag indicating inverse matching - OPTIONAL
401 my ( $entry, %lookup, $pos, $overlap, @entries );
403 while ( $entry = kiss_entry_get( $fh2 ) ) {
404 map { $lookup{ $_ } = 1 } ( $entry->[ S_BEG ] .. $entry->[ S_END ] );
407 while ( $entry = kiss_entry_get( $fh1 ) )
411 foreach $pos ( $entry->[ S_BEG ] .. $entry->[ S_END ] )
413 if ( exists $lookup{ $pos } )
421 if ( $overlap and not $inverse ) {
422 push @entries, $entry;
423 } elsif ( not $overlap and $inverse ) {
424 push @entries, $entry;
428 return wantarray ? @entries : \@entries;
434 # Martin A. Hansen, November 2009.
436 # Stores a KISS index to a file.
438 my ( $path, # path to KISS index
444 Maasha::Filesys::file_store( $path, $index );
448 sub kiss_index_retrieve
450 # Martin A. Hansen, November 2009.
452 # Retrieves a KISS index from a file.
454 my ( $path, # Path to KISS index
457 # Returns a data structure.
461 $index = Maasha::NClist::nc_list_retrieve( $path );
463 return wantarray ? @{ $index } : $index;
469 # Martin A. Hansen, November 2009.
471 # Encodes alignment descriptors for two
474 my ( $s_seq, # Subject sequence reference
475 $q_seq, # Query sequence reference
476 $offset, # alternate offset - OPTIONAL
481 my ( $i, $s, $q, @align );
483 # Maasha::Common::error( "Sequence lengths don't match" ) if length ${ $s_seq } != length ${ $q_seq };
485 if ( length ${ $s_seq } != length ${ $q_seq } ) { # for unknown reasons this situation may occur - TODO FIXME
486 return wantarray ? () : [];
491 for ( $i = 0; $i < length ${ $s_seq }; $i++ )
493 $s = uc substr ${ $s_seq }, $i, 1;
494 $q = uc substr ${ $q_seq }, $i, 1;
496 if ( $s eq '-' and $q eq '-' )
506 push @align, "$offset:$s>$q";
508 $offset++ if not $s eq '-';
512 return wantarray ? @align : \@align;
518 # Martin A. Hansen, November 2009.
520 # Test routine of correct resolve of ALIGN descriptors.
522 # S.aur_COL 41 75 5_vnlh2ywXsN1/1 1 - . 17:A>T,31:A>N . . . .
524 my ( $s_seqref, # subject sequence reference
530 my ( $align, $pos, $s_symbol, $q_symbol, $s_seq, $q_seq, $insertions );
532 $s_seq = substr ${ $s_seqref }, $entry->{ 'S_BEG' }, $entry->{ 'S_END' } - $entry->{ 'S_BEG' } + 1;
537 foreach $align ( split /,/, $entry->{ 'ALIGN' } )
539 if ( $align =~ /(\d+):(.)>(.)/ )
545 if ( $s_symbol eq '-' ) # insertion
547 substr $s_seq, $pos + $insertions, 0, $s_symbol;
548 substr $q_seq, $pos + $insertions, 0, $q_symbol;
552 elsif ( $q_symbol eq '-' ) # deletion
554 substr $q_seq, $pos + $insertions, 1, $q_symbol;
558 substr $q_seq, $pos + $insertions, 1, $q_symbol;
563 Maasha::Common::error( qq(Bad alignment descriptor: "$align") );
567 Maasha::Align::align_print_pairwise( [ $entry->{ 'S_ID' }, $s_seq ], [ $entry->{ 'Q_ID' }, $q_seq ] );
573 # Martin A. Hansen, November 2009.
575 # Converts a KISS entry to a Biopiece record.
577 my ( $entry, # KISS entry
584 Maasha::Common::error( qq(BAD kiss entry) ) if not scalar @{ $entry } == 12;
586 $record{ 'S_ID' } = $entry->[ S_ID ];
587 $record{ 'S_BEG' } = $entry->[ S_BEG ];
588 $record{ 'S_END' } = $entry->[ S_END ];
589 $record{ 'Q_ID' } = $entry->[ Q_ID ];
590 $record{ 'SCORE' } = $entry->[ SCORE ];
591 $record{ 'STRAND' } = $entry->[ STRAND ];
592 $record{ 'HITS' } = $entry->[ HITS ];
593 $record{ 'ALIGN' } = $entry->[ ALIGN ];
594 $record{ 'BLOCK_COUNT' } = $entry->[ BLOCK_COUNT ];
595 $record{ 'BLOCK_BEGS' } = $entry->[ BLOCK_BEGS ];
596 $record{ 'BLOCK_LENS' } = $entry->[ BLOCK_LENS ];
597 $record{ 'BLOCK_TYPE' } = $entry->[ BLOCK_TYPE ];
599 return wantarray ? %record : \%record;
605 # Martin A. Hansen, November 2009.
607 # Converts a Biopiece record to a KISS entry.
609 my ( $record, # Biopiece record
616 if ( not defined $record->{ 'S_ID' } and
617 not defined $record->{ 'S_BEG' } and
618 not defined $record->{ 'S_END' } )
623 $entry->[ S_ID ] = $record->{ 'S_ID' };
624 $entry->[ S_BEG ] = $record->{ 'S_BEG' };
625 $entry->[ S_END ] = $record->{ 'S_END' };
626 $entry->[ Q_ID ] = $record->{ 'Q_ID' } || ".";
627 $entry->[ SCORE ] = $record->{ 'SCORE' } || $record->{ 'BIT_SCORE' } || ".";
628 $entry->[ STRAND ] = $record->{ 'STRAND' } || ".";
629 $entry->[ HITS ] = $record->{ 'HITS' } || ".";
630 $entry->[ ALIGN ] = $record->{ 'ALIGN' } || $record->{ 'DESCRIPTOR' } || ".";
631 $entry->[ BLOCK_COUNT ] = $record->{ 'BLOCK_COUNT' } || ".";
632 $entry->[ BLOCK_BEGS ] = $record->{ 'BLOCK_BEGS' } || ".";
633 $entry->[ BLOCK_LENS ] = $record->{ 'BLOCK_LENS' } || ".";
634 $entry->[ BLOCK_TYPE ] = $record->{ 'BLOCK_TYPE' } || ".";
636 return wantarray ? @{ $entry } : $entry;
640 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<