1 package Maasha::KISS::IO;
3 # Copyright (C) 2009 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 # Routines for parsing and emitting KISS records.
27 # http://code.google.com/p/biopieces/wiki/KissFormat
30 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
39 use vars qw( @ISA @EXPORT );
41 @ISA = qw( Exporter );
59 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
64 my ( $fh, # file handle
69 my ( $line, @fields, %entry );
71 while ( $line = <$fh> )
75 next if $line =~ /^$|^#/;
77 @fields = split /\t/, $line;
79 Maasha::Common::error( qq( BAD kiss entry: $line) ) if not @fields == 12;
81 $entry{ 'S_ID' } = $fields[ S_ID ];
82 $entry{ 'S_BEG' } = $fields[ S_BEG ];
83 $entry{ 'S_END' } = $fields[ S_END ];
84 $entry{ 'Q_ID' } = $fields[ Q_ID ];
85 $entry{ 'SCORE' } = $fields[ SCORE ];
86 $entry{ 'STRAND' } = $fields[ STRAND ];
87 $entry{ 'HITS' } = $fields[ HITS ];
88 $entry{ 'ALIGN' } = $fields[ ALIGN ];
89 $entry{ 'BLOCK_COUNT' } = $fields[ BLOCK_COUNT ];
90 $entry{ 'BLOCK_BEGS' } = $fields[ BLOCK_BEGS ];
91 $entry{ 'BLOCK_LENS' } = $fields[ BLOCK_LENS ];
92 $entry{ 'BLOCK_TYPE' } = $fields[ BLOCK_TYPE ];
94 return wantarray ? %entry : \%entry;
101 my ( $entry, # KISS entry to output
102 $fh, # file handle - OPTIONAL
111 $fields[ S_ID ] = $entry->{ 'S_ID' };
112 $fields[ S_BEG ] = $entry->{ 'S_BEG' };
113 $fields[ S_END ] = $entry->{ 'S_END' };
114 $fields[ Q_ID ] = $entry->{ 'Q_ID' };
115 $fields[ SCORE ] = $entry->{ 'SCORE' };
116 $fields[ STRAND ] = $entry->{ 'STRAND' };
117 $fields[ HITS ] = $entry->{ 'HITS' };
118 $fields[ ALIGN ] = $entry->{ 'ALIGN' };
119 $fields[ BLOCK_COUNT ] = $entry->{ 'BLOCK_COUNT' };
120 $fields[ BLOCK_BEGS ] = $entry->{ 'BLOCK_BEGS' };
121 $fields[ BLOCK_LENS ] = $entry->{ 'BLOCK_LENS' };
122 $fields[ BLOCK_TYPE ] = $entry->{ 'BLOCK_TYPE' };
124 print $fh join( "\t", @fields ), "\n";
130 my ( $dbh, # Database handle
132 $s_beg, # Subject begin
133 $s_end, # Subject end
136 my ( $sql, $entries );
138 # $sql = "SELECT * FROM $table WHERE S_BEG >= $s_beg AND S_END <= $s_end ORDER BY S_BEG,S_END";
139 $sql = "SELECT S_BEG,S_END,Q_ID,ALIGN FROM $table WHERE S_BEG >= $s_beg AND S_END <= $s_end";
141 $entries = Maasha::SQL::query_hashref_list( $dbh, $sql );
143 return wantarray ? @{ $entries } : $entries;
149 # Martin A, Hansen, October 2009.
151 # Creates an index of a sorted KISS file that
152 # allowing the location of the byte position
153 # from where records can be read given a
154 # specific S_BEG position. The index consists of
155 # triples: [ beg, end, bytepos ], where beg and
156 # end denotes the interval where the next KISS
157 # record begins at bytepos.
159 my ( $fh, # filehandle to KISS file
164 my ( $line, @fields, $beg, $end, $pos, @index );
169 while ( $line = <$fh> )
173 @fields = split /\t/, $line, 3;
175 $end = $fields[ S_BEG ];
179 push @index, [ $beg, $end, $pos ];
182 elsif ( $end > $beg )
184 push @index, [ $beg, $end - 1, $pos ];
189 Maasha::Common::error( qq(KISS file not sorted: $end < $beg) );
192 $pos += 1 + length $line;
195 return wantarray ? @index : \@index;
205 Maasha::Filesys::file_store( $path, $index );
209 sub kiss_index_retrieve
216 $index = Maasha::Filesys::file_retrieve( $path );
218 return wantarray ? @{ $index } : $index;
222 sub kiss_index_search
230 my ( $high, $low, $try );
233 $high = scalar @{ $index };
235 while ( $low <= $high )
237 $try = int( ( $high + $low ) / 2 );
239 if ( $num < $index->[ $try ]->[ 0 ] ) {
241 } elsif ( $num > $index->[ $try ]->[ 1 ] ) {
244 return $index->[ $try ]->[ 2 ];
248 Maasha::Common::error( "Could not find number->$num in index" );
259 my ( $index, $offset, $fh, $entry, @entries );
261 $index = Maasha::KISS::IO::kiss_index_retrieve( "$file.index" );
263 $offset = Maasha::KISS::IO::kiss_index_search( $index, $beg );
265 $fh = Maasha::Filesys::file_read_open( $file );
267 sysseek( $fh, $offset, 0 );
269 while ( $entry = kiss_entry_get( $fh ) )
271 push @entries, $entry;
273 last if $entry->{ 'S_END' } > $end;
278 return wantarray ? @entries : \@entries;
284 # S.aur_COL 41 75 5_vnlh2ywXsN1/1 1 - . 17:A>T,31:A>N . . . .
286 my ( $s_seqref, # subject sequence reference
292 my ( $align, $pos, $s_symbol, $q_symbol, $s_seq, $q_seq, $insertions );
294 $s_seq = substr ${ $s_seqref }, $entry->{ 'S_BEG' }, $entry->{ 'S_END' } - $entry->{ 'S_BEG' } + 1;
299 foreach $align ( split /,/, $entry->{ 'ALIGN' } )
301 if ( $align =~ /(\d+):(.)>(.)/ )
307 if ( $s_symbol eq '-' ) # insertion
309 substr $s_seq, $pos + $insertions, 0, $s_symbol;
310 substr $q_seq, $pos + $insertions, 0, $q_symbol;
314 elsif ( $q_symbol eq '-' ) # deletion
316 substr $q_seq, $pos + $insertions, 1, $q_symbol;
320 substr $q_seq, $pos + $insertions, 1, $q_symbol;
325 Maasha::Common::error( qq(Bad alignment descriptor: "$align") );
329 print Dumper( "s_seq: $s_seq", "q_seq: $q_seq" ) and exit;
335 my ( $sam_entry, # SAM entry
340 my ( $cigar, $offset, $op, $len, @descriptors, $kiss_entry );
342 $cigar = $sam_entry->{ 'CIGAR' };
348 while ( length $cigar > 0 )
350 if ( $cigar =~ s/^([MINDSHP])(\d+)// )
355 print "CIGAR: $cigar OP: $op LEN: $len\n";
370 Maasha::Common::error( qq(Bad CIGAR format: "$cigar") );
374 return wantarray ? %{ $kiss_entry } : $kiss_entry;
380 my ( $entry, # KISS entry
383 return wantarray ? %{ $entry } : $entry;
389 my ( $record, # Biopiece record
392 $record->{ 'HITS' } ||= ".";
393 $record->{ 'BLOCK_COUNT' } ||= ".";
394 $record->{ 'BLOCK_BEGS' } ||= ".";
395 $record->{ 'BLOCK_LENS' } ||= ".";
396 $record->{ 'BLOCK_TYPE' } ||= ".";
397 $record->{ 'ALIGN' } ||= $record->{ 'DESCRIPTOR' } || ".";
399 return wantarray ? %{ $record } : $record;
403 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<