1 package Maasha::KISS::IO;
3 # Copyright (C) 2009 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 # Routines for parsing and emitting KISS records.
28 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
37 use vars qw( @ISA @EXPORT );
39 @ISA = qw( Exporter );
57 # 012345678901234567890
58 # --------------------- S.aur complete genome
59 # -===__===- TAG_000001
62 # S_ID = 'S.aur complete genome'
69 # ALIGN => 0:A>T,3:G>C
76 # 'S.aur complete genome' 3 12 'TAG_000001' 1 + 31 2 1,6 3,3 1,1
79 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
84 my ( $fh, # file handle
89 my ( $line, @fields, %entry );
91 while ( $line = <$fh> )
95 next if $line =~ /^$|^#/;
97 @fields = split /\t/, $line;
99 Maasha::Common::error( qq( BAD kiss entry: $line) ) if not @fields == 12;
101 $entry{ 'S_ID' } = $fields[ S_ID ];
102 $entry{ 'S_BEG' } = $fields[ S_BEG ];
103 $entry{ 'S_END' } = $fields[ S_END ];
104 $entry{ 'Q_ID' } = $fields[ Q_ID ];
105 $entry{ 'SCORE' } = $fields[ SCORE ];
106 $entry{ 'STRAND' } = $fields[ STRAND ];
107 $entry{ 'HITS' } = $fields[ HITS ];
108 $entry{ 'ALIGN' } = $fields[ ALIGN ];
109 $entry{ 'BLOCK_COUNT' } = $fields[ BLOCK_COUNT ];
110 $entry{ 'BLOCK_BEGS' } = $fields[ BLOCK_BEGS ];
111 $entry{ 'BLOCK_LENS' } = $fields[ BLOCK_LENS ];
112 $entry{ 'BLOCK_TYPE' } = $fields[ BLOCK_TYPE ];
114 return wantarray ? %entry : \%entry;
121 my ( $entry, # KISS entry to output
122 $fh, # file handle - OPTIONAL
131 $fields[ S_ID ] = $entry->{ 'S_ID' };
132 $fields[ S_BEG ] = $entry->{ 'S_BEG' };
133 $fields[ S_END ] = $entry->{ 'S_END' };
134 $fields[ Q_ID ] = $entry->{ 'Q_ID' };
135 $fields[ SCORE ] = $entry->{ 'SCORE' };
136 $fields[ STRAND ] = $entry->{ 'STRAND' };
137 $fields[ HITS ] = $entry->{ 'HITS' };
138 $fields[ ALIGN ] = $entry->{ 'ALIGN' };
139 $fields[ BLOCK_COUNT ] = $entry->{ 'BLOCK_COUNT' };
140 $fields[ BLOCK_BEGS ] = $entry->{ 'BLOCK_BEGS' };
141 $fields[ BLOCK_LENS ] = $entry->{ 'BLOCK_LENS' };
142 $fields[ BLOCK_TYPE ] = $entry->{ 'BLOCK_TYPE' };
144 print $fh join( "\t", @fields ), "\n";
150 my ( $dbh, # Database handle
152 $s_beg, # Subject begin
153 $s_end, # Subject end
156 my ( $sql, $entries );
158 # $sql = "SELECT * FROM $table WHERE S_BEG >= $s_beg AND S_END <= $s_end ORDER BY S_BEG,S_END";
159 $sql = "SELECT S_BEG,S_END,Q_ID,ALIGN FROM $table WHERE S_BEG >= $s_beg AND S_END <= $s_end";
161 $entries = Maasha::SQL::query_hashref_list( $dbh, $sql );
163 return wantarray ? @{ $entries } : $entries;
169 # Martin A, Hansen, October 2009.
171 # Creates an index of a sorted KISS file that
172 # allowing the location of the byte position
173 # from where records can be read given a
174 # specific S_BEG position. The index consists of
175 # triples: [ beg, end, bytepos ], where beg and
176 # end denotes the interval where the next KISS
177 # record begins at bytepos.
179 my ( $fh, # filehandle to KISS file
184 my ( $line, @fields, $beg, $end, $pos, @index );
189 while ( $line = <$fh> )
193 @fields = split /\t/, $line, 3;
195 $end = $fields[ S_BEG ];
199 push @index, [ $beg, $end, $pos ];
202 elsif ( $end > $beg )
204 push @index, [ $beg, $end - 1, $pos ];
209 Maasha::Common::error( qq(KISS file not sorted: $end < $beg) );
212 $pos += 1 + length $line;
215 return wantarray ? @index : \@index;
225 Maasha::Filesys::file_store( $path, $index );
229 sub kiss_index_retrieve
236 $index = Maasha::Filesys::file_retrieve( $path );
238 return wantarray ? @{ $index } : $index;
242 sub kiss_index_search
250 my ( $high, $low, $try );
253 $high = scalar @{ $index };
255 while ( $low <= $high )
257 $try = int( ( $high + $low ) / 2 );
259 # print "low: $low high: $high try: $try\n";
261 if ( $num < $index->[ $try ]->[ 0 ] ) {
263 } elsif ( $num > $index->[ $try ]->[ 1 ] ) {
266 return $index->[ $try ]->[ 2 ];
270 Maasha::Common::error( "Could not find number->$num in index" );
281 my ( $index, $offset, $fh, $entry, @entries );
283 $index = Maasha::KISS::IO::kiss_index_retrieve( "$file.index" );
285 $offset = Maasha::KISS::IO::kiss_index_search( $index, $beg );
287 $fh = Maasha::Filesys::file_read_open( $file );
289 sysseek( $fh, $offset, 0 );
291 while ( $entry = kiss_entry_get( $fh ) )
293 push @entries, $entry;
295 last if $entry->{ 'S_END' } > $end;
300 return wantarray ? @entries : \@entries;
306 my ( $entry, # KISS entry
309 return wantarray ? %{ $entry } : $entry;
315 my ( $record, # Biopiece record
318 $record->{ 'HITS' } ||= ".";
319 $record->{ 'BLOCK_COUNT' } ||= ".";
320 $record->{ 'BLOCK_BEGS' } ||= ".";
321 $record->{ 'BLOCK_LENS' } ||= ".";
322 $record->{ 'BLOCK_TYPE' } ||= ".";
323 $record->{ 'ALIGN' } ||= $record->{ 'DESCRIPTOR' } || ".";
325 return wantarray ? %{ $record } : $record;
329 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<