1 package Maasha::NClist;
3 # Copyright (C) 2010 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 # Nested Containment List (NCList): A new algorithm for accelerating
26 # interval query of genome alignment and interval databases
27 # http://bioinformatics.oxfordjournals.org/cgi/content/abstract/btl647v1
29 # Nested lists are composed of intervals defined by begin and end positions,
30 # and any interval that is contained within another interval is rooted to this.
31 # Thus, fast interval lookups can be performed using binary search.
34 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
45 use vars qw( @ISA @EXPORT );
47 @ISA = qw( Exporter );
50 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SUBROUTINES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
55 # Martin A. Hansen, February 2010.
57 # Creates a Nested Containment (NC) list from a stack of features.
58 # The features consits of an AoA where beg and end specifies the
59 # elements containing the begin and end position of a feature, and
60 # index specifies a element used for nesting lists.
62 my ( $features, # list of features AoA
63 $beg, # feature element with begin position
64 $end, # feature element with end position
65 $index, # feature element with index position
72 @{ $features } = sort { $a->[ $beg ] <=> $b->[ $beg ] or $b->[ $end ] <=> $a->[ $end ] } @{ $features };
74 $nc_list = [ shift @{ $features } ];
76 map { nc_list_add( $nc_list, $_, $end, $index ) } @{ $features };
78 return wantarray ? @{ $nc_list } : $nc_list;
84 # Martin A. Hansen, February 2010.
86 # Recursively construct a Nested Containment (NC) list by adding
87 # a given feature to an existing NC list.
89 my ( $nc_list, # NC list
90 $feat, # feature (AoA)
91 $end, # feature element with end position
92 $index # feature element with index position
97 if ( $feat->[ $end ] <= $nc_list->[ -1 ]->[ $end ] ) # feature is nested.
99 if ( defined $nc_list->[ -1 ]->[ $index ] ) { # sublist exists so recurse to this.
100 nc_list_add( $nc_list->[ -1 ]->[ $index ], $feat, $end, $index );
102 $nc_list->[ -1 ]->[ $index ] = [ $feat ]; # creating a new sublist.
107 push @{ $nc_list }, $feat;
114 # Martin A. Hansen, February 2010.
116 # Given a Nested Containment (NC) list use binary search to locate
117 # the NC list containing a given search position. The index of the NC
118 # list containing the search position is returned. Extremely fast.
120 my ( $nc_list, # NC list
121 $pos, # search position
122 $beg, # feature element with begin position
123 $end, # feature element with end position
124 $index, # feature element with index position
127 # Returns an integer.
129 my ( $low, $high, $try );
132 $high = scalar @{ $nc_list };
134 while ( $low < $high )
136 $try = int( ( $high + $low ) / 2 );
138 if ( $pos < $nc_list->[ $try ]->[ $beg ] ) {
140 } elsif ( $nc_list->[ $try ]->[ $end ] < $pos ) {
153 # Martin A. Hansen, February 2010.
155 # Traverses a Nested Containment (NC) list recursively from a
156 # given index begin to a given index end and counts all
157 # features. The count is returned.
159 my ( $nc_list, # NC list
160 $index_beg, # index begin
161 $index_end, # index end
162 $index, # feature element with index position
165 # Returns an integer.
169 for ( $i = $index_beg; $i <= $index_end; $i++ )
173 if ( defined $nc_list->[ $i ]->[ $index ] ) { # sublist exists so recurse to this.
174 $count += nc_list_count( $nc_list->[ $i ]->[ $index ], 0, scalar @{ $nc_list->[ $i ]->[ $index ] } - 1, $index );
182 sub nc_list_count_interval
184 # Martin A. Hansen, February 2010.
186 # Counts all features in a Nested Containment (NC) list within a
187 # specified interval. The count is returned.
189 my ( $nc_list, # NC list
190 $int_beg, # interval begin
191 $int_end, # interval end
192 $beg, # feature element with begin position
193 $end, # feature element with end position
194 $index, # feature element with index position
197 # Returns an integer.
199 my ( $index_beg, $index_end, $count );
201 $index_beg = nc_list_lookup( $nc_list, $int_beg, $beg, $end, $index );
202 $index_end = nc_list_lookup( $nc_list, $int_end, $beg, $end, $index );
204 $count = nc_list_count( $nc_list, $index_beg, $index_end, $index );
212 # Martin A. Hansen, February 2010.
214 # Recursively retrieve all features from a Nested Containment (NC) list
215 # from a specified index begin to a specified index end.
217 # WARNING: The NC list is distroyed because the sublists are stripped.
219 my ( $nc_list, # NC list
220 $index_beg, # index begin
221 $index_end, # index end
222 $index, # feature element with index position
227 my ( $i, @features );
229 for ( $i = $index_beg; $i <= $index_end; $i++ )
231 push @features, $nc_list->[ $i ];
233 if ( defined $nc_list->[ $i ]->[ $index ] ) # sublist exists so recurse to this.
235 push @features, nc_list_get( $nc_list->[ $i ]->[ $index ], 0, scalar @{ $nc_list->[ $i ]->[ $index ] } - 1, $index );
237 delete $nc_list->[ $i ]->[ $index ];
241 return wantarray ? @features : \@features;
245 sub nc_list_get_interval
247 # Martin A. Hansen, February 2010.
249 # Retrieve all features from a Nested Containment (NC) list from within
250 # a specified interval.
252 my ( $nc_list, # NC list
253 $int_beg, # interval begin
254 $int_end, # interval end
255 $beg, # feature element with begin position
256 $end, # feature element with end position
257 $index, # feature element with index position
262 my ( $index_beg, $index_end, $features );
264 $index_beg = nc_list_lookup( $nc_list, $int_beg, $beg, $end, $index );
265 $index_end = nc_list_lookup( $nc_list, $int_end, $beg, $end, $index );
267 $features = nc_list_get( $nc_list, $index_beg, $index_end, $index );
269 return wantarray ? @{ $features } : $features;
275 # Martin A. Hansen, February 2010.
277 # Recursively search a Nested Containment (NC) list for features matching
280 my ( $nc_list, # NC list
281 $regex, # regex to search for
282 $index, # feature element with index position
287 my ( $feature, @features );
289 foreach $feature ( @{ $nc_list } )
291 push @features, $feature if grep { $_ =~ /$regex/i if defined $_ } @{ $feature };
293 if ( defined $feature->[ $index ] ) # sublist exists so recurse to this.
295 push @features, nc_list_search( $feature->[ $index ], $regex, $index );
297 delete $feature->[ $index ];
301 return wantarray ? @features : \@features;
307 # Martin A. Hansen, February 2010.
309 # Store a Nested Containment (NC) list to a
312 my ( $nc_list, # NC list
313 $file, # path to file
320 $json = JSON::XS::encode_json( $nc_list );
322 $fh = Maasha::Filesys::file_write_open( $file );
332 # Martin A. Hansen, February 2010.
334 # Retrieves a Nested Containment (NC) list from a
337 my ( $file, # path to JSON file
342 my ( $fh, $json, $nc_list );
346 $fh = Maasha::Filesys::file_read_open( $file );
352 $nc_list = JSON::XS::decode_json( $json );
354 return wantarray ? @{ $nc_list } : $nc_list;
358 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
365 my $t0 = Time::HiRes::gettimeofday();
366 my $t1 = Time::HiRes::gettimeofday(); print STDERR "Time: " . ( $t1 - $t0 ) . "\n";