From: martinahansen Date: Tue, 9 Feb 2010 16:19:48 +0000 (+0000) Subject: moving back to binned index and using JSON for storing X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=01733a43053dfd5acbd8262b52fde377e13efd88;p=biopieces.git moving back to binned index and using JSON for storing git-svn-id: http://biopieces.googlecode.com/svn/trunk@870 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/bp_bin/BGB_intersect b/bp_bin/BGB_intersect index b6a5bfe..7bf8574 100755 --- a/bp_bin/BGB_intersect +++ b/bp_bin/BGB_intersect @@ -37,7 +37,7 @@ use Maasha::KISS; # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -my ( $user, $options, $in, $out, $record, @contigs, $contig, $track, @tracks, %hash, $fh1, $fh2, @records ); +my ( $user, $options, $in, $out, $record, @contigs, $contig, $track, @tracks, %hash, $fh1, $fh2, $entries, $entry ); $user = Maasha::Biopieces::biopiecesrc( "BGB_USER" ); @@ -82,12 +82,19 @@ foreach $contig ( @contigs ) if ( exists $hash{ '1' } and exists $hash{ '2' } ) { + print STDERR "Intersecting:\n$hash{ '1' } and\n$hash{ '2' }\n\n" if $options->{ 'verbose' }; + $fh1 = Maasha::Filesys::file_read_open( "$hash{ '1' }/track_data.kiss" ); $fh2 = Maasha::Filesys::file_read_open( "$hash{ '2' }/track_data.kiss" ); - @records = Maasha::KISS::kiss_intersect( $fh1, $fh2, $options->{ 'invert' } ); + $entries = Maasha::KISS::kiss_intersect( $fh1, $fh2, $options->{ 'invert' } ); - map { Maasha::Biopieces::put_record( $_, $out ) } @records; + foreach $entry ( @{ $entries } ) + { + if ( $record = Maasha::KISS::kiss2biopiece( $entry ) ) { + Maasha::Biopieces::put_record( $record, $out ); + } + } } } diff --git a/bp_bin/BGB_upload b/bp_bin/BGB_upload index 33cc2e5..7f77995 100755 --- a/bp_bin/BGB_upload +++ b/bp_bin/BGB_upload @@ -114,7 +114,6 @@ if ( $options->{ 'track_name' } ) Maasha::KISS::kiss_sort( "$dst_dir/track_data.kiss" ); Maasha::KISS::kiss_index( "$dst_dir/track_data.kiss" ); - unlink "$dst_dir/track_data.kiss"; unlink "$tmp_dir/$key"; } } diff --git a/code_perl/Maasha/BGB/Track.pm b/code_perl/Maasha/BGB/Track.pm index a7bed79..7376d47 100644 --- a/code_perl/Maasha/BGB/Track.pm +++ b/code_perl/Maasha/BGB/Track.pm @@ -180,7 +180,7 @@ sub track_feature $start = $cookie->{ 'NAV_START' }; $end = $cookie->{ 'NAV_END' }; - $index = Maasha::KISS::kiss_index_retrieve( "$track/track_data.kiss.json" ); + $index = Maasha::KISS::kiss_index_retrieve( "$track/track_data.kiss.index" ); $count = Maasha::KISS::kiss_index_count( $index, $start, $end ); $track_name = ( split "/", $track )[ -1 ]; @@ -200,12 +200,12 @@ sub track_feature if ( $count > $cookie->{ 'FEAT_MAX' } ) { -# $entries = Maasha::KISS::kiss_index_get_blocks( $index, $start, $end ); -# push @{ $features }, track_feature_histogram( $cookie, $start, $end, $entries ); + $entries = Maasha::KISS::kiss_index_get_blocks( $index, $start, $end ); + push @{ $features }, track_feature_histogram( $cookie, $start, $end, $entries ); } else { - $entries = Maasha::KISS::kiss_index_get_entries( $index, $start, $end ); + $entries = Maasha::KISS::kiss_index_get_entries( "$track/track_data.kiss", $index, $start, $end ); push @{ $features }, track_feature_linear( $cookie, $start, $end, $entries ); } @@ -302,7 +302,7 @@ sub feature_align if ( $w >= 1 ) { - foreach $align ( split /,/, $entry->{ 'ALIGN' } ) + foreach $align ( split /,/, $entry->[ ALIGN ] ) { if ( $align =~ /(\d+):([ATCGN-])>([ATCGN-])/ ) { @@ -559,7 +559,7 @@ sub search_tracks # Returns a list. - my ( $search_track, $search_term, $contig, @tracks, $track, $file, @features, $track_name, $nc_list ); + my ( $search_track, $search_term, $contig, @tracks, $track, $file, $line, $out_file, $fh, $entry, @entries, $track_name ); if ( $cookie->{ 'SEARCH' } =~ /^(.+)\s+track:\s*(.+)/i ) { @@ -589,16 +589,28 @@ sub search_tracks next if $track_name !~ /$search_track/i; } - $file = "$track/track_data.kiss.json"; + $file = "$track/track_data.kiss"; if ( -f $file ) { - $nc_list = Maasha::NClist::nc_list_retrieve( $file ); - push @features, Maasha::NClist::nc_list_search( $nc_list, $search_term, 12 ); + $fh = Maasha::Filesys::file_read_open( $file ); + + while ( $line = <$fh> ) + { + chomp $line; + + if ( $line =~ /$search_term/i ) + { + $entry = Maasha::KISS::kiss_entry_parse( $line ); + push @entries, $entry; + } + } + + close $fh; } } - return wantarray ? @features : \@features; + return wantarray ? @entries : \@entries; } @@ -606,3 +618,59 @@ sub search_tracks 1; +__END__ + + +sub search_tracks_nc +{ + # Martin A. Hansen, December 2009. + + # Uses grep to search all tracks in all contigs + # for a given pattern and return a list of KISS entries. + + my ( $cookie, # cookie hash + ) = @_; + + # Returns a list. + + my ( $search_track, $search_term, $contig, @tracks, $track, $file, @features, $track_name, $nc_list ); + + if ( $cookie->{ 'SEARCH' } =~ /^(.+)\s+track:\s*(.+)/i ) + { + $search_term = $1; + $search_track = $2; + + $search_track =~ tr/ /_/; + } + else + { + $search_term = $cookie->{ 'SEARCH' }; + } + + foreach $contig ( @{ $cookie->{ 'LIST_CONTIG' } } ) + { + $cookie->{ 'CONTIG' } = $contig; + + push @tracks, path_tracks( $cookie ); + } + + foreach $track ( @tracks ) + { + if ( $search_track ) + { + $track_name = ( split "/", $track )[ -1 ]; + + next if $track_name !~ /$search_track/i; + } + + $file = "$track/track_data.kiss.json"; + + if ( -f $file ) + { + $nc_list = Maasha::NClist::nc_list_retrieve( $file ); + push @features, Maasha::NClist::nc_list_search( $nc_list, $search_term, 12 ); + } + } + + return wantarray ? @features : \@features; +} diff --git a/code_perl/Maasha/KISS.pm b/code_perl/Maasha/KISS.pm index 110d701..e4ac684 100644 --- a/code_perl/Maasha/KISS.pm +++ b/code_perl/Maasha/KISS.pm @@ -33,9 +33,9 @@ package Maasha::KISS; use warnings; use strict; use Data::Dumper; +use JSON::XS; use Maasha::Common; use Maasha::Filesys; -use Maasha::NClist; use Maasha::Align; use vars qw( @ISA @EXPORT ); @@ -104,8 +104,6 @@ sub kiss_entry_parse { # Martin A. Hansen, December 2009. - # TODO find out what uses this and kill it! - # Parses a line with a KISS entry. my ( $line, # KISS line to parse @@ -119,20 +117,7 @@ sub kiss_entry_parse Maasha::Common::error( qq(BAD kiss entry: $line) ) if not @fields == 12; - $entry{ 'S_ID' } = $fields[ S_ID ]; - $entry{ 'S_BEG' } = $fields[ S_BEG ]; - $entry{ 'S_END' } = $fields[ S_END ]; - $entry{ 'Q_ID' } = $fields[ Q_ID ]; - $entry{ 'SCORE' } = $fields[ SCORE ]; - $entry{ 'STRAND' } = $fields[ STRAND ]; - $entry{ 'HITS' } = $fields[ HITS ]; - $entry{ 'ALIGN' } = $fields[ ALIGN ]; - $entry{ 'BLOCK_COUNT' } = $fields[ BLOCK_COUNT ]; - $entry{ 'BLOCK_BEGS' } = $fields[ BLOCK_BEGS ]; - $entry{ 'BLOCK_LENS' } = $fields[ BLOCK_LENS ]; - $entry{ 'BLOCK_TYPE' } = $fields[ BLOCK_TYPE ]; - - return wantarray ? %entry : \%entry; + return wantarray ? @fields : \@fields; } @@ -193,13 +178,13 @@ sub kiss_sort } -sub kiss_index_old +sub kiss_index { # Martin A. Hansen, December 2009. # Creates a lookup index of a sorted KISS file. - my ( $file, # path to KISS file + my ( $file, # path to KISS file ) = @_; # Returns nothing. @@ -228,72 +213,72 @@ sub kiss_index_old } -sub kiss_index +sub kiss_index_offset { - # Martin A. Hansen, February 2010. + # Martin A. Hansen, December 2009. - # Creates a NC list index of a sorted KISS file. + # Given a KISS index and a begin position, + # locate the offset closest to the begin position, + # and return this. - my ( $file, # path to KISS file + my ( $index, # KISS index + $beg, # begin position ) = @_; - # Returns nothing. - - my ( $fh, $line, @fields, $nc_list ); + # Returns a number. - $fh = Maasha::Filesys::file_read_open( $file ); + my ( $bucket ); - while ( $line = <$fh> ) - { - chomp $line; + Maasha::Common::error( qq(Negative begin position: "$beg") ) if $beg < 0; - @fields = split "\t", $line; + $bucket = int( $beg / BUCKET_SIZE ); - if ( not defined $nc_list ) { - $nc_list = [ [ @fields ] ]; - } else { - Maasha::NClist::nc_list_add( $nc_list, [ @fields ], INDEX_END, INDEX ); - } - } + $bucket = scalar @{ $index } if $bucket > scalar @{ $index }; - close $fh; + while ( $bucket >= 0 ) + { + return $index->[ $bucket ]->[ OFFSET ] if defined $index->[ $bucket ]; - Maasha::NClist::nc_list_store( $nc_list, "$file.json" ); + $bucket--; + } } -sub kiss_index_offset +sub kiss_index_count { # Martin A. Hansen, December 2009. - # Given a KISS index and a begin position, - # locate the offset closest to the begin position, + # Given a KISS index and a begin/end interval + # sum the number of counts in that interval, # and return this. - my ( $index, # KISS index - $beg, # begin position + my ( $index, # KISS index + $beg, # Begin position + $end, # End position ) = @_; # Returns a number. - my ( $bucket ); + my ( $bucket_beg, $bucket_end, $count, $i ); Maasha::Common::error( qq(Negative begin position: "$beg") ) if $beg < 0; - $bucket = int( $beg / BUCKET_SIZE ); + $bucket_beg = int( $beg / BUCKET_SIZE ); + $bucket_end = int( $end / BUCKET_SIZE ); - $bucket = scalar @{ $index } if $bucket > scalar @{ $index }; + $bucket_end = scalar @{ $index } if $bucket_end > scalar @{ $index }; - while ( $bucket >= 0 ) - { - return $index->[ $bucket ]->[ OFFSET ] if defined $index->[ $bucket ]; + $count = 0; - $bucket--; + for ( $i = $bucket_beg; $i <= $bucket_end; $i++ ) { + $count += $index->[ $i ]->[ COUNT ] if defined $index->[ $i ]; } + + return $count; } -sub kiss_index_count +sub kiss_index_count_nc { # Martin A. Hansen, December 2009. @@ -326,18 +311,69 @@ sub kiss_index_get_entries # along with a beg/end interval, locate all entries # in that interval and return those. - my ( $index, # KISS index + my ( $file, # path to KISS file + $index, # KISS index $beg, # interval begin $end, # interval end ) = @_; # Returns a list. - my ( $features ); + my ( $offset, $fh, $entry, @entries ); - $features = Maasha::NClist::nc_list_get_interval( $index, $beg, $end, INDEX_BEG, INDEX_END, INDEX ); + # $offset = kiss_index_offset( $index, $beg ); - return wantarray ? @{ $features } : $features; + $fh = Maasha::Filesys::file_read_open( $file ); + + # sysseek( $fh, $offset, 0 ); + + while ( $entry = Maasha::KISS::kiss_entry_get( $fh ) ) + { + push @entries, $entry if $entry->[ S_END ] > $beg; + + last if $entry->[ S_BEG ] > $end; + } + + close $fh; + + return wantarray ? @entries : \@entries; +} + + +sub kiss_index_get_entries_OLD +{ + # Martin A. Hansen, November 2009. + + # Given a path to a KISS file and a KISS index + # along with a beg/end interval, locate all entries + # in that interval and return those. + + my ( $file, # path to KISS file + $index, # KISS index + $beg, # interval begin + $end, # interval end + ) = @_; + + # Returns a list. + + my ( $offset, $fh, $entry, @entries ); + + $offset = kiss_index_offset( $index, $beg ); + + $fh = Maasha::Filesys::file_read_open( $file ); + + sysseek( $fh, $offset, 0 ); + + while ( $entry = Maasha::KISS::kiss_entry_get( $fh ) ) + { + push @entries, $entry if $entry->[ S_END ] > $beg; + + last if $entry->[ S_BEG ] > $end; + } + + close $fh; + + return wantarray ? @entries : \@entries; } @@ -441,7 +477,15 @@ sub kiss_index_store # Returns nothing. - Maasha::Filesys::file_store( $path, $index ); + my ( $fh, $json ); + + $json = JSON::XS::encode_json( $index ); + + $fh = Maasha::Filesys::file_write_open( $path ); + + print $fh $json; + + close $fh; } @@ -456,9 +500,17 @@ sub kiss_index_retrieve # Returns a data structure. - my ( $index ); + my ( $fh, $json, $index ); - $index = Maasha::NClist::nc_list_retrieve( $path ); + local $/ = undef; + + $fh = Maasha::Filesys::file_read_open( $path ); + + $json = <$fh>; + + close $fh; + + $index = JSON::XS::decode_json( $json ); return wantarray ? @{ $index } : $index; } @@ -641,3 +693,79 @@ sub biopiece2kiss 1; +__END__ + +sub kiss_index_nc +{ + # Martin A. Hansen, February 2010. + + # Creates a NC list index of a sorted KISS file. + + my ( $file, # path to KISS file + ) = @_; + + # Returns nothing. + + my ( $fh, $line, @fields, $nc_list ); + + $fh = Maasha::Filesys::file_read_open( $file ); + + while ( $line = <$fh> ) + { + chomp $line; + + @fields = split "\t", $line; + + if ( not defined $nc_list ) { + $nc_list = [ [ @fields ] ]; + } else { + Maasha::NClist::nc_list_add( $nc_list, [ @fields ], INDEX_END, INDEX ); + } + } + + close $fh; + + Maasha::NClist::nc_list_store( $nc_list, "$file.json" ); +} + + +sub kiss_index_get_entries_nc +{ + # Martin A. Hansen, November 2009. + + # Given a path to a KISS file and a KISS index + # along with a beg/end interval, locate all entries + # in that interval and return those. + + my ( $index, # KISS index + $beg, # interval begin + $end, # interval end + ) = @_; + + # Returns a list. + + my ( $features ); + + $features = Maasha::NClist::nc_list_get_interval( $index, $beg, $end, INDEX_BEG, INDEX_END, INDEX ); + + return wantarray ? @{ $features } : $features; +} + + +sub kiss_index_retrieve_nc +{ + # Martin A. Hansen, November 2009. + + # Retrieves a KISS index from a file. + + my ( $path, # Path to KISS index + ) = @_; + + # Returns a data structure. + + my ( $index ); + + $index = Maasha::NClist::nc_list_retrieve( $path ); + + return wantarray ? @{ $index } : $index; +}