From 0e46b9f634fa265d6d677df14ab9237aee70ce64 Mon Sep 17 00:00:00 2001 From: martinahansen Date: Mon, 14 Jul 2008 06:39:05 +0000 Subject: [PATCH] added create_fixedstep_index git-svn-id: http://biopieces.googlecode.com/svn/trunk@147 74ccb610-7750-0410-82ae-013aeee3265d --- bp_bin/create_fixedstep_index | 6 + code_perl/Maasha/Biopieces.pm | 55 ++++++- code_perl/Maasha/UCSC.pm | 302 +++++++++++++++++----------------- 3 files changed, 210 insertions(+), 153 deletions(-) create mode 100755 bp_bin/create_fixedstep_index diff --git a/bp_bin/create_fixedstep_index b/bp_bin/create_fixedstep_index new file mode 100755 index 0000000..4cd1d44 --- /dev/null +++ b/bp_bin/create_fixedstep_index @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biopieces; diff --git a/code_perl/Maasha/Biopieces.pm b/code_perl/Maasha/Biopieces.pm index 9d01329..8a5737a 100644 --- a/code_perl/Maasha/Biopieces.pm +++ b/code_perl/Maasha/Biopieces.pm @@ -214,6 +214,7 @@ sub run_script elsif ( $script eq "blat_seq" ) { script_blat_seq( $in, $out, $options ) } elsif ( $script eq "match_seq" ) { script_match_seq( $in, $out, $options ) } elsif ( $script eq "create_vmatch_index" ) { script_create_vmatch_index( $in, $out, $options ) } + elsif ( $script eq "create_fixedstep_index" ) { script_create_fixedstep_index( $in, $out, $options ) } elsif ( $script eq "vmatch_seq" ) { script_vmatch_seq( $in, $out, $options ) } elsif ( $script eq "write_fasta" ) { script_write_fasta( $in, $out, $options ) } elsif ( $script eq "write_align" ) { script_write_align( $in, $out, $options ) } @@ -580,6 +581,13 @@ sub get_options no_stream|x ); } + elsif ( $script eq "create_fixedstep_index" ) + { + @options = qw( + index_name|i=s + no_stream|x + ); + } elsif ( $script eq "vmatch_seq" ) { @options = qw( @@ -927,7 +935,7 @@ sub get_options # print STDERR Dumper( \%options ); - if ( scalar( keys %options ) == 1 or $options{ "help" } ) { + if ( -t STDIN && scalar( keys %options ) == 1 or $options{ "help" } ) { return wantarray ? %options : \%options; } @@ -999,7 +1007,7 @@ sub get_options } Maasha::Common::error( qq(no --database specified) ) if $script eq "create_blast_db" and not $options{ "database" }; - Maasha::Common::error( qq(no --index_name specified) ) if $script eq "create_vmatch_index" and not $options{ "index_name" }; + Maasha::Common::error( qq(no --index_name specified) ) if $script =~ /create_vmatch_index|create_fixedstep_index/ and not $options{ "index_name" }; Maasha::Common::error( qq(no --database or --genome specified) ) if $script eq "blast_seq" and not $options{ "genome" } and not $options{ "database" }; Maasha::Common::error( qq(both --database and --genome specified) ) if $script eq "blast_seq" and $options{ "genome" } and $options{ "database" }; Maasha::Common::error( qq(no --index_name or --genome specified) ) if $script eq "vmatch_seq" and not $options{ "genome" } and not $options{ "index_name" }; @@ -3627,6 +3635,49 @@ sub script_create_vmatch_index } +sub script_create_fixedstep_index +{ + # Martin A. Hansen, January 2008. + + # Create a fixedStep index from records in the stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $file_tmp, $fh_tmp, $vals, $index ); + + $file_tmp = "$BP_TMP/fixedstep.tmp"; + + $fh_tmp = Maasha::Common::write_open( $file_tmp ); + + while ( $record = get_record( $in ) ) + { + if ( $record->{ "CHR" } and $record->{ "CHR_BEG" } and $record->{ "STEP" } and $record->{ "VALS" } ) + { + print $fh_tmp "fixedStep chrom=$record->{ 'CHR' } start=$record->{ 'CHR_BEG' } step=$record->{ 'STEP' }\n"; + + $vals = $record->{ 'VALS' }; + + $vals =~ tr/,/\n/; + + print $fh_tmp "$vals\n"; + } + } + + close $fh_tmp; + + $index = Maasha::UCSC::fixedstep_index_create( $file_tmp ); + + unlink $file_tmp; + + Maasha::UCSC::fixedstep_index_store( $options->{ 'index_name' }, $index ); +} + + sub script_vmatch_seq { # Martin A. Hansen, August 2007. diff --git a/code_perl/Maasha/UCSC.pm b/code_perl/Maasha/UCSC.pm index 4f932e3..239dea1 100644 --- a/code_perl/Maasha/UCSC.pm +++ b/code_perl/Maasha/UCSC.pm @@ -868,154 +868,15 @@ sub fixedstep_get_entry } -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PhastCons format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - -sub phastcons_parse_entry -{ - # Martin A. Hansen, December 2007. - - # Given a PhastCons entry converts this to a - # list of super blocks. - - my ( $lines, # list of lines - $args, # argument hash - ) = @_; - - # Returns - - my ( $info, $chr, $beg, $step, $i, $c, $j, @blocks, @super_blocks, @entries, $super_block, $block, @lens, @begs ); - - $info = shift @{ $lines }; - - if ( $info =~ /^chrom=([^ ]+) start=(\d+) step=(\d+)$/ ) - { - $chr = $1; - $beg = $2; - $step = $3; - - die qq(ERROR: step size $step != 1 -> problem!\n) if $step != 1; # in an ideal world should would be fixed ... - } - - $i = 0; - - while ( $i < @{ $lines } ) - { - if ( $lines->[ $i ] >= $args->{ "threshold" } ) - { - $c = $i + 1; - - while ( $c < @{ $lines } ) - { - if ( $lines->[ $c ] < $args->{ "threshold" } ) - { - $j = $c + 1; - - while ( $j < @{ $lines } and $lines->[ $j ] < $args->{ "threshold" } ) { - $j++; - } - - if ( $j - $c > $args->{ "gap" } ) - { - if ( $c - $i >= $args->{ "min" } ) - { - push @blocks, { - CHR => $chr, - CHR_BEG => $beg + $i - 1, - CHR_END => $beg + $c - 2, - CHR_LEN => $c - $i, - }; - } - - $i = $j; - - last; - } - - $c = $j - } - else - { - $c++; - } - } - - if ( $c - $i >= $args->{ "min" } ) - { - push @blocks, { - CHR => $chr, - CHR_BEG => $beg + $i - 1, - CHR_END => $beg + $c - 2, - CHR_LEN => $c - $i, - }; - } - - $i = $c; - } - else - { - $i++; - } - } - - $i = 0; - - while ( $i < @blocks ) - { - $c = $i + 1; - - while ( $c < @blocks and $blocks[ $c ]->{ "CHR_BEG" } - $blocks[ $c - 1 ]->{ "CHR_END" } <= $args->{ "dist" } ) - { - $c++; - } - - push @super_blocks, [ @blocks[ $i .. $c - 1 ] ]; - - $i = $c; - } - - foreach $super_block ( @super_blocks ) - { - foreach $block ( @{ $super_block } ) - { - push @begs, $block->{ "CHR_BEG" } - $super_block->[ 0 ]->{ "CHR_BEG" }; - push @lens, $block->{ "CHR_LEN" } - 1; - } - - $lens[ -1 ]++; - - push @entries, { - CHR => $super_block->[ 0 ]->{ "CHR" }, - CHR_BEG => $super_block->[ 0 ]->{ "CHR_BEG" }, - CHR_END => $super_block->[ -1 ]->{ "CHR_END" }, - Q_ID => "Q_ID", - SCORE => 100, - STRAND => "+", - THICK_BEG => $super_block->[ 0 ]->{ "CHR_BEG" }, - THICK_END => $super_block->[ -1 ]->{ "CHR_END" } + 1, - ITEMRGB => "0,200,100", - BLOCKCOUNT => scalar @{ $super_block }, - BLOCKSIZES => join( ",", @lens ), - Q_BEGS => join( ",", @begs ), - }; - - undef @begs; - undef @lens; - } - - return wantarray ? @entries : \@entries; -} - - -sub phastcons_index_create +sub fixedstep_index_create { # Martin A. Hansen, January 2008. - # Indexes a concatenated PhastCons file. + # Indexes a concatenated fixedStep file. # The index consists of a hash with chromosomes as keys, # and a list of [ chr_beg, next_chr_beg, chr_end, index_beg, index_len ] as values. - my ( $path, # path to PhastCons file + my ( $path, # path to fixedStep file ) = @_; # Returns a hashref @@ -1033,12 +894,12 @@ sub phastcons_index_create if ( $locator =~ /chrom=([^ ]+) start=(\d+) step=(\d+)/ ) { $chr = $1; - $beg = $2 - 1; # phastcons files are 1-based + $beg = $2 - 1; # fixedStep files are 1-based $step = $3; } else { - Maasha::Common::error( qq(Could not parse PhastCons locator: $locator) ); + Maasha::Common::error( qq(Could not parse locator: $locator) ); } $pos += length( $locator ) + 11; @@ -1069,11 +930,11 @@ sub phastcons_index_create } -sub phastcons_index_store +sub fixedstep_index_store { # Martin A. Hansen, January 2008. - # Writes a PhastCons index to binary file. + # Writes a fixedStep index to binary file. my ( $path, # full path to file $index, # list with index @@ -1085,11 +946,11 @@ sub phastcons_index_store } -sub phastcons_index_retrieve +sub fixedstep_index_retrieve { # Martin A. Hansen, January 2008. - # Retrieves a PhastCons index from binary file. + # Retrieves a fixedStep index from binary file. my ( $path, # full path to file ) = @_; @@ -1104,12 +965,12 @@ sub phastcons_index_retrieve } -sub phastcons_index_lookup +sub fixedStep_index_lookup { # Martin A. Hansen, January 2008. - # Retrieve PhastCons scores from a indexed - # Phastcons file given a chromosome and + # Retrieve fixedStep scores from a indexed + # fixedStep file given a chromosome and # begin and end positions. my ( $index, # data structure @@ -1234,6 +1095,145 @@ sub phastcons_index_lookup } +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PhastCons format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +sub phastcons_parse_entry +{ + # Martin A. Hansen, December 2007. + + # Given a PhastCons entry converts this to a + # list of super blocks. + + my ( $lines, # list of lines + $args, # argument hash + ) = @_; + + # Returns + + my ( $info, $chr, $beg, $step, $i, $c, $j, @blocks, @super_blocks, @entries, $super_block, $block, @lens, @begs ); + + $info = shift @{ $lines }; + + if ( $info =~ /^chrom=([^ ]+) start=(\d+) step=(\d+)$/ ) + { + $chr = $1; + $beg = $2; + $step = $3; + + die qq(ERROR: step size $step != 1 -> problem!\n) if $step != 1; # in an ideal world should would be fixed ... + } + + $i = 0; + + while ( $i < @{ $lines } ) + { + if ( $lines->[ $i ] >= $args->{ "threshold" } ) + { + $c = $i + 1; + + while ( $c < @{ $lines } ) + { + if ( $lines->[ $c ] < $args->{ "threshold" } ) + { + $j = $c + 1; + + while ( $j < @{ $lines } and $lines->[ $j ] < $args->{ "threshold" } ) { + $j++; + } + + if ( $j - $c > $args->{ "gap" } ) + { + if ( $c - $i >= $args->{ "min" } ) + { + push @blocks, { + CHR => $chr, + CHR_BEG => $beg + $i - 1, + CHR_END => $beg + $c - 2, + CHR_LEN => $c - $i, + }; + } + + $i = $j; + + last; + } + + $c = $j + } + else + { + $c++; + } + } + + if ( $c - $i >= $args->{ "min" } ) + { + push @blocks, { + CHR => $chr, + CHR_BEG => $beg + $i - 1, + CHR_END => $beg + $c - 2, + CHR_LEN => $c - $i, + }; + } + + $i = $c; + } + else + { + $i++; + } + } + + $i = 0; + + while ( $i < @blocks ) + { + $c = $i + 1; + + while ( $c < @blocks and $blocks[ $c ]->{ "CHR_BEG" } - $blocks[ $c - 1 ]->{ "CHR_END" } <= $args->{ "dist" } ) + { + $c++; + } + + push @super_blocks, [ @blocks[ $i .. $c - 1 ] ]; + + $i = $c; + } + + foreach $super_block ( @super_blocks ) + { + foreach $block ( @{ $super_block } ) + { + push @begs, $block->{ "CHR_BEG" } - $super_block->[ 0 ]->{ "CHR_BEG" }; + push @lens, $block->{ "CHR_LEN" } - 1; + } + + $lens[ -1 ]++; + + push @entries, { + CHR => $super_block->[ 0 ]->{ "CHR" }, + CHR_BEG => $super_block->[ 0 ]->{ "CHR_BEG" }, + CHR_END => $super_block->[ -1 ]->{ "CHR_END" }, + Q_ID => "Q_ID", + SCORE => 100, + STRAND => "+", + THICK_BEG => $super_block->[ 0 ]->{ "CHR_BEG" }, + THICK_END => $super_block->[ -1 ]->{ "CHR_END" } + 1, + ITEMRGB => "0,200,100", + BLOCKCOUNT => scalar @{ $super_block }, + BLOCKSIZES => join( ",", @lens ), + Q_BEGS => join( ",", @begs ), + }; + + undef @begs; + undef @lens; + } + + return wantarray ? @entries : \@entries; +} + + sub phastcons_normalize { # Martin A. Hansen, January 2008. -- 2.39.5