From cdec8a8e71677009113458927768fc5b0c3fa44c Mon Sep 17 00:00:00 2001 From: martinahansen Date: Wed, 16 Jun 2010 14:05:19 +0000 Subject: [PATCH] schrubbing UCSC.pm code git-svn-id: http://biopieces.googlecode.com/svn/trunk@985 74ccb610-7750-0410-82ae-013aeee3265d --- bp_bin/analyze_bed | 4 +- bp_bin/get_genome_phastcons | 12 +- bp_bin/plot_phastcons_profiles | 5 +- bp_bin/upload_to_ucsc | 12 +- bp_bin/write_bed | 2 - code_perl/Maasha/UCSC.pm | 975 -------------------------------- code_perl/Maasha/UCSC/BED.pm | 267 +++++++++ code_perl/Maasha/UCSC/PSL.pm | 23 + code_perl/Maasha/UCSC/Wiggle.pm | 289 +++++++++- 9 files changed, 582 insertions(+), 1007 deletions(-) diff --git a/bp_bin/analyze_bed b/bp_bin/analyze_bed index 5895104..ea16d8a 100755 --- a/bp_bin/analyze_bed +++ b/bp_bin/analyze_bed @@ -29,7 +29,7 @@ use warnings; use strict; use Maasha::Biopieces; -use Maasha::UCSC; +use Maasha::UCSC::BED; # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< @@ -44,7 +44,7 @@ $out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } ); while ( $record = Maasha::Biopieces::get_record( $in ) ) { - $record = Maasha::UCSC::bed_analyze( $record ) if $record->{ "REC_TYPE" } eq "BED"; + $record = Maasha::UCSC::BED::bed_analyze( $record ) if $record->{ "REC_TYPE" } eq "BED"; Maasha::Biopieces::put_record( $record, $out ); } diff --git a/bp_bin/get_genome_phastcons b/bp_bin/get_genome_phastcons index 8531d26..4e348b8 100755 --- a/bp_bin/get_genome_phastcons +++ b/bp_bin/get_genome_phastcons @@ -30,7 +30,7 @@ use warnings; use strict; use Maasha::Biopieces; use Maasha::Filesys; -use Maasha::UCSC; +use Maasha::UCSC::Wiggle; # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< @@ -55,7 +55,7 @@ $out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } ); $phastcons_file = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/phastcons/$options->{ 'genome' }.pp"; $phastcons_index = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/phastcons/$options->{ 'genome' }.pp.index"; -$index = Maasha::UCSC::fixedstep_index_retrieve( $phastcons_index ); +$index = Maasha::UCSC::Wiggle::fixedstep_index_retrieve( $phastcons_index ); $fh_phastcons = Maasha::Filesys::file_read_open( $phastcons_file ); if ( defined $options->{ "chr" } and defined $options->{ "beg" } and ( defined $options->{ "end" } or defined $options->{ "len" } ) ) @@ -67,7 +67,7 @@ if ( defined $options->{ "chr" } and defined $options->{ "beg" } and ( defined $ $options->{ "end" } = $options->{ "beg" } + $options->{ "len" } - 1; } - $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $options->{ "chr" }, $options->{ "beg" }, $options->{ "end" }, $options->{ "flank" } ); + $scores = Maasha::UCSC::Wiggle::fixedstep_index_lookup( $index, $fh_phastcons, $options->{ "chr" }, $options->{ "beg" }, $options->{ "end" }, $options->{ "flank" } ); $record->{ "CHR" } = $options->{ "chr" }; $record->{ "CHR_BEG" } = $options->{ "beg" } - $options->{ "flank" }; @@ -83,15 +83,15 @@ while ( $record = Maasha::Biopieces::get_record( $in ) ) { if ( $record->{ "REC_TYPE" } eq "BED" ) { - $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "CHR" }, $record->{ "CHR_BEG" }, $record->{ "CHR_END" }, $options->{ "flank" } ); + $scores = Maasha::UCSC::Wiggle::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "CHR" }, $record->{ "CHR_BEG" }, $record->{ "CHR_END" }, $options->{ "flank" } ); } elsif ( $record->{ "REC_TYPE" } eq "PSL" ) { - $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $options->{ "flank" } ); + $scores = Maasha::UCSC::Wiggle::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $options->{ "flank" } ); } elsif ( $record->{ "REC_TYPE" } eq "BLAST" ) { - $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $options->{ "flank" } ); + $scores = Maasha::UCSC::Wiggle::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $options->{ "flank" } ); } $record->{ "PHASTCONS" } = join ",", @{ $scores } if @{ $scores }; diff --git a/bp_bin/plot_phastcons_profiles b/bp_bin/plot_phastcons_profiles index 0fb79d1..a72c46b 100755 --- a/bp_bin/plot_phastcons_profiles +++ b/bp_bin/plot_phastcons_profiles @@ -33,6 +33,7 @@ use Maasha::Plot; use Maasha::Matrix; use Maasha::Filesys; use Maasha::UCSC; +use Maasha::UCSC::Wiggle; # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< @@ -66,14 +67,14 @@ $out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } ); $phastcons_file = Maasha::Config::genome_phastcons( $options->{ "genome" } ); $phastcons_index = Maasha::Config::genome_phastcons_index( $options->{ "genome" } ); -$index = Maasha::UCSC::fixedstep_index_retrieve( $phastcons_index ); +$index = Maasha::UCSC::Wiggle::fixedstep_index_retrieve( $phastcons_index ); $fh_phastcons = Maasha::Filesys::file_read_open( $phastcons_file ); while ( $record = Maasha::Biopieces::get_record( $in ) ) { if ( $record->{ "CHR" } and $record->{ "CHR_BEG" } and $record->{ "CHR_END" } ) { - $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "CHR" }, + $scores = Maasha::UCSC::Wiggle::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "CHR" }, $record->{ "CHR_BEG" }, $record->{ "CHR_END" }, $options->{ "flank" } ); diff --git a/bp_bin/upload_to_ucsc b/bp_bin/upload_to_ucsc index aac4928..dec35bf 100755 --- a/bp_bin/upload_to_ucsc +++ b/bp_bin/upload_to_ucsc @@ -158,9 +158,9 @@ while ( $record = Maasha::Biopieces::get_record( $in ) ) close $fh_out; if ( $format eq "BED" ) { - Maasha::UCSC::bed_upload_to_ucsc( $tmp_dir, $file, $options, $append ); + Maasha::UCSC::BED::bed_upload_to_ucsc( $tmp_dir, $file, $options, $append ); } elsif ( $format eq "PSL" ) { - Maasha::UCSC::psl_upload_to_ucsc( $file, $options, $append ); + Maasha::UCSC::BED::psl_upload_to_ucsc( $file, $options, $append ); } unlink $file; @@ -181,19 +181,19 @@ if ( $format eq "BED" ) { $type = "bed $columns"; - Maasha::UCSC::bed_upload_to_ucsc( $tmp_dir, $file, $options, $append ); + Maasha::UCSC::BED::bed_upload_to_ucsc( $tmp_dir, $file, $options, $append ); } elsif ( $format eq "BED_SS" ) { $type = "type bed 6 +"; - Maasha::UCSC::bed_upload_to_ucsc( $tmp_dir, $file, $options, $append ); + Maasha::UCSC::BED::bed_upload_to_ucsc( $tmp_dir, $file, $options, $append ); } elsif ( $format eq "PSL" ) { $type = "psl"; - Maasha::UCSC::psl_upload_to_ucsc( $file, $options, $append ); + Maasha::UCSC::PSL::psl_upload_to_ucsc( $file, $options, $append ); } elsif ( $format eq "WIGGLE" ) { @@ -220,7 +220,7 @@ elsif ( $format eq "WIGGLE" ) $type = "wig 0"; - Maasha::UCSC::wiggle_upload_to_ucsc( $tmp_dir, $wib_dir, $file, $options ); + Maasha::UCSC::Wiggle::wiggle_upload_to_ucsc( $tmp_dir, $wib_dir, $file, $options ); } unlink $file; diff --git a/bp_bin/write_bed b/bp_bin/write_bed index 51cdc75..5b1e156 100755 --- a/bp_bin/write_bed +++ b/bp_bin/write_bed @@ -54,8 +54,6 @@ $fh = Maasha::Biopieces::write_stream( $options->{ 'data_out' }, $options->{ 'co while ( $record = Maasha::Biopieces::get_record( $in ) ) { - $record = Maasha::UCSC::psl2record( $record ) if $record->{ 'tBaseInsert' }; # Dirty addition to allow Affy data from MySQL to be dumped - if ( $bed_entry = Maasha::UCSC::BED::biopiece2bed( $record, $options->{ 'cols' } ) ) { Maasha::UCSC::BED::bed_entry_put( $bed_entry, $fh, $options->{ 'cols' }, $options->{ 'check' } ); } diff --git a/code_perl/Maasha/UCSC.pm b/code_perl/Maasha/UCSC.pm index 2664513..05913c9 100644 --- a/code_perl/Maasha/UCSC.pm +++ b/code_perl/Maasha/UCSC.pm @@ -39,12 +39,6 @@ use Maasha::Calc; use Maasha::Matrix; use constant { - FS_CHR_BEG => 0, - FS_NEXT_CHR_BEG => 1, - FS_CHR_END => 2, - FS_INDEX_BEG => 3, - FS_INDEX_LEN => 4, - CHR => 0, CHR_BEG => 1, CHR_END => 2, @@ -62,653 +56,6 @@ use constant { @ISA = qw( Exporter ); -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> BED format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - -# http://genome.ucsc.edu/goldenPath/help/hgTracksHelp.html#BED - - -sub bed_entry_get_array -{ - # Martin A. Hansen, September 2008. - - # Reads a BED entry given a filehandle. - - # This is a new _faster_ BED entry parser that - # uses arrays and not hashrefs. - - # IMPORTANT! This function does not correct the - # BED_END position that is kept the way UCSC - # does. - - my ( $fh, # file handle - $cols, # columns to read - OPTIONAL (3,4,5,6, or 12) - ) = @_; - - # Returns a list. - - my ( $line, @entry ); - - $line = <$fh>; - - $line =~ tr/\n\r//d; # some people have carriage returns in their BED files -> Grrrr - - return if not defined $line; - - if ( not defined $cols ) { - $cols = 1 + $line =~ tr/\t//; - } - - @entry = split "\t", $line, $cols + 1; - - pop @entry if scalar @entry > $cols; - - return wantarray ? @entry : \@entry; -} - - -sub bed_get_entry -{ - # Martin A. Hansen, December 2007. - - # Reads a bed entry given a filehandle. - - my ( $fh, # file handle - $columns, # number of BED columns to read - OPTIONAL - ) = @_; - - # Returns hashref. - - my ( $line, @fields, %entry ); - - $line = <$fh>; - - $line =~ tr/\n\r//d; # some people have carriage returns in their BED files -> Grrrr - - return if not defined $line; - - @fields = split "\t", $line; - - $columns ||= scalar @fields; - - if ( $columns == 3 ) - { - %entry = ( - "CHR" => $fields[ 0 ], - "CHR_BEG" => $fields[ 1 ], - "CHR_END" => $fields[ 2 ] - 1, - ); - } - elsif ( $columns == 4 ) - { - %entry = ( - "CHR" => $fields[ 0 ], - "CHR_BEG" => $fields[ 1 ], - "CHR_END" => $fields[ 2 ] - 1, - "Q_ID" => $fields[ 3 ], - ); - } - elsif ( $columns == 5 ) - { - %entry = ( - "CHR" => $fields[ 0 ], - "CHR_BEG" => $fields[ 1 ], - "CHR_END" => $fields[ 2 ] - 1, - "Q_ID" => $fields[ 3 ], - "SCORE" => $fields[ 4 ], - ); - } - elsif ( $columns == 6 ) - { - %entry = ( - "CHR" => $fields[ 0 ], - "CHR_BEG" => $fields[ 1 ], - "CHR_END" => $fields[ 2 ] - 1, - "Q_ID" => $fields[ 3 ], - "SCORE" => $fields[ 4 ], - "STRAND" => $fields[ 5 ], - ); - } - elsif ( $columns == 12 ) - { - %entry = ( - "CHR" => $fields[ 0 ], - "CHR_BEG" => $fields[ 1 ], - "CHR_END" => $fields[ 2 ] - 1, - "Q_ID" => $fields[ 3 ], - "SCORE" => $fields[ 4 ], - "STRAND" => $fields[ 5 ], - "THICK_BEG" => $fields[ 6 ], - "THICK_END" => $fields[ 7 ] - 1, - "COLOR" => $fields[ 8 ], - "BLOCK_COUNT" => $fields[ 9 ], - "BLOCK_LENS" => $fields[ 10 ], - "Q_BEGS" => $fields[ 11 ], - ); - } - else - { - Maasha::Common::error( qq(Bad BED format in line->$line<-) ); - } - - $entry{ "REC_TYPE" } = "BED"; - $entry{ "BED_LEN" } = $entry{ "CHR_END" } - $entry{ "CHR_BEG" } + 1; - $entry{ "BED_COLS" } = $columns; - - return wantarray ? %entry : \%entry; -} - - -sub bed_get_entries -{ - # Martin A. Hansen, January 2008. - - # Given a path to a BED file, read in all entries - # and return. - - my ( $path, # full path to BED file - $columns, # number of columns in BED file - OPTIONAL (but is faster) - ) = @_; - - # Returns a list. - - my ( $fh, $entry, @list ); - - $fh = Maasha::Filesys::file_read_open( $path ); - - while ( $entry = bed_get_entry( $fh ) ) { - push @list, $entry; - } - - close $fh; - - return wantarray ? @list : \@list; -} - - -sub bed_entry_put_array -{ - # Martin A. Hansen, Septermber 2008. - - # Writes a BED entry array to file. - - # IMPORTANT! This function does not correct the - # BED_END position that is assumed to be in the - # UCSC positions scheme. - - my ( $record, # list - $fh, # file handle - OPTIONAL - $cols, # number of columns in BED file - OPTIONAL - ) = @_; - - # Returns nothing. - - $fh = \*STDOUT if not defined $fh; - - if ( defined $cols ) { - print $fh join( "\t", @{ $record }[ 0 .. $cols - 1 ] ), "\n"; - } else { - print $fh join( "\t", @{ $record } ), "\n"; - } -} - - -sub bed_put_entry -{ - # Martin A. Hansen, Septermber 2007. - - # Writes a BED entry to file. - - # NB, this could really be more robust!? - - my ( $record, # hashref - $fh, # file handle - OPTIONAL - $columns, # number of columns in BED file - OPTIONAL (but is faster) - ) = @_; - - # Returns nothing. - - my ( @fields ); - - $columns ||= 12; # max number of columns possible - - if ( $columns == 3 ) - { - push @fields, $record->{ "CHR" }; - push @fields, $record->{ "CHR_BEG" }; - push @fields, $record->{ "CHR_END" } + 1; - } - elsif ( $columns == 4 ) - { - $record->{ "Q_ID" } =~ s/\s+/_/g; - - push @fields, $record->{ "CHR" }; - push @fields, $record->{ "CHR_BEG" }; - push @fields, $record->{ "CHR_END" } + 1; - push @fields, $record->{ "Q_ID" }; - } - elsif ( $columns == 5 ) - { - $record->{ "Q_ID" } =~ s/\s+/_/g; - $record->{ "SCORE" } =~ s/\.\d*//; - - push @fields, $record->{ "CHR" }; - push @fields, $record->{ "CHR_BEG" }; - push @fields, $record->{ "CHR_END" } + 1; - push @fields, $record->{ "Q_ID" }; - push @fields, $record->{ "SCORE" }; - } - elsif ( $columns == 6 ) - { - $record->{ "Q_ID" } =~ s/\s+/_/g; - $record->{ "SCORE" } =~ s/\.\d*//; - - push @fields, $record->{ "CHR" }; - push @fields, $record->{ "CHR_BEG" }; - push @fields, $record->{ "CHR_END" } + 1; - push @fields, $record->{ "Q_ID" }; - push @fields, $record->{ "SCORE" }; - push @fields, $record->{ "STRAND" }; - } - else - { - $record->{ "Q_ID" } =~ s/\s+/_/g; - $record->{ "SCORE" } =~ s/\.\d*//; - - push @fields, $record->{ "CHR" }; - push @fields, $record->{ "CHR_BEG" }; - push @fields, $record->{ "CHR_END" } + 1; - push @fields, $record->{ "Q_ID" }; - push @fields, $record->{ "SCORE" }; - push @fields, $record->{ "STRAND" }; - push @fields, $record->{ "THICK_BEG" } if defined $record->{ "THICK_BEG" }; - push @fields, $record->{ "THICK_END" } + 1 if defined $record->{ "THICK_END" }; - push @fields, $record->{ "COLOR" } if defined $record->{ "COLOR" }; - push @fields, $record->{ "BLOCK_COUNT" } if defined $record->{ "BLOCK_COUNT" }; - push @fields, $record->{ "BLOCK_LENS" } if defined $record->{ "BLOCK_LENS" }; - push @fields, $record->{ "Q_BEGS" } if defined $record->{ "Q_BEGS" }; - } - - if ( $fh ) { - print $fh join( "\t", @fields ), "\n"; - } else { - print join( "\t", @fields ), "\n"; - } -} - - -sub bed_put_entries -{ - # Martin A. Hansen, January 2008. - - # Write a list of BED entries. - - my ( $entries, # list of entries, - $fh, # file handle - OPTIONAL - ) = @_; - - # Returns nothing. - - map { bed_put_entry( $_, $fh ) } @{ $entries }; -} - - -sub bed_analyze -{ - # Martin A. Hansen, March 2008. - - # Given a bed record, analysis this to give information - # about intron/exon sizes. - - my ( $entry, # BED entry - ) = @_; - - # Returns hashref. - - my ( $i, @begs, @lens, $exon_max, $exon_min, $exon_len, $exon_tot, $intron_max, $intron_min, $intron_len, $intron_tot ); - - $exon_max = 0; - $exon_min = 9999999999; - $intron_max = 0; - $intron_min = 9999999999; - - $entry->{ "EXONS" } = $entry->{ "BLOCK_COUNT" }; - - @begs = split /,/, $entry->{ "Q_BEGS" }; - @lens = split /,/, $entry->{ "BLOCK_LENS" }; - - for ( $i = 0; $i < $entry->{ "BLOCK_COUNT" }; $i++ ) - { - $exon_len = $lens[ $i ]; - - $entry->{ "EXON_LEN_$i" } = $exon_len; - - $exon_max = $exon_len if $exon_len > $exon_max; - $exon_min = $exon_len if $exon_len < $exon_min; - - $exon_tot += $exon_len; - } - - $entry->{ "EXON_LEN_-1" } = $exon_len; - $entry->{ "EXON_MAX_LEN" } = $exon_max; - $entry->{ "EXON_MIN_LEN" } = $exon_min; - $entry->{ "EXON_MEAN_LEN" } = int( $exon_tot / $entry->{ "EXONS" } ); - - $entry->{ "INTRONS" } = $entry->{ "BLOCK_COUNT" } - 1; - $entry->{ "INTRONS" } = 0 if $entry->{ "INTRONS" } < 0; - - if ( $entry->{ "INTRONS" } ) - { - for ( $i = 1; $i < $entry->{ "BLOCK_COUNT" }; $i++ ) - { - $intron_len = $begs[ $i ] - ( $begs[ $i - 1 ] + $lens[ $i - 1 ] ); - - $entry->{ "INTRON_LEN_" . ( $i - 1 ) } = $intron_len; - - $intron_max = $intron_len if $intron_len > $intron_max; - $intron_min = $intron_len if $intron_len < $intron_min; - - $intron_tot += $intron_len; - } - - $entry->{ "INTRON_LEN_-1" } = $intron_len; - $entry->{ "INTRON_MAX_LEN" } = $intron_max; - $entry->{ "INTRON_MIN_LEN" } = $intron_min; - $entry->{ "INTRON_MEAN_LEN" } = int( $intron_tot / $entry->{ "INTRONS" } ); - } - - return wantarray ? %{ $entry } : $entry; -} - - -sub bed_sort -{ - # Martin A. Hansen, Septermber 2008 - - # Sorts a BED file using the c program - # "bed_sort" specifing a sort mode: - - # 1: chr AND chr_beg. - # 2: chr AND strand AND chr_beg. - # 3: chr_beg. - # 4: strand AND chr_beg. - - my ( $bed_file, # BED file to sort - $sort_mode, # See above. - $cols, # Number of columns in BED file - ) = @_; - - Maasha::Common::run( "bed_sort", "--sort $sort_mode --cols $cols $bed_file" ); -} - - -sub bed_split_to_files -{ - # Martin A. Hansen, Septermber 2008 - - # Given a list of BED files, split these - # into temporary files based on the chromosome - # name. Returns a list of the temporary files. - - my ( $bed_files, # list of BED files to split - $cols, # number of columns - $tmp_dir, # temporary directory - ) = @_; - - # Returns a list. - - my ( $bed_file, $fh_in, $entry, $key, %fh_hash, @tmp_files ); - - foreach $bed_file ( @{ $bed_files } ) - { - $fh_in = Maasha::Filesys::file_read_open( $bed_file ); - - while ( $entry = bed_entry_get_array( $fh_in, $cols ) ) - { - $key = $entry->[ CHR ]; - - $fh_hash{ $key } = Maasha::Filesys::file_write_open( "$tmp_dir/$key.temp" ) if not exists $fh_hash{ $key }; - - bed_entry_put_array( $entry, $fh_hash{ $key } ); - } - - close $fh_in; - } - - foreach $key ( sort keys %fh_hash ) - { - push @tmp_files, "$tmp_dir/$key.temp"; - - close $fh_hash{ $key }; - } - - return wantarray ? @tmp_files : \@tmp_files; -} - - -sub bed_merge_entries -{ - # Martin A. Hansen, February 2008. - - # Merge a list of given BED entries in one big entry. - - my ( $entries, # list of BED entries to be merged - ) = @_; - - # Returns hash. - - my ( $i, @q_ids, @q_begs, @blocksizes, @new_q_begs, @new_blocksizes, %new_entry ); - - @{ $entries } = sort { $a->{ "CHR_BEG" } <=> $b->{ "CHR_BEG" } } @{ $entries }; - - for ( $i = 0; $i < @{ $entries }; $i++ ) - { - Maasha::Common::error( qq(Attempted merge of BED entries from different chromosomes) ) if $entries->[ 0 ]->{ "CHR" } ne $entries->[ $i ]->{ "CHR" }; - Maasha::Common::error( qq(Attempted merge of BED entries from different strands) ) if $entries->[ 0 ]->{ "STRAND" } ne $entries->[ $i ]->{ "STRAND" }; - - push @q_ids, $entries->[ $i ]->{ "Q_ID" } || sprintf( "ID%06d", $i ); - - if ( exists $entries->[ $i ]->{ "Q_BEGS" } ) - { - @q_begs = split ",", $entries->[ $i ]->{ "Q_BEGS" }; - @blocksizes = split ",", $entries->[ $i ]->{ "BLOCK_LENS" }; - } - else - { - @q_begs = 0; - @blocksizes = $entries->[ $i ]->{ "CHR_END" } - $entries->[ $i ]->{ "CHR_BEG" } + 1; - } - - map { $_ += $entries->[ $i ]->{ "CHR_BEG" } } @q_begs; - - push @new_q_begs, @q_begs; - push @new_blocksizes, @blocksizes; - } - - map { $_ -= $entries->[ 0 ]->{ "CHR_BEG" } } @new_q_begs; - - %new_entry = ( - CHR => $entries->[ 0 ]->{ "CHR" }, - CHR_BEG => $entries->[ 0 ]->{ "CHR_BEG" }, - CHR_END => $entries->[ -1 ]->{ "CHR_END" }, - REC_TYPE => "BED", - BED_LEN => $entries->[ -1 ]->{ "CHR_END" } - $entries->[ 0 ]->{ "CHR_BEG" } + 1, - BED_COLS => 12, - Q_ID => join( ":", @q_ids ), - SCORE => 999, - STRAND => $entries->[ 0 ]->{ "STRAND" } || "+", - THICK_BEG => $entries->[ 0 ]->{ "THICK_BEG" } || $entries->[ 0 ]->{ "CHR_BEG" }, - THICK_END => $entries->[ -1 ]->{ "THICK_END" } || $entries->[ -1 ]->{ "CHR_END" }, - COLOR => 0, - BLOCK_COUNT => scalar @new_q_begs, - BLOCK_LENS => join( ",", @new_blocksizes ), - Q_BEGS => join( ",", @new_q_begs ), - ); - - return wantarray ? %new_entry : \%new_entry; -} - - -sub bed_split_entry -{ - # Martin A. Hansen, February 2008. - - # Splits a given BED entry into a list of blocks, - # which are returned. A list of 6 column BED entry is returned. - - my ( $entry, # BED entry hashref - ) = @_; - - # Returns a list. - - my ( @q_begs, @blocksizes, $block, @blocks, $i ); - - if ( exists $entry->{ "BLOCK_COUNT" } ) - { - @q_begs = split ",", $entry->{ "Q_BEGS" }; - @blocksizes = split ",", $entry->{ "BLOCK_LENS" }; - - for ( $i = 0; $i < @q_begs; $i++ ) - { - undef $block; - - $block->{ "CHR" } = $entry->{ "CHR" }; - $block->{ "CHR_BEG" } = $entry->{ "CHR_BEG" } + $q_begs[ $i ]; - $block->{ "CHR_END" } = $entry->{ "CHR_BEG" } + $q_begs[ $i ] + $blocksizes[ $i ] - 1; - $block->{ "Q_ID" } = $entry->{ "Q_ID" } . sprintf( "_%03d", $i ); - $block->{ "SCORE" } = $entry->{ "SCORE" }; - $block->{ "STRAND" } = $entry->{ "STRAND" }; - $block->{ "BED_LEN" } = $block->{ "CHR_END" } - $block->{ "CHR_BEG" } + 1, - $block->{ "BED_COLS" } = 6; - $block->{ "REC_TYPE" } = "BED"; - - push @blocks, $block; - } - } - else - { - @blocks = @{ $entry }; - } - - return wantarray ? @blocks : \@blocks; -} - - - -sub bed_overlap -{ - # Martin A. Hansen, February 2008. - - # Checks if two BED entries overlap and - # return 1 if so - else 0; - - my ( $entry1, # hashref - $entry2, # hashref - $no_strand, # don't check strand flag - OPTIONAL - ) = @_; - - # Return bolean. - - return 0 if $entry1->{ "CHR" } ne $entry2->{ "CHR" }; - return 0 if $entry1->{ "STRAND" } ne $entry2->{ "STRAND" }; - - if ( $entry1->{ "CHR_END" } < $entry2->{ "CHR_BEG" } or $entry1->{ "CHR_BEG" } > $entry2->{ "CHR_END" } ) { - return 0; - } else { - return 1; - } -} - - -sub bed_upload_to_ucsc -{ - # Martin A. Hansen, September 2007. - - # Upload a BED file to the UCSC database. - - my ( $tmp_dir, # temporary directory - $file, # file to upload, - $options, # argument hashref - $append, # flag indicating table should be appended - ) = @_; - - # Returns nothing. - - my ( $args, $table, $sql_file, $fh_out, $fh_in ); - - if ( $append ) { - $args = join " ", $options->{ "database" }, $options->{ "table" }, "-tmpDir=$tmp_dir", "-oldTable", $file; - } else { - $args = join " ", $options->{ "database" }, $options->{ "table" }, "-tmpDir=$tmp_dir", $file; - } - - if ( $options->{ "table" } =~ /rnaSecStr/ ) - { - $table = $options->{ "table" }; - - print qq(uploading secondary structure table:"$table"\n) if $options->{ "verbose" }; - - $sql_file = "$tmp_dir/upload_RNA_SS.sql"; - - $fh_out = Maasha::Filesys::file_write_open( $sql_file ); - - print $fh_out qq( -CREATE TABLE $table ( - bin smallint not null, # Bin number for browser speedup - chrom varchar(255) not null, # Chromosome or FPC contig - chromStart int unsigned not null, # Start position in chromosome - chromEnd int unsigned not null, # End position in chromosome - name varchar(255) not null, # Name of item - score int unsigned not null, # Score from 0-1000 - strand char(1) not null, # + or - - size int unsigned not null, # Size of element. - secStr longblob not null, # Parentheses and '.'s which define the secondary structure - conf longblob not null, # Confidence of secondary-structure annotation per position (0.0-1.0). - #Indices - INDEX(name(16)), - INDEX(chrom(8), bin), - INDEX(chrom(8), chromStart) -); - ); - - close $fh_out; - - Maasha::Common::run( "hgLoadBed", "-notItemRgb -sqlTable=$sql_file $options->{ 'database' } $options->{ 'table' } -tmpDir=$tmp_dir $file > /dev/null 2>&1" ); - - unlink $sql_file; - } - else - { - Maasha::Common::run( "hgLoadBed", "$args > /dev/null 2>&1" ); - } -} - - -sub psl_upload_to_ucsc -{ - # Martin A. Hansen, September 2007. - - # Upload a PSL file to the UCSC database. - - my ( $file, # file to upload, - $options, # argument hashref - $append, # flag indicating table should be appended - ) = @_; - - # Returns nothing. - - my ( $args ); - - if ( $append ) { - $args = join " ", $options->{ "database" }, "-table=$options->{ 'table' }", "-clientLoad", "-append", $file; - } else { - $args = join " ", $options->{ "database" }, "-table=$options->{ 'table' }", "-clientLoad", $file; - } - - Maasha::Common::run( "hgLoadPsl", "$args > /dev/null 2>&1" ); -} - - # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> TRACK FILE <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< @@ -864,267 +211,6 @@ sub ucsc_update_config } -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> fixedStep format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - -sub fixedstep_get_entry -{ - # Martin A. Hansen, December 2007. - - # Given a file handle to a PhastCons file get the - # next entry which is all the lines after a "fixedStep" - # line and until the next "fixedStep" line or EOF. - - my ( $fh, # filehandle - ) = @_; - - # Returns a list of lines - - my ( $entry, @lines ); - - local $/ = "\nfixedStep "; - - $entry = <$fh>; - - chomp $entry; - - @lines = split "\n", $entry; - - return if @lines == 0; - - $lines[ 0 ] =~ s/fixedStep?\s*//; - - return wantarray ? @lines : \@lines; -} - - -sub fixedstep_index_create -{ - # Martin A. Hansen, January 2008. - - # Indexes a concatenated fixedStep file. - # The index consists of a hash with chromosomes as keys, - # and a list of [ chr_beg, next_chr_beg, chr_end, index_beg, index_len ] as values. - - my ( $path, # path to fixedStep file - ) = @_; - - # Returns a hashref - - my ( $fh, $pos, $index_beg, $index_len, $entry, $locator, $chr, $step, $beg, $end, $len, %index, $i ); - - $fh = Maasha::Filesys::file_read_open( $path ); - - $pos = 0; - - while ( $entry = Maasha::UCSC::fixedstep_get_entry( $fh ) ) - { - $locator = shift @{ $entry }; - - if ( $locator =~ /chrom=([^ ]+) start=(\d+) step=(\d+)/ ) - { - $chr = $1; - $beg = $2 - 1; # fixedStep files are 1-based - $step = $3; - } - else - { - Maasha::Common::error( qq(Could not parse locator: $locator) ); - } - - $pos += length( $locator ) + 11; - - $index_beg = $pos; - -# map { $pos += length( $_ ) + 1 } @{ $entry }; - - $pos += 6 * scalar @{ $entry }; - - $index_len = $pos - $index_beg; - - push @{ $index{ $chr } }, [ $beg, undef, $beg + scalar @{ $entry } - 1, $index_beg, $index_len ]; - } - - close $fh; - - foreach $chr ( keys %index ) - { - for ( $i = 0; $i < @{ $index{ $chr } } - 1; $i++ ) { - $index{ $chr }->[ $i ]->[ FS_NEXT_CHR_BEG ] = $index{ $chr }->[ $i + 1 ]->[ 0 ]; - } - - $index{ $chr }->[ -1 ]->[ FS_NEXT_CHR_BEG ] = $index{ $chr }->[ -1 ]->[ FS_CHR_END ] + 1; - } - - return wantarray ? %index : \%index; -} - - -sub fixedstep_index_store -{ - # Martin A. Hansen, January 2008. - - # Writes a fixedStep index to binary file. - - my ( $path, # full path to file - $index, # list with index - ) = @_; - - # returns nothing - - Maasha::Filesys::file_store( $path, $index ); -} - - -sub fixedstep_index_retrieve -{ - # Martin A. Hansen, January 2008. - - # Retrieves a fixedStep index from binary file. - - my ( $path, # full path to file - ) = @_; - - # returns list - - my $index; - - $index = Maasha::Filesys::file_retrieve( $path ); - - return wantarray ? %{ $index } : $index; -} - - -sub fixedstep_index_lookup -{ - # Martin A. Hansen, January 2008. - - # Retrieve fixedStep scores from a indexed - # fixedStep file given a chromosome and - # begin and end positions. - - my ( $index, # data structure - $fh, # filehandle to datafile - $chr, # chromosome - $chr_beg, # chromosome beg - $chr_end, # chromosome end - $flank, # include flanking region - OPTIONAL - ) = @_; - - # Returns a list - - my ( $index_beg, $index_end, $i, $c, $beg, $end, @vals, $scores ); - - $flank ||= 0; - - $chr_beg -= $flank; - $chr_end += $flank; - - # print "chr_beg->$chr_beg chr_end->$chr_end flank->$flank\n"; - - if ( exists $index->{ $chr } ) - { - $index_beg = Maasha::Matrix::interval_search( $index->{ $chr }, 0, 1, $chr_beg ); - - if ( $index_beg < 0 ) { - Maasha::Common::error( qq(Index search failed - begin index position doesn't exists: $chr_beg) ); - } - - if ( $chr_end < $index->{ $chr }->[ $index_beg ]->[ 1 ] ) - { - $index_end = $index_beg; - } - else - { - $index_end = Maasha::Matrix::interval_search( $index->{ $chr }, 0, 1, $chr_end ); - - if ( $index_end < 0 ) { - Maasha::Common::error( qq(Index search failed - end index position doesn't exists: $chr_end) ); - } - } - - map { $scores->[ $_ ] = 0 } 0 .. $chr_end - $chr_beg; - - if ( $index_beg == $index_end ) - { - $beg = Maasha::Calc::max( $chr_beg, $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] ); - $end = Maasha::Calc::min( $chr_end, $index->{ $chr }->[ $index_end ]->[ FS_CHR_END ] ); - - if ( $beg <= $index->{ $chr }->[ $index_beg ]->[ FS_CHR_END ] and $end >= $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] ) - { - @vals = split "\n", Maasha::Filesys::file_read( - $fh, - $index->{ $chr }->[ $index_beg ]->[ FS_INDEX_BEG ] + 6 * ( $beg - $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] ), - 6 * ( $end - $beg + 1 ), - ); - } - - for ( $c = 0; $c < @vals; $c++ ) { - $scores->[ $c + $beg - $chr_beg ] = $vals[ $c ]; - } - } - else - { - $beg = Maasha::Calc::max( $chr_beg, $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] ); - -# print Dumper( $beg, $index->{ $chr }->[ $index_beg ] ); -# print Dumper( "next", $index->{ $chr }->[ $index_beg ]->[ FS_NEXT_CHR_BEG ] ); - - # beg next - # v v - # |||||||||....... - - if ( $beg <= $index->{ $chr }->[ $index_beg ]->[ FS_CHR_END ] ) - { - @vals = split "\n", Maasha::Filesys::file_read( - $fh, - $index->{ $chr }->[ $index_beg ]->[ FS_INDEX_BEG ] + 6 * ( $beg - $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] ), - 6 * ( $index->{ $chr }->[ $index_beg ]->[ FS_CHR_END ] - $beg + 1 ), - ); - - for ( $c = 0; $c < @vals; $c++ ) { - $scores->[ $c + $beg - $chr_beg ] = $vals[ $c ]; - } - } - - $end = Maasha::Calc::min( $chr_end, $index->{ $chr }->[ $index_end ]->[ FS_CHR_END ] ); - - if ( $end <= $index->{ $chr }->[ $index_end ]->[ FS_CHR_END ] ) - { - @vals = split "\n", Maasha::Filesys::file_read( - $fh, - $index->{ $chr }->[ $index_end ]->[ FS_INDEX_BEG ], - 6 * ( $end - $index->{ $chr }->[ $index_end ]->[ FS_CHR_BEG ] + 1 ), - ); - - for ( $c = 0; $c < @vals; $c++ ) { - $scores->[ $c + $index->{ $chr }->[ $index_end ]->[ FS_CHR_BEG ] - $chr_beg ] = $vals[ $c ]; - } - } - - for ( $i = $index_beg + 1; $i <= $index_end - 1; $i++ ) - { - @vals = split "\n", Maasha::Filesys::file_read( - $fh, - $index->{ $chr }->[ $i ]->[ FS_INDEX_BEG ], - 6 * ( $index->{ $chr }->[ $i ]->[ FS_CHR_END ] - $index->{ $chr }->[ $i ]->[ FS_CHR_BEG ] + 1 ), - ); - - for ( $c = 0; $c < @vals; $c++ ) { - $scores->[ $c + $index->{ $chr }->[ $i ]->[ FS_CHR_BEG ] - $chr_beg ] = $vals[ $c ]; - } - } - } - } - else - { - Maasha::Common::error( qq(Chromosome "$chr" was not found in index) ); - } - - return wantarray ? @{ $scores } : $scores; -} - - # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PhastCons format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< @@ -1509,67 +595,6 @@ sub maf_parse } -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> WIGGLE FORMAT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - -sub fixedstep_put_entry -{ - # Martin A. Hansen, April 2008. - - # Outputs a block of fixedStep values. - # Used for outputting wiggle data. - - my ( $chr, # chromosome - $beg, # start position - $block, # list of scores - $fh, # filehandle - OPTIONAL - $log10, # flag indicating that log10 scores should be used - ) = @_; - - # Returns nothing. - - $beg += 1; # fixedStep format is 1 based. - - $fh ||= \*STDOUT; - - print $fh "fixedStep chrom=$chr start=$beg step=1\n"; - - if ( $log10 ) { - map { printf( $fh "%f\n", Maasha::Calc::log10( $_ + 1 ) ) } @{ $block }; - } else { - map { printf( $fh "%d\n", ( $_ + 1 ) ) } @{ $block }; - } -} - - -sub wiggle_upload_to_ucsc -{ - # Martin A. Hansen, May 2008. - - # Upload a wiggle file to the UCSC database. - - my ( $tmp_dir, # temporary directory - $wib_dir, # wib directory - $wig_file, # file to upload, - $options, # argument hashref - ) = @_; - - # Returns nothing. - - my ( $args ); - -# $args = join " ", "-tmpDir=$tmp_dir", "-pathPrefix=$wib_dir", $options->{ "database" }, $options->{ 'table' }, $wig_file; - -# Maasha::Common::run( "hgLoadWiggle", "$args > /dev/null 2>&1" ); - - if ( $options->{ 'verbose' } ) { - `cd $tmp_dir && hgLoadWiggle -tmpDir=$tmp_dir -pathPrefix=$wib_dir $options->{ 'database' } $options->{ 'table' } $wig_file`; - } else { - `cd $tmp_dir && hgLoadWiggle -tmpDir=$tmp_dir -pathPrefix=$wib_dir $options->{ 'database' } $options->{ 'table' } $wig_file > /dev/null 2>&1`; - } -} - - # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> MySQL CONF <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< diff --git a/code_perl/Maasha/UCSC/BED.pm b/code_perl/Maasha/UCSC/BED.pm index 77988f7..eb8beac 100644 --- a/code_perl/Maasha/UCSC/BED.pm +++ b/code_perl/Maasha/UCSC/BED.pm @@ -707,6 +707,273 @@ sub tag_contigs_scan } +sub bed_upload_to_ucsc +{ + # Martin A. Hansen, September 2007. + + # Upload a BED file to the UCSC database. + + my ( $tmp_dir, # temporary directory + $file, # file to upload, + $options, # argument hashref + $append, # flag indicating table should be appended + ) = @_; + + # Returns nothing. + + my ( $args, $table, $sql_file, $fh_out, $fh_in ); + + if ( $append ) { + $args = join " ", $options->{ "database" }, $options->{ "table" }, "-tmpDir=$tmp_dir", "-oldTable", $file; + } else { + $args = join " ", $options->{ "database" }, $options->{ "table" }, "-tmpDir=$tmp_dir", $file; + } + + if ( $options->{ "table" } =~ /rnaSecStr/ ) + { + $table = $options->{ "table" }; + + print qq(uploading secondary structure table:"$table"\n) if $options->{ "verbose" }; + + $sql_file = "$tmp_dir/upload_RNA_SS.sql"; + + $fh_out = Maasha::Filesys::file_write_open( $sql_file ); + + print $fh_out qq( +CREATE TABLE $table ( + bin smallint not null, # Bin number for browser speedup + chrom varchar(255) not null, # Chromosome or FPC contig + chromStart int unsigned not null, # Start position in chromosome + chromEnd int unsigned not null, # End position in chromosome + name varchar(255) not null, # Name of item + score int unsigned not null, # Score from 0-1000 + strand char(1) not null, # + or - + size int unsigned not null, # Size of element. + secStr longblob not null, # Parentheses and '.'s which define the secondary structure + conf longblob not null, # Confidence of secondary-structure annotation per position (0.0-1.0). + #Indices + INDEX(name(16)), + INDEX(chrom(8), bin), + INDEX(chrom(8), chromStart) +); + ); + + close $fh_out; + + Maasha::Common::run( "hgLoadBed", "-notItemRgb -sqlTable=$sql_file $options->{ 'database' } $options->{ 'table' } -tmpDir=$tmp_dir $file > /dev/null 2>&1" ); + + unlink $sql_file; + } + else + { + Maasha::Common::run( "hgLoadBed", "$args > /dev/null 2>&1" ); + } +} + + +sub bed_analyze +{ + # Martin A. Hansen, March 2008. + + # Given a bed record, analysis this to give information + # about intron/exon sizes. + + my ( $entry, # BED entry + ) = @_; + + # Returns hashref. + + my ( $i, @begs, @lens, $exon_max, $exon_min, $exon_len, $exon_tot, $intron_max, $intron_min, $intron_len, $intron_tot ); + + $exon_max = 0; + $exon_min = 9999999999; + $intron_max = 0; + $intron_min = 9999999999; + + $entry->{ "EXONS" } = $entry->{ "BLOCK_COUNT" }; + + @begs = split /,/, $entry->{ "Q_BEGS" }; + @lens = split /,/, $entry->{ "BLOCK_LENS" }; + + for ( $i = 0; $i < $entry->{ "BLOCK_COUNT" }; $i++ ) + { + $exon_len = $lens[ $i ]; + + $entry->{ "EXON_LEN_$i" } = $exon_len; + + $exon_max = $exon_len if $exon_len > $exon_max; + $exon_min = $exon_len if $exon_len < $exon_min; + + $exon_tot += $exon_len; + } + + $entry->{ "EXON_LEN_-1" } = $exon_len; + $entry->{ "EXON_MAX_LEN" } = $exon_max; + $entry->{ "EXON_MIN_LEN" } = $exon_min; + $entry->{ "EXON_MEAN_LEN" } = int( $exon_tot / $entry->{ "EXONS" } ); + + $entry->{ "INTRONS" } = $entry->{ "BLOCK_COUNT" } - 1; + $entry->{ "INTRONS" } = 0 if $entry->{ "INTRONS" } < 0; + + if ( $entry->{ "INTRONS" } ) + { + for ( $i = 1; $i < $entry->{ "BLOCK_COUNT" }; $i++ ) + { + $intron_len = $begs[ $i ] - ( $begs[ $i - 1 ] + $lens[ $i - 1 ] ); + + $entry->{ "INTRON_LEN_" . ( $i - 1 ) } = $intron_len; + + $intron_max = $intron_len if $intron_len > $intron_max; + $intron_min = $intron_len if $intron_len < $intron_min; + + $intron_tot += $intron_len; + } + + $entry->{ "INTRON_LEN_-1" } = $intron_len; + $entry->{ "INTRON_MAX_LEN" } = $intron_max; + $entry->{ "INTRON_MIN_LEN" } = $intron_min; + $entry->{ "INTRON_MEAN_LEN" } = int( $intron_tot / $entry->{ "INTRONS" } ); + } + + return wantarray ? %{ $entry } : $entry; +} + + +sub bed_merge_entries +{ + # Martin A. Hansen, February 2008. + + # Merge a list of given BED entries in one big entry. + + my ( $entries, # list of BED entries to be merged + ) = @_; + + # Returns hash. + + my ( $i, @q_ids, @q_begs, @blocksizes, @new_q_begs, @new_blocksizes, %new_entry ); + + @{ $entries } = sort { $a->{ "CHR_BEG" } <=> $b->{ "CHR_BEG" } } @{ $entries }; + + for ( $i = 0; $i < @{ $entries }; $i++ ) + { + Maasha::Common::error( qq(Attempted merge of BED entries from different chromosomes) ) if $entries->[ 0 ]->{ "CHR" } ne $entries->[ $i ]->{ "CHR" }; + Maasha::Common::error( qq(Attempted merge of BED entries from different strands) ) if $entries->[ 0 ]->{ "STRAND" } ne $entries->[ $i ]->{ "STRAND" }; + + push @q_ids, $entries->[ $i ]->{ "Q_ID" } || sprintf( "ID%06d", $i ); + + if ( exists $entries->[ $i ]->{ "Q_BEGS" } ) + { + @q_begs = split ",", $entries->[ $i ]->{ "Q_BEGS" }; + @blocksizes = split ",", $entries->[ $i ]->{ "BLOCK_LENS" }; + } + else + { + @q_begs = 0; + @blocksizes = $entries->[ $i ]->{ "CHR_END" } - $entries->[ $i ]->{ "CHR_BEG" } + 1; + } + + map { $_ += $entries->[ $i ]->{ "CHR_BEG" } } @q_begs; + + push @new_q_begs, @q_begs; + push @new_blocksizes, @blocksizes; + } + + map { $_ -= $entries->[ 0 ]->{ "CHR_BEG" } } @new_q_begs; + + %new_entry = ( + CHR => $entries->[ 0 ]->{ "CHR" }, + CHR_BEG => $entries->[ 0 ]->{ "CHR_BEG" }, + CHR_END => $entries->[ -1 ]->{ "CHR_END" }, + REC_TYPE => "BED", + BED_LEN => $entries->[ -1 ]->{ "CHR_END" } - $entries->[ 0 ]->{ "CHR_BEG" } + 1, + BED_COLS => 12, + Q_ID => join( ":", @q_ids ), + SCORE => 999, + STRAND => $entries->[ 0 ]->{ "STRAND" } || "+", + THICK_BEG => $entries->[ 0 ]->{ "THICK_BEG" } || $entries->[ 0 ]->{ "CHR_BEG" }, + THICK_END => $entries->[ -1 ]->{ "THICK_END" } || $entries->[ -1 ]->{ "CHR_END" }, + COLOR => 0, + BLOCK_COUNT => scalar @new_q_begs, + BLOCK_LENS => join( ",", @new_blocksizes ), + Q_BEGS => join( ",", @new_q_begs ), + ); + + return wantarray ? %new_entry : \%new_entry; +} + + +sub bed_split_entry +{ + # Martin A. Hansen, February 2008. + + # Splits a given BED entry into a list of blocks, + # which are returned. A list of 6 column BED entry is returned. + + my ( $entry, # BED entry hashref + ) = @_; + + # Returns a list. + + my ( @q_begs, @blocksizes, $block, @blocks, $i ); + + if ( exists $entry->{ "BLOCK_COUNT" } ) + { + @q_begs = split ",", $entry->{ "Q_BEGS" }; + @blocksizes = split ",", $entry->{ "BLOCK_LENS" }; + + for ( $i = 0; $i < @q_begs; $i++ ) + { + undef $block; + + $block->{ "CHR" } = $entry->{ "CHR" }; + $block->{ "CHR_BEG" } = $entry->{ "CHR_BEG" } + $q_begs[ $i ]; + $block->{ "CHR_END" } = $entry->{ "CHR_BEG" } + $q_begs[ $i ] + $blocksizes[ $i ] - 1; + $block->{ "Q_ID" } = $entry->{ "Q_ID" } . sprintf( "_%03d", $i ); + $block->{ "SCORE" } = $entry->{ "SCORE" }; + $block->{ "STRAND" } = $entry->{ "STRAND" }; + $block->{ "BED_LEN" } = $block->{ "CHR_END" } - $block->{ "CHR_BEG" } + 1, + $block->{ "BED_COLS" } = 6; + $block->{ "REC_TYPE" } = "BED"; + + push @blocks, $block; + } + } + else + { + @blocks = @{ $entry }; + } + + return wantarray ? @blocks : \@blocks; +} + + +sub bed_overlap +{ + # Martin A. Hansen, February 2008. + + # Checks if two BED entries overlap and + # return 1 if so - else 0; + + my ( $entry1, # hashref + $entry2, # hashref + $no_strand, # don't check strand flag - OPTIONAL + ) = @_; + + # Return bolean. + + return 0 if $entry1->{ "CHR" } ne $entry2->{ "CHR" }; + return 0 if $entry1->{ "STRAND" } ne $entry2->{ "STRAND" }; + + if ( $entry1->{ "CHR_END" } < $entry2->{ "CHR_BEG" } or $entry1->{ "CHR_BEG" } > $entry2->{ "CHR_END" } ) { + return 0; + } else { + return 1; + } +} + + + + # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< diff --git a/code_perl/Maasha/UCSC/PSL.pm b/code_perl/Maasha/UCSC/PSL.pm index d803a96..4c9edb0 100644 --- a/code_perl/Maasha/UCSC/PSL.pm +++ b/code_perl/Maasha/UCSC/PSL.pm @@ -254,6 +254,29 @@ sub psl_put_entry } +sub psl_upload_to_ucsc +{ + # Martin A. Hansen, September 2007. + + # Upload a PSL file to the UCSC database. + + my ( $file, # file to upload, + $options, # argument hashref + $append, # flag indicating table should be appended + ) = @_; + + # Returns nothing. + + my ( $args ); + + if ( $append ) { + $args = join " ", $options->{ "database" }, "-table=$options->{ 'table' }", "-clientLoad", "-append", $file; + } else { + $args = join " ", $options->{ "database" }, "-table=$options->{ 'table' }", "-clientLoad", $file; + } + + Maasha::Common::run( "hgLoadPsl", "$args > /dev/null 2>&1" ); +} # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< diff --git a/code_perl/Maasha/UCSC/Wiggle.pm b/code_perl/Maasha/UCSC/Wiggle.pm index fbd7542..ce52c73 100644 --- a/code_perl/Maasha/UCSC/Wiggle.pm +++ b/code_perl/Maasha/UCSC/Wiggle.pm @@ -49,20 +49,26 @@ require Exporter; use constant { - BITS => 32, # Number of bits in an integer - SEQ_MAX => 200_000_000, # Maximum sequence size - chrom => 0, # BED field names - chromStart => 1, - chromEnd => 2, - name => 3, - score => 4, - strand => 5, - thickStart => 6, - thickEnd => 7, - itemRgb => 8, - blockCount => 9, - blockSizes => 10, - blockStarts => 11, + BITS => 32, # Number of bits in an integer + SEQ_MAX => 200_000_000, # Maximum sequence size + chrom => 0, # BED field names + chromStart => 1, + chromEnd => 2, + name => 3, + score => 4, + strand => 5, + thickStart => 6, + thickEnd => 7, + itemRgb => 8, + blockCount => 9, + blockSizes => 10, + blockStarts => 11, + + FS_CHR_BEG => 0, + FS_NEXT_CHR_BEG => 1, + FS_CHR_END => 2, + FS_INDEX_BEG => 3, + FS_INDEX_LEN => 4, }; @@ -320,6 +326,261 @@ sub fixedstep_scan } +sub fixedstep_index_create +{ + # Martin A. Hansen, January 2008. + + # Indexes a concatenated fixedStep file. + # The index consists of a hash with chromosomes as keys, + # and a list of [ chr_beg, next_chr_beg, chr_end, index_beg, index_len ] as values. + + my ( $path, # path to fixedStep file + ) = @_; + + # Returns a hashref + + my ( $fh, $pos, $index_beg, $index_len, $entry, $locator, $chr, $step, $beg, $end, $len, %index, $i ); + + $fh = Maasha::Filesys::file_read_open( $path ); + + $pos = 0; + + while ( $entry = Maasha::UCSC::fixedstep_get_entry( $fh ) ) + { + $locator = shift @{ $entry }; + + if ( $locator =~ /chrom=([^ ]+) start=(\d+) step=(\d+)/ ) + { + $chr = $1; + $beg = $2 - 1; # fixedStep files are 1-based + $step = $3; + } + else + { + Maasha::Common::error( qq(Could not parse locator: $locator) ); + } + + $pos += length( $locator ) + 11; + + $index_beg = $pos; + +# map { $pos += length( $_ ) + 1 } @{ $entry }; + + $pos += 6 * scalar @{ $entry }; + + $index_len = $pos - $index_beg; + + push @{ $index{ $chr } }, [ $beg, undef, $beg + scalar @{ $entry } - 1, $index_beg, $index_len ]; + } + + close $fh; + + foreach $chr ( keys %index ) + { + for ( $i = 0; $i < @{ $index{ $chr } } - 1; $i++ ) { + $index{ $chr }->[ $i ]->[ FS_NEXT_CHR_BEG ] = $index{ $chr }->[ $i + 1 ]->[ 0 ]; + } + + $index{ $chr }->[ -1 ]->[ FS_NEXT_CHR_BEG ] = $index{ $chr }->[ -1 ]->[ FS_CHR_END ] + 1; + } + + return wantarray ? %index : \%index; +} + + +sub fixedstep_index_store +{ + # Martin A. Hansen, January 2008. + + # Writes a fixedStep index to binary file. + + my ( $path, # full path to file + $index, # list with index + ) = @_; + + # returns nothing + + Maasha::Filesys::file_store( $path, $index ); +} + + +sub fixedstep_index_retrieve +{ + # Martin A. Hansen, January 2008. + + # Retrieves a fixedStep index from binary file. + + my ( $path, # full path to file + ) = @_; + + # returns list + + my $index; + + $index = Maasha::Filesys::file_retrieve( $path ); + + return wantarray ? %{ $index } : $index; +} + + +sub fixedstep_index_lookup +{ + # Martin A. Hansen, January 2008. + + # Retrieve fixedStep scores from a indexed + # fixedStep file given a chromosome and + # begin and end positions. + + my ( $index, # data structure + $fh, # filehandle to datafile + $chr, # chromosome + $chr_beg, # chromosome beg + $chr_end, # chromosome end + $flank, # include flanking region - OPTIONAL + ) = @_; + + # Returns a list + + my ( $index_beg, $index_end, $i, $c, $beg, $end, @vals, $scores ); + + $flank ||= 0; + + $chr_beg -= $flank; + $chr_end += $flank; + + # print "chr_beg->$chr_beg chr_end->$chr_end flank->$flank\n"; + + if ( exists $index->{ $chr } ) + { + $index_beg = Maasha::Matrix::interval_search( $index->{ $chr }, 0, 1, $chr_beg ); + + if ( $index_beg < 0 ) { + Maasha::Common::error( qq(Index search failed - begin index position doesn't exists: $chr_beg) ); + } + + if ( $chr_end < $index->{ $chr }->[ $index_beg ]->[ 1 ] ) + { + $index_end = $index_beg; + } + else + { + $index_end = Maasha::Matrix::interval_search( $index->{ $chr }, 0, 1, $chr_end ); + + if ( $index_end < 0 ) { + Maasha::Common::error( qq(Index search failed - end index position doesn't exists: $chr_end) ); + } + } + + map { $scores->[ $_ ] = 0 } 0 .. $chr_end - $chr_beg; + + if ( $index_beg == $index_end ) + { + $beg = Maasha::Calc::max( $chr_beg, $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] ); + $end = Maasha::Calc::min( $chr_end, $index->{ $chr }->[ $index_end ]->[ FS_CHR_END ] ); + + if ( $beg <= $index->{ $chr }->[ $index_beg ]->[ FS_CHR_END ] and $end >= $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] ) + { + @vals = split "\n", Maasha::Filesys::file_read( + $fh, + $index->{ $chr }->[ $index_beg ]->[ FS_INDEX_BEG ] + 6 * ( $beg - $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] ), + 6 * ( $end - $beg + 1 ), + ); + } + + for ( $c = 0; $c < @vals; $c++ ) { + $scores->[ $c + $beg - $chr_beg ] = $vals[ $c ]; + } + } + else + { + $beg = Maasha::Calc::max( $chr_beg, $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] ); + +# print Dumper( $beg, $index->{ $chr }->[ $index_beg ] ); +# print Dumper( "next", $index->{ $chr }->[ $index_beg ]->[ FS_NEXT_CHR_BEG ] ); + + # beg next + # v v + # |||||||||....... + + if ( $beg <= $index->{ $chr }->[ $index_beg ]->[ FS_CHR_END ] ) + { + @vals = split "\n", Maasha::Filesys::file_read( + $fh, + $index->{ $chr }->[ $index_beg ]->[ FS_INDEX_BEG ] + 6 * ( $beg - $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] ), + 6 * ( $index->{ $chr }->[ $index_beg ]->[ FS_CHR_END ] - $beg + 1 ), + ); + + for ( $c = 0; $c < @vals; $c++ ) { + $scores->[ $c + $beg - $chr_beg ] = $vals[ $c ]; + } + } + + $end = Maasha::Calc::min( $chr_end, $index->{ $chr }->[ $index_end ]->[ FS_CHR_END ] ); + + if ( $end <= $index->{ $chr }->[ $index_end ]->[ FS_CHR_END ] ) + { + @vals = split "\n", Maasha::Filesys::file_read( + $fh, + $index->{ $chr }->[ $index_end ]->[ FS_INDEX_BEG ], + 6 * ( $end - $index->{ $chr }->[ $index_end ]->[ FS_CHR_BEG ] + 1 ), + ); + + for ( $c = 0; $c < @vals; $c++ ) { + $scores->[ $c + $index->{ $chr }->[ $index_end ]->[ FS_CHR_BEG ] - $chr_beg ] = $vals[ $c ]; + } + } + + for ( $i = $index_beg + 1; $i <= $index_end - 1; $i++ ) + { + @vals = split "\n", Maasha::Filesys::file_read( + $fh, + $index->{ $chr }->[ $i ]->[ FS_INDEX_BEG ], + 6 * ( $index->{ $chr }->[ $i ]->[ FS_CHR_END ] - $index->{ $chr }->[ $i ]->[ FS_CHR_BEG ] + 1 ), + ); + + for ( $c = 0; $c < @vals; $c++ ) { + $scores->[ $c + $index->{ $chr }->[ $i ]->[ FS_CHR_BEG ] - $chr_beg ] = $vals[ $c ]; + } + } + } + } + else + { + Maasha::Common::error( qq(Chromosome "$chr" was not found in index) ); + } + + return wantarray ? @{ $scores } : $scores; +} + + +sub wiggle_upload_to_ucsc +{ + # Martin A. Hansen, May 2008. + + # Upload a wiggle file to the UCSC database. + + my ( $tmp_dir, # temporary directory + $wib_dir, # wib directory + $wig_file, # file to upload, + $options, # argument hashref + ) = @_; + + # Returns nothing. + + my ( $args ); + +# $args = join " ", "-tmpDir=$tmp_dir", "-pathPrefix=$wib_dir", $options->{ "database" }, $options->{ 'table' }, $wig_file; + +# Maasha::Common::run( "hgLoadWiggle", "$args > /dev/null 2>&1" ); + + if ( $options->{ 'verbose' } ) { + `cd $tmp_dir && hgLoadWiggle -tmpDir=$tmp_dir -pathPrefix=$wib_dir $options->{ 'database' } $options->{ 'table' } $wig_file`; + } else { + `cd $tmp_dir && hgLoadWiggle -tmpDir=$tmp_dir -pathPrefix=$wib_dir $options->{ 'database' } $options->{ 'table' } $wig_file > /dev/null 2>&1`; + } +} + + # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -- 2.39.2