X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=code_perl%2FMaasha%2FUCSC.pm;h=05913c95817f170a05a0f62613774ef0679e90b9;hb=9052a73697c99a1f0634e2d1f2cf2932f8288810;hp=4f932e3b5135de092033a663118ee75f4346bb4c;hpb=5d5968148bb0f5e97f6ff73cfdf77a8269f57343;p=biopieces.git diff --git a/code_perl/Maasha/UCSC.pm b/code_perl/Maasha/UCSC.pm index 4f932e3..05913c9 100644 --- a/code_perl/Maasha/UCSC.pm +++ b/code_perl/Maasha/UCSC.pm @@ -28,849 +28,212 @@ package Maasha::UCSC; # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +use warnings; use strict; use vars qw ( @ISA @EXPORT ); use Data::Dumper; -use Time::HiRes qw( gettimeofday ); - use Maasha::Common; +use Maasha::Filesys; use Maasha::Calc; use Maasha::Matrix; use constant { - CHR_BEG => 0, - NEXT_CHR_BEG => 1, + CHR => 0, + CHR_BEG => 1, CHR_END => 2, - INDEX_BEG => 3, - INDEX_LEN => 4, + Q_ID => 3, + SCORE => 4, + STRAND => 5, + THICK_BEG => 6, + THICK_END => 7, + COLOR => 8, + BLOCK_COUNT => 9, + BLOCK_LENS => 10, + Q_BEGS => 11, }; @ISA = qw( Exporter ); -my $TIME = gettimeofday(); - - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> BED format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - -# http://genome.ucsc.edu/goldenPath/help/hgTracksHelp.html#BED +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> TRACK FILE <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -sub bed_get_entry +sub ucsc_config_entry_get { - # Martin A. Hansen, December 2007. + # Martin A. Hansen, November 2008. - # Reads a bed entry given a filehandle. + # Given a filehandle to a UCSC Genome Browser + # config file (.ra) get the next entry and + # return as a hash. Entries are separated by blank + # lines. # lines are skipped unless it is the lines: + # # Track added ... + # # Database ... - my ( $fh, # file handle - $columns, # number of BED columns to read - OPTIONAL + my ( $fh, # file hanlde ) = @_; - # Returns hashref. - - my ( $line, @fields, %entry ); + # Returns a hashref. - $line = <$fh>; + my ( $line, %record ); - $line =~ s/(\n|\r)$//g; # some people have carriage returns in their BED files -> Grrrr - - return if not defined $line; - - @fields = split "\t", $line; - - $columns ||= scalar @fields; - - if ( $columns == 3 ) - { - %entry = ( - "CHR" => $fields[ 0 ], - "CHR_BEG" => $fields[ 1 ], - "CHR_END" => $fields[ 2 ] - 1, - ); - } - elsif ( $columns == 4 ) - { - %entry = ( - "CHR" => $fields[ 0 ], - "CHR_BEG" => $fields[ 1 ], - "CHR_END" => $fields[ 2 ] - 1, - "Q_ID" => $fields[ 3 ], - ); - } - elsif ( $columns == 5 ) - { - %entry = ( - "CHR" => $fields[ 0 ], - "CHR_BEG" => $fields[ 1 ], - "CHR_END" => $fields[ 2 ] - 1, - "Q_ID" => $fields[ 3 ], - "SCORE" => $fields[ 4 ], - ); - } - elsif ( $columns == 6 ) - { - %entry = ( - "CHR" => $fields[ 0 ], - "CHR_BEG" => $fields[ 1 ], - "CHR_END" => $fields[ 2 ] - 1, - "Q_ID" => $fields[ 3 ], - "SCORE" => $fields[ 4 ], - "STRAND" => $fields[ 5 ], - ); - } - elsif ( $columns == 12 ) - { - %entry = ( - "CHR" => $fields[ 0 ], - "CHR_BEG" => $fields[ 1 ], - "CHR_END" => $fields[ 2 ] - 1, - "Q_ID" => $fields[ 3 ], - "SCORE" => $fields[ 4 ], - "STRAND" => $fields[ 5 ], - "THICK_BEG" => $fields[ 6 ], - "THICK_END" => $fields[ 7 ] - 1, - "ITEMRGB" => $fields[ 8 ], - "BLOCKCOUNT" => $fields[ 9 ], - "BLOCKSIZES" => $fields[ 10 ], - "Q_BEGS" => $fields[ 11 ], - ); - } - else + while ( $line = <$fh> ) { - Maasha::Common::error( qq(Bad BED format in line->$line<-) ); - } - - $entry{ "REC_TYPE" } = "BED"; - $entry{ "BED_LEN" } = $entry{ "CHR_END" } - $entry{ "CHR_BEG" } + 1; - $entry{ "BED_COLS" } = $columns; - - return wantarray ? %entry : \%entry; -} - - -sub bed_get_entries -{ - # Martin A. Hansen, January 2008. - - # Given a path to a BED file, read in all entries - # and return. - - my ( $path, # full path to BED file - $columns, # number of columns in BED file - OPTIONAL (but is faster) - ) = @_; - - # Returns a list. - - my ( $fh, $entry, @list ); - - $fh = Maasha::Common::read_open( $path ); + $line =~ tr/\n\r//d; + + if ( $line =~ /Track added by 'upload_to_ucsc' (\S+) (\S+)/) { + $record{ 'date' } = $1; + $record{ 'time' } = $2; + } elsif ( $line =~ /^# date: (.+)/ ) { + $record{ 'date' } = $1; + } elsif ( $line =~ /^# time: (.+)/ ) { + $record{ 'time' } = $1; + } elsif ( $line =~ /^# (?:database:|Database) (.+)/ ) { + $record{ 'database' } = $1; + } elsif ( $line =~ /^#/ ) { + # skip + } elsif ( $line =~ /(\S+)\s+(.+)/ ) { + $record{ $1 } = $2; + } - while ( $entry = bed_get_entry( $fh ) ) { - push @list, $entry; + last if $line =~ /^$/ and exists $record{ "track" }; } - close $fh; + return undef if not exists $record{ "track" }; - return wantarray ? @list : \@list; + return wantarray ? %record : \%record; } -sub bed_put_entry +sub ucsc_config_entry_put { - # Martin A. Hansen, Septermber 2007. + # Martin A. Hansen, November 2008. - # Writes a BED entry to file. + # Outputs a Biopiece record (a hashref) + # to a filehandle or STDOUT. - # NB, this could really be more robust!? - - my ( $record, # hashref - $fh, # file handle - OPTIONAL - $columns, # number of columns in BED file - OPTIONAL (but is faster) + my ( $record, # hashref + $fh_out, # file handle ) = @_; # Returns nothing. - my ( @fields ); - - $columns ||= 12; # max number of columns possible - - if ( $columns == 3 ) - { - push @fields, $record->{ "CHR" }; - push @fields, $record->{ "CHR_BEG" }; - push @fields, $record->{ "CHR_END" } + 1; - } - elsif ( $columns == 4 ) - { - $record->{ "Q_ID" } =~ s/\s+/_/g; - - push @fields, $record->{ "CHR" }; - push @fields, $record->{ "CHR_BEG" }; - push @fields, $record->{ "CHR_END" } + 1; - push @fields, $record->{ "Q_ID" }; - } - elsif ( $columns == 5 ) - { - $record->{ "Q_ID" } =~ s/\s+/_/g; - $record->{ "SCORE" } =~ s/\.\d*//; - - push @fields, $record->{ "CHR" }; - push @fields, $record->{ "CHR_BEG" }; - push @fields, $record->{ "CHR_END" } + 1; - push @fields, $record->{ "Q_ID" }; - push @fields, $record->{ "SCORE" }; - } - elsif ( $columns == 6 ) - { - $record->{ "Q_ID" } =~ s/\s+/_/g; - $record->{ "SCORE" } =~ s/\.\d*//; - - push @fields, $record->{ "CHR" }; - push @fields, $record->{ "CHR_BEG" }; - push @fields, $record->{ "CHR_END" } + 1; - push @fields, $record->{ "Q_ID" }; - push @fields, $record->{ "SCORE" }; - push @fields, $record->{ "STRAND" }; - } - else - { - $record->{ "Q_ID" } =~ s/\s+/_/g; - $record->{ "SCORE" } =~ s/\.\d*//; - - push @fields, $record->{ "CHR" }; - push @fields, $record->{ "CHR_BEG" }; - push @fields, $record->{ "CHR_END" } + 1; - push @fields, $record->{ "Q_ID" }; - push @fields, $record->{ "SCORE" }; - push @fields, $record->{ "STRAND" }; - push @fields, $record->{ "THICK_BEG" } if defined $record->{ "THICK_BEG" }; - push @fields, $record->{ "THICK_END" } + 1 if defined $record->{ "THICK_END" }; - push @fields, $record->{ "ITEMRGB" } if defined $record->{ "ITEMRGB" }; - push @fields, $record->{ "BLOCKCOUNT" } if defined $record->{ "BLOCKCOUNT" }; - push @fields, $record->{ "BLOCKSIZES" } if defined $record->{ "BLOCKSIZES" }; - push @fields, $record->{ "Q_BEGS" } if defined $record->{ "Q_BEGS" }; - } - - if ( $fh ) { - print $fh join( "\t", @fields ), "\n"; - } else { - print join( "\t", @fields ), "\n"; - } + my ( $date, $time ); + + $fh_out ||= \*STDOUT; + + ( $date, $time ) = split " ", Maasha::Common::time_stamp(); + + $record->{ "date" } ||= $date; + $record->{ "time" } ||= $time; + + print $fh_out "\n# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n\n"; + + map { print $fh_out "# $_: $record->{ $_ }\n" if exists $record->{ $_ } } qw( date time database ); + map { print $fh_out "$_ $record->{ $_ }\n" if exists $record->{ $_ } } qw( track + name + description + itemRgb + db + offset + url + htmlUrl + shortLabel + longLabel + group + priority + useScore + visibility + maxHeightPixels + color + mafTrack + type ); } -sub bed_put_entries +sub ucsc_update_config { - # Martin A. Hansen, January 2008. + # Martin A. Hansen, September 2007. - # Write a list of BED entries. + # Update the /home/user/ucsc/my_tracks.ra file and executes makeCustomTracks.pl - my ( $entries, # list of entries, - $fh, # file handle - OPTIONAL + my ( $options, # hashref + $type, # track type ) = @_; # Returns nothing. - map { bed_put_entry( $_, $fh ) } @{ $entries }; -} - - -sub bed_analyze -{ - # Martin A. Hansen, March 2008. - - # Given a bed record, analysis this to give information - # about intron/exon sizes. - - my ( $entry, # BED entry - ) = @_; - - # Returns hashref. - - my ( $i, @begs, @lens, $exon_max, $exon_min, $exon_len, $exon_tot, $intron_max, $intron_min, $intron_len, $intron_tot ); - - $exon_max = 0; - $exon_min = 9999999999; - $intron_max = 0; - $intron_min = 9999999999; + my ( $file, $entry, %new_entry, $fh_in, $fh_out, $was_found ); - $entry->{ "EXONS" } = $entry->{ "BLOCKCOUNT" }; - - @begs = split /,/, $entry->{ "Q_BEGS" }; - @lens = split /,/, $entry->{ "BLOCKSIZES" }; - - for ( $i = 0; $i < $entry->{ "BLOCKCOUNT" }; $i++ ) - { - $exon_len = @lens[ $i ]; - - $entry->{ "EXON_LEN_$i" } = $exon_len; + $file = $ENV{ "HOME" } . "/ucsc/my_tracks.ra"; - $exon_max = $exon_len if $exon_len > $exon_max; - $exon_min = $exon_len if $exon_len < $exon_min; + Maasha::Filesys::file_copy( $file, "$file~" ); # create a backup - $exon_tot += $exon_len; - } + %new_entry = ( + database => $options->{ 'database' }, + track => $options->{ 'table' }, + shortLabel => $options->{ 'short_label' }, + longLabel => $options->{ 'long_label' }, + group => $options->{ 'group' }, + priority => $options->{ 'priority' }, + visibility => $options->{ 'visibility' }, + color => $options->{ 'color' }, + type => $type, + ); - $entry->{ "EXON_LEN_-1" } = $exon_len; - $entry->{ "EXON_MAX_LEN" } = $exon_max; - $entry->{ "EXON_MIN_LEN" } = $exon_min; - $entry->{ "EXON_MEAN_LEN" } = int( $exon_tot / $entry->{ "EXONS" } ); + $new_entry{ 'useScore' } = 1 if $options->{ 'use_score' }; + $new_entry{ 'mafTrack' } = "multiz17way" if $type eq "type bed 6 +"; + $new_entry{ 'maxHeightPixels' } = "50:50:11" if $type eq "wig 0"; - $entry->{ "INTRONS" } = $entry->{ "BLOCKCOUNT" } - 1; - $entry->{ "INTRONS" } = 0 if $entry->{ "INTRONS" } < 0; + $fh_in = Maasha::Filesys::file_read_open( "$file~" ); + $fh_out = Maasha::Filesys::file_write_open( $file ); - if ( $entry->{ "INTRONS" } ) + while ( $entry = ucsc_config_entry_get( $fh_in ) ) { - for ( $i = 1; $i < $entry->{ "BLOCKCOUNT" }; $i++ ) + if ( $entry->{ 'database' } eq $new_entry{ 'database' } and $entry->{ 'track' } eq $new_entry{ 'track' } ) { - $intron_len = @begs[ $i ] - ( @begs[ $i - 1 ] + @lens[ $i - 1 ] ); - - $entry->{ "INTRON_LEN_" . ( $i - 1 ) } = $intron_len; - - $intron_max = $intron_len if $intron_len > $intron_max; - $intron_min = $intron_len if $intron_len < $intron_min; - - $intron_tot += $intron_len; - } - - $entry->{ "INTRON_LEN_-1" } = $intron_len; - $entry->{ "INTRON_MAX_LEN" } = $intron_max; - $entry->{ "INTRON_MIN_LEN" } = $intron_min; - $entry->{ "INTRON_MEAN_LEN" } = int( $intron_tot / $entry->{ "INTRONS" } ); - } - - return wantarray ? %{ $entry } : $entry; -} - - -sub bed_sort -{ - # Martin A. Hansen, March 2008. - - # Sort a potential huge BED file according to - # CHR, CHR_BEG and optionally STRAND. - - my ( $tmp_dir, # temporary directory used for sorting - $file, # BED file to sort - $strand, # flag to sort on strand - OPTIONAL - ) = @_; - - # Returns nothing. - - my ( $fh_in, $key, $fh_out, %fh_hash, $part_file, $entry, $entries ); - - $fh_in = Maasha::Common::read_open( $file ); - - while ( $entry = bed_get_entry( $fh_in ) ) - { - if ( $strand ) { - $key = join "_", $entry->{ "CHR" }, $entry->{ "STRAND" }; - } else { - $key = $entry->{ "CHR" }; - } - - $fh_hash{ $key } = Maasha::Common::write_open( "$tmp_dir/$key.sort" ) if not exists $fh_hash{ $key }; - - bed_put_entry( $entry, $fh_hash{ $key } ); - } - - close $fh_in; - - map { close $_ } keys %fh_hash; - - $fh_out = Maasha::Common::write_open( "$tmp_dir/temp.sort" ); - - foreach $part_file ( sort keys %fh_hash ) - { - $entries = bed_get_entries( "$tmp_dir/$part_file.sort" ); - - @{ $entries } = sort { $a->{ "CHR_BEG" } <=> $b->{ "CHR_BEG" } } @{ $entries }; - - map { bed_put_entry( $_, $fh_out ) } @{ $entries }; - - unlink "$tmp_dir/$part_file.sort"; - } - - close $fh_out; - - rename "$tmp_dir/temp.sort", $file; -} - - -sub bed_merge_entries -{ - # Martin A. Hansen, February 2008. - - # Merge a list of given BED entries in one big entry. - - my ( $entries, # list of BED entries to be merged - ) = @_; - - # Returns hash. - - my ( $i, @q_ids, @q_begs, @blocksizes, @new_q_begs, @new_blocksizes, %new_entry ); - - @{ $entries } = sort { $a->{ "CHR_BEG" } <=> $b->{ "CHR_BEG" } } @{ $entries }; + ucsc_config_entry_put( \%new_entry, $fh_out ); - for ( $i = 0; $i < @{ $entries }; $i++ ) - { - Maasha::Common::error( qq(Attempted merge of BED entries from different chromosomes) ) if $entries->[ 0 ]->{ "CHR" } ne $entries->[ $i ]->{ "CHR" }; - Maasha::Common::error( qq(Attempted merge of BED entries from different strands) ) if $entries->[ 0 ]->{ "STRAND" } ne $entries->[ $i ]->{ "STRAND" }; - - push @q_ids, $entries->[ $i ]->{ "Q_ID" } || sprintf( "ID%06d", $i ); - - if ( exists $entries->[ $i ]->{ "Q_BEGS" } ) - { - @q_begs = split ",", $entries->[ $i ]->{ "Q_BEGS" }; - @blocksizes = split ",", $entries->[ $i ]->{ "BLOCKSIZES" }; + $was_found = 1; } else { - @q_begs = 0; - @blocksizes = $entries->[ $i ]->{ "CHR_END" } - $entries->[ $i ]->{ "CHR_BEG" } + 1; + ucsc_config_entry_put( $entry, $fh_out ); } - - map { $_ += $entries->[ $i ]->{ "CHR_BEG" } } @q_begs; - - push @new_q_begs, @q_begs; - push @new_blocksizes, @blocksizes; } - map { $_ -= $entries->[ 0 ]->{ "CHR_BEG" } } @new_q_begs; - - %new_entry = ( - CHR => $entries->[ 0 ]->{ "CHR" }, - CHR_BEG => $entries->[ 0 ]->{ "CHR_BEG" }, - CHR_END => $entries->[ -1 ]->{ "CHR_END" }, - REC_TYPE => "BED", - BED_LEN => $entries->[ -1 ]->{ "CHR_END" } - $entries->[ 0 ]->{ "CHR_BEG" } + 1, - BED_COLS => 12, - Q_ID => join( ":", @q_ids ), - SCORE => 999, - STRAND => $entries->[ 0 ]->{ "STRAND" } || "+", - THICK_BEG => $entries->[ 0 ]->{ "THICK_BEG" } || $entries->[ 0 ]->{ "CHR_BEG" }, - THICK_END => $entries->[ -1 ]->{ "THICK_END" } || $entries->[ -1 ]->{ "CHR_END" }, - ITEMRGB => "0,0,0", - BLOCKCOUNT => scalar @new_q_begs, - BLOCKSIZES => join( ",", @new_blocksizes ), - Q_BEGS => join( ",", @new_q_begs ), - ); - - return wantarray ? %new_entry : \%new_entry; -} - - -sub bed_split_entry -{ - # Martin A. Hansen, February 2008. - - # Splits a given BED entry into a list of blocks, - # which are returned. A list of 6 column BED entry is returned. - - my ( $entry, # BED entry hashref - ) = @_; - - # Returns a list. - - my ( @q_begs, @blocksizes, $block, @blocks, $i ); - - if ( exists $entry->{ "BLOCKCOUNT" } ) - { - @q_begs = split ",", $entry->{ "Q_BEGS" }; - @blocksizes = split ",", $entry->{ "BLOCKSIZES" }; - - for ( $i = 0; $i < @q_begs; $i++ ) - { - undef $block; - - $block->{ "CHR" } = $entry->{ "CHR" }; - $block->{ "CHR_BEG" } = $entry->{ "CHR_BEG" } + $q_begs[ $i ]; - $block->{ "CHR_END" } = $entry->{ "CHR_BEG" } + $q_begs[ $i ] + $blocksizes[ $i ] - 1; - $block->{ "Q_ID" } = $entry->{ "Q_ID" } . sprintf( "_%03d", $i ); - $block->{ "SCORE" } = $entry->{ "SCORE" }; - $block->{ "STRAND" } = $entry->{ "STRAND" }; - $block->{ "BED_LEN" } = $block->{ "CHR_END" } - $block->{ "CHR_BEG" } + 1, - $block->{ "BED_COLS" } = 6; - $block->{ "REC_TYPE" } = "BED"; - - push @blocks, $block; - } - } - else - { - @blocks = @{ $entry }; - } - - return wantarray ? @blocks : \@blocks; -} - - - -sub bed_overlap -{ - # Martin A. Hansen, February 2008. - - # Checks if two BED entries overlap and - # return 1 if so - else 0; - - my ( $entry1, # hashref - $entry2, # hashref - $no_strand, # don't check strand flag - OPTIONAL - ) = @_; - - # Return bolean. - - return 0 if $entry1->{ "CHR" } ne $entry2->{ "CHR" }; - return 0 if $entry1->{ "STRAND" } ne $entry2->{ "STRAND" }; - - if ( $entry1->{ "CHR_END" } < $entry2->{ "CHR_BEG" } or $entry1->{ "CHR_BEG" } > $entry2->{ "CHR_END" } ) { - return 0; - } else { - return 1; - } -} - - -sub bed_upload_to_ucsc -{ - # Martin A. Hansen, September 2007. - - # Upload a BED file to the UCSC database. - - my ( $tmp_dir, # temporary directory - $file, # file to upload, - $options, # argument hashref - $append, # flag indicating table should be appended - ) = @_; - - # Returns nothing. - - my ( $args, $table, $sql_file, $fh_out, $fh_in ); - - if ( $append ) { - $args = join " ", $options->{ "database" }, $options->{ "table" }, "-tmpDir=$tmp_dir", "-oldTable", $file; - } else { - $args = join " ", $options->{ "database" }, $options->{ "table" }, "-tmpDir=$tmp_dir", $file; - } - - if ( $options->{ "sec_struct" } ) - { - $table = $options->{ "table" }; - - Maasha::Common::error( "Attempt to load secondary structure track without 'rnaSecStr' in table name" ) if not $table =~ /rnaSecStr/; - - $sql_file = "$tmp_dir/upload_RNA_SS.sql"; - - $fh_out = Maasha::Common::write_open( $sql_file ); - - print $fh_out qq( -CREATE TABLE $table ( - bin smallint not null, # Bin number for browser speedup - chrom varchar(255) not null, # Chromosome or FPC contig - chromStart int unsigned not null, # Start position in chromosome - chromEnd int unsigned not null, # End position in chromosome - name varchar(255) not null, # Name of item - score int unsigned not null, # Score from 0-1000 - strand char(1) not null, # + or - - size int unsigned not null, # Size of element. - secStr longblob not null, # Parentheses and '.'s which define the secondary structure - conf longblob not null, # Confidence of secondary-structure annotation per position (0.0-1.0). - #Indices - INDEX(name(16)), - INDEX(chrom(8), bin), - INDEX(chrom(8), chromStart) -); - ); - - close $fh_out; - - Maasha::Common::run( "hgLoadBed", "-notItemRgb -sqlTable=$sql_file $options->{ 'database' } $options->{ 'table' } -tmpDir=$tmp_dir $file > /dev/null 2>&1" ); - - unlink $sql_file; - } - else - { - Maasha::Common::run( "hgLoadBed", "$args > /dev/null 2>&1" ); - } -} - - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PSL format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - -sub psl_get_entries -{ - # Martin A. Hansen, February 2008. - - # Reads PSL entries and returns a record. - - my ( $path, # full path to PSL file - ) = @_; - - # Returns hashref. - - my ( $fh, @lines, @fields, $i, %record, @records ); - - $fh = Maasha::Common::read_open( $path ); - - @lines = <$fh>; - - close $fh; - - chomp @lines; - - for ( $i = 5; $i < @lines; $i++ ) - { - @fields = split "\t", $lines[ $i ]; - - Maasha::Common::error( qq(Bad PSL format in file "$path") ) if not @fields == 21; - - undef %record; - - %record = ( - REC_TYPE => "PSL", - MATCHES => $fields[ 0 ], - MISMATCHES => $fields[ 1 ], - REPMATCHES => $fields[ 2 ], - NCOUNT => $fields[ 3 ], - QNUMINSERT => $fields[ 4 ], - QBASEINSERT => $fields[ 5 ], - SNUMINSERT => $fields[ 6 ], - SBASEINSERT => $fields[ 7 ], - STRAND => $fields[ 8 ], - Q_ID => $fields[ 9 ], - Q_LEN => $fields[ 10 ], - Q_BEG => $fields[ 11 ], - Q_END => $fields[ 12 ] - 1, - S_ID => $fields[ 13 ], - S_LEN => $fields[ 14 ], - S_BEG => $fields[ 15 ], - S_END => $fields[ 16 ] - 1, - BLOCKCOUNT => $fields[ 17 ], - BLOCKSIZES => $fields[ 18 ], - Q_BEGS => $fields[ 19 ], - S_BEGS => $fields[ 20 ], - ); - - $record{ "SCORE" } = $record{ "MATCHES" } + int( $record{ "REPMATCHES" } / 2 ) - $record{ "MISMATCHES" } - $record{ "QNUMINSERT" } - $record{ "SNUMINSERT" }; - - push @records, { %record }; - } - - return wantarray ? @records : \@records; -} - - -sub psl_put_header -{ - # Martin A. Hansen, September 2007. - - # Write a PSL header to file. - - my ( $fh, # file handle - OPTIONAL - ) = @_; - - # Returns nothing. - - $fh = \*STDOUT if not $fh; - - print $fh qq(psLayout version 3 -match mis- rep. N's Q gap Q gap T gap T gap strand Q Q Q Q T T T T block blockSizes qStart match match count bases count bases name size start end name size start end count ---------------------------------------------------------------------------------------------------------------------------------------------------------------- -); -} - - -sub psl_put_entry -{ - # Martin A. Hansen, September 2007. - - # Write a PSL entry to file. - - my ( $record, # hashref - $fh, # file handle - OPTIONAL - ) = @_; - - # Returns nothing. - - $fh = \*STDOUT if not $fh; - - my @output; - - push @output, $record->{ "MATCHES" }; - push @output, $record->{ "MISMATCHES" }; - push @output, $record->{ "REPMATCHES" }; - push @output, $record->{ "NCOUNT" }; - push @output, $record->{ "QNUMINSERT" }; - push @output, $record->{ "QBASEINSERT" }; - push @output, $record->{ "SNUMINSERT" }; - push @output, $record->{ "SBASEINSERT" }; - push @output, $record->{ "STRAND" }; - push @output, $record->{ "Q_ID" }; - push @output, $record->{ "Q_LEN" }; - push @output, $record->{ "Q_BEG" }; - push @output, $record->{ "Q_END" } + 1; - push @output, $record->{ "S_ID" }; - push @output, $record->{ "S_LEN" }; - push @output, $record->{ "S_BEG" }; - push @output, $record->{ "S_END" } + 1; - push @output, $record->{ "BLOCKCOUNT" }; - push @output, $record->{ "BLOCKSIZES" }; - push @output, $record->{ "Q_BEGS" }; - push @output, $record->{ "S_BEGS" }; - - print $fh join( "\t", @output ), "\n"; -} - - -sub psl_upload_to_ucsc -{ - # Martin A. Hansen, September 2007. - - # Upload a PSL file to the UCSC database. - - my ( $file, # file to upload, - $options, # argument hashref - $append, # flag indicating table should be appended - ) = @_; - - # Returns nothing. - - my ( $args ); - - if ( $append ) { - $args = join " ", $options->{ "database" }, "-table=$options->{ 'table' }", "-clientLoad", "-append", $file; - } else { - $args = join " ", $options->{ "database" }, "-table=$options->{ 'table' }", "-clientLoad", $file; - } - - Maasha::Common::run( "hgLoadPsl", "$args > /dev/null 2>&1" ); -} - - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> TRACK FILE <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - -sub update_my_tracks -{ - # Martin A. Hansen, September 2007. - - # Update the /home/user/ucsc/my_tracks.ra file and executes makeCustomTracks.pl - - my ( $options, # hashref - $type, # track type - ) = @_; - - # Returns nothing. - - my ( $file, $fh_in, $fh_out, $line, $time ); - - $file = $ENV{ "HOME" } . "/ucsc/my_tracks.ra"; - - # ---- create a backup ---- - - $fh_in = Maasha::Common::read_open( $file ); - $fh_out = Maasha::Common::write_open( "$file~" ); - - while ( $line = <$fh_in> ) { - print $fh_out $line; - } + ucsc_config_entry_put( \%new_entry, $fh_out ) if not $was_found; close $fh_in; close $fh_out; - - # ---- append track ---- - - $time = Maasha::Common::time_stamp(); - - $fh_out = Maasha::Common::append_open( $file ); - - if ( $type eq "sec_struct" ) - { - print $fh_out "\n\n# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"; - - print $fh_out "\n# Track added by 'upload_to_ucsc' $time\n\n"; - - print $fh_out "# Database $options->{ 'database' }\n\n"; - - print $fh_out "track $options->{ 'table' }\n"; - print $fh_out "shortLabel $options->{ 'short_label' }\n"; - print $fh_out "longLabel $options->{ 'long_label' }\n"; - print $fh_out "group $options->{ 'group' }\n"; - print $fh_out "priority $options->{ 'priority' }\n"; - print $fh_out "visibility $options->{ 'visibility' }\n"; - print $fh_out "color $options->{ 'color' }\n"; - print $fh_out "type bed 6 +\n"; - print $fh_out "mafTrack multiz17way\n"; - - print $fh_out "\n# //\n"; - } - else - { - print $fh_out "\n\n# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"; - - print $fh_out "\n# Track added by 'upload_to_ucsc' $time\n\n"; - - print $fh_out "# Database $options->{ 'database' }\n\n"; - - print $fh_out "track $options->{ 'table' }\n"; - print $fh_out "shortLabel $options->{ 'short_label' }\n"; - print $fh_out "longLabel $options->{ 'long_label' }\n"; - print $fh_out "group $options->{ 'group' }\n"; - print $fh_out "priority $options->{ 'priority' }\n"; - print $fh_out "useScore 1\n" if $options->{ 'use_score' }; - print $fh_out "visibility $options->{ 'visibility' }\n"; - print $fh_out "maxHeightPixels 50:50:11\n" if $type eq "wig 0"; - print $fh_out "color $options->{ 'color' }\n"; - print $fh_out "type $type\n"; - - print $fh_out "\n# //\n"; - } - - close $fh_out; Maasha::Common::run( "ucscMakeTracks.pl", "-b > /dev/null 2>&1" ); } -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> fixedStep format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PhastCons format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -sub fixedstep_get_entry +sub phastcons_index { - # Martin A. Hansen, December 2007. + # Martin A. Hansen, July 2008 - # Given a file handle to a PhastCons file get the - # next entry which is all the lines after a "fixedStep" - # line and until the next "fixedStep" line or EOF. + # Create a fixedStep index for PhastCons data. - my ( $fh, # filehandle + my ( $file, # file to index + $dir, # dir with file ) = @_; - # Returns a list of lines - - my ( $entry, @lines ); - - local $/ = "\nfixedStep "; - - $entry = <$fh>; + # Returns nothing. - chomp $entry; + my ( $index ); - @lines = split "\n", $entry; - - return if @lines == 0; + $index = fixedstep_index_create( "$dir/$file" ); - $lines[ 0 ] =~ s/fixedStep?\s*//; - - return wantarray ? @lines : \@lines; + fixedstep_index_store( "$dir/$file.index", $index ); } -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PhastCons format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - sub phastcons_parse_entry { # Martin A. Hansen, December 2007. @@ -878,6 +241,11 @@ sub phastcons_parse_entry # Given a PhastCons entry converts this to a # list of super blocks. +# $options->{ "min" } ||= 10; +# $options->{ "dist" } ||= 25; +# $options->{ "threshold" } ||= 0.8; +# $options->{ "gap" } ||= 5; + my ( $lines, # list of lines $args, # argument hash ) = @_; @@ -985,18 +353,18 @@ sub phastcons_parse_entry $lens[ -1 ]++; push @entries, { - CHR => $super_block->[ 0 ]->{ "CHR" }, - CHR_BEG => $super_block->[ 0 ]->{ "CHR_BEG" }, - CHR_END => $super_block->[ -1 ]->{ "CHR_END" }, - Q_ID => "Q_ID", - SCORE => 100, - STRAND => "+", - THICK_BEG => $super_block->[ 0 ]->{ "CHR_BEG" }, - THICK_END => $super_block->[ -1 ]->{ "CHR_END" } + 1, - ITEMRGB => "0,200,100", - BLOCKCOUNT => scalar @{ $super_block }, - BLOCKSIZES => join( ",", @lens ), - Q_BEGS => join( ",", @begs ), + CHR => $super_block->[ 0 ]->{ "CHR" }, + CHR_BEG => $super_block->[ 0 ]->{ "CHR_BEG" }, + CHR_END => $super_block->[ -1 ]->{ "CHR_END" }, + Q_ID => "Q_ID", + SCORE => 100, + STRAND => "+", + THICK_BEG => $super_block->[ 0 ]->{ "CHR_BEG" }, + THICK_END => $super_block->[ -1 ]->{ "CHR_END" } + 1, + COLOR => 0, + BLOCK_COUNT => scalar @{ $super_block }, + BLOCK_LENS => join( ",", @lens ), + Q_BEGS => join( ",", @begs ), }; undef @begs; @@ -1007,240 +375,13 @@ sub phastcons_parse_entry } -sub phastcons_index_create -{ - # Martin A. Hansen, January 2008. - - # Indexes a concatenated PhastCons file. - # The index consists of a hash with chromosomes as keys, - # and a list of [ chr_beg, next_chr_beg, chr_end, index_beg, index_len ] as values. - - my ( $path, # path to PhastCons file - ) = @_; - - # Returns a hashref - - my ( $fh, $pos, $index_beg, $index_len, $entry, $locator, $chr, $step, $beg, $end, $len, %index, $i ); - - $fh = Maasha::Common::read_open( $path ); - - $pos = 0; - - while ( $entry = Maasha::UCSC::fixedstep_get_entry( $fh ) ) - { - $locator = shift @{ $entry }; - - if ( $locator =~ /chrom=([^ ]+) start=(\d+) step=(\d+)/ ) - { - $chr = $1; - $beg = $2 - 1; # phastcons files are 1-based - $step = $3; - } - else - { - Maasha::Common::error( qq(Could not parse PhastCons locator: $locator) ); - } - - $pos += length( $locator ) + 11; - - $index_beg = $pos; - -# map { $pos += length( $_ ) + 1 } @{ $entry }; - - $pos += 6 * scalar @{ $entry }; - - $index_len = $pos - $index_beg; - - push @{ $index{ $chr } }, [ $beg, undef, $beg + scalar @{ $entry } - 1, $index_beg, $index_len ]; - } - - close $fh; - - foreach $chr ( keys %index ) - { - for ( $i = 0; $i < @{ $index{ $chr } } - 1; $i++ ) { - $index{ $chr }->[ $i ]->[ NEXT_CHR_BEG ] = $index{ $chr }->[ $i + 1 ]->[ 0 ]; - } - - $index{ $chr }->[ -1 ]->[ NEXT_CHR_BEG ] = $index{ $chr }->[ -1 ]->[ CHR_END ] + 1; - } - - return wantarray ? %index : \%index; -} - - -sub phastcons_index_store -{ - # Martin A. Hansen, January 2008. - - # Writes a PhastCons index to binary file. - - my ( $path, # full path to file - $index, # list with index - ) = @_; - - # returns nothing - - Maasha::Common::file_store( $path, $index ); -} - - -sub phastcons_index_retrieve -{ - # Martin A. Hansen, January 2008. - - # Retrieves a PhastCons index from binary file. - - my ( $path, # full path to file - ) = @_; - - # returns list - - my $index; - - $index = Maasha::Common::file_retrieve( $path ); - - return wantarray ? %{ $index } : $index; -} - - -sub phastcons_index_lookup -{ - # Martin A. Hansen, January 2008. - - # Retrieve PhastCons scores from a indexed - # Phastcons file given a chromosome and - # begin and end positions. - - my ( $index, # data structure - $fh, # filehandle to datafile - $chr, # chromosome - $chr_beg, # chromosome beg - $chr_end, # chromosome end - $flank, # include flanking region - OPTIONAL - ) = @_; - - # Returns a list - - my ( $index_beg, $index_end, $i, $c, $beg, $end, @vals, $scores ); - - $flank ||= 0; - - $chr_beg -= $flank; - $chr_end += $flank; - -# print "chr_beg->$chr_beg chr_end->$chr_end flank->$flank\n"; - - if ( exists $index->{ $chr } ) - { - $index_beg = Maasha::Matrix::interval_search( $index->{ $chr }, 0, 1, $chr_beg ); - - if ( $index_beg < 0 ) { - Maasha::Common::error( qq(Index search failed - begin index position doesn't exists: $chr_beg) ); - } - - if ( $chr_end < $index->{ $chr }->[ $index_beg ]->[ 1 ] ) - { - $index_end = $index_beg; - } - else - { - $index_end = Maasha::Matrix::interval_search( $index->{ $chr }, 0, 1, $chr_end ); - - if ( $index_end < 0 ) { - Maasha::Common::error( qq(Index search failed - end index position doesn't exists: $chr_end) ); - } - } - - map { $scores->[ $_ ] = 0 } 0 .. $chr_end - $chr_beg; - - if ( $index_beg == $index_end ) - { - $beg = Maasha::Calc::max( $chr_beg, $index->{ $chr }->[ $index_beg ]->[ CHR_BEG ] ); - $end = Maasha::Calc::min( $chr_end, $index->{ $chr }->[ $index_end ]->[ CHR_END ] ); - - if ( $beg <= $index->{ $chr }->[ $index_beg ]->[ CHR_END ] and $end >= $index->{ $chr }->[ $index_beg ]->[ CHR_BEG ] ) - { - @vals = split "\n", Maasha::Common::file_read( - $fh, - $index->{ $chr }->[ $index_beg ]->[ INDEX_BEG ] + 6 * ( $beg - $index->{ $chr }->[ $index_beg ]->[ CHR_BEG ] ), - 6 * ( $end - $beg + 1 ), - ); - } - - for ( $c = 0; $c < @vals; $c++ ) { - $scores->[ $c + $beg - $chr_beg ] = $vals[ $c ]; - } - } - else - { - $beg = Maasha::Calc::max( $chr_beg, $index->{ $chr }->[ $index_beg ]->[ CHR_BEG ] ); - -# print Dumper( $beg, $index->{ $chr }->[ $index_beg ] ); -# print Dumper( "next", $index->{ $chr }->[ $index_beg ]->[ NEXT_CHR_BEG ] ); - - # beg next - # v v - # |||||||||....... - - if ( $beg <= $index->{ $chr }->[ $index_beg ]->[ CHR_END ] ) - { - @vals = split "\n", Maasha::Common::file_read( - $fh, - $index->{ $chr }->[ $index_beg ]->[ INDEX_BEG ] + 6 * ( $beg - $index->{ $chr }->[ $index_beg ]->[ CHR_BEG ] ), - 6 * ( $index->{ $chr }->[ $index_beg ]->[ CHR_END ] - $beg + 1 ), - ); - - for ( $c = 0; $c < @vals; $c++ ) { - $scores->[ $c + $beg - $chr_beg ] = $vals[ $c ]; - } - } - - $end = Maasha::Calc::min( $chr_end, $index->{ $chr }->[ $index_end ]->[ CHR_END ] ); - - if ( $end <= $index->{ $chr }->[ $index_end ]->[ CHR_END ] ) - { - @vals = split "\n", Maasha::Common::file_read( - $fh, - $index->{ $chr }->[ $index_end ]->[ INDEX_BEG ], - 6 * ( $end - $index->{ $chr }->[ $index_end ]->[ CHR_BEG ] + 1 ), - ); - - for ( $c = 0; $c < @vals; $c++ ) { - $scores->[ $c + $index->{ $chr }->[ $index_end ]->[ CHR_BEG ] - $chr_beg ] = $vals[ $c ]; - } - } - - for ( $i = $index_beg + 1; $i <= $index_end - 1; $i++ ) - { - @vals = split "\n", Maasha::Common::file_read( - $fh, - $index->{ $chr }->[ $i ]->[ INDEX_BEG ], - 6 * ( $index->{ $chr }->[ $i ]->[ CHR_END ] - $index->{ $chr }->[ $i ]->[ CHR_BEG ] + 1 ), - ); - - for ( $c = 0; $c < @vals; $c++ ) { - $scores->[ $c + $index->{ $chr }->[ $i ]->[ CHR_BEG ] - $chr_beg ] = $vals[ $c ]; - } - } - } - } - else - { - Maasha::Common::error( qq(Chromosome "$chr" was not found in index) ); - } - - return wantarray ? @{ $scores } : $scores; -} - - sub phastcons_normalize { # Martin A. Hansen, January 2008. # Normalizes a list of lists with PhastCons scores, # in such a way that each list contains the same number - # or PhastCons scores. + # of PhastCons scores. my ( $AoA, # AoA with PhastCons scores ) = @_; @@ -1304,7 +445,7 @@ sub phastcons_list_inflate # splice @{ $list }, $pos, 0, "X"; } - die qq(ERROR: bad inflate\n) if scalar @{ $list } != $len + $diff; + die qq(ERROR: Bad inflate\n) if scalar @{ $list } != $len + $diff; } @@ -1334,7 +475,7 @@ sub phastcons_list_deflate splice @{ $list }, $pos, 1; } - die qq(ERROR: bad deflate\n) if scalar @{ $list } != $len - $diff; + die qq(ERROR: Dad deflate\n) if scalar @{ $list } != $len - $diff; } @@ -1429,7 +570,7 @@ sub maf_parse my ( $fh, $line, @fields, @align ); - $fh = Maasha::Common::read_open( $path ); + $fh = Maasha::Filesys::file_read_open( $path ); while ( $line = <$fh> ) { @@ -1454,65 +595,6 @@ sub maf_parse } -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> WIGGLE FORMAT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - -sub fixedstep_put_entry -{ - # Martin A. Hansen, April 2008. - - # Outputs a block of fixedStep values. - # Used for outputting wiggle data. - - my ( $chr, # chromosome - $beg, # start position - $block, # list of scores - $fh, # filehandle - OPTIONAL - ) = @_; - - # Returns nothing. - - $beg += 1; # fixedStep format is 1 based. - - if ( $fh ) - { - print $fh "fixedStep chrom=$chr start=$beg step=1\n"; - - map { printf( $fh "%d\n", ( $_ + 1 ) ) } @{ $block }; - } - else - { - print "fixedStep chrom=$chr start=$beg step=1\n"; - - map { printf( "%d\n", ( $_ + 1 ) ) } @{ $block }; - } -} - - -sub wiggle_upload_to_ucsc -{ - # Martin A. Hansen, May 2008. - - # Upload a wiggle file to the UCSC database. - - my ( $tmp_dir, # temporary directory - $wib_dir, # wib directory - $wig_file, # file to upload, - $options, # argument hashref - ) = @_; - - # Returns nothing. - - my ( $args ); - -# $args = join " ", "-tmpDir=$tmp_dir", "-pathPrefix=$wib_dir", $options->{ "database" }, $options->{ 'table' }, $wig_file; - -# Maasha::Common::run( "hgLoadWiggle", "$args > /dev/null 2>&1" ); - - `cd $tmp_dir && hgLoadWiggle -tmpDir=$tmp_dir -pathPrefix=$wib_dir $options->{ 'database' } $options->{ 'table' } $wig_file > /dev/null 2>&1`; -} - - # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> MySQL CONF <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< @@ -1525,9 +607,13 @@ sub ucsc_get_user # Returns a string. - my ( $fh, $line, $user ); + my ( $file, $fh, $line, $user ); + + $file = "$ENV{ 'HOME' }/.hg.conf"; - $fh = Maasha::Common::read_open( "$ENV{ 'HOME' }/.hg.conf" ); + return if not -f $file; + + $fh = Maasha::Filesys::file_read_open( $file ); while ( $line = <$fh> ) { @@ -1556,9 +642,13 @@ sub ucsc_get_password # Returns a string. - my ( $fh, $line, $password ); + my ( $file, $fh, $line, $password ); + + $file = "$ENV{ 'HOME' }/.hg.conf"; + + return if not -f $file; - $fh = Maasha::Common::read_open( "$ENV{ 'HOME' }/.hg.conf" ); + $fh = Maasha::Filesys::file_read_open( $file ); while ( $line = <$fh> ) {