code_perl/Maasha/UCSC.pm

   1 package Maasha::UCSC;
   2
   3 # Copyright (C) 2007 Martin A. Hansen.
   4
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; either version 2
   8 # of the License, or (at your option) any later version.
   9
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18
  19 # http://www.gnu.org/copyleft/gpl.html
  20
  21
  22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  23
  24
  25 # Stuff for interacting with UCSC genome browser
  26
  27
  28 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  29
  30
  31 use strict;
  32 use vars qw ( @ISA @EXPORT );
  33
  34 use Data::Dumper;
  35 use Time::HiRes qw( gettimeofday );
  36
  37 use Maasha::Common;
  38 use Maasha::Calc;
  39 use Maasha::Matrix;
  40
  41 use constant {
  42     CHR_BEG      => 0,
  43     NEXT_CHR_BEG => 1,
  44     CHR_END      => 2,
  45     INDEX_BEG    => 3,
  46     INDEX_LEN    => 4,
  47 };
  48
  49 @ISA = qw( Exporter );
  50
  51 my $TIME = gettimeofday();
  52
  53
  54 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> BED format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  55
  56
  57 # http://genome.ucsc.edu/goldenPath/help/hgTracksHelp.html#BED
  58
  59
  60 sub bed_get_entry
  61 {
  62     # Martin A. Hansen, December 2007.
  63
  64     # Reads a bed entry given a filehandle.
  65
  66     my ( $fh,        # file handle
  67          $columns,   # number of BED columns to read  -  OPTIONAL
  68        ) = @_;
  69
  70     # Returns hashref.
  71
  72     my ( $line, @fields, %entry );
  73
  74     $line = <$fh>;
  75
  76     $line =~ tr/\n\r//d;    # some people have carriage returns in their BED files -> Grrrr
  77
  78     return if not defined $line;
  79
  80     @fields = split "\t", $line;
  81
  82     $columns ||= scalar @fields;
  83
  84     if ( $columns == 3 )
  85     {
  86         %entry = (
  87             "CHR"      => $fields[ 0 ],
  88             "CHR_BEG"  => $fields[ 1 ],
  89             "CHR_END"  => $fields[ 2 ] - 1,
  90         );
  91     }
  92     elsif ( $columns == 4 )
  93     {
  94         %entry = (
  95             "CHR"      => $fields[ 0 ],
  96             "CHR_BEG"  => $fields[ 1 ],
  97             "CHR_END"  => $fields[ 2 ] - 1,
  98             "Q_ID"     => $fields[ 3 ],
  99         );
 100     }
 101     elsif ( $columns == 5 )
 102     {
 103         %entry = (
 104             "CHR"      => $fields[ 0 ],
 105             "CHR_BEG"  => $fields[ 1 ],
 106             "CHR_END"  => $fields[ 2 ] - 1,
 107             "Q_ID"     => $fields[ 3 ],
 108             "SCORE"    => $fields[ 4 ],
 109         );
 110     }
 111     elsif ( $columns == 6 )
 112     {
 113         %entry = (
 114             "CHR"      => $fields[ 0 ],
 115             "CHR_BEG"  => $fields[ 1 ],
 116             "CHR_END"  => $fields[ 2 ] - 1,
 117             "Q_ID"     => $fields[ 3 ],
 118             "SCORE"    => $fields[ 4 ],
 119             "STRAND"   => $fields[ 5 ],
 120         );
 121     }
 122     elsif ( $columns == 12 )
 123     {
 124         %entry = (
 125             "CHR"        => $fields[ 0 ],
 126             "CHR_BEG"    => $fields[ 1 ],
 127             "CHR_END"    => $fields[ 2 ] - 1,
 128             "Q_ID"       => $fields[ 3 ],
 129             "SCORE"      => $fields[ 4 ],
 130             "STRAND"     => $fields[ 5 ],
 131             "THICK_BEG"  => $fields[ 6 ],
 132             "THICK_END"  => $fields[ 7 ] - 1,
 133             "ITEMRGB"    => $fields[ 8 ],
 134             "BLOCKCOUNT" => $fields[ 9 ],
 135             "BLOCKSIZES" => $fields[ 10 ],
 136             "Q_BEGS"     => $fields[ 11 ],
 137         );
 138     }
 139     else
 140     {
 141         Maasha::Common::error( qq(Bad BED format in line->$line<-) );
 142     }
 143
 144     $entry{ "REC_TYPE" } = "BED";
 145     $entry{ "BED_LEN" }  = $entry{ "CHR_END" } - $entry{ "CHR_BEG" } + 1;
 146     $entry{ "BED_COLS" } = $columns;
 147
 148     return wantarray ? %entry : \%entry;
 149 }
 150
 151
 152 sub bed_get_entries
 153 {
 154     # Martin A. Hansen, January 2008.
 155
 156     # Given a path to a BED file, read in all entries
 157     # and return.
 158
 159     my ( $path,     # full path to BED file
 160          $columns,  # number of columns in BED file - OPTIONAL (but is faster)
 161        ) = @_;
 162
 163     # Returns a list.
 164
 165     my ( $fh, $entry, @list );
 166
 167     $fh = Maasha::Common::read_open( $path );
 168
 169     while ( $entry = bed_get_entry( $fh ) ) {
 170         push @list, $entry;
 171     }
 172
 173     close $fh;
 174
 175     return wantarray ? @list : \@list;
 176 }
 177
 178
 179 sub bed_put_entry
 180 {
 181     # Martin A. Hansen, Septermber 2007.
 182
 183     # Writes a BED entry to file.
 184
 185     # NB, this could really be more robust!?
 186
 187     my ( $record,       # hashref
 188          $fh,           # file handle                   - OPTIONAL
 189          $columns,      # number of columns in BED file - OPTIONAL (but is faster)
 190        ) = @_;
 191
 192     # Returns nothing.
 193
 194     my ( @fields );
 195
 196     $columns ||= 12;   # max number of columns possible
 197
 198     if ( $columns == 3 )
 199     {
 200         push @fields, $record->{ "CHR" };
 201         push @fields, $record->{ "CHR_BEG" };
 202         push @fields, $record->{ "CHR_END" } + 1;
 203     }
 204     elsif ( $columns == 4 )
 205     {
 206         $record->{ "Q_ID" }  =~ s/\s+/_/g;
 207
 208         push @fields, $record->{ "CHR" };
 209         push @fields, $record->{ "CHR_BEG" };
 210         push @fields, $record->{ "CHR_END" } + 1;
 211         push @fields, $record->{ "Q_ID" };
 212     }
 213     elsif ( $columns == 5 )
 214     {
 215         $record->{ "Q_ID" }  =~ s/\s+/_/g;
 216         $record->{ "SCORE" } =~ s/\.\d*//;
 217
 218         push @fields, $record->{ "CHR" };
 219         push @fields, $record->{ "CHR_BEG" };
 220         push @fields, $record->{ "CHR_END" } + 1;
 221         push @fields, $record->{ "Q_ID" };
 222         push @fields, $record->{ "SCORE" };
 223     }
 224     elsif ( $columns == 6 )
 225     {
 226         $record->{ "Q_ID" }  =~ s/\s+/_/g;
 227         $record->{ "SCORE" } =~ s/\.\d*//;
 228
 229         push @fields, $record->{ "CHR" };
 230         push @fields, $record->{ "CHR_BEG" };
 231         push @fields, $record->{ "CHR_END" } + 1;
 232         push @fields, $record->{ "Q_ID" };
 233         push @fields, $record->{ "SCORE" };
 234         push @fields, $record->{ "STRAND" };
 235     }
 236     else
 237     {
 238         $record->{ "Q_ID" }  =~ s/\s+/_/g;
 239         $record->{ "SCORE" } =~ s/\.\d*//;
 240
 241         push @fields, $record->{ "CHR" };
 242         push @fields, $record->{ "CHR_BEG" };
 243         push @fields, $record->{ "CHR_END" } + 1;
 244         push @fields, $record->{ "Q_ID" };
 245         push @fields, $record->{ "SCORE" };
 246         push @fields, $record->{ "STRAND" };
 247         push @fields, $record->{ "THICK_BEG" }     if defined $record->{ "THICK_BEG" };
 248         push @fields, $record->{ "THICK_END" } + 1 if defined $record->{ "THICK_END" };
 249         push @fields, $record->{ "ITEMRGB" }       if defined $record->{ "ITEMRGB" };
 250         push @fields, $record->{ "BLOCKCOUNT" }    if defined $record->{ "BLOCKCOUNT" };
 251         push @fields, $record->{ "BLOCKSIZES" }    if defined $record->{ "BLOCKSIZES" };
 252         push @fields, $record->{ "Q_BEGS" }        if defined $record->{ "Q_BEGS" };
 253     }
 254
 255     if ( $fh ) {
 256         print $fh join( "\t", @fields ), "\n";
 257     } else {
 258         print join( "\t", @fields ), "\n";
 259     }
 260 }
 261
 262
 263 sub bed_put_entries
 264 {
 265     # Martin A. Hansen, January 2008.
 266
 267     # Write a list of BED entries.
 268
 269     my ( $entries,   # list of entries,
 270          $fh,        # file handle - OPTIONAL
 271        ) = @_;
 272
 273     # Returns nothing.
 274
 275     map { bed_put_entry( $_, $fh ) } @{ $entries };
 276 }
 277
 278
 279 sub bed_analyze
 280 {
 281     # Martin A. Hansen, March 2008.
 282
 283     # Given a bed record, analysis this to give information
 284     # about intron/exon sizes.
 285
 286     my ( $entry,   # BED entry
 287        ) = @_;
 288
 289     # Returns hashref.
 290
 291     my ( $i, @begs, @lens, $exon_max, $exon_min, $exon_len, $exon_tot, $intron_max, $intron_min, $intron_len, $intron_tot );
 292
 293     $exon_max   = 0;
 294     $exon_min   = 9999999999;
 295     $intron_max = 0;
 296     $intron_min = 9999999999;
 297
 298     $entry->{ "EXONS" }   = $entry->{ "BLOCKCOUNT" };
 299
 300     @begs = split /,/, $entry->{ "Q_BEGS" };
 301     @lens = split /,/, $entry->{ "BLOCKSIZES" };
 302
 303     for ( $i = 0; $i < $entry->{ "BLOCKCOUNT" }; $i++ )
 304     {
 305         $exon_len = @lens[ $i ];
 306
 307         $entry->{ "EXON_LEN_$i" } = $exon_len;
 308
 309         $exon_max = $exon_len if $exon_len > $exon_max;
 310         $exon_min = $exon_len if $exon_len < $exon_min;
 311
 312         $exon_tot += $exon_len;
 313     }
 314
 315     $entry->{ "EXON_LEN_-1" }   = $exon_len;
 316     $entry->{ "EXON_MAX_LEN" }  = $exon_max;
 317     $entry->{ "EXON_MIN_LEN" }  = $exon_min;
 318     $entry->{ "EXON_MEAN_LEN" } = int( $exon_tot / $entry->{ "EXONS" } );
 319
 320     $entry->{ "INTRONS" } = $entry->{ "BLOCKCOUNT" } - 1;
 321     $entry->{ "INTRONS" } = 0 if $entry->{ "INTRONS" } < 0;
 322
 323     if ( $entry->{ "INTRONS" } )
 324     {
 325         for ( $i = 1; $i < $entry->{ "BLOCKCOUNT" }; $i++ )
 326         {
 327             $intron_len = @begs[ $i ] - ( @begs[ $i - 1 ] + @lens[ $i - 1 ] );
 328
 329             $entry->{ "INTRON_LEN_" . ( $i - 1 ) } = $intron_len;
 330
 331             $intron_max = $intron_len if $intron_len > $intron_max;
 332             $intron_min = $intron_len if $intron_len < $intron_min;
 333
 334             $intron_tot += $intron_len;
 335         }
 336
 337         $entry->{ "INTRON_LEN_-1" }   = $intron_len;
 338         $entry->{ "INTRON_MAX_LEN" }  = $intron_max;
 339         $entry->{ "INTRON_MIN_LEN" }  = $intron_min;
 340         $entry->{ "INTRON_MEAN_LEN" } = int( $intron_tot / $entry->{ "INTRONS" } );
 341     }
 342
 343     return wantarray ? %{ $entry } : $entry;
 344 }
 345
 346
 347 sub bed_sort
 348 {
 349     # Martin A. Hansen, March 2008.
 350
 351     # Sort a potential huge BED file according to
 352     # CHR, CHR_BEG and optionally STRAND.
 353
 354     my ( $tmp_dir,   # temporary directory used for sorting
 355          $file,      # BED file to sort
 356          $strand,    # flag to sort on strand - OPTIONAL
 357        ) = @_;
 358
 359     # Returns nothing.
 360
 361     my ( $fh_in, $key, $fh_out, %fh_hash, $part_file, $entry, $entries );
 362
 363     $fh_in = Maasha::Common::read_open( $file );
 364
 365     while ( $entry = bed_get_entry( $fh_in ) )
 366     {
 367         if ( $strand ) {
 368             $key = join "_", $entry->{ "CHR" }, $entry->{ "STRAND" };
 369         } else {
 370             $key = $entry->{ "CHR" };
 371         }
 372
 373         $fh_hash{ $key } = Maasha::Common::write_open( "$tmp_dir/$key.sort" ) if not exists $fh_hash{ $key };
 374
 375         bed_put_entry( $entry, $fh_hash{ $key } );
 376     }
 377
 378     close $fh_in;
 379
 380     map { close $_ } keys %fh_hash;
 381
 382     $fh_out = Maasha::Common::write_open( "$tmp_dir/temp.sort" );
 383
 384     foreach $part_file ( sort keys %fh_hash )
 385     {
 386         $entries = bed_get_entries( "$tmp_dir/$part_file.sort" );
 387
 388         @{ $entries } = sort { $a->{ "CHR_BEG" } <=> $b->{ "CHR_BEG" } } @{ $entries };
 389
 390         map { bed_put_entry( $_, $fh_out ) } @{ $entries };
 391
 392         unlink "$tmp_dir/$part_file.sort";
 393     }
 394
 395     close $fh_out;
 396
 397     rename "$tmp_dir/temp.sort", $file;
 398 }
 399
 400
 401 sub bed_merge_entries
 402 {
 403     # Martin A. Hansen, February 2008.
 404
 405     # Merge a list of given BED entries in one big entry.
 406
 407     my ( $entries,     # list of BED entries to be merged
 408        ) = @_;
 409
 410     # Returns hash.
 411
 412     my ( $i, @q_ids, @q_begs, @blocksizes, @new_q_begs, @new_blocksizes, %new_entry );
 413
 414     @{ $entries } = sort { $a->{ "CHR_BEG" } <=> $b->{ "CHR_BEG" } } @{ $entries };
 415
 416     for ( $i = 0; $i < @{ $entries }; $i++ )
 417     {
 418         Maasha::Common::error( qq(Attempted merge of BED entries from different chromosomes) ) if $entries->[ 0 ]->{ "CHR" }    ne $entries->[ $i ]->{ "CHR" };
 419         Maasha::Common::error( qq(Attempted merge of BED entries from different strands) )     if $entries->[ 0 ]->{ "STRAND" } ne $entries->[ $i ]->{ "STRAND" };
 420
 421         push @q_ids, $entries->[ $i ]->{ "Q_ID" } || sprintf( "ID%06d", $i );
 422
 423         if ( exists $entries->[ $i ]->{ "Q_BEGS" } )
 424         {
 425             @q_begs     = split ",", $entries->[ $i ]->{ "Q_BEGS" };
 426             @blocksizes = split ",", $entries->[ $i ]->{ "BLOCKSIZES" };
 427         }
 428         else
 429         {
 430             @q_begs     = 0;
 431             @blocksizes = $entries->[ $i ]->{ "CHR_END" } - $entries->[ $i ]->{ "CHR_BEG" } + 1;
 432         }
 433
 434         map { $_ += $entries->[ $i ]->{ "CHR_BEG" } } @q_begs;
 435
 436         push @new_q_begs, @q_begs;
 437         push @new_blocksizes, @blocksizes;
 438     }
 439
 440     map { $_ -= $entries->[ 0 ]->{ "CHR_BEG" } } @new_q_begs;
 441
 442     %new_entry = (
 443         CHR         => $entries->[ 0 ]->{ "CHR" },
 444         CHR_BEG     => $entries->[ 0 ]->{ "CHR_BEG" },
 445         CHR_END     => $entries->[ -1 ]->{ "CHR_END" },
 446         REC_TYPE    => "BED",
 447         BED_LEN     => $entries->[ -1 ]->{ "CHR_END" } - $entries->[ 0 ]->{ "CHR_BEG" } + 1,
 448         BED_COLS    => 12,
 449         Q_ID        => join( ":", @q_ids ),
 450         SCORE       => 999,
 451         STRAND      => $entries->[ 0 ]->{ "STRAND" }     || "+",
 452         THICK_BEG   => $entries->[ 0 ]->{ "THICK_BEG" }  || $entries->[ 0 ]->{ "CHR_BEG" },
 453         THICK_END   => $entries->[ -1 ]->{ "THICK_END" } || $entries->[ -1 ]->{ "CHR_END" },
 454         ITEMRGB     => "0,0,0",
 455         BLOCKCOUNT  => scalar @new_q_begs,
 456         BLOCKSIZES  => join( ",", @new_blocksizes ),
 457         Q_BEGS      => join( ",", @new_q_begs ),
 458     );
 459
 460     return wantarray ? %new_entry : \%new_entry;
 461 }
 462
 463
 464 sub bed_split_entry
 465 {
 466     # Martin A. Hansen, February 2008.
 467
 468     # Splits a given BED entry into a list of blocks,
 469     # which are returned. A list of 6 column BED entry is returned.
 470
 471     my ( $entry,    # BED entry hashref
 472        ) = @_;
 473
 474     # Returns a list.
 475
 476     my ( @q_begs, @blocksizes, $block, @blocks, $i );
 477
 478     if ( exists $entry->{ "BLOCKCOUNT" } )
 479     {
 480         @q_begs     = split ",", $entry->{ "Q_BEGS" };
 481         @blocksizes = split ",", $entry->{ "BLOCKSIZES" };
 482
 483         for ( $i = 0; $i < @q_begs; $i++ )
 484         {
 485             undef $block;
 486
 487             $block->{ "CHR" }      = $entry->{ "CHR" };
 488             $block->{ "CHR_BEG" }  = $entry->{ "CHR_BEG" } + $q_begs[ $i ];
 489             $block->{ "CHR_END" }  = $entry->{ "CHR_BEG" } + $q_begs[ $i ] + $blocksizes[ $i ] - 1;
 490             $block->{ "Q_ID" }     = $entry->{ "Q_ID" } . sprintf( "_%03d", $i );
 491             $block->{ "SCORE" }    = $entry->{ "SCORE" };
 492             $block->{ "STRAND" }   = $entry->{ "STRAND" };
 493             $block->{ "BED_LEN" }  = $block->{ "CHR_END" } - $block->{ "CHR_BEG" } + 1,
 494             $block->{ "BED_COLS" } = 6;
 495             $block->{ "REC_TYPE" } = "BED";
 496
 497             push @blocks, $block;
 498         }
 499     }
 500     else
 501     {
 502         @blocks = @{ $entry };
 503     }
 504
 505     return wantarray ? @blocks : \@blocks;
 506 }
 507
 508
 509
 510 sub bed_overlap
 511 {
 512     # Martin A. Hansen, February 2008.
 513
 514     # Checks if two BED entries overlap and
 515     # return 1 if so - else 0;
 516
 517     my ( $entry1,      # hashref
 518          $entry2,      # hashref
 519          $no_strand,   # don't check strand flag - OPTIONAL
 520        ) = @_;
 521
 522     # Return bolean.
 523
 524     return 0 if $entry1->{ "CHR" }    ne $entry2->{ "CHR" };
 525     return 0 if $entry1->{ "STRAND" } ne $entry2->{ "STRAND" };
 526
 527     if ( $entry1->{ "CHR_END" } < $entry2->{ "CHR_BEG" } or $entry1->{ "CHR_BEG" } > $entry2->{ "CHR_END" } ) {
 528         return 0;
 529     } else {
 530         return 1;
 531     }
 532 }
 533
 534
 535 sub bed_upload_to_ucsc
 536 {
 537     # Martin A. Hansen, September 2007.
 538
 539     # Upload a BED file to the UCSC database.
 540
 541     my ( $tmp_dir,   # temporary directory
 542          $file,      # file to upload,
 543          $options,   # argument hashref
 544          $append,    # flag indicating table should be appended
 545        ) = @_;
 546
 547     # Returns nothing.
 548
 549     my ( $args, $table, $sql_file, $fh_out, $fh_in );
 550
 551     if ( $append ) {
 552         $args = join " ", $options->{ "database" }, $options->{ "table" }, "-tmpDir=$tmp_dir", "-oldTable", $file;
 553     } else {
 554         $args = join " ", $options->{ "database" }, $options->{ "table" }, "-tmpDir=$tmp_dir", $file;
 555     }
 556
 557     if ( $options->{ "sec_struct" } )
 558     {
 559         $table = $options->{ "table" };
 560
 561         Maasha::Common::error( "Attempt to load secondary structure track without 'rnaSecStr' in table name" ) if not $table =~ /rnaSecStr/;
 562
 563         $sql_file = "$tmp_dir/upload_RNA_SS.sql";
 564
 565         $fh_out   = Maasha::Common::write_open( $sql_file );
 566
 567         print $fh_out qq(
 568 CREATE TABLE $table (
 569     bin smallint not null,              # Bin number for browser speedup
 570     chrom varchar(255) not null,        # Chromosome or FPC contig
 571     chromStart int unsigned not null,   # Start position in chromosome
 572     chromEnd int unsigned not null,     # End position in chromosome
 573     name varchar(255) not null,         # Name of item
 574     score int unsigned not null,        # Score from 0-1000
 575     strand char(1) not null,            # + or -
 576     size int unsigned not null,         # Size of element.
 577     secStr longblob not null,           # Parentheses and '.'s which define the secondary structure
 578     conf longblob not null,             # Confidence of secondary-structure annotation per position (0.0-1.0).
 579     #Indices
 580     INDEX(name(16)),
 581     INDEX(chrom(8), bin),
 582     INDEX(chrom(8), chromStart)
 583 );
 584         );
 585
 586         close $fh_out;
 587
 588         Maasha::Common::run( "hgLoadBed", "-notItemRgb -sqlTable=$sql_file $options->{ 'database' } $options->{ 'table' } -tmpDir=$tmp_dir $file > /dev/null 2>&1" );
 589
 590         unlink $sql_file;
 591     }
 592     else
 593     {
 594         Maasha::Common::run( "hgLoadBed", "$args > /dev/null 2>&1" );
 595     }
 596 }
 597
 598
 599 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PSL format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 600
 601
 602 sub psl_get_entry
 603 {
 604     # Martin A. Hansen, August 2008.
 605
 606     # Reads PSL next entry from a PSL file and returns a record.
 607
 608     my ( $fh,   # file handle of PSL filefull path to PSL file
 609        ) = @_;
 610
 611     # Returns hashref.
 612
 613     my ( $line, @fields, %record );
 614
 615     while ( $line = <$fh> )
 616     {
 617         chomp $line;
 618
 619         @fields = split "\t", $line;
 620
 621         if ( scalar @fields == 21 )
 622         {
 623             %record = (
 624                 REC_TYPE    => "PSL",
 625                 MATCHES     => $fields[ 0 ],
 626                 MISMATCHES  => $fields[ 1 ],
 627                 REPMATCHES  => $fields[ 2 ],
 628                 NCOUNT      => $fields[ 3 ],
 629                 QNUMINSERT  => $fields[ 4 ],
 630                 QBASEINSERT => $fields[ 5 ],
 631                 SNUMINSERT  => $fields[ 6 ],
 632                 SBASEINSERT => $fields[ 7 ],
 633                 STRAND      => $fields[ 8 ],
 634                 Q_ID        => $fields[ 9 ],
 635                 Q_LEN       => $fields[ 10 ],
 636                 Q_BEG       => $fields[ 11 ],
 637                 Q_END       => $fields[ 12 ] - 1,
 638                 S_ID        => $fields[ 13 ],
 639                 S_LEN       => $fields[ 14 ],
 640                 S_BEG       => $fields[ 15 ],
 641                 S_END       => $fields[ 16 ] - 1,
 642                 BLOCKCOUNT  => $fields[ 17 ],
 643                 BLOCKSIZES  => $fields[ 18 ],
 644                 Q_BEGS      => $fields[ 19 ],
 645                 S_BEGS      => $fields[ 20 ],
 646             );
 647
 648             $record{ "SCORE" } = $record{ "MATCHES" } + int( $record{ "REPMATCHES" } / 2 ) - $record{ "MISMATCHES" } - $record{ "QNUMINSERT" } - $record{ "SNUMINSERT" };
 649
 650             return wantarray ? %record : \%record;
 651         }
 652     }
 653
 654     return undef;
 655 }
 656
 657
 658 sub psl_get_entries
 659 {
 660     # Martin A. Hansen, February 2008.
 661
 662     # Reads PSL entries and returns a list of records.
 663
 664     my ( $path,   # full path to PSL file
 665        ) = @_;
 666
 667     # Returns hashref.
 668
 669     my ( $fh, @lines, @fields, $i, %record, @records );
 670
 671     $fh = Maasha::Common::read_open( $path );
 672
 673     @lines = <$fh>;
 674
 675     close $fh;
 676
 677     chomp @lines;
 678
 679     for ( $i = 5; $i < @lines; $i++ )
 680     {
 681         @fields = split "\t", $lines[ $i ];
 682
 683         Maasha::Common::error( qq(Bad PSL format in file "$path") ) if not @fields == 21;
 684
 685         undef %record;
 686
 687         %record = (
 688             REC_TYPE    => "PSL",
 689             MATCHES     => $fields[ 0 ],
 690             MISMATCHES  => $fields[ 1 ],
 691             REPMATCHES  => $fields[ 2 ],
 692             NCOUNT      => $fields[ 3 ],
 693             QNUMINSERT  => $fields[ 4 ],
 694             QBASEINSERT => $fields[ 5 ],
 695             SNUMINSERT  => $fields[ 6 ],
 696             SBASEINSERT => $fields[ 7 ],
 697             STRAND      => $fields[ 8 ],
 698             Q_ID        => $fields[ 9 ],
 699             Q_LEN       => $fields[ 10 ],
 700             Q_BEG       => $fields[ 11 ],
 701             Q_END       => $fields[ 12 ] - 1,
 702             S_ID        => $fields[ 13 ],
 703             S_LEN       => $fields[ 14 ],
 704             S_BEG       => $fields[ 15 ],
 705             S_END       => $fields[ 16 ] - 1,
 706             BLOCKCOUNT  => $fields[ 17 ],
 707             BLOCKSIZES  => $fields[ 18 ],
 708             Q_BEGS      => $fields[ 19 ],
 709             S_BEGS      => $fields[ 20 ],
 710         );
 711
 712         $record{ "SCORE" } = $record{ "MATCHES" } + int( $record{ "REPMATCHES" } / 2 ) - $record{ "MISMATCHES" } - $record{ "QNUMINSERT" } - $record{ "SNUMINSERT" };
 713
 714         push @records, { %record };
 715     }
 716
 717     return wantarray ? @records : \@records;
 718 }
 719
 720
 721 sub psl_put_header
 722 {
 723     # Martin A. Hansen, September 2007.
 724
 725     # Write a PSL header to file.
 726
 727     my ( $fh,  # file handle  - OPTIONAL
 728        ) = @_;
 729
 730     # Returns nothing.
 731
 732     $fh = \*STDOUT if not $fh;
 733
 734     print $fh qq(psLayout version 3
 735 match   mis-    rep.    N's     Q gap   Q gap   T gap   T gap   strand  Q               Q       Q       Q       T               T       T       T       block   blockSizes      qStart        match   match           count   bases   count   bases           name            size    start   end     name            size    start   end     count
 736 ---------------------------------------------------------------------------------------------------------------------------------------------------------------
 737 );
 738 }
 739
 740
 741 sub psl_put_entry
 742 {
 743     # Martin A. Hansen, September 2007.
 744
 745     # Write a PSL entry to file.
 746
 747     my ( $record,       # hashref
 748          $fh,           # file handle  -  OPTIONAL
 749        ) = @_;
 750
 751     # Returns nothing.
 752
 753     $fh = \*STDOUT if not $fh;
 754
 755     my @output;
 756
 757     push @output, $record->{ "MATCHES" };
 758     push @output, $record->{ "MISMATCHES" };
 759     push @output, $record->{ "REPMATCHES" };
 760     push @output, $record->{ "NCOUNT" };
 761     push @output, $record->{ "QNUMINSERT" };
 762     push @output, $record->{ "QBASEINSERT" };
 763     push @output, $record->{ "SNUMINSERT" };
 764     push @output, $record->{ "SBASEINSERT" };
 765     push @output, $record->{ "STRAND" };
 766     push @output, $record->{ "Q_ID" };
 767     push @output, $record->{ "Q_LEN" };
 768     push @output, $record->{ "Q_BEG" };
 769     push @output, $record->{ "Q_END" } + 1;
 770     push @output, $record->{ "S_ID" };
 771     push @output, $record->{ "S_LEN" };
 772     push @output, $record->{ "S_BEG" };
 773     push @output, $record->{ "S_END" } + 1;
 774     push @output, $record->{ "BLOCKCOUNT" };
 775     push @output, $record->{ "BLOCKSIZES" };
 776     push @output, $record->{ "Q_BEGS" };
 777     push @output, $record->{ "S_BEGS" };
 778
 779     print $fh join( "\t", @output ), "\n";
 780 }
 781
 782
 783 sub psl_upload_to_ucsc
 784 {
 785     # Martin A. Hansen, September 2007.
 786
 787     # Upload a PSL file to the UCSC database.
 788
 789     my ( $file,      # file to upload,
 790          $options,   # argument hashref
 791          $append,    # flag indicating table should be appended
 792        ) = @_;
 793
 794     # Returns nothing.
 795
 796     my ( $args );
 797
 798     if ( $append ) {
 799         $args = join " ", $options->{ "database" }, "-table=$options->{ 'table' }", "-clientLoad", "-append", $file;
 800     } else {
 801         $args = join " ", $options->{ "database" }, "-table=$options->{ 'table' }", "-clientLoad", $file;
 802     }
 803
 804     Maasha::Common::run( "hgLoadPsl", "$args > /dev/null 2>&1" );
 805 }
 806
 807
 808 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> TRACK FILE <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 809
 810
 811 sub update_my_tracks
 812 {
 813     # Martin A. Hansen, September 2007.
 814
 815     # Update the /home/user/ucsc/my_tracks.ra file and executes makeCustomTracks.pl
 816
 817     my ( $options,   # hashref
 818          $type,      # track type
 819        ) = @_;
 820
 821     # Returns nothing.
 822
 823     my ( $file, $fh_in, $fh_out, $line, $time );
 824
 825     $file = $ENV{ "HOME" } . "/ucsc/my_tracks.ra";
 826
 827     # ---- create a backup ----
 828
 829     $fh_in  = Maasha::Common::read_open( $file );
 830     $fh_out = Maasha::Common::write_open( "$file~" );
 831
 832     while ( $line = <$fh_in> ) {
 833         print $fh_out $line;
 834     }
 835
 836     close $fh_in;
 837     close $fh_out;
 838
 839     # ---- append track ----
 840
 841     $time = Maasha::Common::time_stamp();
 842
 843     $fh_out = Maasha::Common::append_open( $file );
 844
 845     if ( $type eq "sec_struct" )
 846     {
 847         print $fh_out "\n\n# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n";
 848
 849         print $fh_out "\n# Track added by 'upload_to_ucsc' $time\n\n";
 850
 851         print $fh_out "# Database $options->{ 'database' }\n\n";
 852
 853         print $fh_out "track $options->{ 'table' }\n";
 854         print $fh_out "shortLabel $options->{ 'short_label' }\n";
 855         print $fh_out "longLabel $options->{ 'long_label' }\n";
 856         print $fh_out "group $options->{ 'group' }\n";
 857         print $fh_out "priority $options->{ 'priority' }\n";
 858         print $fh_out "visibility $options->{ 'visibility' }\n";
 859         print $fh_out "color $options->{ 'color' }\n";
 860         print $fh_out "type bed 6 +\n";
 861         print $fh_out "mafTrack multiz17way\n";
 862
 863         print $fh_out "\n# //\n";
 864     }
 865     else
 866     {
 867         print $fh_out "\n\n# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n";
 868
 869         print $fh_out "\n# Track added by 'upload_to_ucsc' $time\n\n";
 870
 871         print $fh_out "# Database $options->{ 'database' }\n\n";
 872
 873         print $fh_out "track $options->{ 'table' }\n";
 874         print $fh_out "shortLabel $options->{ 'short_label' }\n";
 875         print $fh_out "longLabel $options->{ 'long_label' }\n";
 876         print $fh_out "group $options->{ 'group' }\n";
 877         print $fh_out "priority $options->{ 'priority' }\n";
 878         print $fh_out "useScore 1\n" if $options->{ 'use_score' };
 879         print $fh_out "visibility $options->{ 'visibility' }\n";
 880         print $fh_out "maxHeightPixels 50:50:11\n" if $type eq "wig 0";
 881         print $fh_out "color $options->{ 'color' }\n";
 882         print $fh_out "type $type\n";
 883
 884         print $fh_out "\n# //\n";
 885     }
 886
 887     close $fh_out;
 888
 889     Maasha::Common::run( "ucscMakeTracks.pl", "-b > /dev/null 2>&1" );
 890 }
 891
 892
 893 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> fixedStep format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 894
 895
 896 sub fixedstep_get_entry
 897 {
 898     # Martin A. Hansen, December 2007.
 899
 900     # Given a file handle to a PhastCons file get the
 901     # next entry which is all the lines after a "fixedStep"
 902     # line and until the next "fixedStep" line or EOF.
 903
 904     my ( $fh,   # filehandle
 905        ) = @_;
 906
 907     # Returns a list of lines
 908
 909     my ( $entry, @lines );
 910
 911     local $/ = "\nfixedStep ";
 912
 913     $entry = <$fh>;
 914
 915     chomp $entry;
 916
 917     @lines = split "\n", $entry;
 918
 919     return if @lines == 0;
 920
 921     $lines[ 0 ] =~ s/fixedStep?\s*//;
 922
 923     return wantarray ? @lines : \@lines;
 924 }
 925
 926
 927 sub fixedstep_index_create
 928 {
 929     # Martin A. Hansen, January 2008.
 930
 931     # Indexes a concatenated fixedStep file.
 932     # The index consists of a hash with chromosomes as keys,
 933     # and a list of [ chr_beg, next_chr_beg, chr_end, index_beg, index_len ] as values.
 934
 935     my ( $path,   # path to fixedStep file
 936        ) = @_;
 937
 938     # Returns a hashref
 939
 940     my ( $fh, $pos, $index_beg, $index_len, $entry, $locator, $chr, $step, $beg, $end, $len, %index, $i );
 941
 942     $fh = Maasha::Common::read_open( $path );
 943
 944     $pos = 0;
 945
 946     while ( $entry = Maasha::UCSC::fixedstep_get_entry( $fh ) )
 947     {
 948         $locator = shift @{ $entry };
 949
 950         if ( $locator =~ /chrom=([^ ]+) start=(\d+) step=(\d+)/ )
 951         {
 952             $chr  = $1;
 953             $beg  = $2 - 1;  #  fixedStep files are 1-based
 954             $step = $3;
 955         }
 956         else
 957         {
 958             Maasha::Common::error( qq(Could not parse locator: $locator) );
 959         }
 960
 961         $pos += length( $locator ) + 11;
 962
 963         $index_beg = $pos;
 964
 965 #        map { $pos += length( $_ ) + 1 } @{ $entry };
 966
 967         $pos += 6 * scalar @{ $entry };
 968
 969         $index_len = $pos - $index_beg;
 970
 971         push @{ $index{ $chr } }, [ $beg, undef, $beg + scalar @{ $entry } - 1, $index_beg, $index_len ];
 972     }
 973
 974     close $fh;
 975
 976     foreach $chr ( keys %index )
 977     {
 978         for ( $i = 0; $i < @{ $index{ $chr } } - 1; $i++ ) {
 979             $index{ $chr }->[ $i ]->[ NEXT_CHR_BEG ] = $index{ $chr }->[ $i + 1 ]->[ 0 ];
 980         }
 981
 982         $index{ $chr }->[ -1 ]->[ NEXT_CHR_BEG ] = $index{ $chr }->[ -1 ]->[ CHR_END ] + 1;
 983     }
 984
 985     return wantarray ? %index : \%index;
 986 }
 987
 988
 989 sub fixedstep_index_store
 990 {
 991     # Martin A. Hansen, January 2008.
 992
 993     # Writes a fixedStep index to binary file.
 994
 995     my ( $path,   # full path to file
 996          $index,  # list with index
 997        ) = @_;
 998
 999     # returns nothing
1000
1001     Maasha::Common::file_store( $path, $index );
1002 }
1003
1004
1005 sub fixedstep_index_retrieve
1006 {
1007     # Martin A. Hansen, January 2008.
1008
1009     # Retrieves a fixedStep index from binary file.
1010
1011     my ( $path,   # full path to file
1012        ) = @_;
1013
1014     # returns list
1015
1016     my $index;
1017
1018     $index = Maasha::Common::file_retrieve( $path );
1019
1020     return wantarray ? %{ $index } : $index;
1021 }
1022
1023
1024 sub fixedStep_index_lookup
1025 {
1026     # Martin A. Hansen, January 2008.
1027
1028     # Retrieve fixedStep scores from a indexed
1029     # fixedStep file given a chromosome and
1030     # begin and end positions.
1031
1032     my ( $index,     # data structure
1033          $fh,        # filehandle to datafile
1034          $chr,       # chromosome
1035          $chr_beg,   # chromosome beg
1036          $chr_end,   # chromosome end
1037          $flank,     # include flanking region - OPTIONAL
1038        ) = @_;
1039
1040     # Returns a list
1041
1042     my ( $index_beg, $index_end, $i, $c, $beg, $end, @vals, $scores );
1043
1044     $flank ||= 0;
1045
1046     $chr_beg -= $flank;
1047     $chr_end += $flank;
1048
1049 #    print "chr_beg->$chr_beg   chr_end->$chr_end   flank->$flank\n";
1050
1051     if ( exists $index->{ $chr } )
1052     {
1053         $index_beg = Maasha::Matrix::interval_search( $index->{ $chr }, 0, 1, $chr_beg );
1054
1055         if ( $index_beg < 0 ) {
1056             Maasha::Common::error( qq(Index search failed - begin index position doesn't exists: $chr_beg) );
1057         }
1058
1059         if ( $chr_end < $index->{ $chr }->[ $index_beg ]->[ 1 ] )
1060         {
1061             $index_end = $index_beg;
1062         }
1063         else
1064         {
1065             $index_end = Maasha::Matrix::interval_search( $index->{ $chr }, 0, 1, $chr_end );
1066
1067             if ( $index_end < 0 ) {
1068                 Maasha::Common::error( qq(Index search failed - end index position doesn't exists: $chr_end) );
1069             }
1070         }
1071
1072         map { $scores->[ $_ ] = 0 } 0 .. $chr_end - $chr_beg;
1073
1074         if ( $index_beg == $index_end )
1075         {
1076             $beg = Maasha::Calc::max( $chr_beg, $index->{ $chr }->[ $index_beg ]->[ CHR_BEG ] );
1077             $end = Maasha::Calc::min( $chr_end, $index->{ $chr }->[ $index_end ]->[ CHR_END ] );
1078
1079             if ( $beg <= $index->{ $chr }->[ $index_beg ]->[ CHR_END ] and $end >= $index->{ $chr }->[ $index_beg ]->[ CHR_BEG ] )
1080             {
1081                 @vals = split "\n", Maasha::Common::file_read(
1082                     $fh,
1083                     $index->{ $chr }->[ $index_beg ]->[ INDEX_BEG ] + 6 * ( $beg - $index->{ $chr }->[ $index_beg ]->[ CHR_BEG ] ),
1084                     6 * ( $end - $beg + 1 ),
1085                 );
1086             }
1087
1088             for ( $c = 0; $c < @vals; $c++ ) {
1089                 $scores->[ $c + $beg - $chr_beg ] = $vals[ $c ];
1090             }
1091         }
1092         else
1093         {
1094             $beg = Maasha::Calc::max( $chr_beg, $index->{ $chr }->[ $index_beg ]->[ CHR_BEG ] );
1095
1096 #            print Dumper( $beg, $index->{ $chr }->[ $index_beg ] );
1097 #            print Dumper( "next", $index->{ $chr }->[ $index_beg ]->[ NEXT_CHR_BEG ] );
1098
1099             #      beg         next
1100             #      v           v
1101             #  |||||||||.......
1102
1103             if ( $beg <= $index->{ $chr }->[ $index_beg ]->[ CHR_END ] )
1104             {
1105                 @vals = split "\n", Maasha::Common::file_read(
1106                     $fh,
1107                     $index->{ $chr }->[ $index_beg ]->[ INDEX_BEG ] + 6 * ( $beg - $index->{ $chr }->[ $index_beg ]->[ CHR_BEG ] ),
1108                     6 * ( $index->{ $chr }->[ $index_beg ]->[ CHR_END ] - $beg + 1 ),
1109                 );
1110
1111                 for ( $c = 0; $c < @vals; $c++ ) {
1112                     $scores->[ $c + $beg - $chr_beg ] = $vals[ $c ];
1113                 }
1114             }
1115
1116             $end = Maasha::Calc::min( $chr_end, $index->{ $chr }->[ $index_end ]->[ CHR_END ] );
1117
1118             if ( $end <= $index->{ $chr }->[ $index_end ]->[ CHR_END ] )
1119             {
1120                 @vals = split "\n", Maasha::Common::file_read(
1121                     $fh,
1122                     $index->{ $chr }->[ $index_end ]->[ INDEX_BEG ],
1123                     6 * ( $end - $index->{ $chr }->[ $index_end ]->[ CHR_BEG ] + 1 ),
1124                 );
1125
1126                 for ( $c = 0; $c < @vals; $c++ ) {
1127                     $scores->[ $c + $index->{ $chr }->[ $index_end ]->[ CHR_BEG ] - $chr_beg ] = $vals[ $c ];
1128                 }
1129             }
1130
1131             for ( $i = $index_beg + 1; $i <= $index_end - 1; $i++ )
1132             {
1133                 @vals = split "\n", Maasha::Common::file_read(
1134                     $fh,
1135                     $index->{ $chr }->[ $i ]->[ INDEX_BEG ],
1136                     6 * ( $index->{ $chr }->[ $i ]->[ CHR_END ] - $index->{ $chr }->[ $i ]->[ CHR_BEG ] + 1 ),
1137                 );
1138
1139                 for ( $c = 0; $c < @vals; $c++ ) {
1140                     $scores->[ $c + $index->{ $chr }->[ $i ]->[ CHR_BEG ] - $chr_beg ] = $vals[ $c ];
1141                 }
1142             }
1143         }
1144     }
1145     else
1146     {
1147         Maasha::Common::error( qq(Chromosome "$chr" was not found in index) );
1148     }
1149
1150     return wantarray ? @{ $scores } : $scores;
1151 }
1152
1153
1154 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PhastCons format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1155
1156
1157 sub phastcons_index
1158 {
1159     # Martin A. Hansen, July 2008
1160
1161     # Create a fixedStep index for PhastCons data.
1162
1163     my ( $file,   # file to index
1164          $dir,    # dir with file
1165        ) = @_;
1166
1167     # Returns nothing.
1168
1169     my ( $index );
1170
1171     $index = fixedstep_index_create( "$dir/$file" );
1172
1173     fixedstep_index_store( "$dir/$file.index", $index );
1174 }
1175
1176
1177 sub phastcons_parse_entry
1178 {
1179     # Martin A. Hansen, December 2007.
1180
1181     # Given a PhastCons entry converts this to a
1182     # list of super blocks.
1183
1184     my ( $lines,   # list of lines
1185          $args,    # argument hash
1186        ) = @_;
1187
1188     # Returns
1189
1190     my ( $info, $chr, $beg, $step, $i, $c, $j, @blocks, @super_blocks, @entries, $super_block, $block, @lens, @begs );
1191
1192     $info = shift @{ $lines };
1193
1194     if ( $info =~ /^chrom=([^ ]+) start=(\d+) step=(\d+)$/ )
1195     {
1196         $chr  = $1;
1197         $beg  = $2;
1198         $step = $3;
1199
1200         die qq(ERROR: step size $step != 1 -> problem!\n) if $step != 1; # in an ideal world should would be fixed ...
1201     }
1202
1203     $i = 0;
1204
1205     while ( $i < @{ $lines } )
1206     {
1207         if ( $lines->[ $i ] >= $args->{ "threshold" } )
1208         {
1209             $c = $i + 1;
1210
1211             while ( $c < @{ $lines } )
1212             {
1213                 if ( $lines->[ $c ] < $args->{ "threshold" } )
1214                 {
1215                     $j = $c + 1;
1216
1217                     while ( $j < @{ $lines } and $lines->[ $j ] < $args->{ "threshold" } ) {
1218                         $j++;
1219                     }
1220
1221                     if ( $j - $c > $args->{ "gap" } )
1222                     {
1223                         if ( $c - $i >= $args->{ "min" } )
1224                         {
1225                             push @blocks, {
1226                                 CHR     => $chr,
1227                                 CHR_BEG => $beg + $i - 1,
1228                                 CHR_END => $beg + $c - 2,
1229                                 CHR_LEN => $c - $i,
1230                             };
1231                         }
1232
1233                         $i = $j;
1234
1235                         last;
1236                     }
1237
1238                     $c = $j
1239                 }
1240                 else
1241                 {
1242                     $c++;
1243                 }
1244             }
1245
1246             if ( $c - $i >= $args->{ "min" } )
1247             {
1248                 push @blocks, {
1249                     CHR     => $chr,
1250                     CHR_BEG => $beg + $i - 1,
1251                     CHR_END => $beg + $c - 2,
1252                     CHR_LEN => $c - $i,
1253                 };
1254             }
1255
1256             $i = $c;
1257         }
1258         else
1259         {
1260             $i++;
1261         }
1262     }
1263
1264     $i = 0;
1265
1266     while ( $i < @blocks )
1267     {
1268         $c = $i + 1;
1269
1270         while ( $c < @blocks and $blocks[ $c ]->{ "CHR_BEG" } - $blocks[ $c - 1 ]->{ "CHR_END" } <= $args->{ "dist" } )
1271         {
1272             $c++;
1273         }
1274
1275         push @super_blocks, [ @blocks[ $i .. $c - 1 ] ];
1276
1277         $i = $c;
1278     }
1279
1280     foreach $super_block ( @super_blocks )
1281     {
1282         foreach $block ( @{ $super_block } )
1283         {
1284             push @begs, $block->{ "CHR_BEG" } - $super_block->[ 0 ]->{ "CHR_BEG" };
1285             push @lens, $block->{ "CHR_LEN" } - 1;
1286         }
1287
1288         $lens[ -1 ]++;
1289
1290         push @entries, {
1291             CHR        => $super_block->[ 0 ]->{ "CHR" },
1292             CHR_BEG    => $super_block->[ 0 ]->{ "CHR_BEG" },
1293             CHR_END    => $super_block->[ -1 ]->{ "CHR_END" },
1294             Q_ID       => "Q_ID",
1295             SCORE      => 100,
1296             STRAND     => "+",
1297             THICK_BEG  => $super_block->[ 0 ]->{ "CHR_BEG" },
1298             THICK_END  => $super_block->[ -1 ]->{ "CHR_END" } + 1,
1299             ITEMRGB    => "0,200,100",
1300             BLOCKCOUNT => scalar @{ $super_block },
1301             BLOCKSIZES => join( ",", @lens ),
1302             Q_BEGS     => join( ",", @begs ),
1303         };
1304
1305         undef @begs;
1306         undef @lens;
1307     }
1308
1309     return wantarray ? @entries : \@entries;
1310 }
1311
1312
1313 sub phastcons_normalize
1314 {
1315     # Martin A. Hansen, January 2008.
1316
1317     # Normalizes a list of lists with PhastCons scores,
1318     # in such a way that each list contains the same number
1319     # or PhastCons scores.
1320
1321     my ( $AoA,    # AoA with PhastCons scores
1322        ) = @_;
1323
1324     # Returns AoA.
1325
1326     my ( $list, $max, $min, $mean, $diff );
1327
1328     $min = 99999999;
1329     $max = 0;
1330
1331     foreach $list ( @{ $AoA } )
1332     {
1333         $min = scalar @{ $list } if scalar @{ $list } < $min;
1334         $max = scalar @{ $list } if scalar @{ $list } > $max;
1335     }
1336
1337     $mean = int( ( $min + $max ) / 2 );
1338
1339 #    print STDERR "min->$min   max->$max   mean->$mean\n";
1340
1341     foreach $list ( @{ $AoA } )
1342     {
1343         $diff = scalar @{ $list } - $mean;
1344
1345         phastcons_list_inflate( $list, abs( $diff ) ) if $diff < 0;
1346         phastcons_list_deflate( $list, $diff )        if $diff > 0;
1347     }
1348
1349     return wantarray ? @{ $AoA } : $AoA;
1350 }
1351
1352
1353 sub phastcons_list_inflate
1354 {
1355     # Martin A. Hansen, January 2008.
1356
1357     # Inflates a list with a given number of elements
1358     # in such a way that the extra elements are introduced
1359     # evenly over the entire length of the list. The value
1360     # of the extra elements is based on a mean of the
1361     # adjacent elements.
1362
1363     my ( $list,   # list of elements
1364          $diff,   # number of elements to introduce
1365        ) = @_;
1366
1367     # Returns nothing
1368
1369     my ( $len, $space, $i, $pos );
1370
1371     $len = scalar @{ $list };
1372
1373     $space = $len / $diff;
1374
1375     for ( $i = 0; $i < $diff; $i++ )
1376     {
1377         $pos = int( ( $space / 2 ) + $i * $space );
1378
1379         splice @{ $list }, $pos, 0, ( $list->[ $pos - 1 ] + $list->[ $pos + 1 ] ) / 2;
1380         # splice @{ $list }, $pos, 0, "X";
1381     }
1382
1383     die qq(ERROR: bad inflate\n) if scalar @{ $list } != $len + $diff;
1384 }
1385
1386
1387 sub phastcons_list_deflate
1388 {
1389     # Martin A. Hansen, January 2008.
1390
1391     # Deflates a list by removing a given number of elements
1392     # evenly distributed over the entire list.
1393
1394     my ( $list,   # list of elements
1395          $diff,   # number of elements to remove
1396        ) = @_;
1397
1398     # Returns nothing
1399
1400     my ( $len, $space, $i, $pos );
1401
1402     $len = scalar @{ $list };
1403
1404     $space = ( $len - $diff ) / $diff;
1405
1406     for ( $i = 0; $i < $diff; $i++ )
1407     {
1408         $pos = int( ( $space / 2 ) + $i * $space );
1409
1410         splice @{ $list }, $pos, 1;
1411     }
1412
1413     die qq(ERROR: bad deflate\n) if scalar @{ $list } != $len - $diff;
1414 }
1415
1416
1417 sub phastcons_mean
1418 {
1419     # Martin A. Hansen, January 2008.
1420
1421     # Given a normalized PhastCons matrix in an AoA,
1422     # calculate the mean for each column and return as a list.
1423
1424     my ( $AoA,    # AoA with normalized PhastCons scores
1425        ) = @_;
1426
1427     # Returns a list
1428
1429     my ( @list );
1430
1431     $AoA = Maasha::Matrix::matrix_flip( $AoA );
1432
1433     map { push @list, Maasha::Calc::mean( $_ ) } @{ $AoA };
1434
1435     return wantarray ? @list : \@list;
1436 }
1437
1438
1439 sub phastcons_median
1440 {
1441     # Martin A. Hansen, January 2008.
1442
1443     # Given a normalized PhastCons matrix in an AoA,
1444     # calculate the median for each column and return as a list.
1445
1446     my ( $AoA,    # AoA with normalized PhastCons scores
1447        ) = @_;
1448
1449     # Returns a list
1450
1451     my ( @list );
1452
1453     $AoA = Maasha::Matrix::matrix_flip( $AoA );
1454
1455     map { push @list, Maasha::Calc::median( $_ ) } @{ $AoA };
1456
1457     return wantarray ? @list : \@list;
1458 }
1459
1460
1461 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> MULTIPLE ALIGNMENT FILES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1462
1463
1464 sub maf_extract
1465 {
1466     # Martin A. Hansen, April 2008.
1467
1468     # Executes mafFrag to extract a subalignment from a multiz track
1469     # in the UCSC genome browser database.
1470
1471     my ( $tmp_dir,    # temporary directory
1472          $database,   # genome database
1473          $table,      # table with the multiz track
1474          $chr,        # chromosome
1475          $beg,        # begin position
1476          $end,        # end position
1477          $strand,     # strand
1478        ) = @_;
1479
1480     # Returns a list of record
1481
1482     my ( $tmp_file, $align );
1483
1484     $tmp_file = "$tmp_dir/maf_extract.maf";
1485
1486     Maasha::Common::run( "mafFrag", "$database $table $chr $beg $end $strand $tmp_file" );
1487
1488     $align = maf_parse( $tmp_file );
1489
1490     unlink $tmp_file;
1491
1492     return wantarray ? @{ $align } : $align;
1493 }
1494
1495
1496 sub maf_parse
1497 {
1498     # Martin A. Hansen, April 2008.
1499
1500
1501     my ( $path,   # full path to MAF file
1502        ) = @_;
1503
1504     # Returns a list of record.
1505
1506     my ( $fh, $line, @fields, @align );
1507
1508     $fh = Maasha::Common::read_open( $path );
1509
1510     while ( $line = <$fh> )
1511     {
1512         chomp $line;
1513
1514         if ( $line =~ /^s/ )
1515         {
1516             @fields = split / /, $line;
1517
1518             push @align, {
1519                 SEQ_NAME  => $fields[ 1 ],
1520                 SEQ       => $fields[ -1 ],
1521                 ALIGN     => 1,
1522                 ALIGN_LEN => length $fields[ -1 ],
1523             }
1524         }
1525     }
1526
1527     close $fh;
1528
1529     return wantarray ? @align : \@align;
1530 }
1531
1532
1533 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> WIGGLE FORMAT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1534
1535
1536 sub fixedstep_put_entry
1537 {
1538     # Martin A. Hansen, April 2008.
1539
1540     # Outputs a block of fixedStep values.
1541     # Used for outputting wiggle data.
1542
1543     my ( $chr,      # chromosome
1544          $beg,      # start position
1545          $block,    # list of scores
1546          $fh,       # filehandle - OPTIONAL
1547          $log10,    # flag indicating that log10 scores should be used
1548        ) = @_;
1549
1550     # Returns nothing.
1551
1552     $beg += 1;   # fixedStep format is 1 based.
1553
1554     $fh ||= \*STDOUT;
1555
1556     print $fh "fixedStep chrom=$chr start=$beg step=1\n";
1557
1558     if ( $log10 ) {
1559         map { printf( $fh "%f\n", Maasha::Calc::log10( $_ + 1 ) ) } @{ $block };
1560     } else {
1561         map { printf( $fh "%d\n", ( $_ + 1 ) ) } @{ $block };
1562     }
1563 }
1564
1565
1566 sub wiggle_upload_to_ucsc
1567 {
1568     # Martin A. Hansen, May 2008.
1569
1570     # Upload a wiggle file to the UCSC database.
1571
1572     my ( $tmp_dir,    # temporary directory
1573          $wib_dir,    # wib directory
1574          $wig_file,   # file to upload,
1575          $options,    # argument hashref
1576        ) = @_;
1577
1578     # Returns nothing.
1579
1580     my ( $args );
1581
1582 #    $args = join " ", "-tmpDir=$tmp_dir", "-pathPrefix=$wib_dir", $options->{ "database" }, $options->{ 'table' }, $wig_file;
1583
1584 #    Maasha::Common::run( "hgLoadWiggle", "$args > /dev/null 2>&1" );
1585
1586     if ( $options->{ 'verbose' } ) {
1587         `cd $tmp_dir && hgLoadWiggle -tmpDir=$tmp_dir -pathPrefix=$wib_dir $options->{ 'database' } $options->{ 'table' } $wig_file`;
1588     } else {
1589         `cd $tmp_dir && hgLoadWiggle -tmpDir=$tmp_dir -pathPrefix=$wib_dir $options->{ 'database' } $options->{ 'table' } $wig_file > /dev/null 2>&1`;
1590     }
1591 }
1592
1593
1594 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> MySQL CONF <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1595
1596
1597 sub ucsc_get_user
1598 {
1599     # Martin A. Hansen, May 2008
1600
1601     # Fetches the MySQL database user name from the
1602     # .hg.conf file in the users home directory.
1603
1604     # Returns a string.
1605
1606     my ( $fh, $line, $user );
1607
1608     $fh = Maasha::Common::read_open( "$ENV{ 'HOME' }/.hg.conf" );
1609
1610     while ( $line = <$fh> )
1611     {
1612         chomp $line;
1613
1614         if ( $line =~ /^db\.user=(.+)/ )
1615         {
1616             $user = $1;
1617
1618             last;
1619         }
1620     }
1621
1622     close $fh;
1623
1624     return $user;
1625 }
1626
1627
1628 sub ucsc_get_password
1629 {
1630     # Martin A. Hansen, May 2008
1631
1632     # Fetches the MySQL database password from the
1633     # .hg.conf file in the users home directory.
1634
1635     # Returns a string.
1636
1637     my ( $fh, $line, $password );
1638
1639     $fh = Maasha::Common::read_open( "$ENV{ 'HOME' }/.hg.conf" );
1640
1641     while ( $line = <$fh> )
1642     {
1643         chomp $line;
1644
1645         if ( $line =~ /^db\.password=(.+)/ )
1646         {
1647             $password = $1;
1648
1649             last;
1650         }
1651     }
1652
1653     close $fh;
1654
1655     return $password;
1656 }
1657
1658
1659 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1660
1661
1662 __END__
1663