code_perl/Maasha/UCSC.pm

   1 package Maasha::UCSC;
   2
   3 # Copyright (C) 2007 Martin A. Hansen.
   4
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; either version 2
   8 # of the License, or (at your option) any later version.
   9
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18
  19 # http://www.gnu.org/copyleft/gpl.html
  20
  21
  22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  23
  24
  25 # Stuff for interacting with UCSC genome browser
  26
  27
  28 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  29
  30
  31 use strict;
  32 use vars qw ( @ISA @EXPORT );
  33
  34 use Data::Dumper;
  35 use Time::HiRes qw( gettimeofday );
  36
  37 use Maasha::Common;
  38 use Maasha::Calc;
  39 use Maasha::Matrix;
  40
  41 use constant {
  42     FS_CHR_BEG      => 0,
  43     FS_NEXT_CHR_BEG => 1,
  44     FS_CHR_END      => 2,
  45     FS_INDEX_BEG    => 3,
  46     FS_INDEX_LEN    => 4,
  47
  48     CHR          => 0,
  49     CHR_BEG      => 1,
  50     CHR_END      => 2,
  51     Q_ID         => 3,
  52     SCORE        => 4,
  53     STRAND       => 5,
  54     THICK_BEG    => 6,
  55     THICK_END    => 7,
  56     ITEMRGB      => 8,
  57     BLOCKCOUNT   => 9,
  58     BLOCKSIZES   => 10,
  59     Q_BEGS       => 11,
  60 };
  61
  62 @ISA = qw( Exporter );
  63
  64 my $TIME = gettimeofday();
  65
  66
  67 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> BED format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  68
  69
  70 # http://genome.ucsc.edu/goldenPath/help/hgTracksHelp.html#BED
  71
  72
  73 sub bed_entry_get_array
  74 {
  75     # Martin A. Hansen, September 2008.
  76
  77     # Reads a BED entry given a filehandle.
  78
  79     # This is a new _faster_ BED entry parser that
  80     # uses arrays and not hashrefs.
  81
  82     # IMPORTANT! This function does not correct the
  83     # BED_END position that is kept the way UCSC
  84     # does.
  85
  86     my ( $fh,     # file handle
  87          $cols,   # columns to read - OPTIONAL (3,4,5,6, or 12)
  88        ) = @_;
  89
  90     # Returns a list.
  91
  92     my ( $line, @entry );
  93
  94     $line = <$fh>;
  95
  96     $line =~ tr/\n\r//d;    # some people have carriage returns in their BED files -> Grrrr
  97
  98     return if not defined $line;
  99
 100     if ( not defined $cols ) {
 101         $cols = 1 + $line =~ tr/\t//;
 102     }
 103
 104     @entry = split "\t", $line, $cols + 1;
 105
 106     pop @entry if scalar @entry > $cols;
 107
 108     return wantarray ? @entry : \@entry;
 109 }
 110
 111
 112 sub bed_get_entry
 113 {
 114     # Martin A. Hansen, December 2007.
 115
 116     # Reads a bed entry given a filehandle.
 117
 118     my ( $fh,        # file handle
 119          $columns,   # number of BED columns to read  -  OPTIONAL
 120        ) = @_;
 121
 122     # Returns hashref.
 123
 124     my ( $line, @fields, %entry );
 125
 126     $line = <$fh>;
 127
 128     $line =~ tr/\n\r//d;    # some people have carriage returns in their BED files -> Grrrr
 129
 130     return if not defined $line;
 131
 132     @fields = split "\t", $line;
 133
 134     $columns ||= scalar @fields;
 135
 136     if ( $columns == 3 )
 137     {
 138         %entry = (
 139             "CHR"      => $fields[ 0 ],
 140             "CHR_BEG"  => $fields[ 1 ],
 141             "CHR_END"  => $fields[ 2 ] - 1,
 142         );
 143     }
 144     elsif ( $columns == 4 )
 145     {
 146         %entry = (
 147             "CHR"      => $fields[ 0 ],
 148             "CHR_BEG"  => $fields[ 1 ],
 149             "CHR_END"  => $fields[ 2 ] - 1,
 150             "Q_ID"     => $fields[ 3 ],
 151         );
 152     }
 153     elsif ( $columns == 5 )
 154     {
 155         %entry = (
 156             "CHR"      => $fields[ 0 ],
 157             "CHR_BEG"  => $fields[ 1 ],
 158             "CHR_END"  => $fields[ 2 ] - 1,
 159             "Q_ID"     => $fields[ 3 ],
 160             "SCORE"    => $fields[ 4 ],
 161         );
 162     }
 163     elsif ( $columns == 6 )
 164     {
 165         %entry = (
 166             "CHR"      => $fields[ 0 ],
 167             "CHR_BEG"  => $fields[ 1 ],
 168             "CHR_END"  => $fields[ 2 ] - 1,
 169             "Q_ID"     => $fields[ 3 ],
 170             "SCORE"    => $fields[ 4 ],
 171             "STRAND"   => $fields[ 5 ],
 172         );
 173     }
 174     elsif ( $columns == 12 )
 175     {
 176         %entry = (
 177             "CHR"        => $fields[ 0 ],
 178             "CHR_BEG"    => $fields[ 1 ],
 179             "CHR_END"    => $fields[ 2 ] - 1,
 180             "Q_ID"       => $fields[ 3 ],
 181             "SCORE"      => $fields[ 4 ],
 182             "STRAND"     => $fields[ 5 ],
 183             "THICK_BEG"  => $fields[ 6 ],
 184             "THICK_END"  => $fields[ 7 ] - 1,
 185             "ITEMRGB"    => $fields[ 8 ],
 186             "BLOCKCOUNT" => $fields[ 9 ],
 187             "BLOCKSIZES" => $fields[ 10 ],
 188             "Q_BEGS"     => $fields[ 11 ],
 189         );
 190     }
 191     else
 192     {
 193         Maasha::Common::error( qq(Bad BED format in line->$line<-) );
 194     }
 195
 196     $entry{ "REC_TYPE" } = "BED";
 197     $entry{ "BED_LEN" }  = $entry{ "CHR_END" } - $entry{ "CHR_BEG" } + 1;
 198     $entry{ "BED_COLS" } = $columns;
 199
 200     return wantarray ? %entry : \%entry;
 201 }
 202
 203
 204 sub bed_get_entries
 205 {
 206     # Martin A. Hansen, January 2008.
 207
 208     # Given a path to a BED file, read in all entries
 209     # and return.
 210
 211     my ( $path,     # full path to BED file
 212          $columns,  # number of columns in BED file - OPTIONAL (but is faster)
 213        ) = @_;
 214
 215     # Returns a list.
 216
 217     my ( $fh, $entry, @list );
 218
 219     $fh = Maasha::Common::read_open( $path );
 220
 221     while ( $entry = bed_get_entry( $fh ) ) {
 222         push @list, $entry;
 223     }
 224
 225     close $fh;
 226
 227     return wantarray ? @list : \@list;
 228 }
 229
 230
 231 sub bed_entry_put_array
 232 {
 233     # Martin A. Hansen, Septermber 2008.
 234
 235     # Writes a BED entry array to file.
 236
 237     # IMPORTANT! This function does not correct the
 238     # BED_END position that is assumed to be in the
 239     # UCSC positions scheme.
 240
 241     my ( $record,   # list
 242          $fh,       # file handle                   - OPTIONAL
 243          $cols,     # number of columns in BED file - OPTIONAL
 244        ) = @_;
 245
 246     # Returns nothing.
 247
 248     $fh = \*STDOUT if not defined $fh;
 249
 250     if ( defined $cols ) {
 251         print $fh join( "\t", @{ $record }[ 0 .. $cols - 1 ] ), "\n";
 252     } else {
 253         print $fh join( "\t", @{ $record } ), "\n";
 254     }
 255 }
 256
 257
 258 sub bed_put_entry
 259 {
 260     # Martin A. Hansen, Septermber 2007.
 261
 262     # Writes a BED entry to file.
 263
 264     # NB, this could really be more robust!?
 265
 266     my ( $record,       # hashref
 267          $fh,           # file handle                   - OPTIONAL
 268          $columns,      # number of columns in BED file - OPTIONAL (but is faster)
 269        ) = @_;
 270
 271     # Returns nothing.
 272
 273     my ( @fields );
 274
 275     $columns ||= 12;   # max number of columns possible
 276
 277     if ( $columns == 3 )
 278     {
 279         push @fields, $record->{ "CHR" };
 280         push @fields, $record->{ "CHR_BEG" };
 281         push @fields, $record->{ "CHR_END" } + 1;
 282     }
 283     elsif ( $columns == 4 )
 284     {
 285         $record->{ "Q_ID" }  =~ s/\s+/_/g;
 286
 287         push @fields, $record->{ "CHR" };
 288         push @fields, $record->{ "CHR_BEG" };
 289         push @fields, $record->{ "CHR_END" } + 1;
 290         push @fields, $record->{ "Q_ID" };
 291     }
 292     elsif ( $columns == 5 )
 293     {
 294         $record->{ "Q_ID" }  =~ s/\s+/_/g;
 295         $record->{ "SCORE" } =~ s/\.\d*//;
 296
 297         push @fields, $record->{ "CHR" };
 298         push @fields, $record->{ "CHR_BEG" };
 299         push @fields, $record->{ "CHR_END" } + 1;
 300         push @fields, $record->{ "Q_ID" };
 301         push @fields, $record->{ "SCORE" };
 302     }
 303     elsif ( $columns == 6 )
 304     {
 305         $record->{ "Q_ID" }  =~ s/\s+/_/g;
 306         $record->{ "SCORE" } =~ s/\.\d*//;
 307
 308         push @fields, $record->{ "CHR" };
 309         push @fields, $record->{ "CHR_BEG" };
 310         push @fields, $record->{ "CHR_END" } + 1;
 311         push @fields, $record->{ "Q_ID" };
 312         push @fields, $record->{ "SCORE" };
 313         push @fields, $record->{ "STRAND" };
 314     }
 315     else
 316     {
 317         $record->{ "Q_ID" }  =~ s/\s+/_/g;
 318         $record->{ "SCORE" } =~ s/\.\d*//;
 319
 320         push @fields, $record->{ "CHR" };
 321         push @fields, $record->{ "CHR_BEG" };
 322         push @fields, $record->{ "CHR_END" } + 1;
 323         push @fields, $record->{ "Q_ID" };
 324         push @fields, $record->{ "SCORE" };
 325         push @fields, $record->{ "STRAND" };
 326         push @fields, $record->{ "THICK_BEG" }     if defined $record->{ "THICK_BEG" };
 327         push @fields, $record->{ "THICK_END" } + 1 if defined $record->{ "THICK_END" };
 328         push @fields, $record->{ "ITEMRGB" }       if defined $record->{ "ITEMRGB" };
 329         push @fields, $record->{ "BLOCKCOUNT" }    if defined $record->{ "BLOCKCOUNT" };
 330         push @fields, $record->{ "BLOCKSIZES" }    if defined $record->{ "BLOCKSIZES" };
 331         push @fields, $record->{ "Q_BEGS" }        if defined $record->{ "Q_BEGS" };
 332     }
 333
 334     if ( $fh ) {
 335         print $fh join( "\t", @fields ), "\n";
 336     } else {
 337         print join( "\t", @fields ), "\n";
 338     }
 339 }
 340
 341
 342 sub bed_put_entries
 343 {
 344     # Martin A. Hansen, January 2008.
 345
 346     # Write a list of BED entries.
 347
 348     my ( $entries,   # list of entries,
 349          $fh,        # file handle - OPTIONAL
 350        ) = @_;
 351
 352     # Returns nothing.
 353
 354     map { bed_put_entry( $_, $fh ) } @{ $entries };
 355 }
 356
 357
 358 sub bed_analyze
 359 {
 360     # Martin A. Hansen, March 2008.
 361
 362     # Given a bed record, analysis this to give information
 363     # about intron/exon sizes.
 364
 365     my ( $entry,   # BED entry
 366        ) = @_;
 367
 368     # Returns hashref.
 369
 370     my ( $i, @begs, @lens, $exon_max, $exon_min, $exon_len, $exon_tot, $intron_max, $intron_min, $intron_len, $intron_tot );
 371
 372     $exon_max   = 0;
 373     $exon_min   = 9999999999;
 374     $intron_max = 0;
 375     $intron_min = 9999999999;
 376
 377     $entry->{ "EXONS" }   = $entry->{ "BLOCKCOUNT" };
 378
 379     @begs = split /,/, $entry->{ "Q_BEGS" };
 380     @lens = split /,/, $entry->{ "BLOCKSIZES" };
 381
 382     for ( $i = 0; $i < $entry->{ "BLOCKCOUNT" }; $i++ )
 383     {
 384         $exon_len = @lens[ $i ];
 385
 386         $entry->{ "EXON_LEN_$i" } = $exon_len;
 387
 388         $exon_max = $exon_len if $exon_len > $exon_max;
 389         $exon_min = $exon_len if $exon_len < $exon_min;
 390
 391         $exon_tot += $exon_len;
 392     }
 393
 394     $entry->{ "EXON_LEN_-1" }   = $exon_len;
 395     $entry->{ "EXON_MAX_LEN" }  = $exon_max;
 396     $entry->{ "EXON_MIN_LEN" }  = $exon_min;
 397     $entry->{ "EXON_MEAN_LEN" } = int( $exon_tot / $entry->{ "EXONS" } );
 398
 399     $entry->{ "INTRONS" } = $entry->{ "BLOCKCOUNT" } - 1;
 400     $entry->{ "INTRONS" } = 0 if $entry->{ "INTRONS" } < 0;
 401
 402     if ( $entry->{ "INTRONS" } )
 403     {
 404         for ( $i = 1; $i < $entry->{ "BLOCKCOUNT" }; $i++ )
 405         {
 406             $intron_len = @begs[ $i ] - ( @begs[ $i - 1 ] + @lens[ $i - 1 ] );
 407
 408             $entry->{ "INTRON_LEN_" . ( $i - 1 ) } = $intron_len;
 409
 410             $intron_max = $intron_len if $intron_len > $intron_max;
 411             $intron_min = $intron_len if $intron_len < $intron_min;
 412
 413             $intron_tot += $intron_len;
 414         }
 415
 416         $entry->{ "INTRON_LEN_-1" }   = $intron_len;
 417         $entry->{ "INTRON_MAX_LEN" }  = $intron_max;
 418         $entry->{ "INTRON_MIN_LEN" }  = $intron_min;
 419         $entry->{ "INTRON_MEAN_LEN" } = int( $intron_tot / $entry->{ "INTRONS" } );
 420     }
 421
 422     return wantarray ? %{ $entry } : $entry;
 423 }
 424
 425
 426 sub bed_sort
 427 {
 428     # Martin A. Hansen, Septermber 2008
 429
 430     # Sorts a BED file using the c program
 431     # "bed_sort" specifing a sort mode:
 432
 433     # 1: chr AND chr_beg.
 434     # 2: chr AND strand AND chr_beg.
 435     # 3: chr_beg.
 436     # 4: strand AND chr_beg.
 437
 438     my ( $bed_file,    # BED file to sort
 439          $sort_mode,   # See above.
 440          $cols,        # Number of columns in BED file
 441        ) = @_;
 442
 443     &Maasha::Common::run( "bed_sort", "--sort $sort_mode --cols $cols $bed_file" );
 444 }
 445
 446
 447 sub bed_split_to_files
 448 {
 449     # Martin A. Hansen, Septermber 2008
 450
 451     # Given a list of BED files, split these
 452     # into temporary files based on the chromosome
 453     # name. Returns a list of the temporary files.
 454
 455     my ( $bed_files,   # list of BED files to split
 456          $cols,        # number of columns
 457          $tmp_dir,     # temporary directory
 458        ) = @_;
 459
 460     # Returns a list.
 461
 462     my ( $bed_file, $fh_in, $entry, $key, %fh_hash, @tmp_files );
 463
 464     foreach $bed_file ( @{ $bed_files } )
 465     {
 466         $fh_in = Maasha::Common::read_open( $bed_file );
 467
 468         while ( $entry = bed_entry_get_array( $fh_in, $cols ) )
 469         {
 470             $key = $entry->[ CHR ];
 471
 472             $fh_hash{ $key } = Maasha::Common::write_open( "$tmp_dir/$key.temp" ) if not exists $fh_hash{ $key };
 473
 474             bed_entry_put_array( $entry, $fh_hash{ $key } );
 475         }
 476
 477         close $fh_in;
 478     }
 479
 480     foreach $key ( sort keys %fh_hash )
 481     {
 482         push @tmp_files, "$tmp_dir/$key.temp";
 483
 484         close $fh_hash{ $key };
 485     }
 486
 487     return wantarray ? @tmp_files : \@tmp_files;
 488 }
 489
 490
 491 sub bed_merge_entries
 492 {
 493     # Martin A. Hansen, February 2008.
 494
 495     # Merge a list of given BED entries in one big entry.
 496
 497     my ( $entries,     # list of BED entries to be merged
 498        ) = @_;
 499
 500     # Returns hash.
 501
 502     my ( $i, @q_ids, @q_begs, @blocksizes, @new_q_begs, @new_blocksizes, %new_entry );
 503
 504     @{ $entries } = sort { $a->{ "CHR_BEG" } <=> $b->{ "CHR_BEG" } } @{ $entries };
 505
 506     for ( $i = 0; $i < @{ $entries }; $i++ )
 507     {
 508         Maasha::Common::error( qq(Attempted merge of BED entries from different chromosomes) ) if $entries->[ 0 ]->{ "CHR" }    ne $entries->[ $i ]->{ "CHR" };
 509         Maasha::Common::error( qq(Attempted merge of BED entries from different strands) )     if $entries->[ 0 ]->{ "STRAND" } ne $entries->[ $i ]->{ "STRAND" };
 510
 511         push @q_ids, $entries->[ $i ]->{ "Q_ID" } || sprintf( "ID%06d", $i );
 512
 513         if ( exists $entries->[ $i ]->{ "Q_BEGS" } )
 514         {
 515             @q_begs     = split ",", $entries->[ $i ]->{ "Q_BEGS" };
 516             @blocksizes = split ",", $entries->[ $i ]->{ "BLOCKSIZES" };
 517         }
 518         else
 519         {
 520             @q_begs     = 0;
 521             @blocksizes = $entries->[ $i ]->{ "CHR_END" } - $entries->[ $i ]->{ "CHR_BEG" } + 1;
 522         }
 523
 524         map { $_ += $entries->[ $i ]->{ "CHR_BEG" } } @q_begs;
 525
 526         push @new_q_begs, @q_begs;
 527         push @new_blocksizes, @blocksizes;
 528     }
 529
 530     map { $_ -= $entries->[ 0 ]->{ "CHR_BEG" } } @new_q_begs;
 531
 532     %new_entry = (
 533         CHR         => $entries->[ 0 ]->{ "CHR" },
 534         CHR_BEG     => $entries->[ 0 ]->{ "CHR_BEG" },
 535         CHR_END     => $entries->[ -1 ]->{ "CHR_END" },
 536         REC_TYPE    => "BED",
 537         BED_LEN     => $entries->[ -1 ]->{ "CHR_END" } - $entries->[ 0 ]->{ "CHR_BEG" } + 1,
 538         BED_COLS    => 12,
 539         Q_ID        => join( ":", @q_ids ),
 540         SCORE       => 999,
 541         STRAND      => $entries->[ 0 ]->{ "STRAND" }     || "+",
 542         THICK_BEG   => $entries->[ 0 ]->{ "THICK_BEG" }  || $entries->[ 0 ]->{ "CHR_BEG" },
 543         THICK_END   => $entries->[ -1 ]->{ "THICK_END" } || $entries->[ -1 ]->{ "CHR_END" },
 544         ITEMRGB     => "0,0,0",
 545         BLOCKCOUNT  => scalar @new_q_begs,
 546         BLOCKSIZES  => join( ",", @new_blocksizes ),
 547         Q_BEGS      => join( ",", @new_q_begs ),
 548     );
 549
 550     return wantarray ? %new_entry : \%new_entry;
 551 }
 552
 553
 554 sub bed_split_entry
 555 {
 556     # Martin A. Hansen, February 2008.
 557
 558     # Splits a given BED entry into a list of blocks,
 559     # which are returned. A list of 6 column BED entry is returned.
 560
 561     my ( $entry,    # BED entry hashref
 562        ) = @_;
 563
 564     # Returns a list.
 565
 566     my ( @q_begs, @blocksizes, $block, @blocks, $i );
 567
 568     if ( exists $entry->{ "BLOCKCOUNT" } )
 569     {
 570         @q_begs     = split ",", $entry->{ "Q_BEGS" };
 571         @blocksizes = split ",", $entry->{ "BLOCKSIZES" };
 572
 573         for ( $i = 0; $i < @q_begs; $i++ )
 574         {
 575             undef $block;
 576
 577             $block->{ "CHR" }      = $entry->{ "CHR" };
 578             $block->{ "CHR_BEG" }  = $entry->{ "CHR_BEG" } + $q_begs[ $i ];
 579             $block->{ "CHR_END" }  = $entry->{ "CHR_BEG" } + $q_begs[ $i ] + $blocksizes[ $i ] - 1;
 580             $block->{ "Q_ID" }     = $entry->{ "Q_ID" } . sprintf( "_%03d", $i );
 581             $block->{ "SCORE" }    = $entry->{ "SCORE" };
 582             $block->{ "STRAND" }   = $entry->{ "STRAND" };
 583             $block->{ "BED_LEN" }  = $block->{ "CHR_END" } - $block->{ "CHR_BEG" } + 1,
 584             $block->{ "BED_COLS" } = 6;
 585             $block->{ "REC_TYPE" } = "BED";
 586
 587             push @blocks, $block;
 588         }
 589     }
 590     else
 591     {
 592         @blocks = @{ $entry };
 593     }
 594
 595     return wantarray ? @blocks : \@blocks;
 596 }
 597
 598
 599
 600 sub bed_overlap
 601 {
 602     # Martin A. Hansen, February 2008.
 603
 604     # Checks if two BED entries overlap and
 605     # return 1 if so - else 0;
 606
 607     my ( $entry1,      # hashref
 608          $entry2,      # hashref
 609          $no_strand,   # don't check strand flag - OPTIONAL
 610        ) = @_;
 611
 612     # Return bolean.
 613
 614     return 0 if $entry1->{ "CHR" }    ne $entry2->{ "CHR" };
 615     return 0 if $entry1->{ "STRAND" } ne $entry2->{ "STRAND" };
 616
 617     if ( $entry1->{ "CHR_END" } < $entry2->{ "CHR_BEG" } or $entry1->{ "CHR_BEG" } > $entry2->{ "CHR_END" } ) {
 618         return 0;
 619     } else {
 620         return 1;
 621     }
 622 }
 623
 624
 625 sub bed_upload_to_ucsc
 626 {
 627     # Martin A. Hansen, September 2007.
 628
 629     # Upload a BED file to the UCSC database.
 630
 631     my ( $tmp_dir,   # temporary directory
 632          $file,      # file to upload,
 633          $options,   # argument hashref
 634          $append,    # flag indicating table should be appended
 635        ) = @_;
 636
 637     # Returns nothing.
 638
 639     my ( $args, $table, $sql_file, $fh_out, $fh_in );
 640
 641     if ( $append ) {
 642         $args = join " ", $options->{ "database" }, $options->{ "table" }, "-tmpDir=$tmp_dir", "-oldTable", $file;
 643     } else {
 644         $args = join " ", $options->{ "database" }, $options->{ "table" }, "-tmpDir=$tmp_dir", $file;
 645     }
 646
 647     if ( $options->{ "sec_struct" } )
 648     {
 649         $table = $options->{ "table" };
 650
 651         Maasha::Common::error( "Attempt to load secondary structure track without 'rnaSecStr' in table name" ) if not $table =~ /rnaSecStr/;
 652
 653         $sql_file = "$tmp_dir/upload_RNA_SS.sql";
 654
 655         $fh_out   = Maasha::Common::write_open( $sql_file );
 656
 657         print $fh_out qq(
 658 CREATE TABLE $table (
 659     bin smallint not null,              # Bin number for browser speedup
 660     chrom varchar(255) not null,        # Chromosome or FPC contig
 661     chromStart int unsigned not null,   # Start position in chromosome
 662     chromEnd int unsigned not null,     # End position in chromosome
 663     name varchar(255) not null,         # Name of item
 664     score int unsigned not null,        # Score from 0-1000
 665     strand char(1) not null,            # + or -
 666     size int unsigned not null,         # Size of element.
 667     secStr longblob not null,           # Parentheses and '.'s which define the secondary structure
 668     conf longblob not null,             # Confidence of secondary-structure annotation per position (0.0-1.0).
 669     #Indices
 670     INDEX(name(16)),
 671     INDEX(chrom(8), bin),
 672     INDEX(chrom(8), chromStart)
 673 );
 674         );
 675
 676         close $fh_out;
 677
 678         Maasha::Common::run( "hgLoadBed", "-notItemRgb -sqlTable=$sql_file $options->{ 'database' } $options->{ 'table' } -tmpDir=$tmp_dir $file > /dev/null 2>&1" );
 679
 680         unlink $sql_file;
 681     }
 682     else
 683     {
 684         Maasha::Common::run( "hgLoadBed", "$args > /dev/null 2>&1" );
 685     }
 686 }
 687
 688
 689 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PSL format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 690
 691
 692 sub psl_get_entry
 693 {
 694     # Martin A. Hansen, August 2008.
 695
 696     # Reads PSL next entry from a PSL file and returns a record.
 697
 698     my ( $fh,   # file handle of PSL filefull path to PSL file
 699        ) = @_;
 700
 701     # Returns hashref.
 702
 703     my ( $line, @fields, %record );
 704
 705     while ( $line = <$fh> )
 706     {
 707         chomp $line;
 708
 709         @fields = split "\t", $line;
 710
 711         if ( scalar @fields == 21 )
 712         {
 713             %record = (
 714                 REC_TYPE    => "PSL",
 715                 MATCHES     => $fields[ 0 ],
 716                 MISMATCHES  => $fields[ 1 ],
 717                 REPMATCHES  => $fields[ 2 ],
 718                 NCOUNT      => $fields[ 3 ],
 719                 QNUMINSERT  => $fields[ 4 ],
 720                 QBASEINSERT => $fields[ 5 ],
 721                 SNUMINSERT  => $fields[ 6 ],
 722                 SBASEINSERT => $fields[ 7 ],
 723                 STRAND      => $fields[ 8 ],
 724                 Q_ID        => $fields[ 9 ],
 725                 Q_LEN       => $fields[ 10 ],
 726                 Q_BEG       => $fields[ 11 ],
 727                 Q_END       => $fields[ 12 ] - 1,
 728                 S_ID        => $fields[ 13 ],
 729                 S_LEN       => $fields[ 14 ],
 730                 S_BEG       => $fields[ 15 ],
 731                 S_END       => $fields[ 16 ] - 1,
 732                 BLOCKCOUNT  => $fields[ 17 ],
 733                 BLOCKSIZES  => $fields[ 18 ],
 734                 Q_BEGS      => $fields[ 19 ],
 735                 S_BEGS      => $fields[ 20 ],
 736             );
 737
 738             $record{ "SCORE" } = $record{ "MATCHES" } + int( $record{ "REPMATCHES" } / 2 ) - $record{ "MISMATCHES" } - $record{ "QNUMINSERT" } - $record{ "SNUMINSERT" };
 739             $record{ "SPAN" }  = $fields[ 12 ] - $fields[ 11 ];
 740
 741             return wantarray ? %record : \%record;
 742         }
 743     }
 744
 745     return undef;
 746 }
 747
 748
 749 sub psl_get_entries
 750 {
 751     # Martin A. Hansen, February 2008.
 752
 753     # Reads PSL entries and returns a list of records.
 754
 755     my ( $path,   # full path to PSL file
 756        ) = @_;
 757
 758     # Returns hashref.
 759
 760     my ( $fh, @lines, @fields, $i, %record, @records );
 761
 762     $fh = Maasha::Common::read_open( $path );
 763
 764     @lines = <$fh>;
 765
 766     close $fh;
 767
 768     chomp @lines;
 769
 770     for ( $i = 5; $i < @lines; $i++ )
 771     {
 772         @fields = split "\t", $lines[ $i ];
 773
 774         Maasha::Common::error( qq(Bad PSL format in file "$path") ) if not @fields == 21;
 775
 776         undef %record;
 777
 778         %record = (
 779             REC_TYPE    => "PSL",
 780             MATCHES     => $fields[ 0 ],
 781             MISMATCHES  => $fields[ 1 ],
 782             REPMATCHES  => $fields[ 2 ],
 783             NCOUNT      => $fields[ 3 ],
 784             QNUMINSERT  => $fields[ 4 ],
 785             QBASEINSERT => $fields[ 5 ],
 786             SNUMINSERT  => $fields[ 6 ],
 787             SBASEINSERT => $fields[ 7 ],
 788             STRAND      => $fields[ 8 ],
 789             Q_ID        => $fields[ 9 ],
 790             Q_LEN       => $fields[ 10 ],
 791             Q_BEG       => $fields[ 11 ],
 792             Q_END       => $fields[ 12 ] - 1,
 793             S_ID        => $fields[ 13 ],
 794             S_LEN       => $fields[ 14 ],
 795             S_BEG       => $fields[ 15 ],
 796             S_END       => $fields[ 16 ] - 1,
 797             BLOCKCOUNT  => $fields[ 17 ],
 798             BLOCKSIZES  => $fields[ 18 ],
 799             Q_BEGS      => $fields[ 19 ],
 800             S_BEGS      => $fields[ 20 ],
 801         );
 802
 803         $record{ "SCORE" } = $record{ "MATCHES" } + int( $record{ "REPMATCHES" } / 2 ) - $record{ "MISMATCHES" } - $record{ "QNUMINSERT" } - $record{ "SNUMINSERT" };
 804
 805         push @records, { %record };
 806     }
 807
 808     return wantarray ? @records : \@records;
 809 }
 810
 811
 812 sub psl_put_header
 813 {
 814     # Martin A. Hansen, September 2007.
 815
 816     # Write a PSL header to file.
 817
 818     my ( $fh,  # file handle  - OPTIONAL
 819        ) = @_;
 820
 821     # Returns nothing.
 822
 823     $fh = \*STDOUT if not $fh;
 824
 825     print $fh qq(psLayout version 3
 826 match   mis-    rep.    N's     Q gap   Q gap   T gap   T gap   strand  Q               Q       Q       Q       T               T       T       T       block   blockSizes      qStart        match   match           count   bases   count   bases           name            size    start   end     name            size    start   end     count
 827 ---------------------------------------------------------------------------------------------------------------------------------------------------------------
 828 );
 829 }
 830
 831
 832 sub psl_put_entry
 833 {
 834     # Martin A. Hansen, September 2007.
 835
 836     # Write a PSL entry to file.
 837
 838     my ( $record,       # hashref
 839          $fh,           # file handle  -  OPTIONAL
 840        ) = @_;
 841
 842     # Returns nothing.
 843
 844     $fh = \*STDOUT if not $fh;
 845
 846     my @output;
 847
 848     push @output, $record->{ "MATCHES" };
 849     push @output, $record->{ "MISMATCHES" };
 850     push @output, $record->{ "REPMATCHES" };
 851     push @output, $record->{ "NCOUNT" };
 852     push @output, $record->{ "QNUMINSERT" };
 853     push @output, $record->{ "QBASEINSERT" };
 854     push @output, $record->{ "SNUMINSERT" };
 855     push @output, $record->{ "SBASEINSERT" };
 856     push @output, $record->{ "STRAND" };
 857     push @output, $record->{ "Q_ID" };
 858     push @output, $record->{ "Q_LEN" };
 859     push @output, $record->{ "Q_BEG" };
 860     push @output, $record->{ "Q_END" } + 1;
 861     push @output, $record->{ "S_ID" };
 862     push @output, $record->{ "S_LEN" };
 863     push @output, $record->{ "S_BEG" };
 864     push @output, $record->{ "S_END" } + 1;
 865     push @output, $record->{ "BLOCKCOUNT" };
 866     push @output, $record->{ "BLOCKSIZES" };
 867     push @output, $record->{ "Q_BEGS" };
 868     push @output, $record->{ "S_BEGS" };
 869
 870     print $fh join( "\t", @output ), "\n";
 871 }
 872
 873
 874 sub psl_upload_to_ucsc
 875 {
 876     # Martin A. Hansen, September 2007.
 877
 878     # Upload a PSL file to the UCSC database.
 879
 880     my ( $file,      # file to upload,
 881          $options,   # argument hashref
 882          $append,    # flag indicating table should be appended
 883        ) = @_;
 884
 885     # Returns nothing.
 886
 887     my ( $args );
 888
 889     if ( $append ) {
 890         $args = join " ", $options->{ "database" }, "-table=$options->{ 'table' }", "-clientLoad", "-append", $file;
 891     } else {
 892         $args = join " ", $options->{ "database" }, "-table=$options->{ 'table' }", "-clientLoad", $file;
 893     }
 894
 895     Maasha::Common::run( "hgLoadPsl", "$args > /dev/null 2>&1" );
 896 }
 897
 898
 899 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> TRACK FILE <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 900
 901
 902 sub ucsc_config_get_entry
 903 {
 904     # Martin A. Hansen, November 2008.
 905
 906     # Given a filehandle to a UCSC Genome Browser
 907     # config file (.ra) get the next entry and
 908     # return as a hash. Entries are separated by blank
 909     # lines. # lines are skipped unless it is the lines:
 910     # # Track added ...
 911     # # Database ...
 912
 913     my ( $fh,   # file hanlde
 914        ) = @_;
 915
 916     # Returns a hashref.
 917
 918     my ( $line, %record );
 919
 920     while ( $line = <$fh> )
 921     {
 922         if ( $line =~ /Track added by 'upload_to_ucsc' (\S+) (\S+)/) {
 923              $record{ 'date' } = $1;
 924              $record{ 'time' } = $2;
 925         } elsif ( $line =~ /^# date: (.+)/ ) {
 926             $record{ 'date' } = $1;
 927         } elsif ( $line =~ /^# time: (.+)/ ) {
 928             $record{ 'time' } = $1;
 929         } elsif ( $line =~ /^# (?:database:|Database) (.+)/ ) {
 930             $record{ 'database' } = $1;
 931         } elsif ( $line =~ /^#/ ) {
 932             # skip
 933         } elsif ( $line =~ /(\S+)\s+(.+)/ ) {
 934             $record{ $1 } = $2;
 935         }
 936
 937         last if $line =~ /^$/ and exists $record{ "track" };
 938     }
 939
 940     return undef if not exists $record{ "track" };
 941
 942     return wantarray ? %record : \%record;
 943 }
 944
 945
 946 sub ucsc_config_put_entry
 947 {
 948     # Martin A. Hansen, November 2008.
 949
 950     # Outputs a Biopiece record (a hashref)
 951     # to a filehandle or STDOUT.
 952
 953     my ( $record,    # hashref
 954          $fh_out,    # file handle
 955        ) = @_;
 956
 957     # Returns nothing.
 958
 959     my ( $date, $time );
 960
 961     $fh_out ||= \*STDOUT;
 962
 963     ( $date, $time ) = split " ", Maasha::Common::time_stamp();
 964
 965     $record->{ "date" } ||= $date;
 966     $record->{ "time" } ||= $time;
 967
 968     print $fh_out "\n# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n\n";
 969
 970     map { print $fh_out "# $_: $record->{ $_ }\n" if exists $record->{ $_ } } qw( date time database );
 971     map { print $fh_out "$_ $record->{ $_ }\n" if exists $record->{ $_ } } qw( track
 972                                                                                name
 973                                                                                description
 974                                                                                itemRgb
 975                                                                                db
 976                                                                                offset
 977                                                                                url
 978                                                                                htmlUrl
 979                                                                                shortLabel
 980                                                                                longLabel
 981                                                                                group
 982                                                                                priority
 983                                                                                useScore
 984                                                                                visibility
 985                                                                                color
 986                                                                                type );
 987 }
 988
 989
 990 sub update_my_tracks
 991 {
 992     # Martin A. Hansen, September 2007.
 993
 994     # Update the /home/user/ucsc/my_tracks.ra file and executes makeCustomTracks.pl
 995
 996     my ( $options,   # hashref
 997          $type,      # track type
 998        ) = @_;
 999
1000     # Returns nothing.
1001
1002     my ( $file, $fh_in, $fh_out, $line, $time );
1003
1004     $file = $ENV{ "HOME" } . "/ucsc/my_tracks.ra";
1005
1006     # ---- create a backup ----
1007
1008     $fh_in  = Maasha::Common::read_open( $file );
1009     $fh_out = Maasha::Common::write_open( "$file~" );
1010
1011     while ( $line = <$fh_in> ) {
1012         print $fh_out $line;
1013     }
1014
1015     close $fh_in;
1016     close $fh_out;
1017
1018     # ---- append track ----
1019
1020     $time = Maasha::Common::time_stamp();
1021
1022     $fh_out = Maasha::Common::append_open( $file );
1023
1024     if ( $type eq "sec_struct" )
1025     {
1026         print $fh_out "\n\n# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n";
1027
1028         print $fh_out "\n# Track added by 'upload_to_ucsc' $time\n\n";
1029
1030         print $fh_out "# Database $options->{ 'database' }\n\n";
1031
1032         print $fh_out "track $options->{ 'table' }\n";
1033         print $fh_out "shortLabel $options->{ 'short_label' }\n";
1034         print $fh_out "longLabel $options->{ 'long_label' }\n";
1035         print $fh_out "group $options->{ 'group' }\n";
1036         print $fh_out "priority $options->{ 'priority' }\n";
1037         print $fh_out "visibility $options->{ 'visibility' }\n";
1038         print $fh_out "color $options->{ 'color' }\n";
1039         print $fh_out "type bed 6 +\n";
1040         print $fh_out "mafTrack multiz17way\n";
1041
1042         print $fh_out "\n# //\n";
1043     }
1044     else
1045     {
1046         print $fh_out "\n\n# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n";
1047
1048         print $fh_out "\n# Track added by 'upload_to_ucsc' $time\n\n";
1049
1050         print $fh_out "# Database $options->{ 'database' }\n\n";
1051
1052         print $fh_out "track $options->{ 'table' }\n";
1053         print $fh_out "shortLabel $options->{ 'short_label' }\n";
1054         print $fh_out "longLabel $options->{ 'long_label' }\n";
1055         print $fh_out "group $options->{ 'group' }\n";
1056         print $fh_out "priority $options->{ 'priority' }\n";
1057         print $fh_out "useScore 1\n" if $options->{ 'use_score' };
1058         print $fh_out "visibility $options->{ 'visibility' }\n";
1059         print $fh_out "maxHeightPixels 50:50:11\n" if $type eq "wig 0";
1060         print $fh_out "color $options->{ 'color' }\n";
1061         print $fh_out "type $type\n";
1062
1063         print $fh_out "\n# //\n";
1064     }
1065
1066     close $fh_out;
1067
1068     Maasha::Common::run( "ucscMakeTracks.pl", "-b > /dev/null 2>&1" );
1069 }
1070
1071
1072 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> fixedStep format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1073
1074
1075 sub fixedstep_get_entry
1076 {
1077     # Martin A. Hansen, December 2007.
1078
1079     # Given a file handle to a PhastCons file get the
1080     # next entry which is all the lines after a "fixedStep"
1081     # line and until the next "fixedStep" line or EOF.
1082
1083     my ( $fh,   # filehandle
1084        ) = @_;
1085
1086     # Returns a list of lines
1087
1088     my ( $entry, @lines );
1089
1090     local $/ = "\nfixedStep ";
1091
1092     $entry = <$fh>;
1093
1094     chomp $entry;
1095
1096     @lines = split "\n", $entry;
1097
1098     return if @lines == 0;
1099
1100     $lines[ 0 ] =~ s/fixedStep?\s*//;
1101
1102     return wantarray ? @lines : \@lines;
1103 }
1104
1105
1106 sub fixedstep_index_create
1107 {
1108     # Martin A. Hansen, January 2008.
1109
1110     # Indexes a concatenated fixedStep file.
1111     # The index consists of a hash with chromosomes as keys,
1112     # and a list of [ chr_beg, next_chr_beg, chr_end, index_beg, index_len ] as values.
1113
1114     my ( $path,   # path to fixedStep file
1115        ) = @_;
1116
1117     # Returns a hashref
1118
1119     my ( $fh, $pos, $index_beg, $index_len, $entry, $locator, $chr, $step, $beg, $end, $len, %index, $i );
1120
1121     $fh = Maasha::Common::read_open( $path );
1122
1123     $pos = 0;
1124
1125     while ( $entry = Maasha::UCSC::fixedstep_get_entry( $fh ) )
1126     {
1127         $locator = shift @{ $entry };
1128
1129         if ( $locator =~ /chrom=([^ ]+) start=(\d+) step=(\d+)/ )
1130         {
1131             $chr  = $1;
1132             $beg  = $2 - 1;  #  fixedStep files are 1-based
1133             $step = $3;
1134         }
1135         else
1136         {
1137             Maasha::Common::error( qq(Could not parse locator: $locator) );
1138         }
1139
1140         $pos += length( $locator ) + 11;
1141
1142         $index_beg = $pos;
1143
1144 #        map { $pos += length( $_ ) + 1 } @{ $entry };
1145
1146         $pos += 6 * scalar @{ $entry };
1147
1148         $index_len = $pos - $index_beg;
1149
1150         push @{ $index{ $chr } }, [ $beg, undef, $beg + scalar @{ $entry } - 1, $index_beg, $index_len ];
1151     }
1152
1153     close $fh;
1154
1155     foreach $chr ( keys %index )
1156     {
1157         for ( $i = 0; $i < @{ $index{ $chr } } - 1; $i++ ) {
1158             $index{ $chr }->[ $i ]->[ FS_NEXT_CHR_BEG ] = $index{ $chr }->[ $i + 1 ]->[ 0 ];
1159         }
1160
1161         $index{ $chr }->[ -1 ]->[ FS_NEXT_CHR_BEG ] = $index{ $chr }->[ -1 ]->[ FS_CHR_END ] + 1;
1162     }
1163
1164     return wantarray ? %index : \%index;
1165 }
1166
1167
1168 sub fixedstep_index_store
1169 {
1170     # Martin A. Hansen, January 2008.
1171
1172     # Writes a fixedStep index to binary file.
1173
1174     my ( $path,   # full path to file
1175          $index,  # list with index
1176        ) = @_;
1177
1178     # returns nothing
1179
1180     Maasha::Common::file_store( $path, $index );
1181 }
1182
1183
1184 sub fixedstep_index_retrieve
1185 {
1186     # Martin A. Hansen, January 2008.
1187
1188     # Retrieves a fixedStep index from binary file.
1189
1190     my ( $path,   # full path to file
1191        ) = @_;
1192
1193     # returns list
1194
1195     my $index;
1196
1197     $index = Maasha::Common::file_retrieve( $path );
1198
1199     return wantarray ? %{ $index } : $index;
1200 }
1201
1202
1203 sub fixedstep_index_lookup
1204 {
1205     # Martin A. Hansen, January 2008.
1206
1207     # Retrieve fixedStep scores from a indexed
1208     # fixedStep file given a chromosome and
1209     # begin and end positions.
1210
1211     my ( $index,     # data structure
1212          $fh,        # filehandle to datafile
1213          $chr,       # chromosome
1214          $chr_beg,   # chromosome beg
1215          $chr_end,   # chromosome end
1216          $flank,     # include flanking region - OPTIONAL
1217        ) = @_;
1218
1219     # Returns a list
1220
1221     my ( $index_beg, $index_end, $i, $c, $beg, $end, @vals, $scores );
1222
1223     $flank ||= 0;
1224
1225     $chr_beg -= $flank;
1226     $chr_end += $flank;
1227
1228     # print "chr_beg->$chr_beg   chr_end->$chr_end   flank->$flank\n";
1229
1230     if ( exists $index->{ $chr } )
1231     {
1232         $index_beg = Maasha::Matrix::interval_search( $index->{ $chr }, 0, 1, $chr_beg );
1233
1234         if ( $index_beg < 0 ) {
1235             Maasha::Common::error( qq(Index search failed - begin index position doesn't exists: $chr_beg) );
1236         }
1237
1238         if ( $chr_end < $index->{ $chr }->[ $index_beg ]->[ 1 ] )
1239         {
1240             $index_end = $index_beg;
1241         }
1242         else
1243         {
1244             $index_end = Maasha::Matrix::interval_search( $index->{ $chr }, 0, 1, $chr_end );
1245
1246             if ( $index_end < 0 ) {
1247                 Maasha::Common::error( qq(Index search failed - end index position doesn't exists: $chr_end) );
1248             }
1249         }
1250
1251         map { $scores->[ $_ ] = 0 } 0 .. $chr_end - $chr_beg;
1252
1253         if ( $index_beg == $index_end )
1254         {
1255             $beg = Maasha::Calc::max( $chr_beg, $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] );
1256             $end = Maasha::Calc::min( $chr_end, $index->{ $chr }->[ $index_end ]->[ FS_CHR_END ] );
1257
1258             if ( $beg <= $index->{ $chr }->[ $index_beg ]->[ FS_CHR_END ] and $end >= $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] )
1259             {
1260                 @vals = split "\n", Maasha::Common::file_read(
1261                     $fh,
1262                     $index->{ $chr }->[ $index_beg ]->[ FS_INDEX_BEG ] + 6 * ( $beg - $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] ),
1263                     6 * ( $end - $beg + 1 ),
1264                 );
1265             }
1266
1267             for ( $c = 0; $c < @vals; $c++ ) {
1268                 $scores->[ $c + $beg - $chr_beg ] = $vals[ $c ];
1269             }
1270         }
1271         else
1272         {
1273             $beg = Maasha::Calc::max( $chr_beg, $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] );
1274
1275 #            print Dumper( $beg, $index->{ $chr }->[ $index_beg ] );
1276 #            print Dumper( "next", $index->{ $chr }->[ $index_beg ]->[ FS_NEXT_CHR_BEG ] );
1277
1278             #      beg         next
1279             #      v           v
1280             #  |||||||||.......
1281
1282             if ( $beg <= $index->{ $chr }->[ $index_beg ]->[ FS_CHR_END ] )
1283             {
1284                 @vals = split "\n", Maasha::Common::file_read(
1285                     $fh,
1286                     $index->{ $chr }->[ $index_beg ]->[ FS_INDEX_BEG ] + 6 * ( $beg - $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] ),
1287                     6 * ( $index->{ $chr }->[ $index_beg ]->[ FS_CHR_END ] - $beg + 1 ),
1288                 );
1289
1290                 for ( $c = 0; $c < @vals; $c++ ) {
1291                     $scores->[ $c + $beg - $chr_beg ] = $vals[ $c ];
1292                 }
1293             }
1294
1295             $end = Maasha::Calc::min( $chr_end, $index->{ $chr }->[ $index_end ]->[ FS_CHR_END ] );
1296
1297             if ( $end <= $index->{ $chr }->[ $index_end ]->[ FS_CHR_END ] )
1298             {
1299                 @vals = split "\n", Maasha::Common::file_read(
1300                     $fh,
1301                     $index->{ $chr }->[ $index_end ]->[ FS_INDEX_BEG ],
1302                     6 * ( $end - $index->{ $chr }->[ $index_end ]->[ FS_CHR_BEG ] + 1 ),
1303                 );
1304
1305                 for ( $c = 0; $c < @vals; $c++ ) {
1306                     $scores->[ $c + $index->{ $chr }->[ $index_end ]->[ FS_CHR_BEG ] - $chr_beg ] = $vals[ $c ];
1307                 }
1308             }
1309
1310             for ( $i = $index_beg + 1; $i <= $index_end - 1; $i++ )
1311             {
1312                 @vals = split "\n", Maasha::Common::file_read(
1313                     $fh,
1314                     $index->{ $chr }->[ $i ]->[ FS_INDEX_BEG ],
1315                     6 * ( $index->{ $chr }->[ $i ]->[ FS_CHR_END ] - $index->{ $chr }->[ $i ]->[ FS_CHR_BEG ] + 1 ),
1316                 );
1317
1318                 for ( $c = 0; $c < @vals; $c++ ) {
1319                     $scores->[ $c + $index->{ $chr }->[ $i ]->[ FS_CHR_BEG ] - $chr_beg ] = $vals[ $c ];
1320                 }
1321             }
1322         }
1323     }
1324     else
1325     {
1326         Maasha::Common::error( qq(Chromosome "$chr" was not found in index) );
1327     }
1328
1329     return wantarray ? @{ $scores } : $scores;
1330 }
1331
1332
1333 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PhastCons format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1334
1335
1336 sub phastcons_index
1337 {
1338     # Martin A. Hansen, July 2008
1339
1340     # Create a fixedStep index for PhastCons data.
1341
1342     my ( $file,   # file to index
1343          $dir,    # dir with file
1344        ) = @_;
1345
1346     # Returns nothing.
1347
1348     my ( $index );
1349
1350     $index = fixedstep_index_create( "$dir/$file" );
1351
1352     fixedstep_index_store( "$dir/$file.index", $index );
1353 }
1354
1355
1356 sub phastcons_parse_entry
1357 {
1358     # Martin A. Hansen, December 2007.
1359
1360     # Given a PhastCons entry converts this to a
1361     # list of super blocks.
1362
1363     my ( $lines,   # list of lines
1364          $args,    # argument hash
1365        ) = @_;
1366
1367     # Returns
1368
1369     my ( $info, $chr, $beg, $step, $i, $c, $j, @blocks, @super_blocks, @entries, $super_block, $block, @lens, @begs );
1370
1371     $info = shift @{ $lines };
1372
1373     if ( $info =~ /^chrom=([^ ]+) start=(\d+) step=(\d+)$/ )
1374     {
1375         $chr  = $1;
1376         $beg  = $2;
1377         $step = $3;
1378
1379         die qq(ERROR: step size $step != 1 -> problem!\n) if $step != 1; # in an ideal world should would be fixed ...
1380     }
1381
1382     $i = 0;
1383
1384     while ( $i < @{ $lines } )
1385     {
1386         if ( $lines->[ $i ] >= $args->{ "threshold" } )
1387         {
1388             $c = $i + 1;
1389
1390             while ( $c < @{ $lines } )
1391             {
1392                 if ( $lines->[ $c ] < $args->{ "threshold" } )
1393                 {
1394                     $j = $c + 1;
1395
1396                     while ( $j < @{ $lines } and $lines->[ $j ] < $args->{ "threshold" } ) {
1397                         $j++;
1398                     }
1399
1400                     if ( $j - $c > $args->{ "gap" } )
1401                     {
1402                         if ( $c - $i >= $args->{ "min" } )
1403                         {
1404                             push @blocks, {
1405                                 CHR     => $chr,
1406                                 CHR_BEG => $beg + $i - 1,
1407                                 CHR_END => $beg + $c - 2,
1408                                 CHR_LEN => $c - $i,
1409                             };
1410                         }
1411
1412                         $i = $j;
1413
1414                         last;
1415                     }
1416
1417                     $c = $j
1418                 }
1419                 else
1420                 {
1421                     $c++;
1422                 }
1423             }
1424
1425             if ( $c - $i >= $args->{ "min" } )
1426             {
1427                 push @blocks, {
1428                     CHR     => $chr,
1429                     CHR_BEG => $beg + $i - 1,
1430                     CHR_END => $beg + $c - 2,
1431                     CHR_LEN => $c - $i,
1432                 };
1433             }
1434
1435             $i = $c;
1436         }
1437         else
1438         {
1439             $i++;
1440         }
1441     }
1442
1443     $i = 0;
1444
1445     while ( $i < @blocks )
1446     {
1447         $c = $i + 1;
1448
1449         while ( $c < @blocks and $blocks[ $c ]->{ "CHR_BEG" } - $blocks[ $c - 1 ]->{ "CHR_END" } <= $args->{ "dist" } )
1450         {
1451             $c++;
1452         }
1453
1454         push @super_blocks, [ @blocks[ $i .. $c - 1 ] ];
1455
1456         $i = $c;
1457     }
1458
1459     foreach $super_block ( @super_blocks )
1460     {
1461         foreach $block ( @{ $super_block } )
1462         {
1463             push @begs, $block->{ "CHR_BEG" } - $super_block->[ 0 ]->{ "CHR_BEG" };
1464             push @lens, $block->{ "CHR_LEN" } - 1;
1465         }
1466
1467         $lens[ -1 ]++;
1468
1469         push @entries, {
1470             CHR        => $super_block->[ 0 ]->{ "CHR" },
1471             CHR_BEG    => $super_block->[ 0 ]->{ "CHR_BEG" },
1472             CHR_END    => $super_block->[ -1 ]->{ "CHR_END" },
1473             Q_ID       => "Q_ID",
1474             SCORE      => 100,
1475             STRAND     => "+",
1476             THICK_BEG  => $super_block->[ 0 ]->{ "CHR_BEG" },
1477             THICK_END  => $super_block->[ -1 ]->{ "CHR_END" } + 1,
1478             ITEMRGB    => "0,200,100",
1479             BLOCKCOUNT => scalar @{ $super_block },
1480             BLOCKSIZES => join( ",", @lens ),
1481             Q_BEGS     => join( ",", @begs ),
1482         };
1483
1484         undef @begs;
1485         undef @lens;
1486     }
1487
1488     return wantarray ? @entries : \@entries;
1489 }
1490
1491
1492 sub phastcons_normalize
1493 {
1494     # Martin A. Hansen, January 2008.
1495
1496     # Normalizes a list of lists with PhastCons scores,
1497     # in such a way that each list contains the same number
1498     # or PhastCons scores.
1499
1500     my ( $AoA,    # AoA with PhastCons scores
1501        ) = @_;
1502
1503     # Returns AoA.
1504
1505     my ( $list, $max, $min, $mean, $diff );
1506
1507     $min = 99999999;
1508     $max = 0;
1509
1510     foreach $list ( @{ $AoA } )
1511     {
1512         $min = scalar @{ $list } if scalar @{ $list } < $min;
1513         $max = scalar @{ $list } if scalar @{ $list } > $max;
1514     }
1515
1516     $mean = int( ( $min + $max ) / 2 );
1517
1518 #    print STDERR "min->$min   max->$max   mean->$mean\n";
1519
1520     foreach $list ( @{ $AoA } )
1521     {
1522         $diff = scalar @{ $list } - $mean;
1523
1524         phastcons_list_inflate( $list, abs( $diff ) ) if $diff < 0;
1525         phastcons_list_deflate( $list, $diff )        if $diff > 0;
1526     }
1527
1528     return wantarray ? @{ $AoA } : $AoA;
1529 }
1530
1531
1532 sub phastcons_list_inflate
1533 {
1534     # Martin A. Hansen, January 2008.
1535
1536     # Inflates a list with a given number of elements
1537     # in such a way that the extra elements are introduced
1538     # evenly over the entire length of the list. The value
1539     # of the extra elements is based on a mean of the
1540     # adjacent elements.
1541
1542     my ( $list,   # list of elements
1543          $diff,   # number of elements to introduce
1544        ) = @_;
1545
1546     # Returns nothing
1547
1548     my ( $len, $space, $i, $pos );
1549
1550     $len = scalar @{ $list };
1551
1552     $space = $len / $diff;
1553
1554     for ( $i = 0; $i < $diff; $i++ )
1555     {
1556         $pos = int( ( $space / 2 ) + $i * $space );
1557
1558         splice @{ $list }, $pos, 0, ( $list->[ $pos - 1 ] + $list->[ $pos + 1 ] ) / 2;
1559         # splice @{ $list }, $pos, 0, "X";
1560     }
1561
1562     die qq(ERROR: bad inflate\n) if scalar @{ $list } != $len + $diff;
1563 }
1564
1565
1566 sub phastcons_list_deflate
1567 {
1568     # Martin A. Hansen, January 2008.
1569
1570     # Deflates a list by removing a given number of elements
1571     # evenly distributed over the entire list.
1572
1573     my ( $list,   # list of elements
1574          $diff,   # number of elements to remove
1575        ) = @_;
1576
1577     # Returns nothing
1578
1579     my ( $len, $space, $i, $pos );
1580
1581     $len = scalar @{ $list };
1582
1583     $space = ( $len - $diff ) / $diff;
1584
1585     for ( $i = 0; $i < $diff; $i++ )
1586     {
1587         $pos = int( ( $space / 2 ) + $i * $space );
1588
1589         splice @{ $list }, $pos, 1;
1590     }
1591
1592     die qq(ERROR: bad deflate\n) if scalar @{ $list } != $len - $diff;
1593 }
1594
1595
1596 sub phastcons_mean
1597 {
1598     # Martin A. Hansen, January 2008.
1599
1600     # Given a normalized PhastCons matrix in an AoA,
1601     # calculate the mean for each column and return as a list.
1602
1603     my ( $AoA,    # AoA with normalized PhastCons scores
1604        ) = @_;
1605
1606     # Returns a list
1607
1608     my ( @list );
1609
1610     $AoA = Maasha::Matrix::matrix_flip( $AoA );
1611
1612     map { push @list, Maasha::Calc::mean( $_ ) } @{ $AoA };
1613
1614     return wantarray ? @list : \@list;
1615 }
1616
1617
1618 sub phastcons_median
1619 {
1620     # Martin A. Hansen, January 2008.
1621
1622     # Given a normalized PhastCons matrix in an AoA,
1623     # calculate the median for each column and return as a list.
1624
1625     my ( $AoA,    # AoA with normalized PhastCons scores
1626        ) = @_;
1627
1628     # Returns a list
1629
1630     my ( @list );
1631
1632     $AoA = Maasha::Matrix::matrix_flip( $AoA );
1633
1634     map { push @list, Maasha::Calc::median( $_ ) } @{ $AoA };
1635
1636     return wantarray ? @list : \@list;
1637 }
1638
1639
1640 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> MULTIPLE ALIGNMENT FILES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1641
1642
1643 sub maf_extract
1644 {
1645     # Martin A. Hansen, April 2008.
1646
1647     # Executes mafFrag to extract a subalignment from a multiz track
1648     # in the UCSC genome browser database.
1649
1650     my ( $tmp_dir,    # temporary directory
1651          $database,   # genome database
1652          $table,      # table with the multiz track
1653          $chr,        # chromosome
1654          $beg,        # begin position
1655          $end,        # end position
1656          $strand,     # strand
1657        ) = @_;
1658
1659     # Returns a list of record
1660
1661     my ( $tmp_file, $align );
1662
1663     $tmp_file = "$tmp_dir/maf_extract.maf";
1664
1665     Maasha::Common::run( "mafFrag", "$database $table $chr $beg $end $strand $tmp_file" );
1666
1667     $align = maf_parse( $tmp_file );
1668
1669     unlink $tmp_file;
1670
1671     return wantarray ? @{ $align } : $align;
1672 }
1673
1674
1675 sub maf_parse
1676 {
1677     # Martin A. Hansen, April 2008.
1678
1679
1680     my ( $path,   # full path to MAF file
1681        ) = @_;
1682
1683     # Returns a list of record.
1684
1685     my ( $fh, $line, @fields, @align );
1686
1687     $fh = Maasha::Common::read_open( $path );
1688
1689     while ( $line = <$fh> )
1690     {
1691         chomp $line;
1692
1693         if ( $line =~ /^s/ )
1694         {
1695             @fields = split / /, $line;
1696
1697             push @align, {
1698                 SEQ_NAME  => $fields[ 1 ],
1699                 SEQ       => $fields[ -1 ],
1700                 ALIGN     => 1,
1701                 ALIGN_LEN => length $fields[ -1 ],
1702             }
1703         }
1704     }
1705
1706     close $fh;
1707
1708     return wantarray ? @align : \@align;
1709 }
1710
1711
1712 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> WIGGLE FORMAT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1713
1714
1715 sub fixedstep_put_entry
1716 {
1717     # Martin A. Hansen, April 2008.
1718
1719     # Outputs a block of fixedStep values.
1720     # Used for outputting wiggle data.
1721
1722     my ( $chr,      # chromosome
1723          $beg,      # start position
1724          $block,    # list of scores
1725          $fh,       # filehandle - OPTIONAL
1726          $log10,    # flag indicating that log10 scores should be used
1727        ) = @_;
1728
1729     # Returns nothing.
1730
1731     $beg += 1;   # fixedStep format is 1 based.
1732
1733     $fh ||= \*STDOUT;
1734
1735     print $fh "fixedStep chrom=$chr start=$beg step=1\n";
1736
1737     if ( $log10 ) {
1738         map { printf( $fh "%f\n", Maasha::Calc::log10( $_ + 1 ) ) } @{ $block };
1739     } else {
1740         map { printf( $fh "%d\n", ( $_ + 1 ) ) } @{ $block };
1741     }
1742 }
1743
1744
1745 sub wiggle_upload_to_ucsc
1746 {
1747     # Martin A. Hansen, May 2008.
1748
1749     # Upload a wiggle file to the UCSC database.
1750
1751     my ( $tmp_dir,    # temporary directory
1752          $wib_dir,    # wib directory
1753          $wig_file,   # file to upload,
1754          $options,    # argument hashref
1755        ) = @_;
1756
1757     # Returns nothing.
1758
1759     my ( $args );
1760
1761 #    $args = join " ", "-tmpDir=$tmp_dir", "-pathPrefix=$wib_dir", $options->{ "database" }, $options->{ 'table' }, $wig_file;
1762
1763 #    Maasha::Common::run( "hgLoadWiggle", "$args > /dev/null 2>&1" );
1764
1765     if ( $options->{ 'verbose' } ) {
1766         `cd $tmp_dir && hgLoadWiggle -tmpDir=$tmp_dir -pathPrefix=$wib_dir $options->{ 'database' } $options->{ 'table' } $wig_file`;
1767     } else {
1768         `cd $tmp_dir && hgLoadWiggle -tmpDir=$tmp_dir -pathPrefix=$wib_dir $options->{ 'database' } $options->{ 'table' } $wig_file > /dev/null 2>&1`;
1769     }
1770 }
1771
1772
1773 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> MySQL CONF <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1774
1775
1776 sub ucsc_get_user
1777 {
1778     # Martin A. Hansen, May 2008
1779
1780     # Fetches the MySQL database user name from the
1781     # .hg.conf file in the users home directory.
1782
1783     # Returns a string.
1784
1785     my ( $fh, $line, $user );
1786
1787     $fh = Maasha::Common::read_open( "$ENV{ 'HOME' }/.hg.conf" );
1788
1789     while ( $line = <$fh> )
1790     {
1791         chomp $line;
1792
1793         if ( $line =~ /^db\.user=(.+)/ )
1794         {
1795             $user = $1;
1796
1797             last;
1798         }
1799     }
1800
1801     close $fh;
1802
1803     return $user;
1804 }
1805
1806
1807 sub ucsc_get_password
1808 {
1809     # Martin A. Hansen, May 2008
1810
1811     # Fetches the MySQL database password from the
1812     # .hg.conf file in the users home directory.
1813
1814     # Returns a string.
1815
1816     my ( $fh, $line, $password );
1817
1818     $fh = Maasha::Common::read_open( "$ENV{ 'HOME' }/.hg.conf" );
1819
1820     while ( $line = <$fh> )
1821     {
1822         chomp $line;
1823
1824         if ( $line =~ /^db\.password=(.+)/ )
1825         {
1826             $password = $1;
1827
1828             last;
1829         }
1830     }
1831
1832     close $fh;
1833
1834     return $password;
1835 }
1836
1837
1838 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1839
1840
1841 __END__
1842