code_perl/Maasha/UCSC.pm

   1 package Maasha::UCSC;
   2
   3 # Copyright (C) 2007 Martin A. Hansen.
   4
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; either version 2
   8 # of the License, or (at your option) any later version.
   9
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18
  19 # http://www.gnu.org/copyleft/gpl.html
  20
  21
  22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  23
  24
  25 # Stuff for interacting with UCSC genome browser
  26
  27
  28 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  29
  30
  31 use strict;
  32 use vars qw ( @ISA @EXPORT );
  33
  34 use Data::Dumper;
  35 use Time::HiRes qw( gettimeofday );
  36
  37 use Maasha::Common;
  38 use Maasha::Calc;
  39 use Maasha::Matrix;
  40
  41 use constant {
  42     FS_CHR_BEG      => 0,
  43     FS_NEXT_CHR_BEG => 1,
  44     FS_CHR_END      => 2,
  45     FS_INDEX_BEG    => 3,
  46     FS_INDEX_LEN    => 4,
  47
  48     CHR          => 0,
  49     CHR_BEG      => 1,
  50     CHR_END      => 2,
  51     Q_ID         => 3,
  52     SCORE        => 4,
  53     STRAND       => 5,
  54     THICK_BEG    => 6,
  55     THICK_END    => 7,
  56     ITEMRGB      => 8,
  57     BLOCKCOUNT   => 9,
  58     BLOCKSIZES   => 10,
  59     Q_BEGS       => 11,
  60 };
  61
  62 @ISA = qw( Exporter );
  63
  64 my $TIME = gettimeofday();
  65
  66
  67 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> BED format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  68
  69
  70 # http://genome.ucsc.edu/goldenPath/help/hgTracksHelp.html#BED
  71
  72
  73 sub bed_entry_get_array
  74 {
  75     # Martin A. Hansen, September 2008.
  76
  77     # Reads a BED entry given a filehandle.
  78
  79     # This is a new _faster_ BED entry parser that
  80     # uses arrays and not hashrefs.
  81
  82     # IMPORTANT! This function does not correct the
  83     # BED_END position that is kept the way UCSC
  84     # does.
  85
  86     my ( $fh,     # file handle
  87          $cols,   # columns to read - OPTIONAL (3,4,5,6, or 12)
  88        ) = @_;
  89
  90     # Returns a list.
  91
  92     my ( $line, @entry );
  93
  94     $line = <$fh>;
  95
  96     $line =~ tr/\n\r//d;    # some people have carriage returns in their BED files -> Grrrr
  97
  98     return if not defined $line;
  99
 100     if ( not defined $cols ) {
 101         $cols = 1 + $line =~ tr/\t//;
 102     }
 103
 104     @entry = split "\t", $line, $cols + 1;
 105
 106     pop @entry if scalar @entry > $cols;
 107
 108     return wantarray ? @entry : \@entry;
 109 }
 110
 111
 112 sub bed_get_entry
 113 {
 114     # Martin A. Hansen, December 2007.
 115
 116     # Reads a bed entry given a filehandle.
 117
 118     my ( $fh,        # file handle
 119          $columns,   # number of BED columns to read  -  OPTIONAL
 120        ) = @_;
 121
 122     # Returns hashref.
 123
 124     my ( $line, @fields, %entry );
 125
 126     $line = <$fh>;
 127
 128     $line =~ tr/\n\r//d;    # some people have carriage returns in their BED files -> Grrrr
 129
 130     return if not defined $line;
 131
 132     @fields = split "\t", $line;
 133
 134     $columns ||= scalar @fields;
 135
 136     if ( $columns == 3 )
 137     {
 138         %entry = (
 139             "CHR"      => $fields[ 0 ],
 140             "CHR_BEG"  => $fields[ 1 ],
 141             "CHR_END"  => $fields[ 2 ] - 1,
 142         );
 143     }
 144     elsif ( $columns == 4 )
 145     {
 146         %entry = (
 147             "CHR"      => $fields[ 0 ],
 148             "CHR_BEG"  => $fields[ 1 ],
 149             "CHR_END"  => $fields[ 2 ] - 1,
 150             "Q_ID"     => $fields[ 3 ],
 151         );
 152     }
 153     elsif ( $columns == 5 )
 154     {
 155         %entry = (
 156             "CHR"      => $fields[ 0 ],
 157             "CHR_BEG"  => $fields[ 1 ],
 158             "CHR_END"  => $fields[ 2 ] - 1,
 159             "Q_ID"     => $fields[ 3 ],
 160             "SCORE"    => $fields[ 4 ],
 161         );
 162     }
 163     elsif ( $columns == 6 )
 164     {
 165         %entry = (
 166             "CHR"      => $fields[ 0 ],
 167             "CHR_BEG"  => $fields[ 1 ],
 168             "CHR_END"  => $fields[ 2 ] - 1,
 169             "Q_ID"     => $fields[ 3 ],
 170             "SCORE"    => $fields[ 4 ],
 171             "STRAND"   => $fields[ 5 ],
 172         );
 173     }
 174     elsif ( $columns == 12 )
 175     {
 176         %entry = (
 177             "CHR"        => $fields[ 0 ],
 178             "CHR_BEG"    => $fields[ 1 ],
 179             "CHR_END"    => $fields[ 2 ] - 1,
 180             "Q_ID"       => $fields[ 3 ],
 181             "SCORE"      => $fields[ 4 ],
 182             "STRAND"     => $fields[ 5 ],
 183             "THICK_BEG"  => $fields[ 6 ],
 184             "THICK_END"  => $fields[ 7 ] - 1,
 185             "ITEMRGB"    => $fields[ 8 ],
 186             "BLOCKCOUNT" => $fields[ 9 ],
 187             "BLOCKSIZES" => $fields[ 10 ],
 188             "Q_BEGS"     => $fields[ 11 ],
 189         );
 190     }
 191     else
 192     {
 193         Maasha::Common::error( qq(Bad BED format in line->$line<-) );
 194     }
 195
 196     $entry{ "REC_TYPE" } = "BED";
 197     $entry{ "BED_LEN" }  = $entry{ "CHR_END" } - $entry{ "CHR_BEG" } + 1;
 198     $entry{ "BED_COLS" } = $columns;
 199
 200     return wantarray ? %entry : \%entry;
 201 }
 202
 203
 204 sub bed_get_entries
 205 {
 206     # Martin A. Hansen, January 2008.
 207
 208     # Given a path to a BED file, read in all entries
 209     # and return.
 210
 211     my ( $path,     # full path to BED file
 212          $columns,  # number of columns in BED file - OPTIONAL (but is faster)
 213        ) = @_;
 214
 215     # Returns a list.
 216
 217     my ( $fh, $entry, @list );
 218
 219     $fh = Maasha::Common::read_open( $path );
 220
 221     while ( $entry = bed_get_entry( $fh ) ) {
 222         push @list, $entry;
 223     }
 224
 225     close $fh;
 226
 227     return wantarray ? @list : \@list;
 228 }
 229
 230
 231 sub bed_entry_put_array
 232 {
 233     # Martin A. Hansen, Septermber 2008.
 234
 235     # Writes a BED entry array to file.
 236
 237     # IMPORTANT! This function does not correct the
 238     # BED_END position that is assumed to be in the
 239     # UCSC positions scheme.
 240
 241     my ( $record,   # list
 242          $fh,       # file handle                   - OPTIONAL
 243          $cols,     # number of columns in BED file - OPTIONAL
 244        ) = @_;
 245
 246     # Returns nothing.
 247
 248     $fh = \*STDOUT if not defined $fh;
 249
 250     if ( defined $cols ) {
 251         print $fh join( "\t", @{ $record }[ 0 .. $cols - 1 ] ), "\n";
 252     } else {
 253         print $fh join( "\t", @{ $record } ), "\n";
 254     }
 255 }
 256
 257
 258 sub bed_put_entry
 259 {
 260     # Martin A. Hansen, Septermber 2007.
 261
 262     # Writes a BED entry to file.
 263
 264     # NB, this could really be more robust!?
 265
 266     my ( $record,       # hashref
 267          $fh,           # file handle                   - OPTIONAL
 268          $columns,      # number of columns in BED file - OPTIONAL (but is faster)
 269        ) = @_;
 270
 271     # Returns nothing.
 272
 273     my ( @fields );
 274
 275     $columns ||= 12;   # max number of columns possible
 276
 277     if ( $columns == 3 )
 278     {
 279         push @fields, $record->{ "CHR" };
 280         push @fields, $record->{ "CHR_BEG" };
 281         push @fields, $record->{ "CHR_END" } + 1;
 282     }
 283     elsif ( $columns == 4 )
 284     {
 285         $record->{ "Q_ID" }  =~ s/\s+/_/g;
 286
 287         push @fields, $record->{ "CHR" };
 288         push @fields, $record->{ "CHR_BEG" };
 289         push @fields, $record->{ "CHR_END" } + 1;
 290         push @fields, $record->{ "Q_ID" };
 291     }
 292     elsif ( $columns == 5 )
 293     {
 294         $record->{ "Q_ID" }  =~ s/\s+/_/g;
 295         $record->{ "SCORE" } =~ s/\.\d*//;
 296
 297         push @fields, $record->{ "CHR" };
 298         push @fields, $record->{ "CHR_BEG" };
 299         push @fields, $record->{ "CHR_END" } + 1;
 300         push @fields, $record->{ "Q_ID" };
 301         push @fields, $record->{ "SCORE" };
 302     }
 303     elsif ( $columns == 6 )
 304     {
 305         $record->{ "Q_ID" }  =~ s/\s+/_/g;
 306         $record->{ "SCORE" } =~ s/\.\d*//;
 307
 308         push @fields, $record->{ "CHR" };
 309         push @fields, $record->{ "CHR_BEG" };
 310         push @fields, $record->{ "CHR_END" } + 1;
 311         push @fields, $record->{ "Q_ID" };
 312         push @fields, $record->{ "SCORE" };
 313         push @fields, $record->{ "STRAND" };
 314     }
 315     else
 316     {
 317         $record->{ "Q_ID" }  =~ s/\s+/_/g;
 318         $record->{ "SCORE" } =~ s/\.\d*//;
 319
 320         push @fields, $record->{ "CHR" };
 321         push @fields, $record->{ "CHR_BEG" };
 322         push @fields, $record->{ "CHR_END" } + 1;
 323         push @fields, $record->{ "Q_ID" };
 324         push @fields, $record->{ "SCORE" };
 325         push @fields, $record->{ "STRAND" };
 326         push @fields, $record->{ "THICK_BEG" }     if defined $record->{ "THICK_BEG" };
 327         push @fields, $record->{ "THICK_END" } + 1 if defined $record->{ "THICK_END" };
 328         push @fields, $record->{ "ITEMRGB" }       if defined $record->{ "ITEMRGB" };
 329         push @fields, $record->{ "BLOCKCOUNT" }    if defined $record->{ "BLOCKCOUNT" };
 330         push @fields, $record->{ "BLOCKSIZES" }    if defined $record->{ "BLOCKSIZES" };
 331         push @fields, $record->{ "Q_BEGS" }        if defined $record->{ "Q_BEGS" };
 332     }
 333
 334     if ( $fh ) {
 335         print $fh join( "\t", @fields ), "\n";
 336     } else {
 337         print join( "\t", @fields ), "\n";
 338     }
 339 }
 340
 341
 342 sub bed_put_entries
 343 {
 344     # Martin A. Hansen, January 2008.
 345
 346     # Write a list of BED entries.
 347
 348     my ( $entries,   # list of entries,
 349          $fh,        # file handle - OPTIONAL
 350        ) = @_;
 351
 352     # Returns nothing.
 353
 354     map { bed_put_entry( $_, $fh ) } @{ $entries };
 355 }
 356
 357
 358 sub bed_analyze
 359 {
 360     # Martin A. Hansen, March 2008.
 361
 362     # Given a bed record, analysis this to give information
 363     # about intron/exon sizes.
 364
 365     my ( $entry,   # BED entry
 366        ) = @_;
 367
 368     # Returns hashref.
 369
 370     my ( $i, @begs, @lens, $exon_max, $exon_min, $exon_len, $exon_tot, $intron_max, $intron_min, $intron_len, $intron_tot );
 371
 372     $exon_max   = 0;
 373     $exon_min   = 9999999999;
 374     $intron_max = 0;
 375     $intron_min = 9999999999;
 376
 377     $entry->{ "EXONS" }   = $entry->{ "BLOCKCOUNT" };
 378
 379     @begs = split /,/, $entry->{ "Q_BEGS" };
 380     @lens = split /,/, $entry->{ "BLOCKSIZES" };
 381
 382     for ( $i = 0; $i < $entry->{ "BLOCKCOUNT" }; $i++ )
 383     {
 384         $exon_len = @lens[ $i ];
 385
 386         $entry->{ "EXON_LEN_$i" } = $exon_len;
 387
 388         $exon_max = $exon_len if $exon_len > $exon_max;
 389         $exon_min = $exon_len if $exon_len < $exon_min;
 390
 391         $exon_tot += $exon_len;
 392     }
 393
 394     $entry->{ "EXON_LEN_-1" }   = $exon_len;
 395     $entry->{ "EXON_MAX_LEN" }  = $exon_max;
 396     $entry->{ "EXON_MIN_LEN" }  = $exon_min;
 397     $entry->{ "EXON_MEAN_LEN" } = int( $exon_tot / $entry->{ "EXONS" } );
 398
 399     $entry->{ "INTRONS" } = $entry->{ "BLOCKCOUNT" } - 1;
 400     $entry->{ "INTRONS" } = 0 if $entry->{ "INTRONS" } < 0;
 401
 402     if ( $entry->{ "INTRONS" } )
 403     {
 404         for ( $i = 1; $i < $entry->{ "BLOCKCOUNT" }; $i++ )
 405         {
 406             $intron_len = @begs[ $i ] - ( @begs[ $i - 1 ] + @lens[ $i - 1 ] );
 407
 408             $entry->{ "INTRON_LEN_" . ( $i - 1 ) } = $intron_len;
 409
 410             $intron_max = $intron_len if $intron_len > $intron_max;
 411             $intron_min = $intron_len if $intron_len < $intron_min;
 412
 413             $intron_tot += $intron_len;
 414         }
 415
 416         $entry->{ "INTRON_LEN_-1" }   = $intron_len;
 417         $entry->{ "INTRON_MAX_LEN" }  = $intron_max;
 418         $entry->{ "INTRON_MIN_LEN" }  = $intron_min;
 419         $entry->{ "INTRON_MEAN_LEN" } = int( $intron_tot / $entry->{ "INTRONS" } );
 420     }
 421
 422     return wantarray ? %{ $entry } : $entry;
 423 }
 424
 425
 426 sub bed_sort
 427 {
 428     # Martin A. Hansen, Septermber 2008
 429
 430     # Sorts a BED file using the c program
 431     # "bed_sort" specifing a sort mode:
 432
 433     # 1: chr AND chr_beg.
 434     # 2: chr AND strand AND chr_beg.
 435     # 3: chr_beg.
 436     # 4: strand AND chr_beg.
 437
 438     my ( $bed_file,    # BED file to sort
 439          $sort_mode,   # See above.
 440          $cols,        # Number of columns in BED file
 441        ) = @_;
 442
 443     &Maasha::Common::run( "bed_sort", "--sort $sort_mode --cols $cols $bed_file" );
 444 }
 445
 446
 447 sub bed_split_to_files
 448 {
 449     # Martin A. Hansen, Septermber 2008
 450
 451     # Given a list of BED files, split these
 452     # into temporary files based on the chromosome
 453     # name. Returns a list of the temporary files.
 454
 455     my ( $bed_files,   # list of BED files to split
 456          $cols,        # number of columns
 457          $tmp_dir,     # temporary directory
 458        ) = @_;
 459
 460     # Returns a list.
 461
 462     my ( $bed_file, $fh_in, $entry, $key, %fh_hash, @tmp_files );
 463
 464     foreach $bed_file ( @{ $bed_files } )
 465     {
 466         $fh_in = Maasha::Common::read_open( $bed_file );
 467
 468         while ( $entry = bed_entry_get_array( $fh_in, $cols ) )
 469         {
 470             $key = $entry->[ CHR ];
 471
 472             $fh_hash{ $key } = Maasha::Common::write_open( "$tmp_dir/$key.temp" ) if not exists $fh_hash{ $key };
 473
 474             bed_entry_put_array( $entry, $fh_hash{ $key } );
 475         }
 476
 477         close $fh_in;
 478     }
 479
 480     foreach $key ( sort keys %fh_hash )
 481     {
 482         push @tmp_files, "$tmp_dir/$key.temp";
 483
 484         close $fh_hash{ $key };
 485     }
 486
 487     return wantarray ? @tmp_files : \@tmp_files;
 488 }
 489
 490
 491 sub bed_merge_entries
 492 {
 493     # Martin A. Hansen, February 2008.
 494
 495     # Merge a list of given BED entries in one big entry.
 496
 497     my ( $entries,     # list of BED entries to be merged
 498        ) = @_;
 499
 500     # Returns hash.
 501
 502     my ( $i, @q_ids, @q_begs, @blocksizes, @new_q_begs, @new_blocksizes, %new_entry );
 503
 504     @{ $entries } = sort { $a->{ "CHR_BEG" } <=> $b->{ "CHR_BEG" } } @{ $entries };
 505
 506     for ( $i = 0; $i < @{ $entries }; $i++ )
 507     {
 508         Maasha::Common::error( qq(Attempted merge of BED entries from different chromosomes) ) if $entries->[ 0 ]->{ "CHR" }    ne $entries->[ $i ]->{ "CHR" };
 509         Maasha::Common::error( qq(Attempted merge of BED entries from different strands) )     if $entries->[ 0 ]->{ "STRAND" } ne $entries->[ $i ]->{ "STRAND" };
 510
 511         push @q_ids, $entries->[ $i ]->{ "Q_ID" } || sprintf( "ID%06d", $i );
 512
 513         if ( exists $entries->[ $i ]->{ "Q_BEGS" } )
 514         {
 515             @q_begs     = split ",", $entries->[ $i ]->{ "Q_BEGS" };
 516             @blocksizes = split ",", $entries->[ $i ]->{ "BLOCKSIZES" };
 517         }
 518         else
 519         {
 520             @q_begs     = 0;
 521             @blocksizes = $entries->[ $i ]->{ "CHR_END" } - $entries->[ $i ]->{ "CHR_BEG" } + 1;
 522         }
 523
 524         map { $_ += $entries->[ $i ]->{ "CHR_BEG" } } @q_begs;
 525
 526         push @new_q_begs, @q_begs;
 527         push @new_blocksizes, @blocksizes;
 528     }
 529
 530     map { $_ -= $entries->[ 0 ]->{ "CHR_BEG" } } @new_q_begs;
 531
 532     %new_entry = (
 533         CHR         => $entries->[ 0 ]->{ "CHR" },
 534         CHR_BEG     => $entries->[ 0 ]->{ "CHR_BEG" },
 535         CHR_END     => $entries->[ -1 ]->{ "CHR_END" },
 536         REC_TYPE    => "BED",
 537         BED_LEN     => $entries->[ -1 ]->{ "CHR_END" } - $entries->[ 0 ]->{ "CHR_BEG" } + 1,
 538         BED_COLS    => 12,
 539         Q_ID        => join( ":", @q_ids ),
 540         SCORE       => 999,
 541         STRAND      => $entries->[ 0 ]->{ "STRAND" }     || "+",
 542         THICK_BEG   => $entries->[ 0 ]->{ "THICK_BEG" }  || $entries->[ 0 ]->{ "CHR_BEG" },
 543         THICK_END   => $entries->[ -1 ]->{ "THICK_END" } || $entries->[ -1 ]->{ "CHR_END" },
 544         ITEMRGB     => "0,0,0",
 545         BLOCKCOUNT  => scalar @new_q_begs,
 546         BLOCKSIZES  => join( ",", @new_blocksizes ),
 547         Q_BEGS      => join( ",", @new_q_begs ),
 548     );
 549
 550     return wantarray ? %new_entry : \%new_entry;
 551 }
 552
 553
 554 sub bed_split_entry
 555 {
 556     # Martin A. Hansen, February 2008.
 557
 558     # Splits a given BED entry into a list of blocks,
 559     # which are returned. A list of 6 column BED entry is returned.
 560
 561     my ( $entry,    # BED entry hashref
 562        ) = @_;
 563
 564     # Returns a list.
 565
 566     my ( @q_begs, @blocksizes, $block, @blocks, $i );
 567
 568     if ( exists $entry->{ "BLOCKCOUNT" } )
 569     {
 570         @q_begs     = split ",", $entry->{ "Q_BEGS" };
 571         @blocksizes = split ",", $entry->{ "BLOCKSIZES" };
 572
 573         for ( $i = 0; $i < @q_begs; $i++ )
 574         {
 575             undef $block;
 576
 577             $block->{ "CHR" }      = $entry->{ "CHR" };
 578             $block->{ "CHR_BEG" }  = $entry->{ "CHR_BEG" } + $q_begs[ $i ];
 579             $block->{ "CHR_END" }  = $entry->{ "CHR_BEG" } + $q_begs[ $i ] + $blocksizes[ $i ] - 1;
 580             $block->{ "Q_ID" }     = $entry->{ "Q_ID" } . sprintf( "_%03d", $i );
 581             $block->{ "SCORE" }    = $entry->{ "SCORE" };
 582             $block->{ "STRAND" }   = $entry->{ "STRAND" };
 583             $block->{ "BED_LEN" }  = $block->{ "CHR_END" } - $block->{ "CHR_BEG" } + 1,
 584             $block->{ "BED_COLS" } = 6;
 585             $block->{ "REC_TYPE" } = "BED";
 586
 587             push @blocks, $block;
 588         }
 589     }
 590     else
 591     {
 592         @blocks = @{ $entry };
 593     }
 594
 595     return wantarray ? @blocks : \@blocks;
 596 }
 597
 598
 599
 600 sub bed_overlap
 601 {
 602     # Martin A. Hansen, February 2008.
 603
 604     # Checks if two BED entries overlap and
 605     # return 1 if so - else 0;
 606
 607     my ( $entry1,      # hashref
 608          $entry2,      # hashref
 609          $no_strand,   # don't check strand flag - OPTIONAL
 610        ) = @_;
 611
 612     # Return bolean.
 613
 614     return 0 if $entry1->{ "CHR" }    ne $entry2->{ "CHR" };
 615     return 0 if $entry1->{ "STRAND" } ne $entry2->{ "STRAND" };
 616
 617     if ( $entry1->{ "CHR_END" } < $entry2->{ "CHR_BEG" } or $entry1->{ "CHR_BEG" } > $entry2->{ "CHR_END" } ) {
 618         return 0;
 619     } else {
 620         return 1;
 621     }
 622 }
 623
 624
 625 sub bed_upload_to_ucsc
 626 {
 627     # Martin A. Hansen, September 2007.
 628
 629     # Upload a BED file to the UCSC database.
 630
 631     my ( $tmp_dir,   # temporary directory
 632          $file,      # file to upload,
 633          $options,   # argument hashref
 634          $append,    # flag indicating table should be appended
 635        ) = @_;
 636
 637     # Returns nothing.
 638
 639     my ( $args, $table, $sql_file, $fh_out, $fh_in );
 640
 641     if ( $append ) {
 642         $args = join " ", $options->{ "database" }, $options->{ "table" }, "-tmpDir=$tmp_dir", "-oldTable", $file;
 643     } else {
 644         $args = join " ", $options->{ "database" }, $options->{ "table" }, "-tmpDir=$tmp_dir", $file;
 645     }
 646
 647     if ( $options->{ "table" } =~ /rnaSecStr/ )
 648     {
 649         $table = $options->{ "table" };
 650
 651         print qq(uploading secondary structure table:"$table"\n) if $options->{ "verbose" };
 652
 653         $sql_file = "$tmp_dir/upload_RNA_SS.sql";
 654
 655         $fh_out   = Maasha::Common::write_open( $sql_file );
 656
 657         print $fh_out qq(
 658 CREATE TABLE $table (
 659     bin smallint not null,              # Bin number for browser speedup
 660     chrom varchar(255) not null,        # Chromosome or FPC contig
 661     chromStart int unsigned not null,   # Start position in chromosome
 662     chromEnd int unsigned not null,     # End position in chromosome
 663     name varchar(255) not null,         # Name of item
 664     score int unsigned not null,        # Score from 0-1000
 665     strand char(1) not null,            # + or -
 666     size int unsigned not null,         # Size of element.
 667     secStr longblob not null,           # Parentheses and '.'s which define the secondary structure
 668     conf longblob not null,             # Confidence of secondary-structure annotation per position (0.0-1.0).
 669     #Indices
 670     INDEX(name(16)),
 671     INDEX(chrom(8), bin),
 672     INDEX(chrom(8), chromStart)
 673 );
 674         );
 675
 676         close $fh_out;
 677
 678         Maasha::Common::run( "hgLoadBed", "-notItemRgb -sqlTable=$sql_file $options->{ 'database' } $options->{ 'table' } -tmpDir=$tmp_dir $file > /dev/null 2>&1" );
 679
 680         unlink $sql_file;
 681     }
 682     else
 683     {
 684         Maasha::Common::run( "hgLoadBed", "$args > /dev/null 2>&1" );
 685     }
 686 }
 687
 688
 689 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PSL format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 690
 691 sub psl2record
 692 {
 693     # Martin A. Hansen, November 2008
 694
 695     # Converts PSL record keys to Biopiece record keys.
 696
 697     my ( $psl,   # hashref
 698        ) = @_;
 699
 700     # returns hashref
 701
 702     my %record;
 703
 704     %record = (
 705         REC_TYPE    => "PSL",
 706         BLOCKSIZES  => $psl->{ 'blockSize' },
 707         SNUMINSERT  => $psl->{ 'tNumInsert' },
 708         Q_END       => $psl->{ 'qEnd' },
 709         SBASEINSERT => $psl->{ 'tBaseInsert' },
 710         S_END       => $psl->{ 'tEnd' },
 711         QBASEINSERT => $psl->{ 'qBaseInsert' },
 712         REPMATCHES  => $psl->{ 'repMatches' },
 713         QNUMINSERT  => $psl->{ 'qNumInsert' },
 714         MISMATCHES  => $psl->{ 'misMatches' },
 715         BLOCKCOUNT  => $psl->{ 'blockCount' },
 716         Q_LEN       => $psl->{ 'qSize' },
 717         S_ID        => $psl->{ 'tName' },
 718         STRAND      => $psl->{ 'strand' },
 719         Q_ID        => $psl->{ 'qName' },
 720         MATCHES     => $psl->{ 'matches' },
 721         S_LEN       => $psl->{ 'tSize' },
 722         NCOUNT      => $psl->{ 'nCount' },
 723         Q_BEGS      => $psl->{ 'qStarts' },
 724         S_BEGS      => $psl->{ 'tStarts' },
 725         S_BEG       => $psl->{ 'tStart' },
 726         Q_BEG       => $psl->{ 'qStart ' },
 727     );
 728
 729     $record{ "SCORE" } = $record{ "MATCHES" } + int( $record{ "REPMATCHES" } / 2 ) - $record{ "MISMATCHES" } - $record{ "QNUMINSERT" } - $record{ "SNUMINSERT" };
 730     $record{ "SPAN" }  = $record{ "Q_END" } - $record{ "Q_BEG" };
 731
 732     return wantarray ? %record : \%record;
 733 }
 734
 735
 736 sub psl_get_entry
 737 {
 738     # Martin A. Hansen, August 2008.
 739
 740     # Reads PSL next entry from a PSL file and returns a record.
 741
 742     my ( $fh,   # file handle of PSL filefull path to PSL file
 743        ) = @_;
 744
 745     # Returns hashref.
 746
 747     my ( $line, @fields, %record );
 748
 749     while ( $line = <$fh> )
 750     {
 751         chomp $line;
 752
 753         @fields = split "\t", $line;
 754
 755         if ( scalar @fields == 21 )
 756         {
 757             %record = (
 758                 REC_TYPE    => "PSL",
 759                 MATCHES     => $fields[ 0 ],
 760                 MISMATCHES  => $fields[ 1 ],
 761                 REPMATCHES  => $fields[ 2 ],
 762                 NCOUNT      => $fields[ 3 ],
 763                 QNUMINSERT  => $fields[ 4 ],
 764                 QBASEINSERT => $fields[ 5 ],
 765                 SNUMINSERT  => $fields[ 6 ],
 766                 SBASEINSERT => $fields[ 7 ],
 767                 STRAND      => $fields[ 8 ],
 768                 Q_ID        => $fields[ 9 ],
 769                 Q_LEN       => $fields[ 10 ],
 770                 Q_BEG       => $fields[ 11 ],
 771                 Q_END       => $fields[ 12 ] - 1,
 772                 S_ID        => $fields[ 13 ],
 773                 S_LEN       => $fields[ 14 ],
 774                 S_BEG       => $fields[ 15 ],
 775                 S_END       => $fields[ 16 ] - 1,
 776                 BLOCKCOUNT  => $fields[ 17 ],
 777                 BLOCKSIZES  => $fields[ 18 ],
 778                 Q_BEGS      => $fields[ 19 ],
 779                 S_BEGS      => $fields[ 20 ],
 780             );
 781
 782             $record{ "SCORE" } = $record{ "MATCHES" } + int( $record{ "REPMATCHES" } / 2 ) - $record{ "MISMATCHES" } - $record{ "QNUMINSERT" } - $record{ "SNUMINSERT" };
 783             $record{ "SPAN" }  = $fields[ 12 ] - $fields[ 11 ];
 784
 785             return wantarray ? %record : \%record;
 786         }
 787     }
 788
 789     return undef;
 790 }
 791
 792
 793 sub psl_get_entries
 794 {
 795     # Martin A. Hansen, February 2008.
 796
 797     # Reads PSL entries and returns a list of records.
 798
 799     my ( $path,   # full path to PSL file
 800        ) = @_;
 801
 802     # Returns hashref.
 803
 804     my ( $fh, @lines, @fields, $i, %record, @records );
 805
 806     $fh = Maasha::Common::read_open( $path );
 807
 808     @lines = <$fh>;
 809
 810     close $fh;
 811
 812     chomp @lines;
 813
 814     for ( $i = 5; $i < @lines; $i++ )
 815     {
 816         @fields = split "\t", $lines[ $i ];
 817
 818         Maasha::Common::error( qq(Bad PSL format in file "$path") ) if not @fields == 21;
 819
 820         undef %record;
 821
 822         %record = (
 823             REC_TYPE    => "PSL",
 824             MATCHES     => $fields[ 0 ],
 825             MISMATCHES  => $fields[ 1 ],
 826             REPMATCHES  => $fields[ 2 ],
 827             NCOUNT      => $fields[ 3 ],
 828             QNUMINSERT  => $fields[ 4 ],
 829             QBASEINSERT => $fields[ 5 ],
 830             SNUMINSERT  => $fields[ 6 ],
 831             SBASEINSERT => $fields[ 7 ],
 832             STRAND      => $fields[ 8 ],
 833             Q_ID        => $fields[ 9 ],
 834             Q_LEN       => $fields[ 10 ],
 835             Q_BEG       => $fields[ 11 ],
 836             Q_END       => $fields[ 12 ] - 1,
 837             S_ID        => $fields[ 13 ],
 838             S_LEN       => $fields[ 14 ],
 839             S_BEG       => $fields[ 15 ],
 840             S_END       => $fields[ 16 ] - 1,
 841             BLOCKCOUNT  => $fields[ 17 ],
 842             BLOCKSIZES  => $fields[ 18 ],
 843             Q_BEGS      => $fields[ 19 ],
 844             S_BEGS      => $fields[ 20 ],
 845         );
 846
 847         $record{ "SCORE" } = $record{ "MATCHES" } + int( $record{ "REPMATCHES" } / 2 ) - $record{ "MISMATCHES" } - $record{ "QNUMINSERT" } - $record{ "SNUMINSERT" };
 848
 849         push @records, { %record };
 850     }
 851
 852     return wantarray ? @records : \@records;
 853 }
 854
 855
 856 sub psl_put_header
 857 {
 858     # Martin A. Hansen, September 2007.
 859
 860     # Write a PSL header to file.
 861
 862     my ( $fh,  # file handle  - OPTIONAL
 863        ) = @_;
 864
 865     # Returns nothing.
 866
 867     $fh = \*STDOUT if not $fh;
 868
 869     print $fh qq(psLayout version 3
 870 match   mis-    rep.    N's     Q gap   Q gap   T gap   T gap   strand  Q               Q       Q       Q       T               T       T       T       block   blockSizes      qStart        match   match           count   bases   count   bases           name            size    start   end     name            size    start   end     count
 871 ---------------------------------------------------------------------------------------------------------------------------------------------------------------
 872 );
 873 }
 874
 875
 876 sub psl_put_entry
 877 {
 878     # Martin A. Hansen, September 2007.
 879
 880     # Write a PSL entry to file.
 881
 882     my ( $record,       # hashref
 883          $fh,           # file handle  -  OPTIONAL
 884        ) = @_;
 885
 886     # Returns nothing.
 887
 888     $fh = \*STDOUT if not $fh;
 889
 890     my @output;
 891
 892     push @output, $record->{ "MATCHES" };
 893     push @output, $record->{ "MISMATCHES" };
 894     push @output, $record->{ "REPMATCHES" };
 895     push @output, $record->{ "NCOUNT" };
 896     push @output, $record->{ "QNUMINSERT" };
 897     push @output, $record->{ "QBASEINSERT" };
 898     push @output, $record->{ "SNUMINSERT" };
 899     push @output, $record->{ "SBASEINSERT" };
 900     push @output, $record->{ "STRAND" };
 901     push @output, $record->{ "Q_ID" };
 902     push @output, $record->{ "Q_LEN" };
 903     push @output, $record->{ "Q_BEG" };
 904     push @output, $record->{ "Q_END" } + 1;
 905     push @output, $record->{ "S_ID" };
 906     push @output, $record->{ "S_LEN" };
 907     push @output, $record->{ "S_BEG" };
 908     push @output, $record->{ "S_END" } + 1;
 909     push @output, $record->{ "BLOCKCOUNT" };
 910     push @output, $record->{ "BLOCKSIZES" };
 911     push @output, $record->{ "Q_BEGS" };
 912     push @output, $record->{ "S_BEGS" };
 913
 914     print $fh join( "\t", @output ), "\n";
 915 }
 916
 917
 918 sub psl_upload_to_ucsc
 919 {
 920     # Martin A. Hansen, September 2007.
 921
 922     # Upload a PSL file to the UCSC database.
 923
 924     my ( $file,      # file to upload,
 925          $options,   # argument hashref
 926          $append,    # flag indicating table should be appended
 927        ) = @_;
 928
 929     # Returns nothing.
 930
 931     my ( $args );
 932
 933     if ( $append ) {
 934         $args = join " ", $options->{ "database" }, "-table=$options->{ 'table' }", "-clientLoad", "-append", $file;
 935     } else {
 936         $args = join " ", $options->{ "database" }, "-table=$options->{ 'table' }", "-clientLoad", $file;
 937     }
 938
 939     Maasha::Common::run( "hgLoadPsl", "$args > /dev/null 2>&1" );
 940 }
 941
 942
 943 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> TRACK FILE <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 944
 945
 946 sub ucsc_config_get_entry
 947 {
 948     # Martin A. Hansen, November 2008.
 949
 950     # Given a filehandle to a UCSC Genome Browser
 951     # config file (.ra) get the next entry and
 952     # return as a hash. Entries are separated by blank
 953     # lines. # lines are skipped unless it is the lines:
 954     # # Track added ...
 955     # # Database ...
 956
 957     my ( $fh,   # file hanlde
 958        ) = @_;
 959
 960     # Returns a hashref.
 961
 962     my ( $line, %record );
 963
 964     while ( $line = <$fh> )
 965     {
 966         if ( $line =~ /Track added by 'upload_to_ucsc' (\S+) (\S+)/) {
 967              $record{ 'date' } = $1;
 968              $record{ 'time' } = $2;
 969         } elsif ( $line =~ /^# date: (.+)/ ) {
 970             $record{ 'date' } = $1;
 971         } elsif ( $line =~ /^# time: (.+)/ ) {
 972             $record{ 'time' } = $1;
 973         } elsif ( $line =~ /^# (?:database:|Database) (.+)/ ) {
 974             $record{ 'database' } = $1;
 975         } elsif ( $line =~ /^#/ ) {
 976             # skip
 977         } elsif ( $line =~ /(\S+)\s+(.+)/ ) {
 978             $record{ $1 } = $2;
 979         }
 980
 981         last if $line =~ /^$/ and exists $record{ "track" };
 982     }
 983
 984     return undef if not exists $record{ "track" };
 985
 986     return wantarray ? %record : \%record;
 987 }
 988
 989
 990 sub ucsc_config_put_entry
 991 {
 992     # Martin A. Hansen, November 2008.
 993
 994     # Outputs a Biopiece record (a hashref)
 995     # to a filehandle or STDOUT.
 996
 997     my ( $record,    # hashref
 998          $fh_out,    # file handle
 999        ) = @_;
1000
1001     # Returns nothing.
1002
1003     my ( $date, $time );
1004
1005     $fh_out ||= \*STDOUT;
1006
1007     ( $date, $time ) = split " ", Maasha::Common::time_stamp();
1008
1009     $record->{ "date" } ||= $date;
1010     $record->{ "time" } ||= $time;
1011
1012     print $fh_out "\n# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n\n";
1013
1014     map { print $fh_out "# $_: $record->{ $_ }\n" if exists $record->{ $_ } } qw( date time database );
1015     map { print $fh_out "$_ $record->{ $_ }\n" if exists $record->{ $_ } } qw( track
1016                                                                                name
1017                                                                                description
1018                                                                                itemRgb
1019                                                                                db
1020                                                                                offset
1021                                                                                url
1022                                                                                htmlUrl
1023                                                                                shortLabel
1024                                                                                longLabel
1025                                                                                group
1026                                                                                priority
1027                                                                                useScore
1028                                                                                visibility
1029                                                                                maxHeightPixels
1030                                                                                color
1031                                                                                type );
1032 }
1033
1034
1035 sub update_my_tracks
1036 {
1037     # Martin A. Hansen, September 2007.
1038
1039     # Update the /home/user/ucsc/my_tracks.ra file and executes makeCustomTracks.pl
1040
1041     my ( $options,   # hashref
1042          $type,      # track type
1043        ) = @_;
1044
1045     # Returns nothing.
1046
1047     my ( $file, $fh_in, $fh_out, $line, $time );
1048
1049     $file = $ENV{ "HOME" } . "/ucsc/my_tracks.ra";
1050
1051     # ---- create a backup ----
1052
1053     $fh_in  = Maasha::Common::read_open( $file );
1054     $fh_out = Maasha::Common::write_open( "$file~" );
1055
1056     while ( $line = <$fh_in> ) {
1057         print $fh_out $line;
1058     }
1059
1060     close $fh_in;
1061     close $fh_out;
1062
1063     # ---- append track ----
1064
1065     $time = Maasha::Common::time_stamp();
1066
1067     $fh_out = Maasha::Common::append_open( $file );
1068
1069     if ( $type eq "sec_struct" )
1070     {
1071         print $fh_out "\n\n# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n";
1072
1073         print $fh_out "\n# Track added by 'upload_to_ucsc' $time\n\n";
1074
1075         print $fh_out "# Database $options->{ 'database' }\n\n";
1076
1077         print $fh_out "track $options->{ 'table' }\n";
1078         print $fh_out "shortLabel $options->{ 'short_label' }\n";
1079         print $fh_out "longLabel $options->{ 'long_label' }\n";
1080         print $fh_out "group $options->{ 'group' }\n";
1081         print $fh_out "priority $options->{ 'priority' }\n";
1082         print $fh_out "visibility $options->{ 'visibility' }\n";
1083         print $fh_out "color $options->{ 'color' }\n";
1084         print $fh_out "type bed 6 +\n";
1085         print $fh_out "mafTrack multiz17way\n";
1086
1087         print $fh_out "\n# //\n";
1088     }
1089     else
1090     {
1091         print $fh_out "\n\n# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n";
1092
1093         print $fh_out "\n# Track added by 'upload_to_ucsc' $time\n\n";
1094
1095         print $fh_out "# Database $options->{ 'database' }\n\n";
1096
1097         print $fh_out "track $options->{ 'table' }\n";
1098         print $fh_out "shortLabel $options->{ 'short_label' }\n";
1099         print $fh_out "longLabel $options->{ 'long_label' }\n";
1100         print $fh_out "group $options->{ 'group' }\n";
1101         print $fh_out "priority $options->{ 'priority' }\n";
1102         print $fh_out "useScore 1\n" if $options->{ 'use_score' };
1103         print $fh_out "visibility $options->{ 'visibility' }\n";
1104         print $fh_out "maxHeightPixels 50:50:11\n" if $type eq "wig 0";
1105         print $fh_out "color $options->{ 'color' }\n";
1106         print $fh_out "type $type\n";
1107
1108         print $fh_out "\n# //\n";
1109     }
1110
1111     close $fh_out;
1112
1113     Maasha::Common::run( "ucscMakeTracks.pl", "-b > /dev/null 2>&1" );
1114 }
1115
1116
1117 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> fixedStep format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1118
1119
1120 sub fixedstep_get_entry
1121 {
1122     # Martin A. Hansen, December 2007.
1123
1124     # Given a file handle to a PhastCons file get the
1125     # next entry which is all the lines after a "fixedStep"
1126     # line and until the next "fixedStep" line or EOF.
1127
1128     my ( $fh,   # filehandle
1129        ) = @_;
1130
1131     # Returns a list of lines
1132
1133     my ( $entry, @lines );
1134
1135     local $/ = "\nfixedStep ";
1136
1137     $entry = <$fh>;
1138
1139     chomp $entry;
1140
1141     @lines = split "\n", $entry;
1142
1143     return if @lines == 0;
1144
1145     $lines[ 0 ] =~ s/fixedStep?\s*//;
1146
1147     return wantarray ? @lines : \@lines;
1148 }
1149
1150
1151 sub fixedstep_index_create
1152 {
1153     # Martin A. Hansen, January 2008.
1154
1155     # Indexes a concatenated fixedStep file.
1156     # The index consists of a hash with chromosomes as keys,
1157     # and a list of [ chr_beg, next_chr_beg, chr_end, index_beg, index_len ] as values.
1158
1159     my ( $path,   # path to fixedStep file
1160        ) = @_;
1161
1162     # Returns a hashref
1163
1164     my ( $fh, $pos, $index_beg, $index_len, $entry, $locator, $chr, $step, $beg, $end, $len, %index, $i );
1165
1166     $fh = Maasha::Common::read_open( $path );
1167
1168     $pos = 0;
1169
1170     while ( $entry = Maasha::UCSC::fixedstep_get_entry( $fh ) )
1171     {
1172         $locator = shift @{ $entry };
1173
1174         if ( $locator =~ /chrom=([^ ]+) start=(\d+) step=(\d+)/ )
1175         {
1176             $chr  = $1;
1177             $beg  = $2 - 1;  #  fixedStep files are 1-based
1178             $step = $3;
1179         }
1180         else
1181         {
1182             Maasha::Common::error( qq(Could not parse locator: $locator) );
1183         }
1184
1185         $pos += length( $locator ) + 11;
1186
1187         $index_beg = $pos;
1188
1189 #        map { $pos += length( $_ ) + 1 } @{ $entry };
1190
1191         $pos += 6 * scalar @{ $entry };
1192
1193         $index_len = $pos - $index_beg;
1194
1195         push @{ $index{ $chr } }, [ $beg, undef, $beg + scalar @{ $entry } - 1, $index_beg, $index_len ];
1196     }
1197
1198     close $fh;
1199
1200     foreach $chr ( keys %index )
1201     {
1202         for ( $i = 0; $i < @{ $index{ $chr } } - 1; $i++ ) {
1203             $index{ $chr }->[ $i ]->[ FS_NEXT_CHR_BEG ] = $index{ $chr }->[ $i + 1 ]->[ 0 ];
1204         }
1205
1206         $index{ $chr }->[ -1 ]->[ FS_NEXT_CHR_BEG ] = $index{ $chr }->[ -1 ]->[ FS_CHR_END ] + 1;
1207     }
1208
1209     return wantarray ? %index : \%index;
1210 }
1211
1212
1213 sub fixedstep_index_store
1214 {
1215     # Martin A. Hansen, January 2008.
1216
1217     # Writes a fixedStep index to binary file.
1218
1219     my ( $path,   # full path to file
1220          $index,  # list with index
1221        ) = @_;
1222
1223     # returns nothing
1224
1225     Maasha::Common::file_store( $path, $index );
1226 }
1227
1228
1229 sub fixedstep_index_retrieve
1230 {
1231     # Martin A. Hansen, January 2008.
1232
1233     # Retrieves a fixedStep index from binary file.
1234
1235     my ( $path,   # full path to file
1236        ) = @_;
1237
1238     # returns list
1239
1240     my $index;
1241
1242     $index = Maasha::Common::file_retrieve( $path );
1243
1244     return wantarray ? %{ $index } : $index;
1245 }
1246
1247
1248 sub fixedstep_index_lookup
1249 {
1250     # Martin A. Hansen, January 2008.
1251
1252     # Retrieve fixedStep scores from a indexed
1253     # fixedStep file given a chromosome and
1254     # begin and end positions.
1255
1256     my ( $index,     # data structure
1257          $fh,        # filehandle to datafile
1258          $chr,       # chromosome
1259          $chr_beg,   # chromosome beg
1260          $chr_end,   # chromosome end
1261          $flank,     # include flanking region - OPTIONAL
1262        ) = @_;
1263
1264     # Returns a list
1265
1266     my ( $index_beg, $index_end, $i, $c, $beg, $end, @vals, $scores );
1267
1268     $flank ||= 0;
1269
1270     $chr_beg -= $flank;
1271     $chr_end += $flank;
1272
1273     # print "chr_beg->$chr_beg   chr_end->$chr_end   flank->$flank\n";
1274
1275     if ( exists $index->{ $chr } )
1276     {
1277         $index_beg = Maasha::Matrix::interval_search( $index->{ $chr }, 0, 1, $chr_beg );
1278
1279         if ( $index_beg < 0 ) {
1280             Maasha::Common::error( qq(Index search failed - begin index position doesn't exists: $chr_beg) );
1281         }
1282
1283         if ( $chr_end < $index->{ $chr }->[ $index_beg ]->[ 1 ] )
1284         {
1285             $index_end = $index_beg;
1286         }
1287         else
1288         {
1289             $index_end = Maasha::Matrix::interval_search( $index->{ $chr }, 0, 1, $chr_end );
1290
1291             if ( $index_end < 0 ) {
1292                 Maasha::Common::error( qq(Index search failed - end index position doesn't exists: $chr_end) );
1293             }
1294         }
1295
1296         map { $scores->[ $_ ] = 0 } 0 .. $chr_end - $chr_beg;
1297
1298         if ( $index_beg == $index_end )
1299         {
1300             $beg = Maasha::Calc::max( $chr_beg, $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] );
1301             $end = Maasha::Calc::min( $chr_end, $index->{ $chr }->[ $index_end ]->[ FS_CHR_END ] );
1302
1303             if ( $beg <= $index->{ $chr }->[ $index_beg ]->[ FS_CHR_END ] and $end >= $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] )
1304             {
1305                 @vals = split "\n", Maasha::Common::file_read(
1306                     $fh,
1307                     $index->{ $chr }->[ $index_beg ]->[ FS_INDEX_BEG ] + 6 * ( $beg - $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] ),
1308                     6 * ( $end - $beg + 1 ),
1309                 );
1310             }
1311
1312             for ( $c = 0; $c < @vals; $c++ ) {
1313                 $scores->[ $c + $beg - $chr_beg ] = $vals[ $c ];
1314             }
1315         }
1316         else
1317         {
1318             $beg = Maasha::Calc::max( $chr_beg, $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] );
1319
1320 #            print Dumper( $beg, $index->{ $chr }->[ $index_beg ] );
1321 #            print Dumper( "next", $index->{ $chr }->[ $index_beg ]->[ FS_NEXT_CHR_BEG ] );
1322
1323             #      beg         next
1324             #      v           v
1325             #  |||||||||.......
1326
1327             if ( $beg <= $index->{ $chr }->[ $index_beg ]->[ FS_CHR_END ] )
1328             {
1329                 @vals = split "\n", Maasha::Common::file_read(
1330                     $fh,
1331                     $index->{ $chr }->[ $index_beg ]->[ FS_INDEX_BEG ] + 6 * ( $beg - $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] ),
1332                     6 * ( $index->{ $chr }->[ $index_beg ]->[ FS_CHR_END ] - $beg + 1 ),
1333                 );
1334
1335                 for ( $c = 0; $c < @vals; $c++ ) {
1336                     $scores->[ $c + $beg - $chr_beg ] = $vals[ $c ];
1337                 }
1338             }
1339
1340             $end = Maasha::Calc::min( $chr_end, $index->{ $chr }->[ $index_end ]->[ FS_CHR_END ] );
1341
1342             if ( $end <= $index->{ $chr }->[ $index_end ]->[ FS_CHR_END ] )
1343             {
1344                 @vals = split "\n", Maasha::Common::file_read(
1345                     $fh,
1346                     $index->{ $chr }->[ $index_end ]->[ FS_INDEX_BEG ],
1347                     6 * ( $end - $index->{ $chr }->[ $index_end ]->[ FS_CHR_BEG ] + 1 ),
1348                 );
1349
1350                 for ( $c = 0; $c < @vals; $c++ ) {
1351                     $scores->[ $c + $index->{ $chr }->[ $index_end ]->[ FS_CHR_BEG ] - $chr_beg ] = $vals[ $c ];
1352                 }
1353             }
1354
1355             for ( $i = $index_beg + 1; $i <= $index_end - 1; $i++ )
1356             {
1357                 @vals = split "\n", Maasha::Common::file_read(
1358                     $fh,
1359                     $index->{ $chr }->[ $i ]->[ FS_INDEX_BEG ],
1360                     6 * ( $index->{ $chr }->[ $i ]->[ FS_CHR_END ] - $index->{ $chr }->[ $i ]->[ FS_CHR_BEG ] + 1 ),
1361                 );
1362
1363                 for ( $c = 0; $c < @vals; $c++ ) {
1364                     $scores->[ $c + $index->{ $chr }->[ $i ]->[ FS_CHR_BEG ] - $chr_beg ] = $vals[ $c ];
1365                 }
1366             }
1367         }
1368     }
1369     else
1370     {
1371         Maasha::Common::error( qq(Chromosome "$chr" was not found in index) );
1372     }
1373
1374     return wantarray ? @{ $scores } : $scores;
1375 }
1376
1377
1378 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PhastCons format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1379
1380
1381 sub phastcons_index
1382 {
1383     # Martin A. Hansen, July 2008
1384
1385     # Create a fixedStep index for PhastCons data.
1386
1387     my ( $file,   # file to index
1388          $dir,    # dir with file
1389        ) = @_;
1390
1391     # Returns nothing.
1392
1393     my ( $index );
1394
1395     $index = fixedstep_index_create( "$dir/$file" );
1396
1397     fixedstep_index_store( "$dir/$file.index", $index );
1398 }
1399
1400
1401 sub phastcons_parse_entry
1402 {
1403     # Martin A. Hansen, December 2007.
1404
1405     # Given a PhastCons entry converts this to a
1406     # list of super blocks.
1407
1408     my ( $lines,   # list of lines
1409          $args,    # argument hash
1410        ) = @_;
1411
1412     # Returns
1413
1414     my ( $info, $chr, $beg, $step, $i, $c, $j, @blocks, @super_blocks, @entries, $super_block, $block, @lens, @begs );
1415
1416     $info = shift @{ $lines };
1417
1418     if ( $info =~ /^chrom=([^ ]+) start=(\d+) step=(\d+)$/ )
1419     {
1420         $chr  = $1;
1421         $beg  = $2;
1422         $step = $3;
1423
1424         die qq(ERROR: step size $step != 1 -> problem!\n) if $step != 1; # in an ideal world should would be fixed ...
1425     }
1426
1427     $i = 0;
1428
1429     while ( $i < @{ $lines } )
1430     {
1431         if ( $lines->[ $i ] >= $args->{ "threshold" } )
1432         {
1433             $c = $i + 1;
1434
1435             while ( $c < @{ $lines } )
1436             {
1437                 if ( $lines->[ $c ] < $args->{ "threshold" } )
1438                 {
1439                     $j = $c + 1;
1440
1441                     while ( $j < @{ $lines } and $lines->[ $j ] < $args->{ "threshold" } ) {
1442                         $j++;
1443                     }
1444
1445                     if ( $j - $c > $args->{ "gap" } )
1446                     {
1447                         if ( $c - $i >= $args->{ "min" } )
1448                         {
1449                             push @blocks, {
1450                                 CHR     => $chr,
1451                                 CHR_BEG => $beg + $i - 1,
1452                                 CHR_END => $beg + $c - 2,
1453                                 CHR_LEN => $c - $i,
1454                             };
1455                         }
1456
1457                         $i = $j;
1458
1459                         last;
1460                     }
1461
1462                     $c = $j
1463                 }
1464                 else
1465                 {
1466                     $c++;
1467                 }
1468             }
1469
1470             if ( $c - $i >= $args->{ "min" } )
1471             {
1472                 push @blocks, {
1473                     CHR     => $chr,
1474                     CHR_BEG => $beg + $i - 1,
1475                     CHR_END => $beg + $c - 2,
1476                     CHR_LEN => $c - $i,
1477                 };
1478             }
1479
1480             $i = $c;
1481         }
1482         else
1483         {
1484             $i++;
1485         }
1486     }
1487
1488     $i = 0;
1489
1490     while ( $i < @blocks )
1491     {
1492         $c = $i + 1;
1493
1494         while ( $c < @blocks and $blocks[ $c ]->{ "CHR_BEG" } - $blocks[ $c - 1 ]->{ "CHR_END" } <= $args->{ "dist" } )
1495         {
1496             $c++;
1497         }
1498
1499         push @super_blocks, [ @blocks[ $i .. $c - 1 ] ];
1500
1501         $i = $c;
1502     }
1503
1504     foreach $super_block ( @super_blocks )
1505     {
1506         foreach $block ( @{ $super_block } )
1507         {
1508             push @begs, $block->{ "CHR_BEG" } - $super_block->[ 0 ]->{ "CHR_BEG" };
1509             push @lens, $block->{ "CHR_LEN" } - 1;
1510         }
1511
1512         $lens[ -1 ]++;
1513
1514         push @entries, {
1515             CHR        => $super_block->[ 0 ]->{ "CHR" },
1516             CHR_BEG    => $super_block->[ 0 ]->{ "CHR_BEG" },
1517             CHR_END    => $super_block->[ -1 ]->{ "CHR_END" },
1518             Q_ID       => "Q_ID",
1519             SCORE      => 100,
1520             STRAND     => "+",
1521             THICK_BEG  => $super_block->[ 0 ]->{ "CHR_BEG" },
1522             THICK_END  => $super_block->[ -1 ]->{ "CHR_END" } + 1,
1523             ITEMRGB    => "0,200,100",
1524             BLOCKCOUNT => scalar @{ $super_block },
1525             BLOCKSIZES => join( ",", @lens ),
1526             Q_BEGS     => join( ",", @begs ),
1527         };
1528
1529         undef @begs;
1530         undef @lens;
1531     }
1532
1533     return wantarray ? @entries : \@entries;
1534 }
1535
1536
1537 sub phastcons_normalize
1538 {
1539     # Martin A. Hansen, January 2008.
1540
1541     # Normalizes a list of lists with PhastCons scores,
1542     # in such a way that each list contains the same number
1543     # or PhastCons scores.
1544
1545     my ( $AoA,    # AoA with PhastCons scores
1546        ) = @_;
1547
1548     # Returns AoA.
1549
1550     my ( $list, $max, $min, $mean, $diff );
1551
1552     $min = 99999999;
1553     $max = 0;
1554
1555     foreach $list ( @{ $AoA } )
1556     {
1557         $min = scalar @{ $list } if scalar @{ $list } < $min;
1558         $max = scalar @{ $list } if scalar @{ $list } > $max;
1559     }
1560
1561     $mean = int( ( $min + $max ) / 2 );
1562
1563 #    print STDERR "min->$min   max->$max   mean->$mean\n";
1564
1565     foreach $list ( @{ $AoA } )
1566     {
1567         $diff = scalar @{ $list } - $mean;
1568
1569         phastcons_list_inflate( $list, abs( $diff ) ) if $diff < 0;
1570         phastcons_list_deflate( $list, $diff )        if $diff > 0;
1571     }
1572
1573     return wantarray ? @{ $AoA } : $AoA;
1574 }
1575
1576
1577 sub phastcons_list_inflate
1578 {
1579     # Martin A. Hansen, January 2008.
1580
1581     # Inflates a list with a given number of elements
1582     # in such a way that the extra elements are introduced
1583     # evenly over the entire length of the list. The value
1584     # of the extra elements is based on a mean of the
1585     # adjacent elements.
1586
1587     my ( $list,   # list of elements
1588          $diff,   # number of elements to introduce
1589        ) = @_;
1590
1591     # Returns nothing
1592
1593     my ( $len, $space, $i, $pos );
1594
1595     $len = scalar @{ $list };
1596
1597     $space = $len / $diff;
1598
1599     for ( $i = 0; $i < $diff; $i++ )
1600     {
1601         $pos = int( ( $space / 2 ) + $i * $space );
1602
1603         splice @{ $list }, $pos, 0, ( $list->[ $pos - 1 ] + $list->[ $pos + 1 ] ) / 2;
1604         # splice @{ $list }, $pos, 0, "X";
1605     }
1606
1607     die qq(ERROR: bad inflate\n) if scalar @{ $list } != $len + $diff;
1608 }
1609
1610
1611 sub phastcons_list_deflate
1612 {
1613     # Martin A. Hansen, January 2008.
1614
1615     # Deflates a list by removing a given number of elements
1616     # evenly distributed over the entire list.
1617
1618     my ( $list,   # list of elements
1619          $diff,   # number of elements to remove
1620        ) = @_;
1621
1622     # Returns nothing
1623
1624     my ( $len, $space, $i, $pos );
1625
1626     $len = scalar @{ $list };
1627
1628     $space = ( $len - $diff ) / $diff;
1629
1630     for ( $i = 0; $i < $diff; $i++ )
1631     {
1632         $pos = int( ( $space / 2 ) + $i * $space );
1633
1634         splice @{ $list }, $pos, 1;
1635     }
1636
1637     die qq(ERROR: bad deflate\n) if scalar @{ $list } != $len - $diff;
1638 }
1639
1640
1641 sub phastcons_mean
1642 {
1643     # Martin A. Hansen, January 2008.
1644
1645     # Given a normalized PhastCons matrix in an AoA,
1646     # calculate the mean for each column and return as a list.
1647
1648     my ( $AoA,    # AoA with normalized PhastCons scores
1649        ) = @_;
1650
1651     # Returns a list
1652
1653     my ( @list );
1654
1655     $AoA = Maasha::Matrix::matrix_flip( $AoA );
1656
1657     map { push @list, Maasha::Calc::mean( $_ ) } @{ $AoA };
1658
1659     return wantarray ? @list : \@list;
1660 }
1661
1662
1663 sub phastcons_median
1664 {
1665     # Martin A. Hansen, January 2008.
1666
1667     # Given a normalized PhastCons matrix in an AoA,
1668     # calculate the median for each column and return as a list.
1669
1670     my ( $AoA,    # AoA with normalized PhastCons scores
1671        ) = @_;
1672
1673     # Returns a list
1674
1675     my ( @list );
1676
1677     $AoA = Maasha::Matrix::matrix_flip( $AoA );
1678
1679     map { push @list, Maasha::Calc::median( $_ ) } @{ $AoA };
1680
1681     return wantarray ? @list : \@list;
1682 }
1683
1684
1685 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> MULTIPLE ALIGNMENT FILES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1686
1687
1688 sub maf_extract
1689 {
1690     # Martin A. Hansen, April 2008.
1691
1692     # Executes mafFrag to extract a subalignment from a multiz track
1693     # in the UCSC genome browser database.
1694
1695     my ( $tmp_dir,    # temporary directory
1696          $database,   # genome database
1697          $table,      # table with the multiz track
1698          $chr,        # chromosome
1699          $beg,        # begin position
1700          $end,        # end position
1701          $strand,     # strand
1702        ) = @_;
1703
1704     # Returns a list of record
1705
1706     my ( $tmp_file, $align );
1707
1708     $tmp_file = "$tmp_dir/maf_extract.maf";
1709
1710     Maasha::Common::run( "mafFrag", "$database $table $chr $beg $end $strand $tmp_file" );
1711
1712     $align = maf_parse( $tmp_file );
1713
1714     unlink $tmp_file;
1715
1716     return wantarray ? @{ $align } : $align;
1717 }
1718
1719
1720 sub maf_parse
1721 {
1722     # Martin A. Hansen, April 2008.
1723
1724
1725     my ( $path,   # full path to MAF file
1726        ) = @_;
1727
1728     # Returns a list of record.
1729
1730     my ( $fh, $line, @fields, @align );
1731
1732     $fh = Maasha::Common::read_open( $path );
1733
1734     while ( $line = <$fh> )
1735     {
1736         chomp $line;
1737
1738         if ( $line =~ /^s/ )
1739         {
1740             @fields = split / /, $line;
1741
1742             push @align, {
1743                 SEQ_NAME  => $fields[ 1 ],
1744                 SEQ       => $fields[ -1 ],
1745                 ALIGN     => 1,
1746                 ALIGN_LEN => length $fields[ -1 ],
1747             }
1748         }
1749     }
1750
1751     close $fh;
1752
1753     return wantarray ? @align : \@align;
1754 }
1755
1756
1757 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> WIGGLE FORMAT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1758
1759
1760 sub fixedstep_put_entry
1761 {
1762     # Martin A. Hansen, April 2008.
1763
1764     # Outputs a block of fixedStep values.
1765     # Used for outputting wiggle data.
1766
1767     my ( $chr,      # chromosome
1768          $beg,      # start position
1769          $block,    # list of scores
1770          $fh,       # filehandle - OPTIONAL
1771          $log10,    # flag indicating that log10 scores should be used
1772        ) = @_;
1773
1774     # Returns nothing.
1775
1776     $beg += 1;   # fixedStep format is 1 based.
1777
1778     $fh ||= \*STDOUT;
1779
1780     print $fh "fixedStep chrom=$chr start=$beg step=1\n";
1781
1782     if ( $log10 ) {
1783         map { printf( $fh "%f\n", Maasha::Calc::log10( $_ + 1 ) ) } @{ $block };
1784     } else {
1785         map { printf( $fh "%d\n", ( $_ + 1 ) ) } @{ $block };
1786     }
1787 }
1788
1789
1790 sub wiggle_upload_to_ucsc
1791 {
1792     # Martin A. Hansen, May 2008.
1793
1794     # Upload a wiggle file to the UCSC database.
1795
1796     my ( $tmp_dir,    # temporary directory
1797          $wib_dir,    # wib directory
1798          $wig_file,   # file to upload,
1799          $options,    # argument hashref
1800        ) = @_;
1801
1802     # Returns nothing.
1803
1804     my ( $args );
1805
1806 #    $args = join " ", "-tmpDir=$tmp_dir", "-pathPrefix=$wib_dir", $options->{ "database" }, $options->{ 'table' }, $wig_file;
1807
1808 #    Maasha::Common::run( "hgLoadWiggle", "$args > /dev/null 2>&1" );
1809
1810     if ( $options->{ 'verbose' } ) {
1811         `cd $tmp_dir && hgLoadWiggle -tmpDir=$tmp_dir -pathPrefix=$wib_dir $options->{ 'database' } $options->{ 'table' } $wig_file`;
1812     } else {
1813         `cd $tmp_dir && hgLoadWiggle -tmpDir=$tmp_dir -pathPrefix=$wib_dir $options->{ 'database' } $options->{ 'table' } $wig_file > /dev/null 2>&1`;
1814     }
1815 }
1816
1817
1818 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> MySQL CONF <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1819
1820
1821 sub ucsc_get_user
1822 {
1823     # Martin A. Hansen, May 2008
1824
1825     # Fetches the MySQL database user name from the
1826     # .hg.conf file in the users home directory.
1827
1828     # Returns a string.
1829
1830     my ( $fh, $line, $user );
1831
1832     $fh = Maasha::Common::read_open( "$ENV{ 'HOME' }/.hg.conf" );
1833
1834     while ( $line = <$fh> )
1835     {
1836         chomp $line;
1837
1838         if ( $line =~ /^db\.user=(.+)/ )
1839         {
1840             $user = $1;
1841
1842             last;
1843         }
1844     }
1845
1846     close $fh;
1847
1848     return $user;
1849 }
1850
1851
1852 sub ucsc_get_password
1853 {
1854     # Martin A. Hansen, May 2008
1855
1856     # Fetches the MySQL database password from the
1857     # .hg.conf file in the users home directory.
1858
1859     # Returns a string.
1860
1861     my ( $fh, $line, $password );
1862
1863     $fh = Maasha::Common::read_open( "$ENV{ 'HOME' }/.hg.conf" );
1864
1865     while ( $line = <$fh> )
1866     {
1867         chomp $line;
1868
1869         if ( $line =~ /^db\.password=(.+)/ )
1870         {
1871             $password = $1;
1872
1873             last;
1874         }
1875     }
1876
1877     close $fh;
1878
1879     return $password;
1880 }
1881
1882
1883 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1884
1885
1886 __END__
1887