code_perl/Maasha/UCSC.pm

   1 package Maasha::UCSC;
   2
   3 # Copyright (C) 2007 Martin A. Hansen.
   4
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; either version 2
   8 # of the License, or (at your option) any later version.
   9
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18
  19 # http://www.gnu.org/copyleft/gpl.html
  20
  21
  22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  23
  24
  25 # Stuff for interacting with UCSC genome browser
  26
  27
  28 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  29
  30
  31 use strict;
  32 use vars qw ( @ISA @EXPORT );
  33
  34 use Data::Dumper;
  35 use Time::HiRes qw( gettimeofday );
  36
  37 use Maasha::Common;
  38 use Maasha::Calc;
  39 use Maasha::Matrix;
  40
  41 use constant {
  42     CHR_BEG      => 0,
  43     NEXT_CHR_BEG => 1,
  44     CHR_END      => 2,
  45     INDEX_BEG    => 3,
  46     INDEX_LEN    => 4,
  47
  48     CHR        => 0,
  49     CHR_BEG    => 1,
  50     CHR_END    => 2,
  51     Q_ID       => 3,
  52     SCORE      => 4,
  53     STRAND     => 5,
  54     THICK_BEG  => 6,
  55     THICK_END  => 7,
  56     ITEMRGB    => 8,
  57     BLOCKCOUNT => 9,
  58     BLOCKSIZES => 10,
  59     Q_BEGS     => 11,
  60 };
  61
  62 @ISA = qw( Exporter );
  63
  64 my $TIME = gettimeofday();
  65
  66
  67 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> BED format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  68
  69
  70 # http://genome.ucsc.edu/goldenPath/help/hgTracksHelp.html#BED
  71
  72
  73 sub bed_entry_get_array
  74 {
  75     # Martin A. Hansen, September 2008.
  76
  77     # Reads a BED entry given a filehandle.
  78
  79     # This is a new _faster_ BED entry parser that
  80     # uses arrays and not hashrefs.
  81
  82     # IMPORTANT! This function does not correct the
  83     # BED_END position that is kept the way UCSC
  84     # does.
  85
  86     my ( $fh,     # file handle
  87          $cols,   # columns to read - OPTIONAL (3,4,5,6, or 12)
  88        ) = @_;
  89
  90     # Returns a list.
  91
  92     my ( $line, @entry );
  93
  94     $line = <$fh>;
  95
  96     $line =~ tr/\n\r//d;    # some people have carriage returns in their BED files -> Grrrr
  97
  98     return if not defined $line;
  99
 100     if ( not defined $cols ) {
 101         $cols = 1 + $line =~ tr/\t//;
 102     }
 103
 104     @entry = split "\t", $line, $cols + 1;
 105
 106     pop @entry if scalar @entry > $cols;
 107
 108     return wantarray ? @entry : \@entry;
 109 }
 110
 111
 112 sub bed_get_entry
 113 {
 114     # Martin A. Hansen, December 2007.
 115
 116     # Reads a bed entry given a filehandle.
 117
 118     my ( $fh,        # file handle
 119          $columns,   # number of BED columns to read  -  OPTIONAL
 120        ) = @_;
 121
 122     # Returns hashref.
 123
 124     my ( $line, @fields, %entry );
 125
 126     $line = <$fh>;
 127
 128     $line =~ tr/\n\r//d;    # some people have carriage returns in their BED files -> Grrrr
 129
 130     return if not defined $line;
 131
 132     @fields = split "\t", $line;
 133
 134     $columns ||= scalar @fields;
 135
 136     if ( $columns == 3 )
 137     {
 138         %entry = (
 139             "CHR"      => $fields[ 0 ],
 140             "CHR_BEG"  => $fields[ 1 ],
 141             "CHR_END"  => $fields[ 2 ] - 1,
 142         );
 143     }
 144     elsif ( $columns == 4 )
 145     {
 146         %entry = (
 147             "CHR"      => $fields[ 0 ],
 148             "CHR_BEG"  => $fields[ 1 ],
 149             "CHR_END"  => $fields[ 2 ] - 1,
 150             "Q_ID"     => $fields[ 3 ],
 151         );
 152     }
 153     elsif ( $columns == 5 )
 154     {
 155         %entry = (
 156             "CHR"      => $fields[ 0 ],
 157             "CHR_BEG"  => $fields[ 1 ],
 158             "CHR_END"  => $fields[ 2 ] - 1,
 159             "Q_ID"     => $fields[ 3 ],
 160             "SCORE"    => $fields[ 4 ],
 161         );
 162     }
 163     elsif ( $columns == 6 )
 164     {
 165         %entry = (
 166             "CHR"      => $fields[ 0 ],
 167             "CHR_BEG"  => $fields[ 1 ],
 168             "CHR_END"  => $fields[ 2 ] - 1,
 169             "Q_ID"     => $fields[ 3 ],
 170             "SCORE"    => $fields[ 4 ],
 171             "STRAND"   => $fields[ 5 ],
 172         );
 173     }
 174     elsif ( $columns == 12 )
 175     {
 176         %entry = (
 177             "CHR"        => $fields[ 0 ],
 178             "CHR_BEG"    => $fields[ 1 ],
 179             "CHR_END"    => $fields[ 2 ] - 1,
 180             "Q_ID"       => $fields[ 3 ],
 181             "SCORE"      => $fields[ 4 ],
 182             "STRAND"     => $fields[ 5 ],
 183             "THICK_BEG"  => $fields[ 6 ],
 184             "THICK_END"  => $fields[ 7 ] - 1,
 185             "ITEMRGB"    => $fields[ 8 ],
 186             "BLOCKCOUNT" => $fields[ 9 ],
 187             "BLOCKSIZES" => $fields[ 10 ],
 188             "Q_BEGS"     => $fields[ 11 ],
 189         );
 190     }
 191     else
 192     {
 193         Maasha::Common::error( qq(Bad BED format in line->$line<-) );
 194     }
 195
 196     $entry{ "REC_TYPE" } = "BED";
 197     $entry{ "BED_LEN" }  = $entry{ "CHR_END" } - $entry{ "CHR_BEG" } + 1;
 198     $entry{ "BED_COLS" } = $columns;
 199
 200     return wantarray ? %entry : \%entry;
 201 }
 202
 203
 204 sub bed_get_entries
 205 {
 206     # Martin A. Hansen, January 2008.
 207
 208     # Given a path to a BED file, read in all entries
 209     # and return.
 210
 211     my ( $path,     # full path to BED file
 212          $columns,  # number of columns in BED file - OPTIONAL (but is faster)
 213        ) = @_;
 214
 215     # Returns a list.
 216
 217     my ( $fh, $entry, @list );
 218
 219     $fh = Maasha::Common::read_open( $path );
 220
 221     while ( $entry = bed_get_entry( $fh ) ) {
 222         push @list, $entry;
 223     }
 224
 225     close $fh;
 226
 227     return wantarray ? @list : \@list;
 228 }
 229
 230
 231 sub bed_entry_put_array
 232 {
 233     # Martin A. Hansen, Septermber 2008.
 234
 235     # Writes a BED entry array to file.
 236
 237     # IMPORTANT! This function does not correct the
 238     # BED_END position that is assumed to be in the
 239     # UCSC positions scheme.
 240
 241     my ( $record,   # list
 242          $fh,       # file handle                   - OPTIONAL
 243          $cols,     # number of columns in BED file - OPTIONAL
 244        ) = @_;
 245
 246     # Returns nothing.
 247
 248     $fh = \*STDOUT if not defined $fh;
 249
 250     if ( defined $cols ) {
 251         print $fh join( "\t", @{ $record }[ 0 .. $cols - 1 ] ), "\n";
 252     } else {
 253         print $fh join( "\t", @{ $record } ), "\n";
 254     }
 255 }
 256
 257
 258 sub bed_put_entry
 259 {
 260     # Martin A. Hansen, Septermber 2007.
 261
 262     # Writes a BED entry to file.
 263
 264     # NB, this could really be more robust!?
 265
 266     my ( $record,       # hashref
 267          $fh,           # file handle                   - OPTIONAL
 268          $columns,      # number of columns in BED file - OPTIONAL (but is faster)
 269        ) = @_;
 270
 271     # Returns nothing.
 272
 273     my ( @fields );
 274
 275     $columns ||= 12;   # max number of columns possible
 276
 277     if ( $columns == 3 )
 278     {
 279         push @fields, $record->{ "CHR" };
 280         push @fields, $record->{ "CHR_BEG" };
 281         push @fields, $record->{ "CHR_END" } + 1;
 282     }
 283     elsif ( $columns == 4 )
 284     {
 285         $record->{ "Q_ID" }  =~ s/\s+/_/g;
 286
 287         push @fields, $record->{ "CHR" };
 288         push @fields, $record->{ "CHR_BEG" };
 289         push @fields, $record->{ "CHR_END" } + 1;
 290         push @fields, $record->{ "Q_ID" };
 291     }
 292     elsif ( $columns == 5 )
 293     {
 294         $record->{ "Q_ID" }  =~ s/\s+/_/g;
 295         $record->{ "SCORE" } =~ s/\.\d*//;
 296
 297         push @fields, $record->{ "CHR" };
 298         push @fields, $record->{ "CHR_BEG" };
 299         push @fields, $record->{ "CHR_END" } + 1;
 300         push @fields, $record->{ "Q_ID" };
 301         push @fields, $record->{ "SCORE" };
 302     }
 303     elsif ( $columns == 6 )
 304     {
 305         $record->{ "Q_ID" }  =~ s/\s+/_/g;
 306         $record->{ "SCORE" } =~ s/\.\d*//;
 307
 308         push @fields, $record->{ "CHR" };
 309         push @fields, $record->{ "CHR_BEG" };
 310         push @fields, $record->{ "CHR_END" } + 1;
 311         push @fields, $record->{ "Q_ID" };
 312         push @fields, $record->{ "SCORE" };
 313         push @fields, $record->{ "STRAND" };
 314     }
 315     else
 316     {
 317         $record->{ "Q_ID" }  =~ s/\s+/_/g;
 318         $record->{ "SCORE" } =~ s/\.\d*//;
 319
 320         push @fields, $record->{ "CHR" };
 321         push @fields, $record->{ "CHR_BEG" };
 322         push @fields, $record->{ "CHR_END" } + 1;
 323         push @fields, $record->{ "Q_ID" };
 324         push @fields, $record->{ "SCORE" };
 325         push @fields, $record->{ "STRAND" };
 326         push @fields, $record->{ "THICK_BEG" }     if defined $record->{ "THICK_BEG" };
 327         push @fields, $record->{ "THICK_END" } + 1 if defined $record->{ "THICK_END" };
 328         push @fields, $record->{ "ITEMRGB" }       if defined $record->{ "ITEMRGB" };
 329         push @fields, $record->{ "BLOCKCOUNT" }    if defined $record->{ "BLOCKCOUNT" };
 330         push @fields, $record->{ "BLOCKSIZES" }    if defined $record->{ "BLOCKSIZES" };
 331         push @fields, $record->{ "Q_BEGS" }        if defined $record->{ "Q_BEGS" };
 332     }
 333
 334     if ( $fh ) {
 335         print $fh join( "\t", @fields ), "\n";
 336     } else {
 337         print join( "\t", @fields ), "\n";
 338     }
 339 }
 340
 341
 342 sub bed_put_entries
 343 {
 344     # Martin A. Hansen, January 2008.
 345
 346     # Write a list of BED entries.
 347
 348     my ( $entries,   # list of entries,
 349          $fh,        # file handle - OPTIONAL
 350        ) = @_;
 351
 352     # Returns nothing.
 353
 354     map { bed_put_entry( $_, $fh ) } @{ $entries };
 355 }
 356
 357
 358 sub bed_analyze
 359 {
 360     # Martin A. Hansen, March 2008.
 361
 362     # Given a bed record, analysis this to give information
 363     # about intron/exon sizes.
 364
 365     my ( $entry,   # BED entry
 366        ) = @_;
 367
 368     # Returns hashref.
 369
 370     my ( $i, @begs, @lens, $exon_max, $exon_min, $exon_len, $exon_tot, $intron_max, $intron_min, $intron_len, $intron_tot );
 371
 372     $exon_max   = 0;
 373     $exon_min   = 9999999999;
 374     $intron_max = 0;
 375     $intron_min = 9999999999;
 376
 377     $entry->{ "EXONS" }   = $entry->{ "BLOCKCOUNT" };
 378
 379     @begs = split /,/, $entry->{ "Q_BEGS" };
 380     @lens = split /,/, $entry->{ "BLOCKSIZES" };
 381
 382     for ( $i = 0; $i < $entry->{ "BLOCKCOUNT" }; $i++ )
 383     {
 384         $exon_len = @lens[ $i ];
 385
 386         $entry->{ "EXON_LEN_$i" } = $exon_len;
 387
 388         $exon_max = $exon_len if $exon_len > $exon_max;
 389         $exon_min = $exon_len if $exon_len < $exon_min;
 390
 391         $exon_tot += $exon_len;
 392     }
 393
 394     $entry->{ "EXON_LEN_-1" }   = $exon_len;
 395     $entry->{ "EXON_MAX_LEN" }  = $exon_max;
 396     $entry->{ "EXON_MIN_LEN" }  = $exon_min;
 397     $entry->{ "EXON_MEAN_LEN" } = int( $exon_tot / $entry->{ "EXONS" } );
 398
 399     $entry->{ "INTRONS" } = $entry->{ "BLOCKCOUNT" } - 1;
 400     $entry->{ "INTRONS" } = 0 if $entry->{ "INTRONS" } < 0;
 401
 402     if ( $entry->{ "INTRONS" } )
 403     {
 404         for ( $i = 1; $i < $entry->{ "BLOCKCOUNT" }; $i++ )
 405         {
 406             $intron_len = @begs[ $i ] - ( @begs[ $i - 1 ] + @lens[ $i - 1 ] );
 407
 408             $entry->{ "INTRON_LEN_" . ( $i - 1 ) } = $intron_len;
 409
 410             $intron_max = $intron_len if $intron_len > $intron_max;
 411             $intron_min = $intron_len if $intron_len < $intron_min;
 412
 413             $intron_tot += $intron_len;
 414         }
 415
 416         $entry->{ "INTRON_LEN_-1" }   = $intron_len;
 417         $entry->{ "INTRON_MAX_LEN" }  = $intron_max;
 418         $entry->{ "INTRON_MIN_LEN" }  = $intron_min;
 419         $entry->{ "INTRON_MEAN_LEN" } = int( $intron_tot / $entry->{ "INTRONS" } );
 420     }
 421
 422     return wantarray ? %{ $entry } : $entry;
 423 }
 424
 425
 426 sub bed_sort
 427 {
 428     # Martin A. Hansen, Septermber 2008
 429
 430     # Sorts a BED file using the c program
 431     # "bed_sort" specifing a sort mode:
 432
 433     # 1: chr AND chr_beg.
 434     # 2: chr AND strand AND chr_beg.
 435     # 3: chr_beg.
 436     # 4: strand AND chr_beg.
 437
 438     my ( $bed_file,    # BED file to sort
 439          $sort_mode,   # See above.
 440          $cols,        # Number of columns in BED file
 441        ) = @_;
 442
 443     &Maasha::Common::run( "bed_sort", "--sort $sort_mode --cols $cols $bed_file" );
 444 }
 445
 446
 447 sub bed_split_to_files
 448 {
 449     # Martin A. Hansen, Septermber 2008
 450
 451     # Given a list of BED files, split these
 452     # into temporary files based on the chromosome
 453     # name. Returns a list of the temporary files.
 454
 455     my ( $bed_files,   # list of BED files to split
 456          $cols,        # number of columns
 457          $tmp_dir,     # temporary directory
 458        ) = @_;
 459
 460     # Returns a list.
 461
 462     my ( $bed_file, $fh_in, $entry, $key, %fh_hash, @tmp_files );
 463
 464     foreach $bed_file ( @{ $bed_files } )
 465     {
 466         $fh_in = Maasha::Common::read_open( $bed_file );
 467
 468         while ( $entry = bed_entry_get_array( $fh_in, $cols ) )
 469         {
 470             $key = $entry->[ CHR ];
 471
 472             $fh_hash{ $key } = Maasha::Common::write_open( "$tmp_dir/$key.temp" ) if not exists $fh_hash{ $key };
 473
 474             bed_entry_put_array( $entry, $fh_hash{ $key } );
 475         }
 476
 477         close $fh_in;
 478     }
 479
 480     foreach $key ( sort keys %fh_hash )
 481     {
 482         push @tmp_files, "$tmp_dir/$key.temp";
 483
 484         close $fh_hash{ $key };
 485     }
 486
 487     return wantarray ? @tmp_files : \@tmp_files;
 488 }
 489
 490
 491 sub bed_merge_entries
 492 {
 493     # Martin A. Hansen, February 2008.
 494
 495     # Merge a list of given BED entries in one big entry.
 496
 497     my ( $entries,     # list of BED entries to be merged
 498        ) = @_;
 499
 500     # Returns hash.
 501
 502     my ( $i, @q_ids, @q_begs, @blocksizes, @new_q_begs, @new_blocksizes, %new_entry );
 503
 504     @{ $entries } = sort { $a->{ "CHR_BEG" } <=> $b->{ "CHR_BEG" } } @{ $entries };
 505
 506     for ( $i = 0; $i < @{ $entries }; $i++ )
 507     {
 508         Maasha::Common::error( qq(Attempted merge of BED entries from different chromosomes) ) if $entries->[ 0 ]->{ "CHR" }    ne $entries->[ $i ]->{ "CHR" };
 509         Maasha::Common::error( qq(Attempted merge of BED entries from different strands) )     if $entries->[ 0 ]->{ "STRAND" } ne $entries->[ $i ]->{ "STRAND" };
 510
 511         push @q_ids, $entries->[ $i ]->{ "Q_ID" } || sprintf( "ID%06d", $i );
 512
 513         if ( exists $entries->[ $i ]->{ "Q_BEGS" } )
 514         {
 515             @q_begs     = split ",", $entries->[ $i ]->{ "Q_BEGS" };
 516             @blocksizes = split ",", $entries->[ $i ]->{ "BLOCKSIZES" };
 517         }
 518         else
 519         {
 520             @q_begs     = 0;
 521             @blocksizes = $entries->[ $i ]->{ "CHR_END" } - $entries->[ $i ]->{ "CHR_BEG" } + 1;
 522         }
 523
 524         map { $_ += $entries->[ $i ]->{ "CHR_BEG" } } @q_begs;
 525
 526         push @new_q_begs, @q_begs;
 527         push @new_blocksizes, @blocksizes;
 528     }
 529
 530     map { $_ -= $entries->[ 0 ]->{ "CHR_BEG" } } @new_q_begs;
 531
 532     %new_entry = (
 533         CHR         => $entries->[ 0 ]->{ "CHR" },
 534         CHR_BEG     => $entries->[ 0 ]->{ "CHR_BEG" },
 535         CHR_END     => $entries->[ -1 ]->{ "CHR_END" },
 536         REC_TYPE    => "BED",
 537         BED_LEN     => $entries->[ -1 ]->{ "CHR_END" } - $entries->[ 0 ]->{ "CHR_BEG" } + 1,
 538         BED_COLS    => 12,
 539         Q_ID        => join( ":", @q_ids ),
 540         SCORE       => 999,
 541         STRAND      => $entries->[ 0 ]->{ "STRAND" }     || "+",
 542         THICK_BEG   => $entries->[ 0 ]->{ "THICK_BEG" }  || $entries->[ 0 ]->{ "CHR_BEG" },
 543         THICK_END   => $entries->[ -1 ]->{ "THICK_END" } || $entries->[ -1 ]->{ "CHR_END" },
 544         ITEMRGB     => "0,0,0",
 545         BLOCKCOUNT  => scalar @new_q_begs,
 546         BLOCKSIZES  => join( ",", @new_blocksizes ),
 547         Q_BEGS      => join( ",", @new_q_begs ),
 548     );
 549
 550     return wantarray ? %new_entry : \%new_entry;
 551 }
 552
 553
 554 sub bed_split_entry
 555 {
 556     # Martin A. Hansen, February 2008.
 557
 558     # Splits a given BED entry into a list of blocks,
 559     # which are returned. A list of 6 column BED entry is returned.
 560
 561     my ( $entry,    # BED entry hashref
 562        ) = @_;
 563
 564     # Returns a list.
 565
 566     my ( @q_begs, @blocksizes, $block, @blocks, $i );
 567
 568     if ( exists $entry->{ "BLOCKCOUNT" } )
 569     {
 570         @q_begs     = split ",", $entry->{ "Q_BEGS" };
 571         @blocksizes = split ",", $entry->{ "BLOCKSIZES" };
 572
 573         for ( $i = 0; $i < @q_begs; $i++ )
 574         {
 575             undef $block;
 576
 577             $block->{ "CHR" }      = $entry->{ "CHR" };
 578             $block->{ "CHR_BEG" }  = $entry->{ "CHR_BEG" } + $q_begs[ $i ];
 579             $block->{ "CHR_END" }  = $entry->{ "CHR_BEG" } + $q_begs[ $i ] + $blocksizes[ $i ] - 1;
 580             $block->{ "Q_ID" }     = $entry->{ "Q_ID" } . sprintf( "_%03d", $i );
 581             $block->{ "SCORE" }    = $entry->{ "SCORE" };
 582             $block->{ "STRAND" }   = $entry->{ "STRAND" };
 583             $block->{ "BED_LEN" }  = $block->{ "CHR_END" } - $block->{ "CHR_BEG" } + 1,
 584             $block->{ "BED_COLS" } = 6;
 585             $block->{ "REC_TYPE" } = "BED";
 586
 587             push @blocks, $block;
 588         }
 589     }
 590     else
 591     {
 592         @blocks = @{ $entry };
 593     }
 594
 595     return wantarray ? @blocks : \@blocks;
 596 }
 597
 598
 599
 600 sub bed_overlap
 601 {
 602     # Martin A. Hansen, February 2008.
 603
 604     # Checks if two BED entries overlap and
 605     # return 1 if so - else 0;
 606
 607     my ( $entry1,      # hashref
 608          $entry2,      # hashref
 609          $no_strand,   # don't check strand flag - OPTIONAL
 610        ) = @_;
 611
 612     # Return bolean.
 613
 614     return 0 if $entry1->{ "CHR" }    ne $entry2->{ "CHR" };
 615     return 0 if $entry1->{ "STRAND" } ne $entry2->{ "STRAND" };
 616
 617     if ( $entry1->{ "CHR_END" } < $entry2->{ "CHR_BEG" } or $entry1->{ "CHR_BEG" } > $entry2->{ "CHR_END" } ) {
 618         return 0;
 619     } else {
 620         return 1;
 621     }
 622 }
 623
 624
 625 sub bed_upload_to_ucsc
 626 {
 627     # Martin A. Hansen, September 2007.
 628
 629     # Upload a BED file to the UCSC database.
 630
 631     my ( $tmp_dir,   # temporary directory
 632          $file,      # file to upload,
 633          $options,   # argument hashref
 634          $append,    # flag indicating table should be appended
 635        ) = @_;
 636
 637     # Returns nothing.
 638
 639     my ( $args, $table, $sql_file, $fh_out, $fh_in );
 640
 641     if ( $append ) {
 642         $args = join " ", $options->{ "database" }, $options->{ "table" }, "-tmpDir=$tmp_dir", "-oldTable", $file;
 643     } else {
 644         $args = join " ", $options->{ "database" }, $options->{ "table" }, "-tmpDir=$tmp_dir", $file;
 645     }
 646
 647     if ( $options->{ "sec_struct" } )
 648     {
 649         $table = $options->{ "table" };
 650
 651         Maasha::Common::error( "Attempt to load secondary structure track without 'rnaSecStr' in table name" ) if not $table =~ /rnaSecStr/;
 652
 653         $sql_file = "$tmp_dir/upload_RNA_SS.sql";
 654
 655         $fh_out   = Maasha::Common::write_open( $sql_file );
 656
 657         print $fh_out qq(
 658 CREATE TABLE $table (
 659     bin smallint not null,              # Bin number for browser speedup
 660     chrom varchar(255) not null,        # Chromosome or FPC contig
 661     chromStart int unsigned not null,   # Start position in chromosome
 662     chromEnd int unsigned not null,     # End position in chromosome
 663     name varchar(255) not null,         # Name of item
 664     score int unsigned not null,        # Score from 0-1000
 665     strand char(1) not null,            # + or -
 666     size int unsigned not null,         # Size of element.
 667     secStr longblob not null,           # Parentheses and '.'s which define the secondary structure
 668     conf longblob not null,             # Confidence of secondary-structure annotation per position (0.0-1.0).
 669     #Indices
 670     INDEX(name(16)),
 671     INDEX(chrom(8), bin),
 672     INDEX(chrom(8), chromStart)
 673 );
 674         );
 675
 676         close $fh_out;
 677
 678         Maasha::Common::run( "hgLoadBed", "-notItemRgb -sqlTable=$sql_file $options->{ 'database' } $options->{ 'table' } -tmpDir=$tmp_dir $file > /dev/null 2>&1" );
 679
 680         unlink $sql_file;
 681     }
 682     else
 683     {
 684         Maasha::Common::run( "hgLoadBed", "$args > /dev/null 2>&1" );
 685     }
 686 }
 687
 688
 689 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PSL format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 690
 691
 692 sub psl_get_entry
 693 {
 694     # Martin A. Hansen, August 2008.
 695
 696     # Reads PSL next entry from a PSL file and returns a record.
 697
 698     my ( $fh,   # file handle of PSL filefull path to PSL file
 699        ) = @_;
 700
 701     # Returns hashref.
 702
 703     my ( $line, @fields, %record );
 704
 705     while ( $line = <$fh> )
 706     {
 707         chomp $line;
 708
 709         @fields = split "\t", $line;
 710
 711         if ( scalar @fields == 21 )
 712         {
 713             %record = (
 714                 REC_TYPE    => "PSL",
 715                 MATCHES     => $fields[ 0 ],
 716                 MISMATCHES  => $fields[ 1 ],
 717                 REPMATCHES  => $fields[ 2 ],
 718                 NCOUNT      => $fields[ 3 ],
 719                 QNUMINSERT  => $fields[ 4 ],
 720                 QBASEINSERT => $fields[ 5 ],
 721                 SNUMINSERT  => $fields[ 6 ],
 722                 SBASEINSERT => $fields[ 7 ],
 723                 STRAND      => $fields[ 8 ],
 724                 Q_ID        => $fields[ 9 ],
 725                 Q_LEN       => $fields[ 10 ],
 726                 Q_BEG       => $fields[ 11 ],
 727                 Q_END       => $fields[ 12 ] - 1,
 728                 S_ID        => $fields[ 13 ],
 729                 S_LEN       => $fields[ 14 ],
 730                 S_BEG       => $fields[ 15 ],
 731                 S_END       => $fields[ 16 ] - 1,
 732                 BLOCKCOUNT  => $fields[ 17 ],
 733                 BLOCKSIZES  => $fields[ 18 ],
 734                 Q_BEGS      => $fields[ 19 ],
 735                 S_BEGS      => $fields[ 20 ],
 736             );
 737
 738             $record{ "SCORE" } = $record{ "MATCHES" } + int( $record{ "REPMATCHES" } / 2 ) - $record{ "MISMATCHES" } - $record{ "QNUMINSERT" } - $record{ "SNUMINSERT" };
 739
 740             return wantarray ? %record : \%record;
 741         }
 742     }
 743
 744     return undef;
 745 }
 746
 747
 748 sub psl_get_entries
 749 {
 750     # Martin A. Hansen, February 2008.
 751
 752     # Reads PSL entries and returns a list of records.
 753
 754     my ( $path,   # full path to PSL file
 755        ) = @_;
 756
 757     # Returns hashref.
 758
 759     my ( $fh, @lines, @fields, $i, %record, @records );
 760
 761     $fh = Maasha::Common::read_open( $path );
 762
 763     @lines = <$fh>;
 764
 765     close $fh;
 766
 767     chomp @lines;
 768
 769     for ( $i = 5; $i < @lines; $i++ )
 770     {
 771         @fields = split "\t", $lines[ $i ];
 772
 773         Maasha::Common::error( qq(Bad PSL format in file "$path") ) if not @fields == 21;
 774
 775         undef %record;
 776
 777         %record = (
 778             REC_TYPE    => "PSL",
 779             MATCHES     => $fields[ 0 ],
 780             MISMATCHES  => $fields[ 1 ],
 781             REPMATCHES  => $fields[ 2 ],
 782             NCOUNT      => $fields[ 3 ],
 783             QNUMINSERT  => $fields[ 4 ],
 784             QBASEINSERT => $fields[ 5 ],
 785             SNUMINSERT  => $fields[ 6 ],
 786             SBASEINSERT => $fields[ 7 ],
 787             STRAND      => $fields[ 8 ],
 788             Q_ID        => $fields[ 9 ],
 789             Q_LEN       => $fields[ 10 ],
 790             Q_BEG       => $fields[ 11 ],
 791             Q_END       => $fields[ 12 ] - 1,
 792             S_ID        => $fields[ 13 ],
 793             S_LEN       => $fields[ 14 ],
 794             S_BEG       => $fields[ 15 ],
 795             S_END       => $fields[ 16 ] - 1,
 796             BLOCKCOUNT  => $fields[ 17 ],
 797             BLOCKSIZES  => $fields[ 18 ],
 798             Q_BEGS      => $fields[ 19 ],
 799             S_BEGS      => $fields[ 20 ],
 800         );
 801
 802         $record{ "SCORE" } = $record{ "MATCHES" } + int( $record{ "REPMATCHES" } / 2 ) - $record{ "MISMATCHES" } - $record{ "QNUMINSERT" } - $record{ "SNUMINSERT" };
 803
 804         push @records, { %record };
 805     }
 806
 807     return wantarray ? @records : \@records;
 808 }
 809
 810
 811 sub psl_put_header
 812 {
 813     # Martin A. Hansen, September 2007.
 814
 815     # Write a PSL header to file.
 816
 817     my ( $fh,  # file handle  - OPTIONAL
 818        ) = @_;
 819
 820     # Returns nothing.
 821
 822     $fh = \*STDOUT if not $fh;
 823
 824     print $fh qq(psLayout version 3
 825 match   mis-    rep.    N's     Q gap   Q gap   T gap   T gap   strand  Q               Q       Q       Q       T               T       T       T       block   blockSizes      qStart        match   match           count   bases   count   bases           name            size    start   end     name            size    start   end     count
 826 ---------------------------------------------------------------------------------------------------------------------------------------------------------------
 827 );
 828 }
 829
 830
 831 sub psl_put_entry
 832 {
 833     # Martin A. Hansen, September 2007.
 834
 835     # Write a PSL entry to file.
 836
 837     my ( $record,       # hashref
 838          $fh,           # file handle  -  OPTIONAL
 839        ) = @_;
 840
 841     # Returns nothing.
 842
 843     $fh = \*STDOUT if not $fh;
 844
 845     my @output;
 846
 847     push @output, $record->{ "MATCHES" };
 848     push @output, $record->{ "MISMATCHES" };
 849     push @output, $record->{ "REPMATCHES" };
 850     push @output, $record->{ "NCOUNT" };
 851     push @output, $record->{ "QNUMINSERT" };
 852     push @output, $record->{ "QBASEINSERT" };
 853     push @output, $record->{ "SNUMINSERT" };
 854     push @output, $record->{ "SBASEINSERT" };
 855     push @output, $record->{ "STRAND" };
 856     push @output, $record->{ "Q_ID" };
 857     push @output, $record->{ "Q_LEN" };
 858     push @output, $record->{ "Q_BEG" };
 859     push @output, $record->{ "Q_END" } + 1;
 860     push @output, $record->{ "S_ID" };
 861     push @output, $record->{ "S_LEN" };
 862     push @output, $record->{ "S_BEG" };
 863     push @output, $record->{ "S_END" } + 1;
 864     push @output, $record->{ "BLOCKCOUNT" };
 865     push @output, $record->{ "BLOCKSIZES" };
 866     push @output, $record->{ "Q_BEGS" };
 867     push @output, $record->{ "S_BEGS" };
 868
 869     print $fh join( "\t", @output ), "\n";
 870 }
 871
 872
 873 sub psl_upload_to_ucsc
 874 {
 875     # Martin A. Hansen, September 2007.
 876
 877     # Upload a PSL file to the UCSC database.
 878
 879     my ( $file,      # file to upload,
 880          $options,   # argument hashref
 881          $append,    # flag indicating table should be appended
 882        ) = @_;
 883
 884     # Returns nothing.
 885
 886     my ( $args );
 887
 888     if ( $append ) {
 889         $args = join " ", $options->{ "database" }, "-table=$options->{ 'table' }", "-clientLoad", "-append", $file;
 890     } else {
 891         $args = join " ", $options->{ "database" }, "-table=$options->{ 'table' }", "-clientLoad", $file;
 892     }
 893
 894     Maasha::Common::run( "hgLoadPsl", "$args > /dev/null 2>&1" );
 895 }
 896
 897
 898 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> TRACK FILE <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 899
 900
 901 sub update_my_tracks
 902 {
 903     # Martin A. Hansen, September 2007.
 904
 905     # Update the /home/user/ucsc/my_tracks.ra file and executes makeCustomTracks.pl
 906
 907     my ( $options,   # hashref
 908          $type,      # track type
 909        ) = @_;
 910
 911     # Returns nothing.
 912
 913     my ( $file, $fh_in, $fh_out, $line, $time );
 914
 915     $file = $ENV{ "HOME" } . "/ucsc/my_tracks.ra";
 916
 917     # ---- create a backup ----
 918
 919     $fh_in  = Maasha::Common::read_open( $file );
 920     $fh_out = Maasha::Common::write_open( "$file~" );
 921
 922     while ( $line = <$fh_in> ) {
 923         print $fh_out $line;
 924     }
 925
 926     close $fh_in;
 927     close $fh_out;
 928
 929     # ---- append track ----
 930
 931     $time = Maasha::Common::time_stamp();
 932
 933     $fh_out = Maasha::Common::append_open( $file );
 934
 935     if ( $type eq "sec_struct" )
 936     {
 937         print $fh_out "\n\n# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n";
 938
 939         print $fh_out "\n# Track added by 'upload_to_ucsc' $time\n\n";
 940
 941         print $fh_out "# Database $options->{ 'database' }\n\n";
 942
 943         print $fh_out "track $options->{ 'table' }\n";
 944         print $fh_out "shortLabel $options->{ 'short_label' }\n";
 945         print $fh_out "longLabel $options->{ 'long_label' }\n";
 946         print $fh_out "group $options->{ 'group' }\n";
 947         print $fh_out "priority $options->{ 'priority' }\n";
 948         print $fh_out "visibility $options->{ 'visibility' }\n";
 949         print $fh_out "color $options->{ 'color' }\n";
 950         print $fh_out "type bed 6 +\n";
 951         print $fh_out "mafTrack multiz17way\n";
 952
 953         print $fh_out "\n# //\n";
 954     }
 955     else
 956     {
 957         print $fh_out "\n\n# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n";
 958
 959         print $fh_out "\n# Track added by 'upload_to_ucsc' $time\n\n";
 960
 961         print $fh_out "# Database $options->{ 'database' }\n\n";
 962
 963         print $fh_out "track $options->{ 'table' }\n";
 964         print $fh_out "shortLabel $options->{ 'short_label' }\n";
 965         print $fh_out "longLabel $options->{ 'long_label' }\n";
 966         print $fh_out "group $options->{ 'group' }\n";
 967         print $fh_out "priority $options->{ 'priority' }\n";
 968         print $fh_out "useScore 1\n" if $options->{ 'use_score' };
 969         print $fh_out "visibility $options->{ 'visibility' }\n";
 970         print $fh_out "maxHeightPixels 50:50:11\n" if $type eq "wig 0";
 971         print $fh_out "color $options->{ 'color' }\n";
 972         print $fh_out "type $type\n";
 973
 974         print $fh_out "\n# //\n";
 975     }
 976
 977     close $fh_out;
 978
 979     Maasha::Common::run( "ucscMakeTracks.pl", "-b > /dev/null 2>&1" );
 980 }
 981
 982
 983 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> fixedStep format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 984
 985
 986 sub fixedstep_get_entry
 987 {
 988     # Martin A. Hansen, December 2007.
 989
 990     # Given a file handle to a PhastCons file get the
 991     # next entry which is all the lines after a "fixedStep"
 992     # line and until the next "fixedStep" line or EOF.
 993
 994     my ( $fh,   # filehandle
 995        ) = @_;
 996
 997     # Returns a list of lines
 998
 999     my ( $entry, @lines );
1000
1001     local $/ = "\nfixedStep ";
1002
1003     $entry = <$fh>;
1004
1005     chomp $entry;
1006
1007     @lines = split "\n", $entry;
1008
1009     return if @lines == 0;
1010
1011     $lines[ 0 ] =~ s/fixedStep?\s*//;
1012
1013     return wantarray ? @lines : \@lines;
1014 }
1015
1016
sub fixedstep_index_create
{
    # Martin A. Hansen, January 2008.

    # Builds an index over a concatenated fixedStep file.
    # The index is a hash keyed on chromosome name; each value is a list
    # of blocks of the form:
    #
    #   [ chr_beg, next_chr_beg, chr_end, index_beg, index_len ]
    #
    # where chr_beg/chr_end are 0-based positions covered by the block,
    # next_chr_beg is the chr_beg of the following block on the same
    # chromosome (or chr_end + 1 for the last block), and index_beg/index_len
    # give the byte offset and byte length of the block's values in the file.
    #
    # NOTE: offsets are computed assuming that every value line occupies
    # exactly 6 bytes including the newline.

    my ( $path,   # path to fixedStep file
       ) = @_;

    # Returns a hashref (or a hash in list context)

    my ( $fh, $offset, $entry, $locator, $chr, $beg, $step, $count, $data_beg, $data_len, %index, $blocks, $i );

    $fh     = Maasha::Common::read_open( $path );
    $offset = 0;

    while ( $entry = Maasha::UCSC::fixedstep_get_entry( $fh ) )
    {
        $locator = shift @{ $entry };

        if ( $locator =~ /chrom=([^ ]+) start=(\d+) step=(\d+)/ ) {
            ( $chr, $beg, $step ) = ( $1, $2 - 1, $3 );   # fixedStep files are 1-based
        } else {
            Maasha::Common::error( qq(Could not parse locator: $locator) );
        }

        $count = @{ $entry };

        # Skip the declaration line: locator + "fixedStep " (10) + newline (1).
        $data_beg = $offset + length( $locator ) + 11;
        $data_len = 6 * $count;
        $offset   = $data_beg + $data_len;

        push @{ $index{ $chr } }, [ $beg, undef, $beg + $count - 1, $data_beg, $data_len ];
    }

    close $fh;

    # Fill in the next_chr_beg slot now that all blocks are known.
    foreach $blocks ( values %index )
    {
        foreach $i ( 0 .. $#{ $blocks } - 1 ) {
            $blocks->[ $i ]->[ NEXT_CHR_BEG ] = $blocks->[ $i + 1 ]->[ 0 ];
        }

        $blocks->[ -1 ]->[ NEXT_CHR_BEG ] = $blocks->[ -1 ]->[ CHR_END ] + 1;
    }

    return wantarray ? %index : \%index;
}
1077
1078
sub fixedstep_index_store
{
    # Martin A. Hansen, January 2008.

    # Saves a fixedStep index to a file by handing it
    # to Maasha::Common::file_store.

    my $path  = shift;   # full path to file
    my $index = shift;   # index data structure

    # Documented as returning nothing; the result of file_store is passed through.

    return Maasha::Common::file_store( $path, $index );
}
1093
1094
sub fixedstep_index_retrieve
{
    # Martin A. Hansen, January 2008.

    # Loads a fixedStep index from a file
    # using Maasha::Common::file_retrieve.

    my $path = shift;   # full path to file

    # Returns the index as a hash in list context,
    # and as the retrieved reference in scalar context.

    my $index = Maasha::Common::file_retrieve( $path );

    return $index unless wantarray;

    return %{ $index };
}
1112
1113
sub fixedStep_index_lookup
{
    # Martin A. Hansen, January 2008.

    # Retrieve fixedStep scores from an indexed
    # fixedStep file given a chromosome and
    # begin and end positions.
    #
    # Each index block has the layout
    #   [ chr_beg, next_chr_beg, chr_end, index_beg, index_len ]
    # The score list returned covers chr_beg .. chr_end (after the flank has
    # been applied); positions not covered by any block keep the score 0.

    my ( $index,     # data structure
         $fh,        # filehandle to datafile
         $chr,       # chromosome
         $chr_beg,   # chromosome beg
         $chr_end,   # chromosome end
         $flank,     # include flanking region - OPTIONAL
       ) = @_;

    # Returns a list (a list reference in scalar context)

    my ( $BLOCK_BEG, $RECORD_LEN, $blocks, $index_beg, $index_end, $first, $last, $block, $beg, $end, $i, $scores, $read_scores );

    # Slot of the block begin position within an index block. The module
    # level constant CHR_BEG cannot be used for this: it is declared twice in
    # the same 'use constant' hash and ends up as 1 (the BED column), which is
    # the next_chr_beg slot of an index block.
    $BLOCK_BEG = 0;

    # Bytes per score line in the data file, newline included. This is the
    # same fixed width the index offsets are based on.
    $RECORD_LEN = 6;

    $flank ||= 0;

    $chr_beg -= $flank;
    $chr_end += $flank;

    # Reads the scores for chromosome positions $from .. $to (inclusive) of
    # the given block and copies them into their place in $scores.
    $read_scores = sub
    {
        my ( $blk, $from, $to ) = @_;

        my ( @vals, $c );

        @vals = split "\n", Maasha::Common::file_read(
            $fh,
            $blk->[ INDEX_BEG ] + $RECORD_LEN * ( $from - $blk->[ $BLOCK_BEG ] ),
            $RECORD_LEN * ( $to - $from + 1 ),
        );

        for ( $c = 0; $c < @vals; $c++ ) {
            $scores->[ $c + $from - $chr_beg ] = $vals[ $c ];
        }
    };

    if ( exists $index->{ $chr } )
    {
        $blocks = $index->{ $chr };

        # Locate the blocks whose [ chr_beg, next_chr_beg ) interval holds
        # the begin and end positions.
        $index_beg = Maasha::Matrix::interval_search( $blocks, 0, 1, $chr_beg );

        if ( $index_beg < 0 ) {
            Maasha::Common::error( qq(Index search failed - begin index position doesn't exists: $chr_beg) );
        }

        if ( $chr_end < $blocks->[ $index_beg ]->[ NEXT_CHR_BEG ] )
        {
            $index_end = $index_beg;
        }
        else
        {
            $index_end = Maasha::Matrix::interval_search( $blocks, 0, 1, $chr_end );

            if ( $index_end < 0 ) {
                Maasha::Common::error( qq(Index search failed - end index position doesn't exists: $chr_end) );
            }
        }

        # Default score for every requested position.
        $scores = [ ( 0 ) x ( $chr_end - $chr_beg + 1 ) ];

        $first = $blocks->[ $index_beg ];
        $last  = $blocks->[ $index_end ];

        if ( $index_beg == $index_end )
        {
            # The whole request lies within the span of a single block.
            $beg = Maasha::Calc::max( $chr_beg, $first->[ $BLOCK_BEG ] );
            $end = Maasha::Calc::min( $chr_end, $first->[ CHR_END ] );

            # Nothing to read if the request falls entirely in the gap
            # between this block and the next.
            if ( $beg <= $first->[ CHR_END ] and $end >= $first->[ $BLOCK_BEG ] ) {
                $read_scores->( $first, $beg, $end );
            }
        }
        else
        {
            #      beg         next
            #      v           v
            #  |||||||||.......

            # First block: from the begin position to the end of the block.
            $beg = Maasha::Calc::max( $chr_beg, $first->[ $BLOCK_BEG ] );

            if ( $beg <= $first->[ CHR_END ] ) {
                $read_scores->( $first, $beg, $first->[ CHR_END ] );
            }

            # Blocks in between are copied in full.
            for ( $i = $index_beg + 1; $i <= $index_end - 1; $i++ )
            {
                $block = $blocks->[ $i ];

                $read_scores->( $block, $block->[ $BLOCK_BEG ], $block->[ CHR_END ] );
            }

            # Last block: from the start of the block to the end position.
            $end = Maasha::Calc::min( $chr_end, $last->[ CHR_END ] );

            $read_scores->( $last, $last->[ $BLOCK_BEG ], $end );
        }
    }
    else
    {
        Maasha::Common::error( qq(Chromosome "$chr" was not found in index) );
    }

    return wantarray ? @{ $scores } : $scores;
}
1242
1243
1244 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PhastCons format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1245
1246
sub phastcons_index
{
    # Martin A. Hansen, July 2008

    # Creates a fixedStep index for a PhastCons data file and stores it
    # next to the data file under the same name with ".index" appended.

    my ( $file,   # file to index
         $dir,    # dir with file
       ) = @_;

    # Returns nothing.

    my $data_path = "$dir/$file";

    # Scalar assignment on purpose: the index is requested as a reference.
    my $index = fixedstep_index_create( $data_path );

    fixedstep_index_store( "$data_path.index", $index );
}
1265
1266
sub phastcons_parse_entry
{
    # Martin A. Hansen, December 2007.

    # Given a PhastCons entry converts this to a
    # list of super blocks.
    #
    # The first line of the entry must be the locator
    # "chrom=<chr> start=<pos> step=1"; the remaining lines are scores,
    # one per position. Processing is done in three stages:
    #
    #   1) scores are scanned for blocks starting at a score >= threshold.
    #      A run of scores below the threshold ends the block only if the
    #      run is longer than 'gap'; shorter runs are absorbed into the
    #      block. Blocks shorter than 'min' are discarded.
    #   2) consecutive blocks no more than 'dist' apart are grouped into
    #      super blocks.
    #   3) each super block is turned into one record.
    #
    # The locator line is removed from the list of lines passed in.

    my ( $lines,   # list of lines
         $args,    # argument hash with keys: threshold, gap, min, dist
       ) = @_;

    # Returns a list of records (a list reference in scalar context).
    # Dies if the locator cannot be parsed or if the step size is not 1.

    my ( $info, $chr, $beg, $step, $threshold, $gap, $min, $dist, $total, $i, $c, $j,
         @blocks, @super_blocks, @entries, $add_block, $super_block, $first, $last, @begs, @lens );

    $info = shift @{ $lines };

    # An entry without any lines yields no records.
    if ( not defined $info ) {
        return wantarray ? @entries : \@entries;
    }

    if ( $info =~ /^chrom=([^ ]+) start=(\d+) step=(\d+)$/ )
    {
        $chr  = $1;
        $beg  = $2;
        $step = $3;

        die qq(ERROR: step size $step != 1 -> problem!\n) if $step != 1; # in an ideal world this would be fixed ...
    }
    else
    {
        # Without chromosome and start position every record would be bogus.
        die qq(ERROR: could not parse PhastCons header: "$info"\n);
    }

    $threshold = $args->{ "threshold" };
    $gap       = $args->{ "gap" };
    $min       = $args->{ "min" };
    $dist      = $args->{ "dist" };

    $total = scalar @{ $lines };

    # Records the block covering score indices $from .. $past - 1, unless
    # it is shorter than the minimum length. $beg is 1-based, the block
    # coordinates are 0-based with an inclusive end.
    $add_block = sub
    {
        my ( $from, $past ) = @_;

        return if $past - $from < $min;

        push @blocks, {
            CHR     => $chr,
            CHR_BEG => $beg + $from - 1,
            CHR_END => $beg + $past - 2,
            CHR_LEN => $past - $from,
        };
    };

    # ---- stage 1: locate blocks ----

    $i = 0;

    SCAN:
    while ( $i < $total )
    {
        if ( not $lines->[ $i ] >= $threshold )
        {
            $i++;

            next SCAN;
        }

        $c = $i + 1;

        while ( $c < $total )
        {
            if ( $lines->[ $c ] < $threshold )
            {
                # Measure the run of low scores starting at $c.
                $j = $c + 1;

                $j++ while $j < $total and $lines->[ $j ] < $threshold;

                if ( $j - $c > $gap )
                {
                    # The run is too long to bridge: the block ends before it
                    # and scanning resumes after it.
                    $add_block->( $i, $c );

                    $i = $j;

                    next SCAN;
                }

                $c = $j;   # short run - absorb it into the current block
            }
            else
            {
                $c++;
            }
        }

        # Ran out of scores while still inside a block.
        $add_block->( $i, $c );

        $i = $c;
    }

    # ---- stage 2: group blocks into super blocks ----

    $i = 0;

    while ( $i < @blocks )
    {
        $c = $i + 1;

        while ( $c < @blocks and $blocks[ $c ]->{ "CHR_BEG" } - $blocks[ $c - 1 ]->{ "CHR_END" } <= $dist )
        {
            $c++;
        }

        push @super_blocks, [ @blocks[ $i .. $c - 1 ] ];

        $i = $c;
    }

    # ---- stage 3: one record per super block ----

    foreach $super_block ( @super_blocks )
    {
        $first = $super_block->[ 0 ];
        $last  = $super_block->[ -1 ];

        @begs = map { $_->{ "CHR_BEG" } - $first->{ "CHR_BEG" } } @{ $super_block };

        # NOTE(review): every block size except the last is reported as
        # length - 1. This is kept exactly as it was - confirm that it is intended.
        @lens = map { $_->{ "CHR_LEN" } - 1 } @{ $super_block };

        $lens[ -1 ]++;

        push @entries, {
            CHR        => $first->{ "CHR" },
            CHR_BEG    => $first->{ "CHR_BEG" },
            CHR_END    => $last->{ "CHR_END" },
            Q_ID       => "Q_ID",
            SCORE      => 100,
            STRAND     => "+",
            THICK_BEG  => $first->{ "CHR_BEG" },
            THICK_END  => $last->{ "CHR_END" } + 1,
            ITEMRGB    => "0,200,100",
            BLOCKCOUNT => scalar @{ $super_block },
            BLOCKSIZES => join( ",", @lens ),
            Q_BEGS     => join( ",", @begs ),
        };
    }

    return wantarray ? @entries : \@entries;
}
1401
1402
sub phastcons_normalize
{
    # Martin A. Hansen, January 2008.

    # Brings all lists of PhastCons scores in an AoA to a common
    # length. The target length is the integer midpoint between the
    # shortest and the longest list; shorter lists are inflated and
    # longer lists are deflated. The lists are modified in place.

    my ( $AoA,    # AoA with PhastCons scores
       ) = @_;

    # Returns AoA.

    my ( $shortest, $longest ) = ( 99999999, 0 );

    foreach my $row ( @{ $AoA } )
    {
        my $size = @{ $row };

        $shortest = $size if $size < $shortest;
        $longest  = $size if $size > $longest;
    }

    my $target = int( ( $shortest + $longest ) / 2 );

    foreach my $row ( @{ $AoA } )
    {
        my $size   = @{ $row };
        my $excess = $size - $target;

        if ( $excess < 0 ) {
            phastcons_list_inflate( $row, abs( $excess ) );
        } elsif ( $excess > 0 ) {
            phastcons_list_deflate( $row, $excess );
        }
    }

    return wantarray ? @{ $AoA } : $AoA;
}
1441
1442
sub phastcons_list_inflate
{
    # Martin A. Hansen, January 2008.

    # Inflates a list with a given number of elements
    # in such a way that the extra elements are introduced
    # evenly over the entire length of the list. The value
    # of each extra element is the mean of the two elements
    # it is inserted between. The list is modified in place.

    my ( $list,   # list of elements
         $diff,   # number of elements to introduce
       ) = @_;

    # Returns nothing. Dies if the resulting length is not as expected.

    my ( $len, $space, $i, $pos, $last, $left, $right, $value );

    return if not $diff;   # nothing to introduce (and no division by zero)

    $len = scalar @{ $list };

    $space = $len / $diff;   # original elements per inserted element

    for ( $i = 0; $i < $diff; $i++ )
    {
        # Index in the growing list: the $i elements inserted so far have
        # shifted everything behind them, so they are added to the offset.
        $pos = int( ( $space / 2 ) + $i * $space ) + $i;

        $last = $#{ $list };

        if ( $last < 0 )
        {
            $value = 0;   # empty list - there are no neighbours to average
        }
        else
        {
            # The new element ends up between the elements currently at
            # $pos - 1 and $pos; at either end of the list the only
            # existing neighbour is used for both.
            $right = $pos <= $last ? $pos : $last;
            $left  = $pos > 0 ? $pos - 1 : $right;

            $value = ( $list->[ $left ] + $list->[ $right ] ) / 2;
        }

        splice @{ $list }, $pos, 0, $value;
    }

    die qq(ERROR: bad inflate\n) if scalar @{ $list } != $len + $diff;

    return;
}
1475
1476
sub phastcons_list_deflate
{
    # Martin A. Hansen, January 2008.

    # Deflates a list by removing a given number of elements
    # evenly distributed over the entire list. The list is
    # modified in place.

    my ( $list,   # list of elements
         $diff,   # number of elements to remove
       ) = @_;

    # Returns nothing. Dies if the resulting length is not as expected.

    my ( $len, $space, $i, $pos );

    return if not $diff;   # nothing to remove (and no division by zero)

    $len = scalar @{ $list };

    # Number of elements kept per element removed.
    $space = ( $len - $diff ) / $diff;

    for ( $i = 0; $i < $diff; $i++ )
    {
        # Index in the shrinking list; earlier removals have already moved
        # the remaining elements up, so no further correction is needed.
        $pos = int( ( $space / 2 ) + $i * $space );

        splice @{ $list }, $pos, 1;
    }

    die qq(ERROR: bad deflate\n) if scalar @{ $list } != $len - $diff;

    return;
}
1505
1506
sub phastcons_mean
{
    # Martin A. Hansen, January 2008.

    # Takes a normalized PhastCons matrix as an AoA, flips it with
    # Maasha::Matrix::matrix_flip and collects the result of
    # Maasha::Calc::mean for each list of the flipped matrix.

    my ( $AoA,    # AoA with normalized PhastCons scores
       ) = @_;

    # Returns a list (a list reference in scalar context)

    my ( $flipped, $column, @means );

    $flipped = Maasha::Matrix::matrix_flip( $AoA );

    foreach $column ( @{ $flipped } ) {
        push @means, Maasha::Calc::mean( $column );
    }

    return wantarray ? @means : \@means;
}
1527
1528
sub phastcons_median
{
    # Martin A. Hansen, January 2008.

    # Takes a normalized PhastCons matrix as an AoA, flips it with
    # Maasha::Matrix::matrix_flip and collects the result of
    # Maasha::Calc::median for each list of the flipped matrix.

    my ( $AoA,    # AoA with normalized PhastCons scores
       ) = @_;

    # Returns a list (a list reference in scalar context)

    my ( $flipped, $column, @medians );

    $flipped = Maasha::Matrix::matrix_flip( $AoA );

    foreach $column ( @{ $flipped } ) {
        push @medians, Maasha::Calc::median( $column );
    }

    return wantarray ? @medians : \@medians;
}
1549
1550
1551 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> MULTIPLE ALIGNMENT FILES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1552
1553
sub maf_extract
{
    # Martin A. Hansen, April 2008.

    # Runs mafFrag to extract a subalignment from a multiz track
    # in the UCSC genome browser database. mafFrag writes to a
    # temporary MAF file, which is parsed and then deleted.

    my ( $tmp_dir,    # temporary directory
         $database,   # genome database
         $table,      # table with the multiz track
         $chr,        # chromosome
         $beg,        # begin position
         $end,        # end position
         $strand,     # strand
       ) = @_;

    # Returns a list of records (a list reference in scalar context)

    my $maf_path  = "$tmp_dir/maf_extract.maf";
    my $arguments = join " ", $database, $table, $chr, $beg, $end, $strand, $maf_path;

    Maasha::Common::run( "mafFrag", $arguments );

    my $alignment = maf_parse( $maf_path );

    unlink $maf_path;

    return wantarray ? @{ $alignment } : $alignment;
}
1584
1585
sub maf_parse
{
    # Martin A. Hansen, April 2008.

    # Reads a MAF file and creates one record for every line that
    # begins with an "s". Lines are split on single spaces; the second
    # field becomes SEQ_NAME and the last field becomes SEQ.

    my ( $path,   # full path to MAF file
       ) = @_;

    # Returns a list of records (a list reference in scalar context).

    my ( $fh, $row, @cols, $seq, @records );

    $fh = Maasha::Common::read_open( $path );

    while ( $row = <$fh> )
    {
        chomp $row;

        next unless $row =~ /^s/;

        @cols = split / /, $row;

        $seq = $cols[ -1 ];

        push @records, {
            SEQ_NAME  => $cols[ 1 ],
            SEQ       => $seq,
            ALIGN     => 1,
            ALIGN_LEN => length( $seq ),
        };
    }

    close $fh;

    return wantarray ? @records : \@records;
}
1621
1622
1623 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> WIGGLE FORMAT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1624
1625
sub fixedstep_put_entry
{
    # Martin A. Hansen, April 2008.

    # Outputs a block of fixedStep values.
    # Used for outputting wiggle data.
    #
    # A declaration line "fixedStep chrom=<chr> start=<beg> step=1" is
    # written first, followed by one line per score. Every score has 1
    # added to it before output; the result is printed as an integer, or
    # as the log10 of it if the log10 flag is set.
    #
    # NOTE(review): the + 1 is also applied when log10 is not requested -
    # confirm that this is intended.

    my ( $chr,      # chromosome
         $beg,      # start position (0-based)
         $block,    # list of scores
         $fh,       # filehandle - OPTIONAL
         $log10,    # flag indicating that log10 scores should be used
       ) = @_;

    # Returns nothing.

    my ( $score );

    $beg += 1;   # fixedStep format is 1 based.

    $fh ||= \*STDOUT;

    print {$fh} "fixedStep chrom=$chr start=$beg step=1\n";

    if ( $log10 )
    {
        foreach $score ( @{ $block } ) {
            printf {$fh} "%f\n", Maasha::Calc::log10( $score + 1 );
        }
    }
    else
    {
        foreach $score ( @{ $block } ) {
            printf {$fh} "%d\n", $score + 1;
        }
    }

    return;
}
1654
1655
sub wiggle_upload_to_ucsc
{
    # Martin A. Hansen, May 2008.

    # Loads a wiggle file into the UCSC database by running
    # hgLoadWiggle through the shell from within the temporary directory.
    # Unless the 'verbose' option is set, all output from the command is
    # sent to /dev/null.

    my ( $tmp_dir,    # temporary directory
         $wib_dir,    # wib directory
         $wig_file,   # file to upload,
         $options,    # argument hashref with keys: database, table, verbose
       ) = @_;

    # Returns nothing.

    my ( $command );

    $command = "cd $tmp_dir && hgLoadWiggle -tmpDir=$tmp_dir -pathPrefix=$wib_dir $options->{ 'database' } $options->{ 'table' } $wig_file";

    $command .= " > /dev/null 2>&1" unless $options->{ 'verbose' };

    # The exit status of the command is not inspected.
    qx{$command};
}
1682
1683
1684 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> MySQL CONF <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1685
1686
sub ucsc_get_user
{
    # Martin A. Hansen, May 2008

    # Looks up the MySQL database user name in the .hg.conf file
    # in the user's home directory: the value of the first line
    # that begins with "db.user=".

    # Returns a string, or undef if no such line exists.

    my ( $conf, $row, $user );

    $conf = Maasha::Common::read_open( "$ENV{ 'HOME' }/.hg.conf" );

    while ( $row = <$conf> )
    {
        chomp $row;

        next unless $row =~ /^db\.user=(.+)/;

        $user = $1;

        last;
    }

    close $conf;

    return $user;
}
1716
1717
sub ucsc_get_password
{
    # Martin A. Hansen, May 2008

    # Looks up the MySQL database password in the .hg.conf file
    # in the user's home directory: the value of the first line
    # that begins with "db.password=".

    # Returns a string, or undef if no such line exists.

    my ( $conf, $row, $password );

    $conf = Maasha::Common::read_open( "$ENV{ 'HOME' }/.hg.conf" );

    while ( $row = <$conf> )
    {
        chomp $row;

        next unless $row =~ /^db\.password=(.+)/;

        $password = $1;

        last;
    }

    close $conf;

    return $password;
}
1747
1748
1749 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1750
1751
1752 __END__
1753