code_perl/Maasha/UCSC.pm

   1 package Maasha::UCSC;
   2
   3 # Copyright (C) 2007 Martin A. Hansen.
   4
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; either version 2
   8 # of the License, or (at your option) any later version.
   9
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18
  19 # http://www.gnu.org/copyleft/gpl.html
  20
  21
  22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  23
  24
  25 # Stuff for interacting with UCSC genome browser
  26
  27
  28 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  29
  30
  31 use strict;
  32 use vars qw ( @ISA @EXPORT );
  33
  34 use Data::Dumper;
  35 use Maasha::Common;
  36 use Maasha::Filesys;
  37 use Maasha::Calc;
  38 use Maasha::Matrix;
  39
  40 use constant {
  41     FS_CHR_BEG      => 0,
  42     FS_NEXT_CHR_BEG => 1,
  43     FS_CHR_END      => 2,
  44     FS_INDEX_BEG    => 3,
  45     FS_INDEX_LEN    => 4,
  46
  47     CHR          => 0,
  48     CHR_BEG      => 1,
  49     CHR_END      => 2,
  50     Q_ID         => 3,
  51     SCORE        => 4,
  52     STRAND       => 5,
  53     THICK_BEG    => 6,
  54     THICK_END    => 7,
  55     ITEMRGB      => 8,
  56     BLOCKCOUNT   => 9,
  57     BLOCKSIZES   => 10,
  58     Q_BEGS       => 11,
  59 };
  60
  61 @ISA = qw( Exporter );
  62
  63
  64 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> BED format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  65
  66
  67 # http://genome.ucsc.edu/goldenPath/help/hgTracksHelp.html#BED
  68
  69
  70 sub bed_entry_get_array
  71 {
  72     # Martin A. Hansen, September 2008.
  73
  74     # Reads a BED entry given a filehandle.
  75
  76     # This is a new _faster_ BED entry parser that
  77     # uses arrays and not hashrefs.
  78
  79     # IMPORTANT! This function does not correct the
  80     # BED_END position that is kept the way UCSC
  81     # does.
  82
  83     my ( $fh,     # file handle
  84          $cols,   # columns to read - OPTIONAL (3,4,5,6, or 12)
  85        ) = @_;
  86
  87     # Returns a list.
  88
  89     my ( $line, @entry );
  90
  91     $line = <$fh>;
  92
  93     $line =~ tr/\n\r//d;    # some people have carriage returns in their BED files -> Grrrr
  94
  95     return if not defined $line;
  96
  97     if ( not defined $cols ) {
  98         $cols = 1 + $line =~ tr/\t//;
  99     }
 100
 101     @entry = split "\t", $line, $cols + 1;
 102
 103     pop @entry if scalar @entry > $cols;
 104
 105     return wantarray ? @entry : \@entry;
 106 }
 107
 108
 109 sub bed_get_entry
 110 {
 111     # Martin A. Hansen, December 2007.
 112
 113     # Reads a bed entry given a filehandle.
 114
 115     my ( $fh,        # file handle
 116          $columns,   # number of BED columns to read  -  OPTIONAL
 117        ) = @_;
 118
 119     # Returns hashref.
 120
 121     my ( $line, @fields, %entry );
 122
 123     $line = <$fh>;
 124
 125     $line =~ tr/\n\r//d;    # some people have carriage returns in their BED files -> Grrrr
 126
 127     return if not defined $line;
 128
 129     @fields = split "\t", $line;
 130
 131     $columns ||= scalar @fields;
 132
 133     if ( $columns == 3 )
 134     {
 135         %entry = (
 136             "CHR"      => $fields[ 0 ],
 137             "CHR_BEG"  => $fields[ 1 ],
 138             "CHR_END"  => $fields[ 2 ] - 1,
 139         );
 140     }
 141     elsif ( $columns == 4 )
 142     {
 143         %entry = (
 144             "CHR"      => $fields[ 0 ],
 145             "CHR_BEG"  => $fields[ 1 ],
 146             "CHR_END"  => $fields[ 2 ] - 1,
 147             "Q_ID"     => $fields[ 3 ],
 148         );
 149     }
 150     elsif ( $columns == 5 )
 151     {
 152         %entry = (
 153             "CHR"      => $fields[ 0 ],
 154             "CHR_BEG"  => $fields[ 1 ],
 155             "CHR_END"  => $fields[ 2 ] - 1,
 156             "Q_ID"     => $fields[ 3 ],
 157             "SCORE"    => $fields[ 4 ],
 158         );
 159     }
 160     elsif ( $columns == 6 )
 161     {
 162         %entry = (
 163             "CHR"      => $fields[ 0 ],
 164             "CHR_BEG"  => $fields[ 1 ],
 165             "CHR_END"  => $fields[ 2 ] - 1,
 166             "Q_ID"     => $fields[ 3 ],
 167             "SCORE"    => $fields[ 4 ],
 168             "STRAND"   => $fields[ 5 ],
 169         );
 170     }
 171     elsif ( $columns == 12 )
 172     {
 173         %entry = (
 174             "CHR"        => $fields[ 0 ],
 175             "CHR_BEG"    => $fields[ 1 ],
 176             "CHR_END"    => $fields[ 2 ] - 1,
 177             "Q_ID"       => $fields[ 3 ],
 178             "SCORE"      => $fields[ 4 ],
 179             "STRAND"     => $fields[ 5 ],
 180             "THICK_BEG"  => $fields[ 6 ],
 181             "THICK_END"  => $fields[ 7 ] - 1,
 182             "ITEMRGB"    => $fields[ 8 ],
 183             "BLOCKCOUNT" => $fields[ 9 ],
 184             "BLOCKSIZES" => $fields[ 10 ],
 185             "Q_BEGS"     => $fields[ 11 ],
 186         );
 187     }
 188     else
 189     {
 190         Maasha::Common::error( qq(Bad BED format in line->$line<-) );
 191     }
 192
 193     $entry{ "REC_TYPE" } = "BED";
 194     $entry{ "BED_LEN" }  = $entry{ "CHR_END" } - $entry{ "CHR_BEG" } + 1;
 195     $entry{ "BED_COLS" } = $columns;
 196
 197     return wantarray ? %entry : \%entry;
 198 }
 199
 200
 201 sub bed_get_entries
 202 {
 203     # Martin A. Hansen, January 2008.
 204
 205     # Given a path to a BED file, read in all entries
 206     # and return.
 207
 208     my ( $path,     # full path to BED file
 209          $columns,  # number of columns in BED file - OPTIONAL (but is faster)
 210        ) = @_;
 211
 212     # Returns a list.
 213
 214     my ( $fh, $entry, @list );
 215
 216     $fh = Maasha::Common::read_open( $path );
 217
 218     while ( $entry = bed_get_entry( $fh ) ) {
 219         push @list, $entry;
 220     }
 221
 222     close $fh;
 223
 224     return wantarray ? @list : \@list;
 225 }
 226
 227
 228 sub bed_entry_put_array
 229 {
 230     # Martin A. Hansen, Septermber 2008.
 231
 232     # Writes a BED entry array to file.
 233
 234     # IMPORTANT! This function does not correct the
 235     # BED_END position that is assumed to be in the
 236     # UCSC positions scheme.
 237
 238     my ( $record,   # list
 239          $fh,       # file handle                   - OPTIONAL
 240          $cols,     # number of columns in BED file - OPTIONAL
 241        ) = @_;
 242
 243     # Returns nothing.
 244
 245     $fh = \*STDOUT if not defined $fh;
 246
 247     if ( defined $cols ) {
 248         print $fh join( "\t", @{ $record }[ 0 .. $cols - 1 ] ), "\n";
 249     } else {
 250         print $fh join( "\t", @{ $record } ), "\n";
 251     }
 252 }
 253
 254
 255 sub bed_put_entry
 256 {
 257     # Martin A. Hansen, Septermber 2007.
 258
 259     # Writes a BED entry to file.
 260
 261     # NB, this could really be more robust!?
 262
 263     my ( $record,       # hashref
 264          $fh,           # file handle                   - OPTIONAL
 265          $columns,      # number of columns in BED file - OPTIONAL (but is faster)
 266        ) = @_;
 267
 268     # Returns nothing.
 269
 270     my ( @fields );
 271
 272     $columns ||= 12;   # max number of columns possible
 273
 274     if ( $columns == 3 )
 275     {
 276         push @fields, $record->{ "CHR" };
 277         push @fields, $record->{ "CHR_BEG" };
 278         push @fields, $record->{ "CHR_END" } + 1;
 279     }
 280     elsif ( $columns == 4 )
 281     {
 282         $record->{ "Q_ID" }  =~ s/\s+/_/g;
 283
 284         push @fields, $record->{ "CHR" };
 285         push @fields, $record->{ "CHR_BEG" };
 286         push @fields, $record->{ "CHR_END" } + 1;
 287         push @fields, $record->{ "Q_ID" };
 288     }
 289     elsif ( $columns == 5 )
 290     {
 291         $record->{ "Q_ID" }  =~ s/\s+/_/g;
 292         $record->{ "SCORE" } =~ s/\.\d*//;
 293
 294         push @fields, $record->{ "CHR" };
 295         push @fields, $record->{ "CHR_BEG" };
 296         push @fields, $record->{ "CHR_END" } + 1;
 297         push @fields, $record->{ "Q_ID" };
 298         push @fields, $record->{ "SCORE" };
 299     }
 300     elsif ( $columns == 6 )
 301     {
 302         $record->{ "Q_ID" }  =~ s/\s+/_/g;
 303         $record->{ "SCORE" } =~ s/\.\d*//;
 304
 305         push @fields, $record->{ "CHR" };
 306         push @fields, $record->{ "CHR_BEG" };
 307         push @fields, $record->{ "CHR_END" } + 1;
 308         push @fields, $record->{ "Q_ID" };
 309         push @fields, $record->{ "SCORE" };
 310         push @fields, $record->{ "STRAND" };
 311     }
 312     else
 313     {
 314         $record->{ "Q_ID" }  =~ s/\s+/_/g;
 315         $record->{ "SCORE" } =~ s/\.\d*//;
 316
 317         push @fields, $record->{ "CHR" };
 318         push @fields, $record->{ "CHR_BEG" };
 319         push @fields, $record->{ "CHR_END" } + 1;
 320         push @fields, $record->{ "Q_ID" };
 321         push @fields, $record->{ "SCORE" };
 322         push @fields, $record->{ "STRAND" };
 323         push @fields, $record->{ "THICK_BEG" }     if defined $record->{ "THICK_BEG" };
 324         push @fields, $record->{ "THICK_END" } + 1 if defined $record->{ "THICK_END" };
 325         push @fields, $record->{ "ITEMRGB" }       if defined $record->{ "ITEMRGB" };
 326         push @fields, $record->{ "BLOCKCOUNT" }    if defined $record->{ "BLOCKCOUNT" };
 327         push @fields, $record->{ "BLOCKSIZES" }    if defined $record->{ "BLOCKSIZES" };
 328         push @fields, $record->{ "Q_BEGS" }        if defined $record->{ "Q_BEGS" };
 329     }
 330
 331     if ( $fh ) {
 332         print $fh join( "\t", @fields ), "\n";
 333     } else {
 334         print join( "\t", @fields ), "\n";
 335     }
 336 }
 337
 338
 339 sub bed_put_entries
 340 {
 341     # Martin A. Hansen, January 2008.
 342
 343     # Write a list of BED entries.
 344
 345     my ( $entries,   # list of entries,
 346          $fh,        # file handle - OPTIONAL
 347        ) = @_;
 348
 349     # Returns nothing.
 350
 351     map { bed_put_entry( $_, $fh ) } @{ $entries };
 352 }
 353
 354
 355 sub bed_analyze
 356 {
 357     # Martin A. Hansen, March 2008.
 358
 359     # Given a bed record, analysis this to give information
 360     # about intron/exon sizes.
 361
 362     my ( $entry,   # BED entry
 363        ) = @_;
 364
 365     # Returns hashref.
 366
 367     my ( $i, @begs, @lens, $exon_max, $exon_min, $exon_len, $exon_tot, $intron_max, $intron_min, $intron_len, $intron_tot );
 368
 369     $exon_max   = 0;
 370     $exon_min   = 9999999999;
 371     $intron_max = 0;
 372     $intron_min = 9999999999;
 373
 374     $entry->{ "EXONS" }   = $entry->{ "BLOCKCOUNT" };
 375
 376     @begs = split /,/, $entry->{ "Q_BEGS" };
 377     @lens = split /,/, $entry->{ "BLOCKSIZES" };
 378
 379     for ( $i = 0; $i < $entry->{ "BLOCKCOUNT" }; $i++ )
 380     {
 381         $exon_len = @lens[ $i ];
 382
 383         $entry->{ "EXON_LEN_$i" } = $exon_len;
 384
 385         $exon_max = $exon_len if $exon_len > $exon_max;
 386         $exon_min = $exon_len if $exon_len < $exon_min;
 387
 388         $exon_tot += $exon_len;
 389     }
 390
 391     $entry->{ "EXON_LEN_-1" }   = $exon_len;
 392     $entry->{ "EXON_MAX_LEN" }  = $exon_max;
 393     $entry->{ "EXON_MIN_LEN" }  = $exon_min;
 394     $entry->{ "EXON_MEAN_LEN" } = int( $exon_tot / $entry->{ "EXONS" } );
 395
 396     $entry->{ "INTRONS" } = $entry->{ "BLOCKCOUNT" } - 1;
 397     $entry->{ "INTRONS" } = 0 if $entry->{ "INTRONS" } < 0;
 398
 399     if ( $entry->{ "INTRONS" } )
 400     {
 401         for ( $i = 1; $i < $entry->{ "BLOCKCOUNT" }; $i++ )
 402         {
 403             $intron_len = @begs[ $i ] - ( @begs[ $i - 1 ] + @lens[ $i - 1 ] );
 404
 405             $entry->{ "INTRON_LEN_" . ( $i - 1 ) } = $intron_len;
 406
 407             $intron_max = $intron_len if $intron_len > $intron_max;
 408             $intron_min = $intron_len if $intron_len < $intron_min;
 409
 410             $intron_tot += $intron_len;
 411         }
 412
 413         $entry->{ "INTRON_LEN_-1" }   = $intron_len;
 414         $entry->{ "INTRON_MAX_LEN" }  = $intron_max;
 415         $entry->{ "INTRON_MIN_LEN" }  = $intron_min;
 416         $entry->{ "INTRON_MEAN_LEN" } = int( $intron_tot / $entry->{ "INTRONS" } );
 417     }
 418
 419     return wantarray ? %{ $entry } : $entry;
 420 }
 421
 422
 423 sub bed_sort
 424 {
 425     # Martin A. Hansen, Septermber 2008
 426
 427     # Sorts a BED file using the c program
 428     # "bed_sort" specifing a sort mode:
 429
 430     # 1: chr AND chr_beg.
 431     # 2: chr AND strand AND chr_beg.
 432     # 3: chr_beg.
 433     # 4: strand AND chr_beg.
 434
 435     my ( $bed_file,    # BED file to sort
 436          $sort_mode,   # See above.
 437          $cols,        # Number of columns in BED file
 438        ) = @_;
 439
 440     &Maasha::Common::run( "bed_sort", "--sort $sort_mode --cols $cols $bed_file" );
 441 }
 442
 443
 444 sub bed_split_to_files
 445 {
 446     # Martin A. Hansen, Septermber 2008
 447
 448     # Given a list of BED files, split these
 449     # into temporary files based on the chromosome
 450     # name. Returns a list of the temporary files.
 451
 452     my ( $bed_files,   # list of BED files to split
 453          $cols,        # number of columns
 454          $tmp_dir,     # temporary directory
 455        ) = @_;
 456
 457     # Returns a list.
 458
 459     my ( $bed_file, $fh_in, $entry, $key, %fh_hash, @tmp_files );
 460
 461     foreach $bed_file ( @{ $bed_files } )
 462     {
 463         $fh_in = Maasha::Common::read_open( $bed_file );
 464
 465         while ( $entry = bed_entry_get_array( $fh_in, $cols ) )
 466         {
 467             $key = $entry->[ CHR ];
 468
 469             $fh_hash{ $key } = Maasha::Common::write_open( "$tmp_dir/$key.temp" ) if not exists $fh_hash{ $key };
 470
 471             bed_entry_put_array( $entry, $fh_hash{ $key } );
 472         }
 473
 474         close $fh_in;
 475     }
 476
 477     foreach $key ( sort keys %fh_hash )
 478     {
 479         push @tmp_files, "$tmp_dir/$key.temp";
 480
 481         close $fh_hash{ $key };
 482     }
 483
 484     return wantarray ? @tmp_files : \@tmp_files;
 485 }
 486
 487
 488 sub bed_merge_entries
 489 {
 490     # Martin A. Hansen, February 2008.
 491
 492     # Merge a list of given BED entries in one big entry.
 493
 494     my ( $entries,     # list of BED entries to be merged
 495        ) = @_;
 496
 497     # Returns hash.
 498
 499     my ( $i, @q_ids, @q_begs, @blocksizes, @new_q_begs, @new_blocksizes, %new_entry );
 500
 501     @{ $entries } = sort { $a->{ "CHR_BEG" } <=> $b->{ "CHR_BEG" } } @{ $entries };
 502
 503     for ( $i = 0; $i < @{ $entries }; $i++ )
 504     {
 505         Maasha::Common::error( qq(Attempted merge of BED entries from different chromosomes) ) if $entries->[ 0 ]->{ "CHR" }    ne $entries->[ $i ]->{ "CHR" };
 506         Maasha::Common::error( qq(Attempted merge of BED entries from different strands) )     if $entries->[ 0 ]->{ "STRAND" } ne $entries->[ $i ]->{ "STRAND" };
 507
 508         push @q_ids, $entries->[ $i ]->{ "Q_ID" } || sprintf( "ID%06d", $i );
 509
 510         if ( exists $entries->[ $i ]->{ "Q_BEGS" } )
 511         {
 512             @q_begs     = split ",", $entries->[ $i ]->{ "Q_BEGS" };
 513             @blocksizes = split ",", $entries->[ $i ]->{ "BLOCKSIZES" };
 514         }
 515         else
 516         {
 517             @q_begs     = 0;
 518             @blocksizes = $entries->[ $i ]->{ "CHR_END" } - $entries->[ $i ]->{ "CHR_BEG" } + 1;
 519         }
 520
 521         map { $_ += $entries->[ $i ]->{ "CHR_BEG" } } @q_begs;
 522
 523         push @new_q_begs, @q_begs;
 524         push @new_blocksizes, @blocksizes;
 525     }
 526
 527     map { $_ -= $entries->[ 0 ]->{ "CHR_BEG" } } @new_q_begs;
 528
 529     %new_entry = (
 530         CHR         => $entries->[ 0 ]->{ "CHR" },
 531         CHR_BEG     => $entries->[ 0 ]->{ "CHR_BEG" },
 532         CHR_END     => $entries->[ -1 ]->{ "CHR_END" },
 533         REC_TYPE    => "BED",
 534         BED_LEN     => $entries->[ -1 ]->{ "CHR_END" } - $entries->[ 0 ]->{ "CHR_BEG" } + 1,
 535         BED_COLS    => 12,
 536         Q_ID        => join( ":", @q_ids ),
 537         SCORE       => 999,
 538         STRAND      => $entries->[ 0 ]->{ "STRAND" }     || "+",
 539         THICK_BEG   => $entries->[ 0 ]->{ "THICK_BEG" }  || $entries->[ 0 ]->{ "CHR_BEG" },
 540         THICK_END   => $entries->[ -1 ]->{ "THICK_END" } || $entries->[ -1 ]->{ "CHR_END" },
 541         ITEMRGB     => "0,0,0",
 542         BLOCKCOUNT  => scalar @new_q_begs,
 543         BLOCKSIZES  => join( ",", @new_blocksizes ),
 544         Q_BEGS      => join( ",", @new_q_begs ),
 545     );
 546
 547     return wantarray ? %new_entry : \%new_entry;
 548 }
 549
 550
 551 sub bed_split_entry
 552 {
 553     # Martin A. Hansen, February 2008.
 554
 555     # Splits a given BED entry into a list of blocks,
 556     # which are returned. A list of 6 column BED entry is returned.
 557
 558     my ( $entry,    # BED entry hashref
 559        ) = @_;
 560
 561     # Returns a list.
 562
 563     my ( @q_begs, @blocksizes, $block, @blocks, $i );
 564
 565     if ( exists $entry->{ "BLOCKCOUNT" } )
 566     {
 567         @q_begs     = split ",", $entry->{ "Q_BEGS" };
 568         @blocksizes = split ",", $entry->{ "BLOCKSIZES" };
 569
 570         for ( $i = 0; $i < @q_begs; $i++ )
 571         {
 572             undef $block;
 573
 574             $block->{ "CHR" }      = $entry->{ "CHR" };
 575             $block->{ "CHR_BEG" }  = $entry->{ "CHR_BEG" } + $q_begs[ $i ];
 576             $block->{ "CHR_END" }  = $entry->{ "CHR_BEG" } + $q_begs[ $i ] + $blocksizes[ $i ] - 1;
 577             $block->{ "Q_ID" }     = $entry->{ "Q_ID" } . sprintf( "_%03d", $i );
 578             $block->{ "SCORE" }    = $entry->{ "SCORE" };
 579             $block->{ "STRAND" }   = $entry->{ "STRAND" };
 580             $block->{ "BED_LEN" }  = $block->{ "CHR_END" } - $block->{ "CHR_BEG" } + 1,
 581             $block->{ "BED_COLS" } = 6;
 582             $block->{ "REC_TYPE" } = "BED";
 583
 584             push @blocks, $block;
 585         }
 586     }
 587     else
 588     {
 589         @blocks = @{ $entry };
 590     }
 591
 592     return wantarray ? @blocks : \@blocks;
 593 }
 594
 595
 596
 597 sub bed_overlap
 598 {
 599     # Martin A. Hansen, February 2008.
 600
 601     # Checks if two BED entries overlap and
 602     # return 1 if so - else 0;
 603
 604     my ( $entry1,      # hashref
 605          $entry2,      # hashref
 606          $no_strand,   # don't check strand flag - OPTIONAL
 607        ) = @_;
 608
 609     # Return bolean.
 610
 611     return 0 if $entry1->{ "CHR" }    ne $entry2->{ "CHR" };
 612     return 0 if $entry1->{ "STRAND" } ne $entry2->{ "STRAND" };
 613
 614     if ( $entry1->{ "CHR_END" } < $entry2->{ "CHR_BEG" } or $entry1->{ "CHR_BEG" } > $entry2->{ "CHR_END" } ) {
 615         return 0;
 616     } else {
 617         return 1;
 618     }
 619 }
 620
 621
 622 sub bed_upload_to_ucsc
 623 {
 624     # Martin A. Hansen, September 2007.
 625
 626     # Upload a BED file to the UCSC database.
 627
 628     my ( $tmp_dir,   # temporary directory
 629          $file,      # file to upload,
 630          $options,   # argument hashref
 631          $append,    # flag indicating table should be appended
 632        ) = @_;
 633
 634     # Returns nothing.
 635
 636     my ( $args, $table, $sql_file, $fh_out, $fh_in );
 637
 638     if ( $append ) {
 639         $args = join " ", $options->{ "database" }, $options->{ "table" }, "-tmpDir=$tmp_dir", "-oldTable", $file;
 640     } else {
 641         $args = join " ", $options->{ "database" }, $options->{ "table" }, "-tmpDir=$tmp_dir", $file;
 642     }
 643
 644     if ( $options->{ "table" } =~ /rnaSecStr/ )
 645     {
 646         $table = $options->{ "table" };
 647
 648         print qq(uploading secondary structure table:"$table"\n) if $options->{ "verbose" };
 649
 650         $sql_file = "$tmp_dir/upload_RNA_SS.sql";
 651
 652         $fh_out   = Maasha::Common::write_open( $sql_file );
 653
 654         print $fh_out qq(
 655 CREATE TABLE $table (
 656     bin smallint not null,              # Bin number for browser speedup
 657     chrom varchar(255) not null,        # Chromosome or FPC contig
 658     chromStart int unsigned not null,   # Start position in chromosome
 659     chromEnd int unsigned not null,     # End position in chromosome
 660     name varchar(255) not null,         # Name of item
 661     score int unsigned not null,        # Score from 0-1000
 662     strand char(1) not null,            # + or -
 663     size int unsigned not null,         # Size of element.
 664     secStr longblob not null,           # Parentheses and '.'s which define the secondary structure
 665     conf longblob not null,             # Confidence of secondary-structure annotation per position (0.0-1.0).
 666     #Indices
 667     INDEX(name(16)),
 668     INDEX(chrom(8), bin),
 669     INDEX(chrom(8), chromStart)
 670 );
 671         );
 672
 673         close $fh_out;
 674
 675         Maasha::Common::run( "hgLoadBed", "-notItemRgb -sqlTable=$sql_file $options->{ 'database' } $options->{ 'table' } -tmpDir=$tmp_dir $file > /dev/null 2>&1" );
 676
 677         unlink $sql_file;
 678     }
 679     else
 680     {
 681         Maasha::Common::run( "hgLoadBed", "$args > /dev/null 2>&1" );
 682     }
 683 }
 684
 685
 686 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PSL format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 687
 688
 689 sub psl_get_entry
 690 {
 691     # Martin A. Hansen, August 2008.
 692
 693     # Reads PSL next entry from a PSL file and returns a record.
 694
 695     my ( $fh,   # file handle of PSL filefull path to PSL file
 696        ) = @_;
 697
 698     # Returns hashref.
 699
 700     my ( $line, @fields, %record );
 701
 702     while ( $line = <$fh> )
 703     {
 704         chomp $line;
 705
 706         @fields = split "\t", $line;
 707
 708         if ( scalar @fields == 21 )
 709         {
 710             %record = (
 711                 REC_TYPE    => "PSL",
 712                 MATCHES     => $fields[ 0 ],
 713                 MISMATCHES  => $fields[ 1 ],
 714                 REPMATCHES  => $fields[ 2 ],
 715                 NCOUNT      => $fields[ 3 ],
 716                 QNUMINSERT  => $fields[ 4 ],
 717                 QBASEINSERT => $fields[ 5 ],
 718                 SNUMINSERT  => $fields[ 6 ],
 719                 SBASEINSERT => $fields[ 7 ],
 720                 STRAND      => $fields[ 8 ],
 721                 Q_ID        => $fields[ 9 ],
 722                 Q_LEN       => $fields[ 10 ],
 723                 Q_BEG       => $fields[ 11 ],
 724                 Q_END       => $fields[ 12 ] - 1,
 725                 S_ID        => $fields[ 13 ],
 726                 S_LEN       => $fields[ 14 ],
 727                 S_BEG       => $fields[ 15 ],
 728                 S_END       => $fields[ 16 ] - 1,
 729                 BLOCKCOUNT  => $fields[ 17 ],
 730                 BLOCKSIZES  => $fields[ 18 ],
 731                 Q_BEGS      => $fields[ 19 ],
 732                 S_BEGS      => $fields[ 20 ],
 733             );
 734
 735             $record{ "SCORE" } = $record{ "MATCHES" } + int( $record{ "REPMATCHES" } / 2 ) - $record{ "MISMATCHES" } - $record{ "QNUMINSERT" } - $record{ "SNUMINSERT" };
 736             $record{ "SPAN" }  = $fields[ 12 ] - $fields[ 11 ];
 737
 738             return wantarray ? %record : \%record;
 739         }
 740     }
 741
 742     return undef;
 743 }
 744
 745
 746 sub psl_get_entries
 747 {
 748     # Martin A. Hansen, February 2008.
 749
 750     # Reads PSL entries and returns a list of records.
 751
 752     my ( $path,   # full path to PSL file
 753        ) = @_;
 754
 755     # Returns hashref.
 756
 757     my ( $fh, @lines, @fields, $i, %record, @records );
 758
 759     $fh = Maasha::Common::read_open( $path );
 760
 761     @lines = <$fh>;
 762
 763     close $fh;
 764
 765     chomp @lines;
 766
 767     for ( $i = 5; $i < @lines; $i++ )
 768     {
 769         @fields = split "\t", $lines[ $i ];
 770
 771         Maasha::Common::error( qq(Bad PSL format in file "$path") ) if not @fields == 21;
 772
 773         undef %record;
 774
 775         %record = (
 776             REC_TYPE    => "PSL",
 777             MATCHES     => $fields[ 0 ],
 778             MISMATCHES  => $fields[ 1 ],
 779             REPMATCHES  => $fields[ 2 ],
 780             NCOUNT      => $fields[ 3 ],
 781             QNUMINSERT  => $fields[ 4 ],
 782             QBASEINSERT => $fields[ 5 ],
 783             SNUMINSERT  => $fields[ 6 ],
 784             SBASEINSERT => $fields[ 7 ],
 785             STRAND      => $fields[ 8 ],
 786             Q_ID        => $fields[ 9 ],
 787             Q_LEN       => $fields[ 10 ],
 788             Q_BEG       => $fields[ 11 ],
 789             Q_END       => $fields[ 12 ] - 1,
 790             S_ID        => $fields[ 13 ],
 791             S_LEN       => $fields[ 14 ],
 792             S_BEG       => $fields[ 15 ],
 793             S_END       => $fields[ 16 ] - 1,
 794             BLOCKCOUNT  => $fields[ 17 ],
 795             BLOCKSIZES  => $fields[ 18 ],
 796             Q_BEGS      => $fields[ 19 ],
 797             S_BEGS      => $fields[ 20 ],
 798         );
 799
 800         $record{ "SCORE" } = $record{ "MATCHES" } + int( $record{ "REPMATCHES" } / 2 ) - $record{ "MISMATCHES" } - $record{ "QNUMINSERT" } - $record{ "SNUMINSERT" };
 801
 802         push @records, { %record };
 803     }
 804
 805     return wantarray ? @records : \@records;
 806 }
 807
 808
 809 sub psl_put_header
 810 {
 811     # Martin A. Hansen, September 2007.
 812
 813     # Write a PSL header to file.
 814
 815     my ( $fh,  # file handle  - OPTIONAL
 816        ) = @_;
 817
 818     # Returns nothing.
 819
 820     $fh = \*STDOUT if not $fh;
 821
 822     print $fh qq(psLayout version 3
 823 match   mis-    rep.    N's     Q gap   Q gap   T gap   T gap   strand  Q               Q       Q       Q       T               T       T       T       block   blockSizes      qStart        match   match           count   bases   count   bases           name            size    start   end     name            size    start   end     count
 824 ---------------------------------------------------------------------------------------------------------------------------------------------------------------
 825 );
 826 }
 827
 828
 829 sub psl_put_entry
 830 {
 831     # Martin A. Hansen, September 2007.
 832
 833     # Write a PSL entry to file.
 834
 835     my ( $record,       # hashref
 836          $fh,           # file handle  -  OPTIONAL
 837        ) = @_;
 838
 839     # Returns nothing.
 840
 841     $fh = \*STDOUT if not $fh;
 842
 843     my @output;
 844
 845     push @output, $record->{ "MATCHES" };
 846     push @output, $record->{ "MISMATCHES" };
 847     push @output, $record->{ "REPMATCHES" };
 848     push @output, $record->{ "NCOUNT" };
 849     push @output, $record->{ "QNUMINSERT" };
 850     push @output, $record->{ "QBASEINSERT" };
 851     push @output, $record->{ "SNUMINSERT" };
 852     push @output, $record->{ "SBASEINSERT" };
 853     push @output, $record->{ "STRAND" };
 854     push @output, $record->{ "Q_ID" };
 855     push @output, $record->{ "Q_LEN" };
 856     push @output, $record->{ "Q_BEG" };
 857     push @output, $record->{ "Q_END" } + 1;
 858     push @output, $record->{ "S_ID" };
 859     push @output, $record->{ "S_LEN" };
 860     push @output, $record->{ "S_BEG" };
 861     push @output, $record->{ "S_END" } + 1;
 862     push @output, $record->{ "BLOCKCOUNT" };
 863     push @output, $record->{ "BLOCKSIZES" };
 864     push @output, $record->{ "Q_BEGS" };
 865     push @output, $record->{ "S_BEGS" };
 866
 867     print $fh join( "\t", @output ), "\n";
 868 }
 869
 870
 871 sub psl_upload_to_ucsc
 872 {
 873     # Martin A. Hansen, September 2007.
 874
 875     # Upload a PSL file to the UCSC database.
 876
 877     my ( $file,      # file to upload,
 878          $options,   # argument hashref
 879          $append,    # flag indicating table should be appended
 880        ) = @_;
 881
 882     # Returns nothing.
 883
 884     my ( $args );
 885
 886     if ( $append ) {
 887         $args = join " ", $options->{ "database" }, "-table=$options->{ 'table' }", "-clientLoad", "-append", $file;
 888     } else {
 889         $args = join " ", $options->{ "database" }, "-table=$options->{ 'table' }", "-clientLoad", $file;
 890     }
 891
 892     Maasha::Common::run( "hgLoadPsl", "$args > /dev/null 2>&1" );
 893 }
 894
 895
 896 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> TRACK FILE <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 897
 898
 899 sub ucsc_config_entry_get
 900 {
 901     # Martin A. Hansen, November 2008.
 902
 903     # Given a filehandle to a UCSC Genome Browser
 904     # config file (.ra) get the next entry and
 905     # return as a hash. Entries are separated by blank
 906     # lines. # lines are skipped unless it is the lines:
 907     # # Track added ...
 908     # # Database ...
 909
 910     my ( $fh,   # file hanlde
 911        ) = @_;
 912
 913     # Returns a hashref.
 914
 915     my ( $line, %record );
 916
 917     while ( $line = <$fh> )
 918     {
 919         if ( $line =~ /Track added by 'upload_to_ucsc' (\S+) (\S+)/) {
 920              $record{ 'date' } = $1;
 921              $record{ 'time' } = $2;
 922         } elsif ( $line =~ /^# date: (.+)/ ) {
 923             $record{ 'date' } = $1;
 924         } elsif ( $line =~ /^# time: (.+)/ ) {
 925             $record{ 'time' } = $1;
 926         } elsif ( $line =~ /^# (?:database:|Database) (.+)/ ) {
 927             $record{ 'database' } = $1;
 928         } elsif ( $line =~ /^#/ ) {
 929             # skip
 930         } elsif ( $line =~ /(\S+)\s+(.+)/ ) {
 931             $record{ $1 } = $2;
 932         }
 933
 934         last if $line =~ /^$/ and exists $record{ "track" };
 935     }
 936
 937     return undef if not exists $record{ "track" };
 938
 939     return wantarray ? %record : \%record;
 940 }
 941
 942
 943 sub ucsc_config_entry_put
 944 {
 945     # Martin A. Hansen, November 2008.
 946
 947     # Outputs a Biopiece record (a hashref)
 948     # to a filehandle or STDOUT.
 949
 950     my ( $record,    # hashref
 951          $fh_out,    # file handle
 952        ) = @_;
 953
 954     # Returns nothing.
 955
 956     my ( $date, $time );
 957
 958     $fh_out ||= \*STDOUT;
 959
 960     ( $date, $time ) = split " ", Maasha::Common::time_stamp();
 961
 962     $record->{ "date" } ||= $date;
 963     $record->{ "time" } ||= $time;
 964
 965     print $fh_out "\n# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n\n";
 966
 967     map { print $fh_out "# $_: $record->{ $_ }\n" if exists $record->{ $_ } } qw( date time database );
 968     map { print $fh_out "$_ $record->{ $_ }\n" if exists $record->{ $_ } } qw( track
 969                                                                                name
 970                                                                                description
 971                                                                                itemRgb
 972                                                                                db
 973                                                                                offset
 974                                                                                url
 975                                                                                htmlUrl
 976                                                                                shortLabel
 977                                                                                longLabel
 978                                                                                group
 979                                                                                priority
 980                                                                                useScore
 981                                                                                visibility
 982                                                                                maxHeightPixels
 983                                                                                color
 984                                                                                mafTrack
 985                                                                                type );
 986 }
 987
 988
 989 sub ucsc_update_config
 990 {
 991     # Martin A. Hansen, September 2007.
 992
 993     # Update the /home/user/ucsc/my_tracks.ra file and executes makeCustomTracks.pl
 994
 995     my ( $options,   # hashref
 996          $type,      # track type
 997        ) = @_;
 998
 999     # Returns nothing.
1000
1001     my ( $file, $entry, %new_entry, $fh_in, $fh_out, $was_found );
1002
1003     $file = $ENV{ "HOME" } . "/ucsc/my_tracks.ra";
1004
1005     Maasha::Filesys::file_copy( $file, "$file~" );   # create a backup
1006
1007     %new_entry = (
1008         database   => $options->{ 'database' },
1009         track      => $options->{ 'table' },
1010         shortLabel => $options->{ 'short_label' },
1011         longLabel  => $options->{ 'long_label' },
1012         group      => $options->{ 'group' },
1013         priority   => $options->{ 'priority' },
1014         useScore   => $options->{ 'use_score' },
1015         visibility => $options->{ 'visibility' },
1016         color      => $options->{ 'color' },
1017         type       => $type,
1018     );
1019
1020     $new_entry{ 'mafTrack' }        = "multiz17way" if $type eq "type bed 6 +";
1021     $new_entry{ 'maxHeightPixels' } = "50:50:11"    if $type eq "wig 0";
1022
1023     $fh_in  = Maasha::Filesys::file_read_open( "$file~" );
1024     $fh_out = Maasha::Filesys::file_write_open( $file );
1025
1026     while ( $entry = ucsc_config_entry_get( $fh_in ) )
1027     {
1028         if ( $entry->{ 'database' } eq $new_entry{ 'database' } and $entry->{ 'track' } eq $new_entry{ 'track' } )
1029         {
1030             ucsc_config_entry_put( \%new_entry, $fh_out );
1031
1032             $was_found = 1;
1033         }
1034         else
1035         {
1036             ucsc_config_entry_put( $entry, $fh_out );
1037         }
1038     }
1039
1040     ucsc_config_entry_put( \%new_entry, $fh_out ) if not $was_found;
1041
1042     close $fh_in;
1043     close $fh_out;
1044
1045     Maasha::Common::run( "ucscMakeTracks.pl", "-b > /dev/null 2>&1" );
1046 }
1047
1048
1049 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> fixedStep format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1050
1051
1052 sub fixedstep_get_entry
1053 {
1054     # Martin A. Hansen, December 2007.
1055
1056     # Given a file handle to a PhastCons file get the
1057     # next entry which is all the lines after a "fixedStep"
1058     # line and until the next "fixedStep" line or EOF.
1059
1060     my ( $fh,   # filehandle
1061        ) = @_;
1062
1063     # Returns a list of lines
1064
1065     my ( $entry, @lines );
1066
1067     local $/ = "\nfixedStep ";
1068
1069     $entry = <$fh>;
1070
1071     chomp $entry;
1072
1073     @lines = split "\n", $entry;
1074
1075     return if @lines == 0;
1076
1077     $lines[ 0 ] =~ s/fixedStep?\s*//;
1078
1079     return wantarray ? @lines : \@lines;
1080 }
1081
1082
1083 sub fixedstep_index_create
1084 {
1085     # Martin A. Hansen, January 2008.
1086
1087     # Indexes a concatenated fixedStep file.
1088     # The index consists of a hash with chromosomes as keys,
1089     # and a list of [ chr_beg, next_chr_beg, chr_end, index_beg, index_len ] as values.
1090
1091     my ( $path,   # path to fixedStep file
1092        ) = @_;
1093
1094     # Returns a hashref
1095
1096     my ( $fh, $pos, $index_beg, $index_len, $entry, $locator, $chr, $step, $beg, $end, $len, %index, $i );
1097
1098     $fh = Maasha::Common::read_open( $path );
1099
1100     $pos = 0;
1101
1102     while ( $entry = Maasha::UCSC::fixedstep_get_entry( $fh ) )
1103     {
1104         $locator = shift @{ $entry };
1105
1106         if ( $locator =~ /chrom=([^ ]+) start=(\d+) step=(\d+)/ )
1107         {
1108             $chr  = $1;
1109             $beg  = $2 - 1;  #  fixedStep files are 1-based
1110             $step = $3;
1111         }
1112         else
1113         {
1114             Maasha::Common::error( qq(Could not parse locator: $locator) );
1115         }
1116
1117         $pos += length( $locator ) + 11;
1118
1119         $index_beg = $pos;
1120
1121 #        map { $pos += length( $_ ) + 1 } @{ $entry };
1122
1123         $pos += 6 * scalar @{ $entry };
1124
1125         $index_len = $pos - $index_beg;
1126
1127         push @{ $index{ $chr } }, [ $beg, undef, $beg + scalar @{ $entry } - 1, $index_beg, $index_len ];
1128     }
1129
1130     close $fh;
1131
1132     foreach $chr ( keys %index )
1133     {
1134         for ( $i = 0; $i < @{ $index{ $chr } } - 1; $i++ ) {
1135             $index{ $chr }->[ $i ]->[ FS_NEXT_CHR_BEG ] = $index{ $chr }->[ $i + 1 ]->[ 0 ];
1136         }
1137
1138         $index{ $chr }->[ -1 ]->[ FS_NEXT_CHR_BEG ] = $index{ $chr }->[ -1 ]->[ FS_CHR_END ] + 1;
1139     }
1140
1141     return wantarray ? %index : \%index;
1142 }
1143
1144
1145 sub fixedstep_index_store
1146 {
1147     # Martin A. Hansen, January 2008.
1148
1149     # Writes a fixedStep index to binary file.
1150
1151     my ( $path,   # full path to file
1152          $index,  # list with index
1153        ) = @_;
1154
1155     # returns nothing
1156
1157     Maasha::Common::file_store( $path, $index );
1158 }
1159
1160
1161 sub fixedstep_index_retrieve
1162 {
1163     # Martin A. Hansen, January 2008.
1164
1165     # Retrieves a fixedStep index from binary file.
1166
1167     my ( $path,   # full path to file
1168        ) = @_;
1169
1170     # returns list
1171
1172     my $index;
1173
1174     $index = Maasha::Common::file_retrieve( $path );
1175
1176     return wantarray ? %{ $index } : $index;
1177 }
1178
1179
1180 sub fixedstep_index_lookup
1181 {
1182     # Martin A. Hansen, January 2008.
1183
1184     # Retrieve fixedStep scores from a indexed
1185     # fixedStep file given a chromosome and
1186     # begin and end positions.
1187
1188     my ( $index,     # data structure
1189          $fh,        # filehandle to datafile
1190          $chr,       # chromosome
1191          $chr_beg,   # chromosome beg
1192          $chr_end,   # chromosome end
1193          $flank,     # include flanking region - OPTIONAL
1194        ) = @_;
1195
1196     # Returns a list
1197
1198     my ( $index_beg, $index_end, $i, $c, $beg, $end, @vals, $scores );
1199
1200     $flank ||= 0;
1201
1202     $chr_beg -= $flank;
1203     $chr_end += $flank;
1204
1205     # print "chr_beg->$chr_beg   chr_end->$chr_end   flank->$flank\n";
1206
1207     if ( exists $index->{ $chr } )
1208     {
1209         $index_beg = Maasha::Matrix::interval_search( $index->{ $chr }, 0, 1, $chr_beg );
1210
1211         if ( $index_beg < 0 ) {
1212             Maasha::Common::error( qq(Index search failed - begin index position doesn't exists: $chr_beg) );
1213         }
1214
1215         if ( $chr_end < $index->{ $chr }->[ $index_beg ]->[ 1 ] )
1216         {
1217             $index_end = $index_beg;
1218         }
1219         else
1220         {
1221             $index_end = Maasha::Matrix::interval_search( $index->{ $chr }, 0, 1, $chr_end );
1222
1223             if ( $index_end < 0 ) {
1224                 Maasha::Common::error( qq(Index search failed - end index position doesn't exists: $chr_end) );
1225             }
1226         }
1227
1228         map { $scores->[ $_ ] = 0 } 0 .. $chr_end - $chr_beg;
1229
1230         if ( $index_beg == $index_end )
1231         {
1232             $beg = Maasha::Calc::max( $chr_beg, $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] );
1233             $end = Maasha::Calc::min( $chr_end, $index->{ $chr }->[ $index_end ]->[ FS_CHR_END ] );
1234
1235             if ( $beg <= $index->{ $chr }->[ $index_beg ]->[ FS_CHR_END ] and $end >= $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] )
1236             {
1237                 @vals = split "\n", Maasha::Common::file_read(
1238                     $fh,
1239                     $index->{ $chr }->[ $index_beg ]->[ FS_INDEX_BEG ] + 6 * ( $beg - $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] ),
1240                     6 * ( $end - $beg + 1 ),
1241                 );
1242             }
1243
1244             for ( $c = 0; $c < @vals; $c++ ) {
1245                 $scores->[ $c + $beg - $chr_beg ] = $vals[ $c ];
1246             }
1247         }
1248         else
1249         {
1250             $beg = Maasha::Calc::max( $chr_beg, $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] );
1251
1252 #            print Dumper( $beg, $index->{ $chr }->[ $index_beg ] );
1253 #            print Dumper( "next", $index->{ $chr }->[ $index_beg ]->[ FS_NEXT_CHR_BEG ] );
1254
1255             #      beg         next
1256             #      v           v
1257             #  |||||||||.......
1258
1259             if ( $beg <= $index->{ $chr }->[ $index_beg ]->[ FS_CHR_END ] )
1260             {
1261                 @vals = split "\n", Maasha::Common::file_read(
1262                     $fh,
1263                     $index->{ $chr }->[ $index_beg ]->[ FS_INDEX_BEG ] + 6 * ( $beg - $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] ),
1264                     6 * ( $index->{ $chr }->[ $index_beg ]->[ FS_CHR_END ] - $beg + 1 ),
1265                 );
1266
1267                 for ( $c = 0; $c < @vals; $c++ ) {
1268                     $scores->[ $c + $beg - $chr_beg ] = $vals[ $c ];
1269                 }
1270             }
1271
1272             $end = Maasha::Calc::min( $chr_end, $index->{ $chr }->[ $index_end ]->[ FS_CHR_END ] );
1273
1274             if ( $end <= $index->{ $chr }->[ $index_end ]->[ FS_CHR_END ] )
1275             {
1276                 @vals = split "\n", Maasha::Common::file_read(
1277                     $fh,
1278                     $index->{ $chr }->[ $index_end ]->[ FS_INDEX_BEG ],
1279                     6 * ( $end - $index->{ $chr }->[ $index_end ]->[ FS_CHR_BEG ] + 1 ),
1280                 );
1281
1282                 for ( $c = 0; $c < @vals; $c++ ) {
1283                     $scores->[ $c + $index->{ $chr }->[ $index_end ]->[ FS_CHR_BEG ] - $chr_beg ] = $vals[ $c ];
1284                 }
1285             }
1286
1287             for ( $i = $index_beg + 1; $i <= $index_end - 1; $i++ )
1288             {
1289                 @vals = split "\n", Maasha::Common::file_read(
1290                     $fh,
1291                     $index->{ $chr }->[ $i ]->[ FS_INDEX_BEG ],
1292                     6 * ( $index->{ $chr }->[ $i ]->[ FS_CHR_END ] - $index->{ $chr }->[ $i ]->[ FS_CHR_BEG ] + 1 ),
1293                 );
1294
1295                 for ( $c = 0; $c < @vals; $c++ ) {
1296                     $scores->[ $c + $index->{ $chr }->[ $i ]->[ FS_CHR_BEG ] - $chr_beg ] = $vals[ $c ];
1297                 }
1298             }
1299         }
1300     }
1301     else
1302     {
1303         Maasha::Common::error( qq(Chromosome "$chr" was not found in index) );
1304     }
1305
1306     return wantarray ? @{ $scores } : $scores;
1307 }
1308
1309
1310 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PhastCons format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1311
1312
1313 sub phastcons_index
1314 {
1315     # Martin A. Hansen, July 2008
1316
1317     # Create a fixedStep index for PhastCons data.
1318
1319     my ( $file,   # file to index
1320          $dir,    # dir with file
1321        ) = @_;
1322
1323     # Returns nothing.
1324
1325     my ( $index );
1326
1327     $index = fixedstep_index_create( "$dir/$file" );
1328
1329     fixedstep_index_store( "$dir/$file.index", $index );
1330 }
1331
1332
1333 sub phastcons_parse_entry
1334 {
1335     # Martin A. Hansen, December 2007.
1336
1337     # Given a PhastCons entry converts this to a
1338     # list of super blocks.
1339
1340     my ( $lines,   # list of lines
1341          $args,    # argument hash
1342        ) = @_;
1343
1344     # Returns
1345
1346     my ( $info, $chr, $beg, $step, $i, $c, $j, @blocks, @super_blocks, @entries, $super_block, $block, @lens, @begs );
1347
1348     $info = shift @{ $lines };
1349
1350     if ( $info =~ /^chrom=([^ ]+) start=(\d+) step=(\d+)$/ )
1351     {
1352         $chr  = $1;
1353         $beg  = $2;
1354         $step = $3;
1355
1356         die qq(ERROR: step size $step != 1 -> problem!\n) if $step != 1; # in an ideal world should would be fixed ...
1357     }
1358
1359     $i = 0;
1360
1361     while ( $i < @{ $lines } )
1362     {
1363         if ( $lines->[ $i ] >= $args->{ "threshold" } )
1364         {
1365             $c = $i + 1;
1366
1367             while ( $c < @{ $lines } )
1368             {
1369                 if ( $lines->[ $c ] < $args->{ "threshold" } )
1370                 {
1371                     $j = $c + 1;
1372
1373                     while ( $j < @{ $lines } and $lines->[ $j ] < $args->{ "threshold" } ) {
1374                         $j++;
1375                     }
1376
1377                     if ( $j - $c > $args->{ "gap" } )
1378                     {
1379                         if ( $c - $i >= $args->{ "min" } )
1380                         {
1381                             push @blocks, {
1382                                 CHR     => $chr,
1383                                 CHR_BEG => $beg + $i - 1,
1384                                 CHR_END => $beg + $c - 2,
1385                                 CHR_LEN => $c - $i,
1386                             };
1387                         }
1388
1389                         $i = $j;
1390
1391                         last;
1392                     }
1393
1394                     $c = $j
1395                 }
1396                 else
1397                 {
1398                     $c++;
1399                 }
1400             }
1401
1402             if ( $c - $i >= $args->{ "min" } )
1403             {
1404                 push @blocks, {
1405                     CHR     => $chr,
1406                     CHR_BEG => $beg + $i - 1,
1407                     CHR_END => $beg + $c - 2,
1408                     CHR_LEN => $c - $i,
1409                 };
1410             }
1411
1412             $i = $c;
1413         }
1414         else
1415         {
1416             $i++;
1417         }
1418     }
1419
1420     $i = 0;
1421
1422     while ( $i < @blocks )
1423     {
1424         $c = $i + 1;
1425
1426         while ( $c < @blocks and $blocks[ $c ]->{ "CHR_BEG" } - $blocks[ $c - 1 ]->{ "CHR_END" } <= $args->{ "dist" } )
1427         {
1428             $c++;
1429         }
1430
1431         push @super_blocks, [ @blocks[ $i .. $c - 1 ] ];
1432
1433         $i = $c;
1434     }
1435
1436     foreach $super_block ( @super_blocks )
1437     {
1438         foreach $block ( @{ $super_block } )
1439         {
1440             push @begs, $block->{ "CHR_BEG" } - $super_block->[ 0 ]->{ "CHR_BEG" };
1441             push @lens, $block->{ "CHR_LEN" } - 1;
1442         }
1443
1444         $lens[ -1 ]++;
1445
1446         push @entries, {
1447             CHR        => $super_block->[ 0 ]->{ "CHR" },
1448             CHR_BEG    => $super_block->[ 0 ]->{ "CHR_BEG" },
1449             CHR_END    => $super_block->[ -1 ]->{ "CHR_END" },
1450             Q_ID       => "Q_ID",
1451             SCORE      => 100,
1452             STRAND     => "+",
1453             THICK_BEG  => $super_block->[ 0 ]->{ "CHR_BEG" },
1454             THICK_END  => $super_block->[ -1 ]->{ "CHR_END" } + 1,
1455             ITEMRGB    => "0,200,100",
1456             BLOCKCOUNT => scalar @{ $super_block },
1457             BLOCKSIZES => join( ",", @lens ),
1458             Q_BEGS     => join( ",", @begs ),
1459         };
1460
1461         undef @begs;
1462         undef @lens;
1463     }
1464
1465     return wantarray ? @entries : \@entries;
1466 }
1467
1468
1469 sub phastcons_normalize
1470 {
1471     # Martin A. Hansen, January 2008.
1472
1473     # Normalizes a list of lists with PhastCons scores,
1474     # in such a way that each list contains the same number
1475     # or PhastCons scores.
1476
1477     my ( $AoA,    # AoA with PhastCons scores
1478        ) = @_;
1479
1480     # Returns AoA.
1481
1482     my ( $list, $max, $min, $mean, $diff );
1483
1484     $min = 99999999;
1485     $max = 0;
1486
1487     foreach $list ( @{ $AoA } )
1488     {
1489         $min = scalar @{ $list } if scalar @{ $list } < $min;
1490         $max = scalar @{ $list } if scalar @{ $list } > $max;
1491     }
1492
1493     $mean = int( ( $min + $max ) / 2 );
1494
1495 #    print STDERR "min->$min   max->$max   mean->$mean\n";
1496
1497     foreach $list ( @{ $AoA } )
1498     {
1499         $diff = scalar @{ $list } - $mean;
1500
1501         phastcons_list_inflate( $list, abs( $diff ) ) if $diff < 0;
1502         phastcons_list_deflate( $list, $diff )        if $diff > 0;
1503     }
1504
1505     return wantarray ? @{ $AoA } : $AoA;
1506 }
1507
1508
1509 sub phastcons_list_inflate
1510 {
1511     # Martin A. Hansen, January 2008.
1512
1513     # Inflates a list with a given number of elements
1514     # in such a way that the extra elements are introduced
1515     # evenly over the entire length of the list. The value
1516     # of the extra elements is based on a mean of the
1517     # adjacent elements.
1518
1519     my ( $list,   # list of elements
1520          $diff,   # number of elements to introduce
1521        ) = @_;
1522
1523     # Returns nothing
1524
1525     my ( $len, $space, $i, $pos );
1526
1527     $len = scalar @{ $list };
1528
1529     $space = $len / $diff;
1530
1531     for ( $i = 0; $i < $diff; $i++ )
1532     {
1533         $pos = int( ( $space / 2 ) + $i * $space );
1534
1535         splice @{ $list }, $pos, 0, ( $list->[ $pos - 1 ] + $list->[ $pos + 1 ] ) / 2;
1536         # splice @{ $list }, $pos, 0, "X";
1537     }
1538
1539     die qq(ERROR: bad inflate\n) if scalar @{ $list } != $len + $diff;
1540 }
1541
1542
1543 sub phastcons_list_deflate
1544 {
1545     # Martin A. Hansen, January 2008.
1546
1547     # Deflates a list by removing a given number of elements
1548     # evenly distributed over the entire list.
1549
1550     my ( $list,   # list of elements
1551          $diff,   # number of elements to remove
1552        ) = @_;
1553
1554     # Returns nothing
1555
1556     my ( $len, $space, $i, $pos );
1557
1558     $len = scalar @{ $list };
1559
1560     $space = ( $len - $diff ) / $diff;
1561
1562     for ( $i = 0; $i < $diff; $i++ )
1563     {
1564         $pos = int( ( $space / 2 ) + $i * $space );
1565
1566         splice @{ $list }, $pos, 1;
1567     }
1568
1569     die qq(ERROR: bad deflate\n) if scalar @{ $list } != $len - $diff;
1570 }
1571
1572
1573 sub phastcons_mean
1574 {
1575     # Martin A. Hansen, January 2008.
1576
1577     # Given a normalized PhastCons matrix in an AoA,
1578     # calculate the mean for each column and return as a list.
1579
1580     my ( $AoA,    # AoA with normalized PhastCons scores
1581        ) = @_;
1582
1583     # Returns a list
1584
1585     my ( @list );
1586
1587     $AoA = Maasha::Matrix::matrix_flip( $AoA );
1588
1589     map { push @list, Maasha::Calc::mean( $_ ) } @{ $AoA };
1590
1591     return wantarray ? @list : \@list;
1592 }
1593
1594
1595 sub phastcons_median
1596 {
1597     # Martin A. Hansen, January 2008.
1598
1599     # Given a normalized PhastCons matrix in an AoA,
1600     # calculate the median for each column and return as a list.
1601
1602     my ( $AoA,    # AoA with normalized PhastCons scores
1603        ) = @_;
1604
1605     # Returns a list
1606
1607     my ( @list );
1608
1609     $AoA = Maasha::Matrix::matrix_flip( $AoA );
1610
1611     map { push @list, Maasha::Calc::median( $_ ) } @{ $AoA };
1612
1613     return wantarray ? @list : \@list;
1614 }
1615
1616
1617 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> MULTIPLE ALIGNMENT FILES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1618
1619
1620 sub maf_extract
1621 {
1622     # Martin A. Hansen, April 2008.
1623
1624     # Executes mafFrag to extract a subalignment from a multiz track
1625     # in the UCSC genome browser database.
1626
1627     my ( $tmp_dir,    # temporary directory
1628          $database,   # genome database
1629          $table,      # table with the multiz track
1630          $chr,        # chromosome
1631          $beg,        # begin position
1632          $end,        # end position
1633          $strand,     # strand
1634        ) = @_;
1635
1636     # Returns a list of record
1637
1638     my ( $tmp_file, $align );
1639
1640     $tmp_file = "$tmp_dir/maf_extract.maf";
1641
1642     Maasha::Common::run( "mafFrag", "$database $table $chr $beg $end $strand $tmp_file" );
1643
1644     $align = maf_parse( $tmp_file );
1645
1646     unlink $tmp_file;
1647
1648     return wantarray ? @{ $align } : $align;
1649 }
1650
1651
1652 sub maf_parse
1653 {
1654     # Martin A. Hansen, April 2008.
1655
1656
1657     my ( $path,   # full path to MAF file
1658        ) = @_;
1659
1660     # Returns a list of record.
1661
1662     my ( $fh, $line, @fields, @align );
1663
1664     $fh = Maasha::Common::read_open( $path );
1665
1666     while ( $line = <$fh> )
1667     {
1668         chomp $line;
1669
1670         if ( $line =~ /^s/ )
1671         {
1672             @fields = split / /, $line;
1673
1674             push @align, {
1675                 SEQ_NAME  => $fields[ 1 ],
1676                 SEQ       => $fields[ -1 ],
1677                 ALIGN     => 1,
1678                 ALIGN_LEN => length $fields[ -1 ],
1679             }
1680         }
1681     }
1682
1683     close $fh;
1684
1685     return wantarray ? @align : \@align;
1686 }
1687
1688
1689 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> WIGGLE FORMAT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1690
1691
1692 sub fixedstep_put_entry
1693 {
1694     # Martin A. Hansen, April 2008.
1695
1696     # Outputs a block of fixedStep values.
1697     # Used for outputting wiggle data.
1698
1699     my ( $chr,      # chromosome
1700          $beg,      # start position
1701          $block,    # list of scores
1702          $fh,       # filehandle - OPTIONAL
1703          $log10,    # flag indicating that log10 scores should be used
1704        ) = @_;
1705
1706     # Returns nothing.
1707
1708     $beg += 1;   # fixedStep format is 1 based.
1709
1710     $fh ||= \*STDOUT;
1711
1712     print $fh "fixedStep chrom=$chr start=$beg step=1\n";
1713
1714     if ( $log10 ) {
1715         map { printf( $fh "%f\n", Maasha::Calc::log10( $_ + 1 ) ) } @{ $block };
1716     } else {
1717         map { printf( $fh "%d\n", ( $_ + 1 ) ) } @{ $block };
1718     }
1719 }
1720
1721
1722 sub wiggle_upload_to_ucsc
1723 {
1724     # Martin A. Hansen, May 2008.
1725
1726     # Upload a wiggle file to the UCSC database.
1727
1728     my ( $tmp_dir,    # temporary directory
1729          $wib_dir,    # wib directory
1730          $wig_file,   # file to upload,
1731          $options,    # argument hashref
1732        ) = @_;
1733
1734     # Returns nothing.
1735
1736     my ( $args );
1737
1738 #    $args = join " ", "-tmpDir=$tmp_dir", "-pathPrefix=$wib_dir", $options->{ "database" }, $options->{ 'table' }, $wig_file;
1739
1740 #    Maasha::Common::run( "hgLoadWiggle", "$args > /dev/null 2>&1" );
1741
1742     if ( $options->{ 'verbose' } ) {
1743         `cd $tmp_dir && hgLoadWiggle -tmpDir=$tmp_dir -pathPrefix=$wib_dir $options->{ 'database' } $options->{ 'table' } $wig_file`;
1744     } else {
1745         `cd $tmp_dir && hgLoadWiggle -tmpDir=$tmp_dir -pathPrefix=$wib_dir $options->{ 'database' } $options->{ 'table' } $wig_file > /dev/null 2>&1`;
1746     }
1747 }
1748
1749
1750 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> MySQL CONF <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1751
1752
1753 sub ucsc_get_user
1754 {
1755     # Martin A. Hansen, May 2008
1756
1757     # Fetches the MySQL database user name from the
1758     # .hg.conf file in the users home directory.
1759
1760     # Returns a string.
1761
1762     my ( $fh, $line, $user );
1763
1764     $fh = Maasha::Common::read_open( "$ENV{ 'HOME' }/.hg.conf" );
1765
1766     while ( $line = <$fh> )
1767     {
1768         chomp $line;
1769
1770         if ( $line =~ /^db\.user=(.+)/ )
1771         {
1772             $user = $1;
1773
1774             last;
1775         }
1776     }
1777
1778     close $fh;
1779
1780     return $user;
1781 }
1782
1783
1784 sub ucsc_get_password
1785 {
1786     # Martin A. Hansen, May 2008
1787
1788     # Fetches the MySQL database password from the
1789     # .hg.conf file in the users home directory.
1790
1791     # Returns a string.
1792
1793     my ( $fh, $line, $password );
1794
1795     $fh = Maasha::Common::read_open( "$ENV{ 'HOME' }/.hg.conf" );
1796
1797     while ( $line = <$fh> )
1798     {
1799         chomp $line;
1800
1801         if ( $line =~ /^db\.password=(.+)/ )
1802         {
1803             $password = $1;
1804
1805             last;
1806         }
1807     }
1808
1809     close $fh;
1810
1811     return $password;
1812 }
1813
1814
1815 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1816
1817
1818 __END__
1819