code_perl/Maasha/UCSC.pm

   1 package Maasha::UCSC;
   2
   3 # Copyright (C) 2007 Martin A. Hansen.
   4
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; either version 2
   8 # of the License, or (at your option) any later version.
   9
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18
  19 # http://www.gnu.org/copyleft/gpl.html
  20
  21
  22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  23
  24
  25 # Stuff for interacting with UCSC genome browser
  26
  27
  28 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  29
  30
  31 use strict;
  32 use vars qw ( @ISA @EXPORT );
  33
  34 use Data::Dumper;
  35 use Maasha::Common;
  36 use Maasha::Filesys;
  37 use Maasha::Calc;
  38 use Maasha::Matrix;
  39
  40 use constant {
  41     FS_CHR_BEG      => 0,
  42     FS_NEXT_CHR_BEG => 1,
  43     FS_CHR_END      => 2,
  44     FS_INDEX_BEG    => 3,
  45     FS_INDEX_LEN    => 4,
  46
  47     CHR          => 0,
  48     CHR_BEG      => 1,
  49     CHR_END      => 2,
  50     Q_ID         => 3,
  51     SCORE        => 4,
  52     STRAND       => 5,
  53     THICK_BEG    => 6,
  54     THICK_END    => 7,
  55     COLOR        => 8,
  56     BLOCK_COUNT  => 9,
  57     BLOCK_LENS   => 10,
  58     Q_BEGS       => 11,
  59 };
  60
  61 @ISA = qw( Exporter );
  62
  63
  64 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> BED format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  65
  66
  67 # http://genome.ucsc.edu/goldenPath/help/hgTracksHelp.html#BED
  68
  69
  70 sub bed_entry_get_array
  71 {
  72     # Martin A. Hansen, September 2008.
  73
  74     # Reads a BED entry given a filehandle.
  75
  76     # This is a new _faster_ BED entry parser that
  77     # uses arrays and not hashrefs.
  78
  79     # IMPORTANT! This function does not correct the
  80     # BED_END position that is kept the way UCSC
  81     # does.
  82
  83     my ( $fh,     # file handle
  84          $cols,   # columns to read - OPTIONAL (3,4,5,6, or 12)
  85        ) = @_;
  86
  87     # Returns a list.
  88
  89     my ( $line, @entry );
  90
  91     $line = <$fh>;
  92
  93     $line =~ tr/\n\r//d;    # some people have carriage returns in their BED files -> Grrrr
  94
  95     return if not defined $line;
  96
  97     if ( not defined $cols ) {
  98         $cols = 1 + $line =~ tr/\t//;
  99     }
 100
 101     @entry = split "\t", $line, $cols + 1;
 102
 103     pop @entry if scalar @entry > $cols;
 104
 105     return wantarray ? @entry : \@entry;
 106 }
 107
 108
 109 sub bed_get_entry
 110 {
 111     # Martin A. Hansen, December 2007.
 112
 113     # Reads a bed entry given a filehandle.
 114
 115     my ( $fh,        # file handle
 116          $columns,   # number of BED columns to read  -  OPTIONAL
 117        ) = @_;
 118
 119     # Returns hashref.
 120
 121     my ( $line, @fields, %entry );
 122
 123     $line = <$fh>;
 124
 125     $line =~ tr/\n\r//d;    # some people have carriage returns in their BED files -> Grrrr
 126
 127     return if not defined $line;
 128
 129     @fields = split "\t", $line;
 130
 131     $columns ||= scalar @fields;
 132
 133     if ( $columns == 3 )
 134     {
 135         %entry = (
 136             "CHR"      => $fields[ 0 ],
 137             "CHR_BEG"  => $fields[ 1 ],
 138             "CHR_END"  => $fields[ 2 ] - 1,
 139         );
 140     }
 141     elsif ( $columns == 4 )
 142     {
 143         %entry = (
 144             "CHR"      => $fields[ 0 ],
 145             "CHR_BEG"  => $fields[ 1 ],
 146             "CHR_END"  => $fields[ 2 ] - 1,
 147             "Q_ID"     => $fields[ 3 ],
 148         );
 149     }
 150     elsif ( $columns == 5 )
 151     {
 152         %entry = (
 153             "CHR"      => $fields[ 0 ],
 154             "CHR_BEG"  => $fields[ 1 ],
 155             "CHR_END"  => $fields[ 2 ] - 1,
 156             "Q_ID"     => $fields[ 3 ],
 157             "SCORE"    => $fields[ 4 ],
 158         );
 159     }
 160     elsif ( $columns == 6 )
 161     {
 162         %entry = (
 163             "CHR"      => $fields[ 0 ],
 164             "CHR_BEG"  => $fields[ 1 ],
 165             "CHR_END"  => $fields[ 2 ] - 1,
 166             "Q_ID"     => $fields[ 3 ],
 167             "SCORE"    => $fields[ 4 ],
 168             "STRAND"   => $fields[ 5 ],
 169         );
 170     }
 171     elsif ( $columns == 12 )
 172     {
 173         %entry = (
 174             "CHR"         => $fields[ 0 ],
 175             "CHR_BEG"     => $fields[ 1 ],
 176             "CHR_END"     => $fields[ 2 ] - 1,
 177             "Q_ID"        => $fields[ 3 ],
 178             "SCORE"       => $fields[ 4 ],
 179             "STRAND"      => $fields[ 5 ],
 180             "THICK_BEG"   => $fields[ 6 ],
 181             "THICK_END"   => $fields[ 7 ] - 1,
 182             "COLOR"       => $fields[ 8 ],
 183             "BLOCK_COUNT" => $fields[ 9 ],
 184             "BLOCK_LENS"  => $fields[ 10 ],
 185             "Q_BEGS"      => $fields[ 11 ],
 186         );
 187     }
 188     else
 189     {
 190         Maasha::Common::error( qq(Bad BED format in line->$line<-) );
 191     }
 192
 193     $entry{ "REC_TYPE" } = "BED";
 194     $entry{ "BED_LEN" }  = $entry{ "CHR_END" } - $entry{ "CHR_BEG" } + 1;
 195     $entry{ "BED_COLS" } = $columns;
 196
 197     return wantarray ? %entry : \%entry;
 198 }
 199
 200
 201 sub bed_get_entries
 202 {
 203     # Martin A. Hansen, January 2008.
 204
 205     # Given a path to a BED file, read in all entries
 206     # and return.
 207
 208     my ( $path,     # full path to BED file
 209          $columns,  # number of columns in BED file - OPTIONAL (but is faster)
 210        ) = @_;
 211
 212     # Returns a list.
 213
 214     my ( $fh, $entry, @list );
 215
 216     $fh = Maasha::Common::read_open( $path );
 217
 218     while ( $entry = bed_get_entry( $fh ) ) {
 219         push @list, $entry;
 220     }
 221
 222     close $fh;
 223
 224     return wantarray ? @list : \@list;
 225 }
 226
 227
 228 sub bed_entry_put_array
 229 {
 230     # Martin A. Hansen, Septermber 2008.
 231
 232     # Writes a BED entry array to file.
 233
 234     # IMPORTANT! This function does not correct the
 235     # BED_END position that is assumed to be in the
 236     # UCSC positions scheme.
 237
 238     my ( $record,   # list
 239          $fh,       # file handle                   - OPTIONAL
 240          $cols,     # number of columns in BED file - OPTIONAL
 241        ) = @_;
 242
 243     # Returns nothing.
 244
 245     $fh = \*STDOUT if not defined $fh;
 246
 247     if ( defined $cols ) {
 248         print $fh join( "\t", @{ $record }[ 0 .. $cols - 1 ] ), "\n";
 249     } else {
 250         print $fh join( "\t", @{ $record } ), "\n";
 251     }
 252 }
 253
 254
 255 sub bed_put_entry
 256 {
 257     # Martin A. Hansen, Septermber 2007.
 258
 259     # Writes a BED entry to file.
 260
 261     # NB, this could really be more robust!?
 262
 263     my ( $record,       # hashref
 264          $fh,           # file handle                   - OPTIONAL
 265          $columns,      # number of columns in BED file - OPTIONAL (but is faster)
 266        ) = @_;
 267
 268     # Returns nothing.
 269
 270     my ( @fields );
 271
 272     $columns ||= 12;   # max number of columns possible
 273
 274     if ( $columns == 3 )
 275     {
 276         push @fields, $record->{ "CHR" };
 277         push @fields, $record->{ "CHR_BEG" };
 278         push @fields, $record->{ "CHR_END" } + 1;
 279     }
 280     elsif ( $columns == 4 )
 281     {
 282         $record->{ "Q_ID" }  =~ s/\s+/_/g;
 283
 284         push @fields, $record->{ "CHR" };
 285         push @fields, $record->{ "CHR_BEG" };
 286         push @fields, $record->{ "CHR_END" } + 1;
 287         push @fields, $record->{ "Q_ID" };
 288     }
 289     elsif ( $columns == 5 )
 290     {
 291         $record->{ "Q_ID" }  =~ s/\s+/_/g;
 292         $record->{ "SCORE" } =~ s/\.\d*//;
 293
 294         push @fields, $record->{ "CHR" };
 295         push @fields, $record->{ "CHR_BEG" };
 296         push @fields, $record->{ "CHR_END" } + 1;
 297         push @fields, $record->{ "Q_ID" };
 298         push @fields, $record->{ "SCORE" };
 299     }
 300     elsif ( $columns == 6 )
 301     {
 302         $record->{ "Q_ID" }  =~ s/\s+/_/g;
 303         $record->{ "SCORE" } =~ s/\.\d*//;
 304
 305         push @fields, $record->{ "CHR" };
 306         push @fields, $record->{ "CHR_BEG" };
 307         push @fields, $record->{ "CHR_END" } + 1;
 308         push @fields, $record->{ "Q_ID" };
 309         push @fields, $record->{ "SCORE" };
 310         push @fields, $record->{ "STRAND" };
 311     }
 312     else
 313     {
 314         $record->{ "Q_ID" }  =~ s/\s+/_/g;
 315         $record->{ "SCORE" } =~ s/\.\d*//;
 316
 317         push @fields, $record->{ "CHR" };
 318         push @fields, $record->{ "CHR_BEG" };
 319         push @fields, $record->{ "CHR_END" } + 1;
 320         push @fields, $record->{ "Q_ID" };
 321         push @fields, $record->{ "SCORE" };
 322         push @fields, $record->{ "STRAND" };
 323         push @fields, $record->{ "THICK_BEG" }     if defined $record->{ "THICK_BEG" };
 324         push @fields, $record->{ "THICK_END" } + 1 if defined $record->{ "THICK_END" };
 325         push @fields, $record->{ "COLOR" }         if defined $record->{ "COLOR" };
 326         push @fields, $record->{ "BLOCK_COUNT" }   if defined $record->{ "BLOCK_COUNT" };
 327         push @fields, $record->{ "BLOCK_LENS" }    if defined $record->{ "BLOCK_LENS" };
 328         push @fields, $record->{ "Q_BEGS" }        if defined $record->{ "Q_BEGS" };
 329     }
 330
 331     if ( $fh ) {
 332         print $fh join( "\t", @fields ), "\n";
 333     } else {
 334         print join( "\t", @fields ), "\n";
 335     }
 336 }
 337
 338
 339 sub bed_put_entries
 340 {
 341     # Martin A. Hansen, January 2008.
 342
 343     # Write a list of BED entries.
 344
 345     my ( $entries,   # list of entries,
 346          $fh,        # file handle - OPTIONAL
 347        ) = @_;
 348
 349     # Returns nothing.
 350
 351     map { bed_put_entry( $_, $fh ) } @{ $entries };
 352 }
 353
 354
 355 sub bed_analyze
 356 {
 357     # Martin A. Hansen, March 2008.
 358
 359     # Given a bed record, analysis this to give information
 360     # about intron/exon sizes.
 361
 362     my ( $entry,   # BED entry
 363        ) = @_;
 364
 365     # Returns hashref.
 366
 367     my ( $i, @begs, @lens, $exon_max, $exon_min, $exon_len, $exon_tot, $intron_max, $intron_min, $intron_len, $intron_tot );
 368
 369     $exon_max   = 0;
 370     $exon_min   = 9999999999;
 371     $intron_max = 0;
 372     $intron_min = 9999999999;
 373
 374     $entry->{ "EXONS" }   = $entry->{ "BLOCK_COUNT" };
 375
 376     @begs = split /,/, $entry->{ "Q_BEGS" };
 377     @lens = split /,/, $entry->{ "BLOCK_LENS" };
 378
 379     for ( $i = 0; $i < $entry->{ "BLOCK_COUNT" }; $i++ )
 380     {
 381         $exon_len = @lens[ $i ];
 382
 383         $entry->{ "EXON_LEN_$i" } = $exon_len;
 384
 385         $exon_max = $exon_len if $exon_len > $exon_max;
 386         $exon_min = $exon_len if $exon_len < $exon_min;
 387
 388         $exon_tot += $exon_len;
 389     }
 390
 391     $entry->{ "EXON_LEN_-1" }   = $exon_len;
 392     $entry->{ "EXON_MAX_LEN" }  = $exon_max;
 393     $entry->{ "EXON_MIN_LEN" }  = $exon_min;
 394     $entry->{ "EXON_MEAN_LEN" } = int( $exon_tot / $entry->{ "EXONS" } );
 395
 396     $entry->{ "INTRONS" } = $entry->{ "BLOCK_COUNT" } - 1;
 397     $entry->{ "INTRONS" } = 0 if $entry->{ "INTRONS" } < 0;
 398
 399     if ( $entry->{ "INTRONS" } )
 400     {
 401         for ( $i = 1; $i < $entry->{ "BLOCK_COUNT" }; $i++ )
 402         {
 403             $intron_len = @begs[ $i ] - ( @begs[ $i - 1 ] + @lens[ $i - 1 ] );
 404
 405             $entry->{ "INTRON_LEN_" . ( $i - 1 ) } = $intron_len;
 406
 407             $intron_max = $intron_len if $intron_len > $intron_max;
 408             $intron_min = $intron_len if $intron_len < $intron_min;
 409
 410             $intron_tot += $intron_len;
 411         }
 412
 413         $entry->{ "INTRON_LEN_-1" }   = $intron_len;
 414         $entry->{ "INTRON_MAX_LEN" }  = $intron_max;
 415         $entry->{ "INTRON_MIN_LEN" }  = $intron_min;
 416         $entry->{ "INTRON_MEAN_LEN" } = int( $intron_tot / $entry->{ "INTRONS" } );
 417     }
 418
 419     return wantarray ? %{ $entry } : $entry;
 420 }
 421
 422
 423 sub bed_sort
 424 {
 425     # Martin A. Hansen, Septermber 2008
 426
 427     # Sorts a BED file using the c program
 428     # "bed_sort" specifing a sort mode:
 429
 430     # 1: chr AND chr_beg.
 431     # 2: chr AND strand AND chr_beg.
 432     # 3: chr_beg.
 433     # 4: strand AND chr_beg.
 434
 435     my ( $bed_file,    # BED file to sort
 436          $sort_mode,   # See above.
 437          $cols,        # Number of columns in BED file
 438        ) = @_;
 439
 440     &Maasha::Common::run( "bed_sort", "--sort $sort_mode --cols $cols $bed_file" );
 441 }
 442
 443
 444 sub bed_split_to_files
 445 {
 446     # Martin A. Hansen, Septermber 2008
 447
 448     # Given a list of BED files, split these
 449     # into temporary files based on the chromosome
 450     # name. Returns a list of the temporary files.
 451
 452     my ( $bed_files,   # list of BED files to split
 453          $cols,        # number of columns
 454          $tmp_dir,     # temporary directory
 455        ) = @_;
 456
 457     # Returns a list.
 458
 459     my ( $bed_file, $fh_in, $entry, $key, %fh_hash, @tmp_files );
 460
 461     foreach $bed_file ( @{ $bed_files } )
 462     {
 463         $fh_in = Maasha::Common::read_open( $bed_file );
 464
 465         while ( $entry = bed_entry_get_array( $fh_in, $cols ) )
 466         {
 467             $key = $entry->[ CHR ];
 468
 469             $fh_hash{ $key } = Maasha::Common::write_open( "$tmp_dir/$key.temp" ) if not exists $fh_hash{ $key };
 470
 471             bed_entry_put_array( $entry, $fh_hash{ $key } );
 472         }
 473
 474         close $fh_in;
 475     }
 476
 477     foreach $key ( sort keys %fh_hash )
 478     {
 479         push @tmp_files, "$tmp_dir/$key.temp";
 480
 481         close $fh_hash{ $key };
 482     }
 483
 484     return wantarray ? @tmp_files : \@tmp_files;
 485 }
 486
 487
 488 sub bed_merge_entries
 489 {
 490     # Martin A. Hansen, February 2008.
 491
 492     # Merge a list of given BED entries in one big entry.
 493
 494     my ( $entries,     # list of BED entries to be merged
 495        ) = @_;
 496
 497     # Returns hash.
 498
 499     my ( $i, @q_ids, @q_begs, @blocksizes, @new_q_begs, @new_blocksizes, %new_entry );
 500
 501     @{ $entries } = sort { $a->{ "CHR_BEG" } <=> $b->{ "CHR_BEG" } } @{ $entries };
 502
 503     for ( $i = 0; $i < @{ $entries }; $i++ )
 504     {
 505         Maasha::Common::error( qq(Attempted merge of BED entries from different chromosomes) ) if $entries->[ 0 ]->{ "CHR" }    ne $entries->[ $i ]->{ "CHR" };
 506         Maasha::Common::error( qq(Attempted merge of BED entries from different strands) )     if $entries->[ 0 ]->{ "STRAND" } ne $entries->[ $i ]->{ "STRAND" };
 507
 508         push @q_ids, $entries->[ $i ]->{ "Q_ID" } || sprintf( "ID%06d", $i );
 509
 510         if ( exists $entries->[ $i ]->{ "Q_BEGS" } )
 511         {
 512             @q_begs     = split ",", $entries->[ $i ]->{ "Q_BEGS" };
 513             @blocksizes = split ",", $entries->[ $i ]->{ "BLOCK_LENS" };
 514         }
 515         else
 516         {
 517             @q_begs     = 0;
 518             @blocksizes = $entries->[ $i ]->{ "CHR_END" } - $entries->[ $i ]->{ "CHR_BEG" } + 1;
 519         }
 520
 521         map { $_ += $entries->[ $i ]->{ "CHR_BEG" } } @q_begs;
 522
 523         push @new_q_begs, @q_begs;
 524         push @new_blocksizes, @blocksizes;
 525     }
 526
 527     map { $_ -= $entries->[ 0 ]->{ "CHR_BEG" } } @new_q_begs;
 528
 529     %new_entry = (
 530         CHR          => $entries->[ 0 ]->{ "CHR" },
 531         CHR_BEG      => $entries->[ 0 ]->{ "CHR_BEG" },
 532         CHR_END      => $entries->[ -1 ]->{ "CHR_END" },
 533         REC_TYPE     => "BED",
 534         BED_LEN      => $entries->[ -1 ]->{ "CHR_END" } - $entries->[ 0 ]->{ "CHR_BEG" } + 1,
 535         BED_COLS     => 12,
 536         Q_ID         => join( ":", @q_ids ),
 537         SCORE        => 999,
 538         STRAND       => $entries->[ 0 ]->{ "STRAND" }     || "+",
 539         THICK_BEG    => $entries->[ 0 ]->{ "THICK_BEG" }  || $entries->[ 0 ]->{ "CHR_BEG" },
 540         THICK_END    => $entries->[ -1 ]->{ "THICK_END" } || $entries->[ -1 ]->{ "CHR_END" },
 541         COLOR        => 0,
 542         BLOCK_COUNT  => scalar @new_q_begs,
 543         BLOCK_LENS   => join( ",", @new_blocksizes ),
 544         Q_BEGS       => join( ",", @new_q_begs ),
 545     );
 546
 547     return wantarray ? %new_entry : \%new_entry;
 548 }
 549
 550
 551 sub bed_split_entry
 552 {
 553     # Martin A. Hansen, February 2008.
 554
 555     # Splits a given BED entry into a list of blocks,
 556     # which are returned. A list of 6 column BED entry is returned.
 557
 558     my ( $entry,    # BED entry hashref
 559        ) = @_;
 560
 561     # Returns a list.
 562
 563     my ( @q_begs, @blocksizes, $block, @blocks, $i );
 564
 565     if ( exists $entry->{ "BLOCK_COUNT" } )
 566     {
 567         @q_begs     = split ",", $entry->{ "Q_BEGS" };
 568         @blocksizes = split ",", $entry->{ "BLOCK_LENS" };
 569
 570         for ( $i = 0; $i < @q_begs; $i++ )
 571         {
 572             undef $block;
 573
 574             $block->{ "CHR" }      = $entry->{ "CHR" };
 575             $block->{ "CHR_BEG" }  = $entry->{ "CHR_BEG" } + $q_begs[ $i ];
 576             $block->{ "CHR_END" }  = $entry->{ "CHR_BEG" } + $q_begs[ $i ] + $blocksizes[ $i ] - 1;
 577             $block->{ "Q_ID" }     = $entry->{ "Q_ID" } . sprintf( "_%03d", $i );
 578             $block->{ "SCORE" }    = $entry->{ "SCORE" };
 579             $block->{ "STRAND" }   = $entry->{ "STRAND" };
 580             $block->{ "BED_LEN" }  = $block->{ "CHR_END" } - $block->{ "CHR_BEG" } + 1,
 581             $block->{ "BED_COLS" } = 6;
 582             $block->{ "REC_TYPE" } = "BED";
 583
 584             push @blocks, $block;
 585         }
 586     }
 587     else
 588     {
 589         @blocks = @{ $entry };
 590     }
 591
 592     return wantarray ? @blocks : \@blocks;
 593 }
 594
 595
 596
 597 sub bed_overlap
 598 {
 599     # Martin A. Hansen, February 2008.
 600
 601     # Checks if two BED entries overlap and
 602     # return 1 if so - else 0;
 603
 604     my ( $entry1,      # hashref
 605          $entry2,      # hashref
 606          $no_strand,   # don't check strand flag - OPTIONAL
 607        ) = @_;
 608
 609     # Return bolean.
 610
 611     return 0 if $entry1->{ "CHR" }    ne $entry2->{ "CHR" };
 612     return 0 if $entry1->{ "STRAND" } ne $entry2->{ "STRAND" };
 613
 614     if ( $entry1->{ "CHR_END" } < $entry2->{ "CHR_BEG" } or $entry1->{ "CHR_BEG" } > $entry2->{ "CHR_END" } ) {
 615         return 0;
 616     } else {
 617         return 1;
 618     }
 619 }
 620
 621
 622 sub bed_upload_to_ucsc
 623 {
 624     # Martin A. Hansen, September 2007.
 625
 626     # Upload a BED file to the UCSC database.
 627
 628     my ( $tmp_dir,   # temporary directory
 629          $file,      # file to upload,
 630          $options,   # argument hashref
 631          $append,    # flag indicating table should be appended
 632        ) = @_;
 633
 634     # Returns nothing.
 635
 636     my ( $args, $table, $sql_file, $fh_out, $fh_in );
 637
 638     if ( $append ) {
 639         $args = join " ", $options->{ "database" }, $options->{ "table" }, "-tmpDir=$tmp_dir", "-oldTable", $file;
 640     } else {
 641         $args = join " ", $options->{ "database" }, $options->{ "table" }, "-tmpDir=$tmp_dir", $file;
 642     }
 643
 644     if ( $options->{ "table" } =~ /rnaSecStr/ )
 645     {
 646         $table = $options->{ "table" };
 647
 648         print qq(uploading secondary structure table:"$table"\n) if $options->{ "verbose" };
 649
 650         $sql_file = "$tmp_dir/upload_RNA_SS.sql";
 651
 652         $fh_out   = Maasha::Common::write_open( $sql_file );
 653
 654         print $fh_out qq(
 655 CREATE TABLE $table (
 656     bin smallint not null,              # Bin number for browser speedup
 657     chrom varchar(255) not null,        # Chromosome or FPC contig
 658     chromStart int unsigned not null,   # Start position in chromosome
 659     chromEnd int unsigned not null,     # End position in chromosome
 660     name varchar(255) not null,         # Name of item
 661     score int unsigned not null,        # Score from 0-1000
 662     strand char(1) not null,            # + or -
 663     size int unsigned not null,         # Size of element.
 664     secStr longblob not null,           # Parentheses and '.'s which define the secondary structure
 665     conf longblob not null,             # Confidence of secondary-structure annotation per position (0.0-1.0).
 666     #Indices
 667     INDEX(name(16)),
 668     INDEX(chrom(8), bin),
 669     INDEX(chrom(8), chromStart)
 670 );
 671         );
 672
 673         close $fh_out;
 674
 675         Maasha::Common::run( "hgLoadBed", "-notItemRgb -sqlTable=$sql_file $options->{ 'database' } $options->{ 'table' } -tmpDir=$tmp_dir $file > /dev/null 2>&1" );
 676
 677         unlink $sql_file;
 678     }
 679     else
 680     {
 681         Maasha::Common::run( "hgLoadBed", "$args > /dev/null 2>&1" );
 682     }
 683 }
 684
 685
 686 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PSL format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 687
 688
 689 sub psl_get_entry
 690 {
 691     # Martin A. Hansen, August 2008.
 692
 693     # Reads PSL next entry from a PSL file and returns a record.
 694
 695     my ( $fh,   # file handle of PSL filefull path to PSL file
 696        ) = @_;
 697
 698     # Returns hashref.
 699
 700     my ( $line, @fields, %record );
 701
 702     while ( $line = <$fh> )
 703     {
 704         chomp $line;
 705
 706         @fields = split "\t", $line;
 707
 708         if ( scalar @fields == 21 )
 709         {
 710             %record = (
 711                 REC_TYPE    => "PSL",
 712                 MATCHES     => $fields[ 0 ],
 713                 MISMATCHES  => $fields[ 1 ],
 714                 REPMATCHES  => $fields[ 2 ],
 715                 NCOUNT      => $fields[ 3 ],
 716                 QNUMINSERT  => $fields[ 4 ],
 717                 QBASEINSERT => $fields[ 5 ],
 718                 SNUMINSERT  => $fields[ 6 ],
 719                 SBASEINSERT => $fields[ 7 ],
 720                 STRAND      => $fields[ 8 ],
 721                 Q_ID        => $fields[ 9 ],
 722                 Q_LEN       => $fields[ 10 ],
 723                 Q_BEG       => $fields[ 11 ],
 724                 Q_END       => $fields[ 12 ] - 1,
 725                 S_ID        => $fields[ 13 ],
 726                 S_LEN       => $fields[ 14 ],
 727                 S_BEG       => $fields[ 15 ],
 728                 S_END       => $fields[ 16 ] - 1,
 729                 BLOCK_COUNT => $fields[ 17 ],
 730                 BLOCK_LENS  => $fields[ 18 ],
 731                 Q_BEGS      => $fields[ 19 ],
 732                 S_BEGS      => $fields[ 20 ],
 733             );
 734
 735             $record{ "SCORE" } = $record{ "MATCHES" } + int( $record{ "REPMATCHES" } / 2 ) - $record{ "MISMATCHES" } - $record{ "QNUMINSERT" } - $record{ "SNUMINSERT" };
 736             $record{ "Q_SPAN" } = $fields[ 12 ] - $fields[ 11 ];
 737             $record{ "S_SPAN" } = $fields[ 16 ] - $fields[ 15 ];
 738
 739             return wantarray ? %record : \%record;
 740         }
 741     }
 742
 743     return undef;
 744 }
 745
 746
 747 sub psl_get_entries
 748 {
 749     # Martin A. Hansen, February 2008.
 750
 751     # Reads PSL entries and returns a list of records.
 752
 753     my ( $path,   # full path to PSL file
 754        ) = @_;
 755
 756     # Returns hashref.
 757
 758     my ( $fh, @lines, @fields, $i, %record, @records );
 759
 760     $fh = Maasha::Common::read_open( $path );
 761
 762     @lines = <$fh>;
 763
 764     close $fh;
 765
 766     chomp @lines;
 767
 768     for ( $i = 5; $i < @lines; $i++ )
 769     {
 770         @fields = split "\t", $lines[ $i ];
 771
 772         Maasha::Common::error( qq(Bad PSL format in file "$path") ) if not @fields == 21;
 773
 774         undef %record;
 775
 776         %record = (
 777             REC_TYPE    => "PSL",
 778             MATCHES     => $fields[ 0 ],
 779             MISMATCHES  => $fields[ 1 ],
 780             REPMATCHES  => $fields[ 2 ],
 781             NCOUNT      => $fields[ 3 ],
 782             QNUMINSERT  => $fields[ 4 ],
 783             QBASEINSERT => $fields[ 5 ],
 784             SNUMINSERT  => $fields[ 6 ],
 785             SBASEINSERT => $fields[ 7 ],
 786             STRAND      => $fields[ 8 ],
 787             Q_ID        => $fields[ 9 ],
 788             Q_LEN       => $fields[ 10 ],
 789             Q_BEG       => $fields[ 11 ],
 790             Q_END       => $fields[ 12 ] - 1,
 791             S_ID        => $fields[ 13 ],
 792             S_LEN       => $fields[ 14 ],
 793             S_BEG       => $fields[ 15 ],
 794             S_END       => $fields[ 16 ] - 1,
 795             BLOCK_COUNT => $fields[ 17 ],
 796             BLOCK_LENS  => $fields[ 18 ],
 797             Q_BEGS      => $fields[ 19 ],
 798             S_BEGS      => $fields[ 20 ],
 799         );
 800
 801         $record{ "SCORE" } = $record{ "MATCHES" } + int( $record{ "REPMATCHES" } / 2 ) - $record{ "MISMATCHES" } - $record{ "QNUMINSERT" } - $record{ "SNUMINSERT" };
 802
 803         push @records, { %record };
 804     }
 805
 806     return wantarray ? @records : \@records;
 807 }
 808
 809
 810 sub psl_put_header
 811 {
 812     # Martin A. Hansen, September 2007.
 813
 814     # Write a PSL header to file.
 815
 816     my ( $fh,  # file handle  - OPTIONAL
 817        ) = @_;
 818
 819     # Returns nothing.
 820
 821     $fh = \*STDOUT if not $fh;
 822
 823     print $fh qq(psLayout version 3
 824 match   mis-    rep.    N's     Q gap   Q gap   T gap   T gap   strand  Q               Q       Q       Q       T               T       T       T       block   blockSizes      qStart        match   match           count   bases   count   bases           name            size    start   end     name            size    start   end     count
 825 ---------------------------------------------------------------------------------------------------------------------------------------------------------------
 826 );
 827 }
 828
 829
 830 sub psl_put_entry
 831 {
 832     # Martin A. Hansen, September 2007.
 833
 834     # Write a PSL entry to file.
 835
 836     my ( $record,       # hashref
 837          $fh,           # file handle  -  OPTIONAL
 838        ) = @_;
 839
 840     # Returns nothing.
 841
 842     $fh = \*STDOUT if not $fh;
 843
 844     my @output;
 845
 846     push @output, $record->{ "MATCHES" };
 847     push @output, $record->{ "MISMATCHES" };
 848     push @output, $record->{ "REPMATCHES" };
 849     push @output, $record->{ "NCOUNT" };
 850     push @output, $record->{ "QNUMINSERT" };
 851     push @output, $record->{ "QBASEINSERT" };
 852     push @output, $record->{ "SNUMINSERT" };
 853     push @output, $record->{ "SBASEINSERT" };
 854     push @output, $record->{ "STRAND" };
 855     push @output, $record->{ "Q_ID" };
 856     push @output, $record->{ "Q_LEN" };
 857     push @output, $record->{ "Q_BEG" };
 858     push @output, $record->{ "Q_END" } + 1;
 859     push @output, $record->{ "S_ID" };
 860     push @output, $record->{ "S_LEN" };
 861     push @output, $record->{ "S_BEG" };
 862     push @output, $record->{ "S_END" } + 1;
 863     push @output, $record->{ "BLOCK_COUNT" };
 864     push @output, $record->{ "BLOCK_LENS" };
 865     push @output, $record->{ "Q_BEGS" };
 866     push @output, $record->{ "S_BEGS" };
 867
 868     print $fh join( "\t", @output ), "\n";
 869 }
 870
 871
 872 sub psl_upload_to_ucsc
 873 {
 874     # Martin A. Hansen, September 2007.
 875
 876     # Upload a PSL file to the UCSC database.
 877
 878     my ( $file,      # file to upload,
 879          $options,   # argument hashref
 880          $append,    # flag indicating table should be appended
 881        ) = @_;
 882
 883     # Returns nothing.
 884
 885     my ( $args );
 886
 887     if ( $append ) {
 888         $args = join " ", $options->{ "database" }, "-table=$options->{ 'table' }", "-clientLoad", "-append", $file;
 889     } else {
 890         $args = join " ", $options->{ "database" }, "-table=$options->{ 'table' }", "-clientLoad", $file;
 891     }
 892
 893     Maasha::Common::run( "hgLoadPsl", "$args > /dev/null 2>&1" );
 894 }
 895
 896
 897 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> TRACK FILE <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 898
 899
 900 sub ucsc_config_entry_get
 901 {
 902     # Martin A. Hansen, November 2008.
 903
 904     # Given a filehandle to a UCSC Genome Browser
 905     # config file (.ra) get the next entry and
 906     # return as a hash. Entries are separated by blank
 907     # lines. # lines are skipped unless it is the lines:
 908     # # Track added ...
 909     # # Database ...
 910
 911     my ( $fh,   # file hanlde
 912        ) = @_;
 913
 914     # Returns a hashref.
 915
 916     my ( $line, %record );
 917
 918     while ( $line = <$fh> )
 919     {
 920         $line =~ tr/\n\r//d;
 921
 922         if ( $line =~ /Track added by 'upload_to_ucsc' (\S+) (\S+)/) {
 923              $record{ 'date' } = $1;
 924              $record{ 'time' } = $2;
 925         } elsif ( $line =~ /^# date: (.+)/ ) {
 926             $record{ 'date' } = $1;
 927         } elsif ( $line =~ /^# time: (.+)/ ) {
 928             $record{ 'time' } = $1;
 929         } elsif ( $line =~ /^# (?:database:|Database) (.+)/ ) {
 930             $record{ 'database' } = $1;
 931         } elsif ( $line =~ /^#/ ) {
 932             # skip
 933         } elsif ( $line =~ /(\S+)\s+(.+)/ ) {
 934             $record{ $1 } = $2;
 935         }
 936
 937         last if $line =~ /^$/ and exists $record{ "track" };
 938     }
 939
 940     return undef if not exists $record{ "track" };
 941
 942     return wantarray ? %record : \%record;
 943 }
 944
 945
 946 sub ucsc_config_entry_put
 947 {
 948     # Martin A. Hansen, November 2008.
 949
 950     # Outputs a Biopiece record (a hashref)
 951     # to a filehandle or STDOUT.
 952
 953     my ( $record,    # hashref
 954          $fh_out,    # file handle
 955        ) = @_;
 956
 957     # Returns nothing.
 958
 959     my ( $date, $time );
 960
 961     $fh_out ||= \*STDOUT;
 962
 963     ( $date, $time ) = split " ", Maasha::Common::time_stamp();
 964
 965     $record->{ "date" } ||= $date;
 966     $record->{ "time" } ||= $time;
 967
 968     print $fh_out "\n# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n\n";
 969
 970     map { print $fh_out "# $_: $record->{ $_ }\n" if exists $record->{ $_ } } qw( date time database );
 971     map { print $fh_out "$_ $record->{ $_ }\n" if exists $record->{ $_ } } qw( track
 972                                                                                name
 973                                                                                description
 974                                                                                itemRgb
 975                                                                                db
 976                                                                                offset
 977                                                                                url
 978                                                                                htmlUrl
 979                                                                                shortLabel
 980                                                                                longLabel
 981                                                                                group
 982                                                                                priority
 983                                                                                useScore
 984                                                                                visibility
 985                                                                                maxHeightPixels
 986                                                                                color
 987                                                                                mafTrack
 988                                                                                type );
 989 }
 990
 991
 992 sub ucsc_update_config
 993 {
 994     # Martin A. Hansen, September 2007.
 995
 996     # Update the /home/user/ucsc/my_tracks.ra file and executes makeCustomTracks.pl
 997
 998     my ( $options,   # hashref
 999          $type,      # track type
1000        ) = @_;
1001
1002     # Returns nothing.
1003
1004     my ( $file, $entry, %new_entry, $fh_in, $fh_out, $was_found );
1005
1006     $file = $ENV{ "HOME" } . "/ucsc/my_tracks.ra";
1007
1008     Maasha::Filesys::file_copy( $file, "$file~" );   # create a backup
1009
1010     %new_entry = (
1011         database   => $options->{ 'database' },
1012         track      => $options->{ 'table' },
1013         shortLabel => $options->{ 'short_label' },
1014         longLabel  => $options->{ 'long_label' },
1015         group      => $options->{ 'group' },
1016         priority   => $options->{ 'priority' },
1017         visibility => $options->{ 'visibility' },
1018         color      => $options->{ 'color' },
1019         type       => $type,
1020     );
1021
1022     $new_entry{ 'useScore' }        = 1             if $options->{ 'use_score' };
1023     $new_entry{ 'mafTrack' }        = "multiz17way" if $type eq "type bed 6 +";
1024     $new_entry{ 'maxHeightPixels' } = "50:50:11"    if $type eq "wig 0";
1025
1026     $fh_in  = Maasha::Filesys::file_read_open( "$file~" );
1027     $fh_out = Maasha::Filesys::file_write_open( $file );
1028
1029     while ( $entry = ucsc_config_entry_get( $fh_in ) )
1030     {
1031         if ( $entry->{ 'database' } eq $new_entry{ 'database' } and $entry->{ 'track' } eq $new_entry{ 'track' } )
1032         {
1033             ucsc_config_entry_put( \%new_entry, $fh_out );
1034
1035             $was_found = 1;
1036         }
1037         else
1038         {
1039             ucsc_config_entry_put( $entry, $fh_out );
1040         }
1041     }
1042
1043     ucsc_config_entry_put( \%new_entry, $fh_out ) if not $was_found;
1044
1045     close $fh_in;
1046     close $fh_out;
1047
1048     Maasha::Common::run( "ucscMakeTracks.pl", "-b > /dev/null 2>&1" );
1049 }
1050
1051
1052 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> fixedStep format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1053
1054
1055 sub fixedstep_get_entry
1056 {
1057     # Martin A. Hansen, December 2007.
1058
1059     # Given a file handle to a PhastCons file get the
1060     # next entry which is all the lines after a "fixedStep"
1061     # line and until the next "fixedStep" line or EOF.
1062
1063     my ( $fh,   # filehandle
1064        ) = @_;
1065
1066     # Returns a list of lines
1067
1068     my ( $entry, @lines );
1069
1070     local $/ = "\nfixedStep ";
1071
1072     $entry = <$fh>;
1073
1074     chomp $entry;
1075
1076     @lines = split "\n", $entry;
1077
1078     return if @lines == 0;
1079
1080     $lines[ 0 ] =~ s/fixedStep?\s*//;
1081
1082     return wantarray ? @lines : \@lines;
1083 }
1084
1085
1086 sub fixedstep_index_create
1087 {
1088     # Martin A. Hansen, January 2008.
1089
1090     # Indexes a concatenated fixedStep file.
1091     # The index consists of a hash with chromosomes as keys,
1092     # and a list of [ chr_beg, next_chr_beg, chr_end, index_beg, index_len ] as values.
1093
1094     my ( $path,   # path to fixedStep file
1095        ) = @_;
1096
1097     # Returns a hashref
1098
1099     my ( $fh, $pos, $index_beg, $index_len, $entry, $locator, $chr, $step, $beg, $end, $len, %index, $i );
1100
1101     $fh = Maasha::Common::read_open( $path );
1102
1103     $pos = 0;
1104
1105     while ( $entry = Maasha::UCSC::fixedstep_get_entry( $fh ) )
1106     {
1107         $locator = shift @{ $entry };
1108
1109         if ( $locator =~ /chrom=([^ ]+) start=(\d+) step=(\d+)/ )
1110         {
1111             $chr  = $1;
1112             $beg  = $2 - 1;  #  fixedStep files are 1-based
1113             $step = $3;
1114         }
1115         else
1116         {
1117             Maasha::Common::error( qq(Could not parse locator: $locator) );
1118         }
1119
1120         $pos += length( $locator ) + 11;
1121
1122         $index_beg = $pos;
1123
1124 #        map { $pos += length( $_ ) + 1 } @{ $entry };
1125
1126         $pos += 6 * scalar @{ $entry };
1127
1128         $index_len = $pos - $index_beg;
1129
1130         push @{ $index{ $chr } }, [ $beg, undef, $beg + scalar @{ $entry } - 1, $index_beg, $index_len ];
1131     }
1132
1133     close $fh;
1134
1135     foreach $chr ( keys %index )
1136     {
1137         for ( $i = 0; $i < @{ $index{ $chr } } - 1; $i++ ) {
1138             $index{ $chr }->[ $i ]->[ FS_NEXT_CHR_BEG ] = $index{ $chr }->[ $i + 1 ]->[ 0 ];
1139         }
1140
1141         $index{ $chr }->[ -1 ]->[ FS_NEXT_CHR_BEG ] = $index{ $chr }->[ -1 ]->[ FS_CHR_END ] + 1;
1142     }
1143
1144     return wantarray ? %index : \%index;
1145 }
1146
1147
1148 sub fixedstep_index_store
1149 {
1150     # Martin A. Hansen, January 2008.
1151
1152     # Writes a fixedStep index to binary file.
1153
1154     my ( $path,   # full path to file
1155          $index,  # list with index
1156        ) = @_;
1157
1158     # returns nothing
1159
1160     Maasha::Common::file_store( $path, $index );
1161 }
1162
1163
1164 sub fixedstep_index_retrieve
1165 {
1166     # Martin A. Hansen, January 2008.
1167
1168     # Retrieves a fixedStep index from binary file.
1169
1170     my ( $path,   # full path to file
1171        ) = @_;
1172
1173     # returns list
1174
1175     my $index;
1176
1177     $index = Maasha::Common::file_retrieve( $path );
1178
1179     return wantarray ? %{ $index } : $index;
1180 }
1181
1182
1183 sub fixedstep_index_lookup
1184 {
1185     # Martin A. Hansen, January 2008.
1186
1187     # Retrieve fixedStep scores from a indexed
1188     # fixedStep file given a chromosome and
1189     # begin and end positions.
1190
1191     my ( $index,     # data structure
1192          $fh,        # filehandle to datafile
1193          $chr,       # chromosome
1194          $chr_beg,   # chromosome beg
1195          $chr_end,   # chromosome end
1196          $flank,     # include flanking region - OPTIONAL
1197        ) = @_;
1198
1199     # Returns a list
1200
1201     my ( $index_beg, $index_end, $i, $c, $beg, $end, @vals, $scores );
1202
1203     $flank ||= 0;
1204
1205     $chr_beg -= $flank;
1206     $chr_end += $flank;
1207
1208     # print "chr_beg->$chr_beg   chr_end->$chr_end   flank->$flank\n";
1209
1210     if ( exists $index->{ $chr } )
1211     {
1212         $index_beg = Maasha::Matrix::interval_search( $index->{ $chr }, 0, 1, $chr_beg );
1213
1214         if ( $index_beg < 0 ) {
1215             Maasha::Common::error( qq(Index search failed - begin index position doesn't exists: $chr_beg) );
1216         }
1217
1218         if ( $chr_end < $index->{ $chr }->[ $index_beg ]->[ 1 ] )
1219         {
1220             $index_end = $index_beg;
1221         }
1222         else
1223         {
1224             $index_end = Maasha::Matrix::interval_search( $index->{ $chr }, 0, 1, $chr_end );
1225
1226             if ( $index_end < 0 ) {
1227                 Maasha::Common::error( qq(Index search failed - end index position doesn't exists: $chr_end) );
1228             }
1229         }
1230
1231         map { $scores->[ $_ ] = 0 } 0 .. $chr_end - $chr_beg;
1232
1233         if ( $index_beg == $index_end )
1234         {
1235             $beg = Maasha::Calc::max( $chr_beg, $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] );
1236             $end = Maasha::Calc::min( $chr_end, $index->{ $chr }->[ $index_end ]->[ FS_CHR_END ] );
1237
1238             if ( $beg <= $index->{ $chr }->[ $index_beg ]->[ FS_CHR_END ] and $end >= $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] )
1239             {
1240                 @vals = split "\n", Maasha::Common::file_read(
1241                     $fh,
1242                     $index->{ $chr }->[ $index_beg ]->[ FS_INDEX_BEG ] + 6 * ( $beg - $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] ),
1243                     6 * ( $end - $beg + 1 ),
1244                 );
1245             }
1246
1247             for ( $c = 0; $c < @vals; $c++ ) {
1248                 $scores->[ $c + $beg - $chr_beg ] = $vals[ $c ];
1249             }
1250         }
1251         else
1252         {
1253             $beg = Maasha::Calc::max( $chr_beg, $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] );
1254
1255 #            print Dumper( $beg, $index->{ $chr }->[ $index_beg ] );
1256 #            print Dumper( "next", $index->{ $chr }->[ $index_beg ]->[ FS_NEXT_CHR_BEG ] );
1257
1258             #      beg         next
1259             #      v           v
1260             #  |||||||||.......
1261
1262             if ( $beg <= $index->{ $chr }->[ $index_beg ]->[ FS_CHR_END ] )
1263             {
1264                 @vals = split "\n", Maasha::Common::file_read(
1265                     $fh,
1266                     $index->{ $chr }->[ $index_beg ]->[ FS_INDEX_BEG ] + 6 * ( $beg - $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] ),
1267                     6 * ( $index->{ $chr }->[ $index_beg ]->[ FS_CHR_END ] - $beg + 1 ),
1268                 );
1269
1270                 for ( $c = 0; $c < @vals; $c++ ) {
1271                     $scores->[ $c + $beg - $chr_beg ] = $vals[ $c ];
1272                 }
1273             }
1274
1275             $end = Maasha::Calc::min( $chr_end, $index->{ $chr }->[ $index_end ]->[ FS_CHR_END ] );
1276
1277             if ( $end <= $index->{ $chr }->[ $index_end ]->[ FS_CHR_END ] )
1278             {
1279                 @vals = split "\n", Maasha::Common::file_read(
1280                     $fh,
1281                     $index->{ $chr }->[ $index_end ]->[ FS_INDEX_BEG ],
1282                     6 * ( $end - $index->{ $chr }->[ $index_end ]->[ FS_CHR_BEG ] + 1 ),
1283                 );
1284
1285                 for ( $c = 0; $c < @vals; $c++ ) {
1286                     $scores->[ $c + $index->{ $chr }->[ $index_end ]->[ FS_CHR_BEG ] - $chr_beg ] = $vals[ $c ];
1287                 }
1288             }
1289
1290             for ( $i = $index_beg + 1; $i <= $index_end - 1; $i++ )
1291             {
1292                 @vals = split "\n", Maasha::Common::file_read(
1293                     $fh,
1294                     $index->{ $chr }->[ $i ]->[ FS_INDEX_BEG ],
1295                     6 * ( $index->{ $chr }->[ $i ]->[ FS_CHR_END ] - $index->{ $chr }->[ $i ]->[ FS_CHR_BEG ] + 1 ),
1296                 );
1297
1298                 for ( $c = 0; $c < @vals; $c++ ) {
1299                     $scores->[ $c + $index->{ $chr }->[ $i ]->[ FS_CHR_BEG ] - $chr_beg ] = $vals[ $c ];
1300                 }
1301             }
1302         }
1303     }
1304     else
1305     {
1306         Maasha::Common::error( qq(Chromosome "$chr" was not found in index) );
1307     }
1308
1309     return wantarray ? @{ $scores } : $scores;
1310 }
1311
1312
1313 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PhastCons format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1314
1315
1316 sub phastcons_index
1317 {
1318     # Martin A. Hansen, July 2008
1319
1320     # Create a fixedStep index for PhastCons data.
1321
1322     my ( $file,   # file to index
1323          $dir,    # dir with file
1324        ) = @_;
1325
1326     # Returns nothing.
1327
1328     my ( $index );
1329
1330     $index = fixedstep_index_create( "$dir/$file" );
1331
1332     fixedstep_index_store( "$dir/$file.index", $index );
1333 }
1334
1335
1336 sub phastcons_parse_entry
1337 {
1338     # Martin A. Hansen, December 2007.
1339
1340     # Given a PhastCons entry converts this to a
1341     # list of super blocks.
1342
1343     my ( $lines,   # list of lines
1344          $args,    # argument hash
1345        ) = @_;
1346
1347     # Returns
1348
1349     my ( $info, $chr, $beg, $step, $i, $c, $j, @blocks, @super_blocks, @entries, $super_block, $block, @lens, @begs );
1350
1351     $info = shift @{ $lines };
1352
1353     if ( $info =~ /^chrom=([^ ]+) start=(\d+) step=(\d+)$/ )
1354     {
1355         $chr  = $1;
1356         $beg  = $2;
1357         $step = $3;
1358
1359         die qq(ERROR: step size $step != 1 -> problem!\n) if $step != 1; # in an ideal world should would be fixed ...
1360     }
1361
1362     $i = 0;
1363
1364     while ( $i < @{ $lines } )
1365     {
1366         if ( $lines->[ $i ] >= $args->{ "threshold" } )
1367         {
1368             $c = $i + 1;
1369
1370             while ( $c < @{ $lines } )
1371             {
1372                 if ( $lines->[ $c ] < $args->{ "threshold" } )
1373                 {
1374                     $j = $c + 1;
1375
1376                     while ( $j < @{ $lines } and $lines->[ $j ] < $args->{ "threshold" } ) {
1377                         $j++;
1378                     }
1379
1380                     if ( $j - $c > $args->{ "gap" } )
1381                     {
1382                         if ( $c - $i >= $args->{ "min" } )
1383                         {
1384                             push @blocks, {
1385                                 CHR     => $chr,
1386                                 CHR_BEG => $beg + $i - 1,
1387                                 CHR_END => $beg + $c - 2,
1388                                 CHR_LEN => $c - $i,
1389                             };
1390                         }
1391
1392                         $i = $j;
1393
1394                         last;
1395                     }
1396
1397                     $c = $j
1398                 }
1399                 else
1400                 {
1401                     $c++;
1402                 }
1403             }
1404
1405             if ( $c - $i >= $args->{ "min" } )
1406             {
1407                 push @blocks, {
1408                     CHR     => $chr,
1409                     CHR_BEG => $beg + $i - 1,
1410                     CHR_END => $beg + $c - 2,
1411                     CHR_LEN => $c - $i,
1412                 };
1413             }
1414
1415             $i = $c;
1416         }
1417         else
1418         {
1419             $i++;
1420         }
1421     }
1422
1423     $i = 0;
1424
1425     while ( $i < @blocks )
1426     {
1427         $c = $i + 1;
1428
1429         while ( $c < @blocks and $blocks[ $c ]->{ "CHR_BEG" } - $blocks[ $c - 1 ]->{ "CHR_END" } <= $args->{ "dist" } )
1430         {
1431             $c++;
1432         }
1433
1434         push @super_blocks, [ @blocks[ $i .. $c - 1 ] ];
1435
1436         $i = $c;
1437     }
1438
1439     foreach $super_block ( @super_blocks )
1440     {
1441         foreach $block ( @{ $super_block } )
1442         {
1443             push @begs, $block->{ "CHR_BEG" } - $super_block->[ 0 ]->{ "CHR_BEG" };
1444             push @lens, $block->{ "CHR_LEN" } - 1;
1445         }
1446
1447         $lens[ -1 ]++;
1448
1449         push @entries, {
1450             CHR         => $super_block->[ 0 ]->{ "CHR" },
1451             CHR_BEG     => $super_block->[ 0 ]->{ "CHR_BEG" },
1452             CHR_END     => $super_block->[ -1 ]->{ "CHR_END" },
1453             Q_ID        => "Q_ID",
1454             SCORE       => 100,
1455             STRAND      => "+",
1456             THICK_BEG   => $super_block->[ 0 ]->{ "CHR_BEG" },
1457             THICK_END   => $super_block->[ -1 ]->{ "CHR_END" } + 1,
1458             COLOR       => 0,
1459             BLOCK_COUNT => scalar @{ $super_block },
1460             BLOCK_LENS  => join( ",", @lens ),
1461             Q_BEGS      => join( ",", @begs ),
1462         };
1463
1464         undef @begs;
1465         undef @lens;
1466     }
1467
1468     return wantarray ? @entries : \@entries;
1469 }
1470
1471
1472 sub phastcons_normalize
1473 {
1474     # Martin A. Hansen, January 2008.
1475
1476     # Normalizes a list of lists with PhastCons scores,
1477     # in such a way that each list contains the same number
1478     # or PhastCons scores.
1479
1480     my ( $AoA,    # AoA with PhastCons scores
1481        ) = @_;
1482
1483     # Returns AoA.
1484
1485     my ( $list, $max, $min, $mean, $diff );
1486
1487     $min = 99999999;
1488     $max = 0;
1489
1490     foreach $list ( @{ $AoA } )
1491     {
1492         $min = scalar @{ $list } if scalar @{ $list } < $min;
1493         $max = scalar @{ $list } if scalar @{ $list } > $max;
1494     }
1495
1496     $mean = int( ( $min + $max ) / 2 );
1497
1498 #    print STDERR "min->$min   max->$max   mean->$mean\n";
1499
1500     foreach $list ( @{ $AoA } )
1501     {
1502         $diff = scalar @{ $list } - $mean;
1503
1504         phastcons_list_inflate( $list, abs( $diff ) ) if $diff < 0;
1505         phastcons_list_deflate( $list, $diff )        if $diff > 0;
1506     }
1507
1508     return wantarray ? @{ $AoA } : $AoA;
1509 }
1510
1511
1512 sub phastcons_list_inflate
1513 {
1514     # Martin A. Hansen, January 2008.
1515
1516     # Inflates a list with a given number of elements
1517     # in such a way that the extra elements are introduced
1518     # evenly over the entire length of the list. The value
1519     # of the extra elements is based on a mean of the
1520     # adjacent elements.
1521
1522     my ( $list,   # list of elements
1523          $diff,   # number of elements to introduce
1524        ) = @_;
1525
1526     # Returns nothing
1527
1528     my ( $len, $space, $i, $pos );
1529
1530     $len = scalar @{ $list };
1531
1532     $space = $len / $diff;
1533
1534     for ( $i = 0; $i < $diff; $i++ )
1535     {
1536         $pos = int( ( $space / 2 ) + $i * $space );
1537
1538         splice @{ $list }, $pos, 0, ( $list->[ $pos - 1 ] + $list->[ $pos + 1 ] ) / 2;
1539         # splice @{ $list }, $pos, 0, "X";
1540     }
1541
1542     die qq(ERROR: bad inflate\n) if scalar @{ $list } != $len + $diff;
1543 }
1544
1545
1546 sub phastcons_list_deflate
1547 {
1548     # Martin A. Hansen, January 2008.
1549
1550     # Deflates a list by removing a given number of elements
1551     # evenly distributed over the entire list.
1552
1553     my ( $list,   # list of elements
1554          $diff,   # number of elements to remove
1555        ) = @_;
1556
1557     # Returns nothing
1558
1559     my ( $len, $space, $i, $pos );
1560
1561     $len = scalar @{ $list };
1562
1563     $space = ( $len - $diff ) / $diff;
1564
1565     for ( $i = 0; $i < $diff; $i++ )
1566     {
1567         $pos = int( ( $space / 2 ) + $i * $space );
1568
1569         splice @{ $list }, $pos, 1;
1570     }
1571
1572     die qq(ERROR: bad deflate\n) if scalar @{ $list } != $len - $diff;
1573 }
1574
1575
1576 sub phastcons_mean
1577 {
1578     # Martin A. Hansen, January 2008.
1579
1580     # Given a normalized PhastCons matrix in an AoA,
1581     # calculate the mean for each column and return as a list.
1582
1583     my ( $AoA,    # AoA with normalized PhastCons scores
1584        ) = @_;
1585
1586     # Returns a list
1587
1588     my ( @list );
1589
1590     $AoA = Maasha::Matrix::matrix_flip( $AoA );
1591
1592     map { push @list, Maasha::Calc::mean( $_ ) } @{ $AoA };
1593
1594     return wantarray ? @list : \@list;
1595 }
1596
1597
1598 sub phastcons_median
1599 {
1600     # Martin A. Hansen, January 2008.
1601
1602     # Given a normalized PhastCons matrix in an AoA,
1603     # calculate the median for each column and return as a list.
1604
1605     my ( $AoA,    # AoA with normalized PhastCons scores
1606        ) = @_;
1607
1608     # Returns a list
1609
1610     my ( @list );
1611
1612     $AoA = Maasha::Matrix::matrix_flip( $AoA );
1613
1614     map { push @list, Maasha::Calc::median( $_ ) } @{ $AoA };
1615
1616     return wantarray ? @list : \@list;
1617 }
1618
1619
1620 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> MULTIPLE ALIGNMENT FILES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1621
1622
1623 sub maf_extract
1624 {
1625     # Martin A. Hansen, April 2008.
1626
1627     # Executes mafFrag to extract a subalignment from a multiz track
1628     # in the UCSC genome browser database.
1629
1630     my ( $tmp_dir,    # temporary directory
1631          $database,   # genome database
1632          $table,      # table with the multiz track
1633          $chr,        # chromosome
1634          $beg,        # begin position
1635          $end,        # end position
1636          $strand,     # strand
1637        ) = @_;
1638
1639     # Returns a list of record
1640
1641     my ( $tmp_file, $align );
1642
1643     $tmp_file = "$tmp_dir/maf_extract.maf";
1644
1645     Maasha::Common::run( "mafFrag", "$database $table $chr $beg $end $strand $tmp_file" );
1646
1647     $align = maf_parse( $tmp_file );
1648
1649     unlink $tmp_file;
1650
1651     return wantarray ? @{ $align } : $align;
1652 }
1653
1654
1655 sub maf_parse
1656 {
1657     # Martin A. Hansen, April 2008.
1658
1659
1660     my ( $path,   # full path to MAF file
1661        ) = @_;
1662
1663     # Returns a list of record.
1664
1665     my ( $fh, $line, @fields, @align );
1666
1667     $fh = Maasha::Common::read_open( $path );
1668
1669     while ( $line = <$fh> )
1670     {
1671         chomp $line;
1672
1673         if ( $line =~ /^s/ )
1674         {
1675             @fields = split / /, $line;
1676
1677             push @align, {
1678                 SEQ_NAME  => $fields[ 1 ],
1679                 SEQ       => $fields[ -1 ],
1680                 ALIGN     => 1,
1681                 ALIGN_LEN => length $fields[ -1 ],
1682             }
1683         }
1684     }
1685
1686     close $fh;
1687
1688     return wantarray ? @align : \@align;
1689 }
1690
1691
1692 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> WIGGLE FORMAT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1693
1694
1695 sub fixedstep_put_entry
1696 {
1697     # Martin A. Hansen, April 2008.
1698
1699     # Outputs a block of fixedStep values.
1700     # Used for outputting wiggle data.
1701
1702     my ( $chr,      # chromosome
1703          $beg,      # start position
1704          $block,    # list of scores
1705          $fh,       # filehandle - OPTIONAL
1706          $log10,    # flag indicating that log10 scores should be used
1707        ) = @_;
1708
1709     # Returns nothing.
1710
1711     $beg += 1;   # fixedStep format is 1 based.
1712
1713     $fh ||= \*STDOUT;
1714
1715     print $fh "fixedStep chrom=$chr start=$beg step=1\n";
1716
1717     if ( $log10 ) {
1718         map { printf( $fh "%f\n", Maasha::Calc::log10( $_ + 1 ) ) } @{ $block };
1719     } else {
1720         map { printf( $fh "%d\n", ( $_ + 1 ) ) } @{ $block };
1721     }
1722 }
1723
1724
1725 sub wiggle_upload_to_ucsc
1726 {
1727     # Martin A. Hansen, May 2008.
1728
1729     # Upload a wiggle file to the UCSC database.
1730
1731     my ( $tmp_dir,    # temporary directory
1732          $wib_dir,    # wib directory
1733          $wig_file,   # file to upload,
1734          $options,    # argument hashref
1735        ) = @_;
1736
1737     # Returns nothing.
1738
1739     my ( $args );
1740
1741 #    $args = join " ", "-tmpDir=$tmp_dir", "-pathPrefix=$wib_dir", $options->{ "database" }, $options->{ 'table' }, $wig_file;
1742
1743 #    Maasha::Common::run( "hgLoadWiggle", "$args > /dev/null 2>&1" );
1744
1745     if ( $options->{ 'verbose' } ) {
1746         `cd $tmp_dir && hgLoadWiggle -tmpDir=$tmp_dir -pathPrefix=$wib_dir $options->{ 'database' } $options->{ 'table' } $wig_file`;
1747     } else {
1748         `cd $tmp_dir && hgLoadWiggle -tmpDir=$tmp_dir -pathPrefix=$wib_dir $options->{ 'database' } $options->{ 'table' } $wig_file > /dev/null 2>&1`;
1749     }
1750 }
1751
1752
1753 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> MySQL CONF <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1754
1755
1756 sub ucsc_get_user
1757 {
1758     # Martin A. Hansen, May 2008
1759
1760     # Fetches the MySQL database user name from the
1761     # .hg.conf file in the users home directory.
1762
1763     # Returns a string.
1764
1765     my ( $fh, $line, $user );
1766
1767     $fh = Maasha::Common::read_open( "$ENV{ 'HOME' }/.hg.conf" );
1768
1769     while ( $line = <$fh> )
1770     {
1771         chomp $line;
1772
1773         if ( $line =~ /^db\.user=(.+)/ )
1774         {
1775             $user = $1;
1776
1777             last;
1778         }
1779     }
1780
1781     close $fh;
1782
1783     return $user;
1784 }
1785
1786
1787 sub ucsc_get_password
1788 {
1789     # Martin A. Hansen, May 2008
1790
1791     # Fetches the MySQL database password from the
1792     # .hg.conf file in the users home directory.
1793
1794     # Returns a string.
1795
1796     my ( $fh, $line, $password );
1797
1798     $fh = Maasha::Common::read_open( "$ENV{ 'HOME' }/.hg.conf" );
1799
1800     while ( $line = <$fh> )
1801     {
1802         chomp $line;
1803
1804         if ( $line =~ /^db\.password=(.+)/ )
1805         {
1806             $password = $1;
1807
1808             last;
1809         }
1810     }
1811
1812     close $fh;
1813
1814     return $password;
1815 }
1816
1817
1818 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1819
1820
1821 __END__
1822