code_perl/Maasha/UCSC.pm

   1 package Maasha::UCSC;
   2
   3 # Copyright (C) 2007 Martin A. Hansen.
   4
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; either version 2
   8 # of the License, or (at your option) any later version.
   9
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18
  19 # http://www.gnu.org/copyleft/gpl.html
  20
  21
  22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  23
  24
  25 # Stuff for interacting with UCSC genome browser
  26
  27
  28 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  29
  30
  31 use strict;
  32 use vars qw ( @ISA @EXPORT );
  33
  34 use Data::Dumper;
  35 use Maasha::Common;
  36 use Maasha::Filesys;
  37 use Maasha::Calc;
  38 use Maasha::Matrix;
  39
  40 use constant {
  41     FS_CHR_BEG      => 0,
  42     FS_NEXT_CHR_BEG => 1,
  43     FS_CHR_END      => 2,
  44     FS_INDEX_BEG    => 3,
  45     FS_INDEX_LEN    => 4,
  46
  47     CHR          => 0,
  48     CHR_BEG      => 1,
  49     CHR_END      => 2,
  50     Q_ID         => 3,
  51     SCORE        => 4,
  52     STRAND       => 5,
  53     THICK_BEG    => 6,
  54     THICK_END    => 7,
  55     COLOR        => 8,
  56     BLOCK_COUNT  => 9,
  57     BLOCK_LENS   => 10,
  58     Q_BEGS       => 11,
  59 };
  60
  61 @ISA = qw( Exporter );
  62
  63
  64 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> BED format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  65
  66
  67 # http://genome.ucsc.edu/goldenPath/help/hgTracksHelp.html#BED
  68
  69
  70 sub bed_entry_get_array
  71 {
  72     # Martin A. Hansen, September 2008.
  73
  74     # Reads a BED entry given a filehandle.
  75
  76     # This is a new _faster_ BED entry parser that
  77     # uses arrays and not hashrefs.
  78
  79     # IMPORTANT! This function does not correct the
  80     # BED_END position that is kept the way UCSC
  81     # does.
  82
  83     my ( $fh,     # file handle
  84          $cols,   # columns to read - OPTIONAL (3,4,5,6, or 12)
  85        ) = @_;
  86
  87     # Returns a list.
  88
  89     my ( $line, @entry );
  90
  91     $line = <$fh>;
  92
  93     $line =~ tr/\n\r//d;    # some people have carriage returns in their BED files -> Grrrr
  94
  95     return if not defined $line;
  96
  97     if ( not defined $cols ) {
  98         $cols = 1 + $line =~ tr/\t//;
  99     }
 100
 101     @entry = split "\t", $line, $cols + 1;
 102
 103     pop @entry if scalar @entry > $cols;
 104
 105     return wantarray ? @entry : \@entry;
 106 }
 107
 108
 109 sub bed_get_entry
 110 {
 111     # Martin A. Hansen, December 2007.
 112
 113     # Reads a bed entry given a filehandle.
 114
 115     my ( $fh,        # file handle
 116          $columns,   # number of BED columns to read  -  OPTIONAL
 117        ) = @_;
 118
 119     # Returns hashref.
 120
 121     my ( $line, @fields, %entry );
 122
 123     $line = <$fh>;
 124
 125     $line =~ tr/\n\r//d;    # some people have carriage returns in their BED files -> Grrrr
 126
 127     return if not defined $line;
 128
 129     @fields = split "\t", $line;
 130
 131     $columns ||= scalar @fields;
 132
 133     if ( $columns == 3 )
 134     {
 135         %entry = (
 136             "CHR"      => $fields[ 0 ],
 137             "CHR_BEG"  => $fields[ 1 ],
 138             "CHR_END"  => $fields[ 2 ] - 1,
 139         );
 140     }
 141     elsif ( $columns == 4 )
 142     {
 143         %entry = (
 144             "CHR"      => $fields[ 0 ],
 145             "CHR_BEG"  => $fields[ 1 ],
 146             "CHR_END"  => $fields[ 2 ] - 1,
 147             "Q_ID"     => $fields[ 3 ],
 148         );
 149     }
 150     elsif ( $columns == 5 )
 151     {
 152         %entry = (
 153             "CHR"      => $fields[ 0 ],
 154             "CHR_BEG"  => $fields[ 1 ],
 155             "CHR_END"  => $fields[ 2 ] - 1,
 156             "Q_ID"     => $fields[ 3 ],
 157             "SCORE"    => $fields[ 4 ],
 158         );
 159     }
 160     elsif ( $columns == 6 )
 161     {
 162         %entry = (
 163             "CHR"      => $fields[ 0 ],
 164             "CHR_BEG"  => $fields[ 1 ],
 165             "CHR_END"  => $fields[ 2 ] - 1,
 166             "Q_ID"     => $fields[ 3 ],
 167             "SCORE"    => $fields[ 4 ],
 168             "STRAND"   => $fields[ 5 ],
 169         );
 170     }
 171     elsif ( $columns == 12 )
 172     {
 173         %entry = (
 174             "CHR"         => $fields[ 0 ],
 175             "CHR_BEG"     => $fields[ 1 ],
 176             "CHR_END"     => $fields[ 2 ] - 1,
 177             "Q_ID"        => $fields[ 3 ],
 178             "SCORE"       => $fields[ 4 ],
 179             "STRAND"      => $fields[ 5 ],
 180             "THICK_BEG"   => $fields[ 6 ],
 181             "THICK_END"   => $fields[ 7 ] - 1,
 182             "COLOR"       => $fields[ 8 ],
 183             "BLOCK_COUNT" => $fields[ 9 ],
 184             "BLOCK_LENS"  => $fields[ 10 ],
 185             "Q_BEGS"      => $fields[ 11 ],
 186         );
 187     }
 188     else
 189     {
 190         Maasha::Common::error( qq(Bad BED format in line->$line<-) );
 191     }
 192
 193     $entry{ "REC_TYPE" } = "BED";
 194     $entry{ "BED_LEN" }  = $entry{ "CHR_END" } - $entry{ "CHR_BEG" } + 1;
 195     $entry{ "BED_COLS" } = $columns;
 196
 197     return wantarray ? %entry : \%entry;
 198 }
 199
 200
 201 sub bed_get_entries
 202 {
 203     # Martin A. Hansen, January 2008.
 204
 205     # Given a path to a BED file, read in all entries
 206     # and return.
 207
 208     my ( $path,     # full path to BED file
 209          $columns,  # number of columns in BED file - OPTIONAL (but is faster)
 210        ) = @_;
 211
 212     # Returns a list.
 213
 214     my ( $fh, $entry, @list );
 215
 216     $fh = Maasha::Common::read_open( $path );
 217
 218     while ( $entry = bed_get_entry( $fh ) ) {
 219         push @list, $entry;
 220     }
 221
 222     close $fh;
 223
 224     return wantarray ? @list : \@list;
 225 }
 226
 227
 228 sub bed_entry_put_array
 229 {
 230     # Martin A. Hansen, Septermber 2008.
 231
 232     # Writes a BED entry array to file.
 233
 234     # IMPORTANT! This function does not correct the
 235     # BED_END position that is assumed to be in the
 236     # UCSC positions scheme.
 237
 238     my ( $record,   # list
 239          $fh,       # file handle                   - OPTIONAL
 240          $cols,     # number of columns in BED file - OPTIONAL
 241        ) = @_;
 242
 243     # Returns nothing.
 244
 245     $fh = \*STDOUT if not defined $fh;
 246
 247     if ( defined $cols ) {
 248         print $fh join( "\t", @{ $record }[ 0 .. $cols - 1 ] ), "\n";
 249     } else {
 250         print $fh join( "\t", @{ $record } ), "\n";
 251     }
 252 }
 253
 254
 255 sub bed_put_entry
 256 {
 257     # Martin A. Hansen, Septermber 2007.
 258
 259     # Writes a BED entry to file.
 260
 261     # NB, this could really be more robust!?
 262
 263     my ( $record,       # hashref
 264          $fh,           # file handle                   - OPTIONAL
 265          $columns,      # number of columns in BED file - OPTIONAL (but is faster)
 266        ) = @_;
 267
 268     # Returns nothing.
 269
 270     my ( @fields );
 271
 272     $columns ||= 12;   # max number of columns possible
 273
 274     if ( $columns == 3 )
 275     {
 276         push @fields, $record->{ "CHR" };
 277         push @fields, $record->{ "CHR_BEG" };
 278         push @fields, $record->{ "CHR_END" } + 1;
 279     }
 280     elsif ( $columns == 4 )
 281     {
 282         $record->{ "Q_ID" }  =~ s/\s+/_/g;
 283
 284         push @fields, $record->{ "CHR" };
 285         push @fields, $record->{ "CHR_BEG" };
 286         push @fields, $record->{ "CHR_END" } + 1;
 287         push @fields, $record->{ "Q_ID" };
 288     }
 289     elsif ( $columns == 5 )
 290     {
 291         $record->{ "Q_ID" }  =~ s/\s+/_/g;
 292         $record->{ "SCORE" } =~ s/\.\d*//;
 293
 294         push @fields, $record->{ "CHR" };
 295         push @fields, $record->{ "CHR_BEG" };
 296         push @fields, $record->{ "CHR_END" } + 1;
 297         push @fields, $record->{ "Q_ID" };
 298         push @fields, $record->{ "SCORE" };
 299     }
 300     elsif ( $columns == 6 )
 301     {
 302         $record->{ "Q_ID" }  =~ s/\s+/_/g;
 303         $record->{ "SCORE" } =~ s/\.\d*//;
 304
 305         push @fields, $record->{ "CHR" };
 306         push @fields, $record->{ "CHR_BEG" };
 307         push @fields, $record->{ "CHR_END" } + 1;
 308         push @fields, $record->{ "Q_ID" };
 309         push @fields, $record->{ "SCORE" };
 310         push @fields, $record->{ "STRAND" };
 311     }
 312     else
 313     {
 314         $record->{ "Q_ID" }  =~ s/\s+/_/g;
 315         $record->{ "SCORE" } =~ s/\.\d*//;
 316
 317         push @fields, $record->{ "CHR" };
 318         push @fields, $record->{ "CHR_BEG" };
 319         push @fields, $record->{ "CHR_END" } + 1;
 320         push @fields, $record->{ "Q_ID" };
 321         push @fields, $record->{ "SCORE" };
 322         push @fields, $record->{ "STRAND" };
 323         push @fields, $record->{ "THICK_BEG" }     if defined $record->{ "THICK_BEG" };
 324         push @fields, $record->{ "THICK_END" } + 1 if defined $record->{ "THICK_END" };
 325         push @fields, $record->{ "COLOR" }         if defined $record->{ "COLOR" };
 326         push @fields, $record->{ "BLOCK_COUNT" }   if defined $record->{ "BLOCK_COUNT" };
 327         push @fields, $record->{ "BLOCK_LENS" }    if defined $record->{ "BLOCK_LENS" };
 328         push @fields, $record->{ "Q_BEGS" }        if defined $record->{ "Q_BEGS" };
 329     }
 330
 331     if ( $fh ) {
 332         print $fh join( "\t", @fields ), "\n";
 333     } else {
 334         print join( "\t", @fields ), "\n";
 335     }
 336 }
 337
 338
 339 sub bed_put_entries
 340 {
 341     # Martin A. Hansen, January 2008.
 342
 343     # Write a list of BED entries.
 344
 345     my ( $entries,   # list of entries,
 346          $fh,        # file handle - OPTIONAL
 347        ) = @_;
 348
 349     # Returns nothing.
 350
 351     map { bed_put_entry( $_, $fh ) } @{ $entries };
 352 }
 353
 354
 355 sub bed_analyze
 356 {
 357     # Martin A. Hansen, March 2008.
 358
 359     # Given a bed record, analysis this to give information
 360     # about intron/exon sizes.
 361
 362     my ( $entry,   # BED entry
 363        ) = @_;
 364
 365     # Returns hashref.
 366
 367     my ( $i, @begs, @lens, $exon_max, $exon_min, $exon_len, $exon_tot, $intron_max, $intron_min, $intron_len, $intron_tot );
 368
 369     $exon_max   = 0;
 370     $exon_min   = 9999999999;
 371     $intron_max = 0;
 372     $intron_min = 9999999999;
 373
 374     $entry->{ "EXONS" }   = $entry->{ "BLOCK_COUNT" };
 375
 376     @begs = split /,/, $entry->{ "Q_BEGS" };
 377     @lens = split /,/, $entry->{ "BLOCK_LENS" };
 378
 379     for ( $i = 0; $i < $entry->{ "BLOCK_COUNT" }; $i++ )
 380     {
 381         $exon_len = $lens[ $i ];
 382
 383         $entry->{ "EXON_LEN_$i" } = $exon_len;
 384
 385         $exon_max = $exon_len if $exon_len > $exon_max;
 386         $exon_min = $exon_len if $exon_len < $exon_min;
 387
 388         $exon_tot += $exon_len;
 389     }
 390
 391     $entry->{ "EXON_LEN_-1" }   = $exon_len;
 392     $entry->{ "EXON_MAX_LEN" }  = $exon_max;
 393     $entry->{ "EXON_MIN_LEN" }  = $exon_min;
 394     $entry->{ "EXON_MEAN_LEN" } = int( $exon_tot / $entry->{ "EXONS" } );
 395
 396     $entry->{ "INTRONS" } = $entry->{ "BLOCK_COUNT" } - 1;
 397     $entry->{ "INTRONS" } = 0 if $entry->{ "INTRONS" } < 0;
 398
 399     if ( $entry->{ "INTRONS" } )
 400     {
 401         for ( $i = 1; $i < $entry->{ "BLOCK_COUNT" }; $i++ )
 402         {
 403             $intron_len = $begs[ $i ] - ( $begs[ $i - 1 ] + $lens[ $i - 1 ] );
 404
 405             $entry->{ "INTRON_LEN_" . ( $i - 1 ) } = $intron_len;
 406
 407             $intron_max = $intron_len if $intron_len > $intron_max;
 408             $intron_min = $intron_len if $intron_len < $intron_min;
 409
 410             $intron_tot += $intron_len;
 411         }
 412
 413         $entry->{ "INTRON_LEN_-1" }   = $intron_len;
 414         $entry->{ "INTRON_MAX_LEN" }  = $intron_max;
 415         $entry->{ "INTRON_MIN_LEN" }  = $intron_min;
 416         $entry->{ "INTRON_MEAN_LEN" } = int( $intron_tot / $entry->{ "INTRONS" } );
 417     }
 418
 419     return wantarray ? %{ $entry } : $entry;
 420 }
 421
 422
 423 sub bed_sort
 424 {
 425     # Martin A. Hansen, Septermber 2008
 426
 427     # Sorts a BED file using the c program
 428     # "bed_sort" specifing a sort mode:
 429
 430     # 1: chr AND chr_beg.
 431     # 2: chr AND strand AND chr_beg.
 432     # 3: chr_beg.
 433     # 4: strand AND chr_beg.
 434
 435     my ( $bed_file,    # BED file to sort
 436          $sort_mode,   # See above.
 437          $cols,        # Number of columns in BED file
 438        ) = @_;
 439
 440     &Maasha::Common::run( "bed_sort", "--sort $sort_mode --cols $cols $bed_file" );
 441 }
 442
 443
 444 sub bed_split_to_files
 445 {
 446     # Martin A. Hansen, Septermber 2008
 447
 448     # Given a list of BED files, split these
 449     # into temporary files based on the chromosome
 450     # name. Returns a list of the temporary files.
 451
 452     my ( $bed_files,   # list of BED files to split
 453          $cols,        # number of columns
 454          $tmp_dir,     # temporary directory
 455        ) = @_;
 456
 457     # Returns a list.
 458
 459     my ( $bed_file, $fh_in, $entry, $key, %fh_hash, @tmp_files );
 460
 461     foreach $bed_file ( @{ $bed_files } )
 462     {
 463         $fh_in = Maasha::Common::read_open( $bed_file );
 464
 465         while ( $entry = bed_entry_get_array( $fh_in, $cols ) )
 466         {
 467             $key = $entry->[ CHR ];
 468
 469             $fh_hash{ $key } = Maasha::Common::write_open( "$tmp_dir/$key.temp" ) if not exists $fh_hash{ $key };
 470
 471             bed_entry_put_array( $entry, $fh_hash{ $key } );
 472         }
 473
 474         close $fh_in;
 475     }
 476
 477     foreach $key ( sort keys %fh_hash )
 478     {
 479         push @tmp_files, "$tmp_dir/$key.temp";
 480
 481         close $fh_hash{ $key };
 482     }
 483
 484     return wantarray ? @tmp_files : \@tmp_files;
 485 }
 486
 487
 488 sub bed_merge_entries
 489 {
 490     # Martin A. Hansen, February 2008.
 491
 492     # Merge a list of given BED entries in one big entry.
 493
 494     my ( $entries,     # list of BED entries to be merged
 495        ) = @_;
 496
 497     # Returns hash.
 498
 499     my ( $i, @q_ids, @q_begs, @blocksizes, @new_q_begs, @new_blocksizes, %new_entry );
 500
 501     @{ $entries } = sort { $a->{ "CHR_BEG" } <=> $b->{ "CHR_BEG" } } @{ $entries };
 502
 503     for ( $i = 0; $i < @{ $entries }; $i++ )
 504     {
 505         Maasha::Common::error( qq(Attempted merge of BED entries from different chromosomes) ) if $entries->[ 0 ]->{ "CHR" }    ne $entries->[ $i ]->{ "CHR" };
 506         Maasha::Common::error( qq(Attempted merge of BED entries from different strands) )     if $entries->[ 0 ]->{ "STRAND" } ne $entries->[ $i ]->{ "STRAND" };
 507
 508         push @q_ids, $entries->[ $i ]->{ "Q_ID" } || sprintf( "ID%06d", $i );
 509
 510         if ( exists $entries->[ $i ]->{ "Q_BEGS" } )
 511         {
 512             @q_begs     = split ",", $entries->[ $i ]->{ "Q_BEGS" };
 513             @blocksizes = split ",", $entries->[ $i ]->{ "BLOCK_LENS" };
 514         }
 515         else
 516         {
 517             @q_begs     = 0;
 518             @blocksizes = $entries->[ $i ]->{ "CHR_END" } - $entries->[ $i ]->{ "CHR_BEG" } + 1;
 519         }
 520
 521         map { $_ += $entries->[ $i ]->{ "CHR_BEG" } } @q_begs;
 522
 523         push @new_q_begs, @q_begs;
 524         push @new_blocksizes, @blocksizes;
 525     }
 526
 527     map { $_ -= $entries->[ 0 ]->{ "CHR_BEG" } } @new_q_begs;
 528
 529     %new_entry = (
 530         CHR          => $entries->[ 0 ]->{ "CHR" },
 531         CHR_BEG      => $entries->[ 0 ]->{ "CHR_BEG" },
 532         CHR_END      => $entries->[ -1 ]->{ "CHR_END" },
 533         REC_TYPE     => "BED",
 534         BED_LEN      => $entries->[ -1 ]->{ "CHR_END" } - $entries->[ 0 ]->{ "CHR_BEG" } + 1,
 535         BED_COLS     => 12,
 536         Q_ID         => join( ":", @q_ids ),
 537         SCORE        => 999,
 538         STRAND       => $entries->[ 0 ]->{ "STRAND" }     || "+",
 539         THICK_BEG    => $entries->[ 0 ]->{ "THICK_BEG" }  || $entries->[ 0 ]->{ "CHR_BEG" },
 540         THICK_END    => $entries->[ -1 ]->{ "THICK_END" } || $entries->[ -1 ]->{ "CHR_END" },
 541         COLOR        => 0,
 542         BLOCK_COUNT  => scalar @new_q_begs,
 543         BLOCK_LENS   => join( ",", @new_blocksizes ),
 544         Q_BEGS       => join( ",", @new_q_begs ),
 545     );
 546
 547     return wantarray ? %new_entry : \%new_entry;
 548 }
 549
 550
 551 sub bed_split_entry
 552 {
 553     # Martin A. Hansen, February 2008.
 554
 555     # Splits a given BED entry into a list of blocks,
 556     # which are returned. A list of 6 column BED entry is returned.
 557
 558     my ( $entry,    # BED entry hashref
 559        ) = @_;
 560
 561     # Returns a list.
 562
 563     my ( @q_begs, @blocksizes, $block, @blocks, $i );
 564
 565     if ( exists $entry->{ "BLOCK_COUNT" } )
 566     {
 567         @q_begs     = split ",", $entry->{ "Q_BEGS" };
 568         @blocksizes = split ",", $entry->{ "BLOCK_LENS" };
 569
 570         for ( $i = 0; $i < @q_begs; $i++ )
 571         {
 572             undef $block;
 573
 574             $block->{ "CHR" }      = $entry->{ "CHR" };
 575             $block->{ "CHR_BEG" }  = $entry->{ "CHR_BEG" } + $q_begs[ $i ];
 576             $block->{ "CHR_END" }  = $entry->{ "CHR_BEG" } + $q_begs[ $i ] + $blocksizes[ $i ] - 1;
 577             $block->{ "Q_ID" }     = $entry->{ "Q_ID" } . sprintf( "_%03d", $i );
 578             $block->{ "SCORE" }    = $entry->{ "SCORE" };
 579             $block->{ "STRAND" }   = $entry->{ "STRAND" };
 580             $block->{ "BED_LEN" }  = $block->{ "CHR_END" } - $block->{ "CHR_BEG" } + 1,
 581             $block->{ "BED_COLS" } = 6;
 582             $block->{ "REC_TYPE" } = "BED";
 583
 584             push @blocks, $block;
 585         }
 586     }
 587     else
 588     {
 589         @blocks = @{ $entry };
 590     }
 591
 592     return wantarray ? @blocks : \@blocks;
 593 }
 594
 595
 596
 597 sub bed_overlap
 598 {
 599     # Martin A. Hansen, February 2008.
 600
 601     # Checks if two BED entries overlap and
 602     # return 1 if so - else 0;
 603
 604     my ( $entry1,      # hashref
 605          $entry2,      # hashref
 606          $no_strand,   # don't check strand flag - OPTIONAL
 607        ) = @_;
 608
 609     # Return bolean.
 610
 611     return 0 if $entry1->{ "CHR" }    ne $entry2->{ "CHR" };
 612     return 0 if $entry1->{ "STRAND" } ne $entry2->{ "STRAND" };
 613
 614     if ( $entry1->{ "CHR_END" } < $entry2->{ "CHR_BEG" } or $entry1->{ "CHR_BEG" } > $entry2->{ "CHR_END" } ) {
 615         return 0;
 616     } else {
 617         return 1;
 618     }
 619 }
 620
 621
 622 sub bed_upload_to_ucsc
 623 {
 624     # Martin A. Hansen, September 2007.
 625
 626     # Upload a BED file to the UCSC database.
 627
 628     my ( $tmp_dir,   # temporary directory
 629          $file,      # file to upload,
 630          $options,   # argument hashref
 631          $append,    # flag indicating table should be appended
 632        ) = @_;
 633
 634     # Returns nothing.
 635
 636     my ( $args, $table, $sql_file, $fh_out, $fh_in );
 637
 638     if ( $append ) {
 639         $args = join " ", $options->{ "database" }, $options->{ "table" }, "-tmpDir=$tmp_dir", "-oldTable", $file;
 640     } else {
 641         $args = join " ", $options->{ "database" }, $options->{ "table" }, "-tmpDir=$tmp_dir", $file;
 642     }
 643
 644     if ( $options->{ "table" } =~ /rnaSecStr/ )
 645     {
 646         $table = $options->{ "table" };
 647
 648         print qq(uploading secondary structure table:"$table"\n) if $options->{ "verbose" };
 649
 650         $sql_file = "$tmp_dir/upload_RNA_SS.sql";
 651
 652         $fh_out   = Maasha::Common::write_open( $sql_file );
 653
 654         print $fh_out qq(
 655 CREATE TABLE $table (
 656     bin smallint not null,              # Bin number for browser speedup
 657     chrom varchar(255) not null,        # Chromosome or FPC contig
 658     chromStart int unsigned not null,   # Start position in chromosome
 659     chromEnd int unsigned not null,     # End position in chromosome
 660     name varchar(255) not null,         # Name of item
 661     score int unsigned not null,        # Score from 0-1000
 662     strand char(1) not null,            # + or -
 663     size int unsigned not null,         # Size of element.
 664     secStr longblob not null,           # Parentheses and '.'s which define the secondary structure
 665     conf longblob not null,             # Confidence of secondary-structure annotation per position (0.0-1.0).
 666     #Indices
 667     INDEX(name(16)),
 668     INDEX(chrom(8), bin),
 669     INDEX(chrom(8), chromStart)
 670 );
 671         );
 672
 673         close $fh_out;
 674
 675         Maasha::Common::run( "hgLoadBed", "-notItemRgb -sqlTable=$sql_file $options->{ 'database' } $options->{ 'table' } -tmpDir=$tmp_dir $file > /dev/null 2>&1" );
 676
 677         unlink $sql_file;
 678     }
 679     else
 680     {
 681         Maasha::Common::run( "hgLoadBed", "$args > /dev/null 2>&1" );
 682     }
 683 }
 684
 685
 686 sub psl_upload_to_ucsc
 687 {
 688     # Martin A. Hansen, September 2007.
 689
 690     # Upload a PSL file to the UCSC database.
 691
 692     my ( $file,      # file to upload,
 693          $options,   # argument hashref
 694          $append,    # flag indicating table should be appended
 695        ) = @_;
 696
 697     # Returns nothing.
 698
 699     my ( $args );
 700
 701     if ( $append ) {
 702         $args = join " ", $options->{ "database" }, "-table=$options->{ 'table' }", "-clientLoad", "-append", $file;
 703     } else {
 704         $args = join " ", $options->{ "database" }, "-table=$options->{ 'table' }", "-clientLoad", $file;
 705     }
 706
 707     Maasha::Common::run( "hgLoadPsl", "$args > /dev/null 2>&1" );
 708 }
 709
 710
 711 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> TRACK FILE <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 712
 713
 714 sub ucsc_config_entry_get
 715 {
 716     # Martin A. Hansen, November 2008.
 717
 718     # Given a filehandle to a UCSC Genome Browser
 719     # config file (.ra) get the next entry and
 720     # return as a hash. Entries are separated by blank
 721     # lines. # lines are skipped unless it is the lines:
 722     # # Track added ...
 723     # # Database ...
 724
 725     my ( $fh,   # file hanlde
 726        ) = @_;
 727
 728     # Returns a hashref.
 729
 730     my ( $line, %record );
 731
 732     while ( $line = <$fh> )
 733     {
 734         $line =~ tr/\n\r//d;
 735
 736         if ( $line =~ /Track added by 'upload_to_ucsc' (\S+) (\S+)/) {
 737              $record{ 'date' } = $1;
 738              $record{ 'time' } = $2;
 739         } elsif ( $line =~ /^# date: (.+)/ ) {
 740             $record{ 'date' } = $1;
 741         } elsif ( $line =~ /^# time: (.+)/ ) {
 742             $record{ 'time' } = $1;
 743         } elsif ( $line =~ /^# (?:database:|Database) (.+)/ ) {
 744             $record{ 'database' } = $1;
 745         } elsif ( $line =~ /^#/ ) {
 746             # skip
 747         } elsif ( $line =~ /(\S+)\s+(.+)/ ) {
 748             $record{ $1 } = $2;
 749         }
 750
 751         last if $line =~ /^$/ and exists $record{ "track" };
 752     }
 753
 754     return undef if not exists $record{ "track" };
 755
 756     return wantarray ? %record : \%record;
 757 }
 758
 759
 760 sub ucsc_config_entry_put
 761 {
 762     # Martin A. Hansen, November 2008.
 763
 764     # Outputs a Biopiece record (a hashref)
 765     # to a filehandle or STDOUT.
 766
 767     my ( $record,    # hashref
 768          $fh_out,    # file handle
 769        ) = @_;
 770
 771     # Returns nothing.
 772
 773     my ( $date, $time );
 774
 775     $fh_out ||= \*STDOUT;
 776
 777     ( $date, $time ) = split " ", Maasha::Common::time_stamp();
 778
 779     $record->{ "date" } ||= $date;
 780     $record->{ "time" } ||= $time;
 781
 782     print $fh_out "\n# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n\n";
 783
 784     map { print $fh_out "# $_: $record->{ $_ }\n" if exists $record->{ $_ } } qw( date time database );
 785     map { print $fh_out "$_ $record->{ $_ }\n" if exists $record->{ $_ } } qw( track
 786                                                                                name
 787                                                                                description
 788                                                                                itemRgb
 789                                                                                db
 790                                                                                offset
 791                                                                                url
 792                                                                                htmlUrl
 793                                                                                shortLabel
 794                                                                                longLabel
 795                                                                                group
 796                                                                                priority
 797                                                                                useScore
 798                                                                                visibility
 799                                                                                maxHeightPixels
 800                                                                                color
 801                                                                                mafTrack
 802                                                                                type );
 803 }
 804
 805
 806 sub ucsc_update_config
 807 {
 808     # Martin A. Hansen, September 2007.
 809
 810     # Update the /home/user/ucsc/my_tracks.ra file and executes makeCustomTracks.pl
 811
 812     my ( $options,   # hashref
 813          $type,      # track type
 814        ) = @_;
 815
 816     # Returns nothing.
 817
 818     my ( $file, $entry, %new_entry, $fh_in, $fh_out, $was_found );
 819
 820     $file = $ENV{ "HOME" } . "/ucsc/my_tracks.ra";
 821
 822     Maasha::Filesys::file_copy( $file, "$file~" );   # create a backup
 823
 824     %new_entry = (
 825         database   => $options->{ 'database' },
 826         track      => $options->{ 'table' },
 827         shortLabel => $options->{ 'short_label' },
 828         longLabel  => $options->{ 'long_label' },
 829         group      => $options->{ 'group' },
 830         priority   => $options->{ 'priority' },
 831         visibility => $options->{ 'visibility' },
 832         color      => $options->{ 'color' },
 833         type       => $type,
 834     );
 835
 836     $new_entry{ 'useScore' }        = 1             if $options->{ 'use_score' };
 837     $new_entry{ 'mafTrack' }        = "multiz17way" if $type eq "type bed 6 +";
 838     $new_entry{ 'maxHeightPixels' } = "50:50:11"    if $type eq "wig 0";
 839
 840     $fh_in  = Maasha::Filesys::file_read_open( "$file~" );
 841     $fh_out = Maasha::Filesys::file_write_open( $file );
 842
 843     while ( $entry = ucsc_config_entry_get( $fh_in ) )
 844     {
 845         if ( $entry->{ 'database' } eq $new_entry{ 'database' } and $entry->{ 'track' } eq $new_entry{ 'track' } )
 846         {
 847             ucsc_config_entry_put( \%new_entry, $fh_out );
 848
 849             $was_found = 1;
 850         }
 851         else
 852         {
 853             ucsc_config_entry_put( $entry, $fh_out );
 854         }
 855     }
 856
 857     ucsc_config_entry_put( \%new_entry, $fh_out ) if not $was_found;
 858
 859     close $fh_in;
 860     close $fh_out;
 861
 862     Maasha::Common::run( "ucscMakeTracks.pl", "-b > /dev/null 2>&1" );
 863 }
 864
 865
 866 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> fixedStep format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 867
 868
 869 sub fixedstep_get_entry
 870 {
 871     # Martin A. Hansen, December 2007.
 872
 873     # Given a file handle to a PhastCons file get the
 874     # next entry which is all the lines after a "fixedStep"
 875     # line and until the next "fixedStep" line or EOF.
 876
 877     my ( $fh,   # filehandle
 878        ) = @_;
 879
 880     # Returns a list of lines
 881
 882     my ( $entry, @lines );
 883
 884     local $/ = "\nfixedStep ";
 885
 886     $entry = <$fh>;
 887
 888     chomp $entry;
 889
 890     @lines = split "\n", $entry;
 891
 892     return if @lines == 0;
 893
 894     $lines[ 0 ] =~ s/fixedStep?\s*//;
 895
 896     return wantarray ? @lines : \@lines;
 897 }
 898
 899
 900 sub fixedstep_index_create
 901 {
 902     # Martin A. Hansen, January 2008.
 903
 904     # Indexes a concatenated fixedStep file.
 905     # The index consists of a hash with chromosomes as keys,
 906     # and a list of [ chr_beg, next_chr_beg, chr_end, index_beg, index_len ] as values.
 907
 908     my ( $path,   # path to fixedStep file
 909        ) = @_;
 910
 911     # Returns a hashref
 912
 913     my ( $fh, $pos, $index_beg, $index_len, $entry, $locator, $chr, $step, $beg, $end, $len, %index, $i );
 914
 915     $fh = Maasha::Common::read_open( $path );
 916
 917     $pos = 0;
 918
 919     while ( $entry = Maasha::UCSC::fixedstep_get_entry( $fh ) )
 920     {
 921         $locator = shift @{ $entry };
 922
 923         if ( $locator =~ /chrom=([^ ]+) start=(\d+) step=(\d+)/ )
 924         {
 925             $chr  = $1;
 926             $beg  = $2 - 1;  #  fixedStep files are 1-based
 927             $step = $3;
 928         }
 929         else
 930         {
 931             Maasha::Common::error( qq(Could not parse locator: $locator) );
 932         }
 933
 934         $pos += length( $locator ) + 11;
 935
 936         $index_beg = $pos;
 937
 938 #        map { $pos += length( $_ ) + 1 } @{ $entry };
 939
 940         $pos += 6 * scalar @{ $entry };
 941
 942         $index_len = $pos - $index_beg;
 943
 944         push @{ $index{ $chr } }, [ $beg, undef, $beg + scalar @{ $entry } - 1, $index_beg, $index_len ];
 945     }
 946
 947     close $fh;
 948
 949     foreach $chr ( keys %index )
 950     {
 951         for ( $i = 0; $i < @{ $index{ $chr } } - 1; $i++ ) {
 952             $index{ $chr }->[ $i ]->[ FS_NEXT_CHR_BEG ] = $index{ $chr }->[ $i + 1 ]->[ 0 ];
 953         }
 954
 955         $index{ $chr }->[ -1 ]->[ FS_NEXT_CHR_BEG ] = $index{ $chr }->[ -1 ]->[ FS_CHR_END ] + 1;
 956     }
 957
 958     return wantarray ? %index : \%index;
 959 }
 960
 961
 962 sub fixedstep_index_store
 963 {
 964     # Martin A. Hansen, January 2008.
 965
 966     # Writes a fixedStep index to binary file.
 967
 968     my ( $path,   # full path to file
 969          $index,  # list with index
 970        ) = @_;
 971
 972     # returns nothing
 973
 974     Maasha::Common::file_store( $path, $index );
 975 }
 976
 977
 978 sub fixedstep_index_retrieve
 979 {
 980     # Martin A. Hansen, January 2008.
 981
 982     # Retrieves a fixedStep index from binary file.
 983
 984     my ( $path,   # full path to file
 985        ) = @_;
 986
 987     # returns list
 988
 989     my $index;
 990
 991     $index = Maasha::Common::file_retrieve( $path );
 992
 993     return wantarray ? %{ $index } : $index;
 994 }
 995
 996
 997 sub fixedstep_index_lookup
 998 {
 999     # Martin A. Hansen, January 2008.
1000
1001     # Retrieve fixedStep scores from a indexed
1002     # fixedStep file given a chromosome and
1003     # begin and end positions.
1004
1005     my ( $index,     # data structure
1006          $fh,        # filehandle to datafile
1007          $chr,       # chromosome
1008          $chr_beg,   # chromosome beg
1009          $chr_end,   # chromosome end
1010          $flank,     # include flanking region - OPTIONAL
1011        ) = @_;
1012
1013     # Returns a list
1014
1015     my ( $index_beg, $index_end, $i, $c, $beg, $end, @vals, $scores );
1016
1017     $flank ||= 0;
1018
1019     $chr_beg -= $flank;
1020     $chr_end += $flank;
1021
1022     # print "chr_beg->$chr_beg   chr_end->$chr_end   flank->$flank\n";
1023
1024     if ( exists $index->{ $chr } )
1025     {
1026         $index_beg = Maasha::Matrix::interval_search( $index->{ $chr }, 0, 1, $chr_beg );
1027
1028         if ( $index_beg < 0 ) {
1029             Maasha::Common::error( qq(Index search failed - begin index position doesn't exists: $chr_beg) );
1030         }
1031
1032         if ( $chr_end < $index->{ $chr }->[ $index_beg ]->[ 1 ] )
1033         {
1034             $index_end = $index_beg;
1035         }
1036         else
1037         {
1038             $index_end = Maasha::Matrix::interval_search( $index->{ $chr }, 0, 1, $chr_end );
1039
1040             if ( $index_end < 0 ) {
1041                 Maasha::Common::error( qq(Index search failed - end index position doesn't exists: $chr_end) );
1042             }
1043         }
1044
1045         map { $scores->[ $_ ] = 0 } 0 .. $chr_end - $chr_beg;
1046
1047         if ( $index_beg == $index_end )
1048         {
1049             $beg = Maasha::Calc::max( $chr_beg, $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] );
1050             $end = Maasha::Calc::min( $chr_end, $index->{ $chr }->[ $index_end ]->[ FS_CHR_END ] );
1051
1052             if ( $beg <= $index->{ $chr }->[ $index_beg ]->[ FS_CHR_END ] and $end >= $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] )
1053             {
1054                 @vals = split "\n", Maasha::Common::file_read(
1055                     $fh,
1056                     $index->{ $chr }->[ $index_beg ]->[ FS_INDEX_BEG ] + 6 * ( $beg - $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] ),
1057                     6 * ( $end - $beg + 1 ),
1058                 );
1059             }
1060
1061             for ( $c = 0; $c < @vals; $c++ ) {
1062                 $scores->[ $c + $beg - $chr_beg ] = $vals[ $c ];
1063             }
1064         }
1065         else
1066         {
1067             $beg = Maasha::Calc::max( $chr_beg, $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] );
1068
1069 #            print Dumper( $beg, $index->{ $chr }->[ $index_beg ] );
1070 #            print Dumper( "next", $index->{ $chr }->[ $index_beg ]->[ FS_NEXT_CHR_BEG ] );
1071
1072             #      beg         next
1073             #      v           v
1074             #  |||||||||.......
1075
1076             if ( $beg <= $index->{ $chr }->[ $index_beg ]->[ FS_CHR_END ] )
1077             {
1078                 @vals = split "\n", Maasha::Common::file_read(
1079                     $fh,
1080                     $index->{ $chr }->[ $index_beg ]->[ FS_INDEX_BEG ] + 6 * ( $beg - $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] ),
1081                     6 * ( $index->{ $chr }->[ $index_beg ]->[ FS_CHR_END ] - $beg + 1 ),
1082                 );
1083
1084                 for ( $c = 0; $c < @vals; $c++ ) {
1085                     $scores->[ $c + $beg - $chr_beg ] = $vals[ $c ];
1086                 }
1087             }
1088
1089             $end = Maasha::Calc::min( $chr_end, $index->{ $chr }->[ $index_end ]->[ FS_CHR_END ] );
1090
1091             if ( $end <= $index->{ $chr }->[ $index_end ]->[ FS_CHR_END ] )
1092             {
1093                 @vals = split "\n", Maasha::Common::file_read(
1094                     $fh,
1095                     $index->{ $chr }->[ $index_end ]->[ FS_INDEX_BEG ],
1096                     6 * ( $end - $index->{ $chr }->[ $index_end ]->[ FS_CHR_BEG ] + 1 ),
1097                 );
1098
1099                 for ( $c = 0; $c < @vals; $c++ ) {
1100                     $scores->[ $c + $index->{ $chr }->[ $index_end ]->[ FS_CHR_BEG ] - $chr_beg ] = $vals[ $c ];
1101                 }
1102             }
1103
1104             for ( $i = $index_beg + 1; $i <= $index_end - 1; $i++ )
1105             {
1106                 @vals = split "\n", Maasha::Common::file_read(
1107                     $fh,
1108                     $index->{ $chr }->[ $i ]->[ FS_INDEX_BEG ],
1109                     6 * ( $index->{ $chr }->[ $i ]->[ FS_CHR_END ] - $index->{ $chr }->[ $i ]->[ FS_CHR_BEG ] + 1 ),
1110                 );
1111
1112                 for ( $c = 0; $c < @vals; $c++ ) {
1113                     $scores->[ $c + $index->{ $chr }->[ $i ]->[ FS_CHR_BEG ] - $chr_beg ] = $vals[ $c ];
1114                 }
1115             }
1116         }
1117     }
1118     else
1119     {
1120         Maasha::Common::error( qq(Chromosome "$chr" was not found in index) );
1121     }
1122
1123     return wantarray ? @{ $scores } : $scores;
1124 }
1125
1126
1127 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PhastCons format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1128
1129
1130 sub phastcons_index
1131 {
1132     # Martin A. Hansen, July 2008
1133
1134     # Create a fixedStep index for PhastCons data.
1135
1136     my ( $file,   # file to index
1137          $dir,    # dir with file
1138        ) = @_;
1139
1140     # Returns nothing.
1141
1142     my ( $index );
1143
1144     $index = fixedstep_index_create( "$dir/$file" );
1145
1146     fixedstep_index_store( "$dir/$file.index", $index );
1147 }
1148
1149
1150 sub phastcons_parse_entry
1151 {
1152     # Martin A. Hansen, December 2007.
1153
1154     # Given a PhastCons entry converts this to a
1155     # list of super blocks.
1156
1157 #    $options->{ "min" }       ||= 10;
1158 #    $options->{ "dist" }      ||= 25;
1159 #    $options->{ "threshold" } ||= 0.8;
1160 #    $options->{ "gap" }       ||= 5;
1161
1162     my ( $lines,   # list of lines
1163          $args,    # argument hash
1164        ) = @_;
1165
1166     # Returns
1167
1168     my ( $info, $chr, $beg, $step, $i, $c, $j, @blocks, @super_blocks, @entries, $super_block, $block, @lens, @begs );
1169
1170     $info = shift @{ $lines };
1171
1172     if ( $info =~ /^chrom=([^ ]+) start=(\d+) step=(\d+)$/ )
1173     {
1174         $chr  = $1;
1175         $beg  = $2;
1176         $step = $3;
1177
1178         die qq(ERROR: step size $step != 1 -> problem!\n) if $step != 1; # in an ideal world should would be fixed ...
1179     }
1180
1181     $i = 0;
1182
1183     while ( $i < @{ $lines } )
1184     {
1185         if ( $lines->[ $i ] >= $args->{ "threshold" } )
1186         {
1187             $c = $i + 1;
1188
1189             while ( $c < @{ $lines } )
1190             {
1191                 if ( $lines->[ $c ] < $args->{ "threshold" } )
1192                 {
1193                     $j = $c + 1;
1194
1195                     while ( $j < @{ $lines } and $lines->[ $j ] < $args->{ "threshold" } ) {
1196                         $j++;
1197                     }
1198
1199                     if ( $j - $c > $args->{ "gap" } )
1200                     {
1201                         if ( $c - $i >= $args->{ "min" } )
1202                         {
1203                             push @blocks, {
1204                                 CHR     => $chr,
1205                                 CHR_BEG => $beg + $i - 1,
1206                                 CHR_END => $beg + $c - 2,
1207                                 CHR_LEN => $c - $i,
1208                             };
1209                         }
1210
1211                         $i = $j;
1212
1213                         last;
1214                     }
1215
1216                     $c = $j
1217                 }
1218                 else
1219                 {
1220                     $c++;
1221                 }
1222             }
1223
1224             if ( $c - $i >= $args->{ "min" } )
1225             {
1226                 push @blocks, {
1227                     CHR     => $chr,
1228                     CHR_BEG => $beg + $i - 1,
1229                     CHR_END => $beg + $c - 2,
1230                     CHR_LEN => $c - $i,
1231                 };
1232             }
1233
1234             $i = $c;
1235         }
1236         else
1237         {
1238             $i++;
1239         }
1240     }
1241
1242     $i = 0;
1243
1244     while ( $i < @blocks )
1245     {
1246         $c = $i + 1;
1247
1248         while ( $c < @blocks and $blocks[ $c ]->{ "CHR_BEG" } - $blocks[ $c - 1 ]->{ "CHR_END" } <= $args->{ "dist" } )
1249         {
1250             $c++;
1251         }
1252
1253         push @super_blocks, [ @blocks[ $i .. $c - 1 ] ];
1254
1255         $i = $c;
1256     }
1257
1258     foreach $super_block ( @super_blocks )
1259     {
1260         foreach $block ( @{ $super_block } )
1261         {
1262             push @begs, $block->{ "CHR_BEG" } - $super_block->[ 0 ]->{ "CHR_BEG" };
1263             push @lens, $block->{ "CHR_LEN" } - 1;
1264         }
1265
1266         $lens[ -1 ]++;
1267
1268         push @entries, {
1269             CHR         => $super_block->[ 0 ]->{ "CHR" },
1270             CHR_BEG     => $super_block->[ 0 ]->{ "CHR_BEG" },
1271             CHR_END     => $super_block->[ -1 ]->{ "CHR_END" },
1272             Q_ID        => "Q_ID",
1273             SCORE       => 100,
1274             STRAND      => "+",
1275             THICK_BEG   => $super_block->[ 0 ]->{ "CHR_BEG" },
1276             THICK_END   => $super_block->[ -1 ]->{ "CHR_END" } + 1,
1277             COLOR       => 0,
1278             BLOCK_COUNT => scalar @{ $super_block },
1279             BLOCK_LENS  => join( ",", @lens ),
1280             Q_BEGS      => join( ",", @begs ),
1281         };
1282
1283         undef @begs;
1284         undef @lens;
1285     }
1286
1287     return wantarray ? @entries : \@entries;
1288 }
1289
1290
1291 sub phastcons_normalize
1292 {
1293     # Martin A. Hansen, January 2008.
1294
1295     # Normalizes a list of lists with PhastCons scores,
1296     # in such a way that each list contains the same number
1297     # or PhastCons scores.
1298
1299     my ( $AoA,    # AoA with PhastCons scores
1300        ) = @_;
1301
1302     # Returns AoA.
1303
1304     my ( $list, $max, $min, $mean, $diff );
1305
1306     $min = 99999999;
1307     $max = 0;
1308
1309     foreach $list ( @{ $AoA } )
1310     {
1311         $min = scalar @{ $list } if scalar @{ $list } < $min;
1312         $max = scalar @{ $list } if scalar @{ $list } > $max;
1313     }
1314
1315     $mean = int( ( $min + $max ) / 2 );
1316
1317 #    print STDERR "min->$min   max->$max   mean->$mean\n";
1318
1319     foreach $list ( @{ $AoA } )
1320     {
1321         $diff = scalar @{ $list } - $mean;
1322
1323         phastcons_list_inflate( $list, abs( $diff ) ) if $diff < 0;
1324         phastcons_list_deflate( $list, $diff )        if $diff > 0;
1325     }
1326
1327     return wantarray ? @{ $AoA } : $AoA;
1328 }
1329
1330
1331 sub phastcons_list_inflate
1332 {
1333     # Martin A. Hansen, January 2008.
1334
1335     # Inflates a list with a given number of elements
1336     # in such a way that the extra elements are introduced
1337     # evenly over the entire length of the list. The value
1338     # of the extra elements is based on a mean of the
1339     # adjacent elements.
1340
1341     my ( $list,   # list of elements
1342          $diff,   # number of elements to introduce
1343        ) = @_;
1344
1345     # Returns nothing
1346
1347     my ( $len, $space, $i, $pos );
1348
1349     $len = scalar @{ $list };
1350
1351     $space = $len / $diff;
1352
1353     for ( $i = 0; $i < $diff; $i++ )
1354     {
1355         $pos = int( ( $space / 2 ) + $i * $space );
1356
1357         splice @{ $list }, $pos, 0, ( $list->[ $pos - 1 ] + $list->[ $pos + 1 ] ) / 2;
1358         # splice @{ $list }, $pos, 0, "X";
1359     }
1360
1361     die qq(ERROR: bad inflate\n) if scalar @{ $list } != $len + $diff;
1362 }
1363
1364
1365 sub phastcons_list_deflate
1366 {
1367     # Martin A. Hansen, January 2008.
1368
1369     # Deflates a list by removing a given number of elements
1370     # evenly distributed over the entire list.
1371
1372     my ( $list,   # list of elements
1373          $diff,   # number of elements to remove
1374        ) = @_;
1375
1376     # Returns nothing
1377
1378     my ( $len, $space, $i, $pos );
1379
1380     $len = scalar @{ $list };
1381
1382     $space = ( $len - $diff ) / $diff;
1383
1384     for ( $i = 0; $i < $diff; $i++ )
1385     {
1386         $pos = int( ( $space / 2 ) + $i * $space );
1387
1388         splice @{ $list }, $pos, 1;
1389     }
1390
1391     die qq(ERROR: bad deflate\n) if scalar @{ $list } != $len - $diff;
1392 }
1393
1394
1395 sub phastcons_mean
1396 {
1397     # Martin A. Hansen, January 2008.
1398
1399     # Given a normalized PhastCons matrix in an AoA,
1400     # calculate the mean for each column and return as a list.
1401
1402     my ( $AoA,    # AoA with normalized PhastCons scores
1403        ) = @_;
1404
1405     # Returns a list
1406
1407     my ( @list );
1408
1409     $AoA = Maasha::Matrix::matrix_flip( $AoA );
1410
1411     map { push @list, Maasha::Calc::mean( $_ ) } @{ $AoA };
1412
1413     return wantarray ? @list : \@list;
1414 }
1415
1416
1417 sub phastcons_median
1418 {
1419     # Martin A. Hansen, January 2008.
1420
1421     # Given a normalized PhastCons matrix in an AoA,
1422     # calculate the median for each column and return as a list.
1423
1424     my ( $AoA,    # AoA with normalized PhastCons scores
1425        ) = @_;
1426
1427     # Returns a list
1428
1429     my ( @list );
1430
1431     $AoA = Maasha::Matrix::matrix_flip( $AoA );
1432
1433     map { push @list, Maasha::Calc::median( $_ ) } @{ $AoA };
1434
1435     return wantarray ? @list : \@list;
1436 }
1437
1438
1439 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> MULTIPLE ALIGNMENT FILES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1440
1441
1442 sub maf_extract
1443 {
1444     # Martin A. Hansen, April 2008.
1445
1446     # Executes mafFrag to extract a subalignment from a multiz track
1447     # in the UCSC genome browser database.
1448
1449     my ( $tmp_dir,    # temporary directory
1450          $database,   # genome database
1451          $table,      # table with the multiz track
1452          $chr,        # chromosome
1453          $beg,        # begin position
1454          $end,        # end position
1455          $strand,     # strand
1456        ) = @_;
1457
1458     # Returns a list of record
1459
1460     my ( $tmp_file, $align );
1461
1462     $tmp_file = "$tmp_dir/maf_extract.maf";
1463
1464     Maasha::Common::run( "mafFrag", "$database $table $chr $beg $end $strand $tmp_file" );
1465
1466     $align = maf_parse( $tmp_file );
1467
1468     unlink $tmp_file;
1469
1470     return wantarray ? @{ $align } : $align;
1471 }
1472
1473
1474 sub maf_parse
1475 {
1476     # Martin A. Hansen, April 2008.
1477
1478
1479     my ( $path,   # full path to MAF file
1480        ) = @_;
1481
1482     # Returns a list of record.
1483
1484     my ( $fh, $line, @fields, @align );
1485
1486     $fh = Maasha::Common::read_open( $path );
1487
1488     while ( $line = <$fh> )
1489     {
1490         chomp $line;
1491
1492         if ( $line =~ /^s/ )
1493         {
1494             @fields = split / /, $line;
1495
1496             push @align, {
1497                 SEQ_NAME  => $fields[ 1 ],
1498                 SEQ       => $fields[ -1 ],
1499                 ALIGN     => 1,
1500                 ALIGN_LEN => length $fields[ -1 ],
1501             }
1502         }
1503     }
1504
1505     close $fh;
1506
1507     return wantarray ? @align : \@align;
1508 }
1509
1510
1511 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> WIGGLE FORMAT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1512
1513
1514 sub fixedstep_put_entry
1515 {
1516     # Martin A. Hansen, April 2008.
1517
1518     # Outputs a block of fixedStep values.
1519     # Used for outputting wiggle data.
1520
1521     my ( $chr,      # chromosome
1522          $beg,      # start position
1523          $block,    # list of scores
1524          $fh,       # filehandle - OPTIONAL
1525          $log10,    # flag indicating that log10 scores should be used
1526        ) = @_;
1527
1528     # Returns nothing.
1529
1530     $beg += 1;   # fixedStep format is 1 based.
1531
1532     $fh ||= \*STDOUT;
1533
1534     print $fh "fixedStep chrom=$chr start=$beg step=1\n";
1535
1536     if ( $log10 ) {
1537         map { printf( $fh "%f\n", Maasha::Calc::log10( $_ + 1 ) ) } @{ $block };
1538     } else {
1539         map { printf( $fh "%d\n", ( $_ + 1 ) ) } @{ $block };
1540     }
1541 }
1542
1543
1544 sub wiggle_upload_to_ucsc
1545 {
1546     # Martin A. Hansen, May 2008.
1547
1548     # Upload a wiggle file to the UCSC database.
1549
1550     my ( $tmp_dir,    # temporary directory
1551          $wib_dir,    # wib directory
1552          $wig_file,   # file to upload,
1553          $options,    # argument hashref
1554        ) = @_;
1555
1556     # Returns nothing.
1557
1558     my ( $args );
1559
1560 #    $args = join " ", "-tmpDir=$tmp_dir", "-pathPrefix=$wib_dir", $options->{ "database" }, $options->{ 'table' }, $wig_file;
1561
1562 #    Maasha::Common::run( "hgLoadWiggle", "$args > /dev/null 2>&1" );
1563
1564     if ( $options->{ 'verbose' } ) {
1565         `cd $tmp_dir && hgLoadWiggle -tmpDir=$tmp_dir -pathPrefix=$wib_dir $options->{ 'database' } $options->{ 'table' } $wig_file`;
1566     } else {
1567         `cd $tmp_dir && hgLoadWiggle -tmpDir=$tmp_dir -pathPrefix=$wib_dir $options->{ 'database' } $options->{ 'table' } $wig_file > /dev/null 2>&1`;
1568     }
1569 }
1570
1571
1572 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> MySQL CONF <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1573
1574
1575 sub ucsc_get_user
1576 {
1577     # Martin A. Hansen, May 2008
1578
1579     # Fetches the MySQL database user name from the
1580     # .hg.conf file in the users home directory.
1581
1582     # Returns a string.
1583
1584     my ( $file, $fh, $line, $user );
1585
1586     $file = "$ENV{ 'HOME' }/.hg.conf";
1587
1588     return if not -f $file;
1589
1590     $fh = Maasha::Common::read_open( $file );
1591
1592     while ( $line = <$fh> )
1593     {
1594         chomp $line;
1595
1596         if ( $line =~ /^db\.user=(.+)/ )
1597         {
1598             $user = $1;
1599
1600             last;
1601         }
1602     }
1603
1604     close $fh;
1605
1606     return $user;
1607 }
1608
1609
1610 sub ucsc_get_password
1611 {
1612     # Martin A. Hansen, May 2008
1613
1614     # Fetches the MySQL database password from the
1615     # .hg.conf file in the users home directory.
1616
1617     # Returns a string.
1618
1619     my ( $file, $fh, $line, $password );
1620
1621     $file = "$ENV{ 'HOME' }/.hg.conf";
1622
1623     return if not -f $file;
1624
1625     $fh = Maasha::Common::read_open( $file );
1626
1627     while ( $line = <$fh> )
1628     {
1629         chomp $line;
1630
1631         if ( $line =~ /^db\.password=(.+)/ )
1632         {
1633             $password = $1;
1634
1635             last;
1636         }
1637     }
1638
1639     close $fh;
1640
1641     return $password;
1642 }
1643
1644
1645 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1646
1647
1648 __END__
1649