code_perl/Maasha/UCSC.pm

   1 package Maasha::UCSC;
   2
   3 # Copyright (C) 2007 Martin A. Hansen.
   4
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; either version 2
   8 # of the License, or (at your option) any later version.
   9
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18
  19 # http://www.gnu.org/copyleft/gpl.html
  20
  21
  22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  23
  24
  25 # Stuff for interacting with UCSC genome browser
  26
  27
  28 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  29
  30
  31 use warnings;
  32 use strict;
  33 use vars qw ( @ISA @EXPORT );
  34
  35 use Data::Dumper;
  36 use Maasha::Common;
  37 use Maasha::Filesys;
  38 use Maasha::Calc;
  39 use Maasha::Matrix;
  40
  41 use constant {
  42     FS_CHR_BEG      => 0,
  43     FS_NEXT_CHR_BEG => 1,
  44     FS_CHR_END      => 2,
  45     FS_INDEX_BEG    => 3,
  46     FS_INDEX_LEN    => 4,
  47
  48     CHR          => 0,
  49     CHR_BEG      => 1,
  50     CHR_END      => 2,
  51     Q_ID         => 3,
  52     SCORE        => 4,
  53     STRAND       => 5,
  54     THICK_BEG    => 6,
  55     THICK_END    => 7,
  56     COLOR        => 8,
  57     BLOCK_COUNT  => 9,
  58     BLOCK_LENS   => 10,
  59     Q_BEGS       => 11,
  60 };
  61
  62 @ISA = qw( Exporter );
  63
  64
  65 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> BED format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  66
  67
  68 # http://genome.ucsc.edu/goldenPath/help/hgTracksHelp.html#BED
  69
  70
  71 sub bed_entry_get_array
  72 {
  73     # Martin A. Hansen, September 2008.
  74
  75     # Reads a BED entry given a filehandle.
  76
  77     # This is a new _faster_ BED entry parser that
  78     # uses arrays and not hashrefs.
  79
  80     # IMPORTANT! This function does not correct the
  81     # BED_END position that is kept the way UCSC
  82     # does.
  83
  84     my ( $fh,     # file handle
  85          $cols,   # columns to read - OPTIONAL (3,4,5,6, or 12)
  86        ) = @_;
  87
  88     # Returns a list.
  89
  90     my ( $line, @entry );
  91
  92     $line = <$fh>;
  93
  94     $line =~ tr/\n\r//d;    # some people have carriage returns in their BED files -> Grrrr
  95
  96     return if not defined $line;
  97
  98     if ( not defined $cols ) {
  99         $cols = 1 + $line =~ tr/\t//;
 100     }
 101
 102     @entry = split "\t", $line, $cols + 1;
 103
 104     pop @entry if scalar @entry > $cols;
 105
 106     return wantarray ? @entry : \@entry;
 107 }
 108
 109
 110 sub bed_get_entry
 111 {
 112     # Martin A. Hansen, December 2007.
 113
 114     # Reads a bed entry given a filehandle.
 115
 116     my ( $fh,        # file handle
 117          $columns,   # number of BED columns to read  -  OPTIONAL
 118        ) = @_;
 119
 120     # Returns hashref.
 121
 122     my ( $line, @fields, %entry );
 123
 124     $line = <$fh>;
 125
 126     $line =~ tr/\n\r//d;    # some people have carriage returns in their BED files -> Grrrr
 127
 128     return if not defined $line;
 129
 130     @fields = split "\t", $line;
 131
 132     $columns ||= scalar @fields;
 133
 134     if ( $columns == 3 )
 135     {
 136         %entry = (
 137             "CHR"      => $fields[ 0 ],
 138             "CHR_BEG"  => $fields[ 1 ],
 139             "CHR_END"  => $fields[ 2 ] - 1,
 140         );
 141     }
 142     elsif ( $columns == 4 )
 143     {
 144         %entry = (
 145             "CHR"      => $fields[ 0 ],
 146             "CHR_BEG"  => $fields[ 1 ],
 147             "CHR_END"  => $fields[ 2 ] - 1,
 148             "Q_ID"     => $fields[ 3 ],
 149         );
 150     }
 151     elsif ( $columns == 5 )
 152     {
 153         %entry = (
 154             "CHR"      => $fields[ 0 ],
 155             "CHR_BEG"  => $fields[ 1 ],
 156             "CHR_END"  => $fields[ 2 ] - 1,
 157             "Q_ID"     => $fields[ 3 ],
 158             "SCORE"    => $fields[ 4 ],
 159         );
 160     }
 161     elsif ( $columns == 6 )
 162     {
 163         %entry = (
 164             "CHR"      => $fields[ 0 ],
 165             "CHR_BEG"  => $fields[ 1 ],
 166             "CHR_END"  => $fields[ 2 ] - 1,
 167             "Q_ID"     => $fields[ 3 ],
 168             "SCORE"    => $fields[ 4 ],
 169             "STRAND"   => $fields[ 5 ],
 170         );
 171     }
 172     elsif ( $columns == 12 )
 173     {
 174         %entry = (
 175             "CHR"         => $fields[ 0 ],
 176             "CHR_BEG"     => $fields[ 1 ],
 177             "CHR_END"     => $fields[ 2 ] - 1,
 178             "Q_ID"        => $fields[ 3 ],
 179             "SCORE"       => $fields[ 4 ],
 180             "STRAND"      => $fields[ 5 ],
 181             "THICK_BEG"   => $fields[ 6 ],
 182             "THICK_END"   => $fields[ 7 ] - 1,
 183             "COLOR"       => $fields[ 8 ],
 184             "BLOCK_COUNT" => $fields[ 9 ],
 185             "BLOCK_LENS"  => $fields[ 10 ],
 186             "Q_BEGS"      => $fields[ 11 ],
 187         );
 188     }
 189     else
 190     {
 191         Maasha::Common::error( qq(Bad BED format in line->$line<-) );
 192     }
 193
 194     $entry{ "REC_TYPE" } = "BED";
 195     $entry{ "BED_LEN" }  = $entry{ "CHR_END" } - $entry{ "CHR_BEG" } + 1;
 196     $entry{ "BED_COLS" } = $columns;
 197
 198     return wantarray ? %entry : \%entry;
 199 }
 200
 201
 202 sub bed_get_entries
 203 {
 204     # Martin A. Hansen, January 2008.
 205
 206     # Given a path to a BED file, read in all entries
 207     # and return.
 208
 209     my ( $path,     # full path to BED file
 210          $columns,  # number of columns in BED file - OPTIONAL (but is faster)
 211        ) = @_;
 212
 213     # Returns a list.
 214
 215     my ( $fh, $entry, @list );
 216
 217     $fh = Maasha::Common::read_open( $path );
 218
 219     while ( $entry = bed_get_entry( $fh ) ) {
 220         push @list, $entry;
 221     }
 222
 223     close $fh;
 224
 225     return wantarray ? @list : \@list;
 226 }
 227
 228
 229 sub bed_entry_put_array
 230 {
 231     # Martin A. Hansen, Septermber 2008.
 232
 233     # Writes a BED entry array to file.
 234
 235     # IMPORTANT! This function does not correct the
 236     # BED_END position that is assumed to be in the
 237     # UCSC positions scheme.
 238
 239     my ( $record,   # list
 240          $fh,       # file handle                   - OPTIONAL
 241          $cols,     # number of columns in BED file - OPTIONAL
 242        ) = @_;
 243
 244     # Returns nothing.
 245
 246     $fh = \*STDOUT if not defined $fh;
 247
 248     if ( defined $cols ) {
 249         print $fh join( "\t", @{ $record }[ 0 .. $cols - 1 ] ), "\n";
 250     } else {
 251         print $fh join( "\t", @{ $record } ), "\n";
 252     }
 253 }
 254
 255
 256 sub bed_put_entry
 257 {
 258     # Martin A. Hansen, Septermber 2007.
 259
 260     # Writes a BED entry to file.
 261
 262     # NB, this could really be more robust!?
 263
 264     my ( $record,       # hashref
 265          $fh,           # file handle                   - OPTIONAL
 266          $columns,      # number of columns in BED file - OPTIONAL (but is faster)
 267        ) = @_;
 268
 269     # Returns nothing.
 270
 271     my ( @fields );
 272
 273     $columns ||= 12;   # max number of columns possible
 274
 275     if ( $columns == 3 )
 276     {
 277         push @fields, $record->{ "CHR" };
 278         push @fields, $record->{ "CHR_BEG" };
 279         push @fields, $record->{ "CHR_END" } + 1;
 280     }
 281     elsif ( $columns == 4 )
 282     {
 283         $record->{ "Q_ID" }  =~ s/\s+/_/g;
 284
 285         push @fields, $record->{ "CHR" };
 286         push @fields, $record->{ "CHR_BEG" };
 287         push @fields, $record->{ "CHR_END" } + 1;
 288         push @fields, $record->{ "Q_ID" };
 289     }
 290     elsif ( $columns == 5 )
 291     {
 292         $record->{ "Q_ID" }  =~ s/\s+/_/g;
 293         $record->{ "SCORE" } =~ s/\.\d*//;
 294
 295         push @fields, $record->{ "CHR" };
 296         push @fields, $record->{ "CHR_BEG" };
 297         push @fields, $record->{ "CHR_END" } + 1;
 298         push @fields, $record->{ "Q_ID" };
 299         push @fields, $record->{ "SCORE" };
 300     }
 301     elsif ( $columns == 6 )
 302     {
 303         $record->{ "Q_ID" }  =~ s/\s+/_/g;
 304         $record->{ "SCORE" } =~ s/\.\d*//;
 305
 306         push @fields, $record->{ "CHR" };
 307         push @fields, $record->{ "CHR_BEG" };
 308         push @fields, $record->{ "CHR_END" } + 1;
 309         push @fields, $record->{ "Q_ID" };
 310         push @fields, $record->{ "SCORE" };
 311         push @fields, $record->{ "STRAND" };
 312     }
 313     else
 314     {
 315         $record->{ "Q_ID" }  =~ s/\s+/_/g;
 316         $record->{ "SCORE" } =~ s/\.\d*//;
 317
 318         push @fields, $record->{ "CHR" };
 319         push @fields, $record->{ "CHR_BEG" };
 320         push @fields, $record->{ "CHR_END" } + 1;
 321         push @fields, $record->{ "Q_ID" };
 322         push @fields, $record->{ "SCORE" };
 323         push @fields, $record->{ "STRAND" };
 324         push @fields, $record->{ "THICK_BEG" }     if defined $record->{ "THICK_BEG" };
 325         push @fields, $record->{ "THICK_END" } + 1 if defined $record->{ "THICK_END" };
 326         push @fields, $record->{ "COLOR" }         if defined $record->{ "COLOR" };
 327         push @fields, $record->{ "BLOCK_COUNT" }   if defined $record->{ "BLOCK_COUNT" };
 328         push @fields, $record->{ "BLOCK_LENS" }    if defined $record->{ "BLOCK_LENS" };
 329         push @fields, $record->{ "Q_BEGS" }        if defined $record->{ "Q_BEGS" };
 330     }
 331
 332     if ( $fh ) {
 333         print $fh join( "\t", @fields ), "\n";
 334     } else {
 335         print join( "\t", @fields ), "\n";
 336     }
 337 }
 338
 339
 340 sub bed_put_entries
 341 {
 342     # Martin A. Hansen, January 2008.
 343
 344     # Write a list of BED entries.
 345
 346     my ( $entries,   # list of entries,
 347          $fh,        # file handle - OPTIONAL
 348        ) = @_;
 349
 350     # Returns nothing.
 351
 352     map { bed_put_entry( $_, $fh ) } @{ $entries };
 353 }
 354
 355
 356 sub bed_analyze
 357 {
 358     # Martin A. Hansen, March 2008.
 359
 360     # Given a bed record, analysis this to give information
 361     # about intron/exon sizes.
 362
 363     my ( $entry,   # BED entry
 364        ) = @_;
 365
 366     # Returns hashref.
 367
 368     my ( $i, @begs, @lens, $exon_max, $exon_min, $exon_len, $exon_tot, $intron_max, $intron_min, $intron_len, $intron_tot );
 369
 370     $exon_max   = 0;
 371     $exon_min   = 9999999999;
 372     $intron_max = 0;
 373     $intron_min = 9999999999;
 374
 375     $entry->{ "EXONS" }   = $entry->{ "BLOCK_COUNT" };
 376
 377     @begs = split /,/, $entry->{ "Q_BEGS" };
 378     @lens = split /,/, $entry->{ "BLOCK_LENS" };
 379
 380     for ( $i = 0; $i < $entry->{ "BLOCK_COUNT" }; $i++ )
 381     {
 382         $exon_len = $lens[ $i ];
 383
 384         $entry->{ "EXON_LEN_$i" } = $exon_len;
 385
 386         $exon_max = $exon_len if $exon_len > $exon_max;
 387         $exon_min = $exon_len if $exon_len < $exon_min;
 388
 389         $exon_tot += $exon_len;
 390     }
 391
 392     $entry->{ "EXON_LEN_-1" }   = $exon_len;
 393     $entry->{ "EXON_MAX_LEN" }  = $exon_max;
 394     $entry->{ "EXON_MIN_LEN" }  = $exon_min;
 395     $entry->{ "EXON_MEAN_LEN" } = int( $exon_tot / $entry->{ "EXONS" } );
 396
 397     $entry->{ "INTRONS" } = $entry->{ "BLOCK_COUNT" } - 1;
 398     $entry->{ "INTRONS" } = 0 if $entry->{ "INTRONS" } < 0;
 399
 400     if ( $entry->{ "INTRONS" } )
 401     {
 402         for ( $i = 1; $i < $entry->{ "BLOCK_COUNT" }; $i++ )
 403         {
 404             $intron_len = $begs[ $i ] - ( $begs[ $i - 1 ] + $lens[ $i - 1 ] );
 405
 406             $entry->{ "INTRON_LEN_" . ( $i - 1 ) } = $intron_len;
 407
 408             $intron_max = $intron_len if $intron_len > $intron_max;
 409             $intron_min = $intron_len if $intron_len < $intron_min;
 410
 411             $intron_tot += $intron_len;
 412         }
 413
 414         $entry->{ "INTRON_LEN_-1" }   = $intron_len;
 415         $entry->{ "INTRON_MAX_LEN" }  = $intron_max;
 416         $entry->{ "INTRON_MIN_LEN" }  = $intron_min;
 417         $entry->{ "INTRON_MEAN_LEN" } = int( $intron_tot / $entry->{ "INTRONS" } );
 418     }
 419
 420     return wantarray ? %{ $entry } : $entry;
 421 }
 422
 423
 424 sub bed_sort
 425 {
 426     # Martin A. Hansen, Septermber 2008
 427
 428     # Sorts a BED file using the c program
 429     # "bed_sort" specifing a sort mode:
 430
 431     # 1: chr AND chr_beg.
 432     # 2: chr AND strand AND chr_beg.
 433     # 3: chr_beg.
 434     # 4: strand AND chr_beg.
 435
 436     my ( $bed_file,    # BED file to sort
 437          $sort_mode,   # See above.
 438          $cols,        # Number of columns in BED file
 439        ) = @_;
 440
 441     &Maasha::Common::run( "bed_sort", "--sort $sort_mode --cols $cols $bed_file" );
 442 }
 443
 444
 445 sub bed_split_to_files
 446 {
 447     # Martin A. Hansen, Septermber 2008
 448
 449     # Given a list of BED files, split these
 450     # into temporary files based on the chromosome
 451     # name. Returns a list of the temporary files.
 452
 453     my ( $bed_files,   # list of BED files to split
 454          $cols,        # number of columns
 455          $tmp_dir,     # temporary directory
 456        ) = @_;
 457
 458     # Returns a list.
 459
 460     my ( $bed_file, $fh_in, $entry, $key, %fh_hash, @tmp_files );
 461
 462     foreach $bed_file ( @{ $bed_files } )
 463     {
 464         $fh_in = Maasha::Common::read_open( $bed_file );
 465
 466         while ( $entry = bed_entry_get_array( $fh_in, $cols ) )
 467         {
 468             $key = $entry->[ CHR ];
 469
 470             $fh_hash{ $key } = Maasha::Common::write_open( "$tmp_dir/$key.temp" ) if not exists $fh_hash{ $key };
 471
 472             bed_entry_put_array( $entry, $fh_hash{ $key } );
 473         }
 474
 475         close $fh_in;
 476     }
 477
 478     foreach $key ( sort keys %fh_hash )
 479     {
 480         push @tmp_files, "$tmp_dir/$key.temp";
 481
 482         close $fh_hash{ $key };
 483     }
 484
 485     return wantarray ? @tmp_files : \@tmp_files;
 486 }
 487
 488
 489 sub bed_merge_entries
 490 {
 491     # Martin A. Hansen, February 2008.
 492
 493     # Merge a list of given BED entries in one big entry.
 494
 495     my ( $entries,     # list of BED entries to be merged
 496        ) = @_;
 497
 498     # Returns hash.
 499
 500     my ( $i, @q_ids, @q_begs, @blocksizes, @new_q_begs, @new_blocksizes, %new_entry );
 501
 502     @{ $entries } = sort { $a->{ "CHR_BEG" } <=> $b->{ "CHR_BEG" } } @{ $entries };
 503
 504     for ( $i = 0; $i < @{ $entries }; $i++ )
 505     {
 506         Maasha::Common::error( qq(Attempted merge of BED entries from different chromosomes) ) if $entries->[ 0 ]->{ "CHR" }    ne $entries->[ $i ]->{ "CHR" };
 507         Maasha::Common::error( qq(Attempted merge of BED entries from different strands) )     if $entries->[ 0 ]->{ "STRAND" } ne $entries->[ $i ]->{ "STRAND" };
 508
 509         push @q_ids, $entries->[ $i ]->{ "Q_ID" } || sprintf( "ID%06d", $i );
 510
 511         if ( exists $entries->[ $i ]->{ "Q_BEGS" } )
 512         {
 513             @q_begs     = split ",", $entries->[ $i ]->{ "Q_BEGS" };
 514             @blocksizes = split ",", $entries->[ $i ]->{ "BLOCK_LENS" };
 515         }
 516         else
 517         {
 518             @q_begs     = 0;
 519             @blocksizes = $entries->[ $i ]->{ "CHR_END" } - $entries->[ $i ]->{ "CHR_BEG" } + 1;
 520         }
 521
 522         map { $_ += $entries->[ $i ]->{ "CHR_BEG" } } @q_begs;
 523
 524         push @new_q_begs, @q_begs;
 525         push @new_blocksizes, @blocksizes;
 526     }
 527
 528     map { $_ -= $entries->[ 0 ]->{ "CHR_BEG" } } @new_q_begs;
 529
 530     %new_entry = (
 531         CHR          => $entries->[ 0 ]->{ "CHR" },
 532         CHR_BEG      => $entries->[ 0 ]->{ "CHR_BEG" },
 533         CHR_END      => $entries->[ -1 ]->{ "CHR_END" },
 534         REC_TYPE     => "BED",
 535         BED_LEN      => $entries->[ -1 ]->{ "CHR_END" } - $entries->[ 0 ]->{ "CHR_BEG" } + 1,
 536         BED_COLS     => 12,
 537         Q_ID         => join( ":", @q_ids ),
 538         SCORE        => 999,
 539         STRAND       => $entries->[ 0 ]->{ "STRAND" }     || "+",
 540         THICK_BEG    => $entries->[ 0 ]->{ "THICK_BEG" }  || $entries->[ 0 ]->{ "CHR_BEG" },
 541         THICK_END    => $entries->[ -1 ]->{ "THICK_END" } || $entries->[ -1 ]->{ "CHR_END" },
 542         COLOR        => 0,
 543         BLOCK_COUNT  => scalar @new_q_begs,
 544         BLOCK_LENS   => join( ",", @new_blocksizes ),
 545         Q_BEGS       => join( ",", @new_q_begs ),
 546     );
 547
 548     return wantarray ? %new_entry : \%new_entry;
 549 }
 550
 551
 552 sub bed_split_entry
 553 {
 554     # Martin A. Hansen, February 2008.
 555
 556     # Splits a given BED entry into a list of blocks,
 557     # which are returned. A list of 6 column BED entry is returned.
 558
 559     my ( $entry,    # BED entry hashref
 560        ) = @_;
 561
 562     # Returns a list.
 563
 564     my ( @q_begs, @blocksizes, $block, @blocks, $i );
 565
 566     if ( exists $entry->{ "BLOCK_COUNT" } )
 567     {
 568         @q_begs     = split ",", $entry->{ "Q_BEGS" };
 569         @blocksizes = split ",", $entry->{ "BLOCK_LENS" };
 570
 571         for ( $i = 0; $i < @q_begs; $i++ )
 572         {
 573             undef $block;
 574
 575             $block->{ "CHR" }      = $entry->{ "CHR" };
 576             $block->{ "CHR_BEG" }  = $entry->{ "CHR_BEG" } + $q_begs[ $i ];
 577             $block->{ "CHR_END" }  = $entry->{ "CHR_BEG" } + $q_begs[ $i ] + $blocksizes[ $i ] - 1;
 578             $block->{ "Q_ID" }     = $entry->{ "Q_ID" } . sprintf( "_%03d", $i );
 579             $block->{ "SCORE" }    = $entry->{ "SCORE" };
 580             $block->{ "STRAND" }   = $entry->{ "STRAND" };
 581             $block->{ "BED_LEN" }  = $block->{ "CHR_END" } - $block->{ "CHR_BEG" } + 1,
 582             $block->{ "BED_COLS" } = 6;
 583             $block->{ "REC_TYPE" } = "BED";
 584
 585             push @blocks, $block;
 586         }
 587     }
 588     else
 589     {
 590         @blocks = @{ $entry };
 591     }
 592
 593     return wantarray ? @blocks : \@blocks;
 594 }
 595
 596
 597
 598 sub bed_overlap
 599 {
 600     # Martin A. Hansen, February 2008.
 601
 602     # Checks if two BED entries overlap and
 603     # return 1 if so - else 0;
 604
 605     my ( $entry1,      # hashref
 606          $entry2,      # hashref
 607          $no_strand,   # don't check strand flag - OPTIONAL
 608        ) = @_;
 609
 610     # Return bolean.
 611
 612     return 0 if $entry1->{ "CHR" }    ne $entry2->{ "CHR" };
 613     return 0 if $entry1->{ "STRAND" } ne $entry2->{ "STRAND" };
 614
 615     if ( $entry1->{ "CHR_END" } < $entry2->{ "CHR_BEG" } or $entry1->{ "CHR_BEG" } > $entry2->{ "CHR_END" } ) {
 616         return 0;
 617     } else {
 618         return 1;
 619     }
 620 }
 621
 622
 623 sub bed_upload_to_ucsc
 624 {
 625     # Martin A. Hansen, September 2007.
 626
 627     # Upload a BED file to the UCSC database.
 628
 629     my ( $tmp_dir,   # temporary directory
 630          $file,      # file to upload,
 631          $options,   # argument hashref
 632          $append,    # flag indicating table should be appended
 633        ) = @_;
 634
 635     # Returns nothing.
 636
 637     my ( $args, $table, $sql_file, $fh_out, $fh_in );
 638
 639     if ( $append ) {
 640         $args = join " ", $options->{ "database" }, $options->{ "table" }, "-tmpDir=$tmp_dir", "-oldTable", $file;
 641     } else {
 642         $args = join " ", $options->{ "database" }, $options->{ "table" }, "-tmpDir=$tmp_dir", $file;
 643     }
 644
 645     if ( $options->{ "table" } =~ /rnaSecStr/ )
 646     {
 647         $table = $options->{ "table" };
 648
 649         print qq(uploading secondary structure table:"$table"\n) if $options->{ "verbose" };
 650
 651         $sql_file = "$tmp_dir/upload_RNA_SS.sql";
 652
 653         $fh_out   = Maasha::Common::write_open( $sql_file );
 654
 655         print $fh_out qq(
 656 CREATE TABLE $table (
 657     bin smallint not null,              # Bin number for browser speedup
 658     chrom varchar(255) not null,        # Chromosome or FPC contig
 659     chromStart int unsigned not null,   # Start position in chromosome
 660     chromEnd int unsigned not null,     # End position in chromosome
 661     name varchar(255) not null,         # Name of item
 662     score int unsigned not null,        # Score from 0-1000
 663     strand char(1) not null,            # + or -
 664     size int unsigned not null,         # Size of element.
 665     secStr longblob not null,           # Parentheses and '.'s which define the secondary structure
 666     conf longblob not null,             # Confidence of secondary-structure annotation per position (0.0-1.0).
 667     #Indices
 668     INDEX(name(16)),
 669     INDEX(chrom(8), bin),
 670     INDEX(chrom(8), chromStart)
 671 );
 672         );
 673
 674         close $fh_out;
 675
 676         Maasha::Common::run( "hgLoadBed", "-notItemRgb -sqlTable=$sql_file $options->{ 'database' } $options->{ 'table' } -tmpDir=$tmp_dir $file > /dev/null 2>&1" );
 677
 678         unlink $sql_file;
 679     }
 680     else
 681     {
 682         Maasha::Common::run( "hgLoadBed", "$args > /dev/null 2>&1" );
 683     }
 684 }
 685
 686
 687 sub psl_upload_to_ucsc
 688 {
 689     # Martin A. Hansen, September 2007.
 690
 691     # Upload a PSL file to the UCSC database.
 692
 693     my ( $file,      # file to upload,
 694          $options,   # argument hashref
 695          $append,    # flag indicating table should be appended
 696        ) = @_;
 697
 698     # Returns nothing.
 699
 700     my ( $args );
 701
 702     if ( $append ) {
 703         $args = join " ", $options->{ "database" }, "-table=$options->{ 'table' }", "-clientLoad", "-append", $file;
 704     } else {
 705         $args = join " ", $options->{ "database" }, "-table=$options->{ 'table' }", "-clientLoad", $file;
 706     }
 707
 708     Maasha::Common::run( "hgLoadPsl", "$args > /dev/null 2>&1" );
 709 }
 710
 711
 712 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> TRACK FILE <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 713
 714
 715 sub ucsc_config_entry_get
 716 {
 717     # Martin A. Hansen, November 2008.
 718
 719     # Given a filehandle to a UCSC Genome Browser
 720     # config file (.ra) get the next entry and
 721     # return as a hash. Entries are separated by blank
 722     # lines. # lines are skipped unless it is the lines:
 723     # # Track added ...
 724     # # Database ...
 725
 726     my ( $fh,   # file hanlde
 727        ) = @_;
 728
 729     # Returns a hashref.
 730
 731     my ( $line, %record );
 732
 733     while ( $line = <$fh> )
 734     {
 735         $line =~ tr/\n\r//d;
 736
 737         if ( $line =~ /Track added by 'upload_to_ucsc' (\S+) (\S+)/) {
 738              $record{ 'date' } = $1;
 739              $record{ 'time' } = $2;
 740         } elsif ( $line =~ /^# date: (.+)/ ) {
 741             $record{ 'date' } = $1;
 742         } elsif ( $line =~ /^# time: (.+)/ ) {
 743             $record{ 'time' } = $1;
 744         } elsif ( $line =~ /^# (?:database:|Database) (.+)/ ) {
 745             $record{ 'database' } = $1;
 746         } elsif ( $line =~ /^#/ ) {
 747             # skip
 748         } elsif ( $line =~ /(\S+)\s+(.+)/ ) {
 749             $record{ $1 } = $2;
 750         }
 751
 752         last if $line =~ /^$/ and exists $record{ "track" };
 753     }
 754
 755     return undef if not exists $record{ "track" };
 756
 757     return wantarray ? %record : \%record;
 758 }
 759
 760
 761 sub ucsc_config_entry_put
 762 {
 763     # Martin A. Hansen, November 2008.
 764
 765     # Outputs a Biopiece record (a hashref)
 766     # to a filehandle or STDOUT.
 767
 768     my ( $record,    # hashref
 769          $fh_out,    # file handle
 770        ) = @_;
 771
 772     # Returns nothing.
 773
 774     my ( $date, $time );
 775
 776     $fh_out ||= \*STDOUT;
 777
 778     ( $date, $time ) = split " ", Maasha::Common::time_stamp();
 779
 780     $record->{ "date" } ||= $date;
 781     $record->{ "time" } ||= $time;
 782
 783     print $fh_out "\n# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n\n";
 784
 785     map { print $fh_out "# $_: $record->{ $_ }\n" if exists $record->{ $_ } } qw( date time database );
 786     map { print $fh_out "$_ $record->{ $_ }\n" if exists $record->{ $_ } } qw( track
 787                                                                                name
 788                                                                                description
 789                                                                                itemRgb
 790                                                                                db
 791                                                                                offset
 792                                                                                url
 793                                                                                htmlUrl
 794                                                                                shortLabel
 795                                                                                longLabel
 796                                                                                group
 797                                                                                priority
 798                                                                                useScore
 799                                                                                visibility
 800                                                                                maxHeightPixels
 801                                                                                color
 802                                                                                mafTrack
 803                                                                                type );
 804 }
 805
 806
 807 sub ucsc_update_config
 808 {
 809     # Martin A. Hansen, September 2007.
 810
 811     # Update the /home/user/ucsc/my_tracks.ra file and executes makeCustomTracks.pl
 812
 813     my ( $options,   # hashref
 814          $type,      # track type
 815        ) = @_;
 816
 817     # Returns nothing.
 818
 819     my ( $file, $entry, %new_entry, $fh_in, $fh_out, $was_found );
 820
 821     $file = $ENV{ "HOME" } . "/ucsc/my_tracks.ra";
 822
 823     Maasha::Filesys::file_copy( $file, "$file~" );   # create a backup
 824
 825     %new_entry = (
 826         database   => $options->{ 'database' },
 827         track      => $options->{ 'table' },
 828         shortLabel => $options->{ 'short_label' },
 829         longLabel  => $options->{ 'long_label' },
 830         group      => $options->{ 'group' },
 831         priority   => $options->{ 'priority' },
 832         visibility => $options->{ 'visibility' },
 833         color      => $options->{ 'color' },
 834         type       => $type,
 835     );
 836
 837     $new_entry{ 'useScore' }        = 1             if $options->{ 'use_score' };
 838     $new_entry{ 'mafTrack' }        = "multiz17way" if $type eq "type bed 6 +";
 839     $new_entry{ 'maxHeightPixels' } = "50:50:11"    if $type eq "wig 0";
 840
 841     $fh_in  = Maasha::Filesys::file_read_open( "$file~" );
 842     $fh_out = Maasha::Filesys::file_write_open( $file );
 843
 844     while ( $entry = ucsc_config_entry_get( $fh_in ) )
 845     {
 846         if ( $entry->{ 'database' } eq $new_entry{ 'database' } and $entry->{ 'track' } eq $new_entry{ 'track' } )
 847         {
 848             ucsc_config_entry_put( \%new_entry, $fh_out );
 849
 850             $was_found = 1;
 851         }
 852         else
 853         {
 854             ucsc_config_entry_put( $entry, $fh_out );
 855         }
 856     }
 857
 858     ucsc_config_entry_put( \%new_entry, $fh_out ) if not $was_found;
 859
 860     close $fh_in;
 861     close $fh_out;
 862
 863     Maasha::Common::run( "ucscMakeTracks.pl", "-b > /dev/null 2>&1" );
 864 }
 865
 866
 867 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> fixedStep format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 868
 869
 870 sub fixedstep_get_entry
 871 {
 872     # Martin A. Hansen, December 2007.
 873
 874     # Given a file handle to a PhastCons file get the
 875     # next entry which is all the lines after a "fixedStep"
 876     # line and until the next "fixedStep" line or EOF.
 877
 878     my ( $fh,   # filehandle
 879        ) = @_;
 880
 881     # Returns a list of lines
 882
 883     my ( $entry, @lines );
 884
 885     local $/ = "\nfixedStep ";
 886
 887     $entry = <$fh>;
 888
 889     chomp $entry;
 890
 891     @lines = split "\n", $entry;
 892
 893     return if @lines == 0;
 894
 895     $lines[ 0 ] =~ s/fixedStep?\s*//;
 896
 897     return wantarray ? @lines : \@lines;
 898 }
 899
 900
 901 sub fixedstep_index_create
 902 {
 903     # Martin A. Hansen, January 2008.
 904
 905     # Indexes a concatenated fixedStep file.
 906     # The index consists of a hash with chromosomes as keys,
 907     # and a list of [ chr_beg, next_chr_beg, chr_end, index_beg, index_len ] as values.
 908
 909     my ( $path,   # path to fixedStep file
 910        ) = @_;
 911
 912     # Returns a hashref
 913
 914     my ( $fh, $pos, $index_beg, $index_len, $entry, $locator, $chr, $step, $beg, $end, $len, %index, $i );
 915
 916     $fh = Maasha::Common::read_open( $path );
 917
 918     $pos = 0;
 919
 920     while ( $entry = Maasha::UCSC::fixedstep_get_entry( $fh ) )
 921     {
 922         $locator = shift @{ $entry };
 923
 924         if ( $locator =~ /chrom=([^ ]+) start=(\d+) step=(\d+)/ )
 925         {
 926             $chr  = $1;
 927             $beg  = $2 - 1;  #  fixedStep files are 1-based
 928             $step = $3;
 929         }
 930         else
 931         {
 932             Maasha::Common::error( qq(Could not parse locator: $locator) );
 933         }
 934
 935         $pos += length( $locator ) + 11;
 936
 937         $index_beg = $pos;
 938
 939 #        map { $pos += length( $_ ) + 1 } @{ $entry };
 940
 941         $pos += 6 * scalar @{ $entry };
 942
 943         $index_len = $pos - $index_beg;
 944
 945         push @{ $index{ $chr } }, [ $beg, undef, $beg + scalar @{ $entry } - 1, $index_beg, $index_len ];
 946     }
 947
 948     close $fh;
 949
 950     foreach $chr ( keys %index )
 951     {
 952         for ( $i = 0; $i < @{ $index{ $chr } } - 1; $i++ ) {
 953             $index{ $chr }->[ $i ]->[ FS_NEXT_CHR_BEG ] = $index{ $chr }->[ $i + 1 ]->[ 0 ];
 954         }
 955
 956         $index{ $chr }->[ -1 ]->[ FS_NEXT_CHR_BEG ] = $index{ $chr }->[ -1 ]->[ FS_CHR_END ] + 1;
 957     }
 958
 959     return wantarray ? %index : \%index;
 960 }
 961
 962
 963 sub fixedstep_index_store
 964 {
 965     # Martin A. Hansen, January 2008.
 966
 967     # Writes a fixedStep index to binary file.
 968
 969     my ( $path,   # full path to file
 970          $index,  # list with index
 971        ) = @_;
 972
 973     # returns nothing
 974
 975     Maasha::Common::file_store( $path, $index );
 976 }
 977
 978
 979 sub fixedstep_index_retrieve
 980 {
 981     # Martin A. Hansen, January 2008.
 982
 983     # Retrieves a fixedStep index from binary file.
 984
 985     my ( $path,   # full path to file
 986        ) = @_;
 987
 988     # returns list
 989
 990     my $index;
 991
 992     $index = Maasha::Common::file_retrieve( $path );
 993
 994     return wantarray ? %{ $index } : $index;
 995 }
 996
 997
 998 sub fixedstep_index_lookup
 999 {
1000     # Martin A. Hansen, January 2008.
1001
1002     # Retrieve fixedStep scores from a indexed
1003     # fixedStep file given a chromosome and
1004     # begin and end positions.
1005
1006     my ( $index,     # data structure
1007          $fh,        # filehandle to datafile
1008          $chr,       # chromosome
1009          $chr_beg,   # chromosome beg
1010          $chr_end,   # chromosome end
1011          $flank,     # include flanking region - OPTIONAL
1012        ) = @_;
1013
1014     # Returns a list
1015
1016     my ( $index_beg, $index_end, $i, $c, $beg, $end, @vals, $scores );
1017
1018     $flank ||= 0;
1019
1020     $chr_beg -= $flank;
1021     $chr_end += $flank;
1022
1023     # print "chr_beg->$chr_beg   chr_end->$chr_end   flank->$flank\n";
1024
1025     if ( exists $index->{ $chr } )
1026     {
1027         $index_beg = Maasha::Matrix::interval_search( $index->{ $chr }, 0, 1, $chr_beg );
1028
1029         if ( $index_beg < 0 ) {
1030             Maasha::Common::error( qq(Index search failed - begin index position doesn't exists: $chr_beg) );
1031         }
1032
1033         if ( $chr_end < $index->{ $chr }->[ $index_beg ]->[ 1 ] )
1034         {
1035             $index_end = $index_beg;
1036         }
1037         else
1038         {
1039             $index_end = Maasha::Matrix::interval_search( $index->{ $chr }, 0, 1, $chr_end );
1040
1041             if ( $index_end < 0 ) {
1042                 Maasha::Common::error( qq(Index search failed - end index position doesn't exists: $chr_end) );
1043             }
1044         }
1045
1046         map { $scores->[ $_ ] = 0 } 0 .. $chr_end - $chr_beg;
1047
1048         if ( $index_beg == $index_end )
1049         {
1050             $beg = Maasha::Calc::max( $chr_beg, $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] );
1051             $end = Maasha::Calc::min( $chr_end, $index->{ $chr }->[ $index_end ]->[ FS_CHR_END ] );
1052
1053             if ( $beg <= $index->{ $chr }->[ $index_beg ]->[ FS_CHR_END ] and $end >= $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] )
1054             {
1055                 @vals = split "\n", Maasha::Common::file_read(
1056                     $fh,
1057                     $index->{ $chr }->[ $index_beg ]->[ FS_INDEX_BEG ] + 6 * ( $beg - $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] ),
1058                     6 * ( $end - $beg + 1 ),
1059                 );
1060             }
1061
1062             for ( $c = 0; $c < @vals; $c++ ) {
1063                 $scores->[ $c + $beg - $chr_beg ] = $vals[ $c ];
1064             }
1065         }
1066         else
1067         {
1068             $beg = Maasha::Calc::max( $chr_beg, $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] );
1069
1070 #            print Dumper( $beg, $index->{ $chr }->[ $index_beg ] );
1071 #            print Dumper( "next", $index->{ $chr }->[ $index_beg ]->[ FS_NEXT_CHR_BEG ] );
1072
1073             #      beg         next
1074             #      v           v
1075             #  |||||||||.......
1076
1077             if ( $beg <= $index->{ $chr }->[ $index_beg ]->[ FS_CHR_END ] )
1078             {
1079                 @vals = split "\n", Maasha::Common::file_read(
1080                     $fh,
1081                     $index->{ $chr }->[ $index_beg ]->[ FS_INDEX_BEG ] + 6 * ( $beg - $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] ),
1082                     6 * ( $index->{ $chr }->[ $index_beg ]->[ FS_CHR_END ] - $beg + 1 ),
1083                 );
1084
1085                 for ( $c = 0; $c < @vals; $c++ ) {
1086                     $scores->[ $c + $beg - $chr_beg ] = $vals[ $c ];
1087                 }
1088             }
1089
1090             $end = Maasha::Calc::min( $chr_end, $index->{ $chr }->[ $index_end ]->[ FS_CHR_END ] );
1091
1092             if ( $end <= $index->{ $chr }->[ $index_end ]->[ FS_CHR_END ] )
1093             {
1094                 @vals = split "\n", Maasha::Common::file_read(
1095                     $fh,
1096                     $index->{ $chr }->[ $index_end ]->[ FS_INDEX_BEG ],
1097                     6 * ( $end - $index->{ $chr }->[ $index_end ]->[ FS_CHR_BEG ] + 1 ),
1098                 );
1099
1100                 for ( $c = 0; $c < @vals; $c++ ) {
1101                     $scores->[ $c + $index->{ $chr }->[ $index_end ]->[ FS_CHR_BEG ] - $chr_beg ] = $vals[ $c ];
1102                 }
1103             }
1104
1105             for ( $i = $index_beg + 1; $i <= $index_end - 1; $i++ )
1106             {
1107                 @vals = split "\n", Maasha::Common::file_read(
1108                     $fh,
1109                     $index->{ $chr }->[ $i ]->[ FS_INDEX_BEG ],
1110                     6 * ( $index->{ $chr }->[ $i ]->[ FS_CHR_END ] - $index->{ $chr }->[ $i ]->[ FS_CHR_BEG ] + 1 ),
1111                 );
1112
1113                 for ( $c = 0; $c < @vals; $c++ ) {
1114                     $scores->[ $c + $index->{ $chr }->[ $i ]->[ FS_CHR_BEG ] - $chr_beg ] = $vals[ $c ];
1115                 }
1116             }
1117         }
1118     }
1119     else
1120     {
1121         Maasha::Common::error( qq(Chromosome "$chr" was not found in index) );
1122     }
1123
1124     return wantarray ? @{ $scores } : $scores;
1125 }
1126
1127
1128 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PhastCons format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1129
1130
1131 sub phastcons_index
1132 {
1133     # Martin A. Hansen, July 2008
1134
1135     # Create a fixedStep index for PhastCons data.
1136
1137     my ( $file,   # file to index
1138          $dir,    # dir with file
1139        ) = @_;
1140
1141     # Returns nothing.
1142
1143     my ( $index );
1144
1145     $index = fixedstep_index_create( "$dir/$file" );
1146
1147     fixedstep_index_store( "$dir/$file.index", $index );
1148 }
1149
1150
1151 sub phastcons_parse_entry
1152 {
1153     # Martin A. Hansen, December 2007.
1154
1155     # Given a PhastCons entry converts this to a
1156     # list of super blocks.
1157
1158 #    $options->{ "min" }       ||= 10;
1159 #    $options->{ "dist" }      ||= 25;
1160 #    $options->{ "threshold" } ||= 0.8;
1161 #    $options->{ "gap" }       ||= 5;
1162
1163     my ( $lines,   # list of lines
1164          $args,    # argument hash
1165        ) = @_;
1166
1167     # Returns
1168
1169     my ( $info, $chr, $beg, $step, $i, $c, $j, @blocks, @super_blocks, @entries, $super_block, $block, @lens, @begs );
1170
1171     $info = shift @{ $lines };
1172
1173     if ( $info =~ /^chrom=([^ ]+) start=(\d+) step=(\d+)$/ )
1174     {
1175         $chr  = $1;
1176         $beg  = $2;
1177         $step = $3;
1178
1179         die qq(ERROR: step size $step != 1 -> problem!\n) if $step != 1; # in an ideal world should would be fixed ...
1180     }
1181
1182     $i = 0;
1183
1184     while ( $i < @{ $lines } )
1185     {
1186         if ( $lines->[ $i ] >= $args->{ "threshold" } )
1187         {
1188             $c = $i + 1;
1189
1190             while ( $c < @{ $lines } )
1191             {
1192                 if ( $lines->[ $c ] < $args->{ "threshold" } )
1193                 {
1194                     $j = $c + 1;
1195
1196                     while ( $j < @{ $lines } and $lines->[ $j ] < $args->{ "threshold" } ) {
1197                         $j++;
1198                     }
1199
1200                     if ( $j - $c > $args->{ "gap" } )
1201                     {
1202                         if ( $c - $i >= $args->{ "min" } )
1203                         {
1204                             push @blocks, {
1205                                 CHR     => $chr,
1206                                 CHR_BEG => $beg + $i - 1,
1207                                 CHR_END => $beg + $c - 2,
1208                                 CHR_LEN => $c - $i,
1209                             };
1210                         }
1211
1212                         $i = $j;
1213
1214                         last;
1215                     }
1216
1217                     $c = $j
1218                 }
1219                 else
1220                 {
1221                     $c++;
1222                 }
1223             }
1224
1225             if ( $c - $i >= $args->{ "min" } )
1226             {
1227                 push @blocks, {
1228                     CHR     => $chr,
1229                     CHR_BEG => $beg + $i - 1,
1230                     CHR_END => $beg + $c - 2,
1231                     CHR_LEN => $c - $i,
1232                 };
1233             }
1234
1235             $i = $c;
1236         }
1237         else
1238         {
1239             $i++;
1240         }
1241     }
1242
1243     $i = 0;
1244
1245     while ( $i < @blocks )
1246     {
1247         $c = $i + 1;
1248
1249         while ( $c < @blocks and $blocks[ $c ]->{ "CHR_BEG" } - $blocks[ $c - 1 ]->{ "CHR_END" } <= $args->{ "dist" } )
1250         {
1251             $c++;
1252         }
1253
1254         push @super_blocks, [ @blocks[ $i .. $c - 1 ] ];
1255
1256         $i = $c;
1257     }
1258
1259     foreach $super_block ( @super_blocks )
1260     {
1261         foreach $block ( @{ $super_block } )
1262         {
1263             push @begs, $block->{ "CHR_BEG" } - $super_block->[ 0 ]->{ "CHR_BEG" };
1264             push @lens, $block->{ "CHR_LEN" } - 1;
1265         }
1266
1267         $lens[ -1 ]++;
1268
1269         push @entries, {
1270             CHR         => $super_block->[ 0 ]->{ "CHR" },
1271             CHR_BEG     => $super_block->[ 0 ]->{ "CHR_BEG" },
1272             CHR_END     => $super_block->[ -1 ]->{ "CHR_END" },
1273             Q_ID        => "Q_ID",
1274             SCORE       => 100,
1275             STRAND      => "+",
1276             THICK_BEG   => $super_block->[ 0 ]->{ "CHR_BEG" },
1277             THICK_END   => $super_block->[ -1 ]->{ "CHR_END" } + 1,
1278             COLOR       => 0,
1279             BLOCK_COUNT => scalar @{ $super_block },
1280             BLOCK_LENS  => join( ",", @lens ),
1281             Q_BEGS      => join( ",", @begs ),
1282         };
1283
1284         undef @begs;
1285         undef @lens;
1286     }
1287
1288     return wantarray ? @entries : \@entries;
1289 }
1290
1291
1292 sub phastcons_normalize
1293 {
1294     # Martin A. Hansen, January 2008.
1295
1296     # Normalizes a list of lists with PhastCons scores,
1297     # in such a way that each list contains the same number
1298     # of PhastCons scores.
1299
1300     my ( $AoA,    # AoA with PhastCons scores
1301        ) = @_;
1302
1303     # Returns AoA.
1304
1305     my ( $list, $max, $min, $mean, $diff );
1306
1307     $min = 99999999;
1308     $max = 0;
1309
1310     foreach $list ( @{ $AoA } )
1311     {
1312         $min = scalar @{ $list } if scalar @{ $list } < $min;
1313         $max = scalar @{ $list } if scalar @{ $list } > $max;
1314     }
1315
1316     $mean = int( ( $min + $max ) / 2 );
1317
1318 #    print STDERR "min->$min   max->$max   mean->$mean\n";
1319
1320     foreach $list ( @{ $AoA } )
1321     {
1322         $diff = scalar @{ $list } - $mean;
1323
1324         phastcons_list_inflate( $list, abs( $diff ) ) if $diff < 0;
1325         phastcons_list_deflate( $list, $diff )        if $diff > 0;
1326     }
1327
1328     return wantarray ? @{ $AoA } : $AoA;
1329 }
1330
1331
1332 sub phastcons_list_inflate
1333 {
1334     # Martin A. Hansen, January 2008.
1335
1336     # Inflates a list with a given number of elements
1337     # in such a way that the extra elements are introduced
1338     # evenly over the entire length of the list. The value
1339     # of the extra elements is based on a mean of the
1340     # adjacent elements.
1341
1342     my ( $list,   # list of elements
1343          $diff,   # number of elements to introduce
1344        ) = @_;
1345
1346     # Returns nothing
1347
1348     my ( $len, $space, $i, $pos );
1349
1350     $len = scalar @{ $list };
1351
1352     $space = $len / $diff;
1353
1354     for ( $i = 0; $i < $diff; $i++ )
1355     {
1356         $pos = int( ( $space / 2 ) + $i * $space );
1357
1358         splice @{ $list }, $pos, 0, ( $list->[ $pos - 1 ] + $list->[ $pos + 1 ] ) / 2;
1359         # splice @{ $list }, $pos, 0, "X";
1360     }
1361
1362     die qq(ERROR: Bad inflate\n) if scalar @{ $list } != $len + $diff;
1363 }
1364
1365
1366 sub phastcons_list_deflate
1367 {
1368     # Martin A. Hansen, January 2008.
1369
1370     # Deflates a list by removing a given number of elements
1371     # evenly distributed over the entire list.
1372
1373     my ( $list,   # list of elements
1374          $diff,   # number of elements to remove
1375        ) = @_;
1376
1377     # Returns nothing
1378
1379     my ( $len, $space, $i, $pos );
1380
1381     $len = scalar @{ $list };
1382
1383     $space = ( $len - $diff ) / $diff;
1384
1385     for ( $i = 0; $i < $diff; $i++ )
1386     {
1387         $pos = int( ( $space / 2 ) + $i * $space );
1388
1389         splice @{ $list }, $pos, 1;
1390     }
1391
1392     die qq(ERROR: Dad deflate\n) if scalar @{ $list } != $len - $diff;
1393 }
1394
1395
1396 sub phastcons_mean
1397 {
1398     # Martin A. Hansen, January 2008.
1399
1400     # Given a normalized PhastCons matrix in an AoA,
1401     # calculate the mean for each column and return as a list.
1402
1403     my ( $AoA,    # AoA with normalized PhastCons scores
1404        ) = @_;
1405
1406     # Returns a list
1407
1408     my ( @list );
1409
1410     $AoA = Maasha::Matrix::matrix_flip( $AoA );
1411
1412     map { push @list, Maasha::Calc::mean( $_ ) } @{ $AoA };
1413
1414     return wantarray ? @list : \@list;
1415 }
1416
1417
1418 sub phastcons_median
1419 {
1420     # Martin A. Hansen, January 2008.
1421
1422     # Given a normalized PhastCons matrix in an AoA,
1423     # calculate the median for each column and return as a list.
1424
1425     my ( $AoA,    # AoA with normalized PhastCons scores
1426        ) = @_;
1427
1428     # Returns a list
1429
1430     my ( @list );
1431
1432     $AoA = Maasha::Matrix::matrix_flip( $AoA );
1433
1434     map { push @list, Maasha::Calc::median( $_ ) } @{ $AoA };
1435
1436     return wantarray ? @list : \@list;
1437 }
1438
1439
1440 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> MULTIPLE ALIGNMENT FILES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1441
1442
1443 sub maf_extract
1444 {
1445     # Martin A. Hansen, April 2008.
1446
1447     # Executes mafFrag to extract a subalignment from a multiz track
1448     # in the UCSC genome browser database.
1449
1450     my ( $tmp_dir,    # temporary directory
1451          $database,   # genome database
1452          $table,      # table with the multiz track
1453          $chr,        # chromosome
1454          $beg,        # begin position
1455          $end,        # end position
1456          $strand,     # strand
1457        ) = @_;
1458
1459     # Returns a list of record
1460
1461     my ( $tmp_file, $align );
1462
1463     $tmp_file = "$tmp_dir/maf_extract.maf";
1464
1465     Maasha::Common::run( "mafFrag", "$database $table $chr $beg $end $strand $tmp_file" );
1466
1467     $align = maf_parse( $tmp_file );
1468
1469     unlink $tmp_file;
1470
1471     return wantarray ? @{ $align } : $align;
1472 }
1473
1474
1475 sub maf_parse
1476 {
1477     # Martin A. Hansen, April 2008.
1478
1479
1480     my ( $path,   # full path to MAF file
1481        ) = @_;
1482
1483     # Returns a list of record.
1484
1485     my ( $fh, $line, @fields, @align );
1486
1487     $fh = Maasha::Common::read_open( $path );
1488
1489     while ( $line = <$fh> )
1490     {
1491         chomp $line;
1492
1493         if ( $line =~ /^s/ )
1494         {
1495             @fields = split / /, $line;
1496
1497             push @align, {
1498                 SEQ_NAME  => $fields[ 1 ],
1499                 SEQ       => $fields[ -1 ],
1500                 ALIGN     => 1,
1501                 ALIGN_LEN => length $fields[ -1 ],
1502             }
1503         }
1504     }
1505
1506     close $fh;
1507
1508     return wantarray ? @align : \@align;
1509 }
1510
1511
1512 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> WIGGLE FORMAT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1513
1514
1515 sub fixedstep_put_entry
1516 {
1517     # Martin A. Hansen, April 2008.
1518
1519     # Outputs a block of fixedStep values.
1520     # Used for outputting wiggle data.
1521
1522     my ( $chr,      # chromosome
1523          $beg,      # start position
1524          $block,    # list of scores
1525          $fh,       # filehandle - OPTIONAL
1526          $log10,    # flag indicating that log10 scores should be used
1527        ) = @_;
1528
1529     # Returns nothing.
1530
1531     $beg += 1;   # fixedStep format is 1 based.
1532
1533     $fh ||= \*STDOUT;
1534
1535     print $fh "fixedStep chrom=$chr start=$beg step=1\n";
1536
1537     if ( $log10 ) {
1538         map { printf( $fh "%f\n", Maasha::Calc::log10( $_ + 1 ) ) } @{ $block };
1539     } else {
1540         map { printf( $fh "%d\n", ( $_ + 1 ) ) } @{ $block };
1541     }
1542 }
1543
1544
1545 sub wiggle_upload_to_ucsc
1546 {
1547     # Martin A. Hansen, May 2008.
1548
1549     # Upload a wiggle file to the UCSC database.
1550
1551     my ( $tmp_dir,    # temporary directory
1552          $wib_dir,    # wib directory
1553          $wig_file,   # file to upload,
1554          $options,    # argument hashref
1555        ) = @_;
1556
1557     # Returns nothing.
1558
1559     my ( $args );
1560
1561 #    $args = join " ", "-tmpDir=$tmp_dir", "-pathPrefix=$wib_dir", $options->{ "database" }, $options->{ 'table' }, $wig_file;
1562
1563 #    Maasha::Common::run( "hgLoadWiggle", "$args > /dev/null 2>&1" );
1564
1565     if ( $options->{ 'verbose' } ) {
1566         `cd $tmp_dir && hgLoadWiggle -tmpDir=$tmp_dir -pathPrefix=$wib_dir $options->{ 'database' } $options->{ 'table' } $wig_file`;
1567     } else {
1568         `cd $tmp_dir && hgLoadWiggle -tmpDir=$tmp_dir -pathPrefix=$wib_dir $options->{ 'database' } $options->{ 'table' } $wig_file > /dev/null 2>&1`;
1569     }
1570 }
1571
1572
1573 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> MySQL CONF <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1574
1575
1576 sub ucsc_get_user
1577 {
1578     # Martin A. Hansen, May 2008
1579
1580     # Fetches the MySQL database user name from the
1581     # .hg.conf file in the users home directory.
1582
1583     # Returns a string.
1584
1585     my ( $file, $fh, $line, $user );
1586
1587     $file = "$ENV{ 'HOME' }/.hg.conf";
1588
1589     return if not -f $file;
1590
1591     $fh = Maasha::Common::read_open( $file );
1592
1593     while ( $line = <$fh> )
1594     {
1595         chomp $line;
1596
1597         if ( $line =~ /^db\.user=(.+)/ )
1598         {
1599             $user = $1;
1600
1601             last;
1602         }
1603     }
1604
1605     close $fh;
1606
1607     return $user;
1608 }
1609
1610
1611 sub ucsc_get_password
1612 {
1613     # Martin A. Hansen, May 2008
1614
1615     # Fetches the MySQL database password from the
1616     # .hg.conf file in the users home directory.
1617
1618     # Returns a string.
1619
1620     my ( $file, $fh, $line, $password );
1621
1622     $file = "$ENV{ 'HOME' }/.hg.conf";
1623
1624     return if not -f $file;
1625
1626     $fh = Maasha::Common::read_open( $file );
1627
1628     while ( $line = <$fh> )
1629     {
1630         chomp $line;
1631
1632         if ( $line =~ /^db\.password=(.+)/ )
1633         {
1634             $password = $1;
1635
1636             last;
1637         }
1638     }
1639
1640     close $fh;
1641
1642     return $password;
1643 }
1644
1645
1646 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1647
1648
1649 __END__
1650