code_perl/Maasha/UCSC.pm

   1 package Maasha::UCSC;
   2
   3 # Copyright (C) 2007 Martin A. Hansen.
   4
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; either version 2
   8 # of the License, or (at your option) any later version.
   9
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18
  19 # http://www.gnu.org/copyleft/gpl.html
  20
  21
  22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  23
  24
  25 # Stuff for interacting with UCSC genome browser
  26
  27
  28 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  29
  30
  31 use strict;
  32 use vars qw ( @ISA @EXPORT );
  33
  34 use Data::Dumper;
  35 use Maasha::Common;
  36 use Maasha::Filesys;
  37 use Maasha::Calc;
  38 use Maasha::Matrix;
  39
  40 use constant {
  41     FS_CHR_BEG      => 0,
  42     FS_NEXT_CHR_BEG => 1,
  43     FS_CHR_END      => 2,
  44     FS_INDEX_BEG    => 3,
  45     FS_INDEX_LEN    => 4,
  46
  47     CHR          => 0,
  48     CHR_BEG      => 1,
  49     CHR_END      => 2,
  50     Q_ID         => 3,
  51     SCORE        => 4,
  52     STRAND       => 5,
  53     THICK_BEG    => 6,
  54     THICK_END    => 7,
  55     ITEMRGB      => 8,
  56     BLOCKCOUNT   => 9,
  57     BLOCKSIZES   => 10,
  58     Q_BEGS       => 11,
  59 };
  60
  61 @ISA = qw( Exporter );
  62
  63
  64 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> BED format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  65
  66
  67 # http://genome.ucsc.edu/goldenPath/help/hgTracksHelp.html#BED
  68
  69
  70 sub bed_entry_get_array
  71 {
  72     # Martin A. Hansen, September 2008.
  73
  74     # Reads a BED entry given a filehandle.
  75
  76     # This is a new _faster_ BED entry parser that
  77     # uses arrays and not hashrefs.
  78
  79     # IMPORTANT! This function does not correct the
  80     # BED_END position that is kept the way UCSC
  81     # does.
  82
  83     my ( $fh,     # file handle
  84          $cols,   # columns to read - OPTIONAL (3,4,5,6, or 12)
  85        ) = @_;
  86
  87     # Returns a list.
  88
  89     my ( $line, @entry );
  90
  91     $line = <$fh>;
  92
  93     $line =~ tr/\n\r//d;    # some people have carriage returns in their BED files -> Grrrr
  94
  95     return if not defined $line;
  96
  97     if ( not defined $cols ) {
  98         $cols = 1 + $line =~ tr/\t//;
  99     }
 100
 101     @entry = split "\t", $line, $cols + 1;
 102
 103     pop @entry if scalar @entry > $cols;
 104
 105     return wantarray ? @entry : \@entry;
 106 }
 107
 108
 109 sub bed_get_entry
 110 {
 111     # Martin A. Hansen, December 2007.
 112
 113     # Reads a bed entry given a filehandle.
 114
 115     my ( $fh,        # file handle
 116          $columns,   # number of BED columns to read  -  OPTIONAL
 117        ) = @_;
 118
 119     # Returns hashref.
 120
 121     my ( $line, @fields, %entry );
 122
 123     $line = <$fh>;
 124
 125     $line =~ tr/\n\r//d;    # some people have carriage returns in their BED files -> Grrrr
 126
 127     return if not defined $line;
 128
 129     @fields = split "\t", $line;
 130
 131     $columns ||= scalar @fields;
 132
 133     if ( $columns == 3 )
 134     {
 135         %entry = (
 136             "CHR"      => $fields[ 0 ],
 137             "CHR_BEG"  => $fields[ 1 ],
 138             "CHR_END"  => $fields[ 2 ] - 1,
 139         );
 140     }
 141     elsif ( $columns == 4 )
 142     {
 143         %entry = (
 144             "CHR"      => $fields[ 0 ],
 145             "CHR_BEG"  => $fields[ 1 ],
 146             "CHR_END"  => $fields[ 2 ] - 1,
 147             "Q_ID"     => $fields[ 3 ],
 148         );
 149     }
 150     elsif ( $columns == 5 )
 151     {
 152         %entry = (
 153             "CHR"      => $fields[ 0 ],
 154             "CHR_BEG"  => $fields[ 1 ],
 155             "CHR_END"  => $fields[ 2 ] - 1,
 156             "Q_ID"     => $fields[ 3 ],
 157             "SCORE"    => $fields[ 4 ],
 158         );
 159     }
 160     elsif ( $columns == 6 )
 161     {
 162         %entry = (
 163             "CHR"      => $fields[ 0 ],
 164             "CHR_BEG"  => $fields[ 1 ],
 165             "CHR_END"  => $fields[ 2 ] - 1,
 166             "Q_ID"     => $fields[ 3 ],
 167             "SCORE"    => $fields[ 4 ],
 168             "STRAND"   => $fields[ 5 ],
 169         );
 170     }
 171     elsif ( $columns == 12 )
 172     {
 173         %entry = (
 174             "CHR"        => $fields[ 0 ],
 175             "CHR_BEG"    => $fields[ 1 ],
 176             "CHR_END"    => $fields[ 2 ] - 1,
 177             "Q_ID"       => $fields[ 3 ],
 178             "SCORE"      => $fields[ 4 ],
 179             "STRAND"     => $fields[ 5 ],
 180             "THICK_BEG"  => $fields[ 6 ],
 181             "THICK_END"  => $fields[ 7 ] - 1,
 182             "ITEMRGB"    => $fields[ 8 ],
 183             "BLOCKCOUNT" => $fields[ 9 ],
 184             "BLOCKSIZES" => $fields[ 10 ],
 185             "Q_BEGS"     => $fields[ 11 ],
 186         );
 187     }
 188     else
 189     {
 190         Maasha::Common::error( qq(Bad BED format in line->$line<-) );
 191     }
 192
 193     $entry{ "REC_TYPE" } = "BED";
 194     $entry{ "BED_LEN" }  = $entry{ "CHR_END" } - $entry{ "CHR_BEG" } + 1;
 195     $entry{ "BED_COLS" } = $columns;
 196
 197     return wantarray ? %entry : \%entry;
 198 }
 199
 200
 201 sub bed_get_entries
 202 {
 203     # Martin A. Hansen, January 2008.
 204
 205     # Given a path to a BED file, read in all entries
 206     # and return.
 207
 208     my ( $path,     # full path to BED file
 209          $columns,  # number of columns in BED file - OPTIONAL (but is faster)
 210        ) = @_;
 211
 212     # Returns a list.
 213
 214     my ( $fh, $entry, @list );
 215
 216     $fh = Maasha::Common::read_open( $path );
 217
 218     while ( $entry = bed_get_entry( $fh ) ) {
 219         push @list, $entry;
 220     }
 221
 222     close $fh;
 223
 224     return wantarray ? @list : \@list;
 225 }
 226
 227
 228 sub bed_entry_put_array
 229 {
 230     # Martin A. Hansen, Septermber 2008.
 231
 232     # Writes a BED entry array to file.
 233
 234     # IMPORTANT! This function does not correct the
 235     # BED_END position that is assumed to be in the
 236     # UCSC positions scheme.
 237
 238     my ( $record,   # list
 239          $fh,       # file handle                   - OPTIONAL
 240          $cols,     # number of columns in BED file - OPTIONAL
 241        ) = @_;
 242
 243     # Returns nothing.
 244
 245     $fh = \*STDOUT if not defined $fh;
 246
 247     if ( defined $cols ) {
 248         print $fh join( "\t", @{ $record }[ 0 .. $cols - 1 ] ), "\n";
 249     } else {
 250         print $fh join( "\t", @{ $record } ), "\n";
 251     }
 252 }
 253
 254
 255 sub bed_put_entry
 256 {
 257     # Martin A. Hansen, Septermber 2007.
 258
 259     # Writes a BED entry to file.
 260
 261     # NB, this could really be more robust!?
 262
 263     my ( $record,       # hashref
 264          $fh,           # file handle                   - OPTIONAL
 265          $columns,      # number of columns in BED file - OPTIONAL (but is faster)
 266        ) = @_;
 267
 268     # Returns nothing.
 269
 270     my ( @fields );
 271
 272     $columns ||= 12;   # max number of columns possible
 273
 274     if ( $columns == 3 )
 275     {
 276         push @fields, $record->{ "CHR" };
 277         push @fields, $record->{ "CHR_BEG" };
 278         push @fields, $record->{ "CHR_END" } + 1;
 279     }
 280     elsif ( $columns == 4 )
 281     {
 282         $record->{ "Q_ID" }  =~ s/\s+/_/g;
 283
 284         push @fields, $record->{ "CHR" };
 285         push @fields, $record->{ "CHR_BEG" };
 286         push @fields, $record->{ "CHR_END" } + 1;
 287         push @fields, $record->{ "Q_ID" };
 288     }
 289     elsif ( $columns == 5 )
 290     {
 291         $record->{ "Q_ID" }  =~ s/\s+/_/g;
 292         $record->{ "SCORE" } =~ s/\.\d*//;
 293
 294         push @fields, $record->{ "CHR" };
 295         push @fields, $record->{ "CHR_BEG" };
 296         push @fields, $record->{ "CHR_END" } + 1;
 297         push @fields, $record->{ "Q_ID" };
 298         push @fields, $record->{ "SCORE" };
 299     }
 300     elsif ( $columns == 6 )
 301     {
 302         $record->{ "Q_ID" }  =~ s/\s+/_/g;
 303         $record->{ "SCORE" } =~ s/\.\d*//;
 304
 305         push @fields, $record->{ "CHR" };
 306         push @fields, $record->{ "CHR_BEG" };
 307         push @fields, $record->{ "CHR_END" } + 1;
 308         push @fields, $record->{ "Q_ID" };
 309         push @fields, $record->{ "SCORE" };
 310         push @fields, $record->{ "STRAND" };
 311     }
 312     else
 313     {
 314         $record->{ "Q_ID" }  =~ s/\s+/_/g;
 315         $record->{ "SCORE" } =~ s/\.\d*//;
 316
 317         push @fields, $record->{ "CHR" };
 318         push @fields, $record->{ "CHR_BEG" };
 319         push @fields, $record->{ "CHR_END" } + 1;
 320         push @fields, $record->{ "Q_ID" };
 321         push @fields, $record->{ "SCORE" };
 322         push @fields, $record->{ "STRAND" };
 323         push @fields, $record->{ "THICK_BEG" }     if defined $record->{ "THICK_BEG" };
 324         push @fields, $record->{ "THICK_END" } + 1 if defined $record->{ "THICK_END" };
 325         push @fields, $record->{ "ITEMRGB" }       if defined $record->{ "ITEMRGB" };
 326         push @fields, $record->{ "BLOCKCOUNT" }    if defined $record->{ "BLOCKCOUNT" };
 327         push @fields, $record->{ "BLOCKSIZES" }    if defined $record->{ "BLOCKSIZES" };
 328         push @fields, $record->{ "Q_BEGS" }        if defined $record->{ "Q_BEGS" };
 329     }
 330
 331     if ( $fh ) {
 332         print $fh join( "\t", @fields ), "\n";
 333     } else {
 334         print join( "\t", @fields ), "\n";
 335     }
 336 }
 337
 338
 339 sub bed_put_entries
 340 {
 341     # Martin A. Hansen, January 2008.
 342
 343     # Write a list of BED entries.
 344
 345     my ( $entries,   # list of entries,
 346          $fh,        # file handle - OPTIONAL
 347        ) = @_;
 348
 349     # Returns nothing.
 350
 351     map { bed_put_entry( $_, $fh ) } @{ $entries };
 352 }
 353
 354
 355 sub bed_analyze
 356 {
 357     # Martin A. Hansen, March 2008.
 358
 359     # Given a bed record, analysis this to give information
 360     # about intron/exon sizes.
 361
 362     my ( $entry,   # BED entry
 363        ) = @_;
 364
 365     # Returns hashref.
 366
 367     my ( $i, @begs, @lens, $exon_max, $exon_min, $exon_len, $exon_tot, $intron_max, $intron_min, $intron_len, $intron_tot );
 368
 369     $exon_max   = 0;
 370     $exon_min   = 9999999999;
 371     $intron_max = 0;
 372     $intron_min = 9999999999;
 373
 374     $entry->{ "EXONS" }   = $entry->{ "BLOCKCOUNT" };
 375
 376     @begs = split /,/, $entry->{ "Q_BEGS" };
 377     @lens = split /,/, $entry->{ "BLOCKSIZES" };
 378
 379     for ( $i = 0; $i < $entry->{ "BLOCKCOUNT" }; $i++ )
 380     {
 381         $exon_len = @lens[ $i ];
 382
 383         $entry->{ "EXON_LEN_$i" } = $exon_len;
 384
 385         $exon_max = $exon_len if $exon_len > $exon_max;
 386         $exon_min = $exon_len if $exon_len < $exon_min;
 387
 388         $exon_tot += $exon_len;
 389     }
 390
 391     $entry->{ "EXON_LEN_-1" }   = $exon_len;
 392     $entry->{ "EXON_MAX_LEN" }  = $exon_max;
 393     $entry->{ "EXON_MIN_LEN" }  = $exon_min;
 394     $entry->{ "EXON_MEAN_LEN" } = int( $exon_tot / $entry->{ "EXONS" } );
 395
 396     $entry->{ "INTRONS" } = $entry->{ "BLOCKCOUNT" } - 1;
 397     $entry->{ "INTRONS" } = 0 if $entry->{ "INTRONS" } < 0;
 398
 399     if ( $entry->{ "INTRONS" } )
 400     {
 401         for ( $i = 1; $i < $entry->{ "BLOCKCOUNT" }; $i++ )
 402         {
 403             $intron_len = @begs[ $i ] - ( @begs[ $i - 1 ] + @lens[ $i - 1 ] );
 404
 405             $entry->{ "INTRON_LEN_" . ( $i - 1 ) } = $intron_len;
 406
 407             $intron_max = $intron_len if $intron_len > $intron_max;
 408             $intron_min = $intron_len if $intron_len < $intron_min;
 409
 410             $intron_tot += $intron_len;
 411         }
 412
 413         $entry->{ "INTRON_LEN_-1" }   = $intron_len;
 414         $entry->{ "INTRON_MAX_LEN" }  = $intron_max;
 415         $entry->{ "INTRON_MIN_LEN" }  = $intron_min;
 416         $entry->{ "INTRON_MEAN_LEN" } = int( $intron_tot / $entry->{ "INTRONS" } );
 417     }
 418
 419     return wantarray ? %{ $entry } : $entry;
 420 }
 421
 422
 423 sub bed_sort
 424 {
 425     # Martin A. Hansen, Septermber 2008
 426
 427     # Sorts a BED file using the c program
 428     # "bed_sort" specifing a sort mode:
 429
 430     # 1: chr AND chr_beg.
 431     # 2: chr AND strand AND chr_beg.
 432     # 3: chr_beg.
 433     # 4: strand AND chr_beg.
 434
 435     my ( $bed_file,    # BED file to sort
 436          $sort_mode,   # See above.
 437          $cols,        # Number of columns in BED file
 438        ) = @_;
 439
 440     &Maasha::Common::run( "bed_sort", "--sort $sort_mode --cols $cols $bed_file" );
 441 }
 442
 443
 444 sub bed_split_to_files
 445 {
 446     # Martin A. Hansen, Septermber 2008
 447
 448     # Given a list of BED files, split these
 449     # into temporary files based on the chromosome
 450     # name. Returns a list of the temporary files.
 451
 452     my ( $bed_files,   # list of BED files to split
 453          $cols,        # number of columns
 454          $tmp_dir,     # temporary directory
 455        ) = @_;
 456
 457     # Returns a list.
 458
 459     my ( $bed_file, $fh_in, $entry, $key, %fh_hash, @tmp_files );
 460
 461     foreach $bed_file ( @{ $bed_files } )
 462     {
 463         $fh_in = Maasha::Common::read_open( $bed_file );
 464
 465         while ( $entry = bed_entry_get_array( $fh_in, $cols ) )
 466         {
 467             $key = $entry->[ CHR ];
 468
 469             $fh_hash{ $key } = Maasha::Common::write_open( "$tmp_dir/$key.temp" ) if not exists $fh_hash{ $key };
 470
 471             bed_entry_put_array( $entry, $fh_hash{ $key } );
 472         }
 473
 474         close $fh_in;
 475     }
 476
 477     foreach $key ( sort keys %fh_hash )
 478     {
 479         push @tmp_files, "$tmp_dir/$key.temp";
 480
 481         close $fh_hash{ $key };
 482     }
 483
 484     return wantarray ? @tmp_files : \@tmp_files;
 485 }
 486
 487
 488 sub bed_merge_entries
 489 {
 490     # Martin A. Hansen, February 2008.
 491
 492     # Merge a list of given BED entries in one big entry.
 493
 494     my ( $entries,     # list of BED entries to be merged
 495        ) = @_;
 496
 497     # Returns hash.
 498
 499     my ( $i, @q_ids, @q_begs, @blocksizes, @new_q_begs, @new_blocksizes, %new_entry );
 500
 501     @{ $entries } = sort { $a->{ "CHR_BEG" } <=> $b->{ "CHR_BEG" } } @{ $entries };
 502
 503     for ( $i = 0; $i < @{ $entries }; $i++ )
 504     {
 505         Maasha::Common::error( qq(Attempted merge of BED entries from different chromosomes) ) if $entries->[ 0 ]->{ "CHR" }    ne $entries->[ $i ]->{ "CHR" };
 506         Maasha::Common::error( qq(Attempted merge of BED entries from different strands) )     if $entries->[ 0 ]->{ "STRAND" } ne $entries->[ $i ]->{ "STRAND" };
 507
 508         push @q_ids, $entries->[ $i ]->{ "Q_ID" } || sprintf( "ID%06d", $i );
 509
 510         if ( exists $entries->[ $i ]->{ "Q_BEGS" } )
 511         {
 512             @q_begs     = split ",", $entries->[ $i ]->{ "Q_BEGS" };
 513             @blocksizes = split ",", $entries->[ $i ]->{ "BLOCKSIZES" };
 514         }
 515         else
 516         {
 517             @q_begs     = 0;
 518             @blocksizes = $entries->[ $i ]->{ "CHR_END" } - $entries->[ $i ]->{ "CHR_BEG" } + 1;
 519         }
 520
 521         map { $_ += $entries->[ $i ]->{ "CHR_BEG" } } @q_begs;
 522
 523         push @new_q_begs, @q_begs;
 524         push @new_blocksizes, @blocksizes;
 525     }
 526
 527     map { $_ -= $entries->[ 0 ]->{ "CHR_BEG" } } @new_q_begs;
 528
 529     %new_entry = (
 530         CHR         => $entries->[ 0 ]->{ "CHR" },
 531         CHR_BEG     => $entries->[ 0 ]->{ "CHR_BEG" },
 532         CHR_END     => $entries->[ -1 ]->{ "CHR_END" },
 533         REC_TYPE    => "BED",
 534         BED_LEN     => $entries->[ -1 ]->{ "CHR_END" } - $entries->[ 0 ]->{ "CHR_BEG" } + 1,
 535         BED_COLS    => 12,
 536         Q_ID        => join( ":", @q_ids ),
 537         SCORE       => 999,
 538         STRAND      => $entries->[ 0 ]->{ "STRAND" }     || "+",
 539         THICK_BEG   => $entries->[ 0 ]->{ "THICK_BEG" }  || $entries->[ 0 ]->{ "CHR_BEG" },
 540         THICK_END   => $entries->[ -1 ]->{ "THICK_END" } || $entries->[ -1 ]->{ "CHR_END" },
 541         ITEMRGB     => "0,0,0",
 542         BLOCKCOUNT  => scalar @new_q_begs,
 543         BLOCKSIZES  => join( ",", @new_blocksizes ),
 544         Q_BEGS      => join( ",", @new_q_begs ),
 545     );
 546
 547     return wantarray ? %new_entry : \%new_entry;
 548 }
 549
 550
 551 sub bed_split_entry
 552 {
 553     # Martin A. Hansen, February 2008.
 554
 555     # Splits a given BED entry into a list of blocks,
 556     # which are returned. A list of 6 column BED entry is returned.
 557
 558     my ( $entry,    # BED entry hashref
 559        ) = @_;
 560
 561     # Returns a list.
 562
 563     my ( @q_begs, @blocksizes, $block, @blocks, $i );
 564
 565     if ( exists $entry->{ "BLOCKCOUNT" } )
 566     {
 567         @q_begs     = split ",", $entry->{ "Q_BEGS" };
 568         @blocksizes = split ",", $entry->{ "BLOCKSIZES" };
 569
 570         for ( $i = 0; $i < @q_begs; $i++ )
 571         {
 572             undef $block;
 573
 574             $block->{ "CHR" }      = $entry->{ "CHR" };
 575             $block->{ "CHR_BEG" }  = $entry->{ "CHR_BEG" } + $q_begs[ $i ];
 576             $block->{ "CHR_END" }  = $entry->{ "CHR_BEG" } + $q_begs[ $i ] + $blocksizes[ $i ] - 1;
 577             $block->{ "Q_ID" }     = $entry->{ "Q_ID" } . sprintf( "_%03d", $i );
 578             $block->{ "SCORE" }    = $entry->{ "SCORE" };
 579             $block->{ "STRAND" }   = $entry->{ "STRAND" };
 580             $block->{ "BED_LEN" }  = $block->{ "CHR_END" } - $block->{ "CHR_BEG" } + 1,
 581             $block->{ "BED_COLS" } = 6;
 582             $block->{ "REC_TYPE" } = "BED";
 583
 584             push @blocks, $block;
 585         }
 586     }
 587     else
 588     {
 589         @blocks = @{ $entry };
 590     }
 591
 592     return wantarray ? @blocks : \@blocks;
 593 }
 594
 595
 596
 597 sub bed_overlap
 598 {
 599     # Martin A. Hansen, February 2008.
 600
 601     # Checks if two BED entries overlap and
 602     # return 1 if so - else 0;
 603
 604     my ( $entry1,      # hashref
 605          $entry2,      # hashref
 606          $no_strand,   # don't check strand flag - OPTIONAL
 607        ) = @_;
 608
 609     # Return bolean.
 610
 611     return 0 if $entry1->{ "CHR" }    ne $entry2->{ "CHR" };
 612     return 0 if $entry1->{ "STRAND" } ne $entry2->{ "STRAND" };
 613
 614     if ( $entry1->{ "CHR_END" } < $entry2->{ "CHR_BEG" } or $entry1->{ "CHR_BEG" } > $entry2->{ "CHR_END" } ) {
 615         return 0;
 616     } else {
 617         return 1;
 618     }
 619 }
 620
 621
 622 sub bed_upload_to_ucsc
 623 {
 624     # Martin A. Hansen, September 2007.
 625
 626     # Upload a BED file to the UCSC database.
 627
 628     my ( $tmp_dir,   # temporary directory
 629          $file,      # file to upload,
 630          $options,   # argument hashref
 631          $append,    # flag indicating table should be appended
 632        ) = @_;
 633
 634     # Returns nothing.
 635
 636     my ( $args, $table, $sql_file, $fh_out, $fh_in );
 637
 638     if ( $append ) {
 639         $args = join " ", $options->{ "database" }, $options->{ "table" }, "-tmpDir=$tmp_dir", "-oldTable", $file;
 640     } else {
 641         $args = join " ", $options->{ "database" }, $options->{ "table" }, "-tmpDir=$tmp_dir", $file;
 642     }
 643
 644     if ( $options->{ "table" } =~ /rnaSecStr/ )
 645     {
 646         $table = $options->{ "table" };
 647
 648         print qq(uploading secondary structure table:"$table"\n) if $options->{ "verbose" };
 649
 650         $sql_file = "$tmp_dir/upload_RNA_SS.sql";
 651
 652         $fh_out   = Maasha::Common::write_open( $sql_file );
 653
 654         print $fh_out qq(
 655 CREATE TABLE $table (
 656     bin smallint not null,              # Bin number for browser speedup
 657     chrom varchar(255) not null,        # Chromosome or FPC contig
 658     chromStart int unsigned not null,   # Start position in chromosome
 659     chromEnd int unsigned not null,     # End position in chromosome
 660     name varchar(255) not null,         # Name of item
 661     score int unsigned not null,        # Score from 0-1000
 662     strand char(1) not null,            # + or -
 663     size int unsigned not null,         # Size of element.
 664     secStr longblob not null,           # Parentheses and '.'s which define the secondary structure
 665     conf longblob not null,             # Confidence of secondary-structure annotation per position (0.0-1.0).
 666     #Indices
 667     INDEX(name(16)),
 668     INDEX(chrom(8), bin),
 669     INDEX(chrom(8), chromStart)
 670 );
 671         );
 672
 673         close $fh_out;
 674
 675         Maasha::Common::run( "hgLoadBed", "-notItemRgb -sqlTable=$sql_file $options->{ 'database' } $options->{ 'table' } -tmpDir=$tmp_dir $file > /dev/null 2>&1" );
 676
 677         unlink $sql_file;
 678     }
 679     else
 680     {
 681         Maasha::Common::run( "hgLoadBed", "$args > /dev/null 2>&1" );
 682     }
 683 }
 684
 685
 686 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PSL format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 687
 688
 689 sub psl_get_entry
 690 {
 691     # Martin A. Hansen, August 2008.
 692
 693     # Reads PSL next entry from a PSL file and returns a record.
 694
 695     my ( $fh,   # file handle of PSL filefull path to PSL file
 696        ) = @_;
 697
 698     # Returns hashref.
 699
 700     my ( $line, @fields, %record );
 701
 702     while ( $line = <$fh> )
 703     {
 704         chomp $line;
 705
 706         @fields = split "\t", $line;
 707
 708         if ( scalar @fields == 21 )
 709         {
 710             %record = (
 711                 REC_TYPE    => "PSL",
 712                 MATCHES     => $fields[ 0 ],
 713                 MISMATCHES  => $fields[ 1 ],
 714                 REPMATCHES  => $fields[ 2 ],
 715                 NCOUNT      => $fields[ 3 ],
 716                 QNUMINSERT  => $fields[ 4 ],
 717                 QBASEINSERT => $fields[ 5 ],
 718                 SNUMINSERT  => $fields[ 6 ],
 719                 SBASEINSERT => $fields[ 7 ],
 720                 STRAND      => $fields[ 8 ],
 721                 Q_ID        => $fields[ 9 ],
 722                 Q_LEN       => $fields[ 10 ],
 723                 Q_BEG       => $fields[ 11 ],
 724                 Q_END       => $fields[ 12 ] - 1,
 725                 S_ID        => $fields[ 13 ],
 726                 S_LEN       => $fields[ 14 ],
 727                 S_BEG       => $fields[ 15 ],
 728                 S_END       => $fields[ 16 ] - 1,
 729                 BLOCKCOUNT  => $fields[ 17 ],
 730                 BLOCKSIZES  => $fields[ 18 ],
 731                 Q_BEGS      => $fields[ 19 ],
 732                 S_BEGS      => $fields[ 20 ],
 733             );
 734
 735             $record{ "SCORE" } = $record{ "MATCHES" } + int( $record{ "REPMATCHES" } / 2 ) - $record{ "MISMATCHES" } - $record{ "QNUMINSERT" } - $record{ "SNUMINSERT" };
 736             $record{ "SPAN" }  = $fields[ 12 ] - $fields[ 11 ];
 737
 738             return wantarray ? %record : \%record;
 739         }
 740     }
 741
 742     return undef;
 743 }
 744
 745
 746 sub psl_get_entries
 747 {
 748     # Martin A. Hansen, February 2008.
 749
 750     # Reads PSL entries and returns a list of records.
 751
 752     my ( $path,   # full path to PSL file
 753        ) = @_;
 754
 755     # Returns hashref.
 756
 757     my ( $fh, @lines, @fields, $i, %record, @records );
 758
 759     $fh = Maasha::Common::read_open( $path );
 760
 761     @lines = <$fh>;
 762
 763     close $fh;
 764
 765     chomp @lines;
 766
 767     for ( $i = 5; $i < @lines; $i++ )
 768     {
 769         @fields = split "\t", $lines[ $i ];
 770
 771         Maasha::Common::error( qq(Bad PSL format in file "$path") ) if not @fields == 21;
 772
 773         undef %record;
 774
 775         %record = (
 776             REC_TYPE    => "PSL",
 777             MATCHES     => $fields[ 0 ],
 778             MISMATCHES  => $fields[ 1 ],
 779             REPMATCHES  => $fields[ 2 ],
 780             NCOUNT      => $fields[ 3 ],
 781             QNUMINSERT  => $fields[ 4 ],
 782             QBASEINSERT => $fields[ 5 ],
 783             SNUMINSERT  => $fields[ 6 ],
 784             SBASEINSERT => $fields[ 7 ],
 785             STRAND      => $fields[ 8 ],
 786             Q_ID        => $fields[ 9 ],
 787             Q_LEN       => $fields[ 10 ],
 788             Q_BEG       => $fields[ 11 ],
 789             Q_END       => $fields[ 12 ] - 1,
 790             S_ID        => $fields[ 13 ],
 791             S_LEN       => $fields[ 14 ],
 792             S_BEG       => $fields[ 15 ],
 793             S_END       => $fields[ 16 ] - 1,
 794             BLOCKCOUNT  => $fields[ 17 ],
 795             BLOCKSIZES  => $fields[ 18 ],
 796             Q_BEGS      => $fields[ 19 ],
 797             S_BEGS      => $fields[ 20 ],
 798         );
 799
 800         $record{ "SCORE" } = $record{ "MATCHES" } + int( $record{ "REPMATCHES" } / 2 ) - $record{ "MISMATCHES" } - $record{ "QNUMINSERT" } - $record{ "SNUMINSERT" };
 801
 802         push @records, { %record };
 803     }
 804
 805     return wantarray ? @records : \@records;
 806 }
 807
 808
 809 sub psl_put_header
 810 {
 811     # Martin A. Hansen, September 2007.
 812
 813     # Write a PSL header to file.
 814
 815     my ( $fh,  # file handle  - OPTIONAL
 816        ) = @_;
 817
 818     # Returns nothing.
 819
 820     $fh = \*STDOUT if not $fh;
 821
 822     print $fh qq(psLayout version 3
 823 match   mis-    rep.    N's     Q gap   Q gap   T gap   T gap   strand  Q               Q       Q       Q       T               T       T       T       block   blockSizes      qStart        match   match           count   bases   count   bases           name            size    start   end     name            size    start   end     count
 824 ---------------------------------------------------------------------------------------------------------------------------------------------------------------
 825 );
 826 }
 827
 828
 829 sub psl_put_entry
 830 {
 831     # Martin A. Hansen, September 2007.
 832
 833     # Write a PSL entry to file.
 834
 835     my ( $record,       # hashref
 836          $fh,           # file handle  -  OPTIONAL
 837        ) = @_;
 838
 839     # Returns nothing.
 840
 841     $fh = \*STDOUT if not $fh;
 842
 843     my @output;
 844
 845     push @output, $record->{ "MATCHES" };
 846     push @output, $record->{ "MISMATCHES" };
 847     push @output, $record->{ "REPMATCHES" };
 848     push @output, $record->{ "NCOUNT" };
 849     push @output, $record->{ "QNUMINSERT" };
 850     push @output, $record->{ "QBASEINSERT" };
 851     push @output, $record->{ "SNUMINSERT" };
 852     push @output, $record->{ "SBASEINSERT" };
 853     push @output, $record->{ "STRAND" };
 854     push @output, $record->{ "Q_ID" };
 855     push @output, $record->{ "Q_LEN" };
 856     push @output, $record->{ "Q_BEG" };
 857     push @output, $record->{ "Q_END" } + 1;
 858     push @output, $record->{ "S_ID" };
 859     push @output, $record->{ "S_LEN" };
 860     push @output, $record->{ "S_BEG" };
 861     push @output, $record->{ "S_END" } + 1;
 862     push @output, $record->{ "BLOCKCOUNT" };
 863     push @output, $record->{ "BLOCKSIZES" };
 864     push @output, $record->{ "Q_BEGS" };
 865     push @output, $record->{ "S_BEGS" };
 866
 867     print $fh join( "\t", @output ), "\n";
 868 }
 869
 870
 871 sub psl_upload_to_ucsc
 872 {
 873     # Martin A. Hansen, September 2007.
 874
 875     # Upload a PSL file to the UCSC database.
 876
 877     my ( $file,      # file to upload,
 878          $options,   # argument hashref
 879          $append,    # flag indicating table should be appended
 880        ) = @_;
 881
 882     # Returns nothing.
 883
 884     my ( $args );
 885
 886     if ( $append ) {
 887         $args = join " ", $options->{ "database" }, "-table=$options->{ 'table' }", "-clientLoad", "-append", $file;
 888     } else {
 889         $args = join " ", $options->{ "database" }, "-table=$options->{ 'table' }", "-clientLoad", $file;
 890     }
 891
 892     Maasha::Common::run( "hgLoadPsl", "$args > /dev/null 2>&1" );
 893 }
 894
 895
 896 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> TRACK FILE <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 897
 898
 899 sub ucsc_config_entry_get
 900 {
 901     # Martin A. Hansen, November 2008.
 902
 903     # Given a filehandle to a UCSC Genome Browser
 904     # config file (.ra) get the next entry and
 905     # return as a hash. Entries are separated by blank
 906     # lines. # lines are skipped unless it is the lines:
 907     # # Track added ...
 908     # # Database ...
 909
 910     my ( $fh,   # file hanlde
 911        ) = @_;
 912
 913     # Returns a hashref.
 914
 915     my ( $line, %record );
 916
 917     while ( $line = <$fh> )
 918     {
 919         if ( $line =~ /Track added by 'upload_to_ucsc' (\S+) (\S+)/) {
 920              $record{ 'date' } = $1;
 921              $record{ 'time' } = $2;
 922         } elsif ( $line =~ /^# date: (.+)/ ) {
 923             $record{ 'date' } = $1;
 924         } elsif ( $line =~ /^# time: (.+)/ ) {
 925             $record{ 'time' } = $1;
 926         } elsif ( $line =~ /^# (?:database:|Database) (.+)/ ) {
 927             $record{ 'database' } = $1;
 928         } elsif ( $line =~ /^#/ ) {
 929             # skip
 930         } elsif ( $line =~ /(\S+)\s+(.+)/ ) {
 931             $record{ $1 } = $2;
 932         }
 933
 934         last if $line =~ /^$/ and exists $record{ "track" };
 935     }
 936
 937     return undef if not exists $record{ "track" };
 938
 939     return wantarray ? %record : \%record;
 940 }
 941
 942
 943 sub ucsc_config_entry_put
 944 {
 945     # Martin A. Hansen, November 2008.
 946
 947     # Outputs a Biopiece record (a hashref)
 948     # to a filehandle or STDOUT.
 949
 950     my ( $record,    # hashref
 951          $fh_out,    # file handle
 952        ) = @_;
 953
 954     # Returns nothing.
 955
 956     my ( $date, $time );
 957
 958     $fh_out ||= \*STDOUT;
 959
 960     ( $date, $time ) = split " ", Maasha::Common::time_stamp();
 961
 962     $record->{ "date" } ||= $date;
 963     $record->{ "time" } ||= $time;
 964
 965     print $fh_out "\n# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n\n";
 966
 967     map { print $fh_out "# $_: $record->{ $_ }\n" if exists $record->{ $_ } } qw( date time database );
 968     map { print $fh_out "$_ $record->{ $_ }\n" if exists $record->{ $_ } } qw( track
 969                                                                                name
 970                                                                                description
 971                                                                                itemRgb
 972                                                                                db
 973                                                                                offset
 974                                                                                url
 975                                                                                htmlUrl
 976                                                                                shortLabel
 977                                                                                longLabel
 978                                                                                group
 979                                                                                priority
 980                                                                                useScore
 981                                                                                visibility
 982                                                                                maxHeightPixels
 983                                                                                color
 984                                                                                mafTrack
 985                                                                                type );
 986 }
 987
 988
 989 sub ucsc_update_config
 990 {
 991     # Martin A. Hansen, September 2007.
 992
 993     # Update the /home/user/ucsc/my_tracks.ra file and executes makeCustomTracks.pl
 994
 995     my ( $options,   # hashref
 996          $type,      # track type
 997        ) = @_;
 998
 999     # Returns nothing.
1000
1001     my ( $file, %entry, $fh_out );
1002
1003     $file = $ENV{ "HOME" } . "/ucsc/my_tracks.ra";
1004
1005     Maasha::Filesys::file_copy( $file, "$file~" );   # create a backup
1006
1007     %entry = (
1008         database   => $options->{ 'database' },
1009         track      => $options->{ 'table' },
1010         shortLabel => $options->{ 'short_label' },
1011         longLabel  => $options->{ 'long_label' },
1012         group      => $options->{ 'group' },
1013         priority   => $options->{ 'priority' },
1014         useScore   => $options->{ 'use_score' },
1015         visibility => $options->{ 'visibility' },
1016         color      => $options->{ 'color' },
1017         type       => $type,
1018     );
1019
1020     $entry{ 'mafTrack' }        = "multiz17way" if $type eq "type bed 6 +";
1021     $entry{ 'maxHeightPixels' } = "50:50:11"    if $type eq "wig 0";
1022
1023     $fh_out = Maasha::Filesys::file_append_open( $file );
1024
1025     ucsc_config_entry_put( \%entry, $fh_out );
1026
1027     close $fh_out;
1028
1029     Maasha::Common::run( "ucscMakeTracks.pl", "-b > /dev/null 2>&1" );
1030 }
1031
1032
1033 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> fixedStep format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1034
1035
1036 sub fixedstep_get_entry
1037 {
1038     # Martin A. Hansen, December 2007.
1039
1040     # Given a file handle to a PhastCons file get the
1041     # next entry which is all the lines after a "fixedStep"
1042     # line and until the next "fixedStep" line or EOF.
1043
1044     my ( $fh,   # filehandle
1045        ) = @_;
1046
1047     # Returns a list of lines
1048
1049     my ( $entry, @lines );
1050
1051     local $/ = "\nfixedStep ";
1052
1053     $entry = <$fh>;
1054
1055     chomp $entry;
1056
1057     @lines = split "\n", $entry;
1058
1059     return if @lines == 0;
1060
1061     $lines[ 0 ] =~ s/fixedStep?\s*//;
1062
1063     return wantarray ? @lines : \@lines;
1064 }
1065
1066
1067 sub fixedstep_index_create
1068 {
1069     # Martin A. Hansen, January 2008.
1070
1071     # Indexes a concatenated fixedStep file.
1072     # The index consists of a hash with chromosomes as keys,
1073     # and a list of [ chr_beg, next_chr_beg, chr_end, index_beg, index_len ] as values.
1074
1075     my ( $path,   # path to fixedStep file
1076        ) = @_;
1077
1078     # Returns a hashref
1079
1080     my ( $fh, $pos, $index_beg, $index_len, $entry, $locator, $chr, $step, $beg, $end, $len, %index, $i );
1081
1082     $fh = Maasha::Common::read_open( $path );
1083
1084     $pos = 0;
1085
1086     while ( $entry = Maasha::UCSC::fixedstep_get_entry( $fh ) )
1087     {
1088         $locator = shift @{ $entry };
1089
1090         if ( $locator =~ /chrom=([^ ]+) start=(\d+) step=(\d+)/ )
1091         {
1092             $chr  = $1;
1093             $beg  = $2 - 1;  #  fixedStep files are 1-based
1094             $step = $3;
1095         }
1096         else
1097         {
1098             Maasha::Common::error( qq(Could not parse locator: $locator) );
1099         }
1100
1101         $pos += length( $locator ) + 11;
1102
1103         $index_beg = $pos;
1104
1105 #        map { $pos += length( $_ ) + 1 } @{ $entry };
1106
1107         $pos += 6 * scalar @{ $entry };
1108
1109         $index_len = $pos - $index_beg;
1110
1111         push @{ $index{ $chr } }, [ $beg, undef, $beg + scalar @{ $entry } - 1, $index_beg, $index_len ];
1112     }
1113
1114     close $fh;
1115
1116     foreach $chr ( keys %index )
1117     {
1118         for ( $i = 0; $i < @{ $index{ $chr } } - 1; $i++ ) {
1119             $index{ $chr }->[ $i ]->[ FS_NEXT_CHR_BEG ] = $index{ $chr }->[ $i + 1 ]->[ 0 ];
1120         }
1121
1122         $index{ $chr }->[ -1 ]->[ FS_NEXT_CHR_BEG ] = $index{ $chr }->[ -1 ]->[ FS_CHR_END ] + 1;
1123     }
1124
1125     return wantarray ? %index : \%index;
1126 }
1127
1128
1129 sub fixedstep_index_store
1130 {
1131     # Martin A. Hansen, January 2008.
1132
1133     # Writes a fixedStep index to binary file.
1134
1135     my ( $path,   # full path to file
1136          $index,  # list with index
1137        ) = @_;
1138
1139     # returns nothing
1140
1141     Maasha::Common::file_store( $path, $index );
1142 }
1143
1144
1145 sub fixedstep_index_retrieve
1146 {
1147     # Martin A. Hansen, January 2008.
1148
1149     # Retrieves a fixedStep index from binary file.
1150
1151     my ( $path,   # full path to file
1152        ) = @_;
1153
1154     # returns list
1155
1156     my $index;
1157
1158     $index = Maasha::Common::file_retrieve( $path );
1159
1160     return wantarray ? %{ $index } : $index;
1161 }
1162
1163
1164 sub fixedstep_index_lookup
1165 {
1166     # Martin A. Hansen, January 2008.
1167
1168     # Retrieve fixedStep scores from a indexed
1169     # fixedStep file given a chromosome and
1170     # begin and end positions.
1171
1172     my ( $index,     # data structure
1173          $fh,        # filehandle to datafile
1174          $chr,       # chromosome
1175          $chr_beg,   # chromosome beg
1176          $chr_end,   # chromosome end
1177          $flank,     # include flanking region - OPTIONAL
1178        ) = @_;
1179
1180     # Returns a list
1181
1182     my ( $index_beg, $index_end, $i, $c, $beg, $end, @vals, $scores );
1183
1184     $flank ||= 0;
1185
1186     $chr_beg -= $flank;
1187     $chr_end += $flank;
1188
1189     # print "chr_beg->$chr_beg   chr_end->$chr_end   flank->$flank\n";
1190
1191     if ( exists $index->{ $chr } )
1192     {
1193         $index_beg = Maasha::Matrix::interval_search( $index->{ $chr }, 0, 1, $chr_beg );
1194
1195         if ( $index_beg < 0 ) {
1196             Maasha::Common::error( qq(Index search failed - begin index position doesn't exists: $chr_beg) );
1197         }
1198
1199         if ( $chr_end < $index->{ $chr }->[ $index_beg ]->[ 1 ] )
1200         {
1201             $index_end = $index_beg;
1202         }
1203         else
1204         {
1205             $index_end = Maasha::Matrix::interval_search( $index->{ $chr }, 0, 1, $chr_end );
1206
1207             if ( $index_end < 0 ) {
1208                 Maasha::Common::error( qq(Index search failed - end index position doesn't exists: $chr_end) );
1209             }
1210         }
1211
1212         map { $scores->[ $_ ] = 0 } 0 .. $chr_end - $chr_beg;
1213
1214         if ( $index_beg == $index_end )
1215         {
1216             $beg = Maasha::Calc::max( $chr_beg, $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] );
1217             $end = Maasha::Calc::min( $chr_end, $index->{ $chr }->[ $index_end ]->[ FS_CHR_END ] );
1218
1219             if ( $beg <= $index->{ $chr }->[ $index_beg ]->[ FS_CHR_END ] and $end >= $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] )
1220             {
1221                 @vals = split "\n", Maasha::Common::file_read(
1222                     $fh,
1223                     $index->{ $chr }->[ $index_beg ]->[ FS_INDEX_BEG ] + 6 * ( $beg - $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] ),
1224                     6 * ( $end - $beg + 1 ),
1225                 );
1226             }
1227
1228             for ( $c = 0; $c < @vals; $c++ ) {
1229                 $scores->[ $c + $beg - $chr_beg ] = $vals[ $c ];
1230             }
1231         }
1232         else
1233         {
1234             $beg = Maasha::Calc::max( $chr_beg, $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] );
1235
1236 #            print Dumper( $beg, $index->{ $chr }->[ $index_beg ] );
1237 #            print Dumper( "next", $index->{ $chr }->[ $index_beg ]->[ FS_NEXT_CHR_BEG ] );
1238
1239             #      beg         next
1240             #      v           v
1241             #  |||||||||.......
1242
1243             if ( $beg <= $index->{ $chr }->[ $index_beg ]->[ FS_CHR_END ] )
1244             {
1245                 @vals = split "\n", Maasha::Common::file_read(
1246                     $fh,
1247                     $index->{ $chr }->[ $index_beg ]->[ FS_INDEX_BEG ] + 6 * ( $beg - $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] ),
1248                     6 * ( $index->{ $chr }->[ $index_beg ]->[ FS_CHR_END ] - $beg + 1 ),
1249                 );
1250
1251                 for ( $c = 0; $c < @vals; $c++ ) {
1252                     $scores->[ $c + $beg - $chr_beg ] = $vals[ $c ];
1253                 }
1254             }
1255
1256             $end = Maasha::Calc::min( $chr_end, $index->{ $chr }->[ $index_end ]->[ FS_CHR_END ] );
1257
1258             if ( $end <= $index->{ $chr }->[ $index_end ]->[ FS_CHR_END ] )
1259             {
1260                 @vals = split "\n", Maasha::Common::file_read(
1261                     $fh,
1262                     $index->{ $chr }->[ $index_end ]->[ FS_INDEX_BEG ],
1263                     6 * ( $end - $index->{ $chr }->[ $index_end ]->[ FS_CHR_BEG ] + 1 ),
1264                 );
1265
1266                 for ( $c = 0; $c < @vals; $c++ ) {
1267                     $scores->[ $c + $index->{ $chr }->[ $index_end ]->[ FS_CHR_BEG ] - $chr_beg ] = $vals[ $c ];
1268                 }
1269             }
1270
1271             for ( $i = $index_beg + 1; $i <= $index_end - 1; $i++ )
1272             {
1273                 @vals = split "\n", Maasha::Common::file_read(
1274                     $fh,
1275                     $index->{ $chr }->[ $i ]->[ FS_INDEX_BEG ],
1276                     6 * ( $index->{ $chr }->[ $i ]->[ FS_CHR_END ] - $index->{ $chr }->[ $i ]->[ FS_CHR_BEG ] + 1 ),
1277                 );
1278
1279                 for ( $c = 0; $c < @vals; $c++ ) {
1280                     $scores->[ $c + $index->{ $chr }->[ $i ]->[ FS_CHR_BEG ] - $chr_beg ] = $vals[ $c ];
1281                 }
1282             }
1283         }
1284     }
1285     else
1286     {
1287         Maasha::Common::error( qq(Chromosome "$chr" was not found in index) );
1288     }
1289
1290     return wantarray ? @{ $scores } : $scores;
1291 }
1292
1293
1294 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PhastCons format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1295
1296
1297 sub phastcons_index
1298 {
1299     # Martin A. Hansen, July 2008
1300
1301     # Create a fixedStep index for PhastCons data.
1302
1303     my ( $file,   # file to index
1304          $dir,    # dir with file
1305        ) = @_;
1306
1307     # Returns nothing.
1308
1309     my ( $index );
1310
1311     $index = fixedstep_index_create( "$dir/$file" );
1312
1313     fixedstep_index_store( "$dir/$file.index", $index );
1314 }
1315
1316
1317 sub phastcons_parse_entry
1318 {
1319     # Martin A. Hansen, December 2007.
1320
1321     # Given a PhastCons entry converts this to a
1322     # list of super blocks.
1323
1324     my ( $lines,   # list of lines
1325          $args,    # argument hash
1326        ) = @_;
1327
1328     # Returns
1329
1330     my ( $info, $chr, $beg, $step, $i, $c, $j, @blocks, @super_blocks, @entries, $super_block, $block, @lens, @begs );
1331
1332     $info = shift @{ $lines };
1333
1334     if ( $info =~ /^chrom=([^ ]+) start=(\d+) step=(\d+)$/ )
1335     {
1336         $chr  = $1;
1337         $beg  = $2;
1338         $step = $3;
1339
1340         die qq(ERROR: step size $step != 1 -> problem!\n) if $step != 1; # in an ideal world should would be fixed ...
1341     }
1342
1343     $i = 0;
1344
1345     while ( $i < @{ $lines } )
1346     {
1347         if ( $lines->[ $i ] >= $args->{ "threshold" } )
1348         {
1349             $c = $i + 1;
1350
1351             while ( $c < @{ $lines } )
1352             {
1353                 if ( $lines->[ $c ] < $args->{ "threshold" } )
1354                 {
1355                     $j = $c + 1;
1356
1357                     while ( $j < @{ $lines } and $lines->[ $j ] < $args->{ "threshold" } ) {
1358                         $j++;
1359                     }
1360
1361                     if ( $j - $c > $args->{ "gap" } )
1362                     {
1363                         if ( $c - $i >= $args->{ "min" } )
1364                         {
1365                             push @blocks, {
1366                                 CHR     => $chr,
1367                                 CHR_BEG => $beg + $i - 1,
1368                                 CHR_END => $beg + $c - 2,
1369                                 CHR_LEN => $c - $i,
1370                             };
1371                         }
1372
1373                         $i = $j;
1374
1375                         last;
1376                     }
1377
1378                     $c = $j
1379                 }
1380                 else
1381                 {
1382                     $c++;
1383                 }
1384             }
1385
1386             if ( $c - $i >= $args->{ "min" } )
1387             {
1388                 push @blocks, {
1389                     CHR     => $chr,
1390                     CHR_BEG => $beg + $i - 1,
1391                     CHR_END => $beg + $c - 2,
1392                     CHR_LEN => $c - $i,
1393                 };
1394             }
1395
1396             $i = $c;
1397         }
1398         else
1399         {
1400             $i++;
1401         }
1402     }
1403
1404     $i = 0;
1405
1406     while ( $i < @blocks )
1407     {
1408         $c = $i + 1;
1409
1410         while ( $c < @blocks and $blocks[ $c ]->{ "CHR_BEG" } - $blocks[ $c - 1 ]->{ "CHR_END" } <= $args->{ "dist" } )
1411         {
1412             $c++;
1413         }
1414
1415         push @super_blocks, [ @blocks[ $i .. $c - 1 ] ];
1416
1417         $i = $c;
1418     }
1419
1420     foreach $super_block ( @super_blocks )
1421     {
1422         foreach $block ( @{ $super_block } )
1423         {
1424             push @begs, $block->{ "CHR_BEG" } - $super_block->[ 0 ]->{ "CHR_BEG" };
1425             push @lens, $block->{ "CHR_LEN" } - 1;
1426         }
1427
1428         $lens[ -1 ]++;
1429
1430         push @entries, {
1431             CHR        => $super_block->[ 0 ]->{ "CHR" },
1432             CHR_BEG    => $super_block->[ 0 ]->{ "CHR_BEG" },
1433             CHR_END    => $super_block->[ -1 ]->{ "CHR_END" },
1434             Q_ID       => "Q_ID",
1435             SCORE      => 100,
1436             STRAND     => "+",
1437             THICK_BEG  => $super_block->[ 0 ]->{ "CHR_BEG" },
1438             THICK_END  => $super_block->[ -1 ]->{ "CHR_END" } + 1,
1439             ITEMRGB    => "0,200,100",
1440             BLOCKCOUNT => scalar @{ $super_block },
1441             BLOCKSIZES => join( ",", @lens ),
1442             Q_BEGS     => join( ",", @begs ),
1443         };
1444
1445         undef @begs;
1446         undef @lens;
1447     }
1448
1449     return wantarray ? @entries : \@entries;
1450 }
1451
1452
1453 sub phastcons_normalize
1454 {
1455     # Martin A. Hansen, January 2008.
1456
1457     # Normalizes a list of lists with PhastCons scores,
1458     # in such a way that each list contains the same number
1459     # or PhastCons scores.
1460
1461     my ( $AoA,    # AoA with PhastCons scores
1462        ) = @_;
1463
1464     # Returns AoA.
1465
1466     my ( $list, $max, $min, $mean, $diff );
1467
1468     $min = 99999999;
1469     $max = 0;
1470
1471     foreach $list ( @{ $AoA } )
1472     {
1473         $min = scalar @{ $list } if scalar @{ $list } < $min;
1474         $max = scalar @{ $list } if scalar @{ $list } > $max;
1475     }
1476
1477     $mean = int( ( $min + $max ) / 2 );
1478
1479 #    print STDERR "min->$min   max->$max   mean->$mean\n";
1480
1481     foreach $list ( @{ $AoA } )
1482     {
1483         $diff = scalar @{ $list } - $mean;
1484
1485         phastcons_list_inflate( $list, abs( $diff ) ) if $diff < 0;
1486         phastcons_list_deflate( $list, $diff )        if $diff > 0;
1487     }
1488
1489     return wantarray ? @{ $AoA } : $AoA;
1490 }
1491
1492
1493 sub phastcons_list_inflate
1494 {
1495     # Martin A. Hansen, January 2008.
1496
1497     # Inflates a list with a given number of elements
1498     # in such a way that the extra elements are introduced
1499     # evenly over the entire length of the list. The value
1500     # of the extra elements is based on a mean of the
1501     # adjacent elements.
1502
1503     my ( $list,   # list of elements
1504          $diff,   # number of elements to introduce
1505        ) = @_;
1506
1507     # Returns nothing
1508
1509     my ( $len, $space, $i, $pos );
1510
1511     $len = scalar @{ $list };
1512
1513     $space = $len / $diff;
1514
1515     for ( $i = 0; $i < $diff; $i++ )
1516     {
1517         $pos = int( ( $space / 2 ) + $i * $space );
1518
1519         splice @{ $list }, $pos, 0, ( $list->[ $pos - 1 ] + $list->[ $pos + 1 ] ) / 2;
1520         # splice @{ $list }, $pos, 0, "X";
1521     }
1522
1523     die qq(ERROR: bad inflate\n) if scalar @{ $list } != $len + $diff;
1524 }
1525
1526
1527 sub phastcons_list_deflate
1528 {
1529     # Martin A. Hansen, January 2008.
1530
1531     # Deflates a list by removing a given number of elements
1532     # evenly distributed over the entire list.
1533
1534     my ( $list,   # list of elements
1535          $diff,   # number of elements to remove
1536        ) = @_;
1537
1538     # Returns nothing
1539
1540     my ( $len, $space, $i, $pos );
1541
1542     $len = scalar @{ $list };
1543
1544     $space = ( $len - $diff ) / $diff;
1545
1546     for ( $i = 0; $i < $diff; $i++ )
1547     {
1548         $pos = int( ( $space / 2 ) + $i * $space );
1549
1550         splice @{ $list }, $pos, 1;
1551     }
1552
1553     die qq(ERROR: bad deflate\n) if scalar @{ $list } != $len - $diff;
1554 }
1555
1556
1557 sub phastcons_mean
1558 {
1559     # Martin A. Hansen, January 2008.
1560
1561     # Given a normalized PhastCons matrix in an AoA,
1562     # calculate the mean for each column and return as a list.
1563
1564     my ( $AoA,    # AoA with normalized PhastCons scores
1565        ) = @_;
1566
1567     # Returns a list
1568
1569     my ( @list );
1570
1571     $AoA = Maasha::Matrix::matrix_flip( $AoA );
1572
1573     map { push @list, Maasha::Calc::mean( $_ ) } @{ $AoA };
1574
1575     return wantarray ? @list : \@list;
1576 }
1577
1578
1579 sub phastcons_median
1580 {
1581     # Martin A. Hansen, January 2008.
1582
1583     # Given a normalized PhastCons matrix in an AoA,
1584     # calculate the median for each column and return as a list.
1585
1586     my ( $AoA,    # AoA with normalized PhastCons scores
1587        ) = @_;
1588
1589     # Returns a list
1590
1591     my ( @list );
1592
1593     $AoA = Maasha::Matrix::matrix_flip( $AoA );
1594
1595     map { push @list, Maasha::Calc::median( $_ ) } @{ $AoA };
1596
1597     return wantarray ? @list : \@list;
1598 }
1599
1600
1601 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> MULTIPLE ALIGNMENT FILES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1602
1603
1604 sub maf_extract
1605 {
1606     # Martin A. Hansen, April 2008.
1607
1608     # Executes mafFrag to extract a subalignment from a multiz track
1609     # in the UCSC genome browser database.
1610
1611     my ( $tmp_dir,    # temporary directory
1612          $database,   # genome database
1613          $table,      # table with the multiz track
1614          $chr,        # chromosome
1615          $beg,        # begin position
1616          $end,        # end position
1617          $strand,     # strand
1618        ) = @_;
1619
1620     # Returns a list of record
1621
1622     my ( $tmp_file, $align );
1623
1624     $tmp_file = "$tmp_dir/maf_extract.maf";
1625
1626     Maasha::Common::run( "mafFrag", "$database $table $chr $beg $end $strand $tmp_file" );
1627
1628     $align = maf_parse( $tmp_file );
1629
1630     unlink $tmp_file;
1631
1632     return wantarray ? @{ $align } : $align;
1633 }
1634
1635
1636 sub maf_parse
1637 {
1638     # Martin A. Hansen, April 2008.
1639
1640
1641     my ( $path,   # full path to MAF file
1642        ) = @_;
1643
1644     # Returns a list of record.
1645
1646     my ( $fh, $line, @fields, @align );
1647
1648     $fh = Maasha::Common::read_open( $path );
1649
1650     while ( $line = <$fh> )
1651     {
1652         chomp $line;
1653
1654         if ( $line =~ /^s/ )
1655         {
1656             @fields = split / /, $line;
1657
1658             push @align, {
1659                 SEQ_NAME  => $fields[ 1 ],
1660                 SEQ       => $fields[ -1 ],
1661                 ALIGN     => 1,
1662                 ALIGN_LEN => length $fields[ -1 ],
1663             }
1664         }
1665     }
1666
1667     close $fh;
1668
1669     return wantarray ? @align : \@align;
1670 }
1671
1672
1673 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> WIGGLE FORMAT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1674
1675
1676 sub fixedstep_put_entry
1677 {
1678     # Martin A. Hansen, April 2008.
1679
1680     # Outputs a block of fixedStep values.
1681     # Used for outputting wiggle data.
1682
1683     my ( $chr,      # chromosome
1684          $beg,      # start position
1685          $block,    # list of scores
1686          $fh,       # filehandle - OPTIONAL
1687          $log10,    # flag indicating that log10 scores should be used
1688        ) = @_;
1689
1690     # Returns nothing.
1691
1692     $beg += 1;   # fixedStep format is 1 based.
1693
1694     $fh ||= \*STDOUT;
1695
1696     print $fh "fixedStep chrom=$chr start=$beg step=1\n";
1697
1698     if ( $log10 ) {
1699         map { printf( $fh "%f\n", Maasha::Calc::log10( $_ + 1 ) ) } @{ $block };
1700     } else {
1701         map { printf( $fh "%d\n", ( $_ + 1 ) ) } @{ $block };
1702     }
1703 }
1704
1705
1706 sub wiggle_upload_to_ucsc
1707 {
1708     # Martin A. Hansen, May 2008.
1709
1710     # Upload a wiggle file to the UCSC database.
1711
1712     my ( $tmp_dir,    # temporary directory
1713          $wib_dir,    # wib directory
1714          $wig_file,   # file to upload,
1715          $options,    # argument hashref
1716        ) = @_;
1717
1718     # Returns nothing.
1719
1720     my ( $args );
1721
1722 #    $args = join " ", "-tmpDir=$tmp_dir", "-pathPrefix=$wib_dir", $options->{ "database" }, $options->{ 'table' }, $wig_file;
1723
1724 #    Maasha::Common::run( "hgLoadWiggle", "$args > /dev/null 2>&1" );
1725
1726     if ( $options->{ 'verbose' } ) {
1727         `cd $tmp_dir && hgLoadWiggle -tmpDir=$tmp_dir -pathPrefix=$wib_dir $options->{ 'database' } $options->{ 'table' } $wig_file`;
1728     } else {
1729         `cd $tmp_dir && hgLoadWiggle -tmpDir=$tmp_dir -pathPrefix=$wib_dir $options->{ 'database' } $options->{ 'table' } $wig_file > /dev/null 2>&1`;
1730     }
1731 }
1732
1733
1734 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> MySQL CONF <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1735
1736
1737 sub ucsc_get_user
1738 {
1739     # Martin A. Hansen, May 2008
1740
1741     # Fetches the MySQL database user name from the
1742     # .hg.conf file in the users home directory.
1743
1744     # Returns a string.
1745
1746     my ( $fh, $line, $user );
1747
1748     $fh = Maasha::Common::read_open( "$ENV{ 'HOME' }/.hg.conf" );
1749
1750     while ( $line = <$fh> )
1751     {
1752         chomp $line;
1753
1754         if ( $line =~ /^db\.user=(.+)/ )
1755         {
1756             $user = $1;
1757
1758             last;
1759         }
1760     }
1761
1762     close $fh;
1763
1764     return $user;
1765 }
1766
1767
1768 sub ucsc_get_password
1769 {
1770     # Martin A. Hansen, May 2008
1771
1772     # Fetches the MySQL database password from the
1773     # .hg.conf file in the users home directory.
1774
1775     # Returns a string.
1776
1777     my ( $fh, $line, $password );
1778
1779     $fh = Maasha::Common::read_open( "$ENV{ 'HOME' }/.hg.conf" );
1780
1781     while ( $line = <$fh> )
1782     {
1783         chomp $line;
1784
1785         if ( $line =~ /^db\.password=(.+)/ )
1786         {
1787             $password = $1;
1788
1789             last;
1790         }
1791     }
1792
1793     close $fh;
1794
1795     return $password;
1796 }
1797
1798
1799 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1800
1801
1802 __END__
1803