CHR_END => 2,
INDEX_BEG => 3,
INDEX_LEN => 4,
+
+ CHR => 0,
+ CHR_BEG => 1,
+ CHR_END => 2,
+ Q_ID => 3,
+ SCORE => 4,
+ STRAND => 5,
+ THICK_BEG => 6,
+ THICK_END => 7,
+ ITEMRGB => 8,
+ BLOCKCOUNT => 9,
+ BLOCKSIZES => 10,
+ Q_BEGS => 11,
};
@ISA = qw( Exporter );
# http://genome.ucsc.edu/goldenPath/help/hgTracksHelp.html#BED
+sub bed_entry_get_array
+{
+ # Martin A. Hansen, September 2008.
+
+ # Reads a BED entry given a filehandle.
+
+ # This is a new _faster_ BED entry parser that
+ # uses arrays and not hashrefs.
+
+ # IMPORTANT! This function does not correct the
+ # BED_END position that is kept the way UCSC
+ # does.
+
+ my ( $fh, # file handle
+ $cols, # columns to read - OPTIONAL (3,4,5,6, or 12)
+ ) = @_;
+
+ # Returns a list.
+
+ my ( $line, @entry );
+
+ $line = <$fh>;
+
+ $line =~ tr/\n\r//d; # some people have carriage returns in their BED files -> Grrrr
+
+ return if not defined $line;
+
+ if ( not defined $cols ) {
+ $cols = 1 + $line =~ tr/\t//;
+ }
+
+ @entry = split "\t", $line, $cols + 1;
+
+ pop @entry if scalar @entry > $cols;
+
+ return wantarray ? @entry : \@entry;
+}
+
+
sub bed_get_entry
{
# Martin A. Hansen, December 2007.
$line = <$fh>;
- $line =~ s/(\n|\r)$//g; # some people have carriage returns in their BED files -> Grrrr
+ $line =~ tr/\n\r//d; # some people have carriage returns in their BED files -> Grrrr
return if not defined $line;
}
+sub bed_entry_put_array
+{
+ # Martin A. Hansen, Septermber 2008.
+
+ # Writes a BED entry array to file.
+
+ # IMPORTANT! This function does not correct the
+ # BED_END position that is assumed to be in the
+ # UCSC positions scheme.
+
+ my ( $record, # list
+ $fh, # file handle - OPTIONAL
+ $cols, # number of columns in BED file - OPTIONAL
+ ) = @_;
+
+ # Returns nothing.
+
+ $fh = \*STDOUT if not defined $fh;
+
+ if ( defined $cols ) {
+ print $fh join( "\t", @{ $record }[ 0 .. $cols - 1 ] ), "\n";
+ } else {
+ print $fh join( "\t", @{ $record } ), "\n";
+ }
+}
+
+
sub bed_put_entry
{
# Martin A. Hansen, Septermber 2007.
sub bed_sort
{
- # Martin A. Hansen, March 2008.
+ # Martin A. Hansen, Septermber 2008
- # Sort a potential huge BED file according to
- # CHR, CHR_BEG and optionally STRAND.
+ # Sorts a BED file using the c program
+ # "bed_sort" specifing a sort mode:
- my ( $tmp_dir, # temporary directory used for sorting
- $file, # BED file to sort
- $strand, # flag to sort on strand - OPTIONAL
- ) = @_;
+ # 1: chr AND chr_beg.
+ # 2: chr AND strand AND chr_beg.
+ # 3: chr_beg.
+ # 4: strand AND chr_beg.
- # Returns nothing.
+ my ( $bed_file, # BED file to sort
+ $sort_mode, # See above.
+ $cols, # Number of columns in BED file
+ ) = @_;
- my ( $fh_in, $key, $fh_out, %fh_hash, $part_file, $entry, $entries );
+ &Maasha::Common::run( "bed_sort", "--sort $sort_mode --cols $cols $bed_file" );
+}
- $fh_in = Maasha::Common::read_open( $file );
- while ( $entry = bed_get_entry( $fh_in ) )
- {
- if ( $strand ) {
- $key = join "_", $entry->{ "CHR" }, $entry->{ "STRAND" };
- } else {
- $key = $entry->{ "CHR" };
- }
+sub bed_split_to_files
+{
+ # Martin A. Hansen, Septermber 2008
- $fh_hash{ $key } = Maasha::Common::write_open( "$tmp_dir/$key.sort" ) if not exists $fh_hash{ $key };
-
- bed_put_entry( $entry, $fh_hash{ $key } );
- }
+ # Given a list of BED files, split these
+ # into temporary files based on the chromosome
+ # name. Returns a list of the temporary files.
- close $fh_in;
+ my ( $bed_files, # list of BED files to split
+ $cols, # number of columns
+ $tmp_dir, # temporary directory
+ ) = @_;
- map { close $_ } keys %fh_hash;
+ # Returns a list.
- $fh_out = Maasha::Common::write_open( "$tmp_dir/temp.sort" );
+ my ( $bed_file, $fh_in, $entry, $key, %fh_hash, @tmp_files );
- foreach $part_file ( sort keys %fh_hash )
+ foreach $bed_file ( @{ $bed_files } )
{
- $entries = bed_get_entries( "$tmp_dir/$part_file.sort" );
+ $fh_in = Maasha::Common::read_open( $bed_file );
- @{ $entries } = sort { $a->{ "CHR_BEG" } <=> $b->{ "CHR_BEG" } } @{ $entries };
-
- map { bed_put_entry( $_, $fh_out ) } @{ $entries };
+ while ( $entry = bed_entry_get_array( $fh_in, $cols ) )
+ {
+ $key = $entry->[ CHR ];
+
+ $fh_hash{ $key } = Maasha::Common::write_open( "$tmp_dir/$key.temp" ) if not exists $fh_hash{ $key };
+
+ bed_entry_put_array( $entry, $fh_hash{ $key } );
+ }
- unlink "$tmp_dir/$part_file.sort";
+ close $fh_in;
}
- close $fh_out;
+ foreach $key ( sort keys %fh_hash )
+ {
+ push @tmp_files, "$tmp_dir/$key.temp";
+
+ close $fh_hash{ $key };
+ }
- rename "$tmp_dir/temp.sort", $file;
+ return wantarray ? @tmp_files : \@tmp_files;
}
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PSL format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+sub psl_get_entry
+{
+ # Martin A. Hansen, August 2008.
+
+ # Reads PSL next entry from a PSL file and returns a record.
+
+ my ( $fh, # file handle of PSL filefull path to PSL file
+ ) = @_;
+
+ # Returns hashref.
+
+ my ( $line, @fields, %record );
+
+ while ( $line = <$fh> )
+ {
+ chomp $line;
+
+ @fields = split "\t", $line;
+
+ if ( scalar @fields == 21 )
+ {
+ %record = (
+ REC_TYPE => "PSL",
+ MATCHES => $fields[ 0 ],
+ MISMATCHES => $fields[ 1 ],
+ REPMATCHES => $fields[ 2 ],
+ NCOUNT => $fields[ 3 ],
+ QNUMINSERT => $fields[ 4 ],
+ QBASEINSERT => $fields[ 5 ],
+ SNUMINSERT => $fields[ 6 ],
+ SBASEINSERT => $fields[ 7 ],
+ STRAND => $fields[ 8 ],
+ Q_ID => $fields[ 9 ],
+ Q_LEN => $fields[ 10 ],
+ Q_BEG => $fields[ 11 ],
+ Q_END => $fields[ 12 ] - 1,
+ S_ID => $fields[ 13 ],
+ S_LEN => $fields[ 14 ],
+ S_BEG => $fields[ 15 ],
+ S_END => $fields[ 16 ] - 1,
+ BLOCKCOUNT => $fields[ 17 ],
+ BLOCKSIZES => $fields[ 18 ],
+ Q_BEGS => $fields[ 19 ],
+ S_BEGS => $fields[ 20 ],
+ );
+
+ $record{ "SCORE" } = $record{ "MATCHES" } + int( $record{ "REPMATCHES" } / 2 ) - $record{ "MISMATCHES" } - $record{ "QNUMINSERT" } - $record{ "SNUMINSERT" };
+ $record{ "SPAN" } = $fields[ 12 ] - $fields[ 11 ];
+
+ return wantarray ? %record : \%record;
+ }
+ }
+
+ return undef;
+}
+
+
sub psl_get_entries
{
# Martin A. Hansen, February 2008.
- # Reads PSL entries and returns a record.
+ # Reads PSL entries and returns a list of records.
my ( $path, # full path to PSL file
) = @_;
}
-# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PhastCons format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> fixedStep format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-sub phastcons_get_entry
+sub fixedstep_get_entry
{
# Martin A. Hansen, December 2007.
}
-sub phastcons_parse_entry
-{
- # Martin A. Hansen, December 2007.
-
- # Given a PhastCons entry converts this to a
- # list of super blocks.
-
- my ( $lines, # list of lines
- $args, # argument hash
- ) = @_;
-
- # Returns
-
- my ( $info, $chr, $beg, $step, $i, $c, $j, @blocks, @super_blocks, @entries, $super_block, $block, @lens, @begs );
-
- $info = shift @{ $lines };
-
- if ( $info =~ /^chrom=([^ ]+) start=(\d+) step=(\d+)$/ )
- {
- $chr = $1;
- $beg = $2;
- $step = $3;
-
- die qq(ERROR: step size $step != 1 -> problem!\n) if $step != 1; # in an ideal world should would be fixed ...
- }
-
- $i = 0;
-
- while ( $i < @{ $lines } )
- {
- if ( $lines->[ $i ] >= $args->{ "threshold" } )
- {
- $c = $i + 1;
-
- while ( $c < @{ $lines } )
- {
- if ( $lines->[ $c ] < $args->{ "threshold" } )
- {
- $j = $c + 1;
-
- while ( $j < @{ $lines } and $lines->[ $j ] < $args->{ "threshold" } ) {
- $j++;
- }
-
- if ( $j - $c > $args->{ "gap" } )
- {
- if ( $c - $i >= $args->{ "min" } )
- {
- push @blocks, {
- CHR => $chr,
- CHR_BEG => $beg + $i - 1,
- CHR_END => $beg + $c - 2,
- CHR_LEN => $c - $i,
- };
- }
-
- $i = $j;
-
- last;
- }
-
- $c = $j
- }
- else
- {
- $c++;
- }
- }
-
- if ( $c - $i >= $args->{ "min" } )
- {
- push @blocks, {
- CHR => $chr,
- CHR_BEG => $beg + $i - 1,
- CHR_END => $beg + $c - 2,
- CHR_LEN => $c - $i,
- };
- }
-
- $i = $c;
- }
- else
- {
- $i++;
- }
- }
-
- $i = 0;
-
- while ( $i < @blocks )
- {
- $c = $i + 1;
-
- while ( $c < @blocks and $blocks[ $c ]->{ "CHR_BEG" } - $blocks[ $c - 1 ]->{ "CHR_END" } <= $args->{ "dist" } )
- {
- $c++;
- }
-
- push @super_blocks, [ @blocks[ $i .. $c - 1 ] ];
-
- $i = $c;
- }
-
- foreach $super_block ( @super_blocks )
- {
- foreach $block ( @{ $super_block } )
- {
- push @begs, $block->{ "CHR_BEG" } - $super_block->[ 0 ]->{ "CHR_BEG" };
- push @lens, $block->{ "CHR_LEN" } - 1;
- }
-
- $lens[ -1 ]++;
-
- push @entries, {
- CHR => $super_block->[ 0 ]->{ "CHR" },
- CHR_BEG => $super_block->[ 0 ]->{ "CHR_BEG" },
- CHR_END => $super_block->[ -1 ]->{ "CHR_END" },
- Q_ID => "Q_ID",
- SCORE => 100,
- STRAND => "+",
- THICK_BEG => $super_block->[ 0 ]->{ "CHR_BEG" },
- THICK_END => $super_block->[ -1 ]->{ "CHR_END" } + 1,
- ITEMRGB => "0,200,100",
- BLOCKCOUNT => scalar @{ $super_block },
- BLOCKSIZES => join( ",", @lens ),
- Q_BEGS => join( ",", @begs ),
- };
-
- undef @begs;
- undef @lens;
- }
-
- return wantarray ? @entries : \@entries;
-}
-
-
-sub phastcons_index_create
+sub fixedstep_index_create
{
# Martin A. Hansen, January 2008.
- # Indexes a concatenated PhastCons file.
+ # Indexes a concatenated fixedStep file.
# The index consists of a hash with chromosomes as keys,
# and a list of [ chr_beg, next_chr_beg, chr_end, index_beg, index_len ] as values.
- my ( $path, # path to PhastCons file
+ my ( $path, # path to fixedStep file
) = @_;
# Returns a hashref
$pos = 0;
- while ( $entry = Maasha::UCSC::phastcons_get_entry( $fh ) )
+ while ( $entry = Maasha::UCSC::fixedstep_get_entry( $fh ) )
{
$locator = shift @{ $entry };
if ( $locator =~ /chrom=([^ ]+) start=(\d+) step=(\d+)/ )
{
$chr = $1;
- $beg = $2 - 1; # phastcons files are 1-based
+ $beg = $2 - 1; # fixedStep files are 1-based
$step = $3;
}
else
{
- Maasha::Common::error( qq(Could not parse PhastCons locator: $locator) );
+ Maasha::Common::error( qq(Could not parse locator: $locator) );
}
$pos += length( $locator ) + 11;
}
-sub phastcons_index_store
+sub fixedstep_index_store
{
# Martin A. Hansen, January 2008.
- # Writes a PhastCons index to binary file.
+ # Writes a fixedStep index to binary file.
my ( $path, # full path to file
$index, # list with index
}
-sub phastcons_index_retrieve
+sub fixedstep_index_retrieve
{
# Martin A. Hansen, January 2008.
- # Retrieves a PhastCons index from binary file.
+ # Retrieves a fixedStep index from binary file.
my ( $path, # full path to file
) = @_;
}
-sub phastcons_index_lookup
+sub fixedstep_index_lookup
{
# Martin A. Hansen, January 2008.
- # Retrieve PhastCons scores from a indexed
- # Phastcons file given a chromosome and
+ # Retrieve fixedStep scores from a indexed
+ # fixedStep file given a chromosome and
# begin and end positions.
my ( $index, # data structure
}
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PhastCons format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+sub phastcons_index
+{
+ # Martin A. Hansen, July 2008
+
+ # Create a fixedStep index for PhastCons data.
+
+ my ( $file, # file to index
+ $dir, # dir with file
+ ) = @_;
+
+ # Returns nothing.
+
+ my ( $index );
+
+ $index = fixedstep_index_create( "$dir/$file" );
+
+ fixedstep_index_store( "$dir/$file.index", $index );
+}
+
+
+sub phastcons_parse_entry
+{
+ # Martin A. Hansen, December 2007.
+
+ # Given a PhastCons entry converts this to a
+ # list of super blocks.
+
+ my ( $lines, # list of lines
+ $args, # argument hash
+ ) = @_;
+
+ # Returns
+
+ my ( $info, $chr, $beg, $step, $i, $c, $j, @blocks, @super_blocks, @entries, $super_block, $block, @lens, @begs );
+
+ $info = shift @{ $lines };
+
+ if ( $info =~ /^chrom=([^ ]+) start=(\d+) step=(\d+)$/ )
+ {
+ $chr = $1;
+ $beg = $2;
+ $step = $3;
+
+ die qq(ERROR: step size $step != 1 -> problem!\n) if $step != 1; # in an ideal world should would be fixed ...
+ }
+
+ $i = 0;
+
+ while ( $i < @{ $lines } )
+ {
+ if ( $lines->[ $i ] >= $args->{ "threshold" } )
+ {
+ $c = $i + 1;
+
+ while ( $c < @{ $lines } )
+ {
+ if ( $lines->[ $c ] < $args->{ "threshold" } )
+ {
+ $j = $c + 1;
+
+ while ( $j < @{ $lines } and $lines->[ $j ] < $args->{ "threshold" } ) {
+ $j++;
+ }
+
+ if ( $j - $c > $args->{ "gap" } )
+ {
+ if ( $c - $i >= $args->{ "min" } )
+ {
+ push @blocks, {
+ CHR => $chr,
+ CHR_BEG => $beg + $i - 1,
+ CHR_END => $beg + $c - 2,
+ CHR_LEN => $c - $i,
+ };
+ }
+
+ $i = $j;
+
+ last;
+ }
+
+ $c = $j
+ }
+ else
+ {
+ $c++;
+ }
+ }
+
+ if ( $c - $i >= $args->{ "min" } )
+ {
+ push @blocks, {
+ CHR => $chr,
+ CHR_BEG => $beg + $i - 1,
+ CHR_END => $beg + $c - 2,
+ CHR_LEN => $c - $i,
+ };
+ }
+
+ $i = $c;
+ }
+ else
+ {
+ $i++;
+ }
+ }
+
+ $i = 0;
+
+ while ( $i < @blocks )
+ {
+ $c = $i + 1;
+
+ while ( $c < @blocks and $blocks[ $c ]->{ "CHR_BEG" } - $blocks[ $c - 1 ]->{ "CHR_END" } <= $args->{ "dist" } )
+ {
+ $c++;
+ }
+
+ push @super_blocks, [ @blocks[ $i .. $c - 1 ] ];
+
+ $i = $c;
+ }
+
+ foreach $super_block ( @super_blocks )
+ {
+ foreach $block ( @{ $super_block } )
+ {
+ push @begs, $block->{ "CHR_BEG" } - $super_block->[ 0 ]->{ "CHR_BEG" };
+ push @lens, $block->{ "CHR_LEN" } - 1;
+ }
+
+ $lens[ -1 ]++;
+
+ push @entries, {
+ CHR => $super_block->[ 0 ]->{ "CHR" },
+ CHR_BEG => $super_block->[ 0 ]->{ "CHR_BEG" },
+ CHR_END => $super_block->[ -1 ]->{ "CHR_END" },
+ Q_ID => "Q_ID",
+ SCORE => 100,
+ STRAND => "+",
+ THICK_BEG => $super_block->[ 0 ]->{ "CHR_BEG" },
+ THICK_END => $super_block->[ -1 ]->{ "CHR_END" } + 1,
+ ITEMRGB => "0,200,100",
+ BLOCKCOUNT => scalar @{ $super_block },
+ BLOCKSIZES => join( ",", @lens ),
+ Q_BEGS => join( ",", @begs ),
+ };
+
+ undef @begs;
+ undef @lens;
+ }
+
+ return wantarray ? @entries : \@entries;
+}
+
+
sub phastcons_normalize
{
# Martin A. Hansen, January 2008.
$beg, # start position
$block, # list of scores
$fh, # filehandle - OPTIONAL
+ $log10, # flag indicating that log10 scores should be used
) = @_;
# Returns nothing.
$beg += 1; # fixedStep format is 1 based.
- if ( $fh )
- {
- print $fh "fixedStep chrom=$chr start=$beg step=1\n";
+ $fh ||= \*STDOUT;
- map { printf( $fh "%d\n", ( $_ + 1 ) ) } @{ $block };
- }
- else
- {
- print "fixedStep chrom=$chr start=$beg step=1\n";
+ print $fh "fixedStep chrom=$chr start=$beg step=1\n";
- map { printf( "%d\n", ( $_ + 1 ) ) } @{ $block };
+ if ( $log10 ) {
+ map { printf( $fh "%f\n", Maasha::Calc::log10( $_ + 1 ) ) } @{ $block };
+ } else {
+ map { printf( $fh "%d\n", ( $_ + 1 ) ) } @{ $block };
}
}
# Maasha::Common::run( "hgLoadWiggle", "$args > /dev/null 2>&1" );
- `cd $tmp_dir && hgLoadWiggle -tmpDir=$tmp_dir -pathPrefix=$wib_dir $options->{ 'database' } $options->{ 'table' } $wig_file > /dev/null 2>&1`;
+ if ( $options->{ 'verbose' } ) {
+ `cd $tmp_dir && hgLoadWiggle -tmpDir=$tmp_dir -pathPrefix=$wib_dir $options->{ 'database' } $options->{ 'table' } $wig_file`;
+ } else {
+ `cd $tmp_dir && hgLoadWiggle -tmpDir=$tmp_dir -pathPrefix=$wib_dir $options->{ 'database' } $options->{ 'table' } $wig_file > /dev/null 2>&1`;
+ }
}