use Storable qw( dclone );
use Maasha::Config;
use Maasha::Common;
+use Maasha::Filesys;
use Maasha::Fasta;
use Maasha::Align;
use Maasha::Matrix;
use Maasha::Plot;
use Maasha::Calc;
use Maasha::UCSC;
+use Maasha::UCSC::BED;
+use Maasha::UCSC::Wiggle;
use Maasha::NCBI;
use Maasha::GFF;
use Maasha::TwoBit;
my ( $script, $BP_TMP );
-$script = Maasha::Common::get_scriptname();
-$BP_TMP = Maasha::Common::get_tmpdir();
+$script = Maasha::Common::get_scriptname();
+$BP_TMP = Maasha::Common::get_tmpdir();
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> LOG <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
{
@options = qw(
data_in|i=s
+ cols|c=s
num|n=s
);
}
# Returns nothing.
- my ( $file, $record, $entry, $data_in, $num );
+ my ( $cols, $file, $record, $bed_entry, $data_in, $num );
+
+ $cols = $options->{ 'cols' }->[ 0 ];
while ( $record = get_record( $in ) ) {
put_record( $record, $out );
{
$data_in = Maasha::Common::read_open( $file );
- while ( $entry = Maasha::UCSC::bed_get_entry( $data_in ) )
+ while ( $bed_entry = Maasha::UCSC::BED::bed_entry_get( $data_in, $cols ) )
{
- put_record( $entry, $out );
+ $record = Maasha::UCSC::BED::bed2biopiece( $bed_entry );
+
+ put_record( $record, $out );
goto NUM if $options->{ "num" } and $num == $options->{ "num" };
# Returns nothing.
- my ( $file, $record, $entry, $head, $chr, $chr_beg, $step, $data_in, $num );
+ my ( $file, $record, $entry, $data_in, $num );
while ( $record = get_record( $in ) ) {
put_record( $record, $out );
{
$data_in = Maasha::Common::read_open( $file );
- while ( $entry = Maasha::UCSC::fixedstep_get_entry( $data_in ) )
+ while ( $entry = Maasha::UCSC::Wiggle::fixedstep_entry_get( $data_in ) )
{
- $head = shift @{ $entry };
-
- if ( $head =~ /^chrom=([^ ]+) start=(\d+) step=(\d+)$/ )
- {
- $record->{ "REC_TYPE" } = "fixed_step";
- $record->{ "CHR" } = $1;
- $record->{ "CHR_BEG" } = $2;
- $record->{ "STEP" } = $3;
- $record->{ "VALS" } = join ";", @{ $entry };
- }
+ $record = Maasha::UCSC::Wiggle::fixedstep2biopiece( $entry );
put_record( $record, $out );
{
$data_in = Maasha::Common::read_open( $file );
- while ( $record = Maasha::UCSC::ucsc_config_get_entry( $data_in ) )
+ while ( $record = Maasha::UCSC::ucsc_config_entry_get( $data_in ) )
{
$record->{ 'REC_TYPE' } = "UCSC Config";
# Assemble tags from the stream into
# tag contigs.
+ # The current implementation is quite
+ # slow because of heavy use of temporary
+ # files.
+
my ( $in, # handle to in stream
$out, # handle to out stream
$options, # options hash
# Returns nothing.
- my ( $record, $new_record, %fh_hash, $fh_out, $chr, $array, $pos, $beg, $end, $score, $file, $id );
+ my ( $bed_file, $fh_in, $fh_out, $cols, $record, $bed_entry, $file_hash, $chr,
+ $array, $pos, $id, $beg, $end, $score, $new_record );
+
+ $bed_file = "$BP_TMP/assemble_tag_contigs.bed";
+ $fh_out = Maasha::Filesys::file_write_open( $bed_file );
+ $cols = 5; # we only need the first 5 BED columns
while ( $record = get_record( $in ) )
{
- $record->{ "CHR" } = $record->{ "S_ID" } if not defined $record->{ "CHR" };
- $record->{ "CHR_BEG" } = $record->{ "S_BEG" } if not defined $record->{ "CHR_BEG" };
- $record->{ "CHR_END" } = $record->{ "S_END" } if not defined $record->{ "CHR_END" };
-
- if ( $record->{ "CHR" } and defined $record->{ "CHR_BEG" } and $record->{ "CHR_END" } )
- {
- $fh_hash{ $record->{ "CHR" } } = Maasha::Common::write_open( "$BP_TMP/$record->{ 'CHR' }" ) if not exists $fh_hash{ $record->{ "CHR" } };
-
- $fh_out = $fh_hash{ $record->{ "CHR" } };
-
- Maasha::UCSC::bed_put_entry( $record, $fh_out, 5 );
- }
- }
-
- map { close $_ } keys %fh_hash;
-
- foreach $chr ( sort keys %fh_hash )
- {
- $array = tag_contigs_assemble( "$BP_TMP/$chr" );
-
- $pos = 0;
- $id = 0;
-
- while ( ( $beg, $end, $score ) = tag_contigs_scan( $array, $pos ) and $beg )
- {
- $new_record = {
- CHR => $chr,
- CHR_BEG => $beg,
- CHR_END => $end - 1,
- Q_ID => sprintf( "TC%06d", $id ),
- SCORE => $score,
- STRAND => $record->{ 'STRAND' } || '+',
- };
-
- put_record( $new_record, $out );
-
- $pos = $end;
- $id++;
+ if ( $bed_entry = Maasha::UCSC::BED::biopiece2bed( $record, $cols ) ) {
+ Maasha::UCSC::BED::bed_entry_put( $bed_entry, $fh_out, $cols );
}
-
- unlink "$BP_TMP/$chr";
}
-}
-
-
-sub tag_contigs_assemble
-{
- # Martin A. Hansen, November 2008.
-
- # Given a BED file with entries from only one
- # chromosome assembles tag contigs from these
- # ignoring strand information. Only tags with
- # a score higher than the clone count over
- # genomic loci (the SCORE field) is included
- # in the tag contigs.
-
- # ----------- tags
- # -------------
- # ----------
- # ----------
- # ======================== tag contig
+ close $fh_out;
- my ( $path, # full path to BED file
- ) = @_;
+ $file_hash = Maasha::UCSC::BED::bed_file_split_on_chr( $bed_file );
- # Returns an arrayref.
+ unlink $bed_file;
- my ( $fh, $entry, $clones, $score, @array );
+ foreach $chr ( sort keys %{ $file_hash } )
+ {
+ $bed_file = Maasha::UCSC::BED::tag_contigs_assemble( $file_hash->{ $chr }, $chr, $BP_TMP );
- $fh = Maasha::Common::read_open( $path );
+ $fh_in = Maasha::Filesys::file_read_open( $bed_file );
- while ( $entry = Maasha::UCSC::bed_get_entry( $fh ) )
- {
- if ( $entry->{ 'Q_ID' } =~ /(\d+)$/ )
+ while ( $bed_entry = Maasha::UCSC::BED::bed_entry_get( $fh_in ) )
{
- $clones = $1;
-
- $score = int( $clones / $entry->{ 'SCORE' } );
-
- map { $array[ $_ ] += $score } $entry->{ 'CHR_BEG' } .. $entry->{ 'CHR_END' } if $score >= 1;
+ if ( $record = Maasha::UCSC::BED::bed2biopiece( $bed_entry ) ) {
+ put_record( $record, $out );
+ }
}
- }
- close $fh;
-
- return wantarray ? @array : \@array;
-}
-
-
-sub tag_contigs_scan
-{
- # Martin A. Hansen, November 2008.
-
- # Scans an array with tag contigs and locates
- # the next contig from a given position. The
- # score of the tag contig is determined as the
- # maximum value of the tag contig. If a tag contig
- # is found a triple is returned with beg, end and score
- # otherwise an empty triple is returned.
-
- my ( $array, # array to scan
- $beg, # position to start scanning from
- ) = @_;
-
- # Returns an arrayref.
-
- my ( $end, $score );
-
- $score = 0;
-
- while ( $beg < scalar @{ $array } and not $array->[ $beg ] ) { $beg++ }
-
- $end = $beg;
-
- while ( $array->[ $end ] )
- {
- $score = Maasha::Calc::max( $score, $array->[ $end ] );
-
- $end++
- }
+ close $fh_in;
- if ( $score > 0 ) {
- return wantarray ? ( $beg, $end, $score ) : [ $beg, $end, $score ];
- } else {
- return wantarray ? () : [];
+ unlink $file_hash->{ $chr };
+ unlink $bed_file;
}
}
while ( $record = get_record( $in ) )
{
- if ( $fh_out and $entry = record2fasta( $record ) )
+ if ( $fh_out and $entry = Maasha::Fasta::biopiece2fasta( $record ) )
{
Maasha::Fasta::put_entry( $entry, $fh_out, $options->{ "wrap" } );
}
# Returns nothing.
- my ( $record, %fh_hash, $fh_in, $fh_out, $chr, $chr, $beg, $end, $q_id, $block, $entry, $clones, $beg_block, $max, $i );
+ my ( $bed_file, $fh_in, $fh_out, $cols, $record, $file_hash, $chr, $bed_entry, $fixedstep_file, $fixedstep_entry );
+
+ $bed_file = "$BP_TMP/calc_fixedstep.bed";
+ $fh_out = Maasha::Filesys::file_write_open( $bed_file );
+ $cols = 5; # we only need the first 5 BED columns
while ( $record = get_record( $in ) )
{
- $record->{ "CHR" } = $record->{ "S_ID" } if not defined $record->{ "CHR" };
- $record->{ "CHR_BEG" } = $record->{ "S_BEG" } if not defined $record->{ "CHR_BEG" };
- $record->{ "CHR_END" } = $record->{ "S_END" } if not defined $record->{ "CHR_END" };
-
- if ( $record->{ "CHR" } and defined $record->{ "CHR_BEG" } and $record->{ "CHR_END" } )
- {
- $fh_hash{ $record->{ "CHR" } } = Maasha::Common::write_open( "$BP_TMP/$record->{ 'CHR' }" ) if not exists $fh_hash{ $record->{ "CHR" } };
-
- $fh_out = $fh_hash{ $record->{ "CHR" } };
-
- Maasha::UCSC::bed_put_entry( $record, $fh_out, 5 );
+ if ( $bed_entry = Maasha::UCSC::BED::biopiece2bed( $record, $cols ) ) {
+ Maasha::UCSC::BED::bed_entry_put( $bed_entry, $fh_out, $cols );
}
}
- map { close $_ } keys %fh_hash;
+ close $fh_out;
- foreach $chr ( sort keys %fh_hash )
- {
- #Maasha::Common::run( "bedSort", "$BP_TMP/$chr $BP_TMP/$chr" );
- Maasha::Common::run( "bed_sort", "--sort 3 --cols 5 $BP_TMP/$chr > $BP_TMP/$chr.sort" );
+ $file_hash = Maasha::UCSC::BED::bed_file_split_on_chr( $bed_file );
- rename "$BP_TMP/$chr.sort", "$BP_TMP/$chr";
+ unlink $bed_file;
- $fh_in = Maasha::Common::read_open( "$BP_TMP/$chr" );
+ foreach $chr ( sort keys %{ $file_hash } )
+ {
+ $fixedstep_file = Maasha::UCSC::Wiggle::fixedstep_calc( $file_hash->{ $chr }, $chr, $options->{ 'score' } );
- undef $block;
+ $fh_in = Maasha::Filesys::file_read_open( $fixedstep_file );
- while ( $entry = Maasha::UCSC::bed_get_entry( $fh_in, 5 ) )
+ while ( $fixedstep_entry = Maasha::UCSC::Wiggle::fixedstep_entry_get( $fh_in ) )
{
- $chr = $entry->{ 'CHR' };
- $beg = $entry->{ 'CHR_BEG' };
- $end = $entry->{ 'CHR_END' };
- $q_id = $entry->{ 'Q_ID' };
-
- if ( $options->{ "score" } ) {
- $clones = $entry->{ 'SCORE' };
- } elsif ( $q_id =~ /_(\d+)$/ ) {
- $clones = $1;
- } else {
- $clones = 1;
- }
-
- if ( $block )
- {
- if ( $beg > $max )
- {
- map { $_ = sprintf( "%.4f", Maasha::Calc::log10( $_ ) ) } @{ $block } if $options->{ "log10" };
-
- $record->{ "CHR" } = $chr;
- $record->{ "CHR_BEG" } = $beg_block;
- $record->{ "STEP" } = 1;
- $record->{ "VALS" } = join ";", @{ $block };
- $record->{ "REC_TYPE" } = "fixed_step";
-
- put_record( $record, $out );
-
- undef $block;
- }
- else
- {
- for ( $i = $beg - $beg_block; $i < ( $beg - $beg_block ) + ( $end - $beg ); $i++ ) {
- $block->[ $i ] += $clones;
- }
-
- $max = Maasha::Calc::max( $max, $end );
- }
- }
-
- if ( not $block )
- {
- $beg_block = $beg;
- $max = $end;
-
- for ( $i = 0; $i < ( $end - $beg ); $i++ ) {
- $block->[ $i ] += $clones;
- }
+ if ( $record = Maasha::UCSC::Wiggle::fixedstep2biopiece( $fixedstep_entry ) ) {
+ put_record( $record, $out );
}
}
close $fh_in;
- map { $_ = sprintf( "%.4f", Maasha::Calc::log10( $_ ) ) } @{ $block } if $options->{ "log10" };
-
- $record->{ "CHR" } = $chr;
- $record->{ "CHR_BEG" } = $beg_block;
- $record->{ "STEP" } = 1;
- $record->{ "VALS" } = join ";", @{ $block };
- $record->{ "REC_TYPE" } = "fixed_step";
-
- put_record( $record, $out );
-
- unlink "$BP_TMP/$chr";
+ unlink $file_hash->{ $chr };
+ unlink $bed_file;
}
}
{
put_record( $record, $out ) if not $options->{ "no_stream" };
- if ( $entry = record2fasta( $record ) )
+ if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) )
{
$seq_type = Maasha::Seq::seq_guess_type( $entry->[ SEQ ] ) if not $seq_type;
while ( $record = get_record( $in ) )
{
- if ( $entry = record2fasta( $record ) )
+ if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) )
{
$q_type = Maasha::Seq::seq_guess_type( $entry->[ SEQ ] ) if not $q_type;
while ( $record = get_record( $in ) )
{
- if ( $entry = record2fasta( $record ) )
+ if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) )
{
$type = Maasha::Seq::seq_guess_type( $entry->[ SEQ ] ) if not $type;
Maasha::Fasta::put_entry( $entry, $fh_out, 80 );
while ( $record = get_record( $in ) )
{
- if ( $entry = record2fasta( $record ) )
+ if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) )
{
Maasha::Fasta::put_entry( $entry, $fh_out );
while ( $record = get_record( $in ) )
{
- if ( $options->{ "index_name" } and $entry = record2fasta( $record ) )
+ if ( $options->{ "index_name" } and $entry = Maasha::Fasta::biopiece2fasta( $record ) )
{
Maasha::Fasta::put_entry( $entry, $fh_tmp );
while ( $record = get_record( $in ) )
{
- if ( $entry = record2fasta( $record ) ) {
+ if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) {
Maasha::Fasta::put_entry( $entry, $fh, $options->{ "wrap" } );
}
# Returns nothing.
- my ( $cols, $fh, $record, $new_record );
+ my ( $cols, $fh, $record, $bed_entry, $new_record );
$cols = $options->{ 'cols' }->[ 0 ];
- $fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } );
+ $fh = write_stream( $options->{ 'data_out' }, $options->{ 'compress' } );
while ( $record = get_record( $in ) )
{
- if ( $record->{ "REC_TYPE" } eq "BED" ) # ---- Hits from BED ----
- {
- $cols ||= $record->{ "BED_COLS" };
-
- Maasha::UCSC::bed_put_entry( $record, $fh, $cols );
- }
- elsif ( $record->{ "REC_TYPE" } eq "PSL" and $record->{ "S_ID" } =~ /^chr/i ) # ---- Hits from BLAT (PSL) ----
- {
- $new_record->{ "CHR" } = $record->{ "S_ID" };
- $new_record->{ "CHR_BEG" } = $record->{ "S_BEG" };
- $new_record->{ "CHR_END" } = $record->{ "S_END" };
- $new_record->{ "Q_ID" } = $record->{ "Q_ID" };
- $new_record->{ "SCORE" } = $record->{ "SCORE" } || 999;
- $new_record->{ "STRAND" } = $record->{ "STRAND" };
+ $record = Maasha::UCSC::psl2record( $record ) if $record->{ 'tBaseInsert' }; # Dirty addition to allow Affy data from MySQL to be dumped
- $cols ||= 6;
-
- Maasha::UCSC::bed_put_entry( $new_record, $fh, $cols );
- }
- elsif ( $record->{ "REC_TYPE" } eq "PATSCAN" and $record->{ "CHR" } ) # ---- Hits from patscan_seq ----
- {
- $cols ||= 6;
-
- Maasha::UCSC::bed_put_entry( $record, $fh, $cols );
+ if ( $bed_entry = Maasha::UCSC::BED::biopiece2bed( $record, $cols ) ) {
+ Maasha::UCSC::BED::bed_entry_put( $bed_entry, $fh, $cols );
}
- elsif ( $record->{ "REC_TYPE" } eq "BLAST" and $record->{ "S_ID" } =~ /^chr/i ) # ---- Hits from BLAST ----
- {
- $new_record->{ "CHR" } = $record->{ "S_ID" };
- $new_record->{ "CHR_BEG" } = $record->{ "S_BEG" };
- $new_record->{ "CHR_END" } = $record->{ "S_END" };
- $new_record->{ "Q_ID" } = $record->{ "Q_ID" };
- $new_record->{ "SCORE" } = $record->{ "SCORE" } || 999; # or use E_VAL somehow
- $new_record->{ "STRAND" } = $record->{ "STRAND" };
-
- $cols ||= 6;
- Maasha::UCSC::bed_put_entry( $new_record, $fh, $cols );
- }
- elsif ( $record->{ "REC_TYPE" } eq "VMATCH" and $record->{ "S_ID" } =~ /^chr/i ) # ---- Hits from Vmatch ----
- {
- $new_record->{ "CHR" } = $record->{ "S_ID" };
- $new_record->{ "CHR_BEG" } = $record->{ "S_BEG" };
- $new_record->{ "CHR_END" } = $record->{ "S_END" };
- $new_record->{ "Q_ID" } = $record->{ "Q_ID" };
- $new_record->{ "SCORE" } = $record->{ "SCORE" } || 999; # or use E_VAL somehow
- $new_record->{ "STRAND" } = $record->{ "STRAND" };
-
- $cols ||= 6;
-
- Maasha::UCSC::bed_put_entry( $new_record, $fh, $cols );
- }
- elsif ( $record->{ "REC_TYPE" } eq "SOAP" and $record->{ "S_ID" } =~ /^chr/i ) # ---- Hits from Soap ----
- {
- $new_record->{ "CHR" } = $record->{ "S_ID" };
- $new_record->{ "CHR_BEG" } = $record->{ "S_BEG" };
- $new_record->{ "CHR_END" } = $record->{ "S_END" };
- $new_record->{ "Q_ID" } = $record->{ "Q_ID" };
- $new_record->{ "SCORE" } = $record->{ "SCORE" } || 999;
- $new_record->{ "STRAND" } = $record->{ "STRAND" };
-
- $cols ||= 6;
-
- Maasha::UCSC::bed_put_entry( $new_record, $fh, $cols );
- }
- elsif ( $record->{ 'tBaseInsert' } ) # ---- Dirty addition to allow Affy data from MySQL to be dumped ----
- {
- $record = Maasha::UCSC::psl2record( $record );
-
- $new_record = $record;
- $new_record->{ "CHR" } = $record->{ "S_ID" };
- $new_record->{ "CHR_BEG" } = $record->{ "S_BEG" };
- $new_record->{ "CHR_END" } = $record->{ "S_END" };
- $new_record->{ "Q_ID" } = $record->{ "Q_ID" };
- $new_record->{ "SCORE" } = $record->{ "SCORE" } || 999;
- $new_record->{ "STRAND" } = $record->{ "STRAND" };
-
- Maasha::UCSC::bed_put_entry( $new_record, $fh, $cols );
- }
- elsif ( $record->{ "CHR" } and defined $record->{ "CHR_BEG" } and $record->{ "CHR_END" } ) # ---- Generic data from tables ----
- {
- Maasha::UCSC::bed_put_entry( $record, $fh, $cols );
- }
-
- put_record( $record, $out ) if not $options->{ "no_stream" };
+ put_record( $record, $out ) if not $options->{ 'no_stream' };
}
close $fh;
# Returns nothing.
- my ( $fh, $record, $vals );
+ my ( $fh, $record, $entry );
$fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } );
while ( $record = get_record( $in ) )
{
- if ( $record->{ "CHR" } and $record->{ "CHR_BEG" } and $record->{ "STEP" } and $record->{ "VALS" } )
- {
- print $fh "fixedStep chrom=$record->{ 'CHR' } start=$record->{ 'CHR_BEG' } step=$record->{ 'STEP' }\n";
-
- $vals = $record->{ 'VALS' };
-
- $vals =~ tr/;/\n/;
-
- print $fh "$vals\n";
+ if ( $entry = Maasha::UCSC::Wiggle::biopiece2fixedstep( $record ) ) {
+ map { print $fh "$_\n" } @{ $entry };
}
put_record( $record, $out ) if not $options->{ "no_stream" };
while ( $record = get_record( $in ) )
{
- if ( $entry = record2fasta( $record ) ) {
+ if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) {
Maasha::Fasta::put_entry( $entry, $fh_tmp );
}
while ( $record = get_record( $in ) )
{
- if ( $entry = record2fasta( $record ) )
+ if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) )
{
$entry->[ SEQ ] = Maasha::Solid::seq2color_space( uc $entry->[ SEQ ] );
while ( $record = get_record( $in ) )
{
- Maasha::UCSC::ucsc_config_put_entry( $record, $fh ) if $record->{ "REC_TYPE" } eq "UCSC Config";
+ Maasha::UCSC::ucsc_config_entry_put( $record, $fh ) if $record->{ "REC_TYPE" } eq "UCSC Config";
put_record( $record, $out ) if not $options->{ "no_stream" };
}
$fh_in = Maasha::Common::read_open( $options->{ 'config_file' } );
- while ( $track = Maasha::UCSC::ucsc_config_get_entry( $fh_in ) ) {
+ while ( $track = Maasha::UCSC::ucsc_config_entry_get( $fh_in ) ) {
push @tracks, $track;
}
$fh_out = Maasha::Common::write_open( $options->{ 'config_file' } );
- map { Maasha::UCSC::ucsc_config_put_entry( $_, $fh_out ) } @new_tracks;
+ map { Maasha::UCSC::ucsc_config_entry_put( $_, $fh_out ) } @new_tracks;
close $fh_out;
}
else
{
- warn qq(WARNING: Bad compute expression: "$options->{ 'eval' }"\n);
+ Maasha::Common::error( qq(Bad compute expression: "$options->{ 'eval' }"\n) );
}
}
# Returns nothing.
- my ( $record, $file, $wib_file, $wig_file, $wib_dir, $fh_out, $i, $first, $format, $type, $columns, $append, $vals );
+ my ( $record, $file, $wib_file, $wig_file, $wib_dir, $fh_out, $i, $first, $format, $type, $columns, $append, $entry );
$options->{ "short_label" } ||= $options->{ 'table' };
$options->{ "long_label" } ||= $options->{ 'table' };
$options->{ "color" } ||= join( ",", int( rand( 255 ) ), int( rand( 255 ) ), int( rand( 255 ) ) );
$options->{ "chunk_size" } ||= 10_000_000_000; # Due to 32-bit UCSC compilation really large tables cannot be loaded in one go.
- $file = "$BP_TMP/ucsc_upload.tmp";
-
+ $file = "$BP_TMP/ucsc_upload.tmp";
$append = 0;
-
- $first = 1;
-
- $i = 0;
+ $first = 1;
+ $i = 0;
$fh_out = Maasha::Common::write_open( $file );
if ( $record->{ "REC_TYPE" } eq "fixed_step" )
{
- $vals = $record->{ "VALS" };
- $vals =~ tr/;/\n/;
-
- print $fh_out "fixedStep chrom=$record->{ 'CHR' } start=$record->{ 'CHR_BEG' } step=$record->{ 'STEP' }\n";
- print $fh_out "$vals\n";
+ $format = "WIGGLE";
- $format = "WIGGLE" if not $format;
+ if ( $entry = Maasha::UCSC::Wiggle::biopiece2fixedstep( $record ) ) {
+ Maasha::UCSC::Wiggle::fixedstep_entry_put( $entry, $fh_out );
+ }
}
elsif ( $record->{ "REC_TYPE" } eq "PSL" )
{
+ $format = "PSL";
+
Maasha::UCSC::psl_put_header( $fh_out ) if $first;
Maasha::UCSC::psl_put_entry( $record, $fh_out );
$first = 0;
-
- $format = "PSL" if not $format;
}
elsif ( $record->{ "REC_TYPE" } eq "BED" and $record->{ "SEC_STRUCT" } )
{
# chrom chromStart chromEnd name score strand size secStr conf
+ $format = "BED_SS";
+
print $fh_out join ( "\t",
$record->{ "CHR" },
$record->{ "CHR_BEG" },
$record->{ "SEC_STRUCT" },
$record->{ "CONF" },
), "\n";
-
- $format = "BED_SS" if not $format;
}
elsif ( $record->{ "REC_TYPE" } eq "BED" )
{
- Maasha::UCSC::bed_put_entry( $record, $fh_out, $record->{ "BED_COLS" } );
+ $format = "BED";
+ $columns = $record->{ "BED_COLS" };
- $format = "BED" if not $format;
- $columns = $record->{ "BED_COLS" } if not $columns;
+ if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) {
+ Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns );
+ }
}
elsif ( $record->{ "REC_TYPE" } eq "PATSCAN" and $record->{ "CHR" } )
{
- Maasha::UCSC::bed_put_entry( $record, $fh_out, 6 );
+ $format = "BED";
+ $columns = 6;
- $format = "BED" if not $format;
- $columns = 6 if not $columns;
+ if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) {
+ Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns );
+ }
}
elsif ( $record->{ "REC_TYPE" } eq "BLAST" and $record->{ "S_ID" } =~ /^chr/ )
{
- $record->{ "CHR" } = $record->{ "S_ID" };
- $record->{ "CHR_BEG" } = $record->{ "S_BEG" };
- $record->{ "CHR_END" } = $record->{ "S_END" };
- $record->{ "SCORE" } = $record->{ "BIT_SCORE" } * 1000;
+ $format = "BED";
+ $columns = 6;
- $format = "BED" if not $format;
- $columns = 6 if not $columns;
+ $record->{ "SCORE" } = $record->{ "BIT_SCORE" } * 1000;
- Maasha::UCSC::bed_put_entry( $record, $fh_out );
+ if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) {
+ Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns );
+ }
}
elsif ( $record->{ "REC_TYPE" } eq "VMATCH" and $record->{ "S_ID" } =~ /^chr/i )
{
- $record->{ "CHR" } = $record->{ "S_ID" };
- $record->{ "CHR_BEG" } = $record->{ "S_BEG" };
- $record->{ "CHR_END" } = $record->{ "S_END" };
- $record->{ "SCORE" } = $record->{ "SCORE" } || 999;
- $record->{ "SCORE" } = int( $record->{ "SCORE" } );
+ $format = "BED";
+ $columns = 6;
- $format = "BED" if not $format;
- $columns = 6 if not $columns;
-
- Maasha::UCSC::bed_put_entry( $record, $fh_out, 6 );
+ if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) {
+ Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns );
+ }
}
if ( $i == $options->{ "chunk_size" } )
}
elsif ( $format eq "BED_SS" )
{
- $options->{ "sec_struct" } = 1;
-
- $type = "sec_struct";
+ $type = "type bed 6 +";
Maasha::UCSC::bed_upload_to_ucsc( $BP_TMP, $file, $options, $append );
}
unlink $file;
- Maasha::UCSC::update_my_tracks( $options, $type );
+ Maasha::UCSC::ucsc_update_config( $options, $type );
}
}
}
-sub record2fasta
-{
- # Martin A. Hansen, July 2008.
-
- # Given a biopiece record converts it to a FASTA record.
- # If no generic SEQ or SEQ_NAME is found, the Q_* and S_* are
- # tried in that order.
-
- my ( $record, # record
- ) = @_;
-
- # Returns a tuple.
-
- my ( $seq_name, $seq );
-
- $seq_name = $record->{ "SEQ_NAME" } || $record->{ "Q_ID" } || $record->{ "S_ID" };
- $seq = $record->{ "SEQ" } || $record->{ "Q_SEQ" } || $record->{ "S_SEQ" };
-
- if ( defined $seq_name and defined $seq ) {
- return wantarray ? ( $seq_name, $seq ) : [ $seq_name, $seq ];
- } else {
- return;
- }
-}
-
-
sub read_stream
{
# Martin A. Hansen, July 2007.
}
+sub biopiece2fasta
+{
+ # Martin A. Hansen, July 2008.
+
+ # Given a biopiece record converts it to a FASTA record.
+ # If no generic SEQ or SEQ_NAME is found, the Q_* and S_* are
+ # tried in that order.
+
+ my ( $record, # record
+ ) = @_;
+
+ # Returns a tuple.
+
+ my ( $seq_name, $seq );
+
+ $seq_name = $record->{ "SEQ_NAME" } || $record->{ "Q_ID" } || $record->{ "S_ID" };
+ $seq = $record->{ "SEQ" } || $record->{ "Q_SEQ" } || $record->{ "S_SEQ" };
+
+ if ( defined $seq_name and defined $seq ) {
+ return wantarray ? ( $seq_name, $seq ) : [ $seq_name, $seq ];
+ } else {
+ return;
+ }
+}
+
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
--- /dev/null
+package Maasha::Filesys;
+
+
+# Copyright (C) 2006-2008 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+# This module contains routines for manipulation of files and directories.
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+use strict;
+use IO::File;
+use Maasha::Common;
+
+use Exporter;
+
+use vars qw( @ISA @EXPORT @EXPORT_OK );
+
+@ISA = qw( Exporter ) ;
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> FILES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+sub file_read_open
+{
+ # Martin A. Hansen, January 2004.
+
+ # Read opens a file that may be gzipped and returns a filehandle.
+
+ my ( $path, # full path to file
+ ) = @_;
+
+ # Returns filehandle
+
+ my ( $fh );
+
+ if ( is_gzipped( $path ) ) {
+ $fh = new IO::File "zcat $path|" or Maasha::Common::error( qq(Could not read-open file "$path": $!) );
+ } else {
+ $fh = new IO::File $path, "r" or Maasha::Common::error( qq(Could not read-open file "$path": $!) );
+ }
+
+ return $fh;
+}
+
+
+sub file_write_open
+{
+ # Martin A. Hansen, January 2004.
+
+ # write opens a file and returns a filehandle
+
+ my ( $path, # full path to file
+ $gzip, # flag if data is to be gzipped - OPRIONAL
+ ) = @_;
+
+ # returns filehandle
+
+ my ( $fh );
+
+ if ( $gzip ) {
+ $fh = new IO::File "|gzip -f>$path" or Maasha::Common::error( qq(Could not write-open file "$path": $!) );
+ } else {
+ $fh = new IO::File $path, "w" or Maasha::Common::error( qq(Could not write-open file "$path": $!) );
+ }
+
+ return $fh;
+}
+
+
+sub file_append_open
+{
+ # Martin A. Hansen, February 2006.
+
+ # append opens file and returns a filehandle
+
+ my ( $path, # path to file
+ ) = @_;
+
+ # returns filehandle
+
+ my ( $fh );
+
+ $fh = new IO::File $path, "a" or Maasha::Common::error( qq(Could not append-open file "$path": $!) );
+
+ return $fh;
+}
+
+
+sub file_copy
+{
+ # Martin A. Hansen, November 2008.
+
+ # Copy the content of a file from source path to
+ # destination path.
+
+ my ( $src, # source path
+ $dst, # destination path
+ ) = @_;
+
+ # Returns nothing.
+
+ my ( $fh_in, $fh_out, $line );
+
+ Maasha::Common::error( qq(copy failed: destination equals source "$src") ) if $src eq $dst;
+
+ $fh_in = file_read_open( $src );
+ $fh_out = file_write_open( $dst );
+
+ while ( $line = <$fh_in> ) {
+ print $fh_out $line;
+ }
+
+ close $fh_in;
+ close $fh_out;
+}
+
+
+sub is_gzipped
+{
+ # Martin A. Hansen, November 2008.
+
+ # Checks if a given file is gzipped.
+ # Currrently uses a call to the systems
+ # file tool. Returns 1 if gzipped otherwise
+ # returns 0.
+
+ my ( $path, # path to file
+ ) = @_;
+
+ # Returns boolean.
+
+ my ( $type );
+
+ $type = `file $path`;
+
+ if ( $type =~ /gzip compressed/ ) {
+ return 1;
+ } else {
+ return 0;
+ }
+}
+
+
+sub file_size
+{
+ # Martin A. Hansen, March 2007
+
+ # returns the file size for a given file
+
+ my ( $path, # full path to file
+ ) = @_;
+
+ # returns integer
+
+ my $file_size = ( stat ( $path ) )[ 7 ];
+
+ return $file_size;
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+1;
+
+
+__END__
use vars qw ( @ISA @EXPORT );
use Data::Dumper;
-use Time::HiRes qw( gettimeofday );
-
use Maasha::Common;
+use Maasha::Filesys;
use Maasha::Calc;
use Maasha::Matrix;
@ISA = qw( Exporter );
-my $TIME = gettimeofday();
-
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> BED format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PSL format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-sub psl2record
-{
- # Martin A. Hansen, November 2008
-
- # Converts PSL record keys to Biopiece record keys.
-
- my ( $psl, # hashref
- ) = @_;
-
- # returns hashref
-
- my %record;
-
- %record = (
- REC_TYPE => "PSL",
- BLOCKSIZES => $psl->{ 'blockSize' },
- SNUMINSERT => $psl->{ 'tNumInsert' },
- Q_END => $psl->{ 'qEnd' },
- SBASEINSERT => $psl->{ 'tBaseInsert' },
- S_END => $psl->{ 'tEnd' },
- QBASEINSERT => $psl->{ 'qBaseInsert' },
- REPMATCHES => $psl->{ 'repMatches' },
- QNUMINSERT => $psl->{ 'qNumInsert' },
- MISMATCHES => $psl->{ 'misMatches' },
- BLOCKCOUNT => $psl->{ 'blockCount' },
- Q_LEN => $psl->{ 'qSize' },
- S_ID => $psl->{ 'tName' },
- STRAND => $psl->{ 'strand' },
- Q_ID => $psl->{ 'qName' },
- MATCHES => $psl->{ 'matches' },
- S_LEN => $psl->{ 'tSize' },
- NCOUNT => $psl->{ 'nCount' },
- Q_BEGS => $psl->{ 'qStarts' },
- S_BEGS => $psl->{ 'tStarts' },
- S_BEG => $psl->{ 'tStart' },
- Q_BEG => $psl->{ 'qStart ' },
- );
-
- $record{ "SCORE" } = $record{ "MATCHES" } + int( $record{ "REPMATCHES" } / 2 ) - $record{ "MISMATCHES" } - $record{ "QNUMINSERT" } - $record{ "SNUMINSERT" };
- $record{ "SPAN" } = $record{ "Q_END" } - $record{ "Q_BEG" };
-
- return wantarray ? %record : \%record;
-}
-
sub psl_get_entry
{
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> TRACK FILE <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-sub ucsc_config_get_entry
+sub ucsc_config_entry_get
{
# Martin A. Hansen, November 2008.
}
-sub ucsc_config_put_entry
+sub ucsc_config_entry_put
{
# Martin A. Hansen, November 2008.
visibility
maxHeightPixels
color
+ mafTrack
type );
}
-sub update_my_tracks
+sub ucsc_update_config
{
# Martin A. Hansen, September 2007.
# Returns nothing.
- my ( $file, $fh_in, $fh_out, $line, $time );
+ my ( $file, %entry, $fh_out );
$file = $ENV{ "HOME" } . "/ucsc/my_tracks.ra";
- # ---- create a backup ----
-
- $fh_in = Maasha::Common::read_open( $file );
- $fh_out = Maasha::Common::write_open( "$file~" );
-
- while ( $line = <$fh_in> ) {
- print $fh_out $line;
- }
-
- close $fh_in;
- close $fh_out;
-
- # ---- append track ----
-
- $time = Maasha::Common::time_stamp();
-
- $fh_out = Maasha::Common::append_open( $file );
-
- if ( $type eq "sec_struct" )
- {
- print $fh_out "\n\n# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n";
-
- print $fh_out "\n# Track added by 'upload_to_ucsc' $time\n\n";
-
- print $fh_out "# Database $options->{ 'database' }\n\n";
-
- print $fh_out "track $options->{ 'table' }\n";
- print $fh_out "shortLabel $options->{ 'short_label' }\n";
- print $fh_out "longLabel $options->{ 'long_label' }\n";
- print $fh_out "group $options->{ 'group' }\n";
- print $fh_out "priority $options->{ 'priority' }\n";
- print $fh_out "visibility $options->{ 'visibility' }\n";
- print $fh_out "color $options->{ 'color' }\n";
- print $fh_out "type bed 6 +\n";
- print $fh_out "mafTrack multiz17way\n";
-
- print $fh_out "\n# //\n";
- }
- else
- {
- print $fh_out "\n\n# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n";
-
- print $fh_out "\n# Track added by 'upload_to_ucsc' $time\n\n";
-
- print $fh_out "# Database $options->{ 'database' }\n\n";
+ Maasha::Filesys::file_copy( $file, "$file~" ); # create a backup
+
+ %entry = (
+ database => $options->{ 'database' },
+ track => $options->{ 'table' },
+ shortLabel => $options->{ 'short_label' },
+ longLabel => $options->{ 'long_label' },
+ group => $options->{ 'group' },
+ priority => $options->{ 'priority' },
+ useScore => $options->{ 'use_score' },
+ visibility => $options->{ 'visibility' },
+ color => $options->{ 'color' },
+ type => $type,
+ );
- print $fh_out "track $options->{ 'table' }\n";
- print $fh_out "shortLabel $options->{ 'short_label' }\n";
- print $fh_out "longLabel $options->{ 'long_label' }\n";
- print $fh_out "group $options->{ 'group' }\n";
- print $fh_out "priority $options->{ 'priority' }\n";
- print $fh_out "useScore 1\n" if $options->{ 'use_score' };
- print $fh_out "visibility $options->{ 'visibility' }\n";
- print $fh_out "maxHeightPixels 50:50:11\n" if $type eq "wig 0";
- print $fh_out "color $options->{ 'color' }\n";
- print $fh_out "type $type\n";
+ $entry{ 'mafTrack' } = "multiz17way" if $type eq "type bed 6 +";
+ $entry{ 'maxHeightPixels' } = "50:50:11" if $type eq "wig 0";
- print $fh_out "\n# //\n";
- }
+ $fh_out = Maasha::Filesys::file_append_open( $file );
+ ucsc_config_entry_put( \%entry, $fh_out );
+
close $fh_out;
Maasha::Common::run( "ucscMakeTracks.pl", "-b > /dev/null 2>&1" );
--- /dev/null
+package Maasha::UCSC::BED;
+
+# Copyright (C) 2007-2008 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+# Routines for interaction with Browser Extensible DATA (BED) entries and files.
+
+# Read about the BED format here: http://genome.ucsc.edu/FAQ/FAQformat#format1
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+use strict;
+
+use Data::Dumper;
+use Maasha::Common;
+use Maasha::Filesys;
+use Maasha::Calc;
+
+use vars qw( @ISA @EXPORT_OK );
+
+require Exporter;
+
+@ISA = qw( Exporter );
+
+@EXPORT_OK = qw(
+);
+
+use constant {
+ chrom => 0, # BED field names
+ chromStart => 1,
+ chromEnd => 2,
+ name => 3,
+ score => 4,
+ strand => 5,
+ thickStart => 6,
+ thickEnd => 7,
+ itemRgb => 8,
+ blockCount => 9,
+ blockSizes => 10,
+ blockStarts => 11,
+ CHR => 0, # Biopieces field names
+ CHR_BEG => 1,
+ CHR_END => 2,
+ Q_ID => 3,
+ SCORE => 4,
+ STRAND => 5,
+ THICK_BEG => 6,
+ THICK_END => 7,
+ COLOR => 8,
+ BLOCK_COUNT => 9,
+ BLOCK_LENS => 10,
+ Q_BEGS => 11,
+};
+
+
+my $CHECK_ALL = 1; # Global flag indicating that BED input and output is checked thoroughly.
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> BED format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+# Hash for converting BED keys to Biopieces keys.
+
+my %BED2BIOPIECES = (
+ chrom => "CHR",
+ chromStart => "CHR_BEG",
+ chromEnd => "CHR_END",
+ name => "Q_ID",
+ score => "SCORE",
+ strand => "STRAND",
+ thickStart => "THICK_BEG",
+ thickEnd => "THICK_END",
+ itemRgb => "COLOR",
+ blockCount => "BLOCK_COUNT",
+ blockSizes => "BLOCK_LENS",
+ blockStarts => "Q_BEGS",
+);
+
+
+# Hash for converting biopieces keys to BED keys.
+
+my %BIOPIECES2BED = (
+ CHR => "chrom",
+ CHR_BEG => "chromStart",
+ CHR_END => "chromEnd",
+ Q_ID => "name",
+ SCORE => "score",
+ STRAND => "strand",
+ THICK_BEG => "thickStart",
+ THICK_END => "thickEnd",
+ COLOR => "itemRgb",
+ BLOCK_COUNT => "blockCount",
+ BLOCK_LENS => "blockSizes",
+ Q_BEGS => "blockStarts",
+);
+
+
+sub bed_entry_get
+{
+ # Martin A. Hansen, September 2008.
+
+ # Reads a BED entry given a filehandle.
+
+ my ( $fh, # file handle
+ $cols, # columns to read - OPTIONAL (3,4,5,6,8,9 or 12)
+ ) = @_;
+
+ # Returns a list.
+
+ my ( $line, @entry );
+
+ $line = <$fh>;
+
+ $line =~ tr/\n\r//d; # some people have carriage returns in their BED files -> Grrrr
+
+ return if not defined $line;
+
+ if ( not defined $cols ) {
+ $cols = 1 + $line =~ tr/\t//;
+ }
+
+ @entry = split "\t", $line, $cols + 1;
+
+ pop @entry if scalar @entry > $cols;
+
+ bed_entry_check( \@entry ) if $CHECK_ALL;
+
+ return wantarray ? @entry : \@entry;
+}
+
+
+sub bed_entry_put
+{
+ # Martin A. Hansen, September 2008.
+
+ # Writes a BED entry array to file.
+
+ my ( $entry, # list
+ $fh, # file handle - OPTIONAL
+ $cols, # number of columns in BED file - OPTIONAL (3,4,5,6,8,9 or 12)
+ ) = @_;
+
+ # Returns nothing.
+
+ if ( $cols and $cols < scalar @{ $entry } ) {
+ @{ $entry } = @{ $entry }[ 0 .. $cols - 1 ];
+ }
+
+ bed_entry_check( $entry ) if $CHECK_ALL;
+
+ $fh = \*STDOUT if not defined $fh;
+
+ print $fh join( "\t", @{ $entry } ), "\n";
+}
+
+
+
+sub bed_entry_check
+{
+ # Martin A. Hansen, November 2008.
+
+ # Checks a BED entry for integrity and
+ # raises an error if there is a problem.
+
+ my ( $bed, # array ref
+ ) = @_;
+
+ # Returns nothing.
+
+ my ( $cols, @block_sizes, @block_starts );
+
+ $cols = scalar @{ $bed };
+
+ if ( $cols < 3 ) {
+ Maasha::Common::error( qq(Bad BED entry - must contain at least 3 columns - not $cols) );
+ }
+
+ if ( $cols > 12 ) {
+ Maasha::Common::error( qq(Bad BED entry - must contains no more than 12 columns - not $cols) );
+ }
+
+ if ( $bed->[ chrom ] =~ /\s/ ) {
+ Maasha::Common::error( qq(Bad BED entry - no white space allowed in chrom field: "$bed->[ chrom ]") );
+ }
+
+ if ( $bed->[ chromStart ] =~ /\D/ ) {
+ Maasha::Common::error( qq(Bad BED entry - chromStart must be a whole number - not "$bed->[ chromStart ]") );
+ }
+
+ if ( $bed->[ chromEnd ] =~ /\D/ ) {
+ Maasha::Common::error( qq(Bad BED entry - chromEnd must be a whole number - not "$bed->[ chromEnd ]") );
+ }
+
+ if ( $bed->[ chromEnd ] < $bed->[ chromStart ] ) {
+ Maasha::Common::error( qq(Bad BED entry - chromEnd must be greater than chromStart - not "$bed->[ chromStart ] > $bed->[ chromEnd ]") );
+ }
+
+ return if @{ $bed } == 3;
+
+ if ( $bed->[ name ] =~ /\s/ ) {
+ Maasha::Common::error( qq(Bad BED entry - no white space allowed in name field: "$bed->[ name ]") );
+ }
+
+ return if @{ $bed } == 4;
+
+ if ( $bed->[ score ] =~ /\D/ ) {
+ Maasha::Common::error( qq(Bad BED entry - score must be a whole number - not "$bed->[ score ]") );
+ }
+
+ if ( $bed->[ score ] < 0 or $bed->[ score ] > 1000 ) {
+ Maasha::Common::error( qq(Bad BED entry - score must be between 0 and 1000 - not "$bed->[ score ]") );
+ }
+
+ return if @{ $bed } == 5;
+
+ if ( $bed->[ strand ] ne '+' and $bed->[ strand ] ne '-' ) {
+ Maasha::Common::error( qq(Bad BED entry - strand must be + or - not "$bed->[ strand ]") );
+ }
+
+ return if @{ $bed } == 6;
+
+ if ( $bed->[ thickStart ] =~ /\D/ ) {
+ Maasha::Common::error( qq(Bad BED entry - thickStart must be a whole number - not "$bed->[ thickStart ]") );
+ }
+
+ if ( $bed->[ thickEnd ] =~ /\D/ ) {
+ Maasha::Common::error( qq(Bad BED entry - thickEnd must be a whole number - not "$bed->[ thickEnd ]") );
+ }
+
+ if ( $bed->[ thickEnd ] < $bed->[ thickStart ] ) {
+ Maasha::Common::error( qq(Bad BED entry - thickEnd must be greater than thickStart - not "$bed->[ thickStart ] > $bed->[ thickEnd ]") );
+ }
+
+ if ( $bed->[ thickStart ] < $bed->[ chromStart ] ) {
+ Maasha::Common::error( qq(Bad BED entry - thickStart must be greater than chromStart - not "$bed->[ thickStart ] < $bed->[ chromStart ]") );
+ }
+
+ if ( $bed->[ thickStart ] > $bed->[ chromEnd ] ) {
+ Maasha::Common::error( qq(Bad BED entry - thickStart must be less than chromEnd - not "$bed->[ thickStart ] > $bed->[ chromEnd ]") );
+ }
+
+ if ( $bed->[ thickEnd ] < $bed->[ chromStart ] ) {
+ Maasha::Common::error( qq(Bad BED entry - thickEnd must be greater than chromStart - not "$bed->[ thickEnd ] < $bed->[ chromStart ]") );
+ }
+
+ if ( $bed->[ thickEnd ] > $bed->[ chromEnd ] ) {
+ Maasha::Common::error( qq(Bad BED entry - thickEnd must be less than chromEnd - not "$bed->[ thickEnd ] > $bed->[ chromEnd ]") );
+ }
+
+ return if @{ $bed } == 8;
+
+ if ( $bed->[ itemRgb ] !~ /^(0|\d{1,3},\d{1,3},\d{1,3},?)$/ ) {
+ Maasha::Common::error( qq(Bad BED entry - itemRgb must be 0 or in the form of 255,0,0 - not "$bed->[ itemRgb ]") );
+ }
+
+ return if @{ $bed } == 9;
+
+ if ( $bed->[ blockCount ] =~ /\D/ ) {
+ Maasha::Common::error( qq(Bad BED entry - blockCount must be a whole number - not "$bed->[ blockCount ]") );
+ }
+
+ @block_sizes = split ",", $bed->[ blockSizes ];
+
+ if ( grep /\D/, @block_sizes ) {
+ Maasha::Common::error( qq(Bad BED entry - blockSizes must be whole numbers - not "$bed->[ blockSizes ]") );
+ }
+
+ if ( $bed->[ blockCount ] != scalar @block_sizes ) {
+ Maasha::Common::error( qq(Bad BED entry - blockSizes "$bed->[ blockSizes ]" must match blockCount "$bed->[ blockCount ]") );
+ }
+
+ @block_starts = split ",", $bed->[ blockStarts ];
+
+ if ( grep /\D/, @block_starts ) {
+ Maasha::Common::error( qq(Bad BED entry - blockStarts must be whole numbers - not "$bed->[ blockStarts ]") );
+ }
+
+ if ( $bed->[ blockCount ] != scalar @block_starts ) {
+ Maasha::Common::error( qq(Bad BED entry - blockStarts "$bed->[ blockStarts ]" must match blockCount "$bed->[ blockCount ]") );
+ }
+
+ if ( $bed->[ chromStart ] + $block_starts[ -1 ] + $block_sizes[ -1 ] != $bed->[ chromEnd ] ) {
+ Maasha::Common::error( qq(Bad BED entry - chromStart + blockStarts[last] + blockSizes[last] must equal chromEnd: ) .
+ qq($bed->[ chromStart ] + $block_starts[ -1 ] + $block_sizes[ -1 ] != $bed->[ chromEnd ]) );
+ }
+}
+
+
+sub bed_sort
+{
+ # Martin A. hansen, November 2008.
+
+ # Sorts a given BED file according to a given
+ # sorting scheme:
+ # 1: chr AND chr_beg.
+ # 2: chr AND strand AND chr_beg.
+ # 3: chr_beg.
+ # 4: strand AND chr_beg.
+
+ # Currently 'bed_sort' is used for sorting = a standalone c program
+ # that uses qsort (unstable sort).
+
+ my ( $file_in, # path to BED file.
+ $scheme, # sort scheme
+ $cols, # number of columns in BED file
+ ) = @_;
+
+ # Returns nothing.
+
+ my ( $file_out );
+
+ $file_out = "$file_in.sort";
+
+ Maasha::Common::run( "bed_sort", "--sort $scheme --cols $cols $file_in > $file_out" );
+
+ if ( Maasha::Filesys::file_size( $file_in ) != Maasha::Filesys::file_size( $file_out ) ) {
+ Maasha::Common::error( qq(bed_sort of file "$file_in" failed) );
+ }
+
+ rename $file_out, $file_in;
+}
+
+
+sub bed_file_split_on_chr
+{
+ # Martin A. Hansen, November 2008.
+
+ # Given a path to a BED file splits
+ # this file into one file per chromosome
+ # in a temporary directory. Returns a hash
+ # with chromosome name as key and the corresponding
+ # file as value.
+
+ my ( $file, # path to BED file
+ $dir, # working directory
+ $cols, # number of BED columns to read - OPTIONAL
+ ) = @_;
+
+ # Returns a hashref
+
+ my ( $fh_in, $fh_out, $entry, %fh_hash, %file_hash );
+
+ $fh_in = Maasha::Filesys::file_read_open( $file );
+
+ while ( $entry = bed_entry_get( $fh_in, $cols ) )
+ {
+ if ( not exists $file_hash{ $entry->[ chrom ] } )
+ {
+ $fh_hash{ $entry->[ chrom ] } = Maasha::Filesys::file_write_open( "$dir/$entry->[ chrom ]" );
+ $file_hash{ $entry->[ chrom ] } = "$dir/$entry->[ chrom ]";
+ }
+
+ $fh_out = $fh_hash{ $entry->[ chrom ] };
+
+ Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $cols );
+ }
+
+ map { close $_ } keys %fh_hash;
+
+ return wantarray ? %file_hash : \%file_hash;
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> BIOPIECES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+sub bed2biopiece
+{
+ # Martin A. Hansen, November 2008.
+
+ # Converts a BED entry given as an arrayref
+ # to a Biopiece record which is returned as
+ # a hashref.
+
+ # IMPORTANT! The BED_END and THICK_END positions
+ # will be the exact position in contrary to the
+ # UCSC scheme.
+
+ my ( $bed_entry, # BED entry as arrayref
+ ) = @_;
+
+ # Returns a hashref
+
+ my ( $cols, %bp_record );
+
+ $cols = scalar @{ $bed_entry };
+
+ if ( not defined $bed_entry->[ chrom ] and
+ not defined $bed_entry->[ chromStart ] and
+ not defined $bed_entry->[ chromEnd ] )
+ {
+ return 0;
+ }
+
+ $bp_record{ "REC_TYPE" } = "BED";
+ $bp_record{ "BED_COLS" } = $cols;
+ $bp_record{ "CHR" } = $bed_entry->[ chrom ];
+ $bp_record{ "CHR_BEG" } = $bed_entry->[ chromStart ];
+ $bp_record{ "CHR_END" } = $bed_entry->[ chromEnd ] - 1;
+ $bp_record{ "BED_LEN" } = $bed_entry->[ chromEnd ] - $bed_entry->[ chromEnd ];
+
+ return wantarray ? %bp_record : \%bp_record if $cols == 3;
+
+ $bp_record{ "Q_ID" } = $bed_entry->[ name ];
+
+ return wantarray ? %bp_record : \%bp_record if $cols == 4;
+
+ $bp_record{ "SCORE" } = $bed_entry->[ score ];
+
+ return wantarray ? %bp_record : \%bp_record if $cols == 5;
+
+ $bp_record{ "STRAND" } = $bed_entry->[ strand ];
+
+ return wantarray ? %bp_record : \%bp_record if $cols == 6;
+
+ $bp_record{ "THICK_BEG" } = $bed_entry->[ thickStart ];
+ $bp_record{ "THICK_END" } = $bed_entry->[ thickEnd ] - 1;
+
+ return wantarray ? %bp_record : \%bp_record if $cols == 8;
+
+ $bp_record{ "COLOR" } = $bed_entry->[ itemRgb ];
+
+ return wantarray ? %bp_record : \%bp_record if $cols == 9;
+
+ $bp_record{ "BLOCK_COUNT" } = $bed_entry->[ blockCount ];
+ $bp_record{ "BLOCK_LENS" } = $bed_entry->[ blockSizes ];
+ $bp_record{ "Q_BEGS" } = $bed_entry->[ blockStarts ];
+
+ return wantarray ? %bp_record : \%bp_record;
+}
+
+
+sub biopiece2bed
+{
+ # Martin A. Hansen, November 2008.
+
+ # Converts a Biopieces record given as a hashref
+ # to a BED record which is returned as
+ # an arrayref. As much as possible of the Biopiece
+ # record is converted and undef is returned if
+ # convertion failed.
+
+ # IMPORTANT! The BED_END and THICK_END positions
+ # will be the inexact position used in the
+ # UCSC scheme.
+
+ my ( $bp_record, # hashref
+ $cols, # number of columns in BED entry - OPTIONAL (but faster)
+ ) = @_;
+
+ # Returns arrayref.
+
+ my ( @bed_entry );
+
+ $cols ||= 12; # max number of columns possible
+
+ $bed_entry[ chrom ] = $bp_record->{ "CHR" } ||
+ $bp_record->{ "S_ID" } ||
+ return undef;
+
+ $bed_entry[ chromStart ] = $bp_record->{ "CHR_BEG" } ||
+ $bp_record->{ "S_BEG" } ||
+ return undef;
+
+ $bed_entry[ chromEnd ] = $bp_record->{ "CHR_END" } ||
+ $bp_record->{ "S_END" } ||
+ return undef;
+
+ $bed_entry[ chromEnd ]++;
+
+ return wantarray ? @bed_entry : \@bed_entry if $cols == 3;
+
+ $bed_entry[ name ] = $bp_record->{ "Q_ID" } || return wantarray ? @bed_entry : \@bed_entry;
+
+ return wantarray ? @bed_entry : \@bed_entry if $cols == 4;
+
+ if ( exists $bp_record->{ "SCORE" } ) {
+ $bed_entry[ score ] = $bp_record->{ "SCORE" };
+ } else {
+ return wantarray ? @bed_entry : \@bed_entry;
+ }
+
+ return wantarray ? @bed_entry : \@bed_entry if $cols == 5;
+
+ if ( exists $bp_record->{ "STRAND" } ) {
+ $bed_entry[ strand ] = $bp_record->{ "STRAND" };
+ } else {
+ return wantarray ? @bed_entry : \@bed_entry;
+ }
+
+ return wantarray ? @bed_entry : \@bed_entry if $cols == 6;
+
+ if ( defined $bp_record->{ "THICK_BEG" } and
+ defined $bp_record->{ "THICK_END" } )
+ {
+ $bed_entry[ thickStart ] = $bp_record->{ "THICK_BEG" };
+ $bed_entry[ thickEnd ] = $bp_record->{ "THICK_END" };
+ }
+
+ return wantarray ? @bed_entry : \@bed_entry if $cols == 8;
+
+ if ( exists $bp_record->{ "COLOR" } ) {
+ $bed_entry[ itemRgb ] = $bp_record->{ "COLOR" };
+ } else {
+ return wantarray ? @bed_entry : \@bed_entry;
+ }
+
+ return wantarray ? @bed_entry : \@bed_entry if $cols == 9;
+
+ if ( defined $bp_record->{ "BLOCK_COUNT" } and
+ defined $bp_record->{ "BLOCK_LENS" } and
+ defined $bp_record->{ "Q_BEGS" } )
+ {
+ $bed_entry[ blockCount ] = $bp_record->{ "BLOCK_COUNT" };
+ $bed_entry[ blockSizes ] = $bp_record->{ "BLOCK_LENS" };
+ $bed_entry[ blockStarts ] = $bp_record->{ "Q_BEGS" };
+ $bed_entry[ thickEnd ]++;
+ }
+
+ return wantarray ? @bed_entry : \@bed_entry;
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> TAG CONTIGS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+sub tag_contigs_assemble
+{
+ # Martin A. Hansen, November 2008.
+
+ # Returns a path to a BED file with tab contigs.
+
+ my ( $bed_file, # path to BED file
+ $chr, # chromosome
+ $dir, # working directory
+ ) = @_;
+
+ # Returns a string
+
+ my ( $fh_in, $fh_out, $array, $new_file, $pos, $id, $beg, $end, $score, $entry );
+
+ $fh_in = Maasha::Filesys::file_read_open( $bed_file );
+
+ $array = tag_contigs_assemble_array( $fh_in );
+
+ close $fh_in;
+
+ $new_file = "$bed_file.tag_contigs";
+
+ $fh_out = Maasha::Filesys::file_write_open( $new_file );
+
+ $pos = 0;
+ $id = 0;
+
+ while ( ( $beg, $end, $score ) = tag_contigs_scan( $array, $pos ) and $beg )
+ {
+ $entry->[ chrom ] = $chr;
+ $entry->[ chromStart ] = $beg;
+ $entry->[ chromEnd ] = $end;
+ $entry->[ name ] = sprintf( "TC%06d", $id );
+ $entry->[ score ] = $score;
+
+ bed_entry_put( $entry, $fh_out );
+
+ $pos = $end;
+ $id++;
+ }
+
+ close $fh_out;
+
+ return $new_file;
+}
+
+
+sub tag_contigs_assemble_array
+{
+ # Martin A. Hansen, November 2008.
+
+ # Given a BED file with entries from only one
+ # chromosome assembles tag contigs from these
+ # ignoring strand information. Only tags with
+ # a score higher than the clone count over
+ # genomic loci (the score field) is included
+ # in the tag contigs.
+
+ # ----------- tags
+ # -------------
+ # ----------
+ # ----------
+ # ======================== tag contig
+
+
+ my ( $fh, # file handle to BED file
+ ) = @_;
+
+ # Returns an arrayref.
+
+ my ( $entry, $clones, $score, @array );
+
+ while ( $entry = bed_entry_get( $fh ) )
+ {
+ if ( $entry->[ name ] =~ /_(\d+)$/ )
+ {
+ $clones = $1;
+
+ if ( $entry->[ score ] )
+ {
+ $score = int( $clones / $entry->[ score ] );
+
+ map { $array[ $_ ] += $score } $entry->[ chromStart ] .. $entry->[ chromEnd ] - 1 if $score >= 1;
+ }
+ }
+ }
+
+ return wantarray ? @array : \@array;
+}
+
+
+sub tag_contigs_scan
+{
+ # Martin A. Hansen, November 2008.
+
+ # Scans an array with tag contigs and locates
+ # the next contig from a given position. The
+ # score of the tag contig is determined as the
+ # maximum value of the tag contig. If a tag contig
+ # is found a triple is returned with beg, end and score
+ # otherwise an empty list is returned.
+
+ my ( $array, # array to scan
+ $beg, # position to start scanning from
+ ) = @_;
+
+ # Returns an arrayref.
+
+ my ( $end, $score );
+
+ $score = 0;
+
+ while ( $beg < scalar @{ $array } and not $array->[ $beg ] ) { $beg++ }
+
+ $end = $beg;
+
+ while ( $array->[ $end ] )
+ {
+ $score = Maasha::Calc::max( $score, $array->[ $end ] );
+
+ $end++
+ }
+
+ if ( $score > 0 ) {
+ return wantarray ? ( $beg, $end, $score ) : [ $beg, $end, $score ];
+ } else {
+ return wantarray ? () : [];
+ }
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+1;
+
+
+__END__
--- /dev/null
+package Maasha::UCSC::PSL;
+
+# Copyright (C) 2007-2008 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+# Routines for interaction with PSL entries and files.
+
+# Read more about the PSL format here: http://genome.ucsc.edu/goldenPath/help/hgTracksHelp.html#PSL
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+use strict;
+
+use Data::Dumper;
+use Maasha::Common;
+use Maasha::Filesys;
+
+use vars qw( @ISA @EXPORT_OK );
+
+require Exporter;
+
+@ISA = qw( Exporter );
+
+@EXPORT_OK = qw(
+);
+
+use constant {
+ matches => 0, # PSL field names
+ misMatches => 1,
+ repMatches => 2,
+ nCount => 3,
+ qNumInsert => 4,
+ qBaseInsert => 5,
+ tNumInsert => 6,
+ tBaseInsert => 7,
+ strand => 8,
+ qName => 9,
+ qSize => 10,
+ qStart => 11,
+ qEnd => 12,
+ tName => 13,
+ tSize => 14,
+ tStart => 15,
+ tEnd => 16,
+ blockCount => 17,
+ blockSizes => 18,
+ qStarts => 19,
+ tStarts => 20,
+ MATCHES => 0, # Biopieces field names
+ MISMATCHES => 1,
+ REPMATCHES => 2,
+ NCOUNT => 3,
+ QNUMINSERT => 4,
+ QBASEINSERT => 5,
+ SNUMINSERT => 6,
+ SBASEINSERT => 7,
+ STRAND => 8,
+ Q_ID => 9,
+ Q_LEN => 10,
+ Q_BEG => 11,
+ Q_END => 12,
+ S_ID => 13,
+ S_LEN => 14,
+ S_BEG => 15,
+ S_END => 16,
+ BLOCKCOUNT => 17,
+ BLOCKSIZES => 18,
+ Q_BEGS => 19,
+ S_BEGS => 20,
+};
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PSL format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+sub psl_entry_get
+{
+ # Martin A. Hansen, August 2008.
+
+ # Reads PSL next entry from a PSL file and returns an array ref.
+
+ my ( $fh, # file handle of PSL filefull path to PSL file
+ ) = @_;
+
+ # Returns hashref.
+
+ my ( $line, @fields );
+
+ while ( $line = <$fh> )
+ {
+ chomp $line;
+
+ @fields = split "\t", $line;
+
+ return wantarray ? @fields : \@fields if scalar @fields == 21;
+ }
+
+ return undef;
+}
+
+
+sub psl2record
+{
+ # Martin A. Hansen, November 2008
+
+ # Converts PSL record keys to Biopiece record keys.
+
+ my ( $psl, # hashref
+ ) = @_;
+
+ # returns hashref
+
+ my %record;
+
+ %record = (
+ REC_TYPE => "PSL",
+ BLOCKSIZES => $psl->{ 'blockSize' },
+ SNUMINSERT => $psl->{ 'tNumInsert' },
+ Q_END => $psl->{ 'qEnd' },
+ SBASEINSERT => $psl->{ 'tBaseInsert' },
+ S_END => $psl->{ 'tEnd' },
+ QBASEINSERT => $psl->{ 'qBaseInsert' },
+ REPMATCHES => $psl->{ 'repMatches' },
+ QNUMINSERT => $psl->{ 'qNumInsert' },
+ MISMATCHES => $psl->{ 'misMatches' },
+ BLOCKCOUNT => $psl->{ 'blockCount' },
+ Q_LEN => $psl->{ 'qSize' },
+ S_ID => $psl->{ 'tName' },
+ STRAND => $psl->{ 'strand' },
+ Q_ID => $psl->{ 'qName' },
+ MATCHES => $psl->{ 'matches' },
+ S_LEN => $psl->{ 'tSize' },
+ NCOUNT => $psl->{ 'nCount' },
+ Q_BEGS => $psl->{ 'qStarts' },
+ S_BEGS => $psl->{ 'tStarts' },
+ S_BEG => $psl->{ 'tStart' },
+ Q_BEG => $psl->{ 'qStart ' },
+ );
+
+ $record{ "SCORE" } = $record{ "MATCHES" } + int( $record{ "REPMATCHES" } / 2 ) - $record{ "MISMATCHES" } - $record{ "QNUMINSERT" } - $record{ "SNUMINSERT" };
+ $record{ "SPAN" } = $record{ "Q_END" } - $record{ "Q_BEG" };
+
+ return wantarray ? %record : \%record;
+}
+
+
+sub psl_calc_score
+{
+ # Martin A. Hansen, November 2008.
+
+ # Calculates the score for a PSL entry according to:
+ # http://genome.ucsc.edu/FAQ/FAQblat#blat4
+
+ my ( $psl_entry, # array ref
+ ) = @_;
+
+ # Returns a float
+
+ my ( $score );
+
+ $score = $psl_entry[ MATCHES ] +
+ int( $psl_entry[ REPMATCHES ] / 2 ) -
+ $psl_entry[ MISMATCHES ] -
+ $psl_entry[ QNUMINSERT ] -
+ $psl_entry[ SNUMINSERT ];
+
+ return $score;
+}
+
+
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+1;
+
+
+__END__
--- /dev/null
+package Maasha::UCSC::Wiggle;
+
+# Copyright (C) 2007-2008 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+# Routines for manipulation of wiggle data.
+
+# http://genome.ucsc.edu/goldenPath/help/wiggle.html
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+use strict;
+
+use Data::Dumper;
+use Maasha::Common;
+use Maasha::Filesys;
+
+use vars qw( @ISA @EXPORT_OK );
+
+require Exporter;
+
+@ISA = qw( Exporter );
+
+@EXPORT_OK = qw(
+);
+
+
+use constant {
+ chrom => 0, # BED field names
+ chromStart => 1,
+ chromEnd => 2,
+ name => 3,
+ score => 4,
+ strand => 5,
+ thickStart => 6,
+ thickEnd => 7,
+ itemRgb => 8,
+ blockCount => 9,
+ blockSizes => 10,
+ blockStarts => 11,
+};
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> FixedStep format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+sub fixedstep_entry_get
+{
+ # Martin A. Hansen, December 2007.
+
+ # Given a file handle to a PhastCons file get the
+ # next entry which is all the lines after a "fixedStep"
+ # line and until the next "fixedStep" line or EOF.
+
+ my ( $fh, # filehandle
+ ) = @_;
+
+ # Returns a list of lines
+
+ my ( $entry, @lines );
+
+ local $/ = "\nfixedStep ";
+
+ $entry = <$fh>;
+
+ chomp $entry;
+
+ @lines = split "\n", $entry;
+
+ return if @lines == 0;
+
+ $lines[ 0 ] =~ s/fixedStep?\s*//;
+ $lines[ 0 ] = 'fixedStep ' . $lines[ 0 ];
+
+ return wantarray ? @lines : \@lines;
+}
+
+
+sub fixedstep_entry_put
+{
+ # Martin A. Hansen, April 2008.
+
+ # Outputs a block of fixedStep values.
+ # Used for outputting wiggle data.
+
+ my ( $chr, # chromosome
+ $beg, # start position
+ $block, # list of scores
+ $fh, # filehandle - OPTIONAL
+ ) = @_;
+
+ # Returns nothing.
+
+ $beg += 1; # fixedStep format is 1 based.
+
+ $fh ||= \*STDOUT;
+
+ print $fh "fixedStep chrom=$chr start=$beg step=1\n";
+
+ map { print $fh "$_\n" } @{ $block };
+}
+
+
+sub fixedstep2biopiece
+{
+ # Martin A. Hansen, November 2008.
+
+ # Converts a fixedstep entry to a Biopiece record.
+
+ my ( $entry, # fixedstep entry arrayref
+ ) = @_;
+
+ # Returns a hashref
+
+ my ( $head, %record );
+
+ $head = shift @{ $entry };
+
+ if ( $head =~ /^fixedStep chrom=([^ ]+) start=(\d+) step=(\d+)$/ )
+ {
+ $record{ "REC_TYPE" } = "fixed_step";
+ $record{ "CHR" } = $1;
+ $record{ "CHR_BEG" } = $2;
+ $record{ "STEP" } = $3;
+ $record{ "VALS" } = join ";", @{ $entry };
+ }
+ else
+ {
+ Maasha::Common::error( qq(could not convert fixedStep entry to Biopiece record) );
+ }
+
+ return wantarray ? %record : \%record;
+}
+
+
+sub biopiece2fixedstep
+{
+ # Martin A. Hansen, November 2008.
+
+ # Converts a Biopiece record to a fixedStep entry.
+
+ my ( $record, # Biopiece record
+ ) = @_;
+
+ # Returns a list
+
+ my ( @entry, $vals );
+
+ if ( exists $record->{ "REC_TYPE" } and $record->{ "REC_TYPE" } eq 'fixed_step' )
+ {
+ if ( exists $record->{ 'CHR' } and exists $record->{ 'CHR_BEG' } and exists $record->{ 'STEP' } )
+ {
+ push @entry, "fixedStep chrom=$record->{ 'CHR' } start=$record->{ 'CHR_BEG' } step=$record->{ 'STEP' }";
+
+ $vals = $record->{ 'VALS' };
+
+ map { push @entry, $_ } split ";", $vals;
+ }
+ else
+ {
+ Maasha::Common::error( qq(could not convert Biopiece record to fixedStep) );
+ }
+ }
+
+ return wantarray ? @entry : \@entry;
+}
+
+
+sub fixedstep_calc
+{
+ # Martin A. Hansen, November 2008.
+
+ # Calculates fixedstep entries for a given BED file.
+ # Implemented using large Perl arrays to avoid sorting.
+ # Returns path to the fixedstep file.
+
+ my ( $bed_file, # path to BED file
+ $chr, # chromosome name
+ $use_score, # flag indicating that the score field should be used - OPTIONAL
+ ) = @_;
+
+ # Returns a string
+
+ my ( $fixedstep_file, $fh_in, $fh_out, $array, $beg, $end, @block );
+
+ $fixedstep_file = "$bed_file.fixedstep";
+
+ $fh_in = Maasha::Filesys::file_read_open( $bed_file );
+
+ $array = fixedstep_calc_array( $fh_in, $use_score );
+
+ close $fh_in;
+
+ $fh_out = Maasha::Filesys::file_write_open( $fixedstep_file );
+
+ $beg = 0;
+
+ while ( ( $beg, $end ) = fixedstep_scan( $array, $beg ) and $beg )
+ {
+ @block = @{ $array }[ $beg .. $end ];
+
+ fixedstep_entry_put( $chr, $beg, \@block, $fh_out );
+
+ $beg = $end;
+ }
+
+ close $fh_out;
+
+ return $fixedstep_file;
+}
+
+
+sub fixedstep_calc_array
+{
+ # Martin A. Hansen, November 2008.
+
+ # Given a filehandle to a BED file for a single
+ # chromosome fills an array with scores based on
+ # clone count (or sequence read count) for the
+ # begin position through the end position of each
+ # BED entry.
+
+ my ( $fh_in, # filehandle to BED file
+ $use_score, # flag indicating that the score field should be used - OPTIONAL
+ ) = @_;
+
+ # Returns arrayref.
+
+ my ( $bed, $clones, @array );
+
+ while ( $bed = Maasha::UCSC::BED::bed_entry_get( $fh_in, 5 ) )
+ {
+ if ( $use_score ) {
+ $clones = $bed->[ score ];
+ } elsif ( $bed->[ name ] =~ /_(\d+)$/ ) {
+ $clones = $1;
+ } else {
+ $clones = 1;
+ }
+
+ map { $array[ $_ ] += $clones } $bed->[ chromStart ] .. $bed->[ chromEnd ] - 1;
+ }
+
+ return wantarray ? @array : \@array;
+}
+
+
+sub fixedstep_scan
+{
+ # Martin A. Hansen, November 2008.
+
+ # Given a fixedstep array locates the next block of
+ # non-zero elements from a given begin position.
+ # A [ begin end ] tuple of the block posittion is returned
+ # if a block was found otherwise and empty tuple is returned.
+
+ my ( $array, # array to scan
+ $beg, # position to start scanning from
+ ) = @_;
+
+ # Returns an arrayref.
+
+ my ( $end );
+
+ while ( $beg < scalar @{ $array } and not $array->[ $beg ] ) { $beg++ }
+
+ $end = $beg;
+
+ while ( $array->[ $end ] ) { $end++ }
+
+ if ( $end > $beg ) {
+ return wantarray ? ( $beg, $end ) : [ $beg, $end ];
+ } else {
+ return wantarray ? () : [];
+ }
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+1;
+
+
+__END__