From 430634c816f774703a59c0122006daf70ee570f8 Mon Sep 17 00:00:00 2001 From: martinahansen Date: Mon, 24 Nov 2008 00:13:00 +0000 Subject: [PATCH] major code upgrade 1 git-svn-id: http://biopieces.googlecode.com/svn/trunk@319 74ccb610-7750-0410-82ae-013aeee3265d --- code_perl/Maasha/Biopieces.pm | 506 ++++++----------------- code_perl/Maasha/Fasta.pm | 26 ++ code_perl/Maasha/Filesys.pm | 190 +++++++++ code_perl/Maasha/UCSC.pm | 134 ++----- code_perl/Maasha/UCSC/BED.pm | 686 ++++++++++++++++++++++++++++++++ code_perl/Maasha/UCSC/PSL.pm | 200 ++++++++++ code_perl/Maasha/UCSC/Wiggle.pm | 306 ++++++++++++++ 7 files changed, 1552 insertions(+), 496 deletions(-) create mode 100644 code_perl/Maasha/Filesys.pm create mode 100644 code_perl/Maasha/UCSC/BED.pm create mode 100644 code_perl/Maasha/UCSC/PSL.pm create mode 100644 code_perl/Maasha/UCSC/Wiggle.pm diff --git a/code_perl/Maasha/Biopieces.pm b/code_perl/Maasha/Biopieces.pm index 4d6ec7e..ef96559 100644 --- a/code_perl/Maasha/Biopieces.pm +++ b/code_perl/Maasha/Biopieces.pm @@ -36,6 +36,7 @@ use Time::HiRes qw( gettimeofday ); use Storable qw( dclone ); use Maasha::Config; use Maasha::Common; +use Maasha::Filesys; use Maasha::Fasta; use Maasha::Align; use Maasha::Matrix; @@ -47,6 +48,8 @@ use Maasha::Patscan; use Maasha::Plot; use Maasha::Calc; use Maasha::UCSC; +use Maasha::UCSC::BED; +use Maasha::UCSC::Wiggle; use Maasha::NCBI; use Maasha::GFF; use Maasha::TwoBit; @@ -87,8 +90,8 @@ $SIG{ 'TERM' } = \&sig_handler; my ( $script, $BP_TMP ); -$script = Maasha::Common::get_scriptname(); -$BP_TMP = Maasha::Common::get_tmpdir(); +$script = Maasha::Common::get_scriptname(); +$BP_TMP = Maasha::Common::get_tmpdir(); # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> LOG <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< @@ -323,6 +326,7 @@ sub get_options { @options = qw( data_in|i=s + cols|c=s num|n=s ); } @@ -1437,7 +1441,9 @@ sub script_read_bed # Returns nothing. - my ( $file, $record, $entry, $data_in, $num ); + my ( $cols, $file, $record, $bed_entry, $data_in, $num ); + + $cols = $options->{ 'cols' }->[ 0 ]; while ( $record = get_record( $in ) ) { put_record( $record, $out ); @@ -1449,9 +1455,11 @@ sub script_read_bed { $data_in = Maasha::Common::read_open( $file ); - while ( $entry = Maasha::UCSC::bed_get_entry( $data_in ) ) + while ( $bed_entry = Maasha::UCSC::BED::bed_entry_get( $data_in, $cols ) ) { - put_record( $entry, $out ); + $record = Maasha::UCSC::BED::bed2biopiece( $bed_entry ); + + put_record( $record, $out ); goto NUM if $options->{ "num" } and $num == $options->{ "num" }; @@ -1480,7 +1488,7 @@ sub script_read_fixedstep # Returns nothing. - my ( $file, $record, $entry, $head, $chr, $chr_beg, $step, $data_in, $num ); + my ( $file, $record, $entry, $data_in, $num ); while ( $record = get_record( $in ) ) { put_record( $record, $out ); @@ -1492,18 +1500,9 @@ sub script_read_fixedstep { $data_in = Maasha::Common::read_open( $file ); - while ( $entry = Maasha::UCSC::fixedstep_get_entry( $data_in ) ) + while ( $entry = Maasha::UCSC::Wiggle::fixedstep_entry_get( $data_in ) ) { - $head = shift @{ $entry }; - - if ( $head =~ /^chrom=([^ ]+) start=(\d+) step=(\d+)$/ ) - { - $record->{ "REC_TYPE" } = "fixed_step"; - $record->{ "CHR" } = $1; - $record->{ "CHR_BEG" } = $2; - $record->{ "STEP" } = $3; - $record->{ "VALS" } = join ";", @{ $entry }; - } + $record = Maasha::UCSC::Wiggle::fixedstep2biopiece( $entry ); put_record( $record, $out ); @@ -2146,7 +2145,7 @@ sub script_read_ucsc_config { $data_in = Maasha::Common::read_open( $file ); - while ( $record = Maasha::UCSC::ucsc_config_get_entry( $data_in ) ) + while ( $record = Maasha::UCSC::ucsc_config_entry_get( $data_in ) ) { $record->{ 'REC_TYPE' } = "UCSC Config"; @@ -2173,6 +2172,10 @@ sub script_assemble_tag_contigs # Assemble tags from the stream into # tag contigs. + # The current implementation is quite + # slow because of heavy use of temporary + # files. + my ( $in, # handle to in stream $out, # handle to out stream $options, # options hash @@ -2180,136 +2183,43 @@ sub script_assemble_tag_contigs # Returns nothing. - my ( $record, $new_record, %fh_hash, $fh_out, $chr, $array, $pos, $beg, $end, $score, $file, $id ); + my ( $bed_file, $fh_in, $fh_out, $cols, $record, $bed_entry, $file_hash, $chr, + $array, $pos, $id, $beg, $end, $score, $new_record ); + + $bed_file = "$BP_TMP/assemble_tag_contigs.bed"; + $fh_out = Maasha::Filesys::file_write_open( $bed_file ); + $cols = 5; # we only need the first 5 BED columns while ( $record = get_record( $in ) ) { - $record->{ "CHR" } = $record->{ "S_ID" } if not defined $record->{ "CHR" }; - $record->{ "CHR_BEG" } = $record->{ "S_BEG" } if not defined $record->{ "CHR_BEG" }; - $record->{ "CHR_END" } = $record->{ "S_END" } if not defined $record->{ "CHR_END" }; - - if ( $record->{ "CHR" } and defined $record->{ "CHR_BEG" } and $record->{ "CHR_END" } ) - { - $fh_hash{ $record->{ "CHR" } } = Maasha::Common::write_open( "$BP_TMP/$record->{ 'CHR' }" ) if not exists $fh_hash{ $record->{ "CHR" } }; - - $fh_out = $fh_hash{ $record->{ "CHR" } }; - - Maasha::UCSC::bed_put_entry( $record, $fh_out, 5 ); - } - } - - map { close $_ } keys %fh_hash; - - foreach $chr ( sort keys %fh_hash ) - { - $array = tag_contigs_assemble( "$BP_TMP/$chr" ); - - $pos = 0; - $id = 0; - - while ( ( $beg, $end, $score ) = tag_contigs_scan( $array, $pos ) and $beg ) - { - $new_record = { - CHR => $chr, - CHR_BEG => $beg, - CHR_END => $end - 1, - Q_ID => sprintf( "TC%06d", $id ), - SCORE => $score, - STRAND => $record->{ 'STRAND' } || '+', - }; - - put_record( $new_record, $out ); - - $pos = $end; - $id++; + if ( $bed_entry = Maasha::UCSC::BED::biopiece2bed( $record, $cols ) ) { + Maasha::UCSC::BED::bed_entry_put( $bed_entry, $fh_out, $cols ); } - - unlink "$BP_TMP/$chr"; } -} - - -sub tag_contigs_assemble -{ - # Martin A. Hansen, November 2008. - - # Given a BED file with entries from only one - # chromosome assembles tag contigs from these - # ignoring strand information. Only tags with - # a score higher than the clone count over - # genomic loci (the SCORE field) is included - # in the tag contigs. - - # ----------- tags - # ------------- - # ---------- - # ---------- - # ======================== tag contig + close $fh_out; - my ( $path, # full path to BED file - ) = @_; + $file_hash = Maasha::UCSC::BED::bed_file_split_on_chr( $bed_file ); - # Returns an arrayref. + unlink $bed_file; - my ( $fh, $entry, $clones, $score, @array ); + foreach $chr ( sort keys %{ $file_hash } ) + { + $bed_file = Maasha::UCSC::BED::tag_contigs_assemble( $file_hash->{ $chr }, $chr, $BP_TMP ); - $fh = Maasha::Common::read_open( $path ); + $fh_in = Maasha::Filesys::file_read_open( $bed_file ); - while ( $entry = Maasha::UCSC::bed_get_entry( $fh ) ) - { - if ( $entry->{ 'Q_ID' } =~ /(\d+)$/ ) + while ( $bed_entry = Maasha::UCSC::BED::bed_entry_get( $fh_in ) ) { - $clones = $1; - - $score = int( $clones / $entry->{ 'SCORE' } ); - - map { $array[ $_ ] += $score } $entry->{ 'CHR_BEG' } .. $entry->{ 'CHR_END' } if $score >= 1; + if ( $record = Maasha::UCSC::BED::bed2biopiece( $bed_entry ) ) { + put_record( $record, $out ); + } } - } - close $fh; - - return wantarray ? @array : \@array; -} - - -sub tag_contigs_scan -{ - # Martin A. Hansen, November 2008. - - # Scans an array with tag contigs and locates - # the next contig from a given position. The - # score of the tag contig is determined as the - # maximum value of the tag contig. If a tag contig - # is found a triple is returned with beg, end and score - # otherwise an empty triple is returned. - - my ( $array, # array to scan - $beg, # position to start scanning from - ) = @_; - - # Returns an arrayref. - - my ( $end, $score ); - - $score = 0; - - while ( $beg < scalar @{ $array } and not $array->[ $beg ] ) { $beg++ } - - $end = $beg; - - while ( $array->[ $end ] ) - { - $score = Maasha::Calc::max( $score, $array->[ $end ] ); - - $end++ - } + close $fh_in; - if ( $score > 0 ) { - return wantarray ? ( $beg, $end, $score ) : [ $beg, $end, $score ]; - } else { - return wantarray ? () : []; + unlink $file_hash->{ $chr }; + unlink $bed_file; } } @@ -2362,7 +2272,7 @@ sub script_format_genome while ( $record = get_record( $in ) ) { - if ( $fh_out and $entry = record2fasta( $record ) ) + if ( $fh_out and $entry = Maasha::Fasta::biopiece2fasta( $record ) ) { Maasha::Fasta::put_entry( $entry, $fh_out, $options->{ "wrap" } ); } @@ -2747,102 +2657,42 @@ sub script_calc_fixedstep # Returns nothing. - my ( $record, %fh_hash, $fh_in, $fh_out, $chr, $chr, $beg, $end, $q_id, $block, $entry, $clones, $beg_block, $max, $i ); + my ( $bed_file, $fh_in, $fh_out, $cols, $record, $file_hash, $chr, $bed_entry, $fixedstep_file, $fixedstep_entry ); + + $bed_file = "$BP_TMP/calc_fixedstep.bed"; + $fh_out = Maasha::Filesys::file_write_open( $bed_file ); + $cols = 5; # we only need the first 5 BED columns while ( $record = get_record( $in ) ) { - $record->{ "CHR" } = $record->{ "S_ID" } if not defined $record->{ "CHR" }; - $record->{ "CHR_BEG" } = $record->{ "S_BEG" } if not defined $record->{ "CHR_BEG" }; - $record->{ "CHR_END" } = $record->{ "S_END" } if not defined $record->{ "CHR_END" }; - - if ( $record->{ "CHR" } and defined $record->{ "CHR_BEG" } and $record->{ "CHR_END" } ) - { - $fh_hash{ $record->{ "CHR" } } = Maasha::Common::write_open( "$BP_TMP/$record->{ 'CHR' }" ) if not exists $fh_hash{ $record->{ "CHR" } }; - - $fh_out = $fh_hash{ $record->{ "CHR" } }; - - Maasha::UCSC::bed_put_entry( $record, $fh_out, 5 ); + if ( $bed_entry = Maasha::UCSC::BED::biopiece2bed( $record, $cols ) ) { + Maasha::UCSC::BED::bed_entry_put( $bed_entry, $fh_out, $cols ); } } - map { close $_ } keys %fh_hash; + close $fh_out; - foreach $chr ( sort keys %fh_hash ) - { - #Maasha::Common::run( "bedSort", "$BP_TMP/$chr $BP_TMP/$chr" ); - Maasha::Common::run( "bed_sort", "--sort 3 --cols 5 $BP_TMP/$chr > $BP_TMP/$chr.sort" ); + $file_hash = Maasha::UCSC::BED::bed_file_split_on_chr( $bed_file ); - rename "$BP_TMP/$chr.sort", "$BP_TMP/$chr"; + unlink $bed_file; - $fh_in = Maasha::Common::read_open( "$BP_TMP/$chr" ); + foreach $chr ( sort keys %{ $file_hash } ) + { + $fixedstep_file = Maasha::UCSC::Wiggle::fixedstep_calc( $file_hash->{ $chr }, $chr, $options->{ 'score' } ); - undef $block; + $fh_in = Maasha::Filesys::file_read_open( $fixedstep_file ); - while ( $entry = Maasha::UCSC::bed_get_entry( $fh_in, 5 ) ) + while ( $fixedstep_entry = Maasha::UCSC::Wiggle::fixedstep_entry_get( $fh_in ) ) { - $chr = $entry->{ 'CHR' }; - $beg = $entry->{ 'CHR_BEG' }; - $end = $entry->{ 'CHR_END' }; - $q_id = $entry->{ 'Q_ID' }; - - if ( $options->{ "score" } ) { - $clones = $entry->{ 'SCORE' }; - } elsif ( $q_id =~ /_(\d+)$/ ) { - $clones = $1; - } else { - $clones = 1; - } - - if ( $block ) - { - if ( $beg > $max ) - { - map { $_ = sprintf( "%.4f", Maasha::Calc::log10( $_ ) ) } @{ $block } if $options->{ "log10" }; - - $record->{ "CHR" } = $chr; - $record->{ "CHR_BEG" } = $beg_block; - $record->{ "STEP" } = 1; - $record->{ "VALS" } = join ";", @{ $block }; - $record->{ "REC_TYPE" } = "fixed_step"; - - put_record( $record, $out ); - - undef $block; - } - else - { - for ( $i = $beg - $beg_block; $i < ( $beg - $beg_block ) + ( $end - $beg ); $i++ ) { - $block->[ $i ] += $clones; - } - - $max = Maasha::Calc::max( $max, $end ); - } - } - - if ( not $block ) - { - $beg_block = $beg; - $max = $end; - - for ( $i = 0; $i < ( $end - $beg ); $i++ ) { - $block->[ $i ] += $clones; - } + if ( $record = Maasha::UCSC::Wiggle::fixedstep2biopiece( $fixedstep_entry ) ) { + put_record( $record, $out ); } } close $fh_in; - map { $_ = sprintf( "%.4f", Maasha::Calc::log10( $_ ) ) } @{ $block } if $options->{ "log10" }; - - $record->{ "CHR" } = $chr; - $record->{ "CHR_BEG" } = $beg_block; - $record->{ "STEP" } = 1; - $record->{ "VALS" } = join ";", @{ $block }; - $record->{ "REC_TYPE" } = "fixed_step"; - - put_record( $record, $out ); - - unlink "$BP_TMP/$chr"; + unlink $file_hash->{ $chr }; + unlink $bed_file; } } @@ -3778,7 +3628,7 @@ sub script_create_blast_db { put_record( $record, $out ) if not $options->{ "no_stream" }; - if ( $entry = record2fasta( $record ) ) + if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) { $seq_type = Maasha::Seq::seq_guess_type( $entry->[ SEQ ] ) if not $seq_type; @@ -3827,7 +3677,7 @@ sub script_blast_seq while ( $record = get_record( $in ) ) { - if ( $entry = record2fasta( $record ) ) + if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) { $q_type = Maasha::Seq::seq_guess_type( $entry->[ SEQ ] ) if not $q_type; @@ -3978,7 +3828,7 @@ sub script_blat_seq while ( $record = get_record( $in ) ) { - if ( $entry = record2fasta( $record ) ) + if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) { $type = Maasha::Seq::seq_guess_type( $entry->[ SEQ ] ) if not $type; Maasha::Fasta::put_entry( $entry, $fh_out, 80 ); @@ -4039,7 +3889,7 @@ sub script_soap_seq while ( $record = get_record( $in ) ) { - if ( $entry = record2fasta( $record ) ) + if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) { Maasha::Fasta::put_entry( $entry, $fh_out ); @@ -4163,7 +4013,7 @@ sub script_create_vmatch_index while ( $record = get_record( $in ) ) { - if ( $options->{ "index_name" } and $entry = record2fasta( $record ) ) + if ( $options->{ "index_name" } and $entry = Maasha::Fasta::biopiece2fasta( $record ) ) { Maasha::Fasta::put_entry( $entry, $fh_tmp ); @@ -4260,7 +4110,7 @@ sub script_write_fasta while ( $record = get_record( $in ) ) { - if ( $entry = record2fasta( $record ) ) { + if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) { Maasha::Fasta::put_entry( $entry, $fh, $options->{ "wrap" } ); } @@ -4446,98 +4296,21 @@ sub script_write_bed # Returns nothing. - my ( $cols, $fh, $record, $new_record ); + my ( $cols, $fh, $record, $bed_entry, $new_record ); $cols = $options->{ 'cols' }->[ 0 ]; - $fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } ); + $fh = write_stream( $options->{ 'data_out' }, $options->{ 'compress' } ); while ( $record = get_record( $in ) ) { - if ( $record->{ "REC_TYPE" } eq "BED" ) # ---- Hits from BED ---- - { - $cols ||= $record->{ "BED_COLS" }; - - Maasha::UCSC::bed_put_entry( $record, $fh, $cols ); - } - elsif ( $record->{ "REC_TYPE" } eq "PSL" and $record->{ "S_ID" } =~ /^chr/i ) # ---- Hits from BLAT (PSL) ---- - { - $new_record->{ "CHR" } = $record->{ "S_ID" }; - $new_record->{ "CHR_BEG" } = $record->{ "S_BEG" }; - $new_record->{ "CHR_END" } = $record->{ "S_END" }; - $new_record->{ "Q_ID" } = $record->{ "Q_ID" }; - $new_record->{ "SCORE" } = $record->{ "SCORE" } || 999; - $new_record->{ "STRAND" } = $record->{ "STRAND" }; + $record = Maasha::UCSC::psl2record( $record ) if $record->{ 'tBaseInsert' }; # Dirty addition to allow Affy data from MySQL to be dumped - $cols ||= 6; - - Maasha::UCSC::bed_put_entry( $new_record, $fh, $cols ); - } - elsif ( $record->{ "REC_TYPE" } eq "PATSCAN" and $record->{ "CHR" } ) # ---- Hits from patscan_seq ---- - { - $cols ||= 6; - - Maasha::UCSC::bed_put_entry( $record, $fh, $cols ); + if ( $bed_entry = Maasha::UCSC::BED::biopiece2bed( $record, $cols ) ) { + Maasha::UCSC::BED::bed_entry_put( $bed_entry, $fh, $cols ); } - elsif ( $record->{ "REC_TYPE" } eq "BLAST" and $record->{ "S_ID" } =~ /^chr/i ) # ---- Hits from BLAST ---- - { - $new_record->{ "CHR" } = $record->{ "S_ID" }; - $new_record->{ "CHR_BEG" } = $record->{ "S_BEG" }; - $new_record->{ "CHR_END" } = $record->{ "S_END" }; - $new_record->{ "Q_ID" } = $record->{ "Q_ID" }; - $new_record->{ "SCORE" } = $record->{ "SCORE" } || 999; # or use E_VAL somehow - $new_record->{ "STRAND" } = $record->{ "STRAND" }; - - $cols ||= 6; - Maasha::UCSC::bed_put_entry( $new_record, $fh, $cols ); - } - elsif ( $record->{ "REC_TYPE" } eq "VMATCH" and $record->{ "S_ID" } =~ /^chr/i ) # ---- Hits from Vmatch ---- - { - $new_record->{ "CHR" } = $record->{ "S_ID" }; - $new_record->{ "CHR_BEG" } = $record->{ "S_BEG" }; - $new_record->{ "CHR_END" } = $record->{ "S_END" }; - $new_record->{ "Q_ID" } = $record->{ "Q_ID" }; - $new_record->{ "SCORE" } = $record->{ "SCORE" } || 999; # or use E_VAL somehow - $new_record->{ "STRAND" } = $record->{ "STRAND" }; - - $cols ||= 6; - - Maasha::UCSC::bed_put_entry( $new_record, $fh, $cols ); - } - elsif ( $record->{ "REC_TYPE" } eq "SOAP" and $record->{ "S_ID" } =~ /^chr/i ) # ---- Hits from Soap ---- - { - $new_record->{ "CHR" } = $record->{ "S_ID" }; - $new_record->{ "CHR_BEG" } = $record->{ "S_BEG" }; - $new_record->{ "CHR_END" } = $record->{ "S_END" }; - $new_record->{ "Q_ID" } = $record->{ "Q_ID" }; - $new_record->{ "SCORE" } = $record->{ "SCORE" } || 999; - $new_record->{ "STRAND" } = $record->{ "STRAND" }; - - $cols ||= 6; - - Maasha::UCSC::bed_put_entry( $new_record, $fh, $cols ); - } - elsif ( $record->{ 'tBaseInsert' } ) # ---- Dirty addition to allow Affy data from MySQL to be dumped ---- - { - $record = Maasha::UCSC::psl2record( $record ); - - $new_record = $record; - $new_record->{ "CHR" } = $record->{ "S_ID" }; - $new_record->{ "CHR_BEG" } = $record->{ "S_BEG" }; - $new_record->{ "CHR_END" } = $record->{ "S_END" }; - $new_record->{ "Q_ID" } = $record->{ "Q_ID" }; - $new_record->{ "SCORE" } = $record->{ "SCORE" } || 999; - $new_record->{ "STRAND" } = $record->{ "STRAND" }; - - Maasha::UCSC::bed_put_entry( $new_record, $fh, $cols ); - } - elsif ( $record->{ "CHR" } and defined $record->{ "CHR_BEG" } and $record->{ "CHR_END" } ) # ---- Generic data from tables ---- - { - Maasha::UCSC::bed_put_entry( $record, $fh, $cols ); - } - - put_record( $record, $out ) if not $options->{ "no_stream" }; + put_record( $record, $out ) if not $options->{ 'no_stream' }; } close $fh; @@ -4592,21 +4365,14 @@ sub script_write_fixedstep # Returns nothing. - my ( $fh, $record, $vals ); + my ( $fh, $record, $entry ); $fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } ); while ( $record = get_record( $in ) ) { - if ( $record->{ "CHR" } and $record->{ "CHR_BEG" } and $record->{ "STEP" } and $record->{ "VALS" } ) - { - print $fh "fixedStep chrom=$record->{ 'CHR' } start=$record->{ 'CHR_BEG' } step=$record->{ 'STEP' }\n"; - - $vals = $record->{ 'VALS' }; - - $vals =~ tr/;/\n/; - - print $fh "$vals\n"; + if ( $entry = Maasha::UCSC::Wiggle::biopiece2fixedstep( $record ) ) { + map { print $fh "$_\n" } @{ $entry }; } put_record( $record, $out ) if not $options->{ "no_stream" }; @@ -4640,7 +4406,7 @@ sub script_write_2bit while ( $record = get_record( $in ) ) { - if ( $entry = record2fasta( $record ) ) { + if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) { Maasha::Fasta::put_entry( $entry, $fh_tmp ); } @@ -4679,7 +4445,7 @@ sub script_write_solid while ( $record = get_record( $in ) ) { - if ( $entry = record2fasta( $record ) ) + if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) { $entry->[ SEQ ] = Maasha::Solid::seq2color_space( uc $entry->[ SEQ ] ); @@ -4713,7 +4479,7 @@ sub script_write_ucsc_config while ( $record = get_record( $in ) ) { - Maasha::UCSC::ucsc_config_put_entry( $record, $fh ) if $record->{ "REC_TYPE" } eq "UCSC Config"; + Maasha::UCSC::ucsc_config_entry_put( $record, $fh ) if $record->{ "REC_TYPE" } eq "UCSC Config"; put_record( $record, $out ) if not $options->{ "no_stream" }; } @@ -5130,7 +4896,7 @@ sub script_remove_ucsc_tracks $fh_in = Maasha::Common::read_open( $options->{ 'config_file' } ); - while ( $track = Maasha::UCSC::ucsc_config_get_entry( $fh_in ) ) { + while ( $track = Maasha::UCSC::ucsc_config_entry_get( $fh_in ) ) { push @tracks, $track; } @@ -5144,7 +4910,7 @@ sub script_remove_ucsc_tracks $fh_out = Maasha::Common::write_open( $options->{ 'config_file' } ); - map { Maasha::UCSC::ucsc_config_put_entry( $_, $fh_out ) } @new_tracks; + map { Maasha::UCSC::ucsc_config_entry_put( $_, $fh_out ) } @new_tracks; close $fh_out; @@ -5564,7 +5330,7 @@ sub script_compute } else { - warn qq(WARNING: Bad compute expression: "$options->{ 'eval' }"\n); + Maasha::Common::error( qq(Bad compute expression: "$options->{ 'eval' }"\n) ); } } @@ -6362,7 +6128,7 @@ sub script_upload_to_ucsc # Returns nothing. - my ( $record, $file, $wib_file, $wig_file, $wib_dir, $fh_out, $i, $first, $format, $type, $columns, $append, $vals ); + my ( $record, $file, $wib_file, $wig_file, $wib_dir, $fh_out, $i, $first, $format, $type, $columns, $append, $entry ); $options->{ "short_label" } ||= $options->{ 'table' }; $options->{ "long_label" } ||= $options->{ 'table' }; @@ -6372,13 +6138,10 @@ sub script_upload_to_ucsc $options->{ "color" } ||= join( ",", int( rand( 255 ) ), int( rand( 255 ) ), int( rand( 255 ) ) ); $options->{ "chunk_size" } ||= 10_000_000_000; # Due to 32-bit UCSC compilation really large tables cannot be loaded in one go. - $file = "$BP_TMP/ucsc_upload.tmp"; - + $file = "$BP_TMP/ucsc_upload.tmp"; $append = 0; - - $first = 1; - - $i = 0; + $first = 1; + $i = 0; $fh_out = Maasha::Common::write_open( $file ); @@ -6388,27 +6151,27 @@ sub script_upload_to_ucsc if ( $record->{ "REC_TYPE" } eq "fixed_step" ) { - $vals = $record->{ "VALS" }; - $vals =~ tr/;/\n/; - - print $fh_out "fixedStep chrom=$record->{ 'CHR' } start=$record->{ 'CHR_BEG' } step=$record->{ 'STEP' }\n"; - print $fh_out "$vals\n"; + $format = "WIGGLE"; - $format = "WIGGLE" if not $format; + if ( $entry = Maasha::UCSC::Wiggle::biopiece2fixedstep( $record ) ) { + Maasha::UCSC::Wiggle::fixedstep_entry_put( $entry, $fh_out ); + } } elsif ( $record->{ "REC_TYPE" } eq "PSL" ) { + $format = "PSL"; + Maasha::UCSC::psl_put_header( $fh_out ) if $first; Maasha::UCSC::psl_put_entry( $record, $fh_out ); $first = 0; - - $format = "PSL" if not $format; } elsif ( $record->{ "REC_TYPE" } eq "BED" and $record->{ "SEC_STRUCT" } ) { # chrom chromStart chromEnd name score strand size secStr conf + $format = "BED_SS"; + print $fh_out join ( "\t", $record->{ "CHR" }, $record->{ "CHR_BEG" }, @@ -6420,47 +6183,44 @@ sub script_upload_to_ucsc $record->{ "SEC_STRUCT" }, $record->{ "CONF" }, ), "\n"; - - $format = "BED_SS" if not $format; } elsif ( $record->{ "REC_TYPE" } eq "BED" ) { - Maasha::UCSC::bed_put_entry( $record, $fh_out, $record->{ "BED_COLS" } ); + $format = "BED"; + $columns = $record->{ "BED_COLS" }; - $format = "BED" if not $format; - $columns = $record->{ "BED_COLS" } if not $columns; + if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) { + Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns ); + } } elsif ( $record->{ "REC_TYPE" } eq "PATSCAN" and $record->{ "CHR" } ) { - Maasha::UCSC::bed_put_entry( $record, $fh_out, 6 ); + $format = "BED"; + $columns = 6; - $format = "BED" if not $format; - $columns = 6 if not $columns; + if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) { + Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns ); + } } elsif ( $record->{ "REC_TYPE" } eq "BLAST" and $record->{ "S_ID" } =~ /^chr/ ) { - $record->{ "CHR" } = $record->{ "S_ID" }; - $record->{ "CHR_BEG" } = $record->{ "S_BEG" }; - $record->{ "CHR_END" } = $record->{ "S_END" }; - $record->{ "SCORE" } = $record->{ "BIT_SCORE" } * 1000; + $format = "BED"; + $columns = 6; - $format = "BED" if not $format; - $columns = 6 if not $columns; + $record->{ "SCORE" } = $record->{ "BIT_SCORE" } * 1000; - Maasha::UCSC::bed_put_entry( $record, $fh_out ); + if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) { + Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns ); + } } elsif ( $record->{ "REC_TYPE" } eq "VMATCH" and $record->{ "S_ID" } =~ /^chr/i ) { - $record->{ "CHR" } = $record->{ "S_ID" }; - $record->{ "CHR_BEG" } = $record->{ "S_BEG" }; - $record->{ "CHR_END" } = $record->{ "S_END" }; - $record->{ "SCORE" } = $record->{ "SCORE" } || 999; - $record->{ "SCORE" } = int( $record->{ "SCORE" } ); + $format = "BED"; + $columns = 6; - $format = "BED" if not $format; - $columns = 6 if not $columns; - - Maasha::UCSC::bed_put_entry( $record, $fh_out, 6 ); + if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) { + Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns ); + } } if ( $i == $options->{ "chunk_size" } ) @@ -6497,9 +6257,7 @@ sub script_upload_to_ucsc } elsif ( $format eq "BED_SS" ) { - $options->{ "sec_struct" } = 1; - - $type = "sec_struct"; + $type = "type bed 6 +"; Maasha::UCSC::bed_upload_to_ucsc( $BP_TMP, $file, $options, $append ); } @@ -6539,7 +6297,7 @@ sub script_upload_to_ucsc unlink $file; - Maasha::UCSC::update_my_tracks( $options, $type ); + Maasha::UCSC::ucsc_update_config( $options, $type ); } } @@ -6690,32 +6448,6 @@ sub grab_eval } -sub record2fasta -{ - # Martin A. Hansen, July 2008. - - # Given a biopiece record converts it to a FASTA record. - # If no generic SEQ or SEQ_NAME is found, the Q_* and S_* are - # tried in that order. - - my ( $record, # record - ) = @_; - - # Returns a tuple. - - my ( $seq_name, $seq ); - - $seq_name = $record->{ "SEQ_NAME" } || $record->{ "Q_ID" } || $record->{ "S_ID" }; - $seq = $record->{ "SEQ" } || $record->{ "Q_SEQ" } || $record->{ "S_SEQ" }; - - if ( defined $seq_name and defined $seq ) { - return wantarray ? ( $seq_name, $seq ) : [ $seq_name, $seq ]; - } else { - return; - } -} - - sub read_stream { # Martin A. Hansen, July 2007. diff --git a/code_perl/Maasha/Fasta.pm b/code_perl/Maasha/Fasta.pm index cf399cf..c4920a8 100644 --- a/code_perl/Maasha/Fasta.pm +++ b/code_perl/Maasha/Fasta.pm @@ -488,4 +488,30 @@ sub index_retrieve } +sub biopiece2fasta +{ + # Martin A. Hansen, July 2008. + + # Given a biopiece record converts it to a FASTA record. + # If no generic SEQ or SEQ_NAME is found, the Q_* and S_* are + # tried in that order. + + my ( $record, # record + ) = @_; + + # Returns a tuple. + + my ( $seq_name, $seq ); + + $seq_name = $record->{ "SEQ_NAME" } || $record->{ "Q_ID" } || $record->{ "S_ID" }; + $seq = $record->{ "SEQ" } || $record->{ "Q_SEQ" } || $record->{ "S_SEQ" }; + + if ( defined $seq_name and defined $seq ) { + return wantarray ? ( $seq_name, $seq ) : [ $seq_name, $seq ]; + } else { + return; + } +} + + # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< diff --git a/code_perl/Maasha/Filesys.pm b/code_perl/Maasha/Filesys.pm new file mode 100644 index 0000000..229c7c3 --- /dev/null +++ b/code_perl/Maasha/Filesys.pm @@ -0,0 +1,190 @@ +package Maasha::Filesys; + + +# Copyright (C) 2006-2008 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +# This module contains routines for manipulation of files and directories. + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +use strict; +use IO::File; +use Maasha::Common; + +use Exporter; + +use vars qw( @ISA @EXPORT @EXPORT_OK ); + +@ISA = qw( Exporter ) ; + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> FILES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +sub file_read_open +{ + # Martin A. Hansen, January 2004. + + # Read opens a file that may be gzipped and returns a filehandle. + + my ( $path, # full path to file + ) = @_; + + # Returns filehandle + + my ( $fh ); + + if ( is_gzipped( $path ) ) { + $fh = new IO::File "zcat $path|" or Maasha::Common::error( qq(Could not read-open file "$path": $!) ); + } else { + $fh = new IO::File $path, "r" or Maasha::Common::error( qq(Could not read-open file "$path": $!) ); + } + + return $fh; +} + + +sub file_write_open +{ + # Martin A. Hansen, January 2004. + + # write opens a file and returns a filehandle + + my ( $path, # full path to file + $gzip, # flag if data is to be gzipped - OPRIONAL + ) = @_; + + # returns filehandle + + my ( $fh ); + + if ( $gzip ) { + $fh = new IO::File "|gzip -f>$path" or Maasha::Common::error( qq(Could not write-open file "$path": $!) ); + } else { + $fh = new IO::File $path, "w" or Maasha::Common::error( qq(Could not write-open file "$path": $!) ); + } + + return $fh; +} + + +sub file_append_open +{ + # Martin A. Hansen, February 2006. + + # append opens file and returns a filehandle + + my ( $path, # path to file + ) = @_; + + # returns filehandle + + my ( $fh ); + + $fh = new IO::File $path, "a" or Maasha::Common::error( qq(Could not append-open file "$path": $!) ); + + return $fh; +} + + +sub file_copy +{ + # Martin A. Hansen, November 2008. + + # Copy the content of a file from source path to + # destination path. + + my ( $src, # source path + $dst, # destination path + ) = @_; + + # Returns nothing. + + my ( $fh_in, $fh_out, $line ); + + Maasha::Common::error( qq(copy failed: destination equals source "$src") ) if $src eq $dst; + + $fh_in = file_read_open( $src ); + $fh_out = file_write_open( $dst ); + + while ( $line = <$fh_in> ) { + print $fh_out $line; + } + + close $fh_in; + close $fh_out; +} + + +sub is_gzipped +{ + # Martin A. Hansen, November 2008. + + # Checks if a given file is gzipped. + # Currrently uses a call to the systems + # file tool. Returns 1 if gzipped otherwise + # returns 0. + + my ( $path, # path to file + ) = @_; + + # Returns boolean. + + my ( $type ); + + $type = `file $path`; + + if ( $type =~ /gzip compressed/ ) { + return 1; + } else { + return 0; + } +} + + +sub file_size +{ + # Martin A. Hansen, March 2007 + + # returns the file size for a given file + + my ( $path, # full path to file + ) = @_; + + # returns integer + + my $file_size = ( stat ( $path ) )[ 7 ]; + + return $file_size; +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +1; + + +__END__ diff --git a/code_perl/Maasha/UCSC.pm b/code_perl/Maasha/UCSC.pm index fcd08ff..73558a9 100644 --- a/code_perl/Maasha/UCSC.pm +++ b/code_perl/Maasha/UCSC.pm @@ -32,9 +32,8 @@ use strict; use vars qw ( @ISA @EXPORT ); use Data::Dumper; -use Time::HiRes qw( gettimeofday ); - use Maasha::Common; +use Maasha::Filesys; use Maasha::Calc; use Maasha::Matrix; @@ -61,8 +60,6 @@ use constant { @ISA = qw( Exporter ); -my $TIME = gettimeofday(); - # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> BED format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< @@ -688,50 +685,6 @@ CREATE TABLE $table ( # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PSL format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -sub psl2record -{ - # Martin A. Hansen, November 2008 - - # Converts PSL record keys to Biopiece record keys. - - my ( $psl, # hashref - ) = @_; - - # returns hashref - - my %record; - - %record = ( - REC_TYPE => "PSL", - BLOCKSIZES => $psl->{ 'blockSize' }, - SNUMINSERT => $psl->{ 'tNumInsert' }, - Q_END => $psl->{ 'qEnd' }, - SBASEINSERT => $psl->{ 'tBaseInsert' }, - S_END => $psl->{ 'tEnd' }, - QBASEINSERT => $psl->{ 'qBaseInsert' }, - REPMATCHES => $psl->{ 'repMatches' }, - QNUMINSERT => $psl->{ 'qNumInsert' }, - MISMATCHES => $psl->{ 'misMatches' }, - BLOCKCOUNT => $psl->{ 'blockCount' }, - Q_LEN => $psl->{ 'qSize' }, - S_ID => $psl->{ 'tName' }, - STRAND => $psl->{ 'strand' }, - Q_ID => $psl->{ 'qName' }, - MATCHES => $psl->{ 'matches' }, - S_LEN => $psl->{ 'tSize' }, - NCOUNT => $psl->{ 'nCount' }, - Q_BEGS => $psl->{ 'qStarts' }, - S_BEGS => $psl->{ 'tStarts' }, - S_BEG => $psl->{ 'tStart' }, - Q_BEG => $psl->{ 'qStart ' }, - ); - - $record{ "SCORE" } = $record{ "MATCHES" } + int( $record{ "REPMATCHES" } / 2 ) - $record{ "MISMATCHES" } - $record{ "QNUMINSERT" } - $record{ "SNUMINSERT" }; - $record{ "SPAN" } = $record{ "Q_END" } - $record{ "Q_BEG" }; - - return wantarray ? %record : \%record; -} - sub psl_get_entry { @@ -943,7 +896,7 @@ sub psl_upload_to_ucsc # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> TRACK FILE <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -sub ucsc_config_get_entry +sub ucsc_config_entry_get { # Martin A. Hansen, November 2008. @@ -987,7 +940,7 @@ sub ucsc_config_get_entry } -sub ucsc_config_put_entry +sub ucsc_config_entry_put { # Martin A. Hansen, November 2008. @@ -1028,11 +981,12 @@ sub ucsc_config_put_entry visibility maxHeightPixels color + mafTrack type ); } -sub update_my_tracks +sub ucsc_update_config { # Martin A. Hansen, September 2007. @@ -1044,70 +998,32 @@ sub update_my_tracks # Returns nothing. - my ( $file, $fh_in, $fh_out, $line, $time ); + my ( $file, %entry, $fh_out ); $file = $ENV{ "HOME" } . "/ucsc/my_tracks.ra"; - # ---- create a backup ---- - - $fh_in = Maasha::Common::read_open( $file ); - $fh_out = Maasha::Common::write_open( "$file~" ); - - while ( $line = <$fh_in> ) { - print $fh_out $line; - } - - close $fh_in; - close $fh_out; - - # ---- append track ---- - - $time = Maasha::Common::time_stamp(); - - $fh_out = Maasha::Common::append_open( $file ); - - if ( $type eq "sec_struct" ) - { - print $fh_out "\n\n# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"; - - print $fh_out "\n# Track added by 'upload_to_ucsc' $time\n\n"; - - print $fh_out "# Database $options->{ 'database' }\n\n"; - - print $fh_out "track $options->{ 'table' }\n"; - print $fh_out "shortLabel $options->{ 'short_label' }\n"; - print $fh_out "longLabel $options->{ 'long_label' }\n"; - print $fh_out "group $options->{ 'group' }\n"; - print $fh_out "priority $options->{ 'priority' }\n"; - print $fh_out "visibility $options->{ 'visibility' }\n"; - print $fh_out "color $options->{ 'color' }\n"; - print $fh_out "type bed 6 +\n"; - print $fh_out "mafTrack multiz17way\n"; - - print $fh_out "\n# //\n"; - } - else - { - print $fh_out "\n\n# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"; - - print $fh_out "\n# Track added by 'upload_to_ucsc' $time\n\n"; - - print $fh_out "# Database $options->{ 'database' }\n\n"; + Maasha::Filesys::file_copy( $file, "$file~" ); # create a backup + + %entry = ( + database => $options->{ 'database' }, + track => $options->{ 'table' }, + shortLabel => $options->{ 'short_label' }, + longLabel => $options->{ 'long_label' }, + group => $options->{ 'group' }, + priority => $options->{ 'priority' }, + useScore => $options->{ 'use_score' }, + visibility => $options->{ 'visibility' }, + color => $options->{ 'color' }, + type => $type, + ); - print $fh_out "track $options->{ 'table' }\n"; - print $fh_out "shortLabel $options->{ 'short_label' }\n"; - print $fh_out "longLabel $options->{ 'long_label' }\n"; - print $fh_out "group $options->{ 'group' }\n"; - print $fh_out "priority $options->{ 'priority' }\n"; - print $fh_out "useScore 1\n" if $options->{ 'use_score' }; - print $fh_out "visibility $options->{ 'visibility' }\n"; - print $fh_out "maxHeightPixels 50:50:11\n" if $type eq "wig 0"; - print $fh_out "color $options->{ 'color' }\n"; - print $fh_out "type $type\n"; + $entry{ 'mafTrack' } = "multiz17way" if $type eq "type bed 6 +"; + $entry{ 'maxHeightPixels' } = "50:50:11" if $type eq "wig 0"; - print $fh_out "\n# //\n"; - } + $fh_out = Maasha::Filesys::file_append_open( $file ); + ucsc_config_entry_put( \%entry, $fh_out ); + close $fh_out; Maasha::Common::run( "ucscMakeTracks.pl", "-b > /dev/null 2>&1" ); diff --git a/code_perl/Maasha/UCSC/BED.pm b/code_perl/Maasha/UCSC/BED.pm new file mode 100644 index 0000000..f2a11e8 --- /dev/null +++ b/code_perl/Maasha/UCSC/BED.pm @@ -0,0 +1,686 @@ +package Maasha::UCSC::BED; + +# Copyright (C) 2007-2008 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +# Routines for interaction with Browser Extensible DATA (BED) entries and files. + +# Read about the BED format here: http://genome.ucsc.edu/FAQ/FAQformat#format1 + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +use strict; + +use Data::Dumper; +use Maasha::Common; +use Maasha::Filesys; +use Maasha::Calc; + +use vars qw( @ISA @EXPORT_OK ); + +require Exporter; + +@ISA = qw( Exporter ); + +@EXPORT_OK = qw( +); + +use constant { + chrom => 0, # BED field names + chromStart => 1, + chromEnd => 2, + name => 3, + score => 4, + strand => 5, + thickStart => 6, + thickEnd => 7, + itemRgb => 8, + blockCount => 9, + blockSizes => 10, + blockStarts => 11, + CHR => 0, # Biopieces field names + CHR_BEG => 1, + CHR_END => 2, + Q_ID => 3, + SCORE => 4, + STRAND => 5, + THICK_BEG => 6, + THICK_END => 7, + COLOR => 8, + BLOCK_COUNT => 9, + BLOCK_LENS => 10, + Q_BEGS => 11, +}; + + +my $CHECK_ALL = 1; # Global flag indicating that BED input and output is checked thoroughly. + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> BED format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +# Hash for converting BED keys to Biopieces keys. + +my %BED2BIOPIECES = ( + chrom => "CHR", + chromStart => "CHR_BEG", + chromEnd => "CHR_END", + name => "Q_ID", + score => "SCORE", + strand => "STRAND", + thickStart => "THICK_BEG", + thickEnd => "THICK_END", + itemRgb => "COLOR", + blockCount => "BLOCK_COUNT", + blockSizes => "BLOCK_LENS", + blockStarts => "Q_BEGS", +); + + +# Hash for converting biopieces keys to BED keys. + +my %BIOPIECES2BED = ( + CHR => "chrom", + CHR_BEG => "chromStart", + CHR_END => "chromEnd", + Q_ID => "name", + SCORE => "score", + STRAND => "strand", + THICK_BEG => "thickStart", + THICK_END => "thickEnd", + COLOR => "itemRgb", + BLOCK_COUNT => "blockCount", + BLOCK_LENS => "blockSizes", + Q_BEGS => "blockStarts", +); + + +sub bed_entry_get +{ + # Martin A. Hansen, September 2008. + + # Reads a BED entry given a filehandle. + + my ( $fh, # file handle + $cols, # columns to read - OPTIONAL (3,4,5,6,8,9 or 12) + ) = @_; + + # Returns a list. + + my ( $line, @entry ); + + $line = <$fh>; + + $line =~ tr/\n\r//d; # some people have carriage returns in their BED files -> Grrrr + + return if not defined $line; + + if ( not defined $cols ) { + $cols = 1 + $line =~ tr/\t//; + } + + @entry = split "\t", $line, $cols + 1; + + pop @entry if scalar @entry > $cols; + + bed_entry_check( \@entry ) if $CHECK_ALL; + + return wantarray ? @entry : \@entry; +} + + +sub bed_entry_put +{ + # Martin A. Hansen, September 2008. + + # Writes a BED entry array to file. + + my ( $entry, # list + $fh, # file handle - OPTIONAL + $cols, # number of columns in BED file - OPTIONAL (3,4,5,6,8,9 or 12) + ) = @_; + + # Returns nothing. + + if ( $cols and $cols < scalar @{ $entry } ) { + @{ $entry } = @{ $entry }[ 0 .. $cols - 1 ]; + } + + bed_entry_check( $entry ) if $CHECK_ALL; + + $fh = \*STDOUT if not defined $fh; + + print $fh join( "\t", @{ $entry } ), "\n"; +} + + + +sub bed_entry_check +{ + # Martin A. Hansen, November 2008. + + # Checks a BED entry for integrity and + # raises an error if there is a problem. + + my ( $bed, # array ref + ) = @_; + + # Returns nothing. + + my ( $cols, @block_sizes, @block_starts ); + + $cols = scalar @{ $bed }; + + if ( $cols < 3 ) { + Maasha::Common::error( qq(Bad BED entry - must contain at least 3 columns - not $cols) ); + } + + if ( $cols > 12 ) { + Maasha::Common::error( qq(Bad BED entry - must contains no more than 12 columns - not $cols) ); + } + + if ( $bed->[ chrom ] =~ /\s/ ) { + Maasha::Common::error( qq(Bad BED entry - no white space allowed in chrom field: "$bed->[ chrom ]") ); + } + + if ( $bed->[ chromStart ] =~ /\D/ ) { + Maasha::Common::error( qq(Bad BED entry - chromStart must be a whole number - not "$bed->[ chromStart ]") ); + } + + if ( $bed->[ chromEnd ] =~ /\D/ ) { + Maasha::Common::error( qq(Bad BED entry - chromEnd must be a whole number - not "$bed->[ chromEnd ]") ); + } + + if ( $bed->[ chromEnd ] < $bed->[ chromStart ] ) { + Maasha::Common::error( qq(Bad BED entry - chromEnd must be greater than chromStart - not "$bed->[ chromStart ] > $bed->[ chromEnd ]") ); + } + + return if @{ $bed } == 3; + + if ( $bed->[ name ] =~ /\s/ ) { + Maasha::Common::error( qq(Bad BED entry - no white space allowed in name field: "$bed->[ name ]") ); + } + + return if @{ $bed } == 4; + + if ( $bed->[ score ] =~ /\D/ ) { + Maasha::Common::error( qq(Bad BED entry - score must be a whole number - not "$bed->[ score ]") ); + } + + if ( $bed->[ score ] < 0 or $bed->[ score ] > 1000 ) { + Maasha::Common::error( qq(Bad BED entry - score must be between 0 and 1000 - not "$bed->[ score ]") ); + } + + return if @{ $bed } == 5; + + if ( $bed->[ strand ] ne '+' and $bed->[ strand ] ne '-' ) { + Maasha::Common::error( qq(Bad BED entry - strand must be + or - not "$bed->[ strand ]") ); + } + + return if @{ $bed } == 6; + + if ( $bed->[ thickStart ] =~ /\D/ ) { + Maasha::Common::error( qq(Bad BED entry - thickStart must be a whole number - not "$bed->[ thickStart ]") ); + } + + if ( $bed->[ thickEnd ] =~ /\D/ ) { + Maasha::Common::error( qq(Bad BED entry - thickEnd must be a whole number - not "$bed->[ thickEnd ]") ); + } + + if ( $bed->[ thickEnd ] < $bed->[ thickStart ] ) { + Maasha::Common::error( qq(Bad BED entry - thickEnd must be greater than thickStart - not "$bed->[ thickStart ] > $bed->[ thickEnd ]") ); + } + + if ( $bed->[ thickStart ] < $bed->[ chromStart ] ) { + Maasha::Common::error( qq(Bad BED entry - thickStart must be greater than chromStart - not "$bed->[ thickStart ] < $bed->[ chromStart ]") ); + } + + if ( $bed->[ thickStart ] > $bed->[ chromEnd ] ) { + Maasha::Common::error( qq(Bad BED entry - thickStart must be less than chromEnd - not "$bed->[ thickStart ] > $bed->[ chromEnd ]") ); + } + + if ( $bed->[ thickEnd ] < $bed->[ chromStart ] ) { + Maasha::Common::error( qq(Bad BED entry - thickEnd must be greater than chromStart - not "$bed->[ thickEnd ] < $bed->[ chromStart ]") ); + } + + if ( $bed->[ thickEnd ] > $bed->[ chromEnd ] ) { + Maasha::Common::error( qq(Bad BED entry - thickEnd must be less than chromEnd - not "$bed->[ thickEnd ] > $bed->[ chromEnd ]") ); + } + + return if @{ $bed } == 8; + + if ( $bed->[ itemRgb ] !~ /^(0|\d{1,3},\d{1,3},\d{1,3},?)$/ ) { + Maasha::Common::error( qq(Bad BED entry - itemRgb must be 0 or in the form of 255,0,0 - not "$bed->[ itemRgb ]") ); + } + + return if @{ $bed } == 9; + + if ( $bed->[ blockCount ] =~ /\D/ ) { + Maasha::Common::error( qq(Bad BED entry - blockCount must be a whole number - not "$bed->[ blockCount ]") ); + } + + @block_sizes = split ",", $bed->[ blockSizes ]; + + if ( grep /\D/, @block_sizes ) { + Maasha::Common::error( qq(Bad BED entry - blockSizes must be whole numbers - not "$bed->[ blockSizes ]") ); + } + + if ( $bed->[ blockCount ] != scalar @block_sizes ) { + Maasha::Common::error( qq(Bad BED entry - blockSizes "$bed->[ blockSizes ]" must match blockCount "$bed->[ blockCount ]") ); + } + + @block_starts = split ",", $bed->[ blockStarts ]; + + if ( grep /\D/, @block_starts ) { + Maasha::Common::error( qq(Bad BED entry - blockStarts must be whole numbers - not "$bed->[ blockStarts ]") ); + } + + if ( $bed->[ blockCount ] != scalar @block_starts ) { + Maasha::Common::error( qq(Bad BED entry - blockStarts "$bed->[ blockStarts ]" must match blockCount "$bed->[ blockCount ]") ); + } + + if ( $bed->[ chromStart ] + $block_starts[ -1 ] + $block_sizes[ -1 ] != $bed->[ chromEnd ] ) { + Maasha::Common::error( qq(Bad BED entry - chromStart + blockStarts[last] + blockSizes[last] must equal chromEnd: ) . + qq($bed->[ chromStart ] + $block_starts[ -1 ] + $block_sizes[ -1 ] != $bed->[ chromEnd ]) ); + } +} + + +sub bed_sort +{ + # Martin A. hansen, November 2008. + + # Sorts a given BED file according to a given + # sorting scheme: + # 1: chr AND chr_beg. + # 2: chr AND strand AND chr_beg. + # 3: chr_beg. + # 4: strand AND chr_beg. + + # Currently 'bed_sort' is used for sorting = a standalone c program + # that uses qsort (unstable sort). + + my ( $file_in, # path to BED file. + $scheme, # sort scheme + $cols, # number of columns in BED file + ) = @_; + + # Returns nothing. + + my ( $file_out ); + + $file_out = "$file_in.sort"; + + Maasha::Common::run( "bed_sort", "--sort $scheme --cols $cols $file_in > $file_out" ); + + if ( Maasha::Filesys::file_size( $file_in ) != Maasha::Filesys::file_size( $file_out ) ) { + Maasha::Common::error( qq(bed_sort of file "$file_in" failed) ); + } + + rename $file_out, $file_in; +} + + +sub bed_file_split_on_chr +{ + # Martin A. Hansen, November 2008. + + # Given a path to a BED file splits + # this file into one file per chromosome + # in a temporary directory. Returns a hash + # with chromosome name as key and the corresponding + # file as value. + + my ( $file, # path to BED file + $dir, # working directory + $cols, # number of BED columns to read - OPTIONAL + ) = @_; + + # Returns a hashref + + my ( $fh_in, $fh_out, $entry, %fh_hash, %file_hash ); + + $fh_in = Maasha::Filesys::file_read_open( $file ); + + while ( $entry = bed_entry_get( $fh_in, $cols ) ) + { + if ( not exists $file_hash{ $entry->[ chrom ] } ) + { + $fh_hash{ $entry->[ chrom ] } = Maasha::Filesys::file_write_open( "$dir/$entry->[ chrom ]" ); + $file_hash{ $entry->[ chrom ] } = "$dir/$entry->[ chrom ]"; + } + + $fh_out = $fh_hash{ $entry->[ chrom ] }; + + Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $cols ); + } + + map { close $_ } keys %fh_hash; + + return wantarray ? %file_hash : \%file_hash; +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> BIOPIECES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +sub bed2biopiece +{ + # Martin A. Hansen, November 2008. + + # Converts a BED entry given as an arrayref + # to a Biopiece record which is returned as + # a hashref. + + # IMPORTANT! The BED_END and THICK_END positions + # will be the exact position in contrary to the + # UCSC scheme. + + my ( $bed_entry, # BED entry as arrayref + ) = @_; + + # Returns a hashref + + my ( $cols, %bp_record ); + + $cols = scalar @{ $bed_entry }; + + if ( not defined $bed_entry->[ chrom ] and + not defined $bed_entry->[ chromStart ] and + not defined $bed_entry->[ chromEnd ] ) + { + return 0; + } + + $bp_record{ "REC_TYPE" } = "BED"; + $bp_record{ "BED_COLS" } = $cols; + $bp_record{ "CHR" } = $bed_entry->[ chrom ]; + $bp_record{ "CHR_BEG" } = $bed_entry->[ chromStart ]; + $bp_record{ "CHR_END" } = $bed_entry->[ chromEnd ] - 1; + $bp_record{ "BED_LEN" } = $bed_entry->[ chromEnd ] - $bed_entry->[ chromEnd ]; + + return wantarray ? %bp_record : \%bp_record if $cols == 3; + + $bp_record{ "Q_ID" } = $bed_entry->[ name ]; + + return wantarray ? %bp_record : \%bp_record if $cols == 4; + + $bp_record{ "SCORE" } = $bed_entry->[ score ]; + + return wantarray ? %bp_record : \%bp_record if $cols == 5; + + $bp_record{ "STRAND" } = $bed_entry->[ strand ]; + + return wantarray ? %bp_record : \%bp_record if $cols == 6; + + $bp_record{ "THICK_BEG" } = $bed_entry->[ thickStart ]; + $bp_record{ "THICK_END" } = $bed_entry->[ thickEnd ] - 1; + + return wantarray ? %bp_record : \%bp_record if $cols == 8; + + $bp_record{ "COLOR" } = $bed_entry->[ itemRgb ]; + + return wantarray ? %bp_record : \%bp_record if $cols == 9; + + $bp_record{ "BLOCK_COUNT" } = $bed_entry->[ blockCount ]; + $bp_record{ "BLOCK_LENS" } = $bed_entry->[ blockSizes ]; + $bp_record{ "Q_BEGS" } = $bed_entry->[ blockStarts ]; + + return wantarray ? %bp_record : \%bp_record; +} + + +sub biopiece2bed +{ + # Martin A. Hansen, November 2008. + + # Converts a Biopieces record given as a hashref + # to a BED record which is returned as + # an arrayref. As much as possible of the Biopiece + # record is converted and undef is returned if + # convertion failed. + + # IMPORTANT! The BED_END and THICK_END positions + # will be the inexact position used in the + # UCSC scheme. + + my ( $bp_record, # hashref + $cols, # number of columns in BED entry - OPTIONAL (but faster) + ) = @_; + + # Returns arrayref. + + my ( @bed_entry ); + + $cols ||= 12; # max number of columns possible + + $bed_entry[ chrom ] = $bp_record->{ "CHR" } || + $bp_record->{ "S_ID" } || + return undef; + + $bed_entry[ chromStart ] = $bp_record->{ "CHR_BEG" } || + $bp_record->{ "S_BEG" } || + return undef; + + $bed_entry[ chromEnd ] = $bp_record->{ "CHR_END" } || + $bp_record->{ "S_END" } || + return undef; + + $bed_entry[ chromEnd ]++; + + return wantarray ? @bed_entry : \@bed_entry if $cols == 3; + + $bed_entry[ name ] = $bp_record->{ "Q_ID" } || return wantarray ? @bed_entry : \@bed_entry; + + return wantarray ? @bed_entry : \@bed_entry if $cols == 4; + + if ( exists $bp_record->{ "SCORE" } ) { + $bed_entry[ score ] = $bp_record->{ "SCORE" }; + } else { + return wantarray ? @bed_entry : \@bed_entry; + } + + return wantarray ? @bed_entry : \@bed_entry if $cols == 5; + + if ( exists $bp_record->{ "STRAND" } ) { + $bed_entry[ strand ] = $bp_record->{ "STRAND" }; + } else { + return wantarray ? @bed_entry : \@bed_entry; + } + + return wantarray ? @bed_entry : \@bed_entry if $cols == 6; + + if ( defined $bp_record->{ "THICK_BEG" } and + defined $bp_record->{ "THICK_END" } ) + { + $bed_entry[ thickStart ] = $bp_record->{ "THICK_BEG" }; + $bed_entry[ thickEnd ] = $bp_record->{ "THICK_END" }; + } + + return wantarray ? @bed_entry : \@bed_entry if $cols == 8; + + if ( exists $bp_record->{ "COLOR" } ) { + $bed_entry[ itemRgb ] = $bp_record->{ "COLOR" }; + } else { + return wantarray ? @bed_entry : \@bed_entry; + } + + return wantarray ? @bed_entry : \@bed_entry if $cols == 9; + + if ( defined $bp_record->{ "BLOCK_COUNT" } and + defined $bp_record->{ "BLOCK_LENS" } and + defined $bp_record->{ "Q_BEGS" } ) + { + $bed_entry[ blockCount ] = $bp_record->{ "BLOCK_COUNT" }; + $bed_entry[ blockSizes ] = $bp_record->{ "BLOCK_LENS" }; + $bed_entry[ blockStarts ] = $bp_record->{ "Q_BEGS" }; + $bed_entry[ thickEnd ]++; + } + + return wantarray ? @bed_entry : \@bed_entry; +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> TAG CONTIGS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +sub tag_contigs_assemble +{ + # Martin A. Hansen, November 2008. + + # Returns a path to a BED file with tab contigs. + + my ( $bed_file, # path to BED file + $chr, # chromosome + $dir, # working directory + ) = @_; + + # Returns a string + + my ( $fh_in, $fh_out, $array, $new_file, $pos, $id, $beg, $end, $score, $entry ); + + $fh_in = Maasha::Filesys::file_read_open( $bed_file ); + + $array = tag_contigs_assemble_array( $fh_in ); + + close $fh_in; + + $new_file = "$bed_file.tag_contigs"; + + $fh_out = Maasha::Filesys::file_write_open( $new_file ); + + $pos = 0; + $id = 0; + + while ( ( $beg, $end, $score ) = tag_contigs_scan( $array, $pos ) and $beg ) + { + $entry->[ chrom ] = $chr; + $entry->[ chromStart ] = $beg; + $entry->[ chromEnd ] = $end; + $entry->[ name ] = sprintf( "TC%06d", $id ); + $entry->[ score ] = $score; + + bed_entry_put( $entry, $fh_out ); + + $pos = $end; + $id++; + } + + close $fh_out; + + return $new_file; +} + + +sub tag_contigs_assemble_array +{ + # Martin A. Hansen, November 2008. + + # Given a BED file with entries from only one + # chromosome assembles tag contigs from these + # ignoring strand information. Only tags with + # a score higher than the clone count over + # genomic loci (the score field) is included + # in the tag contigs. + + # ----------- tags + # ------------- + # ---------- + # ---------- + # ======================== tag contig + + + my ( $fh, # file handle to BED file + ) = @_; + + # Returns an arrayref. + + my ( $entry, $clones, $score, @array ); + + while ( $entry = bed_entry_get( $fh ) ) + { + if ( $entry->[ name ] =~ /_(\d+)$/ ) + { + $clones = $1; + + if ( $entry->[ score ] ) + { + $score = int( $clones / $entry->[ score ] ); + + map { $array[ $_ ] += $score } $entry->[ chromStart ] .. $entry->[ chromEnd ] - 1 if $score >= 1; + } + } + } + + return wantarray ? @array : \@array; +} + + +sub tag_contigs_scan +{ + # Martin A. Hansen, November 2008. + + # Scans an array with tag contigs and locates + # the next contig from a given position. The + # score of the tag contig is determined as the + # maximum value of the tag contig. If a tag contig + # is found a triple is returned with beg, end and score + # otherwise an empty list is returned. + + my ( $array, # array to scan + $beg, # position to start scanning from + ) = @_; + + # Returns an arrayref. + + my ( $end, $score ); + + $score = 0; + + while ( $beg < scalar @{ $array } and not $array->[ $beg ] ) { $beg++ } + + $end = $beg; + + while ( $array->[ $end ] ) + { + $score = Maasha::Calc::max( $score, $array->[ $end ] ); + + $end++ + } + + if ( $score > 0 ) { + return wantarray ? ( $beg, $end, $score ) : [ $beg, $end, $score ]; + } else { + return wantarray ? () : []; + } +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +1; + + +__END__ diff --git a/code_perl/Maasha/UCSC/PSL.pm b/code_perl/Maasha/UCSC/PSL.pm new file mode 100644 index 0000000..4c64565 --- /dev/null +++ b/code_perl/Maasha/UCSC/PSL.pm @@ -0,0 +1,200 @@ +package Maasha::UCSC::PSL; + +# Copyright (C) 2007-2008 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +# Routines for interaction with PSL entries and files. + +# Read more about the PSL format here: http://genome.ucsc.edu/goldenPath/help/hgTracksHelp.html#PSL + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +use strict; + +use Data::Dumper; +use Maasha::Common; +use Maasha::Filesys; + +use vars qw( @ISA @EXPORT_OK ); + +require Exporter; + +@ISA = qw( Exporter ); + +@EXPORT_OK = qw( +); + +use constant { + matches => 0, # PSL field names + misMatches => 1, + repMatches => 2, + nCount => 3, + qNumInsert => 4, + qBaseInsert => 5, + tNumInsert => 6, + tBaseInsert => 7, + strand => 8, + qName => 9, + qSize => 10, + qStart => 11, + qEnd => 12, + tName => 13, + tSize => 14, + tStart => 15, + tEnd => 16, + blockCount => 17, + blockSizes => 18, + qStarts => 19, + tStarts => 20, + MATCHES => 0, # Biopieces field names + MISMATCHES => 1, + REPMATCHES => 2, + NCOUNT => 3, + QNUMINSERT => 4, + QBASEINSERT => 5, + SNUMINSERT => 6, + SBASEINSERT => 7, + STRAND => 8, + Q_ID => 9, + Q_LEN => 10, + Q_BEG => 11, + Q_END => 12, + S_ID => 13, + S_LEN => 14, + S_BEG => 15, + S_END => 16, + BLOCKCOUNT => 17, + BLOCKSIZES => 18, + Q_BEGS => 19, + S_BEGS => 20, +}; + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PSL format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +sub psl_entry_get +{ + # Martin A. Hansen, August 2008. + + # Reads PSL next entry from a PSL file and returns an array ref. + + my ( $fh, # file handle of PSL filefull path to PSL file + ) = @_; + + # Returns hashref. + + my ( $line, @fields ); + + while ( $line = <$fh> ) + { + chomp $line; + + @fields = split "\t", $line; + + return wantarray ? @fields : \@fields if scalar @fields == 21; + } + + return undef; +} + + +sub psl2record +{ + # Martin A. Hansen, November 2008 + + # Converts PSL record keys to Biopiece record keys. + + my ( $psl, # hashref + ) = @_; + + # returns hashref + + my %record; + + %record = ( + REC_TYPE => "PSL", + BLOCKSIZES => $psl->{ 'blockSize' }, + SNUMINSERT => $psl->{ 'tNumInsert' }, + Q_END => $psl->{ 'qEnd' }, + SBASEINSERT => $psl->{ 'tBaseInsert' }, + S_END => $psl->{ 'tEnd' }, + QBASEINSERT => $psl->{ 'qBaseInsert' }, + REPMATCHES => $psl->{ 'repMatches' }, + QNUMINSERT => $psl->{ 'qNumInsert' }, + MISMATCHES => $psl->{ 'misMatches' }, + BLOCKCOUNT => $psl->{ 'blockCount' }, + Q_LEN => $psl->{ 'qSize' }, + S_ID => $psl->{ 'tName' }, + STRAND => $psl->{ 'strand' }, + Q_ID => $psl->{ 'qName' }, + MATCHES => $psl->{ 'matches' }, + S_LEN => $psl->{ 'tSize' }, + NCOUNT => $psl->{ 'nCount' }, + Q_BEGS => $psl->{ 'qStarts' }, + S_BEGS => $psl->{ 'tStarts' }, + S_BEG => $psl->{ 'tStart' }, + Q_BEG => $psl->{ 'qStart ' }, + ); + + $record{ "SCORE" } = $record{ "MATCHES" } + int( $record{ "REPMATCHES" } / 2 ) - $record{ "MISMATCHES" } - $record{ "QNUMINSERT" } - $record{ "SNUMINSERT" }; + $record{ "SPAN" } = $record{ "Q_END" } - $record{ "Q_BEG" }; + + return wantarray ? %record : \%record; +} + + +sub psl_calc_score +{ + # Martin A. Hansen, November 2008. + + # Calculates the score for a PSL entry according to: + # http://genome.ucsc.edu/FAQ/FAQblat#blat4 + + my ( $psl_entry, # array ref + ) = @_; + + # Returns a float + + my ( $score ); + + $score = $psl_entry[ MATCHES ] + + int( $psl_entry[ REPMATCHES ] / 2 ) - + $psl_entry[ MISMATCHES ] - + $psl_entry[ QNUMINSERT ] - + $psl_entry[ SNUMINSERT ]; + + return $score; +} + + + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +1; + + +__END__ diff --git a/code_perl/Maasha/UCSC/Wiggle.pm b/code_perl/Maasha/UCSC/Wiggle.pm new file mode 100644 index 0000000..5e7b5de --- /dev/null +++ b/code_perl/Maasha/UCSC/Wiggle.pm @@ -0,0 +1,306 @@ +package Maasha::UCSC::Wiggle; + +# Copyright (C) 2007-2008 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +# Routines for manipulation of wiggle data. + +# http://genome.ucsc.edu/goldenPath/help/wiggle.html + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +use strict; + +use Data::Dumper; +use Maasha::Common; +use Maasha::Filesys; + +use vars qw( @ISA @EXPORT_OK ); + +require Exporter; + +@ISA = qw( Exporter ); + +@EXPORT_OK = qw( +); + + +use constant { + chrom => 0, # BED field names + chromStart => 1, + chromEnd => 2, + name => 3, + score => 4, + strand => 5, + thickStart => 6, + thickEnd => 7, + itemRgb => 8, + blockCount => 9, + blockSizes => 10, + blockStarts => 11, +}; + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> FixedStep format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +sub fixedstep_entry_get +{ + # Martin A. Hansen, December 2007. + + # Given a file handle to a PhastCons file get the + # next entry which is all the lines after a "fixedStep" + # line and until the next "fixedStep" line or EOF. + + my ( $fh, # filehandle + ) = @_; + + # Returns a list of lines + + my ( $entry, @lines ); + + local $/ = "\nfixedStep "; + + $entry = <$fh>; + + chomp $entry; + + @lines = split "\n", $entry; + + return if @lines == 0; + + $lines[ 0 ] =~ s/fixedStep?\s*//; + $lines[ 0 ] = 'fixedStep ' . $lines[ 0 ]; + + return wantarray ? @lines : \@lines; +} + + +sub fixedstep_entry_put +{ + # Martin A. Hansen, April 2008. + + # Outputs a block of fixedStep values. + # Used for outputting wiggle data. + + my ( $chr, # chromosome + $beg, # start position + $block, # list of scores + $fh, # filehandle - OPTIONAL + ) = @_; + + # Returns nothing. + + $beg += 1; # fixedStep format is 1 based. + + $fh ||= \*STDOUT; + + print $fh "fixedStep chrom=$chr start=$beg step=1\n"; + + map { print $fh "$_\n" } @{ $block }; +} + + +sub fixedstep2biopiece +{ + # Martin A. Hansen, November 2008. + + # Converts a fixedstep entry to a Biopiece record. + + my ( $entry, # fixedstep entry arrayref + ) = @_; + + # Returns a hashref + + my ( $head, %record ); + + $head = shift @{ $entry }; + + if ( $head =~ /^fixedStep chrom=([^ ]+) start=(\d+) step=(\d+)$/ ) + { + $record{ "REC_TYPE" } = "fixed_step"; + $record{ "CHR" } = $1; + $record{ "CHR_BEG" } = $2; + $record{ "STEP" } = $3; + $record{ "VALS" } = join ";", @{ $entry }; + } + else + { + Maasha::Common::error( qq(could not convert fixedStep entry to Biopiece record) ); + } + + return wantarray ? %record : \%record; +} + + +sub biopiece2fixedstep +{ + # Martin A. Hansen, November 2008. + + # Converts a Biopiece record to a fixedStep entry. + + my ( $record, # Biopiece record + ) = @_; + + # Returns a list + + my ( @entry, $vals ); + + if ( exists $record->{ "REC_TYPE" } and $record->{ "REC_TYPE" } eq 'fixed_step' ) + { + if ( exists $record->{ 'CHR' } and exists $record->{ 'CHR_BEG' } and exists $record->{ 'STEP' } ) + { + push @entry, "fixedStep chrom=$record->{ 'CHR' } start=$record->{ 'CHR_BEG' } step=$record->{ 'STEP' }"; + + $vals = $record->{ 'VALS' }; + + map { push @entry, $_ } split ";", $vals; + } + else + { + Maasha::Common::error( qq(could not convert Biopiece record to fixedStep) ); + } + } + + return wantarray ? @entry : \@entry; +} + + +sub fixedstep_calc +{ + # Martin A. Hansen, November 2008. + + # Calculates fixedstep entries for a given BED file. + # Implemented using large Perl arrays to avoid sorting. + # Returns path to the fixedstep file. + + my ( $bed_file, # path to BED file + $chr, # chromosome name + $use_score, # flag indicating that the score field should be used - OPTIONAL + ) = @_; + + # Returns a string + + my ( $fixedstep_file, $fh_in, $fh_out, $array, $beg, $end, @block ); + + $fixedstep_file = "$bed_file.fixedstep"; + + $fh_in = Maasha::Filesys::file_read_open( $bed_file ); + + $array = fixedstep_calc_array( $fh_in, $use_score ); + + close $fh_in; + + $fh_out = Maasha::Filesys::file_write_open( $fixedstep_file ); + + $beg = 0; + + while ( ( $beg, $end ) = fixedstep_scan( $array, $beg ) and $beg ) + { + @block = @{ $array }[ $beg .. $end ]; + + fixedstep_entry_put( $chr, $beg, \@block, $fh_out ); + + $beg = $end; + } + + close $fh_out; + + return $fixedstep_file; +} + + +sub fixedstep_calc_array +{ + # Martin A. Hansen, November 2008. + + # Given a filehandle to a BED file for a single + # chromosome fills an array with scores based on + # clone count (or sequence read count) for the + # begin position through the end position of each + # BED entry. + + my ( $fh_in, # filehandle to BED file + $use_score, # flag indicating that the score field should be used - OPTIONAL + ) = @_; + + # Returns arrayref. + + my ( $bed, $clones, @array ); + + while ( $bed = Maasha::UCSC::BED::bed_entry_get( $fh_in, 5 ) ) + { + if ( $use_score ) { + $clones = $bed->[ score ]; + } elsif ( $bed->[ name ] =~ /_(\d+)$/ ) { + $clones = $1; + } else { + $clones = 1; + } + + map { $array[ $_ ] += $clones } $bed->[ chromStart ] .. $bed->[ chromEnd ] - 1; + } + + return wantarray ? @array : \@array; +} + + +sub fixedstep_scan +{ + # Martin A. Hansen, November 2008. + + # Given a fixedstep array locates the next block of + # non-zero elements from a given begin position. + # A [ begin end ] tuple of the block posittion is returned + # if a block was found otherwise and empty tuple is returned. + + my ( $array, # array to scan + $beg, # position to start scanning from + ) = @_; + + # Returns an arrayref. + + my ( $end ); + + while ( $beg < scalar @{ $array } and not $array->[ $beg ] ) { $beg++ } + + $end = $beg; + + while ( $array->[ $end ] ) { $end++ } + + if ( $end > $beg ) { + return wantarray ? ( $beg, $end ) : [ $beg, $end ]; + } else { + return wantarray ? () : []; + } +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +1; + + +__END__ -- 2.39.5