X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=code_perl%2FMaasha%2FBiopieces.pm;h=161f73d16eb39daa0d0f388f1082240bc10798ac;hb=55ce9bcb65d11cdebe020d16078013e282b5bd7c;hp=2a7ef7d0d54fbc9f038112b4437e4387f9baa61a;hpb=4f7a684fc91d1759b00d78ac98bc4a3fddff9748;p=biopieces.git diff --git a/code_perl/Maasha/Biopieces.pm b/code_perl/Maasha/Biopieces.pm index 2a7ef7d..161f73d 100644 --- a/code_perl/Maasha/Biopieces.pm +++ b/code_perl/Maasha/Biopieces.pm @@ -36,6 +36,7 @@ use Time::HiRes qw( gettimeofday ); use Storable qw( dclone ); use Maasha::Config; use Maasha::Common; +use Maasha::Filesys; use Maasha::Fasta; use Maasha::Align; use Maasha::Matrix; @@ -47,10 +48,13 @@ use Maasha::Patscan; use Maasha::Plot; use Maasha::Calc; use Maasha::UCSC; +use Maasha::UCSC::BED; +use Maasha::UCSC::Wiggle; use Maasha::NCBI; use Maasha::GFF; use Maasha::TwoBit; use Maasha::Solid; +use Maasha::Solexa; use Maasha::SQL; use Maasha::Gwiki; @@ -86,33 +90,31 @@ $SIG{ 'TERM' } = \&sig_handler; my ( $script, $BP_TMP ); -$script = Maasha::Common::get_scriptname(); -$BP_TMP = Maasha::Common::get_tmpdir(); +$script = Maasha::Common::get_scriptname(); +$BP_TMP = Maasha::Common::get_tmpdir(); # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> LOG <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -my $log_fh = Maasha::Common::append_open( "$ENV{ 'BP_TMP' }/biopieces.log" ); +my $log_global = Maasha::Common::append_open( "$ENV{ 'BP_LOG' }/biopieces.log" ); +my $log_local = Maasha::Common::append_open( "$ENV{ 'HOME' }/.biopieces.log" ); -$log_fh->autoflush( 1 ); +$log_global->autoflush( 1 ); +$log_local->autoflush( 1 ); -&log( $log_fh, $script, \@ARGV ); +&log( $log_global, $script, \@ARGV ); +&log( $log_local, $script, \@ARGV ); -close $log_fh; +close $log_global; +close $log_local; # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> RUN SCRIPT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -my $t0 = gettimeofday(); - run_script( $script ); -my $t1 = gettimeofday(); - -print STDERR "Program: $script" . ( " " x ( 25 - length( $script ) ) ) . sprintf( "Run time: %.4f\n", ( $t1 - $t0 ) ); - # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SUBROUTINES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< @@ -153,10 +155,14 @@ sub run_script # Returns nothing. - my ( $options, $in, $out ); + my ( $t0, $t1, $options, $in, $out ); + + $t0 = gettimeofday(); $options = get_options( $script ); + $options->{ "SCRIPT" } = $script; + if ( $script ne "list_biopieces" and $script ne "list_genomes" ) { $script = "print_usage" if ( -t STDIN and keys %{ $options } <= 1 or $options->{ 'help' } ); } @@ -182,6 +188,8 @@ sub run_script elsif ( $script eq "read_solexa" ) { script_read_solexa( $in, $out, $options ) } elsif ( $script eq "read_solid" ) { script_read_solid( $in, $out, $options ) } elsif ( $script eq "read_mysql" ) { script_read_mysql( $in, $out, $options ) } + elsif ( $script eq "read_ucsc_config" ) { script_read_ucsc_config( $in, $out, $options ) } + elsif ( $script eq "assemble_tag_contigs" ) { script_assemble_tag_contigs( $in, $out, $options ) } elsif ( $script eq "format_genome" ) { script_format_genome( $in, $out, $options ) } elsif ( $script eq "length_seq" ) { script_length_seq( $in, $out, $options ) } elsif ( $script eq "uppercase_seq" ) { script_uppercase_seq( $in, $out, $options ) } @@ -192,6 +200,7 @@ sub run_script elsif ( $script eq "oligo_freq" ) { script_oligo_freq( $in, $out, $options ) } elsif ( $script eq "create_weight_matrix" ) { script_create_weight_matrix( $in, $out, $options ) } elsif ( $script eq "calc_bit_scores" ) { script_calc_bit_scores( $in, $out, $options ) } + elsif ( $script eq "calc_fixedstep" ) { script_calc_fixedstep( $in, $out, $options ) } elsif ( $script eq "reverse_seq" ) { script_reverse_seq( $in, $out, $options ) } elsif ( $script eq "complement_seq" ) { script_complement_seq( $in, $out, $options ) } elsif ( $script eq "remove_indels" ) { script_remove_indels( $in, $out, $options ) } @@ -212,6 +221,7 @@ sub run_script elsif ( $script eq "create_blast_db" ) { script_create_blast_db( $in, $out, $options ) } elsif ( $script eq "blast_seq" ) { script_blast_seq( $in, $out, $options ) } elsif ( $script eq "blat_seq" ) { script_blat_seq( $in, $out, $options ) } + elsif ( $script eq "soap_seq" ) { script_soap_seq( $in, $out, $options ) } elsif ( $script eq "match_seq" ) { script_match_seq( $in, $out, $options ) } elsif ( $script eq "create_vmatch_index" ) { script_create_vmatch_index( $in, $out, $options ) } elsif ( $script eq "vmatch_seq" ) { script_vmatch_seq( $in, $out, $options ) } @@ -224,11 +234,16 @@ sub run_script elsif ( $script eq "write_fixedstep" ) { script_write_fixedstep( $in, $out, $options ) } elsif ( $script eq "write_2bit" ) { script_write_2bit( $in, $out, $options ) } elsif ( $script eq "write_solid" ) { script_write_solid( $in, $out, $options ) } + elsif ( $script eq "write_ucsc_config" ) { script_write_ucsc_config( $in, $out, $options ) } elsif ( $script eq "head_records" ) { script_head_records( $in, $out, $options ) } elsif ( $script eq "remove_keys" ) { script_remove_keys( $in, $out, $options ) } + elsif ( $script eq "remove_adaptor" ) { script_remove_adaptor( $in, $out, $options ) } + elsif ( $script eq "remove_mysql_tables" ) { script_remove_mysql_tables( $in, $out, $options ) } + elsif ( $script eq "remove_ucsc_tracks" ) { script_remove_ucsc_tracks( $in, $out, $options ) } elsif ( $script eq "rename_keys" ) { script_rename_keys( $in, $out, $options ) } elsif ( $script eq "uniq_vals" ) { script_uniq_vals( $in, $out, $options ) } elsif ( $script eq "merge_vals" ) { script_merge_vals( $in, $out, $options ) } + elsif ( $script eq "merge_records" ) { script_merge_records( $in, $out, $options ) } elsif ( $script eq "grab" ) { script_grab( $in, $out, $options ) } elsif ( $script eq "compute" ) { script_compute( $in, $out, $options ) } elsif ( $script eq "flip_tab" ) { script_flip_tab( $in, $out, $options ) } @@ -257,8 +272,9 @@ sub run_script close $in if defined $in; close $out; - # unset status - missing - # write log file - missing + $t1 = gettimeofday(); + + print STDERR "Program: $script" . ( " " x ( 25 - length( $script ) ) ) . sprintf( "Run time: %.4f\n", ( $t1 - $t0 ) ) if $options->{ 'verbose' }; } @@ -273,7 +289,7 @@ sub get_options # Returns hash - my ( %options, @options, $opt, @genomes ); + my ( %options, @options, $opt, @genomes, $real ); if ( $script eq "print_usage" ) { @@ -310,7 +326,9 @@ sub get_options { @options = qw( data_in|i=s + cols|c=s num|n=s + check|C ); } elsif ( $script eq "read_fixedstep" ) @@ -359,6 +377,7 @@ sub get_options { @options = qw( data_in|i=s + samples|s=s num|n=s ); } @@ -382,6 +401,7 @@ sub get_options @options = qw( data_in|i=s num|n=s + format|f=s quality|q=s ); } @@ -402,6 +422,25 @@ sub get_options password|p=s ); } + elsif ( $script eq "read_ucsc_config" ) + { + @options = qw( + data_in|i=s + num|n=s + ); + } + elsif ( $script eq "assemble_tag_contigs" ) + { + @options = qw( + check|C + ); + } + elsif ( $script eq "calc_fixedstep" ) + { + @options = qw( + check|C + ); + } elsif ( $script eq "format_genome" ) { @options = qw( @@ -565,6 +604,17 @@ sub get_options ooc|c ); } + elsif ( $script eq "soap_seq" ) + { + @options = qw( + in_file|i=s + genome|g=s + seed_size|s=s + mismatches|m=s + gap_size|G=s + cpus|c=s + ); + } elsif ( $script eq "match_seq" ) { @options = qw( @@ -634,6 +684,8 @@ sub get_options elsif ( $script eq "write_bed" ) { @options = qw( + cols|c=s + check|C no_stream|x data_out|o=s compress|Z @@ -672,6 +724,13 @@ sub get_options compress|Z ); } + elsif ( $script eq "write_ucsc_config" ) + { + @options = qw( + no_stream|x + data_out|o=s + ); + } elsif ( $script eq "plot_seqlogo" ) { @options = qw( @@ -714,6 +773,38 @@ sub get_options save_keys|K=s ); } + elsif ( $script eq "remove_adaptor" ) + { + @options = qw( + adaptor|a=s + mismatches|m=s + remove|r=s + offset|o=s + ); + } + elsif ( $script eq "remove_mysql_tables" ) + { + @options = qw( + database|d=s + tables|t=s + keys|k=s + user|u=s + password|p=s + no_stream|x + ); + } + elsif ( $script eq "remove_ucsc_tracks" ) + { + @options = qw( + database|d=s + tracks|t=s + keys|k=s + config_file|c=s + user|u=s + password|p=s + no_stream|x + ); + } elsif ( $script eq "rename_keys" ) { @options = qw( @@ -734,6 +825,13 @@ sub get_options delimit|d=s ); } + elsif ( $script eq "merge_records" ) + { + @options = qw( + keys|k=s + merge|m=s + ); + } elsif ( $script eq "grab" ) { @options = qw( @@ -902,32 +1000,29 @@ sub get_options group|g=s priority|p=f use_score|u - visibility|v=s - wiggle|w + visibility|V=s color|c=s - chunk_size|C=s + check|C ); } push @options, qw( stream_in|I=s stream_out|O=s - verbose + verbose|v help|? ); # print STDERR Dumper( \@options ); - + GetOptions( \%options, @options, ); - $options{ "script" } = $script; - # print STDERR Dumper( \%options ); - if ( -t STDIN && scalar( keys %options ) == 1 or $options{ "help" } ) { + if ( -t STDIN && scalar( keys %options ) == 0 or $options{ "help" } ) { return wantarray ? %options : \%options; } @@ -939,6 +1034,9 @@ sub get_options $options{ "feats" } = [ split ",", $options{ "feats" } ] if defined $options{ "feats" }; $options{ "frames" } = [ split ",", $options{ "frames" } ] if defined $options{ "frames" }; $options{ "formats" } = [ split ",", $options{ "formats" } ] if defined $options{ "formats" }; + $options{ "samples" } = [ split ",", $options{ "samples" } ] if defined $options{ "samples" }; + $options{ "tables" } = [ split ",", $options{ "tables" } ] if defined $options{ "tables" }; + $options{ "tracks" } = [ split ",", $options{ "tracks" } ] if defined $options{ "tracks" }; # ---- check arguments ---- @@ -951,7 +1049,9 @@ sub get_options map { Maasha::Common::error( qq(Argument to --cols must be a whole numbers - not "$_") ) if $_ !~ /^\d+$/ } @{ $options{ "cols" } } if $options{ "cols" }; -# print STDERR Dumper( \%options ); + # print STDERR Dumper( \%options ); + + $real = "beg|end|word_size|wrap|tile_size|len|prefix_length|mismatches|offset|num|skip|cpus|window_size|step_size"; foreach $opt ( keys %options ) { @@ -959,7 +1059,7 @@ sub get_options { Maasha::Common::error( qq(Argument to --$opt must be a valid file or fileglob expression - not "$options{ $opt }") ); } - elsif ( $opt =~ /beg|end|word_size|wrap|chunk_size|tile_size|len|prefix_length|num|skip|cpus|window_size|step_size/ and $options{ $opt } !~ /^\d+$/ ) + elsif ( $opt =~ /$real/ and $options{ $opt } !~ /^\d+$/ ) { Maasha::Common::error( qq(Argument to --$opt must be a whole number - not "$options{ $opt }") ); } @@ -988,13 +1088,25 @@ sub get_options Maasha::Common::error( qq(Genome $options{ $opt } not found in "$ENV{ 'BP_DATA' }/genomes/") ); } } - elsif ( $opt eq "terminal" and not $options{ $opt } =~ /^(svg|post|dumb)/ ) + elsif ( $opt eq "terminal" and not $options{ $opt } =~ /^(svg|post|dumb|x11)/ ) { Maasha::Common::error( qq(Bad --$opt argument "$options{ $opt }") ); } - elsif ( $opt eq "table" and $options{ $opt } =~ /-\./ ) + elsif ( $opt eq "table" and $options{ $opt } =~ /(-|\.)/ ) + { + Maasha::Common::error( qq(Character '$1' is not allowed in table name: $options{ $opt }) ); + } + elsif ( $opt eq "merge" and $options{ $opt } !~ /^(AandB|AorB|BorA|AnotB|BnotA)$/ ) + { + Maasha::Common::error( qq(Argument to --$opt must be AandB, AorB, BorA, AnotB, or BnotA - not "$options{ $opt }") ); + } + elsif ( $opt eq "format" and $script eq "read_solexa" and $options{ $opt } !~ /octal|decimal/ ) + { + Maasha::Common::error( qq(Argument to --$opt must be octal or decimal - not "$options{ $opt }") ); + } + elsif ( $opt eq "remove" and $script eq "remove_adaptor" and $options{ $opt } !~ /before|after|skip/ ) { - Maasha::Common::error( qq(Character '$options{ $opt }' is not allowed in table names) ); + Maasha::Common::error( qq(Argument to --$opt must be before, after, or skip - not "$options{ $opt }") ); } } @@ -1003,7 +1115,9 @@ sub get_options Maasha::Common::error( qq(no --database or --genome specified) ) if $script eq "blast_seq" and not $options{ "genome" } and not $options{ "database" }; Maasha::Common::error( qq(both --database and --genome specified) ) if $script eq "blast_seq" and $options{ "genome" } and $options{ "database" }; Maasha::Common::error( qq(no --index_name or --genome specified) ) if $script eq "vmatch_seq" and not $options{ "genome" } and not $options{ "index_name" }; - Maasha::Common::error( qq(both --index and --genome specified) ) if $script eq "vmatch_seq" and $options{ "genome" } and $options{ "index" }; + Maasha::Common::error( qq(both --index and --genome specified) ) if $script eq "vmatch_seq" and $options{ "genome" } and $options{ "index_name" }; + Maasha::Common::error( qq(no --in_file or --genome specified) ) if $script eq "soap_seq" and not $options{ "genome" } and not $options{ "in_file" }; + Maasha::Common::error( qq(both --in_file and --genome specified) ) if $script eq "soap_seq" and $options{ "genome" } and $options{ "in_file" }; Maasha::Common::error( qq(no --genome specified) ) if $script =~ /format_genome|get_genome_seq|get_genome_align|get_genome_phastcons|blat_seq|plot_phastcons_profiles|plot_karyogram/ and not $options{ "genome" }; Maasha::Common::error( qq(no --key specified) ) if $script =~ /plot_lendist|plot_histogram/ and not $options{ "key" }; Maasha::Common::error( qq(no --keys speficied) ) if $script =~ /sort_records|count_vals|sum_vals|mean_vals|median_vals|length_vals/ and not $options{ "keys" }; @@ -1040,7 +1154,7 @@ sub script_print_usage if ( $options->{ 'data_in' } ) { $file = $options->{ 'data_in' }; } else { - $file = join "", $ENV{ 'BP_DIR' }, "/bp_usage/", $options->{ 'script' }, ".wiki"; + $file = join "", $ENV{ 'BP_DIR' }, "/bp_usage/", $options->{ 'SCRIPT' }, ".wiki"; } $wiki = Maasha::Gwiki::gwiki_read( $file ); @@ -1086,6 +1200,7 @@ sub script_list_biopieces @{ $wiki } = grep { $_->[ 0 ]->{ 'FORMAT' } =~ /paragraph/ } @{ $wiki }; $synopsis = $wiki->[ 0 ]->[ 0 ]->{ 'TEXT' }; + $synopsis =~ s/!(\w)/$1/g; printf( "%-30s%s\n", $program, $synopsis ); } @@ -1294,7 +1409,7 @@ sub script_read_psl # Returns nothing. - my ( $record, @files, $file, $entries, $entry, $num ); + my ( $record, $file, $data_in, $num ); while ( $record = get_record( $in ) ) { put_record( $record, $out ); @@ -1304,11 +1419,11 @@ sub script_read_psl foreach $file ( @{ $options->{ "files" } } ) { - $entries = Maasha::UCSC::psl_get_entries( $file ); + $data_in = Maasha::Common::read_open( $file ); - foreach $entry ( @{ $entries } ) + while ( $record = Maasha::UCSC::psl_get_entry( $data_in ) ) { - put_record( $entry, $out ); + put_record( $record, $out ); goto NUM if $options->{ "num" } and $num == $options->{ "num" }; @@ -1333,7 +1448,9 @@ sub script_read_bed # Returns nothing. - my ( $file, $record, $entry, $data_in, $num ); + my ( $cols, $file, $record, $bed_entry, $data_in, $num ); + + $cols = $options->{ 'cols' }->[ 0 ]; while ( $record = get_record( $in ) ) { put_record( $record, $out ); @@ -1345,9 +1462,11 @@ sub script_read_bed { $data_in = Maasha::Common::read_open( $file ); - while ( $entry = Maasha::UCSC::bed_get_entry( $data_in ) ) + while ( $bed_entry = Maasha::UCSC::BED::bed_entry_get( $data_in, $cols, $options->{ 'check' } ) ) { - put_record( $entry, $out ); + $record = Maasha::UCSC::BED::bed2biopiece( $bed_entry ); + + put_record( $record, $out ); goto NUM if $options->{ "num" } and $num == $options->{ "num" }; @@ -1367,7 +1486,7 @@ sub script_read_fixedstep { # Martin A. Hansen, Juli 2008. - # Read fixedStep wiggle format from stream or file. + # Read fixedstep wiggle format from stream or file. my ( $in, # handle to in stream $out, # handle to out stream @@ -1376,7 +1495,7 @@ sub script_read_fixedstep # Returns nothing. - my ( $file, $record, $entry, $head, $chr, $chr_beg, $step, $data_in, $num ); + my ( $file, $record, $entry, $data_in, $num ); while ( $record = get_record( $in ) ) { put_record( $record, $out ); @@ -1388,17 +1507,9 @@ sub script_read_fixedstep { $data_in = Maasha::Common::read_open( $file ); - while ( $entry = Maasha::UCSC::fixedstep_get_entry( $data_in ) ) + while ( $entry = Maasha::UCSC::Wiggle::fixedstep_entry_get( $data_in ) ) { - $head = shift @{ $entry }; - - if ( $head =~ /^chrom=([^ ]+) start=(\d+) step=(\d+)$/ ) - { - $record->{ "CHR" } = $1; - $record->{ "CHR_BEG" } = $2; - $record->{ "STEP" } = $3; - $record->{ "VALS" } = join ",", @{ $entry }; - } + $record = Maasha::UCSC::Wiggle::fixedstep2biopiece( $entry ); put_record( $record, $out ); @@ -1702,7 +1813,7 @@ sub script_read_soft # Returns nothing. - my ( $data_in, $file, $num, $records, $record, $soft_index, $fh, @platforms, $plat_table, @samples, $sample, $old_end ); + my ( $data_in, $file, $num, $records, $record, $soft_index, $fh, @platforms, $plat_table, @samples, $sample, $old_end, $skip ); while ( $record = get_record( $in ) ) { put_record( $record, $out ); @@ -1712,21 +1823,30 @@ sub script_read_soft foreach $file ( @{ $options->{ "files" } } ) { + print STDERR "Creating index for file: $file\n" if $options->{ "verbose" }; + $soft_index = Maasha::NCBI::soft_index_file( $file ); $fh = Maasha::Common::read_open( $file ); - @platforms = grep { $_->[ 0 ] =~ /PLATFORM/ } @{ $soft_index }; + @platforms = grep { $_->{ "SECTION" } =~ /PLATFORM/ } @{ $soft_index }; + + print STDERR "Getting platform tables for file: $file\n" if $options->{ "verbose" }; - $plat_table = Maasha::NCBI::soft_get_platform( $fh, $platforms[ 0 ]->[ 1 ], $platforms[ -1 ]->[ 2 ] ); + $plat_table = Maasha::NCBI::soft_get_platform( $fh, $platforms[ 0 ]->{ "LINE_BEG" }, $platforms[ -1 ]->{ "LINE_END" } ); - @samples = grep { $_->[ 0 ] =~ /SAMPLE/ } @{ $soft_index }; + @samples = grep { $_->{ "SECTION" } =~ /SAMPLE/ } @{ $soft_index }; - $old_end = $platforms[ -1 ]->[ 2 ]; + $old_end = $platforms[ -1 ]->{ "LINE_END" }; foreach $sample ( @samples ) { - $records = Maasha::NCBI::soft_get_sample( $fh, $plat_table, $sample->[ 1 ] - $old_end - 1, $sample->[ 2 ] - $old_end - 1 ); + $skip = 0; + $skip = 1 if ( $options->{ "samples" } and grep { $sample->{ "SECTION" } !~ /$_/ } @{ $options->{ "samples" } } ); + + print STDERR "Getting samples for dataset: $sample->{ 'SECTION' }\n" if $options->{ "verbose" } and not $skip; + + $records = Maasha::NCBI::soft_get_sample( $fh, $plat_table, $sample->{ "LINE_BEG" } - $old_end - 1, $sample->{ "LINE_END" } - $old_end - 1, $skip ); foreach $record ( @{ $records } ) { @@ -1737,7 +1857,7 @@ sub script_read_soft $num++; } - $old_end = $sample->[ 2 ]; + $old_end = $sample->{ "LINE_END" }; } close $fh; @@ -1858,8 +1978,9 @@ sub script_read_solexa # Returns nothing. - my ( $record, $file, $base_name, $data_in, $line, $num, @fields, @seqs, @scores, $i, $seq, $seq_count ); + my ( $record, $file, $data_in, $entry, $num, @seqs, @scores, $i ); + $options->{ "format" } ||= "octal"; $options->{ "quality" } ||= 20; while ( $record = get_record( $in ) ) { @@ -1870,35 +1991,33 @@ sub script_read_solexa foreach $file ( @{ $options->{ "files" } } ) { - $data_in = Maasha::Common::read_open( $file ); - $base_name = Maasha::Common::get_basename( $file ); - $base_name =~ s/\..*//; - - $seq_count = 0; + $data_in = Maasha::Common::read_open( $file ); - while ( $line = <$data_in> ) + if ( $options->{ "format" } eq "octal" ) { - @fields = split /:/, $line; - @seqs = split //, $fields[ 5 ]; - @scores = split / /, $fields[ -1 ]; + while ( $entry = Maasha::Solexa::solexa_get_entry_octal( $data_in ) ) + { + $record = Maasha::Solexa::solexa2biopiece( $entry, $options->{ "quality" } ); - for ( $i = 0; $i < @scores; $i++ ) { - $seqs[ $i ] = lc $seqs[ $i ] if $scores[ $i ] < $options->{ "quality" }; - } + put_record( $record, $out ); - $seq = join "", @seqs; + goto NUM if $options->{ "num" } and $num == $options->{ "num" }; - $record->{ "SEQ_NAME" } = sprintf( "%s_ID%08d", $base_name, $seq_count ); - $record->{ "SEQ" } = $seq; - $record->{ "SEQ_LEN" } = length $seq; - $record->{ "SCORE_MEAN" } = sprintf ( "%.2f", Maasha::Calc::mean( \@scores ) ); + $num++; + } + } + else + { + while ( $entry = Maasha::Solexa::solexa_get_entry_decimal( $data_in ) ) + { + $record = Maasha::Solexa::solexa2biopiece( $entry, $options->{ "quality" } ); - put_record( $record, $out ); + put_record( $record, $out ); - goto NUM if $options->{ "num" } and $num == $options->{ "num" }; + goto NUM if $options->{ "num" } and $num == $options->{ "num" }; - $seq_count++; - $num++; + $num++; + } } close $data_in; @@ -1951,9 +2070,10 @@ sub script_read_solid } $record = { + REC_TYPE => 'SOLID', SEQ_NAME => $seq_name, SEQ_CS => $seq_cs, - SEQ_QUAL => $seq_qual, + SEQ_QUAL => join( ";", @scores ), SEQ_LEN => length $seq_cs, SEQ => join( "", @seqs ), SCORE_MEAN => sprintf( "%.2f", Maasha::Calc::mean( \@scores ) ), @@ -2007,6 +2127,115 @@ sub script_read_mysql } +sub script_read_ucsc_config +{ + # Martin A. Hansen, November 2008. + + # Read track entries from UCSC Genome Browser '.ra' files. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $file, $data_in, $entry, $num ); + + while ( $record = get_record( $in ) ) { + put_record( $record, $out ); + } + + $num = 1; + + foreach $file ( @{ $options->{ "files" } } ) + { + $data_in = Maasha::Common::read_open( $file ); + + while ( $record = Maasha::UCSC::ucsc_config_entry_get( $data_in ) ) + { + $record->{ 'REC_TYPE' } = "UCSC Config"; + + put_record( $record, $out ); + + goto NUM if $options->{ "num" } and $num == $options->{ "num" }; + + $num++; + } + + close $data_in; + } + + NUM: + + close $data_in if $data_in; +} + + +sub script_assemble_tag_contigs +{ + # Martin A. Hansen, November 2008. + + # Assemble tags from the stream into + # tag contigs. + + # The current implementation is quite + # slow because of heavy use of temporary + # files. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $bed_file, $tag_file, $fh_in, $fh_out, $cols, $record, $bed_entry, $file_hash, $chr, $strand ); + + $bed_file = "$BP_TMP/assemble_tag_contigs.bed"; + $fh_out = Maasha::Filesys::file_write_open( $bed_file ); + $cols = 6; # we only need the first 6 BED columns + + while ( $record = get_record( $in ) ) + { + if ( $bed_entry = Maasha::UCSC::BED::biopiece2bed( $record, $cols ) ) + { + $strand = $record->{ 'STRAND' } || '+'; + + Maasha::UCSC::BED::bed_entry_put( $bed_entry, $fh_out, $cols, $options->{ 'check' } ); + } + } + + close $fh_out; + + $file_hash = Maasha::UCSC::BED::bed_file_split_on_chr( $bed_file, $BP_TMP, $cols ); + + unlink $bed_file; + + foreach $chr ( sort keys %{ $file_hash } ) + { + $bed_file = $file_hash->{ $chr }; + $tag_file = "$bed_file.tc"; + + Maasha::Common::run( "bed2tag_contigs", "< $bed_file > $tag_file" ); + + $fh_in = Maasha::Filesys::file_read_open( $tag_file ); + + while ( $bed_entry = Maasha::UCSC::BED::bed_entry_get( $fh_in, $cols, $options->{ 'check' } ) ) + { + if ( $record = Maasha::UCSC::BED::bed2biopiece( $bed_entry ) ) { + put_record( $record, $out ); + } + } + + close $fh_in; + + unlink $bed_file; + unlink $tag_file; + } +} + + sub script_format_genome { # Martin A. Hansen, Juli 2008. @@ -2020,7 +2249,7 @@ sub script_format_genome # Returns nothing. - my ( $dir, $genome, $fasta_dir, $phastcons_dir, $vals, $fh_out, $record, $format, $index ); + my ( $dir, $genome, $fasta_dir, $phastcons_dir, $vals, $fh_out, $record, $format, $index, $entry ); $dir = $options->{ 'dir' } || $ENV{ 'BP_DATA' }; $genome = $options->{ 'genome' }; @@ -2055,9 +2284,9 @@ sub script_format_genome while ( $record = get_record( $in ) ) { - if ( $fh_out and $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) + if ( $fh_out and $entry = Maasha::Fasta::biopiece2fasta( $record ) ) { - Maasha::Fasta::put_entry( [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ], $fh_out, $options->{ "wrap" } ); + Maasha::Fasta::put_entry( $entry, $fh_out, $options->{ "wrap" } ); } elsif ( $fh_out and $record->{ "CHR" } and $record->{ "CHR_BEG" } and $record->{ "STEP" } and $record->{ "VALS" } ) { @@ -2427,6 +2656,62 @@ sub script_calc_bit_scores } +sub script_calc_fixedstep +{ + # Martin A. Hansen, September 2008. + + # Calculates fixedstep entries from data in the stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $bed_file, $fh_in, $fh_out, $cols, $record, $file_hash, $chr, $bed_entry, $fixedstep_file, $fixedstep_entry ); + + $bed_file = "$BP_TMP/calc_fixedstep.bed"; + $fh_out = Maasha::Filesys::file_write_open( $bed_file ); + $cols = 5; # we only need the first 5 BED columns + + while ( $record = get_record( $in ) ) + { + if ( $bed_entry = Maasha::UCSC::BED::biopiece2bed( $record, $cols ) ) { + Maasha::UCSC::BED::bed_entry_put( $bed_entry, $fh_out, $cols, $options->{ 'check' } ); + } + } + + close $fh_out; + + $file_hash = Maasha::UCSC::BED::bed_file_split_on_chr( $bed_file, $BP_TMP, $cols ); + + unlink $bed_file; + + foreach $chr ( sort keys %{ $file_hash } ) + { + $bed_file = $file_hash->{ $chr }; + $fixedstep_file = "$bed_file.fixedstep"; + + Maasha::Common::run( "bed2fixedstep", "< $bed_file > $fixedstep_file" ); + + $fh_in = Maasha::Filesys::file_read_open( $fixedstep_file ); + + while ( $fixedstep_entry = Maasha::UCSC::Wiggle::fixedstep_entry_get( $fh_in ) ) + { + if ( $record = Maasha::UCSC::Wiggle::fixedstep2biopiece( $fixedstep_entry ) ) { + put_record( $record, $out ); + } + } + + close $fh_in; + + unlink $bed_file; + unlink $fixedstep_file; + } +} + + sub script_reverse_seq { # Martin A. Hansen, August 2007. @@ -2881,6 +3166,10 @@ sub script_get_genome_align { $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "CHR" }, $record->{ "CHR_BEG" }, $record->{ "CHR_END" }, $record->{ "STRAND" } ); } + elsif ( $record->{ "REC_TYPE" } eq "VMATCH" ) + { + $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" } + 1, $record->{ "STRAND" } ); + } elsif ( $record->{ "REC_TYPE" } eq "PSL" ) { $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $record->{ "STRAND" } ); @@ -2927,7 +3216,7 @@ sub script_get_genome_phastcons $phastcons_file = Maasha::Config::genome_phastcons( $options->{ "genome" } ); $phastcons_index = Maasha::Config::genome_phastcons_index( $options->{ "genome" } ); - $index = Maasha::UCSC::phastcons_index_retrieve( $phastcons_index ); + $index = Maasha::UCSC::fixedstep_index_retrieve( $phastcons_index ); $fh_phastcons = Maasha::Common::read_open( $phastcons_file ); if ( defined $options->{ "chr" } and defined $options->{ "beg" } and ( defined $options->{ "end" } or defined $options->{ "len" } ) ) @@ -2939,7 +3228,7 @@ sub script_get_genome_phastcons $options->{ "end" } = $options->{ "beg" } + $options->{ "len" } - 1; } - $scores = Maasha::UCSC::phastcons_index_lookup( $index, $fh_phastcons, $options->{ "chr" }, $options->{ "beg" }, $options->{ "end" }, $options->{ "flank" } ); + $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $options->{ "chr" }, $options->{ "beg" }, $options->{ "end" }, $options->{ "flank" } ); $record->{ "CHR" } = $options->{ "chr" }; $record->{ "CHR_BEG" } = $options->{ "beg" } - $options->{ "flank" }; @@ -2955,15 +3244,15 @@ sub script_get_genome_phastcons { if ( $record->{ "REC_TYPE" } eq "BED" ) { - $scores = Maasha::UCSC::phastcons_index_lookup( $index, $fh_phastcons, $record->{ "CHR" }, $record->{ "CHR_BEG" }, $record->{ "CHR_END" }, $options->{ "flank" } ); + $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "CHR" }, $record->{ "CHR_BEG" }, $record->{ "CHR_END" }, $options->{ "flank" } ); } elsif ( $record->{ "REC_TYPE" } eq "PSL" ) { - $scores = Maasha::UCSC::phastcons_index_lookup( $index, $fh_phastcons, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $options->{ "flank" } ); + $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $options->{ "flank" } ); } elsif ( $record->{ "REC_TYPE" } eq "BLAST" ) { - $scores = Maasha::UCSC::phastcons_index_lookup( $index, $fh_phastcons, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $options->{ "flank" } ); + $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $options->{ "flank" } ); } $record->{ "PHASTCONS" } = join ",", @{ $scores } if @{ $scores }; @@ -3279,8 +3568,6 @@ sub script_patscan_seq $i++; } - -# put_record( $record, $out ); } close $fh_out; @@ -3350,7 +3637,7 @@ sub script_create_blast_db # Returns nothing. - my ( $fh, $seq_type, $path, $record ); + my ( $fh, $seq_type, $path, $record, $entry ); $path = $options->{ "database" }; @@ -3360,11 +3647,11 @@ sub script_create_blast_db { put_record( $record, $out ) if not $options->{ "no_stream" }; - if ( $record->{ "SEQ" } and $record->{ "SEQ_NAME" } ) + if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) { - $seq_type = Maasha::Seq::seq_guess_type( $record->{ "SEQ" } ) if not $seq_type; + $seq_type = Maasha::Seq::seq_guess_type( $entry->[ SEQ ] ) if not $seq_type; - Maasha::Fasta::put_entry( [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ], $fh ); + Maasha::Fasta::put_entry( $entry, $fh ); } } @@ -3393,27 +3680,27 @@ sub script_blast_seq # Returns nothing. - my ( $genome, $q_type, $s_type, $tmp_in, $tmp_out, $fh_in, $fh_out, $record, $line, @fields ); + my ( $genome, $q_type, $s_type, $tmp_in, $tmp_out, $fh_in, $fh_out, $record, $line, @fields, $entry ); $options->{ "e_val" } = 10 if not defined $options->{ "e_val" }; $options->{ "filter" } = "F"; $options->{ "filter" } = "T" if $options->{ "filter" }; $options->{ "cpus" } ||= 1; - $options->{ "database" } = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/blast/$options->{ 'genome' }.fna"if $options->{ 'genome' }; + $options->{ "database" } = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/blast/$options->{ 'genome' }.fna" if $options->{ 'genome' }; $tmp_in = "$BP_TMP/blast_query.seq"; $tmp_out = "$BP_TMP/blast.result"; - $fh_out = Maasha::Common::write_open( $tmp_in ); + $fh_out = Maasha::Filesys::file_write_open( $tmp_in ); while ( $record = get_record( $in ) ) { - if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) + if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) { - $q_type = Maasha::Seq::seq_guess_type( $record->{ "SEQ" } ) if not $q_type; + $q_type = Maasha::Seq::seq_guess_type( $entry->[ SEQ ] ) if not $q_type; - Maasha::Fasta::put_entry( [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ], $fh_out ); + Maasha::Fasta::put_entry( $entry, $fh_out ); } put_record( $record, $out ); @@ -3440,11 +3727,45 @@ sub script_blast_seq } } - Maasha::Common::run( "blastall", "-p $options->{ 'program' } -e $options->{ 'e_val' } -a $options->{ 'cpus' } -m 8 -i $tmp_in -d $options->{ 'database' } -F $options->{ 'filter' } -o $tmp_out > /dev/null 2>&1", 1 ); + if ( $options->{ 'verbose' } ) + { + Maasha::Common::run( + "blastall", + join( " ", + "-p $options->{ 'program' }", + "-e $options->{ 'e_val' }", + "-a $options->{ 'cpus' }", + "-m 8", + "-i $tmp_in", + "-d $options->{ 'database' }", + "-F $options->{ 'filter' }", + "-o $tmp_out", + ), + 1 + ); + } + else + { + Maasha::Common::run( + "blastall", + join( " ", + "-p $options->{ 'program' }", + "-e $options->{ 'e_val' }", + "-a $options->{ 'cpus' }", + "-m 8", + "-i $tmp_in", + "-d $options->{ 'database' }", + "-F $options->{ 'filter' }", + "-o $tmp_out", + "> /dev/null 2>&1" + ), + 1 + ); + } unlink $tmp_in; - $fh_out = Maasha::Common::read_open( $tmp_out ); + $fh_out = Maasha::Filesys::file_read_open( $tmp_out ); undef $record; @@ -3503,7 +3824,7 @@ sub script_blat_seq # Returns nothing. - my ( $blat_args, $genome_file, $query_file, $fh_in, $fh_out, $type, $record, $result_file, $entries ); + my ( $blat_args, $genome_file, $query_file, $fh_in, $fh_out, $type, $record, $result_file, $entries, $entry ); $genome_file = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/fasta/$options->{ 'genome' }.fna"; @@ -3526,10 +3847,10 @@ sub script_blat_seq while ( $record = get_record( $in ) ) { - if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) + if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) { - Maasha::Fasta::put_entry( [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ], $fh_out, 80 ); - $type = Maasha::Seq::seq_guess_type( $record->{ "SEQ" } ) if not $type; + $type = Maasha::Seq::seq_guess_type( $entry->[ SEQ ] ) if not $type; + Maasha::Fasta::put_entry( $entry, $fh_out, 80 ); } put_record( $record, $out ); @@ -3554,11 +3875,11 @@ sub script_blat_seq } -sub script_match_seq +sub script_soap_seq { - # Martin A. Hansen, August 2007. + # Martin A. Hansen, July 2008. - # BLATs sequences in stream against a given genome. + # soap sequences in stream against a given file or genome. my ( $in, # handle to in stream $out, # handle to out stream @@ -3567,25 +3888,117 @@ sub script_match_seq # Returns nothing. - my ( $record, @entries, $results ); - - $options->{ "word_size" } ||= 20; - $options->{ "direction" } ||= "both"; + my ( $genome, $tmp_in, $tmp_out, $fh_in, $fh_out, $record, $line, @fields, $entry, $count, $args ); - while ( $record = get_record( $in ) ) - { - if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) { - push @entries, [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ]; - } + $options->{ "seed_size" } ||= 10; + $options->{ "mismatches" } ||= 2; + $options->{ "gap_size" } ||= 0; + $options->{ "cpus" } ||= 1; - put_record( $record, $out ); + if ( $options->{ "genome" } ) { + $options->{ "in_file" } = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/fasta/$options->{ 'genome' }.fna"; } - if ( @entries == 1 ) - { - $results = Maasha::Match::match_mummer( [ $entries[ 0 ] ], [ $entries[ 0 ] ], $options, $BP_TMP ); + $tmp_in = "$BP_TMP/soap_query.seq"; + $tmp_out = "$BP_TMP/soap.result"; - map { put_record( $_, $out ) } @{ $results }; + $fh_out = Maasha::Common::write_open( $tmp_in ); + + $count = 0; + + while ( $record = get_record( $in ) ) + { + if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) + { + Maasha::Fasta::put_entry( $entry, $fh_out ); + + $count++; + } + + put_record( $record, $out ); + } + + close $fh_out; + + if ( $count > 0 ) + { + $args = join( " ", + "-s $options->{ 'seed_size' }", + "-r 2", + "-a $tmp_in", + "-v $options->{ 'mismatches' }", + "-g $options->{ 'gap_size' }", + "-p $options->{ 'cpus' }", + "-d $options->{ 'in_file' }", + "-o $tmp_out", + ); + + $args .= " > /dev/null 2>&1" if not $options->{ 'verbose' }; + + Maasha::Common::run( "soap", $args, 1 ); + + unlink $tmp_in; + + $fh_out = Maasha::Common::read_open( $tmp_out ); + + undef $record; + + while ( $line = <$fh_out> ) + { + chomp $line; + + @fields = split /\t/, $line; + + $record->{ "REC_TYPE" } = "SOAP"; + $record->{ "Q_ID" } = $fields[ 0 ]; + $record->{ "SCORE" } = $fields[ 3 ]; + $record->{ "STRAND" } = $fields[ 6 ]; + $record->{ "S_ID" } = $fields[ 7 ]; + $record->{ "S_BEG" } = $fields[ 8 ] - 1; # soap is 1-based + $record->{ "S_END" } = $fields[ 8 ] + $fields[ 5 ] - 2; + + put_record( $record, $out ); + } + + close $fh_out; + } + + unlink $tmp_out; +} + + +sub script_match_seq +{ + # Martin A. Hansen, August 2007. + + # BLATs sequences in stream against a given genome. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, @entries, $results ); + + $options->{ "word_size" } ||= 20; + $options->{ "direction" } ||= "both"; + + while ( $record = get_record( $in ) ) + { + if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) { + push @entries, [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ]; + } + + put_record( $record, $out ); + } + + if ( @entries == 1 ) + { + $results = Maasha::Match::match_mummer( [ $entries[ 0 ] ], [ $entries[ 0 ] ], $options, $BP_TMP ); + + map { put_record( $_, $out ) } @{ $results }; } elsif ( @entries == 2 ) { @@ -3609,7 +4022,7 @@ sub script_create_vmatch_index # Returns nothing. - my ( $record, $file_tmp, $fh_tmp, $type ); + my ( $record, $file_tmp, $fh_tmp, $type, $entry ); if ( $options->{ "index_name" } ) { @@ -3619,11 +4032,11 @@ sub script_create_vmatch_index while ( $record = get_record( $in ) ) { - if ( $options->{ "index_name" } and $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) + if ( $options->{ "index_name" } and $entry = Maasha::Fasta::biopiece2fasta( $record ) ) { - Maasha::Fasta::put_entry( [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ], $fh_tmp ); + Maasha::Fasta::put_entry( $entry, $fh_tmp ); - $type = Maasha::Seq::seq_guess_type( $record->{ "SEQ" } ) if not $type; + $type = Maasha::Seq::seq_guess_type( $entry->[ SEQ ] ) if not defined $type; } put_record( $record, $out ) if not $options->{ "no_stream" }; @@ -3710,14 +4123,14 @@ sub script_write_fasta # Returns nothing. - my ( $record, $fh ); + my ( $record, $fh, $entry ); $fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } ); while ( $record = get_record( $in ) ) { - if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) { - Maasha::Fasta::put_entry( [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ], $fh, $options->{ "wrap" } ); + if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) { + Maasha::Fasta::put_entry( $entry, $fh, $options->{ "wrap" } ); } put_record( $record, $out ) if not $options->{ "no_stream" }; @@ -3902,59 +4315,21 @@ sub script_write_bed # Returns nothing. - my ( $fh, $record, $new_record ); + my ( $cols, $fh, $record, $bed_entry, $new_record ); - $fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } ); + $cols = $options->{ 'cols' }->[ 0 ]; + + $fh = write_stream( $options->{ 'data_out' }, $options->{ 'compress' } ); while ( $record = get_record( $in ) ) { - if ( $record->{ "REC_TYPE" } eq "BED" ) # ---- Hits from BED ---- - { - Maasha::UCSC::bed_put_entry( $record, $fh, $record->{ "BED_COLS" } ); - } - elsif ( $record->{ "REC_TYPE" } eq "PSL" and $record->{ "S_ID" } =~ /^chr/i ) # ---- Hits from BLAT (PSL) ---- - { - $new_record->{ "CHR" } = $record->{ "S_ID" }; - $new_record->{ "CHR_BEG" } = $record->{ "S_BEG" }; - $new_record->{ "CHR_END" } = $record->{ "S_END" }; - $new_record->{ "Q_ID" } = $record->{ "Q_ID" }; - $new_record->{ "SCORE" } = $record->{ "SCORE" } || 999; - $new_record->{ "STRAND" } = $record->{ "STRAND" }; - - Maasha::UCSC::bed_put_entry( $new_record, $fh, 6 ); - } - elsif ( $record->{ "REC_TYPE" } eq "PATSCAN" and $record->{ "CHR" } ) # ---- Hits from patscan_seq ---- - { - Maasha::UCSC::bed_put_entry( $record, $fh, 6 ); - } - elsif ( $record->{ "REC_TYPE" } eq "BLAST" and $record->{ "S_ID" } =~ /^chr/i ) # ---- Hits from BLAST ---- - { - $new_record->{ "CHR" } = $record->{ "S_ID" }; - $new_record->{ "CHR_BEG" } = $record->{ "S_BEG" }; - $new_record->{ "CHR_END" } = $record->{ "S_END" }; - $new_record->{ "Q_ID" } = $record->{ "Q_ID" }; - $new_record->{ "SCORE" } = $record->{ "SCORE" } || 999; # or use E_VAL somehow - $new_record->{ "STRAND" } = $record->{ "STRAND" }; - - Maasha::UCSC::bed_put_entry( $new_record, $fh, 6 ); - } - elsif ( $record->{ "REC_TYPE" } eq "VMATCH" and $record->{ "S_ID" } =~ /^chr/i ) # ---- Hits from Vmatch ---- - { - $new_record->{ "CHR" } = $record->{ "S_ID" }; - $new_record->{ "CHR_BEG" } = $record->{ "S_BEG" }; - $new_record->{ "CHR_END" } = $record->{ "S_END" }; - $new_record->{ "Q_ID" } = $record->{ "Q_ID" }; - $new_record->{ "SCORE" } = $record->{ "SCORE" } || 999; # or use E_VAL somehow - $new_record->{ "STRAND" } = $record->{ "STRAND" }; + $record = Maasha::UCSC::psl2record( $record ) if $record->{ 'tBaseInsert' }; # Dirty addition to allow Affy data from MySQL to be dumped - Maasha::UCSC::bed_put_entry( $new_record, $fh, 6 ); - } - elsif ( $record->{ "CHR" } and defined $record->{ "CHR_BEG" } and $record->{ "CHR_END" } ) # ---- Generic data from tables ---- - { - Maasha::UCSC::bed_put_entry( $record, $fh ); + if ( $bed_entry = Maasha::UCSC::BED::biopiece2bed( $record, $cols ) ) { + Maasha::UCSC::BED::bed_entry_put( $bed_entry, $fh, $cols, $options->{ 'check' } ); } - put_record( $record, $out ) if not $options->{ "no_stream" }; + put_record( $record, $out ) if not $options->{ 'no_stream' }; } close $fh; @@ -4009,21 +4384,14 @@ sub script_write_fixedstep # Returns nothing. - my ( $fh, $record, $vals ); + my ( $fh, $record, $entry ); $fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } ); while ( $record = get_record( $in ) ) { - if ( $record->{ "CHR" } and $record->{ "CHR_BEG" } and $record->{ "STEP" } and $record->{ "VALS" } ) - { - print $fh "fixedStep chrom=$record->{ 'CHR' } start=$record->{ 'CHR_BEG' } step=$record->{ 'STEP' }\n"; - - $vals = $record->{ 'VALS' }; - - $vals =~ tr/,/\n/; - - print $fh "$vals\n"; + if ( $entry = Maasha::UCSC::Wiggle::biopiece2fixedstep( $record ) ) { + Maasha::UCSC::Wiggle::fixedstep_entry_put( $entry, $fh ); } put_record( $record, $out ) if not $options->{ "no_stream" }; @@ -4046,7 +4414,7 @@ sub script_write_2bit # Returns nothing. - my ( $record, $mask, $tmp_file, $fh_tmp, $fh_in, $fh_out ); + my ( $record, $mask, $tmp_file, $fh_tmp, $fh_in, $fh_out, $entry ); $mask = 1 if not $options->{ "no_mask" }; @@ -4057,8 +4425,8 @@ sub script_write_2bit while ( $record = get_record( $in ) ) { - if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) { - Maasha::Fasta::put_entry( [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ], $fh_tmp ); + if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) { + Maasha::Fasta::put_entry( $entry, $fh_tmp ); } put_record( $record, $out ) if not $options->{ "no_stream" }; @@ -4090,17 +4458,17 @@ sub script_write_solid # Returns nothing. - my ( $record, $fh, $seq_cs ); + my ( $record, $fh, $entry ); $fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } ); while ( $record = get_record( $in ) ) { - if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) + if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) { - $seq_cs = Maasha::Solid::seq2color_space( uc $record->{ "SEQ" } ); + $entry->[ SEQ ] = Maasha::Solid::seq2color_space( uc $entry->[ SEQ ] ); - Maasha::Fasta::put_entry( [ $record->{ "SEQ_NAME" }, $seq_cs ], $fh, $options->{ "wrap" } ); + Maasha::Fasta::put_entry( $entry, $fh, $options->{ "wrap" } ); } put_record( $record, $out ) if not $options->{ "no_stream" }; @@ -4110,6 +4478,35 @@ sub script_write_solid } +sub script_write_ucsc_config +{ + # Martin A. Hansen, November 2008. + + # Write UCSC Genome Broser configuration (.ra file type) from + # records in the stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $fh ); + + $fh = write_stream( $options->{ "data_out" } ); + + while ( $record = get_record( $in ) ) + { + Maasha::UCSC::ucsc_config_entry_put( $record, $fh ) if $record->{ "REC_TYPE" } eq "UCSC Config"; + + put_record( $record, $out ) if not $options->{ "no_stream" }; + } + + close $fh; +} + + sub script_plot_seqlogo { # Martin A. Hansen, August 2007. @@ -4164,14 +4561,17 @@ sub script_plot_phastcons_profiles $phastcons_file = Maasha::Config::genome_phastcons( $options->{ "genome" } ); $phastcons_index = Maasha::Config::genome_phastcons_index( $options->{ "genome" } ); - $index = Maasha::UCSC::phastcons_index_retrieve( $phastcons_index ); + $index = Maasha::UCSC::fixedstep_index_retrieve( $phastcons_index ); $fh_phastcons = Maasha::Common::read_open( $phastcons_file ); while ( $record = get_record( $in ) ) { if ( $record->{ "CHR" } and $record->{ "CHR_BEG" } and $record->{ "CHR_END" } ) { - $scores = Maasha::UCSC::phastcons_index_lookup( $index, $fh_phastcons, $record->{ "CHR" }, $record->{ "CHR_BEG" }, $record->{ "CHR_END" }, $options->{ "flank" } ); + $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "CHR" }, + $record->{ "CHR_BEG" }, + $record->{ "CHR_END" }, + $options->{ "flank" } ); push @{ $AoA }, [ @{ $scores } ]; } @@ -4372,6 +4772,191 @@ sub script_remove_keys } +sub script_remove_adaptor +{ + # Martin A. Hansen, August 2008. + + # Find and remove adaptor from sequences in the stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $adaptor, $seq, $adaptor_len, $seq_len, $offset, $max_match, $max_mismatch, $pos ); + + $options->{ "remove" } ||= "after"; + + $max_mismatch = $options->{ "mismatches" } || 0; + $offset = $options->{ "offset" }; + + if ( not defined $offset ) { + $offset = 0; + } else { + $offset--; + } + + $adaptor = uc $options->{ "adaptor" }; + $adaptor_len = length $adaptor; + + while ( $record = get_record( $in ) ) + { + if ( $record->{ "SEQ" } ) + { + $seq = uc $record->{ "SEQ" }; + $seq_len = length $seq; + + $pos = Maasha::Common::index_m( $seq, $adaptor, $seq_len, $adaptor_len, $offset, $max_mismatch ); + + $record->{ "ADAPTOR_POS" } = $pos; + + if ( $pos >= 0 and $options->{ "remove" } ne "skip" ) + { + if ( $options->{ "remove" } eq "after" ) + { + $record->{ "SEQ" } = substr $record->{ "SEQ" }, 0, $pos; + $record->{ "SEQ_LEN" } = $pos; + } + else + { + $record->{ "SEQ" } = substr $record->{ "SEQ" }, $pos + $adaptor_len; + $record->{ "SEQ_LEN" } = length $record->{ "SEQ" }; + } + } + + put_record( $record, $out ); + } + else + { + put_record( $record, $out ); + } + } +} + + +sub script_remove_mysql_tables +{ + # Martin A. Hansen, November 2008. + + # Remove MySQL tables from values in stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, %table_hash, $dbh, $table ); + + $options->{ "user" } ||= Maasha::UCSC::ucsc_get_user(); + $options->{ "password" } ||= Maasha::UCSC::ucsc_get_password(); + + map { $table_hash{ $_ } = 1 } @{ $options->{ 'tables' } }; + + while ( $record = get_record( $in ) ) + { + map { $table_hash{ $record->{ $_ } } = 1 } @{ $options->{ 'keys' } }; + + put_record( $record, $out ) if not $options->{ 'no_stream' }; + } + + $dbh = Maasha::SQL::connect( $options->{ "database" }, $options->{ "user" }, $options->{ "password" } ); + + foreach $table ( sort keys %table_hash ) + { + if ( Maasha::SQL::table_exists( $dbh, $table ) ) + { + print STDERR qq(Removing table "$table" from database "$options->{ 'database' }" ... ) if $options->{ 'verbose' }; + Maasha::SQL::delete_table( $dbh, $table ); + print STDERR "done.\n" if $options->{ 'verbose' }; + } + else + { + print STDERR qq(WARNING: table "$table" not found in database "$options->{ 'database' }\n"); + } + } + + Maasha::SQL::disconnect( $dbh ); +} + + +sub script_remove_ucsc_tracks +{ + # Martin A. Hansen, November 2008. + + # Remove track from MySQL tables and config file. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, %track_hash, $fh_in, $fh_out, $track, @tracks, @new_tracks, $dbh ); + + $options->{ 'user' } ||= Maasha::UCSC::ucsc_get_user(); + $options->{ 'password' } ||= Maasha::UCSC::ucsc_get_password(); + $options->{ 'config_file' } ||= "$ENV{ 'HOME' }/ucsc/my_tracks.ra"; + + map { $track_hash{ $_ } = 1 } @{ $options->{ 'tracks' } }; + + while ( $record = get_record( $in ) ) + { + map { $track_hash{ $record->{ $_ } } = 1 } @{ $options->{ 'keys' } }; + + put_record( $record, $out ) if not $options->{ 'no_stream' }; + } + + # ---- locate track in config file ---- + + $fh_in = Maasha::Common::read_open( $options->{ 'config_file' } ); + + while ( $track = Maasha::UCSC::ucsc_config_entry_get( $fh_in ) ) { + push @tracks, $track; + } + + close $fh_in; + + map { push @new_tracks, $_ if not exists $track_hash{ $_->{ 'track' } } } @tracks; + + print STDERR qq(WARNING: track not found in config file: "$options->{ 'config_file' }"\n) if scalar @tracks == scalar @new_tracks; + + rename "$options->{ 'config_file' }", "$options->{ 'config_file' }~"; + + $fh_out = Maasha::Common::write_open( $options->{ 'config_file' } ); + + map { Maasha::UCSC::ucsc_config_entry_put( $_, $fh_out ) } @new_tracks; + + close $fh_out; + + # ---- locate track in database ---- + + $dbh = Maasha::SQL::connect( $options->{ "database" }, $options->{ "user" }, $options->{ "password" } ); + + foreach $track ( sort keys %track_hash ) + { + if ( Maasha::SQL::table_exists( $dbh, $track ) ) + { + print STDERR qq(Removing table "$track" from database "$options->{ 'database' }" ... ) if $options->{ 'verbose' }; + Maasha::SQL::delete_table( $dbh, $track ); + print STDERR "done.\n" if $options->{ 'verbose' }; + } + else + { + print STDERR qq(WARNING: table "$track" not found in database "$options->{ 'database' }\n"); + } + } + + Maasha::SQL::disconnect( $dbh ); + + Maasha::Common::run( "ucscMakeTracks.pl", "-b > /dev/null 2>&1" ); +} + + sub script_rename_keys { # Martin A. Hansen, August 2007. @@ -4478,11 +5063,11 @@ sub script_merge_vals } -sub script_grab +sub script_merge_records { - # Martin A. Hansen, August 2007. + # Martin A. Hansen, July 2008. - # Grab for records in stream. + # Merges records in the stream based on identical values of two given keys. my ( $in, # handle to in stream $out, # handle to out stream @@ -4491,182 +5076,235 @@ sub script_grab # Returns nothing. - my ( $patterns, $pattern, $record, $key, $pos, $op, $val, %lookup_hash ); + my ( $merge, $record, $file1, $file2, $fh1, $fh2, $key1, $key2, @keys1, @keys2, @vals1, @vals2, + $num1, $num2, $num, $cmp, $i ); + + $merge = $options->{ "merge" } || "AandB"; + + $file1 = "$BP_TMP/merge_records1.tmp"; + $file2 = "$BP_TMP/merge_records2.tmp"; - if ( $options->{ "patterns" } ) + $fh1 = Maasha::Common::write_open( $file1 ); + $fh2 = Maasha::Common::write_open( $file2 ); + + $key1 = $options->{ "keys" }->[ 0 ]; + $key2 = $options->{ "keys" }->[ 1 ]; + + $num = $key2 =~ s/n$//; + $num1 = 0; + $num2 = 0; + + while ( $record = get_record( $in ) ) { - $patterns = [ split ",", $options->{ "patterns" } ]; + if ( exists $record->{ $key1 } ) + { + @keys1 = $key1; + @vals1 = $record->{ $key1 }; + + delete $record->{ $key1 }; + + map { push @keys1, $_; push @vals1, $record->{ $_ } } keys %{ $record }; + + print $fh1 join( "\t", @vals1 ), "\n"; + + $num1++; + } + elsif ( exists $record->{ $key2 } ) + { + @keys2 = $key2; + @vals2 = $record->{ $key2 }; + + delete $record->{ $key2 }; + + map { push @keys2, $_; push @vals2, $record->{ $_ } } keys %{ $record }; + + print $fh2 join( "\t", @vals2 ), "\n"; + + $num2++; + } } - elsif ( -f $options->{ "patterns_in" } ) + + close $fh1; + close $fh2; + + if ( $num ) { - $patterns = Maasha::Patscan::read_patterns( $options->{ "patterns_in" } ); + Maasha::Common::run( "sort", "-k 1,1n $file1 > $file1.sort" ) and rename "$file1.sort", $file1; + Maasha::Common::run( "sort", "-k 1,1n $file2 > $file2.sort" ) and rename "$file2.sort", $file2; } - elsif ( -f $options->{ "exact_in" } ) + else { - $patterns = Maasha::Patscan::read_patterns( $options->{ "exact_in" } ); + Maasha::Common::run( "sort", "-k 1,1 $file1 > $file1.sort" ) and rename "$file1.sort", $file1; + Maasha::Common::run( "sort", "-k 1,1 $file2 > $file2.sort" ) and rename "$file2.sort", $file2; + } - map { $lookup_hash{ $_ } = 1 } @{ $patterns }; + $fh1 = Maasha::Common::read_open( $file1 ); + $fh2 = Maasha::Common::read_open( $file2 ); - undef $patterns; - } + @vals1 = Maasha::Common::get_fields( $fh1 ); + @vals2 = Maasha::Common::get_fields( $fh2 ); - if ( $options->{ "eval" } ) + while ( $num1 > 0 and $num2 > 0 ) { - if ( $options->{ "eval" } =~ /^([^><=! ]+)\s*(>=|<=|>|<|=|!=|eq|ne)\s*(.+)$/ ) - { - $key = $1; - $op = $2; - $val = $3; + undef $record; + + if ( $num ) { + $cmp = $vals1[ 0 ] <=> $vals2[ 0 ]; + } else { + $cmp = $vals1[ 0 ] cmp $vals2[ 0 ]; } - } - while ( $record = get_record( $in ) ) - { - $pos = -1; - - if ( %lookup_hash ) + if ( $cmp < 0 ) { - if ( $options->{ "keys" } ) + if ( $merge =~ /^(AorB|AnotB)$/ ) { - foreach $key ( @{ $options->{ "keys" } } ) - { - if ( exists $lookup_hash{ $record->{ $key } } ) - { - $pos = 1; - goto FOUND; - } + for ( $i = 0; $i < @keys1; $i++ ) { + $record->{ $keys1[ $i ] } = $vals1[ $i ]; } - } - else - { - foreach $key ( keys %{ $record } ) - { - if ( not $options->{ "vals_only" } ) - { - if ( exists $lookup_hash{ $key } ) - { - $pos = 1; - goto FOUND; - } - } - if ( not $options->{ "keys_only" } ) - { - if ( exists $lookup_hash{ $record->{ $key } } ) - { - $pos = 1; - goto FOUND; - } - } - } + put_record( $record, $out ); } + + @vals1 = Maasha::Common::get_fields( $fh1 ); + $num1--; } - elsif ( $patterns ) + elsif ( $cmp > 0 ) { - foreach $pattern ( @{ $patterns } ) + if ( $merge =~ /^(BorA|BnotA)$/ ) { - if ( $options->{ "keys" } ) - { - foreach $key ( @{ $options->{ "keys" } } ) - { - $pos = index $record->{ $key }, $pattern; - - goto FOUND if $pos >= 0; - } + for ( $i = 0; $i < @keys2; $i++ ) { + $record->{ $keys2[ $i ] } = $vals2[ $i ]; } - else - { - foreach $key ( keys %{ $record } ) - { - if ( not $options->{ "vals_only" } ) - { - $pos = index $key, $pattern; - goto FOUND if $pos >= 0; - } - - if ( not $options->{ "keys_only" } ) - { - $pos = index $record->{ $key }, $pattern; - - goto FOUND if $pos >= 0; - } - } - } + put_record( $record, $out ); } + + @vals2 = Maasha::Common::get_fields( $fh2 ); + $num2--; } - elsif ( $options->{ "regex" } ) + else { - if ( $options->{ "keys" } ) + if ( $merge =~ /^(AandB|AorB|BorA)$/ ) { - foreach $key ( @{ $options->{ "keys" } } ) - { - if ( $options->{ "case_insensitive" } ) { - $pos = 1 if $record->{ $key } =~ /$options->{'regex'}/i; - } else { - $pos = 1 if $record->{ $key } =~ /$options->{'regex'}/; - } + for ( $i = 0; $i < @keys1; $i++ ) { + $record->{ $keys1[ $i ] } = $vals1[ $i ]; + } - goto FOUND if $pos >= 0; + for ( $i = 1; $i < @keys2; $i++ ) { + $record->{ $keys2[ $i ] } = $vals2[ $i ]; } + + put_record( $record, $out ); } - else - { - foreach $key ( keys %{ $record } ) - { - if ( not $options->{ "vals_only" } ) - { - if ( $options->{ "case_insensitive" } ) { - $pos = 1 if $key =~ /$options->{'regex'}/i; - } else { - $pos = 1 if $key =~ /$options->{'regex'}/; - } - goto FOUND if $pos >= 0; - } + @vals1 = Maasha::Common::get_fields( $fh1 ); + @vals2 = Maasha::Common::get_fields( $fh2 ); + $num1--; + $num2--; + } + } + + close $fh1; + close $fh2; + + unlink $file1; + unlink $file2; + + if ( $num1 > 0 and $merge =~ /^(AorB|AnotB)$/ ) + { + undef $record; + + for ( $i = 0; $i < @keys1; $i++ ) { + $record->{ $keys1[ $i ] } = $vals1[ $i ]; + } + + put_record( $record, $out ); + } + + if ( $num2 > 0 and $merge =~ /^(BorA|BnotA)$/ ) + { + undef $record; + + for ( $i = 0; $i < @keys2; $i++ ) { + $record->{ $keys2[ $i ] } = $vals2[ $i ]; + } + + put_record( $record, $out ); + } +} + + +sub script_grab +{ + # Martin A. Hansen, August 2007. + + # Grab for records in stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. - if ( not $options->{ "keys_only" } ) - { - if ( $options->{ "case_insensitive" } ) { - $pos = 1 if $record->{ $key } =~ /$options->{'regex'}/i; - } else { - $pos = 1 if $record->{ $key } =~ /$options->{'regex'}/; - } + my ( $keys, $vals_only, $keys_only, $invert, $patterns, $pattern, $regex, $record, $key, $op, $val, %lookup_hash, $found ); - goto FOUND if $pos >= 0; - } - } - } + $keys = $options->{ 'keys' }; + $vals_only = $options->{ 'vals_only' }; + $keys_only = $options->{ 'keys_only' }; + $invert = $options->{ 'invert' }; + + if ( $options->{ 'patterns' } ) + { + $patterns = [ split ",", $options->{ 'patterns' } ]; + } + elsif ( -f $options->{ 'patterns_in' } ) + { + $patterns = Maasha::Patscan::read_patterns( $options->{ 'patterns_in' } ); + } + elsif ( $options->{ 'regex' } ) + { + if ( $options->{ 'case_insensitive' } ) { + $regex = qr/$options->{ 'regex' }/i; + } else { + $regex = qr/$options->{ 'regex' }/; } - elsif ( $options->{ "eval" } ) + } + elsif ( -f $options->{ 'exact_in' } ) + { + $patterns = Maasha::Patscan::read_patterns( $options->{ 'exact_in' } ); + + map { $lookup_hash{ $_ } = 1 } @{ $patterns }; + + undef $patterns; + } + elsif ( $options->{ 'eval' } ) + { + if ( $options->{ 'eval' } =~ /^([^><=! ]+)\s*(>=|<=|>|<|=|!=|eq|ne)\s*(.+)$/ ) { - if ( defined $record->{ $key } ) - { - if ( $op eq "<" and $record->{ $key } < $val ) { - $pos = 1 and goto FOUND; - } elsif ( $op eq ">" and $record->{ $key } > $val ) { - $pos = 1 and goto FOUND; - } elsif ( $op eq ">=" and $record->{ $key } >= $val ) { - $pos = 1 and goto FOUND; - } elsif ( $op eq "<=" and $record->{ $key } <= $val ) { - $pos = 1 and goto FOUND; - } elsif ( $op eq "=" and $record->{ $key } == $val ) { - $pos = 1 and goto FOUND; - } elsif ( $op eq "!=" and $record->{ $key } != $val ) { - $pos = 1 and goto FOUND; - } elsif ( $op eq "eq" and $record->{ $key } eq $val ) { - $pos = 1 and goto FOUND; - } elsif ( $op eq "ne" and $record->{ $key } ne $val ) { - $pos = 1 and goto FOUND; - } - } + $key = $1; + $op = $2; + $val = $3; } + } + + while ( $record = get_record( $in ) ) + { + $found = 0; - FOUND: + if ( %lookup_hash ) { + $found = grab_lookup( \%lookup_hash, $record, $keys, $vals_only, $keys_only ); + } elsif ( $patterns ) { + $found = grab_patterns( $patterns, $record, $keys, $vals_only, $keys_only ); + } elsif ( $regex ) { + $found = grab_regex( $regex, $record, $keys, $vals_only, $keys_only ); + } elsif ( $op ) { + $found = grab_eval( $key, $op, $val, $record ); + } - if ( $pos >= 0 and not $options->{ "invert" } ) { + if ( $found and not $invert ) { put_record( $record, $out ); - } elsif ( $pos < 0 and $options->{ "invert" } ) { + } elsif ( not $found and $invert ) { put_record( $record, $out ); } } @@ -4686,29 +5324,33 @@ sub script_compute # Returns nothing. - my ( $record, $eval_key, $eval_val, $check, @keys ); + my ( $record, $eval_key, @keys, $eval_val ); while ( $record = get_record( $in ) ) { if ( $options->{ "eval" } ) { - if ( $options->{ "eval" } =~ /^(.+)\s*=\s*(.+)$/ ) + if ( $options->{ "eval" } =~ /^(\S+)\s*=\s*(.+)$/ ) { $eval_key = $1; $eval_val = $2; - } - if ( not $check ) - { - @keys = split /\W+/, $eval_val; - @keys = grep { ! /^\d+$/ } @keys; + if ( not @keys ) + { + @keys = split /\s+|\+|-|\*|\/|\*\*/, $eval_val; - $check = 1; - } + @keys = grep { exists $record->{ $_ } } @keys; + } - map { $eval_val =~ s/$_/$record->{ $_ }/g } @keys; + map { $eval_val =~ s/\Q$_\E/$record->{ $_ }/g } @keys; - $record->{ $eval_key } = eval "$eval_val" or Maasha::Common::error( "eval failed -> $@" ); + $record->{ $eval_key } = eval "$eval_val"; + Maasha::Common::error( qq(eval "$eval_key = $eval_val" failed -> $@) ) if $@; + } + else + { + Maasha::Common::error( qq(Bad compute expression: "$options->{ 'eval' }"\n) ); + } } put_record( $record, $out ); @@ -4826,7 +5468,7 @@ sub script_count_records } } - $result = { "count_records" => $count }; + $result = { "RECORDS_COUNT" => $count }; $fh = write_stream( $options->{ "data_out" } ); @@ -4973,6 +5615,7 @@ sub script_count_vals $fh_out = Maasha::Common::write_open( $tmp_file ); + $cache = 0; $num = 0; while ( $record = get_record( $in ) ) @@ -4999,7 +5642,7 @@ sub script_count_vals if ( $cache ) { - $num = 0; + $num = 0; $fh_in = Maasha::Common::read_open( $tmp_file ); @@ -5048,7 +5691,7 @@ sub script_plot_histogram while ( $record = get_record( $in ) ) { - $data_hash{ $record->{ $options->{ "key" } } }++ if $record->{ $options->{ "key" } }; + $data_hash{ $record->{ $options->{ "key" } } }++ if defined $record->{ $options->{ "key" } }; put_record( $record, $out ) if not $options->{ "no_stream" }; } @@ -5088,7 +5731,7 @@ sub script_plot_lendist while ( $record = get_record( $in ) ) { - $data_hash{ $record->{ $options->{ "key" } } }++ if $record->{ $options->{ "key" } }; + $data_hash{ $record->{ $options->{ "key" } } }++ if defined $record->{ $options->{ "key" } }; put_record( $record, $out ) if not $options->{ "no_stream" }; } @@ -5495,6 +6138,8 @@ sub script_upload_to_ucsc # Calculate the mean of values of given keys. + # This routine has developed into the most ugly hack. Do something! + my ( $in, # handle to in stream $out, # handle to out stream $options, # options hash @@ -5502,8 +6147,7 @@ sub script_upload_to_ucsc # Returns nothing. - my ( $record, $file, $wib_file, $wig_file, $wib_dir, $fh_in, $fh_out, $i, $first, $format, $args, $type, $columns, $append, %fh_hash, - $chr, $beg, $end, $block, $line, $max, $beg_block, $entry, $q_id, $clones ); + my ( $record, $file, $wib_file, $wig_file, $wib_dir, $fh_out, $i, $first, $format, $type, $columns, $append, $entry ); $options->{ "short_label" } ||= $options->{ 'table' }; $options->{ "long_label" } ||= $options->{ 'table' }; @@ -5513,209 +6157,111 @@ sub script_upload_to_ucsc $options->{ "color" } ||= join( ",", int( rand( 255 ) ), int( rand( 255 ) ), int( rand( 255 ) ) ); $options->{ "chunk_size" } ||= 10_000_000_000; # Due to 32-bit UCSC compilation really large tables cannot be loaded in one go. - $file = "$BP_TMP/ucsc_upload.tmp"; - + $file = "$BP_TMP/ucsc_upload.tmp"; $append = 0; + $first = 1; + $i = 0; - $first = 1; - - $i = 0; + $fh_out = Maasha::Common::write_open( $file ); - if ( $options->{ 'wiggle' } ) + while ( $record = get_record( $in ) ) { - $options->{ "visibility" } = "full"; + put_record( $record, $out ) if not $options->{ "no_stream" }; - while ( $record = get_record( $in ) ) + if ( $record->{ "REC_TYPE" } eq "fixed_step" ) { - put_record( $record, $out ) if not $options->{ "no_stream" }; - - $record->{ "CHR" } = $record->{ "S_ID" } if not defined $record->{ "CHR" }; - $record->{ "CHR_BEG" } = $record->{ "S_BEG" } if not defined $record->{ "CHR_BEG" }; - $record->{ "CHR_END" } = $record->{ "S_END" } if not defined $record->{ "CHR_END" }; + $format = "WIGGLE"; - $fh_hash{ $record->{ "CHR" } } = Maasha::Common::write_open( "$BP_TMP/$record->{ 'CHR' }" ) if not exists $fh_hash{ $record->{ "CHR" } }; + if ( $entry = Maasha::UCSC::Wiggle::biopiece2fixedstep( $record ) ) { + Maasha::UCSC::Wiggle::fixedstep_entry_put( $entry, $fh_out ); + } + } + elsif ( $record->{ "REC_TYPE" } eq "PSL" ) + { + $format = "PSL"; - $fh_out = $fh_hash{ $record->{ "CHR" } }; + Maasha::UCSC::psl_put_header( $fh_out ) if $first; + Maasha::UCSC::psl_put_entry( $record, $fh_out ); - Maasha::UCSC::bed_put_entry( $record, $fh_out, 5 ); + $first = 0; } - - map { close $_ } keys %fh_hash; - - $fh_out = Maasha::Common::write_open( $file ); - - foreach $chr ( sort keys %fh_hash ) + elsif ( $record->{ "REC_TYPE" } eq "BED" and $record->{ "SEC_STRUCT" } ) { - Maasha::Common::run( "bedSort", "$BP_TMP/$chr $BP_TMP/$chr" ); - - $fh_in = Maasha::Common::read_open( "$BP_TMP/$chr" ); - - undef $block; - - while ( $entry = Maasha::UCSC::bed_get_entry( $fh_in, 5 ) ) - { - $chr = $entry->{ 'CHR' }; - $beg = $entry->{ 'CHR_BEG' }; - $end = $entry->{ 'CHR_END' }; - $q_id = $entry->{ 'Q_ID' }; - - if ( $q_id =~ /_(\d+)$/ ) { - $clones = $1; - } else { - $clones = 1; - } - - if ( $block ) - { - if ( $beg > $max ) - { - Maasha::UCSC::fixedstep_put_entry( $chr, $beg_block, $block, $fh_out ); - undef $block; - } - else - { - for ( $i = $beg - $beg_block; $i < ( $beg - $beg_block ) + ( $end - $beg ); $i++ ) { - $block->[ $i ] += $clones; - } + # chrom chromStart chromEnd name score strand size secStr conf - $max = Maasha::Calc::max( $max, $end ); - } - } + $format = "BED_SS"; - if ( not $block ) - { - $beg_block = $beg; - $max = $end; + print $fh_out join ( "\t", + $record->{ "CHR" }, + $record->{ "CHR_BEG" }, + $record->{ "CHR_END" } + 1, + $record->{ "Q_ID" }, + $record->{ "SCORE" }, + $record->{ "STRAND" }, + $record->{ "SIZE" }, + $record->{ "SEC_STRUCT" }, + $record->{ "CONF" }, + ), "\n"; + } + elsif ( $record->{ "REC_TYPE" } eq "BED" ) + { + $format = "BED"; + $columns = $record->{ "BED_COLS" }; - for ( $i = 0; $i < ( $end - $beg ); $i++ ) { - $block->[ $i ] += $clones; - } - } + if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) { + Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns, $options->{ 'check' } ); } - - close $fh_in; - - Maasha::UCSC::fixedstep_put_entry( $chr, $beg_block, $block, $fh_out ); - - unlink "$BP_TMP/$chr"; } - - close $fh_out; - - $wig_file = "$options->{ 'table' }.wig"; - $wib_file = "$options->{ 'table' }.wib"; - - $wib_dir = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'database' }/wib"; - - Maasha::Common::dir_create_if_not_exists( $wib_dir ); - - # Maasha::Common::run( "wigEncode", "$file $wig_file $wib_file > /dev/null 2>&1" ); - - `cd $BP_TMP && wigEncode $file $wig_file $wib_file > /dev/null 2>&1`; - Maasha::Common::run( "mv", "$BP_TMP/$wib_file $wib_dir" ); - - unlink $file; - - $file = $wig_file; - - $format = "WIGGLE"; - } - else - { - $fh_out = Maasha::Common::write_open( $file ); - - while ( $record = get_record( $in ) ) + elsif ( $record->{ "REC_TYPE" } eq "PATSCAN" and $record->{ "CHR" } ) { - put_record( $record, $out ) if not $options->{ "no_stream" }; - - if ( $record->{ "REC_TYPE" } eq "PSL" ) - { - Maasha::UCSC::psl_put_header( $fh_out ) if $first; - Maasha::UCSC::psl_put_entry( $record, $fh_out ); - - $first = 0; + $format = "BED"; + $columns = 6; - $format = "PSL" if not $format; - } - elsif ( $record->{ "REC_TYPE" } eq "BED" and $record->{ "SEC_STRUCT" } ) - { - # chrom chromStart chromEnd name score strand size secStr conf - - print $fh_out join ( "\t", - $record->{ "CHR" }, - $record->{ "CHR_BEG" }, - $record->{ "CHR_END" } + 1, - $record->{ "Q_ID" }, - $record->{ "SCORE" }, - $record->{ "STRAND" }, - $record->{ "SIZE" }, - $record->{ "SEC_STRUCT" }, - $record->{ "CONF" }, - ), "\n"; - - $format = "BED_SS" if not $format; + if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) { + Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns, $options->{ 'check' } ); } - elsif ( $record->{ "REC_TYPE" } eq "BED" ) - { - Maasha::UCSC::bed_put_entry( $record, $fh_out, $record->{ "BED_COLS" } ); + } + elsif ( $record->{ "REC_TYPE" } eq "BLAST" and $record->{ "S_ID" } =~ /^chr/ ) + { + $format = "BED"; + $columns = 6; - $format = "BED" if not $format; - $columns = $record->{ "BED_COLS" } if not $columns; - } - elsif ( $record->{ "REC_TYPE" } eq "PATSCAN" and $record->{ "CHR" } ) - { - Maasha::UCSC::bed_put_entry( $record, $fh_out, 6 ); + $record->{ "SCORE" } = $record->{ "BIT_SCORE" } * 1000; - $format = "BED" if not $format; - $columns = 6 if not $columns; + if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) { + Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns, $options->{ 'check' } ); } - elsif ( $record->{ "REC_TYPE" } eq "BLAST" and $record->{ "S_ID" } =~ /^chr/ ) - { - $record->{ "CHR" } = $record->{ "S_ID" }; - $record->{ "CHR_BEG" } = $record->{ "S_BEG" }; - $record->{ "CHR_END" } = $record->{ "S_END" }; - $record->{ "SCORE" } = $record->{ "BIT_SCORE" } * 1000; - - $format = "BED" if not $format; - $columns = 6 if not $columns; + } + elsif ( $record->{ "REC_TYPE" } eq "VMATCH" and $record->{ "S_ID" } =~ /^chr/i ) + { + $format = "BED"; + $columns = 6; - Maasha::UCSC::bed_put_entry( $record, $fh_out ); + if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) { + Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns, $options->{ 'check' } ); } - elsif ( $record->{ "REC_TYPE" } eq "VMATCH" and $record->{ "S_ID" } =~ /^chr/i ) - { - $record->{ "CHR" } = $record->{ "S_ID" }; - $record->{ "CHR_BEG" } = $record->{ "S_BEG" }; - $record->{ "CHR_END" } = $record->{ "S_END" }; - $record->{ "SCORE" } = $record->{ "SCORE" } || 999; - $record->{ "SCORE" } = int( $record->{ "SCORE" } ); + } - $format = "BED" if not $format; - $columns = 6 if not $columns; + if ( $i == $options->{ "chunk_size" } ) + { + close $fh_out; - Maasha::UCSC::bed_put_entry( $record, $fh_out, 6 ); + if ( $format eq "BED" ) { + Maasha::UCSC::bed_upload_to_ucsc( $BP_TMP, $file, $options, $append ); + } elsif ( $format eq "PSL" ) { + Maasha::UCSC::psl_upload_to_ucsc( $file, $options, $append ); } - if ( $i == $options->{ "chunk_size" } ) - { - close $fh_out; - - if ( $format eq "BED" ) { - Maasha::UCSC::bed_upload_to_ucsc( $BP_TMP, $file, $options, $append ); - } elsif ( $format eq "PSL" ) { - Maasha::UCSC::psl_upload_to_ucsc( $file, $options, $append ); - } - - unlink $file; + unlink $file; - $first = 1; + $first = 1; - $append = 1; + $append = 1; - $fh_out = Maasha::Common::write_open( $file ); - } - - $i++; + $fh_out = Maasha::Common::write_open( $file ); } + + $i++; } close $fh_out; @@ -5730,9 +6276,7 @@ sub script_upload_to_ucsc } elsif ( $format eq "BED_SS" ) { - $options->{ "sec_struct" } = 1; - - $type = "sec_struct"; + $type = "type bed 6 +"; Maasha::UCSC::bed_upload_to_ucsc( $BP_TMP, $file, $options, $append ); } @@ -5744,6 +6288,27 @@ sub script_upload_to_ucsc } elsif ( $format eq "WIGGLE" ) { + $options->{ "visibility" } = "full"; + + $wig_file = "$options->{ 'table' }.wig"; + $wib_file = "$options->{ 'table' }.wib"; + + $wib_dir = "$ENV{ 'HOME' }/ucsc/wib"; + + Maasha::Common::dir_create_if_not_exists( $wib_dir ); + + if ( $options->{ 'verbose' } ) { + `cd $BP_TMP && wigEncode $file $wig_file $wib_file`; + } else { + `cd $BP_TMP && wigEncode $file $wig_file $wib_file > /dev/null 2>&1`; + } + + Maasha::Common::run( "mv", "$BP_TMP/$wib_file $wib_dir" ); + + unlink $file; + + $file = $wig_file; + $type = "wig 0"; Maasha::UCSC::wiggle_upload_to_ucsc( $BP_TMP, $wib_dir, $file, $options ); @@ -5751,7 +6316,7 @@ sub script_upload_to_ucsc unlink $file; - Maasha::UCSC::update_my_tracks( $options, $type ); + Maasha::UCSC::ucsc_update_config( $options, $type ); } } @@ -5759,6 +6324,149 @@ sub script_upload_to_ucsc # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +sub grab_lookup +{ + # Martin A. Hansen, November 2009. + + # Uses keys from a lookup hash to search records. Optionally, a list of + # keys can be given so the lookup is limited to these, also, flags + # can be given to limit lookup to keys or vals only. Returns 1 if lookup + # succeeded, else 0. + + my ( $lookup_hash, # hashref with patterns + $record, # hashref + $keys, # list of keys - OPTIONAL + $vals_only, # only vals flag - OPTIONAL + $keys_only, # only keys flag - OPTIONAL + ) = @_; + + # Returns boolean. + + if ( $keys ) + { + map { return 1 if exists $lookup_hash->{ $record->{ $_ } } } @{ $keys }; + } + else + { + if ( not $vals_only ) { + map { return 1 if exists $lookup_hash->{ $_ } } keys %{ $record }; + } + + if ( not $keys_only ) { + map { return 1 if exists $lookup_hash->{ $record->{ $_ } } } keys %{ $record }; + } + } + + return 0; +} + + +sub grab_patterns +{ + # Martin A. Hansen, November 2009. + + # Uses patterns to match records containing the pattern as a substring. + # Returns 1 if the record is matched, else 0. + + my ( $patterns, # list of patterns + $record, # hashref + $keys, # list of keys - OPTIONAL + $vals_only, # only vals flag - OPTIONAL + $keys_only, # only keys flag - OPTIONAL + ) = @_; + + # Returns boolean. + + my ( $pattern ); + + foreach $pattern ( @{ $patterns } ) + { + if ( $keys ) + { + map { return 1 if index( $record->{ $_ }, $pattern ) >= 0 } @{ $keys }; + } + else + { + if ( not $vals_only ) { + map { return 1 if index( $_, $pattern ) >= 0 } keys %{ $record }; + } + + if ( not $keys_only ) { + map { return 1 if index( $record->{ $_ }, $pattern ) >= 0 } keys %{ $record }; + } + } + } + + return 0; +} + + +sub grab_regex +{ + # Martin A. Hansen, November 2009. + + # Uses regex to match records. + # Returns 1 if the record is matched, else 0. + + my ( $regex, # regex to match + $record, # hashref + $keys, # list of keys - OPTIONAL + $vals_only, # only vals flag - OPTIONAL + $keys_only, # only keys flag - OPTIONAL + ) = @_; + + # Returns boolean. + + if ( $keys ) + { + map { return 1 if $record->{ $_ } =~ /$regex/ } @{ $keys }; + } + else + { + if ( not $vals_only ) { + map { return 1 if $_ =~ /$regex/ } keys %{ $record }; + } + + if ( not $keys_only ) { + map { return 1 if $record->{ $_ } =~ /$regex/ } keys %{ $record }; + } + } + + return 0; +} + + +sub grab_eval +{ + # Martin A. Hansen, November 2009. + + # Test if the value of a given record key evaluates according + # to a given operator. Returns 1 if eval is OK, else 0. + + my ( $key, # record key + $op, # operator + $val, # value + $record, # hashref + ) = @_; + + # Returns boolean. + + if ( defined $record->{ $key } ) + { + return 1 if ( $op eq "<" and $record->{ $key } < $val ); + return 1 if ( $op eq ">" and $record->{ $key } > $val ); + return 1 if ( $op eq ">=" and $record->{ $key } >= $val ); + return 1 if ( $op eq "<=" and $record->{ $key } <= $val ); + return 1 if ( $op eq "=" and $record->{ $key } == $val ); + return 1 if ( $op eq "!=" and $record->{ $key } != $val ); + return 1 if ( $op eq "eq" and $record->{ $key } eq $val ); + return 1 if ( $op eq "ne" and $record->{ $key } ne $val ); + } + + return 0; +} + + sub read_stream { # Martin A. Hansen, July 2007. @@ -5817,10 +6525,10 @@ sub get_record # Reads one record at a time and converts that record # to a Perl data structure (a hash) which is returned. - my ( $fh, + my ( $fh, # handle to stream ) = @_; - # Returns data structure. + # Returns a hash. my ( $block, @lines, $line, $key, $value, %record ); @@ -5836,7 +6544,7 @@ sub get_record foreach $line ( @lines ) { - ( $key, $value ) = split ": ", $line; + ( $key, $value ) = split ": ", $line, 2; $record{ $key } = $value; } @@ -5936,69 +6644,63 @@ sub sig_handler print STDERR "\nProgram '$script' died->$sig" . " - Please wait for temporary data to be removed\n"; } - # This is a really bad solution, potentially, anyone can include this module and set - # the BP_TMP to point at any dir and thus take out the machine !!! - - Maasha::Common::dir_remove( $BP_TMP ); + clean_tmp(); } exit( 0 ); } -END -{ - # This is a really bad solution, potentially, anyone can include this module and set - # the BP_TMP to point at any dir and thus take out the machine !!! - - Maasha::Common::dir_remove( $BP_TMP ); -} - - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - -1; - -__END__ - - -sub script_read_soft +sub clean_tmp { - # Martin A. Hansen, December 2007. + # Martin A. Hansen, July 2008. - # Read soft format. - # http://www.ncbi.nlm.nih.gov/geo/info/soft2.html - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; + # Cleans out any unused temporary files and directories in BP_TMP. # Returns nothing. - my ( $data_in, $file, $num, $records, $record ); + my ( $tmpdir, @dirs, $curr_pid, $dir, $user, $sid, $pid ); - while ( $record = get_record( $in ) ) { - put_record( $record, $out ); - } + $tmpdir = $ENV{ 'BP_TMP' } || Maasha::Common::error( 'No BP_TMP variable in environment.' ); - $num = 1; + $curr_pid = Maasha::Common::get_processid(); - foreach $file ( @{ $options->{ "files" } } ) - { - $records = Maasha::NCBI::soft_parse( $file ); + @dirs = Maasha::Common::ls_dirs( $tmpdir ); - foreach $record ( @{ $records } ) + foreach $dir ( @dirs ) + { + if ( $dir =~ /^$tmpdir\/(.+)_(\d+)_(\d+)_bp_tmp$/ ) { - put_record( $record, $out ); - - goto NUM if $options->{ "num" } and $num == $options->{ "num" }; + $user = $1; + $sid = $2; + $pid = $3; - $num++; + if ( $user eq Maasha::Common::get_user() ) + { + if ( not Maasha::Common::process_running( $pid ) ) + { + # print STDERR "Removing stale dir: $dir\n"; + Maasha::Common::dir_remove( $dir ); + } + elsif ( $pid == $curr_pid ) + { + # print STDERR "Removing current dir: $dir\n"; + # Maasha::Common::dir_remove( $dir ); + } + } } } +} - NUM: - close $data_in if $data_in; +END +{ + clean_tmp(); } + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +1; + +__END__