X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=code_perl%2FMaasha%2FBiopieces.pm;h=434cdad8cf738328478eba8a4d6a3e154a2824a2;hb=2f0fd91b461033529a4a72e161bd133252a22eb6;hp=4d6ec7eef5eee974004338ae1309036a92918b55;hpb=15502970edc329bf6f14138be809733d8a8f16a2;p=biopieces.git diff --git a/code_perl/Maasha/Biopieces.pm b/code_perl/Maasha/Biopieces.pm index 4d6ec7e..434cdad 100644 --- a/code_perl/Maasha/Biopieces.pm +++ b/code_perl/Maasha/Biopieces.pm @@ -1,7 +1,7 @@ package Maasha::Biopieces; -# Copyright (C) 2007-2008 Martin A. Hansen. +# Copyright (C) 2007-2009 Martin A. Hansen. # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License @@ -29,6840 +29,704 @@ package Maasha::Biopieces; # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -use strict; -use Data::Dumper; -use Getopt::Long qw( :config bundling ); -use Time::HiRes qw( gettimeofday ); -use Storable qw( dclone ); -use Maasha::Config; -use Maasha::Common; -use Maasha::Fasta; -use Maasha::Align; -use Maasha::Matrix; -use Maasha::Match; -use Maasha::EMBL; -use Maasha::Stockholm; -use Maasha::Seq; -use Maasha::Patscan; -use Maasha::Plot; -use Maasha::Calc; -use Maasha::UCSC; -use Maasha::NCBI; -use Maasha::GFF; -use Maasha::TwoBit; -use Maasha::Solid; -use Maasha::Solexa; -use Maasha::SQL; -use Maasha::Gwiki; - -use vars qw( @ISA @EXPORT_OK ); - -require Exporter; - -@ISA = qw( Exporter ); - -@EXPORT_OK = qw( - read_stream - write_stream - get_record - put_record -); - -use constant { - SEQ_NAME => 0, - SEQ => 1, -}; - - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SIGNAL HANDLER <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - -$SIG{ '__DIE__' } = \&sig_handler; -$SIG{ 'INT' } = \&sig_handler; -$SIG{ 'TERM' } = \&sig_handler; - - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> GLOBALS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - -my ( $script, $BP_TMP ); - -$script = Maasha::Common::get_scriptname(); -$BP_TMP = Maasha::Common::get_tmpdir(); - - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> LOG <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - -my $log_global = Maasha::Common::append_open( "$ENV{ 'BP_LOG' }/biopieces.log" ); -my $log_local = Maasha::Common::append_open( "$ENV{ 'HOME' }/.biopieces.log" ); - -$log_global->autoflush( 1 ); -$log_local->autoflush( 1 ); - -&log( $log_global, $script, \@ARGV ); -&log( $log_local, $script, \@ARGV ); - -close $log_global; -close $log_local; - - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> RUN SCRIPT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - -run_script( $script ); - - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SUBROUTINES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - -sub log -{ - # Martin A. Hansen, January 2008. - - # Log messages to logfile. - - my ( $fh, # filehandle to logfile - $script, # script name - $argv, # reference to @ARGV - ) = @_; - - # Returns nothing. - - my ( $time_stamp, $user ); - - $time_stamp = Maasha::Common::time_stamp(); - - $user = $ENV{ 'USER' }; - - $script = "biopieces" if $script eq "-e"; - - print $fh "$time_stamp\t$user\t$script ", join( " ", @{ $argv } ), "\n"; -} - - -sub run_script -{ - # Martin A. Hansen, August 2007. - - # Run a specific script. - - my ( $script, # script name - ) = @_; - - # Returns nothing. - - my ( $t0, $t1, $options, $in, $out ); - - $t0 = gettimeofday(); - - $options = get_options( $script ); - - $options->{ "SCRIPT" } = $script; - - if ( $script ne "list_biopieces" and $script ne "list_genomes" ) { - $script = "print_usage" if ( -t STDIN and keys %{ $options } <= 1 or $options->{ 'help' } ); - } - - $in = read_stream( $options->{ "stream_in" } ); - $out = write_stream( $options->{ "stream_out" } ); - - if ( $script eq "print_usage" ) { script_print_usage( $in, $out, $options ) } - elsif ( $script eq "list_biopieces" ) { script_list_biopieces( $in, $out, $options ) } - elsif ( $script eq "list_genomes" ) { script_list_genomes( $in, $out, $options ) } - elsif ( $script eq "read_fasta" ) { script_read_fasta( $in, $out, $options ) } - elsif ( $script eq "read_tab" ) { script_read_tab( $in, $out, $options ) } - elsif ( $script eq "read_psl" ) { script_read_psl( $in, $out, $options ) } - elsif ( $script eq "read_bed" ) { script_read_bed( $in, $out, $options ) } - elsif ( $script eq "read_fixedstep" ) { script_read_fixedstep( $in, $out, $options ) } - elsif ( $script eq "read_blast_tab" ) { script_read_blast_tab( $in, $out, $options ) } - elsif ( $script eq "read_embl" ) { script_read_embl( $in, $out, $options ) } - elsif ( $script eq "read_stockholm" ) { script_read_stockholm( $in, $out, $options ) } - elsif ( $script eq "read_phastcons" ) { script_read_phastcons( $in, $out, $options ) } - elsif ( $script eq "read_soft" ) { script_read_soft( $in, $out, $options ) } - elsif ( $script eq "read_gff" ) { script_read_gff( $in, $out, $options ) } - elsif ( $script eq "read_2bit" ) { script_read_2bit( $in, $out, $options ) } - elsif ( $script eq "read_solexa" ) { script_read_solexa( $in, $out, $options ) } - elsif ( $script eq "read_solid" ) { script_read_solid( $in, $out, $options ) } - elsif ( $script eq "read_mysql" ) { script_read_mysql( $in, $out, $options ) } - elsif ( $script eq "read_ucsc_config" ) { script_read_ucsc_config( $in, $out, $options ) } - elsif ( $script eq "assemble_tag_contigs" ) { script_assemble_tag_contigs( $in, $out, $options ) } - elsif ( $script eq "format_genome" ) { script_format_genome( $in, $out, $options ) } - elsif ( $script eq "length_seq" ) { script_length_seq( $in, $out, $options ) } - elsif ( $script eq "uppercase_seq" ) { script_uppercase_seq( $in, $out, $options ) } - elsif ( $script eq "shuffle_seq" ) { script_shuffle_seq( $in, $out, $options ) } - elsif ( $script eq "analyze_seq" ) { script_analyze_seq( $in, $out, $options ) } - elsif ( $script eq "analyze_tags" ) { script_analyze_tags( $in, $out, $options ) } - elsif ( $script eq "complexity_seq" ) { script_complexity_seq( $in, $out, $options ) } - elsif ( $script eq "oligo_freq" ) { script_oligo_freq( $in, $out, $options ) } - elsif ( $script eq "create_weight_matrix" ) { script_create_weight_matrix( $in, $out, $options ) } - elsif ( $script eq "calc_bit_scores" ) { script_calc_bit_scores( $in, $out, $options ) } - elsif ( $script eq "calc_fixedstep" ) { script_calc_fixedstep( $in, $out, $options ) } - elsif ( $script eq "reverse_seq" ) { script_reverse_seq( $in, $out, $options ) } - elsif ( $script eq "complement_seq" ) { script_complement_seq( $in, $out, $options ) } - elsif ( $script eq "remove_indels" ) { script_remove_indels( $in, $out, $options ) } - elsif ( $script eq "transliterate_seq" ) { script_transliterate_seq( $in, $out, $options ) } - elsif ( $script eq "transliterate_vals" ) { script_transliterate_vals( $in, $out, $options ) } - elsif ( $script eq "translate_seq" ) { script_translate_seq( $in, $out, $options ) } - elsif ( $script eq "extract_seq" ) { script_extract_seq( $in, $out, $options ) } - elsif ( $script eq "get_genome_seq" ) { script_get_genome_seq( $in, $out, $options ) } - elsif ( $script eq "get_genome_align" ) { script_get_genome_align( $in, $out, $options ) } - elsif ( $script eq "get_genome_phastcons" ) { script_get_genome_phastcons( $in, $out, $options ) } - elsif ( $script eq "fold_seq" ) { script_fold_seq( $in, $out, $options ) } - elsif ( $script eq "split_seq" ) { script_split_seq( $in, $out, $options ) } - elsif ( $script eq "split_bed" ) { script_split_bed( $in, $out, $options ) } - elsif ( $script eq "align_seq" ) { script_align_seq( $in, $out, $options ) } - elsif ( $script eq "tile_seq" ) { script_tile_seq( $in, $out, $options ) } - elsif ( $script eq "invert_align" ) { script_invert_align( $in, $out, $options ) } - elsif ( $script eq "patscan_seq" ) { script_patscan_seq( $in, $out, $options ) } - elsif ( $script eq "create_blast_db" ) { script_create_blast_db( $in, $out, $options ) } - elsif ( $script eq "blast_seq" ) { script_blast_seq( $in, $out, $options ) } - elsif ( $script eq "blat_seq" ) { script_blat_seq( $in, $out, $options ) } - elsif ( $script eq "soap_seq" ) { script_soap_seq( $in, $out, $options ) } - elsif ( $script eq "match_seq" ) { script_match_seq( $in, $out, $options ) } - elsif ( $script eq "create_vmatch_index" ) { script_create_vmatch_index( $in, $out, $options ) } - elsif ( $script eq "vmatch_seq" ) { script_vmatch_seq( $in, $out, $options ) } - elsif ( $script eq "write_fasta" ) { script_write_fasta( $in, $out, $options ) } - elsif ( $script eq "write_align" ) { script_write_align( $in, $out, $options ) } - elsif ( $script eq "write_blast" ) { script_write_blast( $in, $out, $options ) } - elsif ( $script eq "write_tab" ) { script_write_tab( $in, $out, $options ) } - elsif ( $script eq "write_bed" ) { script_write_bed( $in, $out, $options ) } - elsif ( $script eq "write_psl" ) { script_write_psl( $in, $out, $options ) } - elsif ( $script eq "write_fixedstep" ) { script_write_fixedstep( $in, $out, $options ) } - elsif ( $script eq "write_2bit" ) { script_write_2bit( $in, $out, $options ) } - elsif ( $script eq "write_solid" ) { script_write_solid( $in, $out, $options ) } - elsif ( $script eq "write_ucsc_config" ) { script_write_ucsc_config( $in, $out, $options ) } - elsif ( $script eq "head_records" ) { script_head_records( $in, $out, $options ) } - elsif ( $script eq "remove_keys" ) { script_remove_keys( $in, $out, $options ) } - elsif ( $script eq "remove_adaptor" ) { script_remove_adaptor( $in, $out, $options ) } - elsif ( $script eq "remove_mysql_tables" ) { script_remove_mysql_tables( $in, $out, $options ) } - elsif ( $script eq "remove_ucsc_tracks" ) { script_remove_ucsc_tracks( $in, $out, $options ) } - elsif ( $script eq "rename_keys" ) { script_rename_keys( $in, $out, $options ) } - elsif ( $script eq "uniq_vals" ) { script_uniq_vals( $in, $out, $options ) } - elsif ( $script eq "merge_vals" ) { script_merge_vals( $in, $out, $options ) } - elsif ( $script eq "merge_records" ) { script_merge_records( $in, $out, $options ) } - elsif ( $script eq "grab" ) { script_grab( $in, $out, $options ) } - elsif ( $script eq "compute" ) { script_compute( $in, $out, $options ) } - elsif ( $script eq "flip_tab" ) { script_flip_tab( $in, $out, $options ) } - elsif ( $script eq "add_ident" ) { script_add_ident( $in, $out, $options ) } - elsif ( $script eq "count_records" ) { script_count_records( $in, $out, $options ) } - elsif ( $script eq "random_records" ) { script_random_records( $in, $out, $options ) } - elsif ( $script eq "sort_records" ) { script_sort_records( $in, $out, $options ) } - elsif ( $script eq "count_vals" ) { script_count_vals( $in, $out, $options ) } - elsif ( $script eq "plot_histogram" ) { script_plot_histogram( $in, $out, $options ) } - elsif ( $script eq "plot_lendist" ) { script_plot_lendist( $in, $out, $options ) } - elsif ( $script eq "plot_chrdist" ) { script_plot_chrdist( $in, $out, $options ) } - elsif ( $script eq "plot_karyogram" ) { script_plot_karyogram( $in, $out, $options ) } - elsif ( $script eq "plot_matches" ) { script_plot_matches( $in, $out, $options ) } - elsif ( $script eq "plot_seqlogo" ) { script_plot_seqlogo( $in, $out, $options ) } - elsif ( $script eq "plot_phastcons_profiles" ) { script_plot_phastcons_profiles( $in, $out, $options ) } - elsif ( $script eq "analyze_bed" ) { script_analyze_bed( $in, $out, $options ) } - elsif ( $script eq "analyze_vals" ) { script_analyze_vals( $in, $out, $options ) } - elsif ( $script eq "length_vals" ) { script_length_vals( $in, $out, $options ) } - elsif ( $script eq "sum_vals" ) { script_sum_vals( $in, $out, $options ) } - elsif ( $script eq "mean_vals" ) { script_mean_vals( $in, $out, $options ) } - elsif ( $script eq "median_vals" ) { script_median_vals( $in, $out, $options ) } - elsif ( $script eq "max_vals" ) { script_max_vals( $in, $out, $options ) } - elsif ( $script eq "min_vals" ) { script_min_vals( $in, $out, $options ) } - elsif ( $script eq "upload_to_ucsc" ) { script_upload_to_ucsc( $in, $out, $options ) } - - close $in if defined $in; - close $out; - - $t1 = gettimeofday(); - - print STDERR "Program: $script" . ( " " x ( 25 - length( $script ) ) ) . sprintf( "Run time: %.4f\n", ( $t1 - $t0 ) ) if $options->{ 'verbose' }; -} - - -sub get_options -{ - # Martin A. Hansen, February 2008. - - # Gets options from commandline and checks these vigerously. - - my ( $script, # name of script - ) = @_; - - # Returns hash - - my ( %options, @options, $opt, @genomes, $real ); - - if ( $script eq "print_usage" ) - { - @options = qw( - data_in|i=s - ); - } - elsif ( $script eq "read_fasta" ) - { - @options = qw( - data_in|i=s - num|n=s - ); - } - elsif ( $script eq "read_tab" ) - { - @options = qw( - data_in|i=s - delimit|d=s - cols|c=s - keys|k=s - skip|s=s - num|n=s - ); - } - elsif ( $script eq "read_psl" ) - { - @options = qw( - data_in|i=s - num|n=s - ); - } - elsif ( $script eq "read_bed" ) - { - @options = qw( - data_in|i=s - num|n=s - ); - } - elsif ( $script eq "read_fixedstep" ) - { - @options = qw( - data_in|i=s - num|n=s - ); - } - elsif ( $script eq "read_blast_tab" ) - { - @options = qw( - data_in|i=s - num|n=s - ); - } - elsif ( $script eq "read_embl" ) - { - @options = qw( - data_in|i=s - num|n=s - keys|k=s - feats|f=s - quals|q=s - ); - } - elsif ( $script eq "read_stockholm" ) - { - @options = qw( - data_in|i=s - num|n=s - ); - } - elsif ( $script eq "read_phastcons" ) - { - @options = qw( - data_in|i=s - num|n=s - min|m=s - dist|d=s - threshold|t=f - gap|g=s - ); - } - elsif ( $script eq "read_soft" ) - { - @options = qw( - data_in|i=s - samples|s=s - num|n=s - ); - } - elsif ( $script eq "read_gff" ) - { - @options = qw( - data_in|i=s - num|n=s - ); - } - elsif ( $script eq "read_2bit" ) - { - @options = qw( - data_in|i=s - num|n=s - no_mask|N - ); - } - elsif ( $script eq "read_solexa" ) - { - @options = qw( - data_in|i=s - num|n=s - format|f=s - quality|q=s - ); - } - elsif ( $script eq "read_solid" ) - { - @options = qw( - data_in|i=s - num|n=s - quality|q=s - ); - } - elsif ( $script eq "read_mysql" ) - { - @options = qw( - database|d=s - query|q=s - user|u=s - password|p=s - ); - } - elsif ( $script eq "read_ucsc_config" ) - { - @options = qw( - data_in|i=s - num|n=s - ); - } - elsif ( $script eq "format_genome" ) - { - @options = qw( - no_stream|x - dir|d=s - genome|g=s - formats|f=s - ); - } - elsif ( $script eq "length_seq" ) - { - @options = qw( - no_stream|x - data_out|o=s - ); - } - elsif ( $script eq "oligo_freq" ) - { - @options = qw( - word_size|w=s - all|a - ); - } - elsif ( $script eq "create_weight_matrix" ) - { - @options = qw( - percent|p - ); - } - elsif ( $script eq "calc_fixedstep" ) - { - @options = qw( - score|S - log10|L - ); - } - elsif ( $script eq "transliterate_seq" ) - { - @options = qw( - search|s=s - replace|r=s - delete|d=s - ); - } - elsif ( $script eq "transliterate_vals" ) - { - @options = qw( - keys|k=s - search|s=s - replace|r=s - delete|d=s - ); - } - elsif ( $script eq "translate_seq" ) - { - @options = qw( - frames|f=s - ); - } - elsif ( $script eq "extract_seq" ) - { - @options = qw( - beg|b=s - end|e=s - len|l=s - ); - } - elsif ( $script eq "get_genome_seq" ) - { - @options = qw( - genome|g=s - chr|c=s - beg|b=s - end|e=s - len|l=s - flank|f=s - mask|m - ); - } - elsif ( $script eq "get_genome_align" ) - { - @options = qw( - genome|g=s - chr|c=s - beg|b=s - end|e=s - len|l=s - strand|s=s - ); - } - elsif ( $script eq "get_genome_phastcons" ) - { - @options = qw( - genome|g=s - chr|c=s - beg|b=s - end|e=s - len|l=s - flank|f=s - ); - } - elsif ( $script eq "split_seq" ) - { - @options = qw( - word_size|w=s - uniq|u - ); - } - elsif ( $script eq "split_bed" ) - { - @options = qw( - window_size|w=s - step_size|s=s - ); - } - elsif ( $script eq "tile_seq" ) - { - @options = qw( - identity|i=s - supress_indels|s - ); - } - elsif ( $script eq "invert_align" ) - { - @options = qw( - soft|s - ); - } - elsif ( $script eq "patscan_seq" ) - { - @options = qw( - patterns|p=s - patterns_in|P=s - comp|c - max_hits|h=s - max_misses|m=s - genome|g=s - ); - } - elsif ( $script eq "create_blast_db" ) - { - @options = qw( - no_stream|x - database|d=s - ); - } - elsif ( $script eq "blast_seq" ) - { - @options = qw( - database|d=s - genome|g=s - program|p=s - e_val|e=f - filter|f - cpus|c=s - no_filter|F - ); - } - elsif ( $script eq "blat_seq" ) - { - @options = qw( - genome|g=s - tile_size|t=s - step_size|s=s - min_identity|m=s - min_score|M=s - one_off|o=s - ooc|c - ); - } - elsif ( $script eq "soap_seq" ) - { - @options = qw( - in_file|i=s - genome|g=s - seed_size|s=s - mismatches|m=s - gap_size|G=s - cpus|c=s - ); - } - elsif ( $script eq "match_seq" ) - { - @options = qw( - word_size|w=s - direction|d=s - ); - } - elsif ( $script eq "create_vmatch_index" ) - { - @options = qw( - index_name|i=s - prefix_length|p=s - no_stream|x - ); - } - elsif ( $script eq "vmatch_seq" ) - { - @options = qw( - genome|g=s - index_name|i=s - count|c - max_hits|m=s - hamming_dist|h=s - edit_dist|e=s - ); - } - elsif ( $script eq "write_fasta" ) - { - @options = qw( - wrap|w=s - no_stream|x - data_out|o=s - compress|Z - ); - } - elsif ( $script eq "write_align" ) - { - @options = qw( - wrap|w=s - no_stream|x - no_ruler|R - no_consensus|C - data_out|o=s - ); - } - elsif ( $script eq "write_blast" ) - { - @options = qw( - no_stream|x - data_out|o=s - comment|c - compress|Z - ); - } - elsif ( $script eq "write_tab" ) - { - @options = qw( - no_stream|x - data_out|o=s - delimit|d=s - keys|k=s - no_keys|K=s - comment|c - compress|Z - ); - } - elsif ( $script eq "write_bed" ) - { - @options = qw( - cols|c=s - no_stream|x - data_out|o=s - compress|Z - ); - } - elsif ( $script eq "write_psl" ) - { - @options = qw( - no_stream|x - data_out|o=s - compress|Z - ); - } - elsif ( $script eq "write_fixedstep" ) - { - @options = qw( - no_stream|x - data_out|o=s - compress|Z - ); - } - elsif ( $script eq "write_2bit" ) - { - @options = qw( - no_stream|x - data_out|o=s - no_mask|N - ); - } - elsif ( $script eq "write_solid" ) - { - @options = qw( - wrap|w=s - no_stream|x - data_out|o=s - compress|Z - ); - } - elsif ( $script eq "write_ucsc_config" ) - { - @options = qw( - no_stream|x - data_out|o=s - ); - } - elsif ( $script eq "plot_seqlogo" ) - { - @options = qw( - no_stream|x - data_out|o=s - ); - } - elsif ( $script eq "plot_phastcons_profiles" ) - { - @options = qw( - no_stream|x - data_out|o=s - genome|g=s - mean|m - median|M - flank|f=s - terminal|t=s - title|T=s - xlabel|X=s - ylabel|Y=s - ); - } - elsif ( $script eq "analyze_vals" ) - { - @options = qw( - no_stream|x - keys|k=s - ); - } - elsif ( $script eq "head_records" ) - { - @options = qw( - num|n=s - ); - } - elsif ( $script eq "remove_keys" ) - { - @options = qw( - keys|k=s - save_keys|K=s - ); - } - elsif ( $script eq "remove_adaptor" ) - { - @options = qw( - adaptor|a=s - mismatches|m=s - remove|r=s - offset|o=s - ); - } - elsif ( $script eq "remove_mysql_tables" ) - { - @options = qw( - database|d=s - tables|t=s - keys|k=s - user|u=s - password|p=s - no_stream|x - ); - } - elsif ( $script eq "remove_ucsc_tracks" ) - { - @options = qw( - database|d=s - tracks|t=s - keys|k=s - config_file|c=s - user|u=s - password|p=s - no_stream|x - ); - } - elsif ( $script eq "rename_keys" ) - { - @options = qw( - keys|k=s - ); - } - elsif ( $script eq "uniq_vals" ) - { - @options = qw( - key|k=s - invert|i - ); - } - elsif ( $script eq "merge_vals" ) - { - @options = qw( - keys|k=s - delimit|d=s - ); - } - elsif ( $script eq "merge_records" ) - { - @options = qw( - keys|k=s - merge|m=s - ); - } - elsif ( $script eq "grab" ) - { - @options = qw( - patterns|p=s - patterns_in|P=s - regex|r=s - eval|e=s - exact_in|E=s - invert|i - case_insensitive|c - keys|k=s - keys_only|K - vals_only|V - ); - } - elsif ( $script eq "compute" ) - { - @options = qw( - eval|e=s - ); - } - elsif ( $script eq "add_ident" ) - { - @options = qw( - prefix|p=s - key|k=s - ); - } - elsif ( $script eq "count_records" ) - { - @options = qw( - no_stream|x - data_out|o=s - ); - } - elsif ( $script eq "random_records" ) - { - @options = qw( - num|n=s - ); - } - elsif ( $script eq "sort_records" ) - { - @options = qw( - reverse|r - keys|k=s - ); - } - elsif ( $script eq "count_vals" ) - { - @options = qw( - keys|k=s - ); - } - elsif ( $script eq "plot_histogram" ) - { - @options = qw( - no_stream|x - data_out|o=s - terminal|t=s - title|T=s - xlabel|X=s - ylabel|Y=s - key|k=s - sort|s=s - ); - } - elsif ( $script eq "plot_lendist" ) - { - @options = qw( - no_stream|x - data_out|o=s - terminal|t=s - title|T=s - xlabel|X=s - ylabel|Y=s - key|k=s - ); - } - elsif ( $script eq "plot_chrdist" ) - { - @options = qw( - no_stream|x - data_out|o=s - terminal|t=s - title|T=s - xlabel|X=s - ylabel|Y=s - ); - } - elsif ( $script eq "plot_karyogram" ) - { - @options = qw( - no_stream|x - data_out|o=s - genome|g=s - feat_color|f=s - ); - } - elsif ( $script eq "plot_matches" ) - { - @options = qw( - no_stream|x - data_out|o=s - terminal|t=s - title|T=s - xlabel|X=s - ylabel|Y=s - direction|d=s - ); - } - elsif ( $script eq "length_vals" ) - { - @options = qw( - keys|k=s - ); - } - elsif ( $script eq "sum_vals" ) - { - @options = qw( - no_stream|x - data_out|o=s - keys|k=s - ); - } - elsif ( $script eq "mean_vals" ) - { - @options = qw( - no_stream|x - data_out|o=s - keys|k=s - ); - } - elsif ( $script eq "median_vals" ) - { - @options = qw( - no_stream|x - data_out|o=s - keys|k=s - ); - } - elsif ( $script eq "max_vals" ) - { - @options = qw( - no_stream|x - data_out|o=s - keys|k=s - ); - } - elsif ( $script eq "min_vals" ) - { - @options = qw( - no_stream|x - data_out|o=s - keys|k=s - ); - } - elsif ( $script eq "upload_to_ucsc" ) - { - @options = qw( - no_stream|x - database|d=s - table|t=s - short_label|s=s - long_label|l=s - group|g=s - priority|p=f - use_score|u - visibility|V=s - color|c=s - chunk_size|C=s - ); - } - - push @options, qw( - stream_in|I=s - stream_out|O=s - verbose|v - help|? - ); - -# print STDERR Dumper( \@options ); - - GetOptions( - \%options, - @options, - ); - -# print STDERR Dumper( \%options ); - - if ( -t STDIN && scalar( keys %options ) == 0 or $options{ "help" } ) { - return wantarray ? %options : \%options; - } - - $options{ "cols" } = [ split ",", $options{ "cols" } ] if defined $options{ "cols" }; - $options{ "keys" } = [ split ",", $options{ "keys" } ] if defined $options{ "keys" }; - $options{ "no_keys" } = [ split ",", $options{ "no_keys" } ] if defined $options{ "no_keys" }; - $options{ "save_keys" } = [ split ",", $options{ "save_keys" } ] if defined $options{ "save_keys" }; - $options{ "quals" } = [ split ",", $options{ "quals" } ] if defined $options{ "quals" }; - $options{ "feats" } = [ split ",", $options{ "feats" } ] if defined $options{ "feats" }; - $options{ "frames" } = [ split ",", $options{ "frames" } ] if defined $options{ "frames" }; - $options{ "formats" } = [ split ",", $options{ "formats" } ] if defined $options{ "formats" }; - $options{ "samples" } = [ split ",", $options{ "samples" } ] if defined $options{ "samples" }; - $options{ "tables" } = [ split ",", $options{ "tables" } ] if defined $options{ "tables" }; - $options{ "tracks" } = [ split ",", $options{ "tracks" } ] if defined $options{ "tracks" }; - - # ---- check arguments ---- - - if ( $options{ 'data_in' } ) - { - $options{ "files" } = getopt_files( $options{ 'data_in' } ); - - Maasha::Common::error( qq(Argument to --data_in must be a valid file or fileglob expression) ) if scalar @{ $options{ "files" } } == 0; - } - - map { Maasha::Common::error( qq(Argument to --cols must be a whole numbers - not "$_") ) if $_ !~ /^\d+$/ } @{ $options{ "cols" } } if $options{ "cols" }; - - # print STDERR Dumper( \%options ); - - $real = "beg|end|word_size|wrap|chunk_size|tile_size|len|prefix_length|mismatches|offset|num|skip|cpus|window_size|step_size"; - - foreach $opt ( keys %options ) - { - if ( $opt =~ /stream_in|pattern_in|exact_in/ and not -f $options{ $opt } ) - { - Maasha::Common::error( qq(Argument to --$opt must be a valid file or fileglob expression - not "$options{ $opt }") ); - } - elsif ( $opt =~ /$real/ and $options{ $opt } !~ /^\d+$/ ) - { - Maasha::Common::error( qq(Argument to --$opt must be a whole number - not "$options{ $opt }") ); - } - elsif ( $opt =~ /max_hits|max_hits|max_misses|dist|edit_dist|flank|gap|hamming_dist|priority/ and $options{ $opt } !~ /^-?\d+$/ ) - { - Maasha::Common::error( qq(Argument to --$opt must be an integer - not "$options{ $opt }") ); - } - elsif ( $opt =~ /identity|threshold/ and $options{ $opt } !~ /^-?(?:\d+(?:\.\d*)?|\.\d+)$/ ) - { - Maasha::Common::error( qq(Argument to --$opt must be a decimal number - not "$options{ $opt }") ); - } - elsif ( $opt =~ /e_val/ and $options{ $opt } !~ /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/ ) - { - Maasha::Common::error( qq(Argument to --$opt must be a float - not "$options{ $opt }") ); - } - elsif ( $opt =~ /strand/ and $options{ $opt } !~ /^(\+|-)$/ ) - { - Maasha::Common::error( qq(Argument to --$opt must be "+" or "-" - not "$options{ $opt }") ); - } - elsif ( $opt eq "genome" and $script ne "format_genome" ) - { - @genomes = Maasha::Common::ls_dirs( "$ENV{ 'BP_DATA' }/genomes" ); - map { $_ =~ s/.*\/(.+)$/$1/ } @genomes; - - if ( not grep { $_ =~ /^$options{ $opt }$/ } @genomes ) { - Maasha::Common::error( qq(Genome $options{ $opt } not found in "$ENV{ 'BP_DATA' }/genomes/") ); - } - } - elsif ( $opt eq "terminal" and not $options{ $opt } =~ /^(svg|post|dumb|x11)/ ) - { - Maasha::Common::error( qq(Bad --$opt argument "$options{ $opt }") ); - } - elsif ( $opt eq "table" and $options{ $opt } =~ /(-|\.)/ ) - { - Maasha::Common::error( qq(Character '$1' is not allowed in table name: $options{ $opt }) ); - } - elsif ( $opt eq "merge" and $options{ $opt } !~ /^(AandB|AorB|BorA|AnotB|BnotA)$/ ) - { - Maasha::Common::error( qq(Argument to --$opt must be AandB, AorB, BorA, AnotB, or BnotA - not "$options{ $opt }") ); - } - elsif ( $opt eq "format" and $script eq "read_solexa" and $options{ $opt } !~ /octal|decimal/ ) - { - Maasha::Common::error( qq(Argument to --$opt must be octal or decimal - not "$options{ $opt }") ); - } - elsif ( $opt eq "remove" and $script eq "remove_adaptor" and $options{ $opt } !~ /before|after|skip/ ) - { - Maasha::Common::error( qq(Argument to --$opt must be before, after, or skip - not "$options{ $opt }") ); - } - } - - Maasha::Common::error( qq(no --database specified) ) if $script eq "create_blast_db" and not $options{ "database" }; - Maasha::Common::error( qq(no --index_name specified) ) if $script =~ /create_vmatch_index/ and not $options{ "index_name" }; - Maasha::Common::error( qq(no --database or --genome specified) ) if $script eq "blast_seq" and not $options{ "genome" } and not $options{ "database" }; - Maasha::Common::error( qq(both --database and --genome specified) ) if $script eq "blast_seq" and $options{ "genome" } and $options{ "database" }; - Maasha::Common::error( qq(no --index_name or --genome specified) ) if $script eq "vmatch_seq" and not $options{ "genome" } and not $options{ "index_name" }; - Maasha::Common::error( qq(both --index and --genome specified) ) if $script eq "vmatch_seq" and $options{ "genome" } and $options{ "index_name" }; - Maasha::Common::error( qq(no --in_file or --genome specified) ) if $script eq "soap_seq" and not $options{ "genome" } and not $options{ "in_file" }; - Maasha::Common::error( qq(both --in_file and --genome specified) ) if $script eq "soap_seq" and $options{ "genome" } and $options{ "in_file" }; - Maasha::Common::error( qq(no --genome specified) ) if $script =~ /format_genome|get_genome_seq|get_genome_align|get_genome_phastcons|blat_seq|plot_phastcons_profiles|plot_karyogram/ and not $options{ "genome" }; - Maasha::Common::error( qq(no --key specified) ) if $script =~ /plot_lendist|plot_histogram/ and not $options{ "key" }; - Maasha::Common::error( qq(no --keys speficied) ) if $script =~ /sort_records|count_vals|sum_vals|mean_vals|median_vals|length_vals/ and not $options{ "keys" }; - - if ( $script eq "upload_to_ucsc" ) - { - Maasha::Common::error( qq(no --database specified) ) if not $options{ "database" }; - Maasha::Common::error( qq(no --table specified) ) if not $options{ "table" }; - } - - return wantarray ? %options : \%options; -} - - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SCRIPTS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - -sub script_print_usage -{ - # Martin A. Hansen, January 2008. - - # Retrieves usage information from file and - # prints this nicely formatted. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $file, $wiki, $lines ); - - if ( $options->{ 'data_in' } ) { - $file = $options->{ 'data_in' }; - } else { - $file = join "", $ENV{ 'BP_DIR' }, "/bp_usage/", $options->{ 'SCRIPT' }, ".wiki"; - } - - $wiki = Maasha::Gwiki::gwiki_read( $file ); - - if ( not $options->{ "help" } ) { - @{ $wiki } = grep { $_->[ 0 ]->{ 'SECTION' } =~ /Biopiece|Synopsis|Usage|Options|Help/ } @{ $wiki }; - } - - $lines = Maasha::Gwiki::gwiki2ascii( $wiki ); - - print STDERR "$_\n" foreach @{ $lines }; - - exit; -} - - -sub script_list_biopieces -{ - # Martin A. Hansen, January 2008. - - # Prints the synopsis from the usage for each of the biopieces. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( @files, $file, $wiki, $program, $synopsis ); - - @files = Maasha::Common::ls_files( "$ENV{ 'BP_DIR' }/bp_usage" ); - - foreach $file ( sort @files ) - { - if ( $file =~ /\/([a-z0-9_]+)\.wiki$/ ) - { - $program = $1; - - $wiki = Maasha::Gwiki::gwiki_read( $file ); - - @{ $wiki } = grep { $_->[ 0 ]->{ 'SECTION' } =~ /Synopsis/ } @{ $wiki }; - @{ $wiki } = grep { $_->[ 0 ]->{ 'FORMAT' } =~ /paragraph/ } @{ $wiki }; - - $synopsis = $wiki->[ 0 ]->[ 0 ]->{ 'TEXT' }; - $synopsis =~ s/!(\w)/$1/g; - - printf( "%-30s%s\n", $program, $synopsis ); - } - } - - exit; -} - - -sub script_list_genomes -{ - # Martin A. Hansen, January 2008. - - # Prints the synopsis from the usage for each of the biopieces. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( @genomes, $genome, @formats, $format, %hash, %found, @row ); - - @genomes = Maasha::Common::ls_dirs( "$ENV{ 'BP_DATA' }/genomes" ); - - foreach $genome ( @genomes ) - { - next if $genome =~ /\.$/; - - @formats = Maasha::Common::ls_dirs( $genome ); - - foreach $format ( @formats ) - { - if ( $format =~ /\/([^\/]+)\/(\w+)$/ ) - { - $hash{ $1 }{ $2 } = 1; - - $found{ $2 } = 1; - } - } - } - - @row = "Genome"; - - map { push @row, $_ } sort keys %found; - - print join( "\t", @row ), "\n"; - - foreach $genome ( sort keys %hash ) - { - @row = $genome; - - foreach $format ( sort keys %found ) - { - if ( exists $hash{ $genome }{ $format } ) { - push @row, "yes"; - } else { - push @row, "no"; - } - } - - print join( "\t", @row ), "\n"; - } -} - - -sub script_read_fasta -{ - # Martin A. Hansen, August 2007. - - # Read sequences from FASTA file. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $file, $data_in, $entry, $num ); - - while ( $record = get_record( $in ) ) { - put_record( $record, $out ); - } - - $num = 1; - - foreach $file ( @{ $options->{ "files" } } ) - { - $data_in = Maasha::Common::read_open( $file ); - - while ( $entry = Maasha::Fasta::get_entry( $data_in ) ) - { - if ( defined $entry->[ SEQ_NAME ] and $entry->[ SEQ ] ) - { - $record = { - SEQ_NAME => $entry->[ SEQ_NAME ], - SEQ => $entry->[ SEQ ], - SEQ_LEN => length $entry->[ SEQ ], - }; - - put_record( $record, $out ); - } - - goto NUM if $options->{ "num" } and $num == $options->{ "num" }; - - $num++; - } - - close $data_in; - } - - NUM: - - close $data_in if $data_in; -} - - -sub script_read_tab -{ - # Martin A. Hansen, August 2007. - - # Read table or table columns from stream or file. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $file, $line, @fields, @fields2, $i, $record, $data_in, $skip, $num ); - - $options->{ 'delimit' } ||= '\s+'; - - while ( $record = get_record( $in ) ) { - put_record( $record, $out ); - } - - $skip = $options->{ 'skip' } ||= 0; - $num = 1; - - foreach $file ( @{ $options->{ "files" } } ) - { - $data_in = Maasha::Common::read_open( $file ); - - while ( $line = <$data_in> ) - { - if ( $skip ) - { - $skip--; - next; - } - - next if $line =~ /^#|^$/; - - chomp $line; - - undef $record; - undef @fields2; - - @fields = split /$options->{'delimit'}/, $line; - - if ( $options->{ "cols" } ) { - map { push @fields2, $fields[ $_ ] } @{ $options->{ "cols" } }; - } else { - @fields2 = @fields; - } - - for ( $i = 0; $i < @fields2; $i++ ) - { - if ( $options->{ "keys" }->[ $i ] ) { - $record->{ $options->{ "keys" }->[ $i ] } = $fields2[ $i ]; - } else { - $record->{ "V" . $i } = $fields2[ $i ]; - } - } - - put_record( $record, $out ); - - goto NUM if $options->{ "num" } and $num == $options->{ "num" }; - - $num++; - } - - close $data_in; - } - - NUM: - - close $data_in if $data_in; -} - - -sub script_read_psl -{ - # Martin A. Hansen, August 2007. - - # Read psl table from stream or file. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $file, $data_in, $num ); - - while ( $record = get_record( $in ) ) { - put_record( $record, $out ); - } - - $num = 1; - - foreach $file ( @{ $options->{ "files" } } ) - { - $data_in = Maasha::Common::read_open( $file ); - - while ( $record = Maasha::UCSC::psl_get_entry( $data_in ) ) - { - put_record( $record, $out ); - - goto NUM if $options->{ "num" } and $num == $options->{ "num" }; - - $num++; - } - } - - NUM: -} - - -sub script_read_bed -{ - # Martin A. Hansen, August 2007. - - # Read bed table from stream or file. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $file, $record, $entry, $data_in, $num ); - - while ( $record = get_record( $in ) ) { - put_record( $record, $out ); - } - - $num = 1; - - foreach $file ( @{ $options->{ "files" } } ) - { - $data_in = Maasha::Common::read_open( $file ); - - while ( $entry = Maasha::UCSC::bed_get_entry( $data_in ) ) - { - put_record( $entry, $out ); - - goto NUM if $options->{ "num" } and $num == $options->{ "num" }; - - $num++; - } - - close $data_in; - } - - NUM: - - close $data_in if $data_in; -} - - -sub script_read_fixedstep -{ - # Martin A. Hansen, Juli 2008. - - # Read fixedstep wiggle format from stream or file. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $file, $record, $entry, $head, $chr, $chr_beg, $step, $data_in, $num ); - - while ( $record = get_record( $in ) ) { - put_record( $record, $out ); - } - - $num = 1; - - foreach $file ( @{ $options->{ "files" } } ) - { - $data_in = Maasha::Common::read_open( $file ); - - while ( $entry = Maasha::UCSC::fixedstep_get_entry( $data_in ) ) - { - $head = shift @{ $entry }; - - if ( $head =~ /^chrom=([^ ]+) start=(\d+) step=(\d+)$/ ) - { - $record->{ "REC_TYPE" } = "fixed_step"; - $record->{ "CHR" } = $1; - $record->{ "CHR_BEG" } = $2; - $record->{ "STEP" } = $3; - $record->{ "VALS" } = join ";", @{ $entry }; - } - - put_record( $record, $out ); - - goto NUM if $options->{ "num" } and $num == $options->{ "num" }; - - $num++; - } - - close $data_in; - } - - NUM: - - close $data_in if $data_in; -} - - -sub script_read_blast_tab -{ - # Martin A. Hansen, September 2007. - - # Read tabular BLAST output from NCBI blast run with -m8 or -m9. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $file, $line, @fields, $strand, $record, $data_in, $num ); - - while ( $record = get_record( $in ) ) { - put_record( $record, $out ); - } - - $num = 1; - - foreach $file ( @{ $options->{ "files" } } ) - { - $data_in = Maasha::Common::read_open( $file ); - - while ( $line = <$data_in> ) - { - chomp $line; - - next if $line =~ /^#/; - - @fields = split /\t/, $line; - - $record->{ "REC_TYPE" } = "BLAST"; - $record->{ "Q_ID" } = $fields[ 0 ]; - $record->{ "S_ID" } = $fields[ 1 ]; - $record->{ "IDENT" } = $fields[ 2 ]; - $record->{ "ALIGN_LEN" } = $fields[ 3 ]; - $record->{ "MISMATCHES" } = $fields[ 4 ]; - $record->{ "GAPS" } = $fields[ 5 ]; - $record->{ "Q_BEG" } = $fields[ 6 ] - 1; # BLAST is 1-based - $record->{ "Q_END" } = $fields[ 7 ] - 1; # BLAST is 1-based - $record->{ "S_BEG" } = $fields[ 8 ] - 1; # BLAST is 1-based - $record->{ "S_END" } = $fields[ 9 ] - 1; # BLAST is 1-based - $record->{ "E_VAL" } = $fields[ 10 ]; - $record->{ "BIT_SCORE" } = $fields[ 11 ]; - - if ( $record->{ "S_BEG" } > $record->{ "S_END" } ) - { - $record->{ "STRAND" } = '-'; - - ( $record->{ "S_BEG" }, $record->{ "S_END" } ) = ( $record->{ "S_END" }, $record->{ "S_BEG" } ); - } - else - { - $record->{ "STRAND" } = '+'; - } - - put_record( $record, $out ); - - goto NUM if $options->{ "num" } and $num == $options->{ "num" }; - - $num++; - } - - close $data_in; - } - - NUM: - - close $data_in if $data_in; -} - - -sub script_read_embl -{ - # Martin A. Hansen, August 2007. - - # Read EMBL format. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( %options2, $file, $data_in, $num, $entry, $record ); - - map { $options2{ "keys" }{ $_ } = 1 } @{ $options->{ "keys" } }; - map { $options2{ "feats" }{ $_ } = 1 } @{ $options->{ "feats" } }; - map { $options2{ "quals" }{ $_ } = 1 } @{ $options->{ "quals" } }; - - while ( $record = get_record( $in ) ) { - put_record( $record, $out ); - } - - $num = 1; - - foreach $file ( @{ $options->{ "files" } } ) - { - $data_in = Maasha::Common::read_open( $file ); - - while ( $entry = Maasha::EMBL::get_embl_entry( $data_in ) ) - { - $record = Maasha::EMBL::parse_embl_entry( $entry, \%options2 ); - - my ( $feat, $feat2, $qual, $qual_val, $record_copy ); - - $record_copy = dclone $record; - - delete $record_copy->{ "FT" }; - - put_record( $record_copy, $out ); - - delete $record_copy->{ "SEQ" }; - - foreach $feat ( keys %{ $record->{ "FT" } } ) - { - $record_copy->{ "FEAT_TYPE" } = $feat; - - foreach $feat2 ( @{ $record->{ "FT" }->{ $feat } } ) - { - foreach $qual ( keys %{ $feat2 } ) - { - $qual_val = join "; ", @{ $feat2->{ $qual } }; - - $qual =~ s/^_//; - $qual = uc $qual; - - $record_copy->{ $qual } = $qual_val; - } - - put_record( $record_copy, $out ); - } - } - - goto NUM if $options->{ "num" } and $num == $options->{ "num" }; - - $num++; - } - - close $data_in; - } - - NUM: - - close $data_in if $data_in; -} - - -sub script_read_stockholm -{ - # Martin A. Hansen, August 2007. - - # Read Stockholm format. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $data_in, $file, $num, $entry, $record, $record_anno, $record_align, $key, $seq ); - - while ( $record = get_record( $in ) ) { - put_record( $record, $out ); - } - - $num = 1; - - foreach $file ( @{ $options->{ "files" } } ) - { - $data_in = Maasha::Common::read_open( $file ); - - while ( $entry = Maasha::Stockholm::get_stockholm_entry( $data_in ) ) - { - $record = Maasha::Stockholm::parse_stockholm_entry( $entry ); - - undef $record_anno; - - foreach $key ( keys %{ $record->{ "GF" } } ) { - $record_anno->{ $key } = $record->{ "GF" }->{ $key }; - } - - $record_anno->{ "ALIGN" } = $num; - - put_record( $record_anno, $out ); - - foreach $seq ( @{ $record->{ "ALIGN" } } ) - { - undef $record_align; - - $record_align = { - SEQ_NAME => $seq->[ 0 ], - SEQ => $seq->[ 1 ], - }; - - put_record( $record_align, $out ); - } - - goto NUM if $options->{ "num" } and $num == $options->{ "num" }; - - $num++; - } - - close $data_in; - } - - NUM: - - close $data_in if $data_in; -} - - -sub script_read_phastcons -{ - # Martin A. Hansen, December 2007. - - # Read PhastCons format. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $data_in, $file, $num, $entry, @records, $record ); - - $options->{ "min" } ||= 10; - $options->{ "dist" } ||= 25; - $options->{ "threshold" } ||= 0.8; - $options->{ "gap" } ||= 5; - - while ( $record = get_record( $in ) ) { - put_record( $record, $out ); - } - - $num = 1; - - foreach $file ( @{ $options->{ "files" } } ) - { - $data_in = Maasha::Common::read_open( $file ); - - while ( $entry = Maasha::UCSC::fixedstep_get_entry( $data_in ) ) - { - @records = Maasha::UCSC::phastcons_parse_entry( $entry, $options ); - - foreach $record ( @records ) - { - $record->{ "REC_TYPE" } = "BED"; - $record->{ "BED_LEN" } = $record->{ "CHR_END" } - $record->{ "CHR_BEG" } + 1; - - put_record( $record, $out ); - - goto NUM if $options->{ "num" } and $num == $options->{ "num" }; - - $num++; - } - } - - close $data_in; - } - - NUM: - - close $data_in if $data_in; -} - - -sub script_read_soft -{ - # Martin A. Hansen, December 2007. - - # Read soft format. - # http://www.ncbi.nlm.nih.gov/geo/info/soft2.html - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $data_in, $file, $num, $records, $record, $soft_index, $fh, @platforms, $plat_table, @samples, $sample, $old_end, $skip ); - - while ( $record = get_record( $in ) ) { - put_record( $record, $out ); - } - - $num = 1; - - foreach $file ( @{ $options->{ "files" } } ) - { - print STDERR "Creating index for file: $file\n" if $options->{ "verbose" }; - - $soft_index = Maasha::NCBI::soft_index_file( $file ); - - $fh = Maasha::Common::read_open( $file ); - - @platforms = grep { $_->{ "SECTION" } =~ /PLATFORM/ } @{ $soft_index }; - - print STDERR "Getting platform tables for file: $file\n" if $options->{ "verbose" }; - - $plat_table = Maasha::NCBI::soft_get_platform( $fh, $platforms[ 0 ]->{ "LINE_BEG" }, $platforms[ -1 ]->{ "LINE_END" } ); - - @samples = grep { $_->{ "SECTION" } =~ /SAMPLE/ } @{ $soft_index }; - - $old_end = $platforms[ -1 ]->{ "LINE_END" }; - - foreach $sample ( @samples ) - { - $skip = 0; - $skip = 1 if ( $options->{ "samples" } and grep { $sample->{ "SECTION" } !~ /$_/ } @{ $options->{ "samples" } } ); - - print STDERR "Getting samples for dataset: $sample->{ 'SECTION' }\n" if $options->{ "verbose" } and not $skip; - - $records = Maasha::NCBI::soft_get_sample( $fh, $plat_table, $sample->{ "LINE_BEG" } - $old_end - 1, $sample->{ "LINE_END" } - $old_end - 1, $skip ); - - foreach $record ( @{ $records } ) - { - put_record( $record, $out ); - - goto NUM if $options->{ "num" } and $num == $options->{ "num" }; - - $num++; - } - - $old_end = $sample->{ "LINE_END" }; - } - - close $fh; - } - - NUM: - - close $data_in if $data_in; - close $fh if $fh; -} - - -sub script_read_gff -{ - # Martin A. Hansen, February 2008. - - # Read soft format. - # http://www.ncbi.nlm.nih.gov/geo/info/soft2.html - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $data_in, $file, $fh, $num, $record, $entry ); - - while ( $record = get_record( $in ) ) { - put_record( $record, $out ); - } - - $num = 1; - - foreach $file ( @{ $options->{ "files" } } ) - { - $fh = Maasha::Common::read_open( $file ); - - while ( $entry = Maasha::GFF::get_entry( $fh ) ) - { - put_record( $entry, $out ); - - goto NUM if $options->{ "num" } and $num == $options->{ "num" }; - - $num++; - } - - close $fh; - } - - NUM: - - close $data_in if $data_in; -} - - -sub script_read_2bit -{ - # Martin A. Hansen, March 2008. - - # Read sequences from 2bit file. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $file, $data_in, $mask, $toc, $line, $num ); - - $mask = 1 if not $options->{ "no_mask" }; - - while ( $record = get_record( $in ) ) { - put_record( $record, $out ); - } - - $num = 1; - - foreach $file ( @{ $options->{ "files" } } ) - { - $data_in = Maasha::Common::read_open( $file ); - - $toc = Maasha::TwoBit::twobit_get_TOC( $data_in ); - - foreach $line ( @{ $toc } ) - { - $record->{ "SEQ_NAME" } = $line->[ 0 ]; - $record->{ "SEQ" } = Maasha::TwoBit::twobit_get_seq( $data_in, $line->[ 1 ], undef, undef, $mask ); - $record->{ "SEQ_LEN" } = length $record->{ "SEQ" }; - - put_record( $record, $out ); - - goto NUM if $options->{ "num" } and $num == $options->{ "num" }; - - $num++; - } - - close $data_in; - } - - NUM: - - close $data_in if $data_in; -} - - -sub script_read_solexa -{ - # Martin A. Hansen, March 2008. - - # Read Solexa sequence reads from file. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $file, $data_in, $entry, $num, @seqs, @scores, $i ); - - $options->{ "format" } ||= "octal"; - $options->{ "quality" } ||= 20; - - while ( $record = get_record( $in ) ) { - put_record( $record, $out ); - } - - $num = 1; - - foreach $file ( @{ $options->{ "files" } } ) - { - $data_in = Maasha::Common::read_open( $file ); - - if ( $options->{ "format" } eq "octal" ) - { - while ( $entry = Maasha::Solexa::solexa_get_entry_octal( $data_in ) ) - { - $record = Maasha::Solexa::solexa2biopiece( $entry, $options->{ "quality" } ); - - put_record( $record, $out ); - - goto NUM if $options->{ "num" } and $num == $options->{ "num" }; - - $num++; - } - } - else - { - while ( $entry = Maasha::Solexa::solexa_get_entry_decimal( $data_in ) ) - { - $record = Maasha::Solexa::solexa2biopiece( $entry, $options->{ "quality" } ); - - put_record( $record, $out ); - - goto NUM if $options->{ "num" } and $num == $options->{ "num" }; - - $num++; - } - } - - close $data_in; - } - - NUM: - - close $data_in if $data_in; -} - - -sub script_read_solid -{ - # Martin A. Hansen, April 2008. - - # Read Solid sequence from file. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $file, $data_in, $line, $num, $seq_name, $seq_cs, $seq_qual, @scores, @seqs, $i ); - - $options->{ "quality" } ||= 15; - - while ( $record = get_record( $in ) ) { - put_record( $record, $out ); - } - - $num = 1; - - foreach $file ( @{ $options->{ "files" } } ) - { - $data_in = Maasha::Common::read_open( $file ); - - while ( $line = <$data_in> ) - { - chomp $line; - - ( $seq_name, $seq_cs, $seq_qual ) = split /\t/, $line; - - @scores = split /,/, $seq_qual; - @seqs = split //, Maasha::Solid::color_space2seq( $seq_cs ); - - for ( $i = 0; $i < @seqs; $i++ ) { - $seqs[ $i ] = lc $seqs[ $i ] if $scores[ $i ] < $options->{ "quality" }; - } - - $record = { - REC_TYPE => 'SOLID', - SEQ_NAME => $seq_name, - SEQ_CS => $seq_cs, - SEQ_QUAL => join( ";", @scores ), - SEQ_LEN => length $seq_cs, - SEQ => join( "", @seqs ), - SCORE_MEAN => sprintf( "%.2f", Maasha::Calc::mean( \@scores ) ), - }; - - put_record( $record, $out ); - - goto NUM if $options->{ "num" } and $num == $options->{ "num" }; - - $num++; - } - - close $data_in; - } - - NUM: - - close $data_in if $data_in; -} - - -sub script_read_mysql -{ - # Martin A. Hansen, May 2008. - - # Read a MySQL query into stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $dbh, $results ); - - $options->{ "user" } ||= Maasha::UCSC::ucsc_get_user(); - $options->{ "password" } ||= Maasha::UCSC::ucsc_get_password(); - - while ( $record = get_record( $in ) ) { - put_record( $record, $out ); - } - - $dbh = Maasha::SQL::connect( $options->{ "database" }, $options->{ "user" }, $options->{ "password" } ); - - $results = Maasha::SQL::query_hashref_list( $dbh, $options->{ "query" } ); - - Maasha::SQL::disconnect( $dbh ); - - map { put_record( $_ ) } @{ $results }; -} - - -sub script_read_ucsc_config -{ - # Martin A. Hansen, November 2008. - - # Read track entries from UCSC Genome Browser '.ra' files. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $file, $data_in, $entry, $num ); - - while ( $record = get_record( $in ) ) { - put_record( $record, $out ); - } - - $num = 1; - - foreach $file ( @{ $options->{ "files" } } ) - { - $data_in = Maasha::Common::read_open( $file ); - - while ( $record = Maasha::UCSC::ucsc_config_get_entry( $data_in ) ) - { - $record->{ 'REC_TYPE' } = "UCSC Config"; - - put_record( $record, $out ); - - goto NUM if $options->{ "num" } and $num == $options->{ "num" }; - - $num++; - } - - close $data_in; - } - - NUM: - - close $data_in if $data_in; -} - - -sub script_assemble_tag_contigs -{ - # Martin A. Hansen, November 2008. - - # Assemble tags from the stream into - # tag contigs. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $new_record, %fh_hash, $fh_out, $chr, $array, $pos, $beg, $end, $score, $file, $id ); - - while ( $record = get_record( $in ) ) - { - $record->{ "CHR" } = $record->{ "S_ID" } if not defined $record->{ "CHR" }; - $record->{ "CHR_BEG" } = $record->{ "S_BEG" } if not defined $record->{ "CHR_BEG" }; - $record->{ "CHR_END" } = $record->{ "S_END" } if not defined $record->{ "CHR_END" }; - - if ( $record->{ "CHR" } and defined $record->{ "CHR_BEG" } and $record->{ "CHR_END" } ) - { - $fh_hash{ $record->{ "CHR" } } = Maasha::Common::write_open( "$BP_TMP/$record->{ 'CHR' }" ) if not exists $fh_hash{ $record->{ "CHR" } }; - - $fh_out = $fh_hash{ $record->{ "CHR" } }; - - Maasha::UCSC::bed_put_entry( $record, $fh_out, 5 ); - } - } - - map { close $_ } keys %fh_hash; - - foreach $chr ( sort keys %fh_hash ) - { - $array = tag_contigs_assemble( "$BP_TMP/$chr" ); - - $pos = 0; - $id = 0; - - while ( ( $beg, $end, $score ) = tag_contigs_scan( $array, $pos ) and $beg ) - { - $new_record = { - CHR => $chr, - CHR_BEG => $beg, - CHR_END => $end - 1, - Q_ID => sprintf( "TC%06d", $id ), - SCORE => $score, - STRAND => $record->{ 'STRAND' } || '+', - }; - - put_record( $new_record, $out ); - - $pos = $end; - $id++; - } - - unlink "$BP_TMP/$chr"; - } -} - - -sub tag_contigs_assemble -{ - # Martin A. Hansen, November 2008. - - # Given a BED file with entries from only one - # chromosome assembles tag contigs from these - # ignoring strand information. Only tags with - # a score higher than the clone count over - # genomic loci (the SCORE field) is included - # in the tag contigs. - - # ----------- tags - # ------------- - # ---------- - # ---------- - # ======================== tag contig - - - my ( $path, # full path to BED file - ) = @_; - - # Returns an arrayref. - - my ( $fh, $entry, $clones, $score, @array ); - - $fh = Maasha::Common::read_open( $path ); - - while ( $entry = Maasha::UCSC::bed_get_entry( $fh ) ) - { - if ( $entry->{ 'Q_ID' } =~ /(\d+)$/ ) - { - $clones = $1; - - $score = int( $clones / $entry->{ 'SCORE' } ); - - map { $array[ $_ ] += $score } $entry->{ 'CHR_BEG' } .. $entry->{ 'CHR_END' } if $score >= 1; - } - } - - close $fh; - - return wantarray ? @array : \@array; -} - - -sub tag_contigs_scan -{ - # Martin A. Hansen, November 2008. - - # Scans an array with tag contigs and locates - # the next contig from a given position. The - # score of the tag contig is determined as the - # maximum value of the tag contig. If a tag contig - # is found a triple is returned with beg, end and score - # otherwise an empty triple is returned. - - my ( $array, # array to scan - $beg, # position to start scanning from - ) = @_; - - # Returns an arrayref. - - my ( $end, $score ); - - $score = 0; - - while ( $beg < scalar @{ $array } and not $array->[ $beg ] ) { $beg++ } - - $end = $beg; - - while ( $array->[ $end ] ) - { - $score = Maasha::Calc::max( $score, $array->[ $end ] ); - - $end++ - } - - if ( $score > 0 ) { - return wantarray ? ( $beg, $end, $score ) : [ $beg, $end, $score ]; - } else { - return wantarray ? () : []; - } -} - - -sub script_format_genome -{ - # Martin A. Hansen, Juli 2008. - - # Format a genome to speficed formats. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $dir, $genome, $fasta_dir, $phastcons_dir, $vals, $fh_out, $record, $format, $index, $entry ); - - $dir = $options->{ 'dir' } || $ENV{ 'BP_DATA' }; - $genome = $options->{ 'genome' }; - - Maasha::Common::error( "Directory: $dir does not exist" ) if not -d $dir; - Maasha::Common::dir_create_if_not_exists( "$dir/genomes" ); - Maasha::Common::dir_create_if_not_exists( "$dir/genomes/$genome" ); - - if ( grep { $_ =~ /fasta|blast|vmatch/i } @{ $options->{ "formats" } } ) - { - if ( -f "$dir/genomes/$genome/fasta/$genome.fna" ) - { - $fasta_dir = "$dir/genomes/$genome/fasta"; - } - else - { - Maasha::Common::dir_create_if_not_exists( "$dir/genomes/$genome/fasta" ); - - $fasta_dir = "$dir/genomes/$genome/fasta"; - - $fh_out = Maasha::Common::write_open( "$fasta_dir/$genome.fna" ); - } - } - elsif ( grep { $_ =~ /phastcons/i } @{ $options->{ "formats" } } ) - { - Maasha::Common::dir_create_if_not_exists( "$dir/genomes/$genome/phastcons" ); - - $phastcons_dir = "$dir/genomes/$genome/phastcons"; - - $fh_out = Maasha::Common::write_open( "$phastcons_dir/$genome.pp" ); - } - - while ( $record = get_record( $in ) ) - { - if ( $fh_out and $entry = record2fasta( $record ) ) - { - Maasha::Fasta::put_entry( $entry, $fh_out, $options->{ "wrap" } ); - } - elsif ( $fh_out and $record->{ "CHR" } and $record->{ "CHR_BEG" } and $record->{ "STEP" } and $record->{ "VALS" } ) - { - print $fh_out "fixedStep chrom=$record->{ 'CHR' } start=$record->{ 'CHR_BEG' } step=$record->{ 'STEP' }\n"; - - $vals = $record->{ 'VALS' }; - - $vals =~ tr/,/\n/; - - print $fh_out "$vals\n"; - } - - put_record( $record, $out ) if not $options->{ "no_stream" }; - } - - foreach $format ( @{ $options->{ 'formats' } } ) - { - if ( $format =~ /^fasta$/i ) { Maasha::Fasta::fasta_index( "$fasta_dir/$genome.fna", "$dir/genomes/$genome/fasta/$genome.index" ) } - elsif ( $format =~ /^blast$/i ) { Maasha::NCBI::blast_index( "$genome.fna", $fasta_dir, "$dir/genomes/$genome/blast", "dna", $genome ) } - elsif ( $format =~ /^blat$/i ) { print STDERR "BLAT FORMAT NOT IMPLEMENTED" } - elsif ( $format =~ /^vmatch$/i ) { Maasha::Match::vmatch_index( "$genome.fna", $fasta_dir, "$dir/genomes/$genome/vmatch", $BP_TMP ) } - elsif ( $format =~ /^phastcons$/i ) { Maasha::UCSC::phastcons_index( "$genome.pp", $phastcons_dir ) } - } - - close $fh_out if $fh_out; -} - - -sub script_length_seq -{ - # Martin A. Hansen, August 2007. - - # Determine the length of sequences in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $total ); - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "SEQ" } ) - { - $record->{ "SEQ_LEN" } = length $record->{ "SEQ" }; - $total += $record->{ "SEQ_LEN" }; - } - - put_record( $record, $out ) if not $options->{ "no_stream" }; - } - - put_record( { TOTAL_SEQ_LEN => $total }, $out ); -} - - -sub script_uppercase_seq -{ - # Martin A. Hansen, August 2007. - - # Uppercases sequences in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - ) = @_; - - # Returns nothing. - - my ( $record ); - - while ( $record = get_record( $in ) ) - { - $record->{ "SEQ" } = uc $record->{ "SEQ" } if $record->{ "SEQ" }; - - put_record( $record, $out ); - } -} - - -sub script_shuffle_seq -{ - # Martin A. Hansen, December 2007. - - # Shuffle sequences in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - ) = @_; - - # Returns nothing. - - my ( $record ); - - while ( $record = get_record( $in ) ) - { - $record->{ "SEQ" } = Maasha::Seq::seq_shuffle( $record->{ "SEQ" } ) if $record->{ "SEQ" }; - - put_record( $record, $out ); - } -} - - -sub script_analyze_seq -{ - # Martin A. Hansen, August 2007. - - # Analyze sequence composition of sequences in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - ) = @_; - - # Returns nothing. - - my ( $record, $analysis ); - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "SEQ" } ) - { - $analysis = Maasha::Seq::seq_analyze( $record->{ "SEQ" } ); - - map { $record->{ $_ } = $analysis->{ $_ } } keys %{ $analysis }; - } - - put_record( $record, $out ); - } -} - - -sub script_analyze_tags -{ - # Martin A. Hansen, August 2008. - - # Analyze sequence tags in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - ) = @_; - - # Returns nothing. - - my ( $record, $analysis, %len_hash, %clone_hash, $clones, $key, $tag_record ); - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) - { - if ( $record->{ "SEQ_NAME" } =~ /_(\d+)$/ ) - { - $clones = $1; - - $len_hash{ length( $record->{ "SEQ" } ) }++; - $clone_hash{ length( $record->{ "SEQ" } ) } += $clones; - } - } - elsif ( $record->{ "Q_ID" } and $record->{ "BED_LEN" } ) - { - if ( $record->{ "Q_ID" } =~ /_(\d+)$/ ) - { - $clones = $1; - - $len_hash{ $record->{ "BED_LEN" } }++; - $clone_hash{ $record->{ "BED_LEN" } } += $clones; - } - } - } - - foreach $key ( sort { $a <=> $b } keys %len_hash ) - { - $tag_record->{ "TAG_LEN" } = $key; - $tag_record->{ "TAG_COUNT" } = $len_hash{ $key }; - $tag_record->{ "TAG_CLONES" } = $clone_hash{ $key }; - - put_record( $tag_record, $out ); - } -} - - -sub script_complexity_seq -{ - # Martin A. Hansen, May 2008. - - # Generates an index calculated as the most common di-residue over - # the sequence length for all sequences in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - ) = @_; - - # Returns nothing. - - my ( $record, $index ); - - while ( $record = get_record( $in ) ) - { - $record->{ "SEQ_COMPLEXITY" } = sprintf( "%.2f", Maasha::Seq::seq_complexity( $record->{ "SEQ" } ) ) if $record->{ "SEQ" }; - - put_record( $record, $out ); - } -} - - -sub script_oligo_freq -{ - # Martin A. Hansen, August 2007. - - # Determine the length of sequences in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, %oligos, @freq_table ); - - $options->{ "word_size" } ||= 7; - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "SEQ" } ) - { - map { $oligos{ $_ }++ } Maasha::Seq::seq2oligos( \$record->{ "SEQ" }, $options->{ "word_size" } ); - - if ( not $options->{ "all" } ) - { - @freq_table = Maasha::Seq::oligo_freq( \%oligos ); - - map { put_record( $_, $out ) } @freq_table; - - undef %oligos; - } - } - - put_record( $record, $out ); - } - - if ( $options->{ "all" } ) - { - @freq_table = Maasha::Seq::oligo_freq( \%oligos ); - - map { put_record( $_, $out ) } @freq_table; - } -} - - -sub script_create_weight_matrix -{ - # Martin A. Hansen, August 2007. - - # Creates a weight matrix from an alignmnet. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $count, $i, $res, %freq_hash, %res_hash, $freq ); - - $count = 0; - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "SEQ" } ) - { - for ( $i = 0; $i < length $record->{ "SEQ" }; $i++ ) - { - $res = substr $record->{ "SEQ" }, $i, 1; - - $freq_hash{ $i }{ $res }++; - $res_hash{ $res } = 1; - } - - $count++; - } - else - { - put_record( $record, $out ); - } - } - - foreach $res ( sort keys %res_hash ) - { - undef $record; - - $record->{ "V0" } = $res; - - for ( $i = 0; $i < keys %freq_hash; $i++ ) - { - $freq = $freq_hash{ $i }{ $res } || 0; - - if ( $options->{ "percent" } ) { - $freq = sprintf( "%.0f", 100 * $freq / $count ) if $freq > 0; - } - - $record->{ "V" . ( $i + 1 ) } = $freq; - } - - put_record( $record, $out ); - } -} - - -sub script_calc_bit_scores -{ - # Martin A. Hansen, March 2007. - - # Calculates the bit scores for each position from an alignmnet in the stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - ) = @_; - - # Returns nothing. - - my ( $record, $type, $count, $i, $res, %freq_hash, $bit_max, $bit_height, $bit_diff ); - - $count = 0; - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "SEQ" } ) - { - $type = Maasha::Seq::seq_guess_type( $record->{ "SEQ" } ) if not $type; - - for ( $i = 0; $i < length $record->{ "SEQ" }; $i++ ) - { - $res = substr $record->{ "SEQ" }, $i, 1; - - next if $res =~ /-|_|~|\./; - - $freq_hash{ $i }{ $res }++; - } - - $count++; - } - else - { - put_record( $record, $out ); - } - } - - undef $record; - - if ( $type eq "protein" ) { - $bit_max = 4; - } else { - $bit_max = 2; - } - - for ( $i = 0; $i < keys %freq_hash; $i++ ) - { - $bit_height = Maasha::Seq::seqlogo_calc_bit_height( $freq_hash{ $i }, $count ); - - $bit_diff = $bit_max - $bit_height; - - $record->{ "V" . ( $i ) } = sprintf( "%.2f", $bit_diff ); - } - - put_record( $record, $out ); -} - - -sub script_calc_fixedstep -{ - # Martin A. Hansen, September 2008. - - # Calculates fixedstep entries from data in the stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, %fh_hash, $fh_in, $fh_out, $chr, $chr, $beg, $end, $q_id, $block, $entry, $clones, $beg_block, $max, $i ); - - while ( $record = get_record( $in ) ) - { - $record->{ "CHR" } = $record->{ "S_ID" } if not defined $record->{ "CHR" }; - $record->{ "CHR_BEG" } = $record->{ "S_BEG" } if not defined $record->{ "CHR_BEG" }; - $record->{ "CHR_END" } = $record->{ "S_END" } if not defined $record->{ "CHR_END" }; - - if ( $record->{ "CHR" } and defined $record->{ "CHR_BEG" } and $record->{ "CHR_END" } ) - { - $fh_hash{ $record->{ "CHR" } } = Maasha::Common::write_open( "$BP_TMP/$record->{ 'CHR' }" ) if not exists $fh_hash{ $record->{ "CHR" } }; - - $fh_out = $fh_hash{ $record->{ "CHR" } }; - - Maasha::UCSC::bed_put_entry( $record, $fh_out, 5 ); - } - } - - map { close $_ } keys %fh_hash; - - foreach $chr ( sort keys %fh_hash ) - { - #Maasha::Common::run( "bedSort", "$BP_TMP/$chr $BP_TMP/$chr" ); - Maasha::Common::run( "bed_sort", "--sort 3 --cols 5 $BP_TMP/$chr > $BP_TMP/$chr.sort" ); - - rename "$BP_TMP/$chr.sort", "$BP_TMP/$chr"; - - $fh_in = Maasha::Common::read_open( "$BP_TMP/$chr" ); - - undef $block; - - while ( $entry = Maasha::UCSC::bed_get_entry( $fh_in, 5 ) ) - { - $chr = $entry->{ 'CHR' }; - $beg = $entry->{ 'CHR_BEG' }; - $end = $entry->{ 'CHR_END' }; - $q_id = $entry->{ 'Q_ID' }; - - if ( $options->{ "score" } ) { - $clones = $entry->{ 'SCORE' }; - } elsif ( $q_id =~ /_(\d+)$/ ) { - $clones = $1; - } else { - $clones = 1; - } - - if ( $block ) - { - if ( $beg > $max ) - { - map { $_ = sprintf( "%.4f", Maasha::Calc::log10( $_ ) ) } @{ $block } if $options->{ "log10" }; - - $record->{ "CHR" } = $chr; - $record->{ "CHR_BEG" } = $beg_block; - $record->{ "STEP" } = 1; - $record->{ "VALS" } = join ";", @{ $block }; - $record->{ "REC_TYPE" } = "fixed_step"; - - put_record( $record, $out ); - - undef $block; - } - else - { - for ( $i = $beg - $beg_block; $i < ( $beg - $beg_block ) + ( $end - $beg ); $i++ ) { - $block->[ $i ] += $clones; - } - - $max = Maasha::Calc::max( $max, $end ); - } - } - - if ( not $block ) - { - $beg_block = $beg; - $max = $end; - - for ( $i = 0; $i < ( $end - $beg ); $i++ ) { - $block->[ $i ] += $clones; - } - } - } - - close $fh_in; - - map { $_ = sprintf( "%.4f", Maasha::Calc::log10( $_ ) ) } @{ $block } if $options->{ "log10" }; - - $record->{ "CHR" } = $chr; - $record->{ "CHR_BEG" } = $beg_block; - $record->{ "STEP" } = 1; - $record->{ "VALS" } = join ";", @{ $block }; - $record->{ "REC_TYPE" } = "fixed_step"; - - put_record( $record, $out ); - - unlink "$BP_TMP/$chr"; - } -} - - -sub script_reverse_seq -{ - # Martin A. Hansen, August 2007. - - # Reverse sequence in record. - - my ( $in, # handle to in stream - $out, # handle to out stream - ) = @_; - - # Returns nothing. - - my ( $record ); - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "SEQ" } ) { - $record->{ "SEQ" } = reverse $record->{ "SEQ" }; - } - - put_record( $record, $out ); - } -} - - -sub script_complement_seq -{ - # Martin A. Hansen, August 2007. - - # Complement sequence in record. - - my ( $in, # handle to in stream - $out, # handle to out stream - ) = @_; - - # Returns nothing. - - my ( $record, $type ); - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "SEQ" } ) - { - if ( not $type ) { - $type = Maasha::Seq::seq_guess_type( $record->{ "SEQ" } ); - } - - if ( $type eq "rna" ) { - Maasha::Seq::rna_comp( \$record->{ "SEQ" } ); - } elsif ( $type eq "dna" ) { - Maasha::Seq::dna_comp( \$record->{ "SEQ" } ); - } - } - - put_record( $record, $out ); - } -} - - -sub script_remove_indels -{ - # Martin A. Hansen, August 2007. - - # Remove indels from sequences in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - ) = @_; - - # Returns nothing. - - my ( $record ); - - while ( $record = get_record( $in ) ) - { - $record->{ 'SEQ' } =~ tr/-~.//d if $record->{ "SEQ" }; - - put_record( $record, $out ); - } -} - - -sub script_transliterate_seq -{ - # Martin A. Hansen, August 2007. - - # Transliterate chars from sequence in record. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $search, $replace, $delete ); - - $search = $options->{ "search" } || ""; - $replace = $options->{ "replace" } || ""; - $delete = $options->{ "delete" } || ""; - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "SEQ" } ) - { - if ( $search and $replace ) { - eval "\$record->{ 'SEQ' } =~ tr/$search/$replace/"; - } elsif ( $delete ) { - eval "\$record->{ 'SEQ' } =~ tr/$delete//d"; - } - } - - put_record( $record, $out ); - } -} - - -sub script_transliterate_vals -{ - # Martin A. Hansen, April 2008. - - # Transliterate chars from values in record. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $search, $replace, $delete, $key ); - - $search = $options->{ "search" } || ""; - $replace = $options->{ "replace" } || ""; - $delete = $options->{ "delete" } || ""; - - while ( $record = get_record( $in ) ) - { - foreach $key ( @{ $options->{ "keys" } } ) - { - if ( exists $record->{ $key } ) - { - if ( $search and $replace ) { - eval "\$record->{ $key } =~ tr/$search/$replace/"; - } elsif ( $delete ) { - eval "\$record->{ $key } =~ tr/$delete//d"; - } - } - } - - put_record( $record, $out ); - } -} - - -sub script_translate_seq -{ - # Martin A. Hansen, February 2008. - - # Translate DNA sequence into protein sequence. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $frame, %new_record ); - - $options->{ "frames" } ||= [ 1, 2, 3, -1, -2, -3 ]; - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "SEQ" } ) - { - if ( Maasha::Seq::seq_guess_type( $record->{ "SEQ" } ) eq "dna" ) - { - foreach $frame ( @{ $options->{ "frames" } } ) - { - %new_record = %{ $record }; - - $new_record{ "SEQ" } = Maasha::Seq::translate( $record->{ "SEQ" }, $frame ); - $new_record{ "SEQ_LEN" } = length $new_record{ "SEQ" }; - $new_record{ "FRAME" } = $frame; - - put_record( \%new_record, $out ); - } - } - } - else - { - put_record( $record, $out ); - } - } -} - - -sub script_extract_seq -{ - # Martin A. Hansen, August 2007. - - # Extract subsequences from sequences in record. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $beg, $end, $len, $record ); - - if ( not defined $options->{ "beg" } or $options->{ "beg" } < 0 ) { - $beg = 0; - } else { - $beg = $options->{ "beg" } - 1; # correcting for start offset - } - - if ( defined $options->{ "end" } and $options->{ "end" } - 1 < $beg ) { - $end = $beg - 1; - } elsif ( defined $options->{ "end" } ) { - $end = $options->{ "end" } - 1; # correcting for start offset - } - - $len = $options->{ "len" }; - -# print "beg->$beg, end->$end, len->$len\n"; - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "SEQ" } ) - { - if ( defined $beg and defined $end ) - { - if ( $end - $beg + 1 > length $record->{ "SEQ" } ) { - $record->{ "SEQ" } = substr $record->{ "SEQ" }, $beg; - } else { - $record->{ "SEQ" } = substr $record->{ "SEQ" }, $beg, $end - $beg + 1; - } - } - elsif ( defined $beg and defined $len ) - { - if ( $len > length $record->{ "SEQ" } ) { - $record->{ "SEQ" } = substr $record->{ "SEQ" }, $beg; - } else { - $record->{ "SEQ" } = substr $record->{ "SEQ" }, $beg, $len; - } - } - elsif ( defined $beg ) - { - $record->{ "SEQ" } = substr $record->{ "SEQ" }, $beg; - } - } - - $record->{ "SEQ_LEN" } = length $record->{ "SEQ" }; - - put_record( $record, $out ); - } -} - - -sub script_get_genome_seq -{ - # Martin A. Hansen, December 2007. - - # Gets a subsequence from a genome. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $genome, $genome_file, $index_file, $index, $fh, $index_head, $index_beg, $index_len, $beg, $len, %lookup_hash, @begs, @lens, $i ); - - $options->{ "flank" } ||= 0; - - if ( $options->{ "genome" } ) - { - $genome = $options->{ "genome" }; - - $genome_file = "$ENV{ 'BP_DATA' }/genomes/$genome/fasta/$genome.fna"; - $index_file = "$ENV{ 'BP_DATA' }/genomes/$genome/fasta/$genome.index"; - - $fh = Maasha::Common::read_open( $genome_file ); - $index = Maasha::Fasta::index_retrieve( $index_file ); - - shift @{ $index }; # Get rid of the file size info - - map { $lookup_hash{ $_->[ 0 ] } = [ $_->[ 1 ], $_->[ 2 ] ] } @{ $index }; - - if ( exists $lookup_hash{ $options->{ "chr" } } and defined $options->{ "beg" } and ( defined $options->{ "end" } or defined $options->{ "len" } ) ) - { - ( $index_beg, $index_len ) = @{ $lookup_hash{ $options->{ "chr" } } }; - - $beg = $index_beg + $options->{ "beg" } - 1; - - if ( $options->{ "len" } ) { - $len = $options->{ "len" }; - } elsif ( $options->{ "end" } ) { - $len = ( $options->{ "end" } - $options->{ "beg" } + 1 ); - } - - $beg -= $options->{ "flank" }; - $len += 2 * $options->{ "flank" }; - - if ( $beg <= $index_beg ) - { - $len -= $index_beg - $beg; - $beg = $index_beg; - } - - $len = $index_beg + $index_len - $beg if $beg + $len > $index_beg + $index_len; - - next if $beg > $index_beg + $index_len; - - $record->{ "CHR" } = $options->{ "chr" }; - $record->{ "CHR_BEG" } = $beg - $index_beg; - $record->{ "CHR_END" } = $record->{ "CHR_BEG" } + $len - 1; - - $record->{ "SEQ" } = Maasha::Common::file_read( $fh, $beg, $len ); - $record->{ "SEQ_LEN" } = $len; - - put_record( $record, $out ); - } - } - - while ( $record = get_record( $in ) ) - { - if ( $options->{ "genome" } and not $record->{ "SEQ" } ) - { - if ( $record->{ "REC_TYPE" } eq "BED" and exists $lookup_hash{ $record->{ "CHR" } } ) - { - ( $index_beg, $index_len ) = @{ $lookup_hash{ $record->{ "CHR" } } }; - - $beg = $record->{ "CHR_BEG" } + $index_beg; - $len = $record->{ "CHR_END" } - $record->{ "CHR_BEG" } + 1; - } - elsif ( $record->{ "REC_TYPE" } eq "PSL" and exists $lookup_hash{ $record->{ "S_ID" } } ) - { - ( $index_beg, $index_len ) = @{ $lookup_hash{ $record->{ "S_ID" } } }; - - $beg = $record->{ "S_BEG" } + $index_beg; - $len = $record->{ "S_END" } - $record->{ "S_BEG" } + 1; - } - elsif ( $record->{ "REC_TYPE" } eq "BLAST" and exists $lookup_hash{ $record->{ "S_ID" } } ) - { - ( $index_beg, $index_len ) = @{ $lookup_hash{ $record->{ "S_ID" } } }; - - $beg = $record->{ "S_BEG" } + $index_beg; - $len = $record->{ "S_END" } - $record->{ "S_BEG" } + 1; - } - - $beg -= $options->{ "flank" }; - $len += 2 * $options->{ "flank" }; - - if ( $beg <= $index_beg ) - { - $len -= $index_beg - $beg; - $beg = $index_beg; - } - - $len = $index_beg + $index_len - $beg if $beg + $len > $index_beg + $index_len; - - next if $beg > $index_beg + $index_len; - - $record->{ "CHR_BEG" } = $beg - $index_beg; - $record->{ "CHR_END" } = $record->{ "CHR_BEG" } + $len - 1; - - $record->{ "SEQ" } = Maasha::Common::file_read( $fh, $beg, $len ); - - if ( $record->{ "STRAND" } and $record->{ "STRAND" } eq "-" ) - { - Maasha::Seq::dna_comp( \$record->{ "SEQ" } ); - $record->{ "SEQ" } = reverse $record->{ "SEQ" }; - } - - if ( $options->{ "mask" } ) - { - if ( $record->{ "BLOCKCOUNT" } > 1 ) # uppercase hit block segments and lowercase the rest. - { - $record->{ "SEQ" } = lc $record->{ "SEQ" }; - - @begs = split ",", $record->{ "Q_BEGS" }; - @lens = split ",", $record->{ "BLOCKSIZES" }; - - for ( $i = 0; $i < @begs; $i++ ) { - substr $record->{ "SEQ" }, $begs[ $i ], $lens[ $i ], uc substr $record->{ "SEQ" }, $begs[ $i ], $lens[ $i ]; - } - } - } - } - - put_record( $record, $out ); - } - - close $fh if $fh; -} - - -sub script_get_genome_align -{ - # Martin A. Hansen, April 2008. - - # Gets a subalignment from a multiple genome alignment. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $maf_track, $align, $align_num, $beg, $end, $len, $entry ); - - $options->{ "strand" } ||= "+"; - - $align_num = 1; - - $maf_track = Maasha::Config::maf_track( $options->{ "genome" } ); - - if ( $options->{ "chr" } and $options->{ "beg" } and ( $options->{ "end" } or $options->{ "len" } ) ) - { - $beg = $options->{ "beg" } - 1; - - if ( $options->{ "end" } ) { - $end = $options->{ "end" }; - } elsif ( $options->{ "len" } ) { - $end = $beg + $options->{ "len" }; - } - - $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $options->{ "chr" }, $beg, $end, $options->{ "strand" } ); - - foreach $entry ( @{ $align } ) - { - $entry->{ "CHR" } = $record->{ "CHR" }; - $entry->{ "CHR_BEG" } = $record->{ "CHR_BEG" }; - $entry->{ "CHR_END" } = $record->{ "CHR_END" }; - $entry->{ "STRAND" } = $record->{ "STRAND" } || '+'; - $entry->{ "Q_ID" } = $record->{ "Q_ID" }; - $entry->{ "SCORE" } = $record->{ "SCORE" }; - - put_record( $entry, $out ); - } - } - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "REC_TYPE" } eq "BED" ) - { - $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "CHR" }, $record->{ "CHR_BEG" }, $record->{ "CHR_END" }, $record->{ "STRAND" } ); - } - elsif ( $record->{ "REC_TYPE" } eq "PSL" ) - { - $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $record->{ "STRAND" } ); - } - elsif ( $record->{ "REC_TYPE" } eq "BLAST" ) - { - $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $record->{ "STRAND" } ); - } - - foreach $entry ( @{ $align } ) - { - $entry->{ "CHR" } = $record->{ "CHR" }; - $entry->{ "CHR_BEG" } = $record->{ "CHR_BEG" }; - $entry->{ "CHR_END" } = $record->{ "CHR_END" }; - $entry->{ "STRAND" } = $record->{ "STRAND" }; - $entry->{ "Q_ID" } = $record->{ "Q_ID" }; - $entry->{ "SCORE" } = $record->{ "SCORE" }; - - put_record( $entry, $out ); - } - - $align_num++; - } -} - - -sub script_get_genome_phastcons -{ - # Martin A. Hansen, February 2008. - - # Get phastcons scores from genome intervals. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $phastcons_file, $phastcons_index, $index, $fh_phastcons, $scores, $record ); - - $options->{ "flank" } ||= 0; - - $phastcons_file = Maasha::Config::genome_phastcons( $options->{ "genome" } ); - $phastcons_index = Maasha::Config::genome_phastcons_index( $options->{ "genome" } ); - - $index = Maasha::UCSC::fixedstep_index_retrieve( $phastcons_index ); - $fh_phastcons = Maasha::Common::read_open( $phastcons_file ); - - if ( defined $options->{ "chr" } and defined $options->{ "beg" } and ( defined $options->{ "end" } or defined $options->{ "len" } ) ) - { - $options->{ "beg" } -= 1; # request is 1-based - $options->{ "end" } -= 1; # request is 1-based - - if ( $options->{ "len" } ) { - $options->{ "end" } = $options->{ "beg" } + $options->{ "len" } - 1; - } - - $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $options->{ "chr" }, $options->{ "beg" }, $options->{ "end" }, $options->{ "flank" } ); - - $record->{ "CHR" } = $options->{ "chr" }; - $record->{ "CHR_BEG" } = $options->{ "beg" } - $options->{ "flank" }; - $record->{ "CHR_END" } = $options->{ "end" } + $options->{ "flank" }; - - $record->{ "PHASTCONS" } = join ",", @{ $scores }; - $record->{ "PHAST_COUNT" } = scalar @{ $scores }; # DEBUG - - put_record( $record, $out ); - } - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "REC_TYPE" } eq "BED" ) - { - $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "CHR" }, $record->{ "CHR_BEG" }, $record->{ "CHR_END" }, $options->{ "flank" } ); - } - elsif ( $record->{ "REC_TYPE" } eq "PSL" ) - { - $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $options->{ "flank" } ); - } - elsif ( $record->{ "REC_TYPE" } eq "BLAST" ) - { - $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $options->{ "flank" } ); - } - - $record->{ "PHASTCONS" } = join ",", @{ $scores } if @{ $scores }; -# $record->{ "PHAST_COUNT" } = @{ $scores } if @{ $scores }; # DEBUG - - put_record( $record, $out ); - } - - close $fh_phastcons if $fh_phastcons; -} - - -sub script_fold_seq -{ - # Martin A. Hansen, December 2007. - - # Folds sequences in stream into secondary structures. - - my ( $in, # handle to in stream - $out, # handle to out stream - ) = @_; - - # Returns nothing. - - my ( $record, $type, $struct, $index ); - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "SEQ" } ) - { - if ( not $type ) { - $type = Maasha::Seq::seq_guess_type( $record->{ "SEQ" } ); - } - - if ( $type ne "protein" ) - { - ( $struct, $index ) = Maasha::Seq::fold_struct_rnafold( $record->{ "SEQ" } ); - $record->{ "SEC_STRUCT" } = $struct; - $record->{ "FREE_ENERGY" } = $index; - $record->{ "SCORE" } = abs int $index; - $record->{ "SIZE" } = length $struct; - $record->{ "CONF" } = "1," x $record->{ "SIZE" }; - } - } - - put_record( $record, $out ); - } -} - - -sub script_split_seq -{ - # Martin A. Hansen, August 2007. - - # Split a sequence in stream into words. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $new_record, $i, $subseq, %lookup ); - - $options->{ "word_size" } ||= 7; - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) - { - for ( $i = 0; $i < length( $record->{ "SEQ" } ) - $options->{ "word_size" } + 1; $i++ ) - { - $subseq = substr $record->{ "SEQ" }, $i, $options->{ "word_size" }; - - if ( $options->{ "uniq" } and not $lookup{ $subseq } ) - { - $new_record->{ "SEQ_NAME" } = $record->{ "SEQ_NAME" } . "[" . ( $i + 1 ) . "-" . ( $i + $options->{ "word_size" } ) . "]"; - $new_record->{ "SEQ" } = $subseq; - - put_record( $new_record, $out ); - - $lookup{ $subseq } = 1; - } - else - { - $new_record->{ "SEQ_NAME" } = $record->{ "SEQ_NAME" } . "[" . ( $i + 1 ) . "-" . ( $i + $options->{ "word_size" } ) . "]"; - $new_record->{ "SEQ" } = $subseq; - - put_record( $new_record, $out ); - } - } - } - else - { - put_record( $record, $out ); - } - } -} - - -sub script_split_bed -{ - # Martin A. Hansen, June 2008. - - # Split a BED record into overlapping windows. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $new_record, $i ); - - $options->{ "window_size" } ||= 20; - $options->{ "step_size" } ||= 1; - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "CHR" } and $record->{ "CHR_BEG" } and $record->{ "CHR_END" } ) - { - $record->{ "BED_LEN" } = $record->{ "CHR_END" } - $record->{ "CHR_BEG" } + 1; - - for ( $i = 0; $i < $record->{ "BED_LEN" } - $options->{ "window_size" }; $i += $options->{ "step_size" } ) - { - $new_record->{ "REC_TYPE" } = "BED"; - $new_record->{ "CHR" } = $record->{ "CHR" }; - $new_record->{ "CHR_BEG" } = $record->{ "CHR_BEG" } + $i; - $new_record->{ "CHR_END" } = $record->{ "CHR_BEG" } + $i + $options->{ "window_size" }; - $new_record->{ "BED_LEN" } = $options->{ "window_size" }; - $new_record->{ "Q_ID" } = $record->{ "Q_ID" } . "_$i"; - $new_record->{ "SCORE" } = $record->{ "SCORE" }; - $new_record->{ "STRAND" } = $record->{ "STRAND" }; - - put_record( $new_record, $out ); - } - } - else - { - put_record( $record, $out ); - } - } -} - - -sub script_align_seq -{ - # Martin A. Hansen, August 2007. - - # Align sequences in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - ) = @_; - - # Returns nothing. - - my ( $record, @entries, $entry ); - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) { - push @entries, [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ]; - } elsif ( $record->{ "Q_ID" } and $record->{ "SEQ" } ) { - push @entries, [ $record->{ "Q_ID" }, $record->{ "SEQ" } ]; - } else { - put_record( $record, $out ); - } - } - - @entries = Maasha::Align::align( \@entries ); - - foreach $entry ( @entries ) - { - if ( $entry->[ SEQ_NAME ] and $entry->[ SEQ ] ) - { - $record = { - SEQ_NAME => $entry->[ SEQ_NAME ], - SEQ => $entry->[ SEQ ], - }; - - put_record( $record, $out ); - } - } -} - - -sub script_tile_seq -{ - # Martin A. Hansen, February 2008. - - # Using the first sequence in stream as reference, tile - # all subsequent sequences based on pairwise alignments. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $first, $ref_entry, @entries ); - - $first = 1; - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) - { - if ( $first ) - { - $ref_entry = [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ]; - - $first = 0; - } - else - { - push @entries, [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ]; - } - } - else - { - put_record( $record, $out ); - } - } - - @entries = Maasha::Align::align_tile( $ref_entry, \@entries, $options ); - - map { put_record( { SEQ_NAME => $_->[ SEQ_NAME ], SEQ => $_->[ SEQ ] }, $out ) } @entries; -} - - -sub script_invert_align -{ - # Martin A. Hansen, February 2008. - - # Inverts an alignment showing only non-mathing residues - # using the first sequence as reference. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, @entries ); - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) - { - push @entries, [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ]; - } - else - { - put_record( $record, $out ); - } - } - - Maasha::Align::align_invert( \@entries, $options->{ "soft" } ); - - map { put_record( { SEQ_NAME => $_->[ SEQ_NAME ], SEQ => $_->[ SEQ ] }, $out ) } @entries; -} - - -sub script_patscan_seq -{ - # Martin A. Hansen, August 2007. - - # Locates patterns in sequences using scan_for_matches. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $genome_file, @args, $arg, $type, $seq_file, $pat_file, $out_file, $fh_in, $fh_out, $record, $patterns, $pattern, $entry, $result, %head_hash, $i ); - - if ( $options->{ "patterns" } ) { - $patterns = Maasha::Patscan::parse_patterns( $options->{ "patterns" } ); - } elsif ( -f $options->{ "patterns_in" } ) { - $patterns = Maasha::Patscan::read_patterns( $options->{ "patterns_in" } ); - } - - $genome_file = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/fasta/$options->{ 'genome' }.fna" if $options->{ 'genome' }; - - push @args, "-c" if $options->{ "comp" }; - push @args, "-m $options->{ 'max_hits' }" if $options->{ 'max_hits' }; - push @args, "-n $options->{ 'max_misses' }" if $options->{ 'max_hits' }; - - $seq_file = "$BP_TMP/patscan.seq"; - $pat_file = "$BP_TMP/patscan.pat"; - $out_file = "$BP_TMP/patscan.out"; - - $fh_out = Maasha::Common::write_open( $seq_file ); - - $i = 0; - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "SEQ" } and $record->{ "SEQ_NAME" } ) - { - $type = Maasha::Seq::seq_guess_type( $record->{ "SEQ" } ) if not $type; - - Maasha::Fasta::put_entry( [ $i, $record->{ "SEQ" } ], $fh_out ); - - $head_hash{ $i } = $record->{ "SEQ_NAME" }; - - $i++; - } - } - - close $fh_out; - - $arg = join " ", @args; - $arg .= " -p" if $type eq "protein"; - - foreach $pattern ( @{ $patterns } ) - { - $fh_out = Maasha::Common::write_open( $pat_file ); - - print $fh_out "$pattern\n"; - - close $fh_out; - - if ( $options->{ 'genome' } ) { - `scan_for_matches $arg $pat_file < $genome_file > $out_file`; - # Maasha::Common::run( "scan_for_matches", "$arg $pat_file < $genome_file > $out_file" ); - } else { - `scan_for_matches $arg $pat_file < $seq_file > $out_file`; - # Maasha::Common::run( "scan_for_matches", "$arg $pat_file < $seq_file > $out_file" ); - } - - $fh_in = Maasha::Common::read_open( $out_file ); - - while ( $entry = Maasha::Fasta::get_entry( $fh_in ) ) - { - $result = Maasha::Patscan::parse_scan_result( $entry, $pattern ); - - if ( $options->{ 'genome' } ) - { - $result->{ "CHR" } = $result->{ "S_ID" }; - $result->{ "CHR_BEG" } = $result->{ "S_BEG" }; - $result->{ "CHR_END" } = $result->{ "S_END" }; - - delete $result->{ "S_ID" }; - delete $result->{ "S_BEG" }; - delete $result->{ "S_END" }; - } - else - { - $result->{ "S_ID" } = $head_hash{ $result->{ "S_ID" } }; - } - - put_record( $result, $out ); - } - - close $fh_in; - } - - unlink $pat_file; - unlink $seq_file; - unlink $out_file; -} - - -sub script_create_blast_db -{ - # Martin A. Hansen, September 2007. - - # Creates a NCBI BLAST database with formatdb - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $fh, $seq_type, $path, $record, $entry ); - - $path = $options->{ "database" }; - - $fh = Maasha::Common::write_open( $path ); - - while ( $record = get_record( $in ) ) - { - put_record( $record, $out ) if not $options->{ "no_stream" }; - - if ( $entry = record2fasta( $record ) ) - { - $seq_type = Maasha::Seq::seq_guess_type( $entry->[ SEQ ] ) if not $seq_type; - - Maasha::Fasta::put_entry( $entry, $fh ); - } - } - - close $fh; - - if ( $seq_type eq "protein" ) { - Maasha::Common::run( "formatdb", "-p T -i $path -t $options->{ 'database' }" ); - } else { - Maasha::Common::run( "formatdb", "-p F -i $path -t $options->{ 'database' }" ); - } - - unlink $path; -} - - -sub script_blast_seq -{ - # Martin A. Hansen, September 2007. - - # BLASTs sequences in stream against a given database. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $genome, $q_type, $s_type, $tmp_in, $tmp_out, $fh_in, $fh_out, $record, $line, @fields, $entry ); - - $options->{ "e_val" } = 10 if not defined $options->{ "e_val" }; - $options->{ "filter" } = "F"; - $options->{ "filter" } = "T" if $options->{ "filter" }; - $options->{ "cpus" } ||= 1; - - $options->{ "database" } = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/blast/$options->{ 'genome' }.fna" if $options->{ 'genome' }; - - $tmp_in = "$BP_TMP/blast_query.seq"; - $tmp_out = "$BP_TMP/blast.result"; - - $fh_out = Maasha::Common::write_open( $tmp_in ); - - while ( $record = get_record( $in ) ) - { - if ( $entry = record2fasta( $record ) ) - { - $q_type = Maasha::Seq::seq_guess_type( $entry->[ SEQ ] ) if not $q_type; - - Maasha::Fasta::put_entry( $entry, $fh_out ); - } - - put_record( $record, $out ); - } - - close $fh_out; - - if ( -f $options->{ 'database' } . ".phr" ) { - $s_type = "protein"; - } else { - $s_type = "nucleotide"; - } - - if ( not $options->{ 'program' } ) - { - if ( $q_type ne "protein" and $s_type ne "protein" ) { - $options->{ 'program' } = "blastn"; - } elsif ( $q_type eq "protein" and $s_type eq "protein" ) { - $options->{ 'program' } = "blastp"; - } elsif ( $q_type ne "protein" and $s_type eq "protein" ) { - $options->{ 'program' } = "blastx"; - } elsif ( $q_type eq "protein" and $s_type ne "protein" ) { - $options->{ 'program' } = "tblastn"; - } - } - - if ( $options->{ 'verbose' } ) - { - Maasha::Common::run( - "blastall", - join( " ", - "-p $options->{ 'program' }", - "-e $options->{ 'e_val' }", - "-a $options->{ 'cpus' }", - "-m 8", - "-i $tmp_in", - "-d $options->{ 'database' }", - "-F $options->{ 'filter' }", - "-o $tmp_out", - ), - 1 - ); - } - else - { - Maasha::Common::run( - "blastall", - join( " ", - "-p $options->{ 'program' }", - "-e $options->{ 'e_val' }", - "-a $options->{ 'cpus' }", - "-m 8", - "-i $tmp_in", - "-d $options->{ 'database' }", - "-F $options->{ 'filter' }", - "-o $tmp_out", - "> /dev/null 2>&1" - ), - 1 - ); - } - - unlink $tmp_in; - - $fh_out = Maasha::Common::read_open( $tmp_out ); - - undef $record; - - while ( $line = <$fh_out> ) - { - chomp $line; - - next if $line =~ /^#/; - - @fields = split /\s+/, $line; - - $record->{ "REC_TYPE" } = "BLAST"; - $record->{ "Q_ID" } = $fields[ 0 ]; - $record->{ "S_ID" } = $fields[ 1 ]; - $record->{ "IDENT" } = $fields[ 2 ]; - $record->{ "ALIGN_LEN" } = $fields[ 3 ]; - $record->{ "MISMATCHES" } = $fields[ 4 ]; - $record->{ "GAPS" } = $fields[ 5 ]; - $record->{ "Q_BEG" } = $fields[ 6 ] - 1; # BLAST is 1-based - $record->{ "Q_END" } = $fields[ 7 ] - 1; # BLAST is 1-based - $record->{ "S_BEG" } = $fields[ 8 ] - 1; # BLAST is 1-based - $record->{ "S_END" } = $fields[ 9 ] - 1; # BLAST is 1-based - $record->{ "E_VAL" } = $fields[ 10 ]; - $record->{ "BIT_SCORE" } = $fields[ 11 ]; - - if ( $record->{ "S_BEG" } > $record->{ "S_END" } ) - { - $record->{ "STRAND" } = '-'; - - ( $record->{ "S_BEG" }, $record->{ "S_END" } ) = ( $record->{ "S_END" }, $record->{ "S_BEG" } ); - } - else - { - $record->{ "STRAND" } = '+'; - } - - put_record( $record, $out ); - } - - close $fh_out; - - unlink $tmp_out; -} - - -sub script_blat_seq -{ - # Martin A. Hansen, August 2007. - - # BLATs sequences in stream against a given genome. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $blat_args, $genome_file, $query_file, $fh_in, $fh_out, $type, $record, $result_file, $entries, $entry ); - - $genome_file = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/fasta/$options->{ 'genome' }.fna"; - - $options->{ 'tile_size' } ||= 11; - $options->{ 'one_off' } ||= 0; - $options->{ 'min_identity' } ||= 90; - $options->{ 'min_score' } ||= 0; - $options->{ 'step_size' } ||= $options->{ 'tile_size' }; - - $blat_args .= " -tileSize=$options->{ 'tile_size' }"; - $blat_args .= " -oneOff=$options->{ 'one_off' }"; - $blat_args .= " -minIdentity=$options->{ 'min_identity' }"; - $blat_args .= " -minScore=$options->{ 'min_score' }"; - $blat_args .= " -stepSize=$options->{ 'step_size' }"; -# $blat_args .= " -ooc=" . Maasha::Config::genome_blat_ooc( $options->{ "genome" }, 11 ) if $options->{ 'ooc' }; - - $query_file = "$BP_TMP/blat.seq"; - - $fh_out = Maasha::Common::write_open( $query_file ); - - while ( $record = get_record( $in ) ) - { - if ( $entry = record2fasta( $record ) ) - { - $type = Maasha::Seq::seq_guess_type( $entry->[ SEQ ] ) if not $type; - Maasha::Fasta::put_entry( $entry, $fh_out, 80 ); - } - - put_record( $record, $out ); - } - - close $fh_out; - - $blat_args .= " -t=dnax" if $type eq "protein"; - $blat_args .= " -q=$type"; - - $result_file = "$BP_TMP/blat.psl"; - - Maasha::Common::run( "blat", "$genome_file $query_file $blat_args $result_file > /dev/null 2>&1" ); - - unlink $query_file; - - $entries = Maasha::UCSC::psl_get_entries( $result_file ); - - map { put_record( $_, $out ) } @{ $entries }; - - unlink $result_file; -} - - -sub script_soap_seq -{ - # Martin A. Hansen, July 2008. - - # soap sequences in stream against a given file or genome. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $genome, $tmp_in, $tmp_out, $fh_in, $fh_out, $record, $line, @fields, $entry, $count, $args ); - - $options->{ "seed_size" } ||= 10; - $options->{ "mismatches" } ||= 2; - $options->{ "gap_size" } ||= 0; - $options->{ "cpus" } ||= 1; - - if ( $options->{ "genome" } ) { - $options->{ "in_file" } = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/fasta/$options->{ 'genome' }.fna"; - } - - $tmp_in = "$BP_TMP/soap_query.seq"; - $tmp_out = "$BP_TMP/soap.result"; - - $fh_out = Maasha::Common::write_open( $tmp_in ); - - $count = 0; - - while ( $record = get_record( $in ) ) - { - if ( $entry = record2fasta( $record ) ) - { - Maasha::Fasta::put_entry( $entry, $fh_out ); - - $count++; - } - - put_record( $record, $out ); - } - - close $fh_out; - - if ( $count > 0 ) - { - $args = join( " ", - "-s $options->{ 'seed_size' }", - "-r 2", - "-a $tmp_in", - "-v $options->{ 'mismatches' }", - "-g $options->{ 'gap_size' }", - "-p $options->{ 'cpus' }", - "-d $options->{ 'in_file' }", - "-o $tmp_out", - ); - - $args .= " > /dev/null 2>&1" if not $options->{ 'verbose' }; - - Maasha::Common::run( "soap", $args, 1 ); - - unlink $tmp_in; - - $fh_out = Maasha::Common::read_open( $tmp_out ); - - undef $record; - - while ( $line = <$fh_out> ) - { - chomp $line; - - @fields = split /\t/, $line; - - $record->{ "REC_TYPE" } = "SOAP"; - $record->{ "Q_ID" } = $fields[ 0 ]; - $record->{ "SCORE" } = $fields[ 3 ]; - $record->{ "STRAND" } = $fields[ 6 ]; - $record->{ "S_ID" } = $fields[ 7 ]; - $record->{ "S_BEG" } = $fields[ 8 ] - 1; # soap is 1-based - $record->{ "S_END" } = $fields[ 8 ] + $fields[ 5 ] - 2; - - put_record( $record, $out ); - } - - close $fh_out; - } - - unlink $tmp_out; -} - - -sub script_match_seq -{ - # Martin A. Hansen, August 2007. - - # BLATs sequences in stream against a given genome. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, @entries, $results ); - - $options->{ "word_size" } ||= 20; - $options->{ "direction" } ||= "both"; - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) { - push @entries, [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ]; - } - - put_record( $record, $out ); - } - - if ( @entries == 1 ) - { - $results = Maasha::Match::match_mummer( [ $entries[ 0 ] ], [ $entries[ 0 ] ], $options, $BP_TMP ); - - map { put_record( $_, $out ) } @{ $results }; - } - elsif ( @entries == 2 ) - { - $results = Maasha::Match::match_mummer( [ $entries[ 0 ] ], [ $entries[ 1 ] ], $options, $BP_TMP ); - - map { put_record( $_, $out ) } @{ $results }; - } -} - - -sub script_create_vmatch_index -{ - # Martin A. Hansen, January 2008. - - # Create a vmatch index from sequences in the stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $file_tmp, $fh_tmp, $type, $entry ); - - if ( $options->{ "index_name" } ) - { - $file_tmp = $options->{ 'index_name' }; - $fh_tmp = Maasha::Common::write_open( $file_tmp ); - } - - while ( $record = get_record( $in ) ) - { - if ( $options->{ "index_name" } and $entry = record2fasta( $record ) ) - { - Maasha::Fasta::put_entry( $entry, $fh_tmp ); - - $type = Maasha::Seq::seq_guess_type( $entry->[ SEQ ] ) if not defined $type; - } - - put_record( $record, $out ) if not $options->{ "no_stream" }; - } - - if ( $options->{ "index_name" } ) - { - close $fh_tmp; - - if ( $type eq "protein" ) { - Maasha::Common::run( "mkvtree", "-db $file_tmp -protein -pl $options->{ 'prefix_length' } -allout -indexname $file_tmp > /dev/null 2>&1" ); - } else { - Maasha::Common::run( "mkvtree", "-db $file_tmp -dna -pl $options->{ 'prefix_length' } -allout -indexname $file_tmp > /dev/null 2>&1" ); - } - - unlink $file_tmp; - } -} - - -sub script_vmatch_seq -{ - # Martin A. Hansen, August 2007. - - # Vmatches sequences in stream against a given genome. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( @index_files, @records, $result_file, $fh_in, $record, %hash ); - - $options->{ 'count' } = 1 if $options->{ 'max_hits' }; - - if ( $options->{ "index_name" } ) - { - @index_files = $options->{ "index_name" }; - } - else - { - @index_files = Maasha::Common::ls_files( "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/vmatch" ); - - map { $_ =~ /^(.+)\.[a-z1]{3,4}$/; $hash{ $1 } = 1 } @index_files; - - @index_files = sort keys %hash; - } - - while ( $record = get_record( $in ) ) - { - push @records, $record; - - put_record( $record, $out ); - } - - $result_file = Maasha::Match::match_vmatch( $BP_TMP, \@records, \@index_files, $options ); - - undef @records; - - $fh_in = Maasha::Common::read_open( $result_file ); - - while ( $record = Maasha::Match::vmatch_get_entry( $fh_in ) ) { - put_record( $record, $out ); - } - - close $fh_in; - - unlink $result_file; -} - - -sub script_write_fasta -{ - # Martin A. Hansen, August 2007. - - # Write FASTA entries from sequences in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $fh, $entry ); - - $fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } ); - - while ( $record = get_record( $in ) ) - { - if ( $entry = record2fasta( $record ) ) { - Maasha::Fasta::put_entry( $entry, $fh, $options->{ "wrap" } ); - } - - put_record( $record, $out ) if not $options->{ "no_stream" }; - } - - close $fh; -} - - -sub script_write_align -{ - # Martin A. Hansen, August 2007. - - # Write pretty alignments aligned sequences in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $fh, $record, @entries ); - - $fh = write_stream( $options->{ "data_out" } ) ; - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) { - push @entries, [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ]; - } - - put_record( $record, $out ) if not $options->{ "no_stream" }; - } - - if ( scalar( @entries ) == 2 ) { - Maasha::Align::align_print_pairwise( $entries[ 0 ], $entries[ 1 ], $fh, $options->{ "wrap" } ); - } elsif ( scalar ( @entries ) > 2 ) { - Maasha::Align::align_print_multi( \@entries, $fh, $options->{ "wrap" }, $options->{ "no_ruler" }, $options->{ "no_consensus" } ); - } - - close $fh if $fh; -} - - -sub script_write_blast -{ - # Martin A. Hansen, November 2007. - - # Write data in blast table format (-m8 and 9). - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $fh, $record, $first ); - - $fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } ) ; - - $first = 1; - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "REC_TYPE" } eq "BLAST" ) - { - if ( $options->{ "comment" } and $first ) - { - print "# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score\n"; - - $first = 0; - } - - if ( $record->{ "STRAND" } eq "-" ) { - ( $record->{ "S_BEG" }, $record->{ "S_END" } ) = ( $record->{ "S_END" }, $record->{ "S_BEG" } ); - } - - print $fh join( "\t", - $record->{ "Q_ID" }, - $record->{ "S_ID" }, - $record->{ "IDENT" }, - $record->{ "ALIGN_LEN" }, - $record->{ "MISMATCHES" }, - $record->{ "GAPS" }, - $record->{ "Q_BEG" } + 1, - $record->{ "Q_END" } + 1, - $record->{ "S_BEG" } + 1, - $record->{ "S_END" } + 1, - $record->{ "E_VAL" }, - $record->{ "BIT_SCORE" } - ), "\n"; - } - - put_record( $record, $out ) if not $options->{ "no_stream" }; - } - - close $fh; -} - - -sub script_write_tab -{ - # Martin A. Hansen, August 2007. - - # Write data as table. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $fh, $record, $key, @keys, @vals, $ok, %no_keys, $A, $B ); - - $options->{ "delimit" } ||= "\t"; - - map { $no_keys{ $_ } = 1 } @{ $options->{ "no_keys" } }; - - $fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } ); - - while ( $record = get_record( $in ) ) - { - undef @vals; - $ok = 1; - - if ( $options->{ "keys" } ) - { - map { $ok = 0 if not exists $record->{ $_ } } @{ $options->{ "keys" } }; - - if ( $ok ) - { - foreach $key ( @{ $options->{ "keys" } } ) - { - if ( exists $record->{ $key } ) - { - push @keys, $key if $options->{ "comment" }; - push @vals, $record->{ $key }; - } - } - } - } - else - { - foreach $key ( sort { $A = $a; $B = $b; $A =~ s/^V(\d+)$/$1/; $B =~ s/^V(\d+)$/$1/; $A <=> $B } keys %{ $record } ) - { - next if exists $no_keys{ $key }; - - push @keys, $key if $options->{ "comment" }; - push @vals, $record->{ $key }; - } - } - - if ( @keys and $options->{ "comment" } ) - { - print $fh "#", join( $options->{ "delimit" }, @keys ), "\n"; - - delete $options->{ "comment" }; - } - - print $fh join( $options->{ "delimit" }, @vals ), "\n" if @vals; - - put_record( $record, $out ) if not $options->{ "no_stream" }; - } - - close $fh; -} - - -sub script_write_bed -{ - # Martin A. Hansen, August 2007. - - # Write BED format for the UCSC genome browser using records in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $cols, $fh, $record, $new_record ); - - $cols = $options->{ 'cols' }->[ 0 ]; - - $fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } ); - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "REC_TYPE" } eq "BED" ) # ---- Hits from BED ---- - { - $cols ||= $record->{ "BED_COLS" }; - - Maasha::UCSC::bed_put_entry( $record, $fh, $cols ); - } - elsif ( $record->{ "REC_TYPE" } eq "PSL" and $record->{ "S_ID" } =~ /^chr/i ) # ---- Hits from BLAT (PSL) ---- - { - $new_record->{ "CHR" } = $record->{ "S_ID" }; - $new_record->{ "CHR_BEG" } = $record->{ "S_BEG" }; - $new_record->{ "CHR_END" } = $record->{ "S_END" }; - $new_record->{ "Q_ID" } = $record->{ "Q_ID" }; - $new_record->{ "SCORE" } = $record->{ "SCORE" } || 999; - $new_record->{ "STRAND" } = $record->{ "STRAND" }; - - $cols ||= 6; - - Maasha::UCSC::bed_put_entry( $new_record, $fh, $cols ); - } - elsif ( $record->{ "REC_TYPE" } eq "PATSCAN" and $record->{ "CHR" } ) # ---- Hits from patscan_seq ---- - { - $cols ||= 6; - - Maasha::UCSC::bed_put_entry( $record, $fh, $cols ); - } - elsif ( $record->{ "REC_TYPE" } eq "BLAST" and $record->{ "S_ID" } =~ /^chr/i ) # ---- Hits from BLAST ---- - { - $new_record->{ "CHR" } = $record->{ "S_ID" }; - $new_record->{ "CHR_BEG" } = $record->{ "S_BEG" }; - $new_record->{ "CHR_END" } = $record->{ "S_END" }; - $new_record->{ "Q_ID" } = $record->{ "Q_ID" }; - $new_record->{ "SCORE" } = $record->{ "SCORE" } || 999; # or use E_VAL somehow - $new_record->{ "STRAND" } = $record->{ "STRAND" }; - - $cols ||= 6; - - Maasha::UCSC::bed_put_entry( $new_record, $fh, $cols ); - } - elsif ( $record->{ "REC_TYPE" } eq "VMATCH" and $record->{ "S_ID" } =~ /^chr/i ) # ---- Hits from Vmatch ---- - { - $new_record->{ "CHR" } = $record->{ "S_ID" }; - $new_record->{ "CHR_BEG" } = $record->{ "S_BEG" }; - $new_record->{ "CHR_END" } = $record->{ "S_END" }; - $new_record->{ "Q_ID" } = $record->{ "Q_ID" }; - $new_record->{ "SCORE" } = $record->{ "SCORE" } || 999; # or use E_VAL somehow - $new_record->{ "STRAND" } = $record->{ "STRAND" }; - - $cols ||= 6; - - Maasha::UCSC::bed_put_entry( $new_record, $fh, $cols ); - } - elsif ( $record->{ "REC_TYPE" } eq "SOAP" and $record->{ "S_ID" } =~ /^chr/i ) # ---- Hits from Soap ---- - { - $new_record->{ "CHR" } = $record->{ "S_ID" }; - $new_record->{ "CHR_BEG" } = $record->{ "S_BEG" }; - $new_record->{ "CHR_END" } = $record->{ "S_END" }; - $new_record->{ "Q_ID" } = $record->{ "Q_ID" }; - $new_record->{ "SCORE" } = $record->{ "SCORE" } || 999; - $new_record->{ "STRAND" } = $record->{ "STRAND" }; - - $cols ||= 6; - - Maasha::UCSC::bed_put_entry( $new_record, $fh, $cols ); - } - elsif ( $record->{ 'tBaseInsert' } ) # ---- Dirty addition to allow Affy data from MySQL to be dumped ---- - { - $record = Maasha::UCSC::psl2record( $record ); - - $new_record = $record; - $new_record->{ "CHR" } = $record->{ "S_ID" }; - $new_record->{ "CHR_BEG" } = $record->{ "S_BEG" }; - $new_record->{ "CHR_END" } = $record->{ "S_END" }; - $new_record->{ "Q_ID" } = $record->{ "Q_ID" }; - $new_record->{ "SCORE" } = $record->{ "SCORE" } || 999; - $new_record->{ "STRAND" } = $record->{ "STRAND" }; - - Maasha::UCSC::bed_put_entry( $new_record, $fh, $cols ); - } - elsif ( $record->{ "CHR" } and defined $record->{ "CHR_BEG" } and $record->{ "CHR_END" } ) # ---- Generic data from tables ---- - { - Maasha::UCSC::bed_put_entry( $record, $fh, $cols ); - } - - put_record( $record, $out ) if not $options->{ "no_stream" }; - } - - close $fh; -} - - -sub script_write_psl -{ - # Martin A. Hansen, August 2007. - - # Write PSL output from stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $fh, $record, @output, $first ); - - $first = 1; - - $fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } ); - - while ( $record = get_record( $in ) ) - { - put_record( $record, $out ) if not $options->{ "no_stream" }; - - if ( $record->{ "REC_TYPE" } and $record->{ "REC_TYPE" } eq "PSL" ) - { - Maasha::UCSC::psl_put_header( $fh ) if $first; - Maasha::UCSC::psl_put_entry( $record, $fh ); - $first = 0; - } - } - - close $fh; -} - - -sub script_write_fixedstep -{ - # Martin A. Hansen, Juli 2008. - - # Write fixedStep entries from recrods in the stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $fh, $record, $vals ); - - $fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } ); - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "CHR" } and $record->{ "CHR_BEG" } and $record->{ "STEP" } and $record->{ "VALS" } ) - { - print $fh "fixedStep chrom=$record->{ 'CHR' } start=$record->{ 'CHR_BEG' } step=$record->{ 'STEP' }\n"; - - $vals = $record->{ 'VALS' }; - - $vals =~ tr/;/\n/; - - print $fh "$vals\n"; - } - - put_record( $record, $out ) if not $options->{ "no_stream" }; - } - - close $fh; -} - - -sub script_write_2bit -{ - # Martin A. Hansen, March 2008. - - # Write sequence entries from stream in 2bit format. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $mask, $tmp_file, $fh_tmp, $fh_in, $fh_out, $entry ); - - $mask = 1 if not $options->{ "no_mask" }; - - $tmp_file = "$BP_TMP/write_2bit.fna"; - $fh_tmp = Maasha::Common::write_open( $tmp_file ); - - $fh_out = write_stream( $options->{ "data_out" } ); - - while ( $record = get_record( $in ) ) - { - if ( $entry = record2fasta( $record ) ) { - Maasha::Fasta::put_entry( $entry, $fh_tmp ); - } - - put_record( $record, $out ) if not $options->{ "no_stream" }; - } - - close $fh_tmp; - - $fh_in = Maasha::Common::read_open( $tmp_file ); - - Maasha::TwoBit::fasta2twobit( $fh_in, $fh_out, $mask ); - - close $fh_in; - close $fh_out; - - unlink $tmp_file; -} - - -sub script_write_solid -{ - # Martin A. Hansen, April 2008. - - # Write di-base encoded Solid sequence from entries in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $fh, $entry ); - - $fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } ); - - while ( $record = get_record( $in ) ) - { - if ( $entry = record2fasta( $record ) ) - { - $entry->[ SEQ ] = Maasha::Solid::seq2color_space( uc $entry->[ SEQ ] ); - - Maasha::Fasta::put_entry( $entry, $fh, $options->{ "wrap" } ); - } - - put_record( $record, $out ) if not $options->{ "no_stream" }; - } - - close $fh; -} - - -sub script_write_ucsc_config -{ - # Martin A. Hansen, November 2008. - - # Write UCSC Genome Broser configuration (.ra file type) from - # records in the stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $fh ); - - $fh = write_stream( $options->{ "data_out" } ); - - while ( $record = get_record( $in ) ) - { - Maasha::UCSC::ucsc_config_put_entry( $record, $fh ) if $record->{ "REC_TYPE" } eq "UCSC Config"; - - put_record( $record, $out ) if not $options->{ "no_stream" }; - } - - close $fh; -} - - -sub script_plot_seqlogo -{ - # Martin A. Hansen, August 2007. - - # Calculates and writes a sequence logo for alignments. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, @entries, $logo, $fh ); - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) { - push @entries, [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ]; - } - - put_record( $record, $out ) if not $options->{ "no_stream" }; - } - - $logo = Maasha::Plot::seq_logo( \@entries ); - - $fh = write_stream( $options->{ "data_out" } ); - - print $fh $logo; - - close $fh; -} - - -sub script_plot_phastcons_profiles -{ - # Martin A. Hansen, January 2008. - - # Plots PhastCons profiles. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $phastcons_file, $phastcons_index, $index, $fh_phastcons, $record, $scores, $AoA, $plot, $fh ); - - $options->{ "title" } ||= "PhastCons Profiles"; - - $phastcons_file = Maasha::Config::genome_phastcons( $options->{ "genome" } ); - $phastcons_index = Maasha::Config::genome_phastcons_index( $options->{ "genome" } ); - - $index = Maasha::UCSC::fixedstep_index_retrieve( $phastcons_index ); - $fh_phastcons = Maasha::Common::read_open( $phastcons_file ); - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "CHR" } and $record->{ "CHR_BEG" } and $record->{ "CHR_END" } ) - { - $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "CHR" }, - $record->{ "CHR_BEG" }, - $record->{ "CHR_END" }, - $options->{ "flank" } ); - - push @{ $AoA }, [ @{ $scores } ]; - } - - put_record( $record, $out ) if not $options->{ "no_stream" }; - } - - Maasha::UCSC::phastcons_normalize( $AoA ); - - $AoA = [ [ Maasha::UCSC::phastcons_mean( $AoA ) ] ] if $options->{ "mean" }; - $AoA = [ [ Maasha::UCSC::phastcons_median( $AoA ) ] ] if $options->{ "median" }; - - $AoA = Maasha::Matrix::matrix_flip( $AoA ); - - $plot = Maasha::Plot::lineplot_simple( $AoA, $options, $BP_TMP ); - - $fh = write_stream( $options->{ "data_out" } ); - - print $fh "$_\n" foreach @{ $plot }; - - close $fh; -} - - -sub script_analyze_bed -{ - # Martin A. Hansen, March 2008. - - # Analyze BED entries in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record ); - - while ( $record = get_record( $in ) ) - { - $record = Maasha::UCSC::bed_analyze( $record ) if $record->{ "REC_TYPE" } eq "BED"; - - put_record( $record, $out ); - } -} - - -sub script_analyze_vals -{ - # Martin A. Hansen, August 2007. - - # Analyze values for given keys in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $key, @keys, %key_hash, $analysis, $len ); - - map { $key_hash{ $_ } = 1 } @{ $options->{ "keys" } }; - - while ( $record = get_record( $in ) ) - { - foreach $key ( keys %{ $record } ) - { - next if $options->{ "keys" } and not exists $key_hash{ $key }; - - $analysis->{ $key }->{ "COUNT" }++; - - if ( Maasha::Calc::is_a_number( $record->{ $key } ) ) - { - $analysis->{ $key }->{ "TYPE" } = "num"; - $analysis->{ $key }->{ "SUM" } += $record->{ $key }; - $analysis->{ $key }->{ "MAX" } = $record->{ $key } if $record->{ $key } > $analysis->{ $key }->{ "MAX" } or not $analysis->{ $key }->{ "MAX" }; - $analysis->{ $key }->{ "MIN" } = $record->{ $key } if $record->{ $key } < $analysis->{ $key }->{ "MIN" } or not $analysis->{ $key }->{ "MIN" }; - } - else - { - $len = length $record->{ $key }; - - $analysis->{ $key }->{ "TYPE" } = "alph"; - $analysis->{ $key }->{ "SUM" } += $len; - $analysis->{ $key }->{ "MAX" } = $len if $len > $analysis->{ $key }->{ "MAX" } or not $analysis->{ $key }->{ "MAX" }; - $analysis->{ $key }->{ "MIN" } = $len if $len < $analysis->{ $key }->{ "MIM" } or not $analysis->{ $key }->{ "MIN" }; - } - } - - put_record( $record, $out ) if not $options->{ "no_stream" }; - } - - foreach $key ( keys %{ $analysis } ) - { - $analysis->{ $key }->{ "MEAN" } = sprintf "%.2f", $analysis->{ $key }->{ "SUM" } / $analysis->{ $key }->{ "COUNT" }; - $analysis->{ $key }->{ "SUM" } = sprintf "%.2f", $analysis->{ $key }->{ "SUM" }; - } - - my ( $keys, $types, $counts, $mins, $maxs, $sums, $means ); - - $keys = "KEY "; - $types = "TYPE "; - $counts = "COUNT"; - $mins = "MIN "; - $maxs = "MAX "; - $sums = "SUM "; - $means = "MEAN "; - - if ( $options->{ "keys" } ) { - @keys = @{ $options->{ "keys" } }; - } else { - @keys = keys %{ $analysis }; - } - - foreach $key ( @keys ) - { - $keys .= sprintf "% 15s", $key; - $types .= sprintf "% 15s", $analysis->{ $key }->{ "TYPE" }; - $counts .= sprintf "% 15s", $analysis->{ $key }->{ "COUNT" }; - $mins .= sprintf "% 15s", $analysis->{ $key }->{ "MIN" }; - $maxs .= sprintf "% 15s", $analysis->{ $key }->{ "MAX" }; - $sums .= sprintf "% 15s", $analysis->{ $key }->{ "SUM" }; - $means .= sprintf "% 15s", $analysis->{ $key }->{ "MEAN" }; - } - - print $out "$keys\n"; - print $out "$types\n"; - print $out "$counts\n"; - print $out "$mins\n"; - print $out "$maxs\n"; - print $out "$sums\n"; - print $out "$means\n"; -} - - -sub script_head_records -{ - # Martin A. Hansen, August 2007. - - # Display the first sequences in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $count ); - - $options->{ "num" } ||= 10; - - $count = 0; - - while ( $record = get_record( $in ) ) - { - $count++; - - put_record( $record, $out ); - - last if $count == $options->{ "num" }; - } -} - - -sub script_remove_keys -{ - # Martin A. Hansen, August 2007. - - # Remove keys from stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $new_record ); - - while ( $record = get_record( $in ) ) - { - if ( $options->{ "keys" } ) - { - map { delete $record->{ $_ } } @{ $options->{ "keys" } }; - } - elsif ( $options->{ "save_keys" } ) - { - map { $new_record->{ $_ } = $record->{ $_ } if exists $record->{ $_ } } @{ $options->{ "save_keys" } }; - - $record = $new_record; - } - - put_record( $record, $out ) if keys %{ $record }; - } -} - - -sub script_remove_adaptor -{ - # Martin A. Hansen, August 2008. - - # Find and remove adaptor from sequences in the stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $adaptor, $seq, $adaptor_len, $seq_len, $offset, $max_match, $max_mismatch, $pos ); - - $options->{ "remove" } ||= "after"; - - $max_mismatch = $options->{ "mismatches" } || 0; - $offset = $options->{ "offset" }; - - if ( not defined $offset ) { - $offset = 0; - } else { - $offset--; - } - - $adaptor = uc $options->{ "adaptor" }; - $adaptor_len = length $adaptor; - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "SEQ" } ) - { - $seq = uc $record->{ "SEQ" }; - $seq_len = length $seq; - - $pos = Maasha::Common::index_m( $seq, $adaptor, $seq_len, $adaptor_len, $offset, $max_mismatch ); - - $record->{ "ADAPTOR_POS" } = $pos; - - if ( $pos >= 0 and $options->{ "remove" } ne "skip" ) - { - if ( $options->{ "remove" } eq "after" ) - { - $record->{ "SEQ" } = substr $record->{ "SEQ" }, 0, $pos; - $record->{ "SEQ_LEN" } = $pos; - } - else - { - $record->{ "SEQ" } = substr $record->{ "SEQ" }, $pos + $adaptor_len; - $record->{ "SEQ_LEN" } = length $record->{ "SEQ" }; - } - } - - put_record( $record, $out ); - } - else - { - put_record( $record, $out ); - } - } -} - - -sub script_remove_mysql_tables -{ - # Martin A. Hansen, November 2008. - - # Remove MySQL tables from values in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, %table_hash, $dbh, $table ); - - $options->{ "user" } ||= Maasha::UCSC::ucsc_get_user(); - $options->{ "password" } ||= Maasha::UCSC::ucsc_get_password(); - - map { $table_hash{ $_ } = 1 } @{ $options->{ 'tables' } }; - - while ( $record = get_record( $in ) ) - { - map { $table_hash{ $record->{ $_ } } = 1 } @{ $options->{ 'keys' } }; - - put_record( $record, $out ) if not $options->{ 'no_stream' }; - } - - $dbh = Maasha::SQL::connect( $options->{ "database" }, $options->{ "user" }, $options->{ "password" } ); - - foreach $table ( sort keys %table_hash ) - { - if ( Maasha::SQL::table_exists( $dbh, $table ) ) - { - print STDERR qq(Removing table "$table" from database "$options->{ 'database' }" ... ) if $options->{ 'verbose' }; - Maasha::SQL::delete_table( $dbh, $table ); - print STDERR "done.\n" if $options->{ 'verbose' }; - } - else - { - print STDERR qq(WARNING: table "$table" not found in database "$options->{ 'database' }\n"); - } - } - - Maasha::SQL::disconnect( $dbh ); -} - - -sub script_remove_ucsc_tracks -{ - # Martin A. Hansen, November 2008. - - # Remove track from MySQL tables and config file. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, %track_hash, $fh_in, $fh_out, $track, @tracks, @new_tracks, $dbh ); - - $options->{ 'user' } ||= Maasha::UCSC::ucsc_get_user(); - $options->{ 'password' } ||= Maasha::UCSC::ucsc_get_password(); - $options->{ 'config_file' } ||= "$ENV{ 'HOME' }/ucsc/my_tracks.ra"; - - map { $track_hash{ $_ } = 1 } @{ $options->{ 'tracks' } }; - - while ( $record = get_record( $in ) ) - { - map { $track_hash{ $record->{ $_ } } = 1 } @{ $options->{ 'keys' } }; - - put_record( $record, $out ) if not $options->{ 'no_stream' }; - } - - # ---- locate track in config file ---- - - $fh_in = Maasha::Common::read_open( $options->{ 'config_file' } ); - - while ( $track = Maasha::UCSC::ucsc_config_get_entry( $fh_in ) ) { - push @tracks, $track; - } - - close $fh_in; - - map { push @new_tracks, $_ if not exists $track_hash{ $_->{ 'track' } } } @tracks; - - print STDERR qq(WARNING: track not found in config file: "$options->{ 'config_file' }"\n) if scalar @tracks == scalar @new_tracks; - - rename "$options->{ 'config_file' }", "$options->{ 'config_file' }~"; - - $fh_out = Maasha::Common::write_open( $options->{ 'config_file' } ); - - map { Maasha::UCSC::ucsc_config_put_entry( $_, $fh_out ) } @new_tracks; - - close $fh_out; - - # ---- locate track in database ---- - - $dbh = Maasha::SQL::connect( $options->{ "database" }, $options->{ "user" }, $options->{ "password" } ); - - foreach $track ( sort keys %track_hash ) - { - if ( Maasha::SQL::table_exists( $dbh, $track ) ) - { - print STDERR qq(Removing table "$track" from database "$options->{ 'database' }" ... ) if $options->{ 'verbose' }; - Maasha::SQL::delete_table( $dbh, $track ); - print STDERR "done.\n" if $options->{ 'verbose' }; - } - else - { - print STDERR qq(WARNING: table "$track" not found in database "$options->{ 'database' }\n"); - } - } - - Maasha::SQL::disconnect( $dbh ); - - Maasha::Common::run( "ucscMakeTracks.pl", "-b > /dev/null 2>&1" ); -} - - -sub script_rename_keys -{ - # Martin A. Hansen, August 2007. - - # Rename keys in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record ); - - while ( $record = get_record( $in ) ) - { - if ( exists $record->{ $options->{ "keys" }->[ 0 ] } ) - { - $record->{ $options->{ "keys" }->[ 1 ] } = $record->{ $options->{ "keys" }->[ 0 ] }; - - delete $record->{ $options->{ "keys" }->[ 0 ] }; - } - - put_record( $record, $out ); - } -} - - -sub script_uniq_vals -{ - # Martin A. Hansen, August 2007. - - # Find unique values in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( %hash, $record ); - - while ( $record = get_record( $in ) ) - { - if ( $record->{ $options->{ "key" } } ) - { - if ( not $hash{ $record->{ $options->{ "key" } } } and not $options->{ "invert" } ) - { - put_record( $record, $out ); - - $hash{ $record->{ $options->{ "key" } } } = 1; - } - elsif ( $hash{ $record->{ $options->{ "key" } } } and $options->{ "invert" } ) - { - put_record( $record, $out ); - } - else - { - $hash{ $record->{ $options->{ "key" } } } = 1; - } - } - else - { - put_record( $record, $out ); - } - } -} - - -sub script_merge_vals -{ - # Martin A. Hansen, August 2007. - - # Rename keys in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, @join, $i ); - - $options->{ "delimit" } ||= '_'; - - while ( $record = get_record( $in ) ) - { - if ( exists $record->{ $options->{ "keys" }->[ 0 ] } ) - { - @join = $record->{ $options->{ "keys" }->[ 0 ] }; - - for ( $i = 1; $i < @{ $options->{ "keys" } }; $i++ ) { - push @join, $record->{ $options->{ "keys" }->[ $i ] } if exists $record->{ $options->{ "keys" }->[ $i ] }; - } - - $record->{ $options->{ "keys" }->[ 0 ] } = join $options->{ "delimit" }, @join; - } - - put_record( $record, $out ); - } -} - - -sub script_merge_records -{ - # Martin A. Hansen, July 2008. - - # Merges records in the stream based on identical values of two given keys. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $merge, $record, $file1, $file2, $fh1, $fh2, $key1, $key2, @keys1, @keys2, @vals1, @vals2, - $num1, $num2, $num, $cmp, $i ); - - $merge = $options->{ "merge" } || "AandB"; - - $file1 = "$BP_TMP/merge_records1.tmp"; - $file2 = "$BP_TMP/merge_records2.tmp"; - - $fh1 = Maasha::Common::write_open( $file1 ); - $fh2 = Maasha::Common::write_open( $file2 ); - - $key1 = $options->{ "keys" }->[ 0 ]; - $key2 = $options->{ "keys" }->[ 1 ]; - - $num = $key2 =~ s/n$//; - $num1 = 0; - $num2 = 0; - - while ( $record = get_record( $in ) ) - { - if ( exists $record->{ $key1 } ) - { - @keys1 = $key1; - @vals1 = $record->{ $key1 }; - - delete $record->{ $key1 }; - - map { push @keys1, $_; push @vals1, $record->{ $_ } } keys %{ $record }; - - print $fh1 join( "\t", @vals1 ), "\n"; - - $num1++; - } - elsif ( exists $record->{ $key2 } ) - { - @keys2 = $key2; - @vals2 = $record->{ $key2 }; - - delete $record->{ $key2 }; - - map { push @keys2, $_; push @vals2, $record->{ $_ } } keys %{ $record }; - - print $fh2 join( "\t", @vals2 ), "\n"; - - $num2++; - } - } - - close $fh1; - close $fh2; - - if ( $num ) - { - Maasha::Common::run( "sort", "-k 1,1n $file1 > $file1.sort" ) and rename "$file1.sort", $file1; - Maasha::Common::run( "sort", "-k 1,1n $file2 > $file2.sort" ) and rename "$file2.sort", $file2; - } - else - { - Maasha::Common::run( "sort", "-k 1,1 $file1 > $file1.sort" ) and rename "$file1.sort", $file1; - Maasha::Common::run( "sort", "-k 1,1 $file2 > $file2.sort" ) and rename "$file2.sort", $file2; - } - - $fh1 = Maasha::Common::read_open( $file1 ); - $fh2 = Maasha::Common::read_open( $file2 ); - - @vals1 = Maasha::Common::get_fields( $fh1 ); - @vals2 = Maasha::Common::get_fields( $fh2 ); - - while ( $num1 > 0 and $num2 > 0 ) - { - undef $record; - - if ( $num ) { - $cmp = $vals1[ 0 ] <=> $vals2[ 0 ]; - } else { - $cmp = $vals1[ 0 ] cmp $vals2[ 0 ]; - } - - if ( $cmp < 0 ) - { - if ( $merge =~ /^(AorB|AnotB)$/ ) - { - for ( $i = 0; $i < @keys1; $i++ ) { - $record->{ $keys1[ $i ] } = $vals1[ $i ]; - } - - put_record( $record, $out ); - } - - @vals1 = Maasha::Common::get_fields( $fh1 ); - $num1--; - } - elsif ( $cmp > 0 ) - { - if ( $merge =~ /^(BorA|BnotA)$/ ) - { - for ( $i = 0; $i < @keys2; $i++ ) { - $record->{ $keys2[ $i ] } = $vals2[ $i ]; - } - - put_record( $record, $out ); - } - - @vals2 = Maasha::Common::get_fields( $fh2 ); - $num2--; - } - else - { - if ( $merge =~ /^(AandB|AorB|BorA)$/ ) - { - for ( $i = 0; $i < @keys1; $i++ ) { - $record->{ $keys1[ $i ] } = $vals1[ $i ]; - } - - for ( $i = 1; $i < @keys2; $i++ ) { - $record->{ $keys2[ $i ] } = $vals2[ $i ]; - } - - put_record( $record, $out ); - } - - @vals1 = Maasha::Common::get_fields( $fh1 ); - @vals2 = Maasha::Common::get_fields( $fh2 ); - $num1--; - $num2--; - } - } - - close $fh1; - close $fh2; - - unlink $file1; - unlink $file2; - - if ( $num1 > 0 and $merge =~ /^(AorB|AnotB)$/ ) - { - undef $record; - - for ( $i = 0; $i < @keys1; $i++ ) { - $record->{ $keys1[ $i ] } = $vals1[ $i ]; - } - - put_record( $record, $out ); - } - - if ( $num2 > 0 and $merge =~ /^(BorA|BnotA)$/ ) - { - undef $record; - - for ( $i = 0; $i < @keys2; $i++ ) { - $record->{ $keys2[ $i ] } = $vals2[ $i ]; - } - - put_record( $record, $out ); - } -} - - -sub script_grab -{ - # Martin A. Hansen, August 2007. - - # Grab for records in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $keys, $vals_only, $keys_only, $invert, $patterns, $pattern, $regex, $record, $key, $op, $val, %lookup_hash, $found ); - - $keys = $options->{ 'keys' }; - $vals_only = $options->{ 'vals_only' }; - $keys_only = $options->{ 'keys_only' }; - $invert = $options->{ 'invert' }; - - if ( $options->{ 'patterns' } ) - { - $patterns = [ split ",", $options->{ 'patterns' } ]; - } - elsif ( -f $options->{ 'patterns_in' } ) - { - $patterns = Maasha::Patscan::read_patterns( $options->{ 'patterns_in' } ); - } - elsif ( $options->{ 'regex' } ) - { - if ( $options->{ 'case_insensitive' } ) { - $regex = qr/$options->{ 'regex' }/i; - } else { - $regex = qr/$options->{ 'regex' }/; - } - } - elsif ( -f $options->{ 'exact_in' } ) - { - $patterns = Maasha::Patscan::read_patterns( $options->{ 'exact_in' } ); - - map { $lookup_hash{ $_ } = 1 } @{ $patterns }; - - undef $patterns; - } - elsif ( $options->{ 'eval' } ) - { - if ( $options->{ 'eval' } =~ /^([^><=! ]+)\s*(>=|<=|>|<|=|!=|eq|ne)\s*(.+)$/ ) - { - $key = $1; - $op = $2; - $val = $3; - } - } - - while ( $record = get_record( $in ) ) - { - $found = 0; - - if ( %lookup_hash ) { - $found = grab_lookup( \%lookup_hash, $record, $keys, $vals_only, $keys_only ); - } elsif ( $patterns ) { - $found = grab_patterns( $patterns, $record, $keys, $vals_only, $keys_only ); - } elsif ( $regex ) { - $found = grab_regex( $regex, $record, $keys, $vals_only, $keys_only ); - } elsif ( $op ) { - $found = grab_eval( $key, $op, $val, $record ); - } - - if ( $found and not $invert ) { - put_record( $record, $out ); - } elsif ( not $found and $invert ) { - put_record( $record, $out ); - } - } -} - - -sub script_compute -{ - # Martin A. Hansen, August 2007. - - # Evaluate extression for records in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $eval_key, @keys, $eval_val ); - - while ( $record = get_record( $in ) ) - { - if ( $options->{ "eval" } ) - { - if ( $options->{ "eval" } =~ /^(\S+)\s*=\s*(.+)$/ ) - { - $eval_key = $1; - $eval_val = $2; - - if ( not @keys ) - { - @keys = split /\s+|\+|-|\*|\/|\*\*/, $eval_val; - - @keys = grep { exists $record->{ $_ } } @keys; - } - - map { $eval_val =~ s/\Q$_\E/$record->{ $_ }/g } @keys; - - $record->{ $eval_key } = eval "$eval_val"; - Maasha::Common::error( qq(eval "$eval_key = $eval_val" failed -> $@) ) if $@; - } - else - { - warn qq(WARNING: Bad compute expression: "$options->{ 'eval' }"\n); - } - } - - put_record( $record, $out ); - } -} - - -sub script_flip_tab -{ - # Martin A. Hansen, June 2008. - - # Flip a table. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $key, $A, $B, @rows, @matrix, $row, $i ); - - while ( $record = get_record( $in ) ) - { - undef @rows; - - foreach $key ( sort { $A = $a; $B = $b; $A =~ s/^V(\d+)$/$1/; $B =~ s/^V(\d+)$/$1/; $A <=> $B } keys %{ $record } ) - { - push @rows, $record->{ $key }; - - } - - push @matrix, [ @rows ]; - } - - undef $record; - - @matrix = Maasha::Matrix::matrix_flip( \@matrix ); - - foreach $row ( @matrix ) - { - for ( $i = 0; $i < @{ $row }; $i++ ) { - $record->{ "V$i" } = $row->[ $i ]; - } - - put_record( $record, $out ); - } -} - - -sub script_add_ident -{ - # Martin A. Hansen, May 2008. - - # Add a unique identifier to each record in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $key, $prefix, $i ); - - $key = $options->{ "key" } || "ID"; - $prefix = $options->{ "prefix" } || "ID"; - - $i = 0; - - while ( $record = get_record( $in ) ) - { - $record->{ $key } = sprintf( "$prefix%08d", $i ); - - put_record( $record, $out ); - - $i++; - } -} - - -sub script_count_records -{ - # Martin A. Hansen, August 2007. - - # Count records in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $count, $result, $fh, $line ); - - $count = 0; - - if ( $options->{ "no_stream" } ) - { - while ( $line = <$in> ) - { - chomp $line; - - $count++ if $line eq "---"; - } - } - else - { - while ( $record = get_record( $in ) ) - { - put_record( $record, $out ); - - $count++; - } - } - - $result = { "RECORDS_COUNT" => $count }; - - $fh = write_stream( $options->{ "data_out" } ); - - put_record( $result, $fh ); - - close $fh; -} - - -sub script_random_records -{ - # Martin A. Hansen, August 2007. - - # Pick a number or random records from stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $tmp_file, $fh_out, $fh_in, $count, $i, %rand_hash, $rand, $max ); - - $options->{ "num" } ||= 10; - - $tmp_file = "$BP_TMP/random_records.tmp"; - - $fh_out = Maasha::Common::write_open( $tmp_file ); - - $count = 0; - - while ( $record = get_record( $in ) ) - { - put_record( $record, $fh_out ); - - $count++; - } - - close $fh_out; - - $max = 0; - $i = 0; - - Maasha::Common::error( qq(Requested random records > records in stream) ) if $options->{ "num" } > $count; - - while ( $i < $options->{ "num" } ) - { - $rand = int( rand( $count ) ); - - if ( not exists $rand_hash{ $rand } ) - { - $rand_hash{ $rand } = 1; - - $max = $rand if $rand > $max; - - $i++; - } - } - - $fh_in = Maasha::Common::read_open( $tmp_file ); - - $count = 0; - - while ( $record = get_record( $fh_in ) ) - { - put_record( $record, $out ) if exists $rand_hash{ $count }; - - last if $count == $max; - - $count++; - } - - close $fh_in; - - unlink $tmp_file; -} - - -sub script_sort_records -{ - # Martin A. Hansen, August 2007. - - # Sort to sort records according to keys. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( @keys, $key, @sort_cmd, $sort_str, $sort_sub, @records, $record, $i ); - - foreach $key ( @{ $options->{ "keys" } } ) - { - if ( $key =~ s/n$// ) { - push @sort_cmd, qq(\$a->{ "$key" } <=> \$b->{ "$key" }); - } else { - push @sort_cmd, qq(\$a->{ "$key" } cmp \$b->{ "$key" }); - } - } - - $sort_str = join " or ", @sort_cmd; - $sort_sub = eval "sub { $sort_str }"; # NB security issue! - - while ( $record = get_record( $in ) ) { - push @records, $record; - } - - @records = sort $sort_sub @records; - - if ( $options->{ "reverse" } ) - { - for ( $i = scalar @records - 1; $i >= 0; $i-- ) { - put_record( $records[ $i ], $out ); - } - } - else - { - for ( $i = 0; $i < scalar @records; $i++ ) { - put_record( $records[ $i ], $out ); - } - } -} - - -sub script_count_vals -{ - # Martin A. Hansen, August 2007. - - # Count records in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $num, $record, %count_hash, @records, $tmp_file, $fh_out, $fh_in, $cache ); - - $tmp_file = "$BP_TMP/count_cache.tmp"; - - $fh_out = Maasha::Common::write_open( $tmp_file ); - - $cache = 0; - $num = 0; - - while ( $record = get_record( $in ) ) - { - map { $count_hash{ $_ }{ $record->{ $_ } }++ if exists $record->{ $_ } } @{ $options->{ "keys" } }; - - push @records, $record; - - if ( scalar @records > 5_000_000 ) # too many records to hold in memory - use disk cache - { - map { put_record( $_, $fh_out ) } @records; - - undef @records; - - $cache = 1; - } - - print STDERR "verbose: records read $num\n" if ( $options->{ 'verbose' } and ( $num % 1_000_000 ) == 0 ); - - $num++; - } - - close $fh_out; - - if ( $cache ) - { - $num = 0; - - $fh_in = Maasha::Common::read_open( $tmp_file ); - - while ( $record = get_record( $fh_in ) ) - { - map { $record->{ $_ . "_COUNT" } = $count_hash{ $_ }{ $record->{ $_ } } if exists $record->{ $_ } } @{ $options->{ "keys" } }; - - put_record( $record, $out ); - - print STDERR "verbose: cache read $num\n" if ( $options->{ 'verbose' } and ( $num % 1_000_000 ) == 0 ); - - $num++; - } - - close $fh_in; - } - - foreach $record ( @records ) - { - map { $record->{ $_ . "_COUNT" } = $count_hash{ $_ }{ $record->{ $_ } } if exists $record->{ $_ } } @{ $options->{ "keys" } }; - - put_record( $record, $out ); - } - - unlink $tmp_file; -} - - -sub script_plot_histogram -{ - # Martin A. Hansen, September 2007. - - # Plot a simple histogram for a given key using GNU plot. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. +use Getopt::Long qw( :config bundling ); +use Data::Dumper; +use Maasha::Match; +use Maasha::Common; +use Maasha::Filesys; +use vars qw( @ISA @EXPORT_OK ); - my ( $record, %data_hash, $max, @data_list, $i, $result, $fh ); +require Exporter; - $options->{ "title" } ||= "Histogram"; - $options->{ "sort" } ||= "num"; +@ISA = qw( Exporter ); - while ( $record = get_record( $in ) ) - { - $data_hash{ $record->{ $options->{ "key" } } }++ if defined $record->{ $options->{ "key" } }; +@EXPORT_OK = qw( + read_stream + write_stream + get_record + put_record +); - put_record( $record, $out ) if not $options->{ "no_stream" }; - } - if ( $options->{ "sort" } eq "num" ) { - map { push @data_list, [ $_, $data_hash{ $_ } ] } sort { $a <=> $b } keys %data_hash; - } else { - map { push @data_list, [ $_, $data_hash{ $_ } ] } sort keys %data_hash; - } +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SIGNAL HANDLER <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - $result = Maasha::Plot::histogram_simple( \@data_list, $options ); - $fh = write_stream( $options->{ "data_out" } ); +$SIG{ '__DIE__' } = \&sig_handler; +$SIG{ 'INT' } = \&sig_handler; +$SIG{ 'TERM' } = \&sig_handler; - print $fh "$_\n" foreach @{ $result }; - close $fh; -} +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SUBROUTINES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -sub script_plot_lendist +sub status_set { - # Martin A. Hansen, August 2007. - - # Plot length distribution using GNU plot. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. + my ( $time_stamp, $script, $user, $pid, $file, $fh ); - my ( $record, %data_hash, $max, @data_list, $i, $result, $fh ); - - $options->{ "title" } ||= "Length Distribution"; - - while ( $record = get_record( $in ) ) - { - $data_hash{ $record->{ $options->{ "key" } } }++ if defined $record->{ $options->{ "key" } }; - - put_record( $record, $out ) if not $options->{ "no_stream" }; - } - - $max = Maasha::Calc::list_max( [ keys %data_hash ] ); - - for ( $i = 0; $i < $max; $i++ ) { - push @data_list, [ $i, $data_hash{ $i } || 0 ]; - } - - $result = Maasha::Plot::histogram_lendist( \@data_list, $options ); + $time_stamp = Maasha::Common::time_stamp(); + $user = Maasha::Common::get_user(); + $script = Maasha::Common::get_scriptname(); + $pid = Maasha::Common::get_processid(); - $fh = write_stream( $options->{ "data_out" } ); + $file = bp_tmp() . "/" . join( ".", $user, $script, $pid ) . ".status"; + $fh = Maasha::Filesys::file_write_open( $file ); + flock($fh, 2); - print $fh "$_\n" foreach @{ $result }; + print $fh join( ";", $time_stamp, join( " ", @ARGV ) ) . "\n"; close $fh; } -sub script_plot_chrdist +sub status_log { - # Martin A. Hansen, August 2007. + # Martin A. Hansen, June 2009. - # Plot chromosome distribution using GNU plot. + # Retrieves initial status information written with status_set and uses this + # to write a status entry to the log file. - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash + my ( $status, # status - OPTIONAL ) = @_; # Returns nothing. - my ( $record, %data_hash, @data_list, $elem, $sort_key, $count, $result, $fh ); - - $options->{ "title" } ||= "Chromosome Distribution"; - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "CHR" } ) { # generic - $data_hash{ $record->{ "CHR" } }++; - } elsif ( $record->{ "REC_TYPE" } eq "PATSCAN" and $record->{ "S_ID" } =~ /^chr/i ) { # patscan - $data_hash{ $record->{ "S_ID" } }++; - } elsif ( $record->{ "REC_TYPE" } eq "PSL" and $record->{ "S_ID" } =~ /^chr/i ) { # BLAT / PSL - $data_hash{ $record->{ "S_ID" } }++; - } elsif ( $record->{ "REC_TYPE" } eq "BLAST" and $record->{ "S_ID" } =~ /^chr/i ) { # BLAST - $data_hash{ $record->{ "S_ID" } }++; - } - - put_record( $record, $out ) if not $options->{ "no_stream" }; - } - - foreach $elem ( keys %data_hash ) - { - $sort_key = $elem; - - $sort_key =~ s/chr//i; - - $sort_key =~ s/^X(.*)/99$1/; - $sort_key =~ s/^Y(.*)/99$1/; - $sort_key =~ s/^Z(.*)/999$1/; - $sort_key =~ s/^M(.*)/9999$1/; - $sort_key =~ s/^U(.*)/99999$1/; - - $count = $sort_key =~ tr/_//; - - $sort_key =~ s/_.*/"999999" x $count/ex; + my ( $time0, $time1, $script, $user, $pid, $file, $fh, $elap, $fh_global, $fh_local, $line, $args, $tmp_dir ); - push @data_list, [ $elem, $data_hash{ $elem }, $sort_key ]; - } - - @data_list = sort { $a->[ 2 ] <=> $b->[ 2 ] } @data_list; + $status ||= "OK"; - $result = Maasha::Plot::histogram_chrdist( \@data_list, $options ); + $time1 = Maasha::Common::time_stamp(); + $user = Maasha::Common::get_user(); + $script = Maasha::Common::get_scriptname(); + $pid = Maasha::Common::get_processid(); - $fh = write_stream( $options->{ "data_out" } ); + $file = bp_tmp() . "/" . join( ".", $user, $script, $pid ) . ".status"; - print $fh "$_\n" foreach @{ $result }; + return if not -f $file; + $fh = Maasha::Filesys::file_read_open( $file ); + flock($fh, 1); + $line = <$fh>; + chomp $line; close $fh; -} - - -sub script_plot_karyogram -{ - # Martin A. Hansen, August 2007. - - # Plot hits on karyogram. - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. + unlink $file; - my ( %options, $record, @data, $fh, $result, %data_hash ); + ( $time0, $args, $tmp_dir ) = split /;/, $line; - $options->{ "genome" } ||= "human"; - $options->{ "feat_color" } ||= "black"; - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "CHR" } and $record->{ "CHR_BEG" } and $record->{ "CHR_END" } ) - { - push @{ $data_hash{ $record->{ "CHR" } } }, [ $record->{ "CHR_BEG" }, $record->{ "CHR_END" }, $options->{ "feat_color" } ]; - } + Maasha::Filesys::dir_remove( $tmp_dir ) if defined $tmp_dir; - put_record( $record, $out ) if not $options->{ "no_stream" }; - } + $elap = Maasha::Common::time_stamp_diff( $time0, $time1 ); - $result = Maasha::Plot::karyogram( \%data_hash, \%options ); + $fh_global = Maasha::Filesys::file_append_open( "$ENV{ 'BP_LOG' }/biopieces.log" ); + flock($fh_global, 2); + $fh_local = Maasha::Filesys::file_append_open( "$ENV{ 'HOME' }/.biopieces.log" ); + flock($fh_local, 2); - $fh = write_stream( $options->{ "data_out" } ); + print $fh_global join( "\t", $time0, $time1, $elap, $user, $status, "$script $args" ) . "\n"; + print $fh_local join( "\t", $time0, $time1, $elap, $user, $status, "$script $args" ) . "\n"; - print $fh $result; + $fh_global->autoflush( 1 ); + $fh_local->autoflush( 1 ); - close $fh; + close $fh_global; + close $fh_local; } -sub script_plot_matches +sub log_biopiece { - # Martin A. Hansen, August 2007. - - # Plot matches in 2D generating a dotplot. + # Martin A. Hansen, January 2008. - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; + # Log messages to logfile. # Returns nothing. - my ( $record, @data, $fh, $result, %data_hash ); + my ( $time_stamp, $user, $script, $fh ); - $options->{ "direction" } ||= "both"; - - while ( $record = get_record( $in ) ) - { - if ( defined $record->{ "Q_BEG" } and defined $record->{ "S_BEG" } and $record->{ "Q_END" } and $record->{ "S_END" } ) { - push @data, $record; - } - - put_record( $record, $out ) if not $options->{ "no_stream" }; - } - - $options->{ "title" } ||= "plot_matches"; - $options->{ "xlabel" } ||= $data[ 0 ]->{ "Q_ID" }; - $options->{ "ylabel" } ||= $data[ 0 ]->{ "S_ID" }; + $time_stamp = Maasha::Common::time_stamp(); + $user = Maasha::Common::get_user(); + $script = Maasha::Common::get_scriptname(); - $result = Maasha::Plot::dotplot_matches( \@data, $options, $BP_TMP ); + $fh = Maasha::Filesys::file_append_open( "$ENV{ 'BP_LOG' }/biopieces.log" ); + flock($fh, 2); - $fh = write_stream( $options->{ "data_out" } ); + print $fh "$time_stamp\t$user\t$script ", join( " ", @ARGV ), "\n"; - print $fh "$_\n" foreach @{ $result }; + $fh->autoflush( 1 ); close $fh; } -sub script_length_vals +sub read_stream { - # Martin A. Hansen, August 2007. + # Martin A. Hansen, July 2007. - # Determine the length of the value for given keys. + # Opens a stream to STDIN or a file, - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash + my ( $file, # file - OPTIONAL ) = @_; - # Returns nothing. - - my ( $record, $key ); + # Returns filehandle. - while ( $record = get_record( $in ) ) - { - foreach $key ( @{ $options->{ "keys" } } ) - { - if ( $record->{ $key } ) { - $record->{ $key . "_LEN" } = length $record->{ $key }; - } - } + my ( $fh ); - put_record( $record, $out ); + if ( not -t STDIN ) { + $fh = Maasha::Filesys::stdin_read(); + } elsif ( not $file ) { + # Maasha::Common::error( qq(no data stream) ); + } else { + $fh = Maasha::Filesys::file_read_open( $file ); } + + return $fh; } -sub script_sum_vals +sub write_stream { # Martin A. Hansen, August 2007. - # Calculates the sums for values of given keys. + # Opens a stream to STDOUT or a file. - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash + my ( $path, # path - OPTIONAL + $gzip, # compress data - OPTIONAL ) = @_; - # Returns nothing. - - my ( $record, $key, %sum_hash, $fh ); - - while ( $record = get_record( $in ) ) - { - foreach $key ( @{ $options->{ "keys" } } ) - { - if ( $record->{ $key } ) { - $sum_hash{ $key } += $record->{ $key }; - } - } - - put_record( $record, $out ) if not $options->{ "no_stream" }; - } + # Returns filehandle. - $fh = write_stream( $options->{ "data_out" } ); + my ( $fh ); - foreach $key ( @{ $options->{ "keys" } } ) { - put_record( { $key . "_SUM" => $sum_hash{ $key } || 0 } , $fh ); + if ( $path ) { + $fh = Maasha::Filesys::file_write_open( $path, $gzip ); + } else { + $fh = Maasha::Filesys::stdout_write(); } - close $fh; + return $fh; } -sub script_mean_vals +sub close_stream { - # Martin A. Hansen, August 2007. + # Martin A. Hansen, May 2009. - # Calculate the mean of values of given keys. + # Close stream if open. - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash + my ( $fh, # filehandle ) = @_; # Returns nothing. - my ( $record, $key, %sum_hash, %count_hash, $mean, $fh ); - - while ( $record = get_record( $in ) ) - { - foreach $key ( @{ $options->{ "keys" } } ) - { - if ( $record->{ $key } ) - { - $sum_hash{ $key } += $record->{ $key }; - $count_hash{ $key }++; - } - } - - put_record( $record, $out ) if not $options->{ "no_stream" }; - } - - $fh = write_stream( $options->{ "data_out" } ); - - foreach $key ( @{ $options->{ "keys" } } ) - { - if ( $count_hash{ $key } ) { - $mean = sprintf( "%.2f", ( $sum_hash{ $key } / $count_hash{ $key } ) ); - } else { - $mean = "N/A"; - } - - put_record( { $key . "_MEAN" => $mean } , $fh ); - } - - close $fh; + close $fh if defined $fh; } -sub script_median_vals +sub get_record { - # Martin A. Hansen, March 2008. + # Martin A. Hansen, July 2007. - # Calculate the median values of given keys. + # Reads one record at a time and converts that record + # to a Perl data structure (a hash) which is returned. - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash + my ( $fh, # handle to stream ) = @_; - # Returns nothing. - - my ( $record, $key, %median_hash, $median, $fh ); - - while ( $record = get_record( $in ) ) - { - foreach $key ( @{ $options->{ "keys" } } ) { - push @{ $median_hash{ $key } }, $record->{ $key } if defined $record->{ $key }; - } - - put_record( $record, $out ) if not $options->{ "no_stream" }; - } - - $fh = write_stream( $options->{ "data_out" } ); - - foreach $key ( @{ $options->{ "keys" } } ) - { - if ( $median_hash{ $key } ) { - $median = Maasha::Calc::median( $median_hash{ $key } ); - } else { - $median = "N/A"; - } - - put_record( { $key . "_MEDIAN" => $median } , $fh ); - } + # Returns a hash. - close $fh; -} + my ( $block, @lines, $line, $key, $value, %record ); + return if not defined $fh; -sub script_max_vals -{ - # Martin A. Hansen, February 2008. + local $/ = "\n---\n"; - # Determine the maximum values of given keys. + $block = <$fh>; - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; + return if not defined $block; - # Returns nothing. + chomp $block; - my ( $record, $key, $fh, %max_hash, $max_record ); + @lines = split "\n", $block; - while ( $record = get_record( $in ) ) + foreach $line ( @lines ) { - foreach $key ( @{ $options->{ "keys" } } ) - { - if ( $record->{ $key } ) - { - $max_hash{ $key } = $record->{ $key } if $record->{ $key } > $max_hash{ $key }; - } - } - - put_record( $record, $out ) if not $options->{ "no_stream" }; - } - - $fh = write_stream( $options->{ "data_out" } ); + ( $key, $value ) = split ": ", $line, 2; - foreach $key ( @{ $options->{ "keys" } } ) - { - $max_record->{ $key . "_MAX" } = $max_hash{ $key }; + $record{ $key } = $value; } - put_record( $max_record, $fh ); - - close $fh; + return wantarray ? %record : \%record; } -sub script_min_vals +sub put_record { - # Martin A. Hansen, February 2008. + # Martin A. Hansen, July 2007. - # Determine the minimum values of given keys. + # Given a Perl datastructure (a hash ref) emits this to STDOUT or a filehandle. - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash + my ( $data, # data structure + $fh, # file handle - OPTIONAL ) = @_; - # Returns nothing. - - my ( $record, $key, $fh, %min_hash, $min_record ); - - while ( $record = get_record( $in ) ) - { - foreach $key ( @{ $options->{ "keys" } } ) - { - if ( defined $record->{ $key } ) - { - if ( exists $min_hash{ $key } ) { - $min_hash{ $key } = $record->{ $key } if $record->{ $key } < $min_hash{ $key }; - } else { - $min_hash{ $key } = $record->{ $key }; - } - } - } - - put_record( $record, $out ) if not $options->{ "no_stream" }; - } - - $fh = write_stream( $options->{ "data_out" } ); + # Returns nothing. - foreach $key ( @{ $options->{ "keys" } } ) + if ( scalar keys %{ $data } ) { - $min_record->{ $key . "_MIN" } = $min_hash{ $key }; + if ( $fh ) + { + map { print $fh "$_: $data->{ $_ }\n" } keys %{ $data }; + print $fh "---\n"; + } + else + { + map { print "$_: $data->{ $_ }\n" } keys %{ $data }; + print "---\n"; + } } - put_record( $min_record, $fh ); - - close $fh; + undef $data; } -sub script_upload_to_ucsc +sub parse_options { - # Martin A. Hansen, August 2007. + # Martin A. Hansen, May 2009 - # Calculate the mean of values of given keys. + # Parses and checks options for Biopieces. - # This routine has developed into the most ugly hack. Do something! + # First the argument list is checked for duplicates and then + # options are parsed from ARGV after which it is checked if + # the Biopieces usage information should be printed. Finally, + # all options from ARGV are checked according to the argument list. - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash + my ( $arg_list, # data structure with argument description ) = @_; - # Returns nothing. - - my ( $record, $file, $wib_file, $wig_file, $wib_dir, $fh_out, $i, $first, $format, $type, $columns, $append, $vals ); - - $options->{ "short_label" } ||= $options->{ 'table' }; - $options->{ "long_label" } ||= $options->{ 'table' }; - $options->{ "group" } ||= $ENV{ "LOGNAME" }; - $options->{ "priority" } ||= 1; - $options->{ "visibility" } ||= "pack"; - $options->{ "color" } ||= join( ",", int( rand( 255 ) ), int( rand( 255 ) ), int( rand( 255 ) ) ); - $options->{ "chunk_size" } ||= 10_000_000_000; # Due to 32-bit UCSC compilation really large tables cannot be loaded in one go. + # Returns hashref. - $file = "$BP_TMP/ucsc_upload.tmp"; + my ( $arg, @list, $options ); - $append = 0; + # ---- Adding the mandatory arguments to the arg_list ---- - $first = 1; + push @{ $arg_list }, ( + { long => 'help', short => '?', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'stream_in', short => 'I', type => 'file!', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'stream_out', short => 'O', type => 'file', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'verbose', short => 'v', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + ); - $i = 0; + check_duplicates_args( $arg_list ); - $fh_out = Maasha::Common::write_open( $file ); + # ---- Compiling options list ---- - while ( $record = get_record( $in ) ) + foreach $arg ( @{ $arg_list } ) { - put_record( $record, $out ) if not $options->{ "no_stream" }; - - if ( $record->{ "REC_TYPE" } eq "fixed_step" ) - { - $vals = $record->{ "VALS" }; - $vals =~ tr/;/\n/; - - print $fh_out "fixedStep chrom=$record->{ 'CHR' } start=$record->{ 'CHR_BEG' } step=$record->{ 'STEP' }\n"; - print $fh_out "$vals\n"; - - $format = "WIGGLE" if not $format; + if ( $arg->{ 'type' } eq 'flag' ) { + push @list, "$arg->{ 'long' }|$arg->{ 'short' }"; + } else { + push @list, "$arg->{ 'long' }|$arg->{ 'short' }=s"; } - elsif ( $record->{ "REC_TYPE" } eq "PSL" ) - { - Maasha::UCSC::psl_put_header( $fh_out ) if $first; - Maasha::UCSC::psl_put_entry( $record, $fh_out ); - - $first = 0; + } - $format = "PSL" if not $format; - } - elsif ( $record->{ "REC_TYPE" } eq "BED" and $record->{ "SEC_STRUCT" } ) - { - # chrom chromStart chromEnd name score strand size secStr conf - - print $fh_out join ( "\t", - $record->{ "CHR" }, - $record->{ "CHR_BEG" }, - $record->{ "CHR_END" } + 1, - $record->{ "Q_ID" }, - $record->{ "SCORE" }, - $record->{ "STRAND" }, - $record->{ "SIZE" }, - $record->{ "SEC_STRUCT" }, - $record->{ "CONF" }, - ), "\n"; - - $format = "BED_SS" if not $format; - } - elsif ( $record->{ "REC_TYPE" } eq "BED" ) - { - Maasha::UCSC::bed_put_entry( $record, $fh_out, $record->{ "BED_COLS" } ); + # ---- Parsing options from @ARGV ---- - $format = "BED" if not $format; - $columns = $record->{ "BED_COLS" } if not $columns; - } - elsif ( $record->{ "REC_TYPE" } eq "PATSCAN" and $record->{ "CHR" } ) - { - Maasha::UCSC::bed_put_entry( $record, $fh_out, 6 ); + $options = {}; - $format = "BED" if not $format; - $columns = 6 if not $columns; - } - elsif ( $record->{ "REC_TYPE" } eq "BLAST" and $record->{ "S_ID" } =~ /^chr/ ) - { - $record->{ "CHR" } = $record->{ "S_ID" }; - $record->{ "CHR_BEG" } = $record->{ "S_BEG" }; - $record->{ "CHR_END" } = $record->{ "S_END" }; - $record->{ "SCORE" } = $record->{ "BIT_SCORE" } * 1000; + Getopt::Long::GetOptions( $options, @list ); - $format = "BED" if not $format; - $columns = 6 if not $columns; + # print Dumper( $options ); - Maasha::UCSC::bed_put_entry( $record, $fh_out ); - } - elsif ( $record->{ "REC_TYPE" } eq "VMATCH" and $record->{ "S_ID" } =~ /^chr/i ) - { - $record->{ "CHR" } = $record->{ "S_ID" }; - $record->{ "CHR_BEG" } = $record->{ "S_BEG" }; - $record->{ "CHR_END" } = $record->{ "S_END" }; - $record->{ "SCORE" } = $record->{ "SCORE" } || 999; - $record->{ "SCORE" } = int( $record->{ "SCORE" } ); + check_print_usage( $options ); - $format = "BED" if not $format; - $columns = 6 if not $columns; + # ---- Expanding and checking options ---- - Maasha::UCSC::bed_put_entry( $record, $fh_out, 6 ); - } + foreach $arg ( @{ $arg_list } ) + { + check_mandatory( $arg, $options ); + set_default( $arg, $options ); + check_uint( $arg, $options ); + check_int( $arg, $options ); + set_list( $arg, $options ); + check_dir( $arg, $options ); + check_file( $arg, $options ); + set_files( $arg, $options ); + check_files( $arg, $options ); + check_allowed( $arg, $options ); + check_disallowed( $arg, $options ); + } - if ( $i == $options->{ "chunk_size" } ) - { - close $fh_out; + # print Dumper( $options ); - if ( $format eq "BED" ) { - Maasha::UCSC::bed_upload_to_ucsc( $BP_TMP, $file, $options, $append ); - } elsif ( $format eq "PSL" ) { - Maasha::UCSC::psl_upload_to_ucsc( $file, $options, $append ); - } + # return wantarray ? $options : %{ $options }; # WTF! Someone changed the behaviour of wantarray??? - unlink $file; + return $options; +} - $first = 1; - $append = 1; +sub check_duplicates_args +{ + # Martin A. Hansen, May 2009 - $fh_out = Maasha::Common::write_open( $file ); - } + # Check if there are duplicate long or short arguments, + # and raise an error if so. - $i++; - } + my ( $arg_list, # List of argument hashrefs, + ) = @_; - close $fh_out; + # Returns nothing. - if ( exists $options->{ "database" } and $options->{ "table" } ) - { - if ( $format eq "BED" ) - { - $type = "bed $columns"; + my ( $arg, %check_hash ); - Maasha::UCSC::bed_upload_to_ucsc( $BP_TMP, $file, $options, $append ); - } - elsif ( $format eq "BED_SS" ) - { - $options->{ "sec_struct" } = 1; + foreach $arg ( @{ $arg_list } ) + { + Maasha::Common::error( qq(Duplicate long argument: $arg->{ 'long' }) ) if exists $check_hash{ $arg->{ 'long' } }; + Maasha::Common::error( qq(Duplicate short argument: $arg->{ 'short' }) ) if exists $check_hash{ $arg->{ 'short' } }; - $type = "sec_struct"; - - Maasha::UCSC::bed_upload_to_ucsc( $BP_TMP, $file, $options, $append ); - } - elsif ( $format eq "PSL" ) - { - $type = "psl"; + $check_hash{ $arg->{ 'long' } } = 1; + $check_hash{ $arg->{ 'short' } } = 1; + } +} - Maasha::UCSC::psl_upload_to_ucsc( $file, $options, $append ); - } - elsif ( $format eq "WIGGLE" ) - { - $options->{ "visibility" } = "full"; - $wig_file = "$options->{ 'table' }.wig"; - $wib_file = "$options->{ 'table' }.wib"; +sub check_print_usage +{ + # Martin A. Hansen, May 2009. - $wib_dir = "$ENV{ 'HOME' }/ucsc/wib"; + # Check if we need to print usage and print usage + # and exit if that is the case. - Maasha::Common::dir_create_if_not_exists( $wib_dir ); + my ( $options, # option hash + ) = @_; - if ( $options->{ 'verbose' } ) { - `cd $BP_TMP && wigEncode $file $wig_file $wib_file`; - } else { - `cd $BP_TMP && wigEncode $file $wig_file $wib_file > /dev/null 2>&1`; - } + # Returns nothing. - Maasha::Common::run( "mv", "$BP_TMP/$wib_file $wib_dir" ); + my ( %options, $help, $script, $wiki ); - unlink $file; + %options = %{ $options }; + $help = $options{ 'help' }; + delete $options{ 'help' }; - $file = $wig_file; + $script = Maasha::Common::get_scriptname(); - $type = "wig 0"; + if ( $script ne 'print_wiki' ) + { + if ( $help or -t STDIN ) + { + if ( not ( exists $options{ 'stream_in' } or $options{ 'data_in' } ) ) + { + if ( scalar keys %options == 0 ) + { + $wiki = $ENV{ 'BP_DIR' } . "/bp_usage/$script.wiki"; + + if ( $help ) { + `print_wiki --data_in=$wiki --help`; + } elsif ( $script =~ /^(list_biopieces|list_genomes|list_mysql_databases|biostat)$/ ) { + return; + } else { + `print_wiki --data_in=$wiki`; + } - Maasha::UCSC::wiggle_upload_to_ucsc( $BP_TMP, $wib_dir, $file, $options ); + exit; + } + } } - - unlink $file; - - Maasha::UCSC::update_my_tracks( $options, $type ); } } -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +sub check_mandatory +{ + # Martin A. Hansen, May 2009. + # Check if mandatory arguments are set and raises an error if not. -sub grab_lookup -{ - # Martin A. Hansen, November 2009. - - # Uses keys from a lookup hash to search records. Optionally, a list of - # keys can be given so the lookup is limited to these, also, flags - # can be given to limit lookup to keys or vals only. Returns 1 if lookup - # succeeded, else 0. - - my ( $lookup_hash, # hashref with patterns - $record, # hashref - $keys, # list of keys - OPTIONAL - $vals_only, # only vals flag - OPTIONAL - $keys_only, # only keys flag - OPTIONAL + my ( $arg, # hashref + $options, # options hash ) = @_; - # Returns boolean. - - if ( $keys ) - { - map { return 1 if exists $lookup_hash->{ $record->{ $_ } } } @{ $keys }; - } - else - { - if ( not $vals_only ) { - map { return 1 if exists $lookup_hash->{ $_ } } keys %{ $record }; - } + # Returns nothing. - if ( not $keys_only ) { - map { return 1 if exists $lookup_hash->{ $record->{ $_ } } } keys %{ $record }; - } + if ( $arg->{ 'mandatory' } eq 'yes' and not defined $options->{ $arg->{ 'long' } } ) { + Maasha::Common::error( qq(Argument --$arg->{ 'long' } is mandatory) ); } - - return 0; } -sub grab_patterns +sub set_default { - # Martin A. Hansen, November 2009. + # Martin A. Hansen, May 2009. - # Uses patterns to match records containing the pattern as a substring. - # Returns 1 if the record is matched, else 0. + # Set default values in option hash. - my ( $patterns, # list of patterns - $record, # hashref - $keys, # list of keys - OPTIONAL - $vals_only, # only vals flag - OPTIONAL - $keys_only, # only keys flag - OPTIONAL + my ( $arg, # hashref + $options, # options hash ) = @_; - # Returns boolean. - - my ( $pattern ); - - foreach $pattern ( @{ $patterns } ) - { - if ( $keys ) - { - map { return 1 if index( $record->{ $_ }, $pattern ) >= 0 } @{ $keys }; - } - else - { - if ( not $vals_only ) { - map { return 1 if index( $_, $pattern ) >= 0 } keys %{ $record }; - } + # Returns nothing. - if ( not $keys_only ) { - map { return 1 if index( $record->{ $_ }, $pattern ) >= 0 } keys %{ $record }; - } - } + if ( not defined $options->{ $arg->{ 'long' } } ) { + $options->{ $arg->{ 'long' } } = $arg->{ 'default' } } - - return 0; } -sub grab_regex +sub check_uint { - # Martin A. Hansen, November 2009. + # Martin A. Hansen, May 2009. - # Uses regex to match records. - # Returns 1 if the record is matched, else 0. + # Check if value to argument is an unsigned integer and + # raises an error if not. - my ( $regex, # regex to match - $record, # hashref - $keys, # list of keys - OPTIONAL - $vals_only, # only vals flag - OPTIONAL - $keys_only, # only keys flag - OPTIONAL + my ( $arg, # hashref + $options, # options hash ) = @_; - # Returns boolean. + # Returns nothing. - if ( $keys ) - { - map { return 1 if $record->{ $_ } =~ /$regex/ } @{ $keys }; - } - else + if ( $arg->{ 'type' } eq 'uint' and defined $options->{ $arg->{ 'long' } } ) { - if ( not $vals_only ) { - map { return 1 if $_ =~ /$regex/ } keys %{ $record }; - } - - if ( not $keys_only ) { - map { return 1 if $record->{ $_ } =~ /$regex/ } keys %{ $record }; + if ( $options->{ $arg->{ 'long' } } !~ /^\d+$/ ) { + Maasha::Common::error( qq(Argument --$arg->{ 'long' } must be an unsigned integer - not $options->{ $arg->{ 'long' } }) ); } } - - return 0; } -sub grab_eval +sub check_int { - # Martin A. Hansen, November 2009. + # Martin A. Hansen, May 2009. - # Test if the value of a given record key evaluates according - # to a given operator. Returns 1 if eval is OK, else 0. + # Check if value to argument is an integer and + # raises an error if not. - my ( $key, # record key - $op, # operator - $val, # value - $record, # hashref + my ( $arg, # hashref + $options, # options hash ) = @_; - - # Returns boolean. - if ( defined $record->{ $key } ) + # Returns nothing. + + if ( $arg->{ 'type' } eq 'int' and defined $options{ $arg->{ 'long' } } ) { - return 1 if ( $op eq "<" and $record->{ $key } < $val ); - return 1 if ( $op eq ">" and $record->{ $key } > $val ); - return 1 if ( $op eq ">=" and $record->{ $key } >= $val ); - return 1 if ( $op eq "<=" and $record->{ $key } <= $val ); - return 1 if ( $op eq "=" and $record->{ $key } == $val ); - return 1 if ( $op eq "!=" and $record->{ $key } != $val ); - return 1 if ( $op eq "eq" and $record->{ $key } eq $val ); - return 1 if ( $op eq "ne" and $record->{ $key } ne $val ); + if ( $options->{ $arg->{ 'long' } } !~ /^-?\d+$/ ) { + Maasha::Common::error( qq(Argument --$arg->{ 'long' } must be an integer - not $options->{ $arg->{ 'long' } }) ); + } } - - return 0; } -sub record2fasta +sub set_list { - # Martin A. Hansen, July 2008. + # Martin A. Hansen, May 2009. - # Given a biopiece record converts it to a FASTA record. - # If no generic SEQ or SEQ_NAME is found, the Q_* and S_* are - # tried in that order. + # Splits an argument of type 'list' into a list that is put + # in the options hash. - my ( $record, # record + my ( $arg, # hashref + $options, # options hash ) = @_; - # Returns a tuple. - - my ( $seq_name, $seq ); - - $seq_name = $record->{ "SEQ_NAME" } || $record->{ "Q_ID" } || $record->{ "S_ID" }; - $seq = $record->{ "SEQ" } || $record->{ "Q_SEQ" } || $record->{ "S_SEQ" }; + # Returns nothing. - if ( defined $seq_name and defined $seq ) { - return wantarray ? ( $seq_name, $seq ) : [ $seq_name, $seq ]; - } else { - return; + if ( $arg->{ 'type' } eq 'list' and defined $options->{ $arg->{ 'long' } } ) { + $options->{ $arg->{ 'long' } } = [ split /,/, $options->{ $arg->{ 'long' } } ]; } } -sub read_stream +sub check_dir { - # Martin A. Hansen, July 2007. + # Martin A. Hansen, May 2009. - # Opens a stream to STDIN or a file, + # Check if an argument of type 'dir!' truly is a directory and + # raises an error if not. - my ( $path, # path - OPTIONAL + my ( $arg, # hashref + $options, # options hash ) = @_; - # Returns filehandle. - - my ( $fh ); + # Returns nothing. - if ( not -t STDIN ) { - $fh = Maasha::Common::read_stdin(); - } elsif ( not $path ) { -# Maasha::Common::error( qq(no data stream) ); - } else { - $fh = Maasha::Common::read_open( $path ); + if ( $arg->{ 'type' } eq 'dir!' and defined $options->{ $arg->{ 'long' } } ) + { + if ( not -d $options->{ $arg->{ 'long' } } ) { + Maasha::Common::error( qq(No such directory: "$options->{ $arg->{ 'long' } }") ); + } } - -# $fh->autoflush(1) if $fh; # Disable file buffer for debugging. - - return $fh; } -sub write_stream +sub check_file { - # Martin A. Hansen, August 2007. + # Martin A. Hansen, May 2009. - # Opens a stream to STDOUT or a file. + # Check if an argument of type 'file!' truly is a file and + # raises an error if not. - my ( $path, # path - OPTIONAL - $gzip, # compress data - OPTIONAL + my ( $arg, # hashref + $options, # options hash ) = @_; - # Returns filehandle. - - my ( $fh ); + # Returns nothing. - if ( $path ) { - $fh = Maasha::Common::write_open( $path, $gzip ); - } else { - $fh = Maasha::Common::write_stdout(); + if ( $arg->{ 'type' } eq 'file!' and defined $options->{ $arg->{ 'long' } } ) + { + if ( not -f $options->{ $arg->{ 'long' } } ) { + Maasha::Common::error( qq(No such file: "$options->{ $arg->{ 'long' } }") ); + } } - - return $fh; } -sub get_record +sub set_files { - # Martin A. Hansen, July 2007. + # Martin A. Hansen, May 2009. - # Reads one record at a time and converts that record - # to a Perl data structure (a hash) which is returned. + # Split the argument to 'files' into a list that is put on the options hash. - my ( $fh, # handle to stream + my ( $arg, # hashref + $options, # options hash ) = @_; - # Returns a hash. + # Returns nothing. - my ( $block, @lines, $line, $key, $value, %record ); + if ( $arg->{ 'type' } eq 'files' and defined $options->{ $arg->{ 'long' } } ) { + $options->{ $arg->{ 'long' } } = [ split /,/, $options->{ $arg->{ 'long' } } ]; + } +} - local $/ = "\n---\n"; - $block = <$fh>; +sub check_files +{ + # Martin A. Hansen, May 2009. - chomp $block; + # Split the argument to 'files!' and check if each file do exists before adding + # the file list to the options hash. - return if not defined $block; + my ( $arg, # hashref + $options, # options hash + ) = @_; - @lines = split "\n", $block; + # Returns nothing. - foreach $line ( @lines ) + my ( $elem, @files ); + + if ( $arg->{ 'type' } eq 'files!' and defined $options->{ $arg->{ 'long' } } ) { - ( $key, $value ) = split ": ", $line, 2; + foreach $elem ( split /,/, $options->{ $arg->{ 'long' } } ) + { + if ( -f $elem ) { + push @files, $elem; + } elsif ( $elem =~ /\*/ ) { + push @files, glob( $elem ); + } + } - $record{ $key } = $value; - } + if ( scalar @files == 0 ) { + Maasha::Common::error( qq(Argument to --$arg->{ 'long' } must be a valid file or fileglob expression - not $options->{ $arg->{ 'long' } }) ); + } - return wantarray ? %record : \%record; + $options->{ $arg->{ 'long' } } = [ @files ]; + } } -sub put_record +sub check_allowed { - # Martin A. Hansen, July 2007. + # Martin A. Hansen, May 2009. - # Given a Perl datastructure (a hash ref) emits this to STDOUT or a filehandle. + # Check if all values to all arguement are allowed and raise an + # error if not. - my ( $data, # data structure - $fh, # file handle - OPTIONAL + my ( $arg, # hashref + $options, # options hash ) = @_; # Returns nothing. - if ( scalar keys %{ $data } ) + my ( $elem ); + + if ( defined $arg->{ 'allowed' } and defined $options->{ $arg->{ 'long' } } ) { - if ( $fh ) + map { $val_hash{ $_ } = 1 } split /,/, $arg->{ 'allowed' }; + + if ( $arg->{ 'type' } =~ /^(list|files|files!)$/ ) { - map { print $fh "$_: $data->{ $_ }\n" } keys %{ $data }; - print $fh "---\n"; + foreach $elem ( @{ $options->{ $arg->{ 'long' } } } ) + { + if ( not exists $val_hash{ $elem } ) { + Maasha::Common::error( qq(Argument to --$arg->{ 'long' } $elem is not allowed) ); + } + } } else { - map { print "$_: $data->{ $_ }\n" } keys %{ $data }; - print "---\n"; + if ( not exists $val_hash{ $options->{ $arg->{ 'long' } } } ) { + Maasha::Common::error( qq(Argument to --$arg->{ 'long' } $options->{ $arg->{ 'long' } } is not allowed) ); + } } } - - undef $data; } -sub getopt_files +sub check_disallowed { - # Martin A. Hansen, November 2007. + # Martin A. Hansen, May 2009. - # Extracts files from an explicit GetOpt::Long argument - # allowing for the use of glob. E.g. - # --data_in=test.fna - # --data_in=test.fna,test2.fna - # --data_in=*.fna - # --data_in=test.fna,/dir/*.fna + # Check if any values to all arguemnts are disallowed and raise an error if so. - my ( $option, # option from GetOpt::Long + my ( $arg, # hashref + $options, # options hash ) = @_; - # Returns a list. + # Returns nothing. - my ( $elem, @files ); + my ( $val, %val_hash ); - foreach $elem ( split ",", $option ) + if ( defined $arg->{ 'disallowed' } and defined $options->{ $arg->{ 'long' } } ) { - if ( -f $elem ) { - push @files, $elem; - } elsif ( $elem =~ /\*/ ) { - push @files, glob( $elem ); + foreach $val ( split /,/, $arg->{ 'disallowed' } ) + { + if ( $options->{ $arg->{ 'long' } } eq $val ) { + Maasha::Common::error( qq(Argument to --$arg->{ 'long' } $val is disallowed) ); + } } } - - return wantarray ? @files : \@files; } +# marked for deletion - obsolete? +#sub getopt_files +#{ +# # Martin A. Hansen, November 2007. +# +# # Extracts files from an explicit GetOpt::Long argument +# # allowing for the use of glob. E.g. +# # --data_in=test.fna +# # --data_in=test.fna,test2.fna +# # --data_in=*.fna +# # --data_in=test.fna,/dir/*.fna +# +# my ( $option, # option from GetOpt::Long +# ) = @_; +# +# # Returns a list. +# +# my ( $elem, @files ); +# +# foreach $elem ( split ",", $option ) +# { +# if ( -f $elem ) { +# push @files, $elem; +# } elsif ( $elem =~ /\*/ ) { +# push @files, glob( $elem ); +# } +# } +# +# return wantarray ? @files : \@files; +#} + + sub sig_handler { # Martin A. Hansen, April 2008. @@ -6877,24 +741,34 @@ sub sig_handler # print STDERR "signal->$sig<-\n"; + my $script = Maasha::Common::get_scriptname(); + chomp $sig; sleep 1; - if ( -d $BP_TMP ) + if ( $sig =~ /MAASHA_ERROR/ ) { - if ( $sig =~ /MAASHA_ERROR/ ) { - print STDERR "\nProgram '$script' had an error" . " - Please wait for temporary data to be removed\n"; - } elsif ( $sig eq "INT" ) { - print STDERR "\nProgram '$script' interrupted (ctrl-c was pressed)" . " - Please wait for temporary data to be removed\n"; - } elsif ( $sig eq "TERM" ) { - print STDERR "\nProgram '$script' terminated (someone used kill?)" . " - Please wait for temporary data to be removed\n"; - } else { - print STDERR "\nProgram '$script' died->$sig" . " - Please wait for temporary data to be removed\n"; - } - - clean_tmp(); + print STDERR "\nProgram '$script' had an error" . " - Please wait for temporary data to be removed\n"; + status_log( "ERROR" ); + } + elsif ( $sig eq "INT" ) + { + print STDERR "\nProgram '$script' interrupted (ctrl-c was pressed)" . " - Please wait for temporary data to be removed\n"; + status_log( "INTERRUPTED" ); + } + elsif ( $sig eq "TERM" ) + { + print STDERR "\nProgram '$script' terminated (someone used kill?)" . " - Please wait for temporary data to be removed\n"; + status_log( "TERMINATED" ); } + else + { + print STDERR "\nProgram '$script' died->$sig" . " - Please wait for temporary data to be removed\n"; + status_log( "DIED" ); + } + + clean_tmp(); exit( 0 ); } @@ -6910,11 +784,11 @@ sub clean_tmp my ( $tmpdir, @dirs, $curr_pid, $dir, $user, $sid, $pid ); - $tmpdir = $ENV{ 'BP_TMP' } || Maasha::Common::error( 'No BP_TMP variable in environment.' ); + $tmpdir = bp_tmp(); $curr_pid = Maasha::Common::get_processid(); - @dirs = Maasha::Common::ls_dirs( $tmpdir ); + @dirs = Maasha::Filesys::ls_dirs( $tmpdir ); foreach $dir ( @dirs ) { @@ -6924,17 +798,19 @@ sub clean_tmp $sid = $2; $pid = $3; +# next if $user eq "maasha"; # DEBUG + if ( $user eq Maasha::Common::get_user() ) { if ( not Maasha::Common::process_running( $pid ) ) { # print STDERR "Removing stale dir: $dir\n"; - Maasha::Common::dir_remove( $dir ); + Maasha::Filesys::dir_remove( $dir ); } elsif ( $pid == $curr_pid ) { # print STDERR "Removing current dir: $dir\n"; - # Maasha::Common::dir_remove( $dir ); + Maasha::Filesys::dir_remove( $dir ); } } } @@ -6942,6 +818,94 @@ sub clean_tmp } +sub get_tmpdir +{ + # Martin A. Hansen, April 2008. + + # Create a temporary directory based on + # $ENV{ 'BP_TMP' } and sessionid. The directory + # name is written to the status file. + + # Returns a path. + + my ( $user, $sid, $pid, $script, $path, $file, $fh, $line ); + + $user = Maasha::Common::get_user(); + $sid = Maasha::Common::get_sessionid(); + $pid = Maasha::Common::get_processid(); + $script = Maasha::Common::get_scriptname(); + + $path = bp_tmp() . "/" . join( "_", $user, $sid, $pid, "bp_tmp" ); + $file = bp_tmp() . "/" . join( ".", $user, $script, $pid ) . ".status"; + + $fh = Maasha::Filesys::file_read_open( $file ); + flock($fh, 1); + $line = <$fh>; + chomp $line; + close $fh; + + $fh = Maasha::Filesys::file_write_open( $file ); + flock($fh, 2); + print $fh "$line;$path\n"; + close $fh; + + Maasha::Filesys::dir_create( $path ); + + return $path; +} + + +sub biopiecesrc +{ + # Martin A. Hansen, July 2009. + + # Read Biopiece configuration info from .biopiecesrc. + # and returns the value of a given key. + + my ( $key, # configuration key + ) = @_; + + # Returns a string. + + my ( $file, $fh, $record ); + + $file = "$ENV{ 'HOME' }/.biopiecesrc"; + + return undef if not -f $file; + + $fh = Maasha::Filesys::file_read_open( $file ); + flock($fh, 1); + $record = get_record( $fh ); + close $fh; + + if ( exists $record->{ $key } ) { + return $record->{ $key }; + } else { + return undef; + } +} + +sub bp_tmp +{ + # Martin A. Hansen, March 2013. + + # Returns the BP_TMP path. + # Errs if no BP_TMP in ENV and + # creates BP_TMP if it doesn't exists. + + my ( $path ); + + Maasha::Common::error( qq(no BP_TMP set in %ENV) ) if not -d $ENV{ 'BP_TMP' }; + + $path = $ENV{ 'BP_TMP' }; + + unless ( -d $path ) { # No BP_TMP so we create it + mkdir $path or die qq(failed to create dir "$path": $!); + } + + return $path; +} + END { clean_tmp(); @@ -6950,6 +914,7 @@ END # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + 1; __END__