1 package Maasha::BioRun;
4 # Copyright (C) 2007-2009 Martin A. Hansen.
6 # This program is free software; you can redistribute it and/or
7 # modify it under the terms of the GNU General Public License
8 # as published by the Free Software Foundation; either version 2
9 # of the License, or (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 # http://www.gnu.org/copyleft/gpl.html
23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
26 # Routines that contains Biopieces which are run.
29 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
34 use Getopt::Long qw( :config bundling );
35 use Time::HiRes qw( gettimeofday );
36 use Storable qw( dclone );
37 use Maasha::Biopieces;
46 use Maasha::Stockholm;
52 use Maasha::UCSC::BED;
53 use Maasha::UCSC::Wiggle;
62 use vars qw( @ISA @EXPORT_OK );
66 @ISA = qw( Exporter );
74 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> GLOBALS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
77 my ( $script, $BP_TMP );
79 $script = Maasha::Common::get_scriptname();
80 $BP_TMP = Maasha::Common::get_tmpdir();
83 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> RUN SCRIPT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
86 run_script( $script );
89 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SUBROUTINES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
94 # Martin A. Hansen, August 2007.
96 # Run a specific script.
98 my ( $script, # script name
103 my ( $t0, $t1, $options, $in, $out );
105 Maasha::Biopieces::log_biopiece();
107 $t0 = gettimeofday();
109 $options = get_options( $script );
111 $options->{ "SCRIPT" } = $script;
113 if ( $script ne "list_biopieces" and $script ne "list_genomes" ) {
114 $script = "print_usage" if ( -t STDIN and keys %{ $options } <= 1 or $options->{ 'help' } );
117 $in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } );
118 $out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } );
120 if ( $script eq "print_usage" ) { script_print_usage( $in, $out, $options ) }
121 elsif ( $script eq "list_biopieces" ) { script_list_biopieces( $in, $out, $options ) }
122 elsif ( $script eq "list_genomes" ) { script_list_genomes( $in, $out, $options ) }
123 elsif ( $script eq "read_fasta" ) { script_read_fasta( $in, $out, $options ) }
124 elsif ( $script eq "read_tab" ) { script_read_tab( $in, $out, $options ) }
125 elsif ( $script eq "read_psl" ) { script_read_psl( $in, $out, $options ) }
126 elsif ( $script eq "read_bed" ) { script_read_bed( $in, $out, $options ) }
127 elsif ( $script eq "read_fixedstep" ) { script_read_fixedstep( $in, $out, $options ) }
128 elsif ( $script eq "read_blast_tab" ) { script_read_blast_tab( $in, $out, $options ) }
129 elsif ( $script eq "read_embl" ) { script_read_embl( $in, $out, $options ) }
130 elsif ( $script eq "read_stockholm" ) { script_read_stockholm( $in, $out, $options ) }
131 elsif ( $script eq "read_phastcons" ) { script_read_phastcons( $in, $out, $options ) }
132 elsif ( $script eq "read_soft" ) { script_read_soft( $in, $out, $options ) }
133 elsif ( $script eq "read_gff" ) { script_read_gff( $in, $out, $options ) }
134 elsif ( $script eq "read_2bit" ) { script_read_2bit( $in, $out, $options ) }
135 elsif ( $script eq "read_solexa" ) { script_read_solexa( $in, $out, $options ) }
136 elsif ( $script eq "read_solid" ) { script_read_solid( $in, $out, $options ) }
137 elsif ( $script eq "read_mysql" ) { script_read_mysql( $in, $out, $options ) }
138 elsif ( $script eq "read_ucsc_config" ) { script_read_ucsc_config( $in, $out, $options ) }
139 elsif ( $script eq "assemble_tag_contigs" ) { script_assemble_tag_contigs( $in, $out, $options ) }
140 elsif ( $script eq "format_genome" ) { script_format_genome( $in, $out, $options ) }
141 elsif ( $script eq "length_seq" ) { script_length_seq( $in, $out, $options ) }
142 elsif ( $script eq "uppercase_seq" ) { script_uppercase_seq( $in, $out, $options ) }
143 elsif ( $script eq "shuffle_seq" ) { script_shuffle_seq( $in, $out, $options ) }
144 elsif ( $script eq "analyze_seq" ) { script_analyze_seq( $in, $out, $options ) }
145 elsif ( $script eq "analyze_tags" ) { script_analyze_tags( $in, $out, $options ) }
146 elsif ( $script eq "complexity_seq" ) { script_complexity_seq( $in, $out, $options ) }
147 elsif ( $script eq "oligo_freq" ) { script_oligo_freq( $in, $out, $options ) }
148 elsif ( $script eq "create_weight_matrix" ) { script_create_weight_matrix( $in, $out, $options ) }
149 elsif ( $script eq "calc_bit_scores" ) { script_calc_bit_scores( $in, $out, $options ) }
150 elsif ( $script eq "calc_fixedstep" ) { script_calc_fixedstep( $in, $out, $options ) }
151 elsif ( $script eq "reverse_seq" ) { script_reverse_seq( $in, $out, $options ) }
152 elsif ( $script eq "complement_seq" ) { script_complement_seq( $in, $out, $options ) }
153 elsif ( $script eq "remove_indels" ) { script_remove_indels( $in, $out, $options ) }
154 elsif ( $script eq "transliterate_seq" ) { script_transliterate_seq( $in, $out, $options ) }
155 elsif ( $script eq "transliterate_vals" ) { script_transliterate_vals( $in, $out, $options ) }
156 elsif ( $script eq "translate_seq" ) { script_translate_seq( $in, $out, $options ) }
157 elsif ( $script eq "extract_seq" ) { script_extract_seq( $in, $out, $options ) }
158 elsif ( $script eq "get_genome_seq" ) { script_get_genome_seq( $in, $out, $options ) }
159 elsif ( $script eq "get_genome_align" ) { script_get_genome_align( $in, $out, $options ) }
160 elsif ( $script eq "get_genome_phastcons" ) { script_get_genome_phastcons( $in, $out, $options ) }
161 elsif ( $script eq "fold_seq" ) { script_fold_seq( $in, $out, $options ) }
162 elsif ( $script eq "split_seq" ) { script_split_seq( $in, $out, $options ) }
163 elsif ( $script eq "split_bed" ) { script_split_bed( $in, $out, $options ) }
164 elsif ( $script eq "align_seq" ) { script_align_seq( $in, $out, $options ) }
165 elsif ( $script eq "tile_seq" ) { script_tile_seq( $in, $out, $options ) }
166 elsif ( $script eq "invert_align" ) { script_invert_align( $in, $out, $options ) }
167 elsif ( $script eq "patscan_seq" ) { script_patscan_seq( $in, $out, $options ) }
168 elsif ( $script eq "create_blast_db" ) { script_create_blast_db( $in, $out, $options ) }
169 elsif ( $script eq "blast_seq" ) { script_blast_seq( $in, $out, $options ) }
170 elsif ( $script eq "blat_seq" ) { script_blat_seq( $in, $out, $options ) }
171 elsif ( $script eq "soap_seq" ) { script_soap_seq( $in, $out, $options ) }
172 elsif ( $script eq "match_seq" ) { script_match_seq( $in, $out, $options ) }
173 elsif ( $script eq "create_vmatch_index" ) { script_create_vmatch_index( $in, $out, $options ) }
174 elsif ( $script eq "vmatch_seq" ) { script_vmatch_seq( $in, $out, $options ) }
175 elsif ( $script eq "write_fasta" ) { script_write_fasta( $in, $out, $options ) }
176 elsif ( $script eq "write_align" ) { script_write_align( $in, $out, $options ) }
177 elsif ( $script eq "write_blast" ) { script_write_blast( $in, $out, $options ) }
178 elsif ( $script eq "write_tab" ) { script_write_tab( $in, $out, $options ) }
179 elsif ( $script eq "write_bed" ) { script_write_bed( $in, $out, $options ) }
180 elsif ( $script eq "write_psl" ) { script_write_psl( $in, $out, $options ) }
181 elsif ( $script eq "write_fixedstep" ) { script_write_fixedstep( $in, $out, $options ) }
182 elsif ( $script eq "write_2bit" ) { script_write_2bit( $in, $out, $options ) }
183 elsif ( $script eq "write_solid" ) { script_write_solid( $in, $out, $options ) }
184 elsif ( $script eq "write_ucsc_config" ) { script_write_ucsc_config( $in, $out, $options ) }
185 elsif ( $script eq "head_records" ) { script_head_records( $in, $out, $options ) }
186 elsif ( $script eq "remove_keys" ) { script_remove_keys( $in, $out, $options ) }
187 elsif ( $script eq "remove_adaptor" ) { script_remove_adaptor( $in, $out, $options ) }
188 elsif ( $script eq "remove_mysql_tables" ) { script_remove_mysql_tables( $in, $out, $options ) }
189 elsif ( $script eq "remove_ucsc_tracks" ) { script_remove_ucsc_tracks( $in, $out, $options ) }
190 elsif ( $script eq "rename_keys" ) { script_rename_keys( $in, $out, $options ) }
191 elsif ( $script eq "uniq_vals" ) { script_uniq_vals( $in, $out, $options ) }
192 elsif ( $script eq "merge_vals" ) { script_merge_vals( $in, $out, $options ) }
193 elsif ( $script eq "merge_records" ) { script_merge_records( $in, $out, $options ) }
194 elsif ( $script eq "grab" ) { script_grab( $in, $out, $options ) }
195 elsif ( $script eq "compute" ) { script_compute( $in, $out, $options ) }
196 elsif ( $script eq "flip_tab" ) { script_flip_tab( $in, $out, $options ) }
197 elsif ( $script eq "add_ident" ) { script_add_ident( $in, $out, $options ) }
198 elsif ( $script eq "count_records" ) { script_count_records( $in, $out, $options ) }
199 elsif ( $script eq "random_records" ) { script_random_records( $in, $out, $options ) }
200 elsif ( $script eq "sort_records" ) { script_sort_records( $in, $out, $options ) }
201 elsif ( $script eq "count_vals" ) { script_count_vals( $in, $out, $options ) }
202 elsif ( $script eq "plot_histogram" ) { script_plot_histogram( $in, $out, $options ) }
203 elsif ( $script eq "plot_lendist" ) { script_plot_lendist( $in, $out, $options ) }
204 elsif ( $script eq "plot_chrdist" ) { script_plot_chrdist( $in, $out, $options ) }
205 elsif ( $script eq "plot_karyogram" ) { script_plot_karyogram( $in, $out, $options ) }
206 elsif ( $script eq "plot_matches" ) { script_plot_matches( $in, $out, $options ) }
207 elsif ( $script eq "plot_seqlogo" ) { script_plot_seqlogo( $in, $out, $options ) }
208 elsif ( $script eq "plot_phastcons_profiles" ) { script_plot_phastcons_profiles( $in, $out, $options ) }
209 elsif ( $script eq "analyze_bed" ) { script_analyze_bed( $in, $out, $options ) }
210 elsif ( $script eq "analyze_vals" ) { script_analyze_vals( $in, $out, $options ) }
211 elsif ( $script eq "length_vals" ) { script_length_vals( $in, $out, $options ) }
212 elsif ( $script eq "sum_vals" ) { script_sum_vals( $in, $out, $options ) }
213 elsif ( $script eq "mean_vals" ) { script_mean_vals( $in, $out, $options ) }
214 elsif ( $script eq "median_vals" ) { script_median_vals( $in, $out, $options ) }
215 elsif ( $script eq "max_vals" ) { script_max_vals( $in, $out, $options ) }
216 elsif ( $script eq "min_vals" ) { script_min_vals( $in, $out, $options ) }
217 elsif ( $script eq "upload_to_ucsc" ) { script_upload_to_ucsc( $in, $out, $options ) }
219 close $in if defined $in;
222 $t1 = gettimeofday();
224 print STDERR "Program: $script" . ( " " x ( 25 - length( $script ) ) ) . sprintf( "Run time: %.4f\n", ( $t1 - $t0 ) ) if $options->{ 'verbose' };
230 # Martin A. Hansen, February 2008.
232 # Gets options from commandline and checks these vigerously.
234 my ( $script, # name of script
239 my ( %options, @options, $opt, @genomes, $real );
241 if ( $script eq "print_usage" )
247 elsif ( $script eq "read_fasta" )
254 elsif ( $script eq "read_tab" )
265 elsif ( $script eq "read_psl" )
272 elsif ( $script eq "read_bed" )
281 elsif ( $script eq "read_fixedstep" )
288 elsif ( $script eq "read_blast_tab" )
295 elsif ( $script eq "read_embl" )
305 elsif ( $script eq "read_stockholm" )
312 elsif ( $script eq "read_phastcons" )
323 elsif ( $script eq "read_soft" )
331 elsif ( $script eq "read_gff" )
338 elsif ( $script eq "read_2bit" )
346 elsif ( $script eq "read_solexa" )
355 elsif ( $script eq "read_solid" )
363 elsif ( $script eq "read_mysql" )
372 elsif ( $script eq "read_ucsc_config" )
379 elsif ( $script eq "assemble_tag_contigs" )
385 elsif ( $script eq "calc_fixedstep" )
391 elsif ( $script eq "format_genome" )
400 elsif ( $script eq "length_seq" )
407 elsif ( $script eq "oligo_freq" )
414 elsif ( $script eq "create_weight_matrix" )
420 elsif ( $script eq "transliterate_seq" )
428 elsif ( $script eq "transliterate_vals" )
437 elsif ( $script eq "translate_seq" )
443 elsif ( $script eq "extract_seq" )
451 elsif ( $script eq "get_genome_seq" )
464 elsif ( $script eq "get_genome_align" )
475 elsif ( $script eq "get_genome_phastcons" )
486 elsif ( $script eq "split_seq" )
493 elsif ( $script eq "split_bed" )
500 elsif ( $script eq "tile_seq" )
507 elsif ( $script eq "invert_align" )
513 elsif ( $script eq "patscan_seq" )
524 elsif ( $script eq "create_blast_db" )
531 elsif ( $script eq "blast_seq" )
543 elsif ( $script eq "blat_seq" )
555 elsif ( $script eq "soap_seq" )
566 elsif ( $script eq "match_seq" )
573 elsif ( $script eq "create_vmatch_index" )
581 elsif ( $script eq "vmatch_seq" )
592 elsif ( $script eq "write_fasta" )
601 elsif ( $script eq "write_align" )
611 elsif ( $script eq "write_blast" )
620 elsif ( $script eq "write_tab" )
632 elsif ( $script eq "write_bed" )
642 elsif ( $script eq "write_psl" )
650 elsif ( $script eq "write_fixedstep" )
658 elsif ( $script eq "write_2bit" )
666 elsif ( $script eq "write_solid" )
675 elsif ( $script eq "write_ucsc_config" )
682 elsif ( $script eq "plot_seqlogo" )
689 elsif ( $script eq "plot_phastcons_profiles" )
704 elsif ( $script eq "analyze_vals" )
711 elsif ( $script eq "head_records" )
717 elsif ( $script eq "remove_keys" )
724 elsif ( $script eq "remove_adaptor" )
733 elsif ( $script eq "remove_mysql_tables" )
744 elsif ( $script eq "remove_ucsc_tracks" )
756 elsif ( $script eq "rename_keys" )
762 elsif ( $script eq "uniq_vals" )
769 elsif ( $script eq "merge_vals" )
776 elsif ( $script eq "merge_records" )
783 elsif ( $script eq "grab" )
798 elsif ( $script eq "compute" )
804 elsif ( $script eq "add_ident" )
811 elsif ( $script eq "count_records" )
818 elsif ( $script eq "random_records" )
824 elsif ( $script eq "sort_records" )
831 elsif ( $script eq "count_vals" )
837 elsif ( $script eq "plot_histogram" )
850 elsif ( $script eq "plot_lendist" )
862 elsif ( $script eq "plot_chrdist" )
873 elsif ( $script eq "plot_karyogram" )
882 elsif ( $script eq "plot_matches" )
894 elsif ( $script eq "length_vals" )
900 elsif ( $script eq "sum_vals" )
908 elsif ( $script eq "mean_vals" )
916 elsif ( $script eq "median_vals" )
924 elsif ( $script eq "max_vals" )
932 elsif ( $script eq "min_vals" )
940 elsif ( $script eq "upload_to_ucsc" )
964 # print STDERR Dumper( \@options );
971 # print STDERR Dumper( \%options );
973 if ( -t STDIN && scalar( keys %options ) == 0 or $options{ "help" } ) {
974 return wantarray ? %options : \%options;
977 $options{ "cols" } = [ split ",", $options{ "cols" } ] if defined $options{ "cols" };
978 $options{ "keys" } = [ split ",", $options{ "keys" } ] if defined $options{ "keys" };
979 $options{ "no_keys" } = [ split ",", $options{ "no_keys" } ] if defined $options{ "no_keys" };
980 $options{ "save_keys" } = [ split ",", $options{ "save_keys" } ] if defined $options{ "save_keys" };
981 $options{ "quals" } = [ split ",", $options{ "quals" } ] if defined $options{ "quals" };
982 $options{ "feats" } = [ split ",", $options{ "feats" } ] if defined $options{ "feats" };
983 $options{ "frames" } = [ split ",", $options{ "frames" } ] if defined $options{ "frames" };
984 $options{ "formats" } = [ split ",", $options{ "formats" } ] if defined $options{ "formats" };
985 $options{ "samples" } = [ split ",", $options{ "samples" } ] if defined $options{ "samples" };
986 $options{ "tables" } = [ split ",", $options{ "tables" } ] if defined $options{ "tables" };
987 $options{ "tracks" } = [ split ",", $options{ "tracks" } ] if defined $options{ "tracks" };
989 # ---- check arguments ----
991 if ( $options{ 'data_in' } )
993 $options{ "files" } = Maasha::Biopieces::getopt_files( $options{ 'data_in' } );
995 Maasha::Common::error( qq(Argument to --data_in must be a valid file or fileglob expression) ) if scalar @{ $options{ "files" } } == 0;
998 map { Maasha::Common::error( qq(Argument to --cols must be a whole numbers - not "$_") ) if $_ !~ /^\d+$/ } @{ $options{ "cols" } } if $options{ "cols" };
1000 # print STDERR Dumper( \%options );
1002 $real = "beg|end|word_size|wrap|tile_size|len|prefix_length|mismatches|offset|num|skip|cpus|window_size|step_size";
1004 foreach $opt ( keys %options )
1006 if ( $opt =~ /stream_in|pattern_in|exact_in/ and not -f $options{ $opt } )
1008 Maasha::Common::error( qq(Argument to --$opt must be a valid file or fileglob expression - not "$options{ $opt }") );
1010 elsif ( $opt =~ /$real/ and $options{ $opt } !~ /^\d+$/ )
1012 Maasha::Common::error( qq(Argument to --$opt must be a whole number - not "$options{ $opt }") );
1014 elsif ( $opt =~ /max_hits|max_hits|max_misses|dist|edit_dist|flank|gap|hamming_dist|priority/ and $options{ $opt } !~ /^-?\d+$/ )
1016 Maasha::Common::error( qq(Argument to --$opt must be an integer - not "$options{ $opt }") );
1018 elsif ( $opt =~ /identity|threshold/ and $options{ $opt } !~ /^-?(?:\d+(?:\.\d*)?|\.\d+)$/ )
1020 Maasha::Common::error( qq(Argument to --$opt must be a decimal number - not "$options{ $opt }") );
1022 elsif ( $opt =~ /e_val/ and $options{ $opt } !~ /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/ )
1024 Maasha::Common::error( qq(Argument to --$opt must be a float - not "$options{ $opt }") );
1026 elsif ( $opt =~ /strand/ and $options{ $opt } !~ /^(\+|-)$/ )
1028 Maasha::Common::error( qq(Argument to --$opt must be "+" or "-" - not "$options{ $opt }") );
1030 elsif ( $opt eq "genome" and $script ne "format_genome" )
1032 @genomes = Maasha::Common::ls_dirs( "$ENV{ 'BP_DATA' }/genomes" );
1033 map { $_ =~ s/.*\/(.+)$/$1/ } @genomes;
1035 if ( not grep { $_ =~ /^$options{ $opt }$/ } @genomes ) {
1036 Maasha::Common::error( qq(Genome $options{ $opt } not found in "$ENV{ 'BP_DATA' }/genomes/") );
1039 elsif ( $opt eq "terminal" and not $options{ $opt } =~ /^(svg|post|dumb|x11)/ )
1041 Maasha::Common::error( qq(Bad --$opt argument "$options{ $opt }") );
1043 elsif ( $opt eq "table" and $options{ $opt } =~ /(-|\.)/ )
1045 Maasha::Common::error( qq(Character '$1' is not allowed in table name: $options{ $opt }) );
1047 elsif ( $opt eq "merge" and $options{ $opt } !~ /^(AandB|AorB|BorA|AnotB|BnotA)$/ )
1049 Maasha::Common::error( qq(Argument to --$opt must be AandB, AorB, BorA, AnotB, or BnotA - not "$options{ $opt }") );
1051 elsif ( $opt eq "format" and $script eq "read_solexa" and $options{ $opt } !~ /octal|decimal/ )
1053 Maasha::Common::error( qq(Argument to --$opt must be octal or decimal - not "$options{ $opt }") );
1055 elsif ( $opt eq "remove" and $script eq "remove_adaptor" and $options{ $opt } !~ /before|after|skip/ )
1057 Maasha::Common::error( qq(Argument to --$opt must be before, after, or skip - not "$options{ $opt }") );
1061 Maasha::Common::error( qq(no --database specified) ) if $script eq "remove_ucsc_tracks" and not $options{ "database" };
1062 Maasha::Common::error( qq(no --database specified) ) if $script eq "create_blast_db" and not $options{ "database" };
1063 Maasha::Common::error( qq(no --index_name specified) ) if $script =~ /create_vmatch_index/ and not $options{ "index_name" };
1064 Maasha::Common::error( qq(no --database or --genome specified) ) if $script eq "blast_seq" and not $options{ "genome" } and not $options{ "database" };
1065 Maasha::Common::error( qq(both --database and --genome specified) ) if $script eq "blast_seq" and $options{ "genome" } and $options{ "database" };
1066 Maasha::Common::error( qq(no --index_name or --genome specified) ) if $script eq "vmatch_seq" and not $options{ "genome" } and not $options{ "index_name" };
1067 Maasha::Common::error( qq(both --index and --genome specified) ) if $script eq "vmatch_seq" and $options{ "genome" } and $options{ "index_name" };
1068 Maasha::Common::error( qq(no --in_file or --genome specified) ) if $script eq "soap_seq" and not $options{ "genome" } and not $options{ "in_file" };
1069 Maasha::Common::error( qq(both --in_file and --genome specified) ) if $script eq "soap_seq" and $options{ "genome" } and $options{ "in_file" };
1070 Maasha::Common::error( qq(no --genome specified) ) if $script =~ /format_genome|get_genome_seq|get_genome_align|get_genome_phastcons|blat_seq|plot_phastcons_profiles|plot_karyogram/ and not $options{ "genome" };
1071 Maasha::Common::error( qq(no --key specified) ) if $script =~ /plot_lendist|plot_histogram/ and not $options{ "key" };
1072 Maasha::Common::error( qq(no --keys speficied) ) if $script =~ /sort_records|count_vals|sum_vals|mean_vals|median_vals|length_vals/ and not $options{ "keys" };
1074 if ( $script eq "upload_to_ucsc" )
1076 Maasha::Common::error( qq(no --database specified) ) if not $options{ "database" };
1077 Maasha::Common::error( qq(no --table specified) ) if not $options{ "table" };
1080 return wantarray ? %options : \%options;
1084 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SCRIPTS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1087 sub script_print_usage
1089 # Martin A. Hansen, January 2008.
1091 # Retrieves usage information from file and
1092 # prints this nicely formatted.
1094 my ( $in, # handle to in stream
1095 $out, # handle to out stream
1096 $options, # options hash
1101 my ( $file, $wiki, $lines );
1103 if ( $options->{ 'data_in' } ) {
1104 $file = $options->{ 'data_in' };
1106 $file = join "", $ENV{ 'BP_DIR' }, "/bp_usage/", $options->{ 'SCRIPT' }, ".wiki";
1109 $wiki = Maasha::Gwiki::gwiki_read( $file );
1111 ( $wiki->[ 2 ], $wiki->[ 3 ], $wiki->[ 0 ], $wiki->[ 1 ] ) = ( $wiki->[ 0 ], $wiki->[ 1 ], $wiki->[ 2 ], $wiki->[ 3 ] );
1113 if ( not $options->{ "help" } ) {
1114 @{ $wiki } = grep { $_->[ 0 ]->{ 'SECTION' } =~ /Biopiece|summary|Usage|Options|Help/ } @{ $wiki };
1117 $lines = Maasha::Gwiki::gwiki2ascii( $wiki );
1119 print STDERR "$_\n" foreach @{ $lines };
1125 sub script_list_biopieces
1127 # Martin A. Hansen, January 2008.
1129 # Prints the summary from the usage for each of the biopieces.
1131 my ( $in, # handle to in stream
1132 $out, # handle to out stream
1133 $options, # options hash
1138 my ( @files, $file, $wiki, $program, $summary );
1140 @files = Maasha::Common::ls_files( "$ENV{ 'BP_DIR' }/bp_usage" );
1142 foreach $file ( sort @files )
1144 if ( $file =~ /\/([a-z0-9_]+)\.wiki$/ )
1148 $wiki = Maasha::Gwiki::gwiki_read( $file );
1150 $summary = $wiki->[ 0 ]->[ 0 ]->{ 'TEXT' };
1151 $summary =~ s/^#summary\s+//;
1153 printf( "%-30s%s\n", $program, $summary );
1161 sub script_list_genomes
1163 # Martin A. Hansen, January 2008.
1165 # Prints the synopsis from the usage for each of the biopieces.
1167 my ( $in, # handle to in stream
1168 $out, # handle to out stream
1169 $options, # options hash
1174 my ( @genomes, $genome, @formats, $format, %hash, %found, @row );
1176 @genomes = Maasha::Common::ls_dirs( "$ENV{ 'BP_DATA' }/genomes" );
1178 foreach $genome ( @genomes )
1180 next if $genome =~ /\.$/;
1182 @formats = Maasha::Common::ls_dirs( $genome );
1184 foreach $format ( @formats )
1186 if ( $format =~ /\/([^\/]+)\/(\w+)$/ )
1188 $hash{ $1 }{ $2 } = 1;
1197 map { push @row, $_ } sort keys %found;
1199 print join( "\t", @row ), "\n";
1201 foreach $genome ( sort keys %hash )
1205 foreach $format ( sort keys %found )
1207 if ( exists $hash{ $genome }{ $format } ) {
1214 print join( "\t", @row ), "\n";
1219 sub script_read_fasta
1221 # Martin A. Hansen, August 2007.
1223 # Read sequences from FASTA file.
1225 my ( $in, # handle to in stream
1226 $out, # handle to out stream
1227 $options, # options hash
1232 my ( $record, $file, $data_in, $entry, $num );
1234 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1235 Maasha::Biopieces::put_record( $record, $out );
1240 foreach $file ( @{ $options->{ "files" } } )
1242 $data_in = Maasha::Common::read_open( $file );
1244 while ( $entry = Maasha::Fasta::get_entry( $data_in ) )
1246 if ( defined $entry->[ SEQ_NAME ] and $entry->[ SEQ ] )
1249 SEQ_NAME => $entry->[ SEQ_NAME ],
1250 SEQ => $entry->[ SEQ ],
1251 SEQ_LEN => length $entry->[ SEQ ],
1254 Maasha::Biopieces::put_record( $record, $out );
1257 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1267 close $data_in if $data_in;
1273 # Martin A. Hansen, August 2007.
1275 # Read table or table columns from stream or file.
1277 my ( $in, # handle to in stream
1278 $out, # handle to out stream
1279 $options, # options hash
1284 my ( $file, $line, @fields, @fields2, $i, $record, $data_in, $skip, $num );
1286 $options->{ 'delimit' } ||= '\s+';
1288 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1289 Maasha::Biopieces::put_record( $record, $out );
1292 $skip = $options->{ 'skip' } ||= 0;
1295 foreach $file ( @{ $options->{ "files" } } )
1297 $data_in = Maasha::Common::read_open( $file );
1299 while ( $line = <$data_in> )
1307 next if $line =~ /^#|^$/;
1314 @fields = split /$options->{'delimit'}/, $line;
1316 if ( $options->{ "cols" } ) {
1317 map { push @fields2, $fields[ $_ ] } @{ $options->{ "cols" } };
1322 for ( $i = 0; $i < @fields2; $i++ )
1324 if ( $options->{ "keys" }->[ $i ] ) {
1325 $record->{ $options->{ "keys" }->[ $i ] } = $fields2[ $i ];
1327 $record->{ "V" . $i } = $fields2[ $i ];
1331 Maasha::Biopieces::put_record( $record, $out );
1333 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1343 close $data_in if $data_in;
1349 # Martin A. Hansen, August 2007.
1351 # Read psl table from stream or file.
1353 my ( $in, # handle to in stream
1354 $out, # handle to out stream
1355 $options, # options hash
1360 my ( $record, $file, $data_in, $num );
1362 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1363 Maasha::Biopieces::put_record( $record, $out );
1368 foreach $file ( @{ $options->{ "files" } } )
1370 $data_in = Maasha::Common::read_open( $file );
1372 while ( $record = Maasha::UCSC::psl_get_entry( $data_in ) )
1374 Maasha::Biopieces::put_record( $record, $out );
1376 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1388 # Martin A. Hansen, August 2007.
1390 # Read bed table from stream or file.
1392 my ( $in, # handle to in stream
1393 $out, # handle to out stream
1394 $options, # options hash
1399 my ( $cols, $file, $record, $bed_entry, $data_in, $num );
1401 $cols = $options->{ 'cols' }->[ 0 ];
1403 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1404 Maasha::Biopieces::put_record( $record, $out );
1409 foreach $file ( @{ $options->{ "files" } } )
1411 $data_in = Maasha::Common::read_open( $file );
1413 while ( $bed_entry = Maasha::UCSC::BED::bed_entry_get( $data_in, $cols, $options->{ 'check' } ) )
1415 $record = Maasha::UCSC::BED::bed2biopiece( $bed_entry );
1417 Maasha::Biopieces::put_record( $record, $out );
1419 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1429 close $data_in if $data_in;
1433 sub script_read_fixedstep
1435 # Martin A. Hansen, Juli 2008.
1437 # Read fixedstep wiggle format from stream or file.
1439 my ( $in, # handle to in stream
1440 $out, # handle to out stream
1441 $options, # options hash
1446 my ( $file, $record, $entry, $data_in, $num );
1448 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1449 Maasha::Biopieces::put_record( $record, $out );
1454 foreach $file ( @{ $options->{ "files" } } )
1456 $data_in = Maasha::Common::read_open( $file );
1458 while ( $entry = Maasha::UCSC::Wiggle::fixedstep_entry_get( $data_in ) )
1460 $record = Maasha::UCSC::Wiggle::fixedstep2biopiece( $entry );
1462 Maasha::Biopieces::put_record( $record, $out );
1464 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1474 close $data_in if $data_in;
1478 sub script_read_blast_tab
1480 # Martin A. Hansen, September 2007.
1482 # Read tabular BLAST output from NCBI blast run with -m8 or -m9.
1484 my ( $in, # handle to in stream
1485 $out, # handle to out stream
1486 $options, # options hash
1491 my ( $file, $line, @fields, $strand, $record, $data_in, $num );
1493 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1494 Maasha::Biopieces::put_record( $record, $out );
1499 foreach $file ( @{ $options->{ "files" } } )
1501 $data_in = Maasha::Common::read_open( $file );
1503 while ( $line = <$data_in> )
1507 next if $line =~ /^#/;
1509 @fields = split /\t/, $line;
1511 $record->{ "REC_TYPE" } = "BLAST";
1512 $record->{ "Q_ID" } = $fields[ 0 ];
1513 $record->{ "S_ID" } = $fields[ 1 ];
1514 $record->{ "IDENT" } = $fields[ 2 ];
1515 $record->{ "ALIGN_LEN" } = $fields[ 3 ];
1516 $record->{ "MISMATCHES" } = $fields[ 4 ];
1517 $record->{ "GAPS" } = $fields[ 5 ];
1518 $record->{ "Q_BEG" } = $fields[ 6 ] - 1; # BLAST is 1-based
1519 $record->{ "Q_END" } = $fields[ 7 ] - 1; # BLAST is 1-based
1520 $record->{ "S_BEG" } = $fields[ 8 ] - 1; # BLAST is 1-based
1521 $record->{ "S_END" } = $fields[ 9 ] - 1; # BLAST is 1-based
1522 $record->{ "E_VAL" } = $fields[ 10 ];
1523 $record->{ "BIT_SCORE" } = $fields[ 11 ];
1525 if ( $record->{ "S_BEG" } > $record->{ "S_END" } )
1527 $record->{ "STRAND" } = '-';
1529 ( $record->{ "S_BEG" }, $record->{ "S_END" } ) = ( $record->{ "S_END" }, $record->{ "S_BEG" } );
1533 $record->{ "STRAND" } = '+';
1536 Maasha::Biopieces::put_record( $record, $out );
1538 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1548 close $data_in if $data_in;
1552 sub script_read_embl
1554 # Martin A. Hansen, August 2007.
1558 my ( $in, # handle to in stream
1559 $out, # handle to out stream
1560 $options, # options hash
1565 my ( %options2, $file, $data_in, $num, $entry, $record );
1567 map { $options2{ "keys" }{ $_ } = 1 } @{ $options->{ "keys" } };
1568 map { $options2{ "feats" }{ $_ } = 1 } @{ $options->{ "feats" } };
1569 map { $options2{ "quals" }{ $_ } = 1 } @{ $options->{ "quals" } };
1571 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1572 Maasha::Biopieces::put_record( $record, $out );
1577 foreach $file ( @{ $options->{ "files" } } )
1579 $data_in = Maasha::Common::read_open( $file );
1581 while ( $entry = Maasha::EMBL::get_embl_entry( $data_in ) )
1583 $record = Maasha::EMBL::parse_embl_entry( $entry, \%options2 );
1585 my ( $feat, $feat2, $qual, $qual_val, $record_copy );
1587 $record_copy = dclone $record;
1589 delete $record_copy->{ "FT" };
1591 Maasha::Biopieces::put_record( $record_copy, $out );
1593 delete $record_copy->{ "SEQ" };
1595 foreach $feat ( keys %{ $record->{ "FT" } } )
1597 $record_copy->{ "FEAT_TYPE" } = $feat;
1599 foreach $feat2 ( @{ $record->{ "FT" }->{ $feat } } )
1601 foreach $qual ( keys %{ $feat2 } )
1603 $qual_val = join "; ", @{ $feat2->{ $qual } };
1608 $record_copy->{ $qual } = $qual_val;
1611 Maasha::Biopieces::put_record( $record_copy, $out );
1615 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1625 close $data_in if $data_in;
1629 sub script_read_stockholm
1631 # Martin A. Hansen, August 2007.
1633 # Read Stockholm format.
1635 my ( $in, # handle to in stream
1636 $out, # handle to out stream
1637 $options, # options hash
1642 my ( $data_in, $file, $num, $entry, $record, $record_anno, $record_align, $key, $seq );
1644 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1645 Maasha::Biopieces::put_record( $record, $out );
1650 foreach $file ( @{ $options->{ "files" } } )
1652 $data_in = Maasha::Common::read_open( $file );
1654 while ( $entry = Maasha::Stockholm::get_stockholm_entry( $data_in ) )
1656 $record = Maasha::Stockholm::parse_stockholm_entry( $entry );
1660 foreach $key ( keys %{ $record->{ "GF" } } ) {
1661 $record_anno->{ $key } = $record->{ "GF" }->{ $key };
1664 $record_anno->{ "ALIGN" } = $num;
1666 Maasha::Biopieces::put_record( $record_anno, $out );
1668 foreach $seq ( @{ $record->{ "ALIGN" } } )
1670 undef $record_align;
1673 SEQ_NAME => $seq->[ 0 ],
1677 Maasha::Biopieces::put_record( $record_align, $out );
1680 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1690 close $data_in if $data_in;
1694 sub script_read_phastcons
1696 # Martin A. Hansen, December 2007.
1698 # Read PhastCons format.
1700 my ( $in, # handle to in stream
1701 $out, # handle to out stream
1702 $options, # options hash
1707 my ( $data_in, $file, $num, $entry, @records, $record );
1709 $options->{ "min" } ||= 10;
1710 $options->{ "dist" } ||= 25;
1711 $options->{ "threshold" } ||= 0.8;
1712 $options->{ "gap" } ||= 5;
1714 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1715 Maasha::Biopieces::put_record( $record, $out );
1720 foreach $file ( @{ $options->{ "files" } } )
1722 $data_in = Maasha::Common::read_open( $file );
1724 while ( $entry = Maasha::UCSC::fixedstep_get_entry( $data_in ) )
1726 @records = Maasha::UCSC::phastcons_parse_entry( $entry, $options );
1728 foreach $record ( @records )
1730 $record->{ "REC_TYPE" } = "BED";
1731 $record->{ "BED_LEN" } = $record->{ "CHR_END" } - $record->{ "CHR_BEG" } + 1;
1733 Maasha::Biopieces::put_record( $record, $out );
1735 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1746 close $data_in if $data_in;
1750 sub script_read_soft
1752 # Martin A. Hansen, December 2007.
1755 # http://www.ncbi.nlm.nih.gov/geo/info/soft2.html
1757 my ( $in, # handle to in stream
1758 $out, # handle to out stream
1759 $options, # options hash
1764 my ( $data_in, $file, $num, $records, $record, $soft_index, $fh, @platforms, $plat_table, @samples, $sample, $old_end, $skip );
1766 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1767 Maasha::Biopieces::put_record( $record, $out );
1772 foreach $file ( @{ $options->{ "files" } } )
1774 print STDERR "Creating index for file: $file\n" if $options->{ "verbose" };
1776 $soft_index = Maasha::NCBI::soft_index_file( $file );
1778 $fh = Maasha::Common::read_open( $file );
1780 @platforms = grep { $_->{ "SECTION" } =~ /PLATFORM/ } @{ $soft_index };
1782 print STDERR "Getting platform tables for file: $file\n" if $options->{ "verbose" };
1784 $plat_table = Maasha::NCBI::soft_get_platform( $fh, $platforms[ 0 ]->{ "LINE_BEG" }, $platforms[ -1 ]->{ "LINE_END" } );
1786 @samples = grep { $_->{ "SECTION" } =~ /SAMPLE/ } @{ $soft_index };
1788 $old_end = $platforms[ -1 ]->{ "LINE_END" };
1790 foreach $sample ( @samples )
1793 $skip = 1 if ( $options->{ "samples" } and grep { $sample->{ "SECTION" } !~ /$_/ } @{ $options->{ "samples" } } );
1795 print STDERR "Getting samples for dataset: $sample->{ 'SECTION' }\n" if $options->{ "verbose" } and not $skip;
1797 $records = Maasha::NCBI::soft_get_sample( $fh, $plat_table, $sample->{ "LINE_BEG" } - $old_end - 1, $sample->{ "LINE_END" } - $old_end - 1, $skip );
1799 foreach $record ( @{ $records } )
1801 Maasha::Biopieces::put_record( $record, $out );
1803 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1808 $old_end = $sample->{ "LINE_END" };
1816 close $data_in if $data_in;
1823 # Martin A. Hansen, February 2008.
1826 # http://www.ncbi.nlm.nih.gov/geo/info/soft2.html
1828 my ( $in, # handle to in stream
1829 $out, # handle to out stream
1830 $options, # options hash
1835 my ( $data_in, $file, $fh, $num, $record, $entry );
1837 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1838 Maasha::Biopieces::put_record( $record, $out );
1843 foreach $file ( @{ $options->{ "files" } } )
1845 $fh = Maasha::Common::read_open( $file );
1847 while ( $entry = Maasha::GFF::get_entry( $fh ) )
1849 Maasha::Biopieces::put_record( $entry, $out );
1851 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1861 close $data_in if $data_in;
1865 sub script_read_2bit
1867 # Martin A. Hansen, March 2008.
1869 # Read sequences from 2bit file.
1871 my ( $in, # handle to in stream
1872 $out, # handle to out stream
1873 $options, # options hash
1878 my ( $record, $file, $data_in, $mask, $toc, $line, $num );
1880 $mask = 1 if not $options->{ "no_mask" };
1882 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1883 Maasha::Biopieces::put_record( $record, $out );
1888 foreach $file ( @{ $options->{ "files" } } )
1890 $data_in = Maasha::Common::read_open( $file );
1892 $toc = Maasha::TwoBit::twobit_get_TOC( $data_in );
1894 foreach $line ( @{ $toc } )
1896 $record->{ "SEQ_NAME" } = $line->[ 0 ];
1897 $record->{ "SEQ" } = Maasha::TwoBit::twobit_get_seq( $data_in, $line->[ 1 ], undef, undef, $mask );
1898 $record->{ "SEQ_LEN" } = length $record->{ "SEQ" };
1900 Maasha::Biopieces::put_record( $record, $out );
1902 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1912 close $data_in if $data_in;
1916 sub script_read_solexa
1918 # Martin A. Hansen, March 2008.
1920 # Read Solexa sequence reads from file.
1922 my ( $in, # handle to in stream
1923 $out, # handle to out stream
1924 $options, # options hash
1929 my ( $record, $file, $data_in, $entry, $num, @seqs, @scores, $i );
1931 $options->{ "format" } ||= "octal";
1932 $options->{ "quality" } ||= 20;
1934 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1935 Maasha::Biopieces::put_record( $record, $out );
1940 foreach $file ( @{ $options->{ "files" } } )
1942 $data_in = Maasha::Common::read_open( $file );
1944 if ( $options->{ "format" } eq "octal" )
1946 while ( $entry = Maasha::Solexa::solexa_get_entry_octal( $data_in ) )
1948 $record = Maasha::Solexa::solexa2biopiece( $entry, $options->{ "quality" } );
1950 Maasha::Biopieces::put_record( $record, $out );
1952 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1959 while ( $entry = Maasha::Solexa::solexa_get_entry_decimal( $data_in ) )
1961 $record = Maasha::Solexa::solexa2biopiece( $entry, $options->{ "quality" } );
1963 Maasha::Biopieces::put_record( $record, $out );
1965 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1976 close $data_in if $data_in;
1980 sub script_read_solid
1982 # Martin A. Hansen, April 2008.
1984 # Read Solid sequence from file.
1986 my ( $in, # handle to in stream
1987 $out, # handle to out stream
1988 $options, # options hash
1993 my ( $record, $file, $data_in, $line, $num, $seq_name, $seq_cs, $seq_qual, @scores, @seqs, $i );
1995 $options->{ "quality" } ||= 15;
1997 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1998 Maasha::Biopieces::put_record( $record, $out );
2003 foreach $file ( @{ $options->{ "files" } } )
2005 $data_in = Maasha::Common::read_open( $file );
2007 while ( $line = <$data_in> )
2011 ( $seq_name, $seq_cs, $seq_qual ) = split /\t/, $line;
2013 @scores = split /,/, $seq_qual;
2014 @seqs = split //, Maasha::Solid::color_space2seq( $seq_cs );
2016 for ( $i = 0; $i < @seqs; $i++ ) {
2017 $seqs[ $i ] = lc $seqs[ $i ] if $scores[ $i ] < $options->{ "quality" };
2021 REC_TYPE => 'SOLID',
2022 SEQ_NAME => $seq_name,
2024 SEQ_QUAL => join( ";", @scores ),
2025 SEQ_LEN => length $seq_cs,
2026 SEQ => join( "", @seqs ),
2027 SCORE_MEAN => sprintf( "%.2f", Maasha::Calc::mean( \@scores ) ),
2030 Maasha::Biopieces::put_record( $record, $out );
2032 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
2042 close $data_in if $data_in;
2046 sub script_read_mysql
2048 # Martin A. Hansen, May 2008.
2050 # Read a MySQL query into stream.
2052 my ( $in, # handle to in stream
2053 $out, # handle to out stream
2054 $options, # options hash
2059 my ( $record, $dbh, $results );
2061 $options->{ "user" } ||= Maasha::UCSC::ucsc_get_user();
2062 $options->{ "password" } ||= Maasha::UCSC::ucsc_get_password();
2064 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
2065 Maasha::Biopieces::put_record( $record, $out );
2068 $dbh = Maasha::SQL::connect( $options->{ "database" }, $options->{ "user" }, $options->{ "password" } );
2070 $results = Maasha::SQL::query_hashref_list( $dbh, $options->{ "query" } );
2072 Maasha::SQL::disconnect( $dbh );
2074 map { Maasha::Biopieces::put_record( $_ ) } @{ $results };
2078 sub script_read_ucsc_config
2080 # Martin A. Hansen, November 2008.
2082 # Read track entries from UCSC Genome Browser '.ra' files.
2084 my ( $in, # handle to in stream
2085 $out, # handle to out stream
2086 $options, # options hash
2091 my ( $record, $file, $data_in, $entry, $num );
2093 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
2094 Maasha::Biopieces::put_record( $record, $out );
2099 foreach $file ( @{ $options->{ "files" } } )
2101 $data_in = Maasha::Common::read_open( $file );
2103 while ( $record = Maasha::UCSC::ucsc_config_entry_get( $data_in ) )
2105 $record->{ 'REC_TYPE' } = "UCSC Config";
2107 Maasha::Biopieces::put_record( $record, $out );
2109 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
2119 close $data_in if $data_in;
2123 sub script_assemble_tag_contigs
2125 # Martin A. Hansen, November 2008.
2127 # Assemble tags from the stream into
2130 # The current implementation is quite
2131 # slow because of heavy use of temporary
2134 my ( $in, # handle to in stream
2135 $out, # handle to out stream
2136 $options, # options hash
2141 my ( $bed_file, $tag_file, $fh_in, $fh_out, $cols, $record, $bed_entry, $file_hash, $chr, $strand );
2143 $bed_file = "$BP_TMP/assemble_tag_contigs.bed";
2144 $fh_out = Maasha::Filesys::file_write_open( $bed_file );
2145 $cols = 6; # we only need the first 6 BED columns
2147 while ( $record = Maasha::Biopieces::get_record( $in ) )
2149 if ( $bed_entry = Maasha::UCSC::BED::biopiece2bed( $record, $cols ) )
2151 $strand = $record->{ 'STRAND' } || '+';
2153 Maasha::UCSC::BED::bed_entry_put( $bed_entry, $fh_out, $cols, $options->{ 'check' } );
2159 $file_hash = Maasha::UCSC::BED::bed_file_split_on_chr( $bed_file, $BP_TMP, $cols );
2163 foreach $chr ( sort keys %{ $file_hash } )
2165 $bed_file = $file_hash->{ $chr };
2166 $tag_file = "$bed_file.tc";
2168 Maasha::Common::run( "bed2tag_contigs", "< $bed_file > $tag_file" );
2170 $fh_in = Maasha::Filesys::file_read_open( $tag_file );
2172 while ( $bed_entry = Maasha::UCSC::BED::bed_entry_get( $fh_in, $cols, $options->{ 'check' } ) )
2174 if ( $record = Maasha::UCSC::BED::bed2biopiece( $bed_entry ) ) {
2175 Maasha::Biopieces::put_record( $record, $out );
2187 sub script_format_genome
2189 # Martin A. Hansen, Juli 2008.
2191 # Format a genome to speficed formats.
2193 my ( $in, # handle to in stream
2194 $out, # handle to out stream
2195 $options, # options hash
2200 my ( $dir, $genome, $fasta_dir, $phastcons_dir, $vals, $fh_out, $record, $format, $index, $entry );
2202 $dir = $options->{ 'dir' } || $ENV{ 'BP_DATA' };
2203 $genome = $options->{ 'genome' };
2205 Maasha::Common::error( "Directory: $dir does not exist" ) if not -d $dir;
2206 Maasha::Common::dir_create_if_not_exists( "$dir/genomes" );
2207 Maasha::Common::dir_create_if_not_exists( "$dir/genomes/$genome" );
2209 if ( grep { $_ =~ /fasta|blast|vmatch/i } @{ $options->{ "formats" } } )
2211 if ( -f "$dir/genomes/$genome/fasta/$genome.fna" )
2213 $fasta_dir = "$dir/genomes/$genome/fasta";
2217 Maasha::Common::dir_create_if_not_exists( "$dir/genomes/$genome/fasta" );
2219 $fasta_dir = "$dir/genomes/$genome/fasta";
2221 $fh_out = Maasha::Common::write_open( "$fasta_dir/$genome.fna" );
2224 elsif ( grep { $_ =~ /phastcons/i } @{ $options->{ "formats" } } )
2226 Maasha::Common::dir_create_if_not_exists( "$dir/genomes/$genome/phastcons" );
2228 $phastcons_dir = "$dir/genomes/$genome/phastcons";
2230 $fh_out = Maasha::Common::write_open( "$phastcons_dir/$genome.pp" );
2233 while ( $record = Maasha::Biopieces::get_record( $in ) )
2235 if ( $fh_out and $entry = Maasha::Fasta::biopiece2fasta( $record ) )
2237 Maasha::Fasta::put_entry( $entry, $fh_out, $options->{ "wrap" } );
2239 elsif ( $fh_out and $record->{ "CHR" } and $record->{ "CHR_BEG" } and $record->{ "STEP" } and $record->{ "VALS" } )
2241 print $fh_out "fixedStep chrom=$record->{ 'CHR' } start=$record->{ 'CHR_BEG' } step=$record->{ 'STEP' }\n";
2243 $vals = $record->{ 'VALS' };
2247 print $fh_out "$vals\n";
2250 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
2253 foreach $format ( @{ $options->{ 'formats' } } )
2255 if ( $format =~ /^fasta$/i ) { Maasha::Fasta::fasta_index( "$fasta_dir/$genome.fna", "$dir/genomes/$genome/fasta/$genome.index" ) }
2256 elsif ( $format =~ /^blast$/i ) { Maasha::NCBI::blast_index( "$genome.fna", $fasta_dir, "$dir/genomes/$genome/blast", "dna", $genome ) }
2257 elsif ( $format =~ /^blat$/i ) { print STDERR "BLAT FORMAT NOT IMPLEMENTED" }
2258 elsif ( $format =~ /^vmatch$/i ) { Maasha::Match::vmatch_index( "$genome.fna", $fasta_dir, "$dir/genomes/$genome/vmatch", $BP_TMP ) }
2259 elsif ( $format =~ /^phastcons$/i ) { Maasha::UCSC::phastcons_index( "$genome.pp", $phastcons_dir ) }
2262 close $fh_out if $fh_out;
2266 sub script_length_seq
2268 # Martin A. Hansen, August 2007.
2270 # Determine the length of sequences in stream.
2272 my ( $in, # handle to in stream
2273 $out, # handle to out stream
2274 $options, # options hash
2279 my ( $record, $total );
2281 while ( $record = Maasha::Biopieces::get_record( $in ) )
2283 if ( $record->{ "SEQ" } )
2285 $record->{ "SEQ_LEN" } = length $record->{ "SEQ" };
2286 $total += $record->{ "SEQ_LEN" };
2289 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
2292 Maasha::Biopieces::put_record( { TOTAL_SEQ_LEN => $total }, $out );
2296 sub script_uppercase_seq
2298 # Martin A. Hansen, August 2007.
2300 # Uppercases sequences in stream.
2302 my ( $in, # handle to in stream
2303 $out, # handle to out stream
2310 while ( $record = Maasha::Biopieces::get_record( $in ) )
2312 $record->{ "SEQ" } = uc $record->{ "SEQ" } if $record->{ "SEQ" };
2314 Maasha::Biopieces::put_record( $record, $out );
2319 sub script_shuffle_seq
2321 # Martin A. Hansen, December 2007.
2323 # Shuffle sequences in stream.
2325 my ( $in, # handle to in stream
2326 $out, # handle to out stream
2333 while ( $record = Maasha::Biopieces::get_record( $in ) )
2335 $record->{ "SEQ" } = Maasha::Seq::seq_shuffle( $record->{ "SEQ" } ) if $record->{ "SEQ" };
2337 Maasha::Biopieces::put_record( $record, $out );
2342 sub script_analyze_seq
2344 # Martin A. Hansen, August 2007.
2346 # Analyze sequence composition of sequences in stream.
2348 my ( $in, # handle to in stream
2349 $out, # handle to out stream
2354 my ( $record, $analysis );
2356 while ( $record = Maasha::Biopieces::get_record( $in ) )
2358 if ( $record->{ "SEQ" } )
2360 $analysis = Maasha::Seq::seq_analyze( $record->{ "SEQ" } );
2362 map { $record->{ $_ } = $analysis->{ $_ } } keys %{ $analysis };
2365 Maasha::Biopieces::put_record( $record, $out );
2370 sub script_analyze_tags
2372 # Martin A. Hansen, August 2008.
2374 # Analyze sequence tags in stream.
2376 my ( $in, # handle to in stream
2377 $out, # handle to out stream
2382 my ( $record, $analysis, %len_hash, %clone_hash, $clones, $key, $tag_record );
2384 while ( $record = Maasha::Biopieces::get_record( $in ) )
2386 if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } )
2388 if ( $record->{ "SEQ_NAME" } =~ /_(\d+)$/ )
2392 $len_hash{ length( $record->{ "SEQ" } ) }++;
2393 $clone_hash{ length( $record->{ "SEQ" } ) } += $clones;
2396 elsif ( $record->{ "Q_ID" } and $record->{ "BED_LEN" } )
2398 if ( $record->{ "Q_ID" } =~ /_(\d+)$/ )
2402 $len_hash{ $record->{ "BED_LEN" } }++;
2403 $clone_hash{ $record->{ "BED_LEN" } } += $clones;
2408 foreach $key ( sort { $a <=> $b } keys %len_hash )
2410 $tag_record->{ "TAG_LEN" } = $key;
2411 $tag_record->{ "TAG_COUNT" } = $len_hash{ $key };
2412 $tag_record->{ "TAG_CLONES" } = $clone_hash{ $key };
2414 Maasha::Biopieces::put_record( $tag_record, $out );
2419 sub script_complexity_seq
2421 # Martin A. Hansen, May 2008.
2423 # Generates an index calculated as the most common di-residue over
2424 # the sequence length for all sequences in stream.
2426 my ( $in, # handle to in stream
2427 $out, # handle to out stream
2432 my ( $record, $index );
2434 while ( $record = Maasha::Biopieces::get_record( $in ) )
2436 $record->{ "SEQ_COMPLEXITY" } = sprintf( "%.2f", Maasha::Seq::seq_complexity( $record->{ "SEQ" } ) ) if $record->{ "SEQ" };
2438 Maasha::Biopieces::put_record( $record, $out );
2443 sub script_oligo_freq
2445 # Martin A. Hansen, August 2007.
2447 # Determine the length of sequences in stream.
2449 my ( $in, # handle to in stream
2450 $out, # handle to out stream
2451 $options, # options hash
2456 my ( $record, %oligos, @freq_table );
2458 $options->{ "word_size" } ||= 7;
2460 while ( $record = Maasha::Biopieces::get_record( $in ) )
2462 if ( $record->{ "SEQ" } )
2464 map { $oligos{ $_ }++ } Maasha::Seq::seq2oligos( \$record->{ "SEQ" }, $options->{ "word_size" } );
2466 if ( not $options->{ "all" } )
2468 @freq_table = Maasha::Seq::oligo_freq( \%oligos );
2470 map { Maasha::Biopieces::put_record( $_, $out ) } @freq_table;
2476 Maasha::Biopieces::put_record( $record, $out );
2479 if ( $options->{ "all" } )
2481 @freq_table = Maasha::Seq::oligo_freq( \%oligos );
2483 map { Maasha::Biopieces::put_record( $_, $out ) } @freq_table;
2488 sub script_create_weight_matrix
2490 # Martin A. Hansen, August 2007.
2492 # Creates a weight matrix from an alignmnet.
2494 my ( $in, # handle to in stream
2495 $out, # handle to out stream
2496 $options, # options hash
2501 my ( $record, $count, $i, $res, %freq_hash, %res_hash, $freq );
2505 while ( $record = Maasha::Biopieces::get_record( $in ) )
2507 if ( $record->{ "SEQ" } )
2509 for ( $i = 0; $i < length $record->{ "SEQ" }; $i++ )
2511 $res = substr $record->{ "SEQ" }, $i, 1;
2513 $freq_hash{ $i }{ $res }++;
2514 $res_hash{ $res } = 1;
2521 Maasha::Biopieces::put_record( $record, $out );
2525 foreach $res ( sort keys %res_hash )
2529 $record->{ "V0" } = $res;
2531 for ( $i = 0; $i < keys %freq_hash; $i++ )
2533 $freq = $freq_hash{ $i }{ $res } || 0;
2535 if ( $options->{ "percent" } ) {
2536 $freq = sprintf( "%.0f", 100 * $freq / $count ) if $freq > 0;
2539 $record->{ "V" . ( $i + 1 ) } = $freq;
2542 Maasha::Biopieces::put_record( $record, $out );
2547 sub script_calc_bit_scores
2549 # Martin A. Hansen, March 2007.
2551 # Calculates the bit scores for each position from an alignmnet in the stream.
2553 my ( $in, # handle to in stream
2554 $out, # handle to out stream
2559 my ( $record, $type, $count, $i, $res, %freq_hash, $bit_max, $bit_height, $bit_diff );
2563 while ( $record = Maasha::Biopieces::get_record( $in ) )
2565 if ( $record->{ "SEQ" } )
2567 $type = Maasha::Seq::seq_guess_type( $record->{ "SEQ" } ) if not $type;
2569 for ( $i = 0; $i < length $record->{ "SEQ" }; $i++ )
2571 $res = substr $record->{ "SEQ" }, $i, 1;
2573 next if $res =~ /-|_|~|\./;
2575 $freq_hash{ $i }{ $res }++;
2582 Maasha::Biopieces::put_record( $record, $out );
2588 if ( $type eq "protein" ) {
2594 for ( $i = 0; $i < keys %freq_hash; $i++ )
2596 $bit_height = Maasha::Seq::seqlogo_calc_bit_height( $freq_hash{ $i }, $count );
2598 $bit_diff = $bit_max - $bit_height;
2600 $record->{ "V" . ( $i ) } = sprintf( "%.2f", $bit_diff );
2603 Maasha::Biopieces::put_record( $record, $out );
2607 sub script_calc_fixedstep
2609 # Martin A. Hansen, September 2008.
2611 # Calculates fixedstep entries from data in the stream.
2613 my ( $in, # handle to in stream
2614 $out, # handle to out stream
2615 $options, # options hash
2620 my ( $bed_file, $fh_in, $fh_out, $record, $file_hash, $chr, $bed_entry, $fixedstep_file, $fixedstep_entry );
2622 $bed_file = "$BP_TMP/calc_fixedstep.bed";
2623 $fh_out = Maasha::Filesys::file_write_open( $bed_file );
2625 while ( $record = Maasha::Biopieces::get_record( $in ) )
2627 if ( $bed_entry = Maasha::UCSC::BED::biopiece2bed( $record ) ) {
2628 Maasha::UCSC::BED::bed_entry_put( $bed_entry, $fh_out, undef, $options->{ 'check' } );
2634 $file_hash = Maasha::UCSC::BED::bed_file_split_on_chr( $bed_file, $BP_TMP );
2638 foreach $chr ( sort keys %{ $file_hash } )
2640 $bed_file = $file_hash->{ $chr };
2642 $fixedstep_file = Maasha::UCSC::Wiggle::fixedstep_calc( $bed_file, $chr, $options->{ 'use_score' }, $options->{ 'use_log10' } );
2644 #$fixedstep_file = "$bed_file.fixedstep";
2646 # Maasha::Common::run( "bed2fixedstep", "< $bed_file > $fixedstep_file" );
2649 $fh_in = Maasha::Filesys::file_read_open( $fixedstep_file );
2651 while ( $fixedstep_entry = Maasha::UCSC::Wiggle::fixedstep_entry_get( $fh_in ) )
2653 if ( $record = Maasha::UCSC::Wiggle::fixedstep2biopiece( $fixedstep_entry ) ) {
2654 Maasha::Biopieces::put_record( $record, $out );
2661 unlink $fixedstep_file;
2666 sub script_reverse_seq
2668 # Martin A. Hansen, August 2007.
2670 # Reverse sequence in record.
2672 my ( $in, # handle to in stream
2673 $out, # handle to out stream
2680 while ( $record = Maasha::Biopieces::get_record( $in ) )
2682 if ( $record->{ "SEQ" } ) {
2683 $record->{ "SEQ" } = reverse $record->{ "SEQ" };
2686 Maasha::Biopieces::put_record( $record, $out );
2691 sub script_complement_seq
2693 # Martin A. Hansen, August 2007.
2695 # Complement sequence in record.
2697 my ( $in, # handle to in stream
2698 $out, # handle to out stream
2703 my ( $record, $type );
2705 while ( $record = Maasha::Biopieces::get_record( $in ) )
2707 if ( $record->{ "SEQ" } )
2710 $type = Maasha::Seq::seq_guess_type( $record->{ "SEQ" } );
2713 if ( $type eq "rna" ) {
2714 Maasha::Seq::rna_comp( \$record->{ "SEQ" } );
2715 } elsif ( $type eq "dna" ) {
2716 Maasha::Seq::dna_comp( \$record->{ "SEQ" } );
2720 Maasha::Biopieces::put_record( $record, $out );
2725 sub script_remove_indels
2727 # Martin A. Hansen, August 2007.
2729 # Remove indels from sequences in stream.
2731 my ( $in, # handle to in stream
2732 $out, # handle to out stream
2739 while ( $record = Maasha::Biopieces::get_record( $in ) )
2741 $record->{ 'SEQ' } =~ tr/-~.//d if $record->{ "SEQ" };
2743 Maasha::Biopieces::put_record( $record, $out );
2748 sub script_transliterate_seq
2750 # Martin A. Hansen, August 2007.
2752 # Transliterate chars from sequence in record.
2754 my ( $in, # handle to in stream
2755 $out, # handle to out stream
2756 $options, # options hash
2761 my ( $record, $search, $replace, $delete );
2763 $search = $options->{ "search" } || "";
2764 $replace = $options->{ "replace" } || "";
2765 $delete = $options->{ "delete" } || "";
2767 while ( $record = Maasha::Biopieces::get_record( $in ) )
2769 if ( $record->{ "SEQ" } )
2771 if ( $search and $replace ) {
2772 eval "\$record->{ 'SEQ' } =~ tr/$search/$replace/";
2773 } elsif ( $delete ) {
2774 eval "\$record->{ 'SEQ' } =~ tr/$delete//d";
2778 Maasha::Biopieces::put_record( $record, $out );
2783 sub script_transliterate_vals
2785 # Martin A. Hansen, April 2008.
2787 # Transliterate chars from values in record.
2789 my ( $in, # handle to in stream
2790 $out, # handle to out stream
2791 $options, # options hash
2796 my ( $record, $search, $replace, $delete, $key );
2798 $search = $options->{ "search" } || "";
2799 $replace = $options->{ "replace" } || "";
2800 $delete = $options->{ "delete" } || "";
2802 while ( $record = Maasha::Biopieces::get_record( $in ) )
2804 foreach $key ( @{ $options->{ "keys" } } )
2806 if ( exists $record->{ $key } )
2808 if ( $search and $replace ) {
2809 eval "\$record->{ $key } =~ tr/$search/$replace/";
2810 } elsif ( $delete ) {
2811 eval "\$record->{ $key } =~ tr/$delete//d";
2816 Maasha::Biopieces::put_record( $record, $out );
2821 sub script_translate_seq
2823 # Martin A. Hansen, February 2008.
2825 # Translate DNA sequence into protein sequence.
2827 my ( $in, # handle to in stream
2828 $out, # handle to out stream
2829 $options, # options hash
2834 my ( $record, $frame, %new_record );
2836 $options->{ "frames" } ||= [ 1, 2, 3, -1, -2, -3 ];
2838 while ( $record = Maasha::Biopieces::get_record( $in ) )
2840 if ( $record->{ "SEQ" } )
2842 if ( Maasha::Seq::seq_guess_type( $record->{ "SEQ" } ) eq "dna" )
2844 foreach $frame ( @{ $options->{ "frames" } } )
2846 %new_record = %{ $record };
2848 $new_record{ "SEQ" } = Maasha::Seq::translate( $record->{ "SEQ" }, $frame );
2849 $new_record{ "SEQ_LEN" } = length $new_record{ "SEQ" };
2850 $new_record{ "FRAME" } = $frame;
2852 Maasha::Biopieces::put_record( \%new_record, $out );
2858 Maasha::Biopieces::put_record( $record, $out );
2864 sub script_extract_seq
2866 # Martin A. Hansen, August 2007.
2868 # Extract subsequences from sequences in record.
2870 my ( $in, # handle to in stream
2871 $out, # handle to out stream
2872 $options, # options hash
2877 my ( $beg, $end, $len, $record );
2879 if ( not defined $options->{ "beg" } or $options->{ "beg" } < 0 ) {
2882 $beg = $options->{ "beg" } - 1; # correcting for start offset
2885 if ( defined $options->{ "end" } and $options->{ "end" } - 1 < $beg ) {
2887 } elsif ( defined $options->{ "end" } ) {
2888 $end = $options->{ "end" } - 1; # correcting for start offset
2891 $len = $options->{ "len" };
2893 # print "beg->$beg, end->$end, len->$len\n";
2895 while ( $record = Maasha::Biopieces::get_record( $in ) )
2897 if ( $record->{ "SEQ" } )
2899 if ( defined $beg and defined $end )
2901 if ( $end - $beg + 1 > length $record->{ "SEQ" } ) {
2902 $record->{ "SEQ" } = substr $record->{ "SEQ" }, $beg;
2904 $record->{ "SEQ" } = substr $record->{ "SEQ" }, $beg, $end - $beg + 1;
2907 elsif ( defined $beg and defined $len )
2909 if ( $len > length $record->{ "SEQ" } ) {
2910 $record->{ "SEQ" } = substr $record->{ "SEQ" }, $beg;
2912 $record->{ "SEQ" } = substr $record->{ "SEQ" }, $beg, $len;
2915 elsif ( defined $beg )
2917 $record->{ "SEQ" } = substr $record->{ "SEQ" }, $beg;
2921 $record->{ "SEQ_LEN" } = length $record->{ "SEQ" };
2923 Maasha::Biopieces::put_record( $record, $out );
2928 sub script_get_genome_seq
2930 # Martin A. Hansen, December 2007.
2932 # Gets a subsequence from a genome.
2934 my ( $in, # handle to in stream
2935 $out, # handle to out stream
2936 $options, # options hash
2941 my ( $record, $genome, $genome_file, $index_file, $index, $fh, $index_head, $index_beg, $index_len, $beg, $len, %lookup_hash, @begs, @lens, $i, $seq );
2943 $options->{ "flank" } ||= 0;
2945 if ( $options->{ "genome" } )
2947 $genome = $options->{ "genome" };
2949 $genome_file = "$ENV{ 'BP_DATA' }/genomes/$genome/fasta/$genome.fna";
2950 $index_file = "$ENV{ 'BP_DATA' }/genomes/$genome/fasta/$genome.index";
2952 $fh = Maasha::Common::read_open( $genome_file );
2953 $index = Maasha::Fasta::index_retrieve( $index_file );
2955 shift @{ $index }; # Get rid of the file size info
2957 map { $lookup_hash{ $_->[ 0 ] } = [ $_->[ 1 ], $_->[ 2 ] ] } @{ $index };
2959 if ( exists $lookup_hash{ $options->{ "chr" } } and defined $options->{ "beg" } and ( defined $options->{ "end" } or defined $options->{ "len" } ) )
2961 ( $index_beg, $index_len ) = @{ $lookup_hash{ $options->{ "chr" } } };
2963 $beg = $index_beg + $options->{ "beg" } - 1;
2965 if ( $options->{ "len" } ) {
2966 $len = $options->{ "len" };
2967 } elsif ( $options->{ "end" } ) {
2968 $len = ( $options->{ "end" } - $options->{ "beg" } + 1 );
2971 $beg -= $options->{ "flank" };
2972 $len += 2 * $options->{ "flank" };
2974 if ( $beg <= $index_beg )
2976 $len -= $index_beg - $beg;
2980 $len = $index_beg + $index_len - $beg if $beg + $len > $index_beg + $index_len;
2982 next if $beg > $index_beg + $index_len;
2984 $record->{ "CHR" } = $options->{ "chr" };
2985 $record->{ "CHR_BEG" } = $beg - $index_beg;
2986 $record->{ "CHR_END" } = $record->{ "CHR_BEG" } + $len - 1;
2988 $record->{ "SEQ" } = Maasha::Common::file_read( $fh, $beg, $len );
2989 $record->{ "SEQ_LEN" } = $len;
2991 Maasha::Biopieces::put_record( $record, $out );
2995 while ( $record = Maasha::Biopieces::get_record( $in ) )
2997 if ( $options->{ "genome" } and not $record->{ "SEQ" } )
2999 if ( $record->{ "REC_TYPE" } eq "BED" and exists $lookup_hash{ $record->{ "CHR" } } )
3001 ( $index_beg, $index_len ) = @{ $lookup_hash{ $record->{ "CHR" } } };
3003 $beg = $record->{ "CHR_BEG" } + $index_beg;
3004 $len = $record->{ "CHR_END" } - $record->{ "CHR_BEG" } + 1;
3006 elsif ( $record->{ "REC_TYPE" } eq "PSL" and exists $lookup_hash{ $record->{ "S_ID" } } )
3008 ( $index_beg, $index_len ) = @{ $lookup_hash{ $record->{ "S_ID" } } };
3010 $beg = $record->{ "S_BEG" } + $index_beg;
3011 $len = $record->{ "S_END" } - $record->{ "S_BEG" } + 1;
3013 elsif ( $record->{ "REC_TYPE" } eq "BLAST" and exists $lookup_hash{ $record->{ "S_ID" } } )
3015 ( $index_beg, $index_len ) = @{ $lookup_hash{ $record->{ "S_ID" } } };
3017 $beg = $record->{ "S_BEG" } + $index_beg;
3018 $len = $record->{ "S_END" } - $record->{ "S_BEG" } + 1;
3021 $beg -= $options->{ "flank" };
3022 $len += 2 * $options->{ "flank" };
3024 if ( $beg <= $index_beg )
3026 $len -= $index_beg - $beg;
3030 $len = $index_beg + $index_len - $beg if $beg + $len > $index_beg + $index_len;
3032 next if $beg > $index_beg + $index_len;
3034 $record->{ "CHR_BEG" } = $beg - $index_beg;
3035 $record->{ "CHR_END" } = $record->{ "CHR_BEG" } + $len - 1;
3037 $record->{ "SEQ" } = Maasha::Common::file_read( $fh, $beg, $len );
3039 if ( $record->{ "STRAND" } and $record->{ "STRAND" } eq "-" )
3041 Maasha::Seq::dna_comp( \$record->{ "SEQ" } );
3042 $record->{ "SEQ" } = reverse $record->{ "SEQ" };
3045 if ( $options->{ "mask" } )
3047 if ( $record->{ "BLOCK_COUNT" } > 1 ) # uppercase hit block segments and lowercase the rest.
3049 $record->{ "SEQ" } = lc $record->{ "SEQ" };
3051 @begs = split ",", $record->{ "Q_BEGS" };
3052 @lens = split ",", $record->{ "BLOCK_LENS" };
3054 for ( $i = 0; $i < @begs; $i++ ) {
3055 substr $record->{ "SEQ" }, $begs[ $i ], $lens[ $i ], uc substr $record->{ "SEQ" }, $begs[ $i ], $lens[ $i ];
3059 elsif ( $options->{ "splice" } )
3061 if ( $record->{ "BLOCK_COUNT" } > 1 ) # splice block sequences
3064 @begs = split ",", $record->{ "Q_BEGS" };
3065 @lens = split ",", $record->{ "BLOCK_LENS" };
3067 for ( $i = 0; $i < @begs; $i++ ) {
3068 $seq .= substr $record->{ "SEQ" }, $begs[ $i ], $lens[ $i ];
3071 $record->{ "SEQ" } = $seq;
3075 $record->{ "SEQ_LEN" } = length $record->{ "SEQ" };
3078 Maasha::Biopieces::put_record( $record, $out );
3085 sub script_get_genome_align
3087 # Martin A. Hansen, April 2008.
3089 # Gets a subalignment from a multiple genome alignment.
3091 my ( $in, # handle to in stream
3092 $out, # handle to out stream
3093 $options, # options hash
3098 my ( $record, $maf_track, $align, $align_num, $beg, $end, $len, $entry );
3100 $options->{ "strand" } ||= "+";
3104 $maf_track = Maasha::Config::maf_track( $options->{ "genome" } );
3106 if ( $options->{ "chr" } and $options->{ "beg" } and ( $options->{ "end" } or $options->{ "len" } ) )
3108 $beg = $options->{ "beg" } - 1;
3110 if ( $options->{ "end" } ) {
3111 $end = $options->{ "end" };
3112 } elsif ( $options->{ "len" } ) {
3113 $end = $beg + $options->{ "len" };
3116 $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $options->{ "chr" }, $beg, $end, $options->{ "strand" } );
3118 foreach $entry ( @{ $align } )
3120 $entry->{ "CHR" } = $record->{ "CHR" };
3121 $entry->{ "CHR_BEG" } = $record->{ "CHR_BEG" };
3122 $entry->{ "CHR_END" } = $record->{ "CHR_END" };
3123 $entry->{ "STRAND" } = $record->{ "STRAND" } || '+';
3124 $entry->{ "Q_ID" } = $record->{ "Q_ID" };
3125 $entry->{ "SCORE" } = $record->{ "SCORE" };
3127 Maasha::Biopieces::put_record( $entry, $out );
3131 while ( $record = Maasha::Biopieces::get_record( $in ) )
3133 if ( $record->{ "REC_TYPE" } eq "BED" )
3135 $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "CHR" }, $record->{ "CHR_BEG" }, $record->{ "CHR_END" }, $record->{ "STRAND" } );
3137 elsif ( $record->{ "REC_TYPE" } eq "VMATCH" )
3139 $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" } + 1, $record->{ "STRAND" } );
3141 elsif ( $record->{ "REC_TYPE" } eq "PSL" )
3143 $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $record->{ "STRAND" } );
3145 elsif ( $record->{ "REC_TYPE" } eq "BLAST" )
3147 $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $record->{ "STRAND" } );
3150 foreach $entry ( @{ $align } )
3152 $entry->{ "CHR" } = $record->{ "CHR" };
3153 $entry->{ "CHR_BEG" } = $record->{ "CHR_BEG" };
3154 $entry->{ "CHR_END" } = $record->{ "CHR_END" };
3155 $entry->{ "STRAND" } = $record->{ "STRAND" };
3156 $entry->{ "Q_ID" } = $record->{ "Q_ID" };
3157 $entry->{ "SCORE" } = $record->{ "SCORE" };
3159 Maasha::Biopieces::put_record( $entry, $out );
3167 sub script_get_genome_phastcons
3169 # Martin A. Hansen, February 2008.
3171 # Get phastcons scores from genome intervals.
3173 my ( $in, # handle to in stream
3174 $out, # handle to out stream
3175 $options, # options hash
3180 my ( $phastcons_file, $phastcons_index, $index, $fh_phastcons, $scores, $record );
3182 $options->{ "flank" } ||= 0;
3184 $phastcons_file = Maasha::Config::genome_phastcons( $options->{ "genome" } );
3185 $phastcons_index = Maasha::Config::genome_phastcons_index( $options->{ "genome" } );
3187 $index = Maasha::UCSC::fixedstep_index_retrieve( $phastcons_index );
3188 $fh_phastcons = Maasha::Common::read_open( $phastcons_file );
3190 if ( defined $options->{ "chr" } and defined $options->{ "beg" } and ( defined $options->{ "end" } or defined $options->{ "len" } ) )
3192 $options->{ "beg" } -= 1; # request is 1-based
3193 $options->{ "end" } -= 1; # request is 1-based
3195 if ( $options->{ "len" } ) {
3196 $options->{ "end" } = $options->{ "beg" } + $options->{ "len" } - 1;
3199 $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $options->{ "chr" }, $options->{ "beg" }, $options->{ "end" }, $options->{ "flank" } );
3201 $record->{ "CHR" } = $options->{ "chr" };
3202 $record->{ "CHR_BEG" } = $options->{ "beg" } - $options->{ "flank" };
3203 $record->{ "CHR_END" } = $options->{ "end" } + $options->{ "flank" };
3205 $record->{ "PHASTCONS" } = join ",", @{ $scores };
3206 $record->{ "PHAST_COUNT" } = scalar @{ $scores }; # DEBUG
3208 Maasha::Biopieces::put_record( $record, $out );
3211 while ( $record = Maasha::Biopieces::get_record( $in ) )
3213 if ( $record->{ "REC_TYPE" } eq "BED" )
3215 $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "CHR" }, $record->{ "CHR_BEG" }, $record->{ "CHR_END" }, $options->{ "flank" } );
3217 elsif ( $record->{ "REC_TYPE" } eq "PSL" )
3219 $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $options->{ "flank" } );
3221 elsif ( $record->{ "REC_TYPE" } eq "BLAST" )
3223 $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $options->{ "flank" } );
3226 $record->{ "PHASTCONS" } = join ",", @{ $scores } if @{ $scores };
3227 # $record->{ "PHAST_COUNT" } = @{ $scores } if @{ $scores }; # DEBUG
3229 Maasha::Biopieces::put_record( $record, $out );
3232 close $fh_phastcons if $fh_phastcons;
3238 # Martin A. Hansen, December 2007.
3240 # Folds sequences in stream into secondary structures.
3242 my ( $in, # handle to in stream
3243 $out, # handle to out stream
3248 my ( $record, $type, $struct, $index );
3250 while ( $record = Maasha::Biopieces::get_record( $in ) )
3252 if ( $record->{ "SEQ" } )
3255 $type = Maasha::Seq::seq_guess_type( $record->{ "SEQ" } );
3258 if ( $type ne "protein" )
3260 ( $struct, $index ) = Maasha::Seq::fold_struct_rnafold( $record->{ "SEQ" } );
3261 $record->{ "SEC_STRUCT" } = $struct;
3262 $record->{ "FREE_ENERGY" } = $index;
3263 $record->{ "SCORE" } = abs int $index;
3264 $record->{ "SIZE" } = length $struct;
3265 $record->{ "CONF" } = "1," x $record->{ "SIZE" };
3269 Maasha::Biopieces::put_record( $record, $out );
3274 sub script_split_seq
3276 # Martin A. Hansen, August 2007.
3278 # Split a sequence in stream into words.
3280 my ( $in, # handle to in stream
3281 $out, # handle to out stream
3282 $options, # options hash
3287 my ( $record, $new_record, $i, $subseq, %lookup );
3289 $options->{ "word_size" } ||= 7;
3291 while ( $record = Maasha::Biopieces::get_record( $in ) )
3293 if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } )
3295 for ( $i = 0; $i < length( $record->{ "SEQ" } ) - $options->{ "word_size" } + 1; $i++ )
3297 $subseq = substr $record->{ "SEQ" }, $i, $options->{ "word_size" };
3299 if ( $options->{ "uniq" } and not $lookup{ $subseq } )
3301 $new_record->{ "SEQ_NAME" } = $record->{ "SEQ_NAME" } . "[" . ( $i + 1 ) . "-" . ( $i + $options->{ "word_size" } ) . "]";
3302 $new_record->{ "SEQ" } = $subseq;
3304 Maasha::Biopieces::put_record( $new_record, $out );
3306 $lookup{ $subseq } = 1;
3310 $new_record->{ "SEQ_NAME" } = $record->{ "SEQ_NAME" } . "[" . ( $i + 1 ) . "-" . ( $i + $options->{ "word_size" } ) . "]";
3311 $new_record->{ "SEQ" } = $subseq;
3313 Maasha::Biopieces::put_record( $new_record, $out );
3319 Maasha::Biopieces::put_record( $record, $out );
3325 sub script_split_bed
3327 # Martin A. Hansen, June 2008.
3329 # Split a BED record into overlapping windows.
3331 my ( $in, # handle to in stream
3332 $out, # handle to out stream
3333 $options, # options hash
3338 my ( $record, $new_record, $i );
3340 $options->{ "window_size" } ||= 20;
3341 $options->{ "step_size" } ||= 1;
3343 while ( $record = Maasha::Biopieces::get_record( $in ) )
3345 if ( $record->{ "CHR" } and $record->{ "CHR_BEG" } and $record->{ "CHR_END" } )
3347 $record->{ "BED_LEN" } = $record->{ "CHR_END" } - $record->{ "CHR_BEG" } + 1;
3349 for ( $i = 0; $i < $record->{ "BED_LEN" } - $options->{ "window_size" }; $i += $options->{ "step_size" } )
3351 $new_record->{ "REC_TYPE" } = "BED";
3352 $new_record->{ "CHR" } = $record->{ "CHR" };
3353 $new_record->{ "CHR_BEG" } = $record->{ "CHR_BEG" } + $i;
3354 $new_record->{ "CHR_END" } = $record->{ "CHR_BEG" } + $i + $options->{ "window_size" };
3355 $new_record->{ "BED_LEN" } = $options->{ "window_size" };
3356 $new_record->{ "Q_ID" } = $record->{ "Q_ID" } . "_$i";
3357 $new_record->{ "SCORE" } = $record->{ "SCORE" };
3358 $new_record->{ "STRAND" } = $record->{ "STRAND" };
3360 Maasha::Biopieces::put_record( $new_record, $out );
3365 Maasha::Biopieces::put_record( $record, $out );
3371 sub script_align_seq
3373 # Martin A. Hansen, August 2007.
3375 # Align sequences in stream.
3377 my ( $in, # handle to in stream
3378 $out, # handle to out stream
3383 my ( $record, @entries, $entry );
3385 while ( $record = Maasha::Biopieces::get_record( $in ) )
3387 if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) {
3388 push @entries, [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ];
3389 } elsif ( $record->{ "Q_ID" } and $record->{ "SEQ" } ) {
3390 push @entries, [ $record->{ "Q_ID" }, $record->{ "SEQ" } ];
3392 Maasha::Biopieces::put_record( $record, $out );
3396 @entries = Maasha::Align::align( \@entries );
3398 foreach $entry ( @entries )
3400 if ( $entry->[ SEQ_NAME ] and $entry->[ SEQ ] )
3403 SEQ_NAME => $entry->[ SEQ_NAME ],
3404 SEQ => $entry->[ SEQ ],
3407 Maasha::Biopieces::put_record( $record, $out );
3415 # Martin A. Hansen, February 2008.
3417 # Using the first sequence in stream as reference, tile
3418 # all subsequent sequences based on pairwise alignments.
3420 my ( $in, # handle to in stream
3421 $out, # handle to out stream
3422 $options, # options hash
3427 my ( $record, $first, $ref_entry, @entries );
3431 while ( $record = Maasha::Biopieces::get_record( $in ) )
3433 if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } )
3437 $ref_entry = [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ];
3443 push @entries, [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ];
3448 Maasha::Biopieces::put_record( $record, $out );
3452 @entries = Maasha::Align::align_tile( $ref_entry, \@entries, $options );
3454 map { Maasha::Biopieces::put_record( { SEQ_NAME => $_->[ SEQ_NAME ], SEQ => $_->[ SEQ ] }, $out ) } @entries;
3458 sub script_invert_align
3460 # Martin A. Hansen, February 2008.
3462 # Inverts an alignment showing only non-mathing residues
3463 # using the first sequence as reference.
3465 my ( $in, # handle to in stream
3466 $out, # handle to out stream
3467 $options, # options hash
3472 my ( $record, @entries );
3474 while ( $record = Maasha::Biopieces::get_record( $in ) )
3476 if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } )
3478 push @entries, [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ];
3482 Maasha::Biopieces::put_record( $record, $out );
3486 Maasha::Align::align_invert( \@entries, $options->{ "soft" } );
3488 map { Maasha::Biopieces::put_record( { SEQ_NAME => $_->[ SEQ_NAME ], SEQ => $_->[ SEQ ] }, $out ) } @entries;
3492 sub script_patscan_seq
3494 # Martin A. Hansen, August 2007.
3496 # Locates patterns in sequences using scan_for_matches.
3498 my ( $in, # handle to in stream
3499 $out, # handle to out stream
3500 $options, # options hash
3505 my ( $genome_file, @args, $arg, $type, $seq_file, $pat_file, $out_file, $fh_in, $fh_out, $record, $patterns, $pattern, $entry, $result, %head_hash, $i );
3507 if ( $options->{ "patterns" } ) {
3508 $patterns = Maasha::Patscan::parse_patterns( $options->{ "patterns" } );
3509 } elsif ( -f $options->{ "patterns_in" } ) {
3510 $patterns = Maasha::Patscan::read_patterns( $options->{ "patterns_in" } );
3513 $genome_file = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/fasta/$options->{ 'genome' }.fna" if $options->{ 'genome' };
3515 push @args, "-c" if $options->{ "comp" };
3516 push @args, "-m $options->{ 'max_hits' }" if $options->{ 'max_hits' };
3517 push @args, "-n $options->{ 'max_misses' }" if $options->{ 'max_hits' };
3519 $seq_file = "$BP_TMP/patscan.seq";
3520 $pat_file = "$BP_TMP/patscan.pat";
3521 $out_file = "$BP_TMP/patscan.out";
3523 $fh_out = Maasha::Common::write_open( $seq_file );
3527 while ( $record = Maasha::Biopieces::get_record( $in ) )
3529 if ( $record->{ "SEQ" } and $record->{ "SEQ_NAME" } )
3531 $type = Maasha::Seq::seq_guess_type( $record->{ "SEQ" } ) if not $type;
3533 Maasha::Fasta::put_entry( [ $i, $record->{ "SEQ" } ], $fh_out );
3535 $head_hash{ $i } = $record->{ "SEQ_NAME" };
3543 $arg = join " ", @args;
3544 $arg .= " -p" if $type eq "protein";
3546 foreach $pattern ( @{ $patterns } )
3548 $fh_out = Maasha::Common::write_open( $pat_file );
3550 print $fh_out "$pattern\n";
3554 if ( $options->{ 'genome' } ) {
3555 `scan_for_matches $arg $pat_file < $genome_file > $out_file`;
3556 # Maasha::Common::run( "scan_for_matches", "$arg $pat_file < $genome_file > $out_file" );
3558 `scan_for_matches $arg $pat_file < $seq_file > $out_file`;
3559 # Maasha::Common::run( "scan_for_matches", "$arg $pat_file < $seq_file > $out_file" );
3562 $fh_in = Maasha::Common::read_open( $out_file );
3564 while ( $entry = Maasha::Fasta::get_entry( $fh_in ) )
3566 $result = Maasha::Patscan::parse_scan_result( $entry, $pattern );
3568 if ( $options->{ 'genome' } )
3570 $result->{ "CHR" } = $result->{ "S_ID" };
3571 $result->{ "CHR_BEG" } = $result->{ "S_BEG" };
3572 $result->{ "CHR_END" } = $result->{ "S_END" };
3574 delete $result->{ "S_ID" };
3575 delete $result->{ "S_BEG" };
3576 delete $result->{ "S_END" };
3580 $result->{ "S_ID" } = $head_hash{ $result->{ "S_ID" } };
3583 Maasha::Biopieces::put_record( $result, $out );
3595 sub script_create_blast_db
3597 # Martin A. Hansen, September 2007.
3599 # Creates a NCBI BLAST database with formatdb
3601 my ( $in, # handle to in stream
3602 $out, # handle to out stream
3603 $options, # options hash
3608 my ( $fh, $seq_type, $path, $record, $entry );
3610 $path = $options->{ "database" };
3612 $fh = Maasha::Common::write_open( $path );
3614 while ( $record = Maasha::Biopieces::get_record( $in ) )
3616 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
3618 if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) )
3620 $seq_type = Maasha::Seq::seq_guess_type( $entry->[ SEQ ] ) if not $seq_type;
3622 Maasha::Fasta::put_entry( $entry, $fh );
3628 if ( $seq_type eq "protein" ) {
3629 Maasha::Common::run( "formatdb", "-p T -i $path -t $options->{ 'database' }" );
3631 Maasha::Common::run( "formatdb", "-p F -i $path -t $options->{ 'database' }" );
3638 sub script_blast_seq
3640 # Martin A. Hansen, September 2007.
3642 # BLASTs sequences in stream against a given database.
3644 my ( $in, # handle to in stream
3645 $out, # handle to out stream
3646 $options, # options hash
3651 my ( $genome, $q_type, $s_type, $tmp_in, $tmp_out, $fh_in, $fh_out, $record, $line, @fields, $entry );
3653 $options->{ "e_val" } = 10 if not defined $options->{ "e_val" };
3654 $options->{ "filter" } = "F";
3655 $options->{ "filter" } = "T" if $options->{ "filter" };
3656 $options->{ "cpus" } ||= 1;
3658 $options->{ "database" } = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/blast/$options->{ 'genome' }.fna" if $options->{ 'genome' };
3660 $tmp_in = "$BP_TMP/blast_query.seq";
3661 $tmp_out = "$BP_TMP/blast.result";
3663 $fh_out = Maasha::Filesys::file_write_open( $tmp_in );
3665 while ( $record = Maasha::Biopieces::get_record( $in ) )
3667 if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) )
3669 $q_type = Maasha::Seq::seq_guess_type( $entry->[ SEQ ] ) if not $q_type;
3671 Maasha::Fasta::put_entry( $entry, $fh_out );
3674 Maasha::Biopieces::put_record( $record, $out );
3679 if ( -f $options->{ 'database' } . ".phr" ) {
3680 $s_type = "protein";
3682 $s_type = "nucleotide";
3685 if ( not $options->{ 'program' } )
3687 if ( $q_type ne "protein" and $s_type ne "protein" ) {
3688 $options->{ 'program' } = "blastn";
3689 } elsif ( $q_type eq "protein" and $s_type eq "protein" ) {
3690 $options->{ 'program' } = "blastp";
3691 } elsif ( $q_type ne "protein" and $s_type eq "protein" ) {
3692 $options->{ 'program' } = "blastx";
3693 } elsif ( $q_type eq "protein" and $s_type ne "protein" ) {
3694 $options->{ 'program' } = "tblastn";
3698 if ( $options->{ 'verbose' } )
3700 Maasha::Common::run(
3703 "-p $options->{ 'program' }",
3704 "-e $options->{ 'e_val' }",
3705 "-a $options->{ 'cpus' }",
3708 "-d $options->{ 'database' }",
3709 "-F $options->{ 'filter' }",
3717 Maasha::Common::run(
3720 "-p $options->{ 'program' }",
3721 "-e $options->{ 'e_val' }",
3722 "-a $options->{ 'cpus' }",
3725 "-d $options->{ 'database' }",
3726 "-F $options->{ 'filter' }",
3736 $fh_out = Maasha::Filesys::file_read_open( $tmp_out );
3740 while ( $line = <$fh_out> )
3744 next if $line =~ /^#/;
3746 @fields = split /\s+/, $line;
3748 $record->{ "REC_TYPE" } = "BLAST";
3749 $record->{ "Q_ID" } = $fields[ 0 ];
3750 $record->{ "S_ID" } = $fields[ 1 ];
3751 $record->{ "IDENT" } = $fields[ 2 ];
3752 $record->{ "ALIGN_LEN" } = $fields[ 3 ];
3753 $record->{ "MISMATCHES" } = $fields[ 4 ];
3754 $record->{ "GAPS" } = $fields[ 5 ];
3755 $record->{ "Q_BEG" } = $fields[ 6 ] - 1; # BLAST is 1-based
3756 $record->{ "Q_END" } = $fields[ 7 ] - 1; # BLAST is 1-based
3757 $record->{ "S_BEG" } = $fields[ 8 ] - 1; # BLAST is 1-based
3758 $record->{ "S_END" } = $fields[ 9 ] - 1; # BLAST is 1-based
3759 $record->{ "E_VAL" } = $fields[ 10 ];
3760 $record->{ "BIT_SCORE" } = $fields[ 11 ];
3762 if ( $record->{ "S_BEG" } > $record->{ "S_END" } )
3764 $record->{ "STRAND" } = '-';
3766 ( $record->{ "S_BEG" }, $record->{ "S_END" } ) = ( $record->{ "S_END" }, $record->{ "S_BEG" } );
3770 $record->{ "STRAND" } = '+';
3773 Maasha::Biopieces::put_record( $record, $out );
3784 # Martin A. Hansen, August 2007.
3786 # BLATs sequences in stream against a given genome.
3788 my ( $in, # handle to in stream
3789 $out, # handle to out stream
3790 $options, # options hash
3795 my ( $blat_args, $genome_file, $query_file, $fh_in, $fh_out, $type, $record, $result_file, $entries, $entry );
3797 $genome_file = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/fasta/$options->{ 'genome' }.fna";
3799 $options->{ 'tile_size' } ||= 11;
3800 $options->{ 'one_off' } ||= 0;
3801 $options->{ 'min_identity' } ||= 90;
3802 $options->{ 'min_score' } ||= 0;
3803 $options->{ 'step_size' } ||= $options->{ 'tile_size' };
3805 $blat_args .= " -tileSize=$options->{ 'tile_size' }";
3806 $blat_args .= " -oneOff=$options->{ 'one_off' }";
3807 $blat_args .= " -minIdentity=$options->{ 'min_identity' }";
3808 $blat_args .= " -minScore=$options->{ 'min_score' }";
3809 $blat_args .= " -stepSize=$options->{ 'step_size' }";
3810 # $blat_args .= " -ooc=" . Maasha::Config::genome_blat_ooc( $options->{ "genome" }, 11 ) if $options->{ 'ooc' };
3812 $query_file = "$BP_TMP/blat.seq";
3814 $fh_out = Maasha::Common::write_open( $query_file );
3816 while ( $record = Maasha::Biopieces::get_record( $in ) )
3818 if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) )
3820 $type = Maasha::Seq::seq_guess_type( $entry->[ SEQ ] ) if not $type;
3821 Maasha::Fasta::put_entry( $entry, $fh_out, 80 );
3824 Maasha::Biopieces::put_record( $record, $out );
3829 $blat_args .= " -t=dnax" if $type eq "protein";
3830 $blat_args .= " -q=$type";
3832 $result_file = "$BP_TMP/blat.psl";
3834 Maasha::Common::run( "blat", "$genome_file $query_file $blat_args $result_file > /dev/null 2>&1" );
3838 $entries = Maasha::UCSC::psl_get_entries( $result_file );
3840 map { Maasha::Biopieces::put_record( $_, $out ) } @{ $entries };
3842 unlink $result_file;
3848 # Martin A. Hansen, July 2008.
3850 # soap sequences in stream against a given file or genome.
3852 my ( $in, # handle to in stream
3853 $out, # handle to out stream
3854 $options, # options hash
3859 my ( $genome, $tmp_in, $tmp_out, $fh_in, $fh_out, $record, $line, @fields, $entry, $count, $args );
3861 $options->{ "seed_size" } ||= 10;
3862 $options->{ "mismatches" } ||= 2;
3863 $options->{ "gap_size" } ||= 0;
3864 $options->{ "cpus" } ||= 1;
3866 if ( $options->{ "genome" } ) {
3867 $options->{ "in_file" } = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/fasta/$options->{ 'genome' }.fna";
3870 $tmp_in = "$BP_TMP/soap_query.seq";
3871 $tmp_out = "$BP_TMP/soap.result";
3873 $fh_out = Maasha::Common::write_open( $tmp_in );
3877 while ( $record = Maasha::Biopieces::get_record( $in ) )
3879 if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) )
3881 Maasha::Fasta::put_entry( $entry, $fh_out );
3886 Maasha::Biopieces::put_record( $record, $out );
3894 "-s $options->{ 'seed_size' }",
3897 "-v $options->{ 'mismatches' }",
3898 "-g $options->{ 'gap_size' }",
3899 "-p $options->{ 'cpus' }",
3900 "-d $options->{ 'in_file' }",
3904 $args .= " > /dev/null 2>&1" if not $options->{ 'verbose' };
3906 Maasha::Common::run( "soap", $args, 1 );
3910 $fh_out = Maasha::Common::read_open( $tmp_out );
3914 while ( $line = <$fh_out> )
3918 @fields = split /\t/, $line;
3920 $record->{ "REC_TYPE" } = "SOAP";
3921 $record->{ "Q_ID" } = $fields[ 0 ];
3922 $record->{ "SCORE" } = $fields[ 3 ];
3923 $record->{ "STRAND" } = $fields[ 6 ];
3924 $record->{ "S_ID" } = $fields[ 7 ];
3925 $record->{ "S_BEG" } = $fields[ 8 ] - 1; # soap is 1-based
3926 $record->{ "S_END" } = $fields[ 8 ] + $fields[ 5 ] - 2;
3928 Maasha::Biopieces::put_record( $record, $out );
3938 sub script_match_seq
3940 # Martin A. Hansen, August 2007.
3942 # BLATs sequences in stream against a given genome.
3944 my ( $in, # handle to in stream
3945 $out, # handle to out stream
3946 $options, # options hash
3951 my ( $record, @entries, $results );
3953 $options->{ "word_size" } ||= 20;
3954 $options->{ "direction" } ||= "both";
3956 while ( $record = Maasha::Biopieces::get_record( $in ) )
3958 if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) {
3959 push @entries, [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ];
3962 Maasha::Biopieces::put_record( $record, $out );
3965 if ( @entries == 1 )
3967 $results = Maasha::Match::match_mummer( [ $entries[ 0 ] ], [ $entries[ 0 ] ], $options, $BP_TMP );
3969 map { Maasha::Biopieces::put_record( $_, $out ) } @{ $results };
3971 elsif ( @entries == 2 )
3973 $results = Maasha::Match::match_mummer( [ $entries[ 0 ] ], [ $entries[ 1 ] ], $options, $BP_TMP );
3975 map { Maasha::Biopieces::put_record( $_, $out ) } @{ $results };
3980 sub script_create_vmatch_index
3982 # Martin A. Hansen, January 2008.
3984 # Create a vmatch index from sequences in the stream.
3986 my ( $in, # handle to in stream
3987 $out, # handle to out stream
3988 $options, # options hash
3993 my ( $record, $file_tmp, $fh_tmp, $type, $entry );
3995 if ( $options->{ "index_name" } )
3997 $file_tmp = $options->{ 'index_name' };
3998 $fh_tmp = Maasha::Common::write_open( $file_tmp );
4001 while ( $record = Maasha::Biopieces::get_record( $in ) )
4003 if ( $options->{ "index_name" } and $entry = Maasha::Fasta::biopiece2fasta( $record ) )
4005 Maasha::Fasta::put_entry( $entry, $fh_tmp );
4007 $type = Maasha::Seq::seq_guess_type( $entry->[ SEQ ] ) if not defined $type;
4010 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
4013 if ( $options->{ "index_name" } )
4017 if ( $type eq "protein" ) {
4018 Maasha::Common::run( "mkvtree", "-db $file_tmp -protein -pl $options->{ 'prefix_length' } -allout -indexname $file_tmp > /dev/null 2>&1" );
4020 Maasha::Common::run( "mkvtree", "-db $file_tmp -dna -pl $options->{ 'prefix_length' } -allout -indexname $file_tmp > /dev/null 2>&1" );
4028 sub script_vmatch_seq
4030 # Martin A. Hansen, August 2007.
4032 # Vmatches sequences in stream against a given genome.
4034 my ( $in, # handle to in stream
4035 $out, # handle to out stream
4036 $options, # options hash
4041 my ( @index_files, @records, $result_file, $fh_in, $record, %hash );
4043 $options->{ 'count' } = 1 if $options->{ 'max_hits' };
4045 if ( $options->{ "index_name" } )
4047 @index_files = $options->{ "index_name" };
4051 @index_files = Maasha::Common::ls_files( "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/vmatch" );
4053 map { $_ =~ /^(.+)\.[a-z1]{3,4}$/; $hash{ $1 } = 1 } @index_files;
4055 @index_files = sort keys %hash;
4058 while ( $record = Maasha::Biopieces::get_record( $in ) )
4060 push @records, $record;
4062 Maasha::Biopieces::put_record( $record, $out );
4065 $result_file = Maasha::Match::match_vmatch( $BP_TMP, \@records, \@index_files, $options );
4069 $fh_in = Maasha::Common::read_open( $result_file );
4071 while ( $record = Maasha::Match::vmatch_get_entry( $fh_in ) ) {
4072 Maasha::Biopieces::put_record( $record, $out );
4077 unlink $result_file;
4081 sub script_write_fasta
4083 # Martin A. Hansen, August 2007.
4085 # Write FASTA entries from sequences in stream.
4087 my ( $in, # handle to in stream
4088 $out, # handle to out stream
4089 $options, # options hash
4094 my ( $record, $fh, $entry );
4096 $fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } );
4098 while ( $record = Maasha::Biopieces::get_record( $in ) )
4100 if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) {
4101 Maasha::Fasta::put_entry( $entry, $fh, $options->{ "wrap" } );
4104 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
4111 sub script_write_align
4113 # Martin A. Hansen, August 2007.
4115 # Write pretty alignments aligned sequences in stream.
4117 my ( $in, # handle to in stream
4118 $out, # handle to out stream
4119 $options, # options hash
4124 my ( $fh, $record, @entries );
4126 $fh = write_stream( $options->{ "data_out" } ) ;
4128 while ( $record = Maasha::Biopieces::get_record( $in ) )
4130 if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) {
4131 push @entries, [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ];
4134 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
4137 if ( scalar( @entries ) == 2 ) {
4138 Maasha::Align::align_print_pairwise( $entries[ 0 ], $entries[ 1 ], $fh, $options->{ "wrap" } );
4139 } elsif ( scalar ( @entries ) > 2 ) {
4140 Maasha::Align::align_print_multi( \@entries, $fh, $options->{ "wrap" }, $options->{ "no_ruler" }, $options->{ "no_consensus" } );
4147 sub script_write_blast
4149 # Martin A. Hansen, November 2007.
4151 # Write data in blast table format (-m8 and 9).
4153 my ( $in, # handle to in stream
4154 $out, # handle to out stream
4155 $options, # options hash
4160 my ( $fh, $record, $first );
4162 $fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } ) ;
4166 while ( $record = Maasha::Biopieces::get_record( $in ) )
4168 if ( $record->{ "REC_TYPE" } eq "BLAST" )
4170 if ( $options->{ "comment" } and $first )
4172 print "# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score\n";
4177 if ( $record->{ "STRAND" } eq "-" ) {
4178 ( $record->{ "S_BEG" }, $record->{ "S_END" } ) = ( $record->{ "S_END" }, $record->{ "S_BEG" } );
4181 print $fh join( "\t",
4182 $record->{ "Q_ID" },
4183 $record->{ "S_ID" },
4184 $record->{ "IDENT" },
4185 $record->{ "ALIGN_LEN" },
4186 $record->{ "MISMATCHES" },
4187 $record->{ "GAPS" },
4188 $record->{ "Q_BEG" } + 1,
4189 $record->{ "Q_END" } + 1,
4190 $record->{ "S_BEG" } + 1,
4191 $record->{ "S_END" } + 1,
4192 $record->{ "E_VAL" },
4193 $record->{ "BIT_SCORE" }
4197 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
4204 sub script_write_tab
4206 # Martin A. Hansen, August 2007.
4208 # Write data as table.
4210 my ( $in, # handle to in stream
4211 $out, # handle to out stream
4212 $options, # options hash
4217 my ( $fh, $record, $key, @keys, @vals, $ok, %no_keys, $A, $B );
4219 $options->{ "delimit" } ||= "\t";
4221 map { $no_keys{ $_ } = 1 } @{ $options->{ "no_keys" } };
4223 $fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } );
4225 while ( $record = Maasha::Biopieces::get_record( $in ) )
4230 if ( $options->{ "keys" } )
4232 map { $ok = 0 if not exists $record->{ $_ } } @{ $options->{ "keys" } };
4236 foreach $key ( @{ $options->{ "keys" } } )
4238 if ( exists $record->{ $key } )
4240 push @keys, $key if $options->{ "comment" };
4241 push @vals, $record->{ $key };
4248 foreach $key ( sort { $A = $a; $B = $b; $A =~ s/^V(\d+)$/$1/; $B =~ s/^V(\d+)$/$1/; $A <=> $B } keys %{ $record } )
4250 next if exists $no_keys{ $key };
4252 push @keys, $key if $options->{ "comment" };
4253 push @vals, $record->{ $key };
4257 if ( @keys and $options->{ "comment" } )
4259 print $fh "#", join( $options->{ "delimit" }, @keys ), "\n";
4261 delete $options->{ "comment" };
4264 print $fh join( $options->{ "delimit" }, @vals ), "\n" if @vals;
4266 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
4273 sub script_write_bed
4275 # Martin A. Hansen, August 2007.
4277 # Write BED format for the UCSC genome browser using records in stream.
4279 my ( $in, # handle to in stream
4280 $out, # handle to out stream
4281 $options, # options hash
4286 my ( $cols, $fh, $record, $bed_entry, $new_record );
4288 $cols = $options->{ 'cols' }->[ 0 ];
4290 $fh = write_stream( $options->{ 'data_out' }, $options->{ 'compress' } );
4292 while ( $record = Maasha::Biopieces::get_record( $in ) )
4294 $record = Maasha::UCSC::psl2record( $record ) if $record->{ 'tBaseInsert' }; # Dirty addition to allow Affy data from MySQL to be dumped
4296 if ( $bed_entry = Maasha::UCSC::BED::biopiece2bed( $record, $cols ) ) {
4297 Maasha::UCSC::BED::bed_entry_put( $bed_entry, $fh, $cols, $options->{ 'check' } );
4300 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ 'no_stream' };
4307 sub script_write_psl
4309 # Martin A. Hansen, August 2007.
4311 # Write PSL output from stream.
4313 my ( $in, # handle to in stream
4314 $out, # handle to out stream
4315 $options, # options hash
4320 my ( $fh, $record, @output, $first );
4324 $fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } );
4326 while ( $record = Maasha::Biopieces::get_record( $in ) )
4328 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
4330 if ( $record->{ "REC_TYPE" } and $record->{ "REC_TYPE" } eq "PSL" )
4332 Maasha::UCSC::psl_put_header( $fh ) if $first;
4333 Maasha::UCSC::psl_put_entry( $record, $fh );
4342 sub script_write_fixedstep
4344 # Martin A. Hansen, Juli 2008.
4346 # Write fixedStep entries from recrods in the stream.
4348 my ( $in, # handle to in stream
4349 $out, # handle to out stream
4350 $options, # options hash
4355 my ( $fh, $record, $entry );
4357 $fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } );
4359 while ( $record = Maasha::Biopieces::get_record( $in ) )
4361 if ( $entry = Maasha::UCSC::Wiggle::biopiece2fixedstep( $record ) ) {
4362 Maasha::UCSC::Wiggle::fixedstep_entry_put( $entry, $fh );
4365 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
4372 sub script_write_2bit
4374 # Martin A. Hansen, March 2008.
4376 # Write sequence entries from stream in 2bit format.
4378 my ( $in, # handle to in stream
4379 $out, # handle to out stream
4380 $options, # options hash
4385 my ( $record, $mask, $tmp_file, $fh_tmp, $fh_in, $fh_out, $entry );
4387 $mask = 1 if not $options->{ "no_mask" };
4389 $tmp_file = "$BP_TMP/write_2bit.fna";
4390 $fh_tmp = Maasha::Common::write_open( $tmp_file );
4392 $fh_out = write_stream( $options->{ "data_out" } );
4394 while ( $record = Maasha::Biopieces::get_record( $in ) )
4396 if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) {
4397 Maasha::Fasta::put_entry( $entry, $fh_tmp );
4400 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
4405 $fh_in = Maasha::Common::read_open( $tmp_file );
4407 Maasha::TwoBit::fasta2twobit( $fh_in, $fh_out, $mask );
4416 sub script_write_solid
4418 # Martin A. Hansen, April 2008.
4420 # Write di-base encoded Solid sequence from entries in stream.
4422 my ( $in, # handle to in stream
4423 $out, # handle to out stream
4424 $options, # options hash
4429 my ( $record, $fh, $entry );
4431 $fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } );
4433 while ( $record = Maasha::Biopieces::get_record( $in ) )
4435 if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) )
4437 $entry->[ SEQ ] = Maasha::Solid::seq2color_space( uc $entry->[ SEQ ] );
4439 Maasha::Fasta::put_entry( $entry, $fh, $options->{ "wrap" } );
4442 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
4449 sub script_write_ucsc_config
4451 # Martin A. Hansen, November 2008.
4453 # Write UCSC Genome Broser configuration (.ra file type) from
4454 # records in the stream.
4456 my ( $in, # handle to in stream
4457 $out, # handle to out stream
4458 $options, # options hash
4463 my ( $record, $fh );
4465 $fh = write_stream( $options->{ "data_out" } );
4467 while ( $record = Maasha::Biopieces::get_record( $in ) )
4469 Maasha::UCSC::ucsc_config_entry_put( $record, $fh ) if $record->{ "REC_TYPE" } eq "UCSC Config";
4471 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
4478 sub script_plot_seqlogo
4480 # Martin A. Hansen, August 2007.
4482 # Calculates and writes a sequence logo for alignments.
4484 my ( $in, # handle to in stream
4485 $out, # handle to out stream
4486 $options, # options hash
4491 my ( $record, @entries, $logo, $fh );
4493 while ( $record = Maasha::Biopieces::get_record( $in ) )
4495 if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) {
4496 push @entries, [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ];
4499 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
4502 $logo = Maasha::Plot::seq_logo( \@entries );
4504 $fh = write_stream( $options->{ "data_out" } );
4512 sub script_plot_phastcons_profiles
4514 # Martin A. Hansen, January 2008.
4516 # Plots PhastCons profiles.
4518 my ( $in, # handle to in stream
4519 $out, # handle to out stream
4520 $options, # options hash
4525 my ( $phastcons_file, $phastcons_index, $index, $fh_phastcons, $record, $scores, $AoA, $plot, $fh );
4527 $options->{ "title" } ||= "PhastCons Profiles";
4529 $phastcons_file = Maasha::Config::genome_phastcons( $options->{ "genome" } );
4530 $phastcons_index = Maasha::Config::genome_phastcons_index( $options->{ "genome" } );
4532 $index = Maasha::UCSC::fixedstep_index_retrieve( $phastcons_index );
4533 $fh_phastcons = Maasha::Common::read_open( $phastcons_file );
4535 while ( $record = Maasha::Biopieces::get_record( $in ) )
4537 if ( $record->{ "CHR" } and $record->{ "CHR_BEG" } and $record->{ "CHR_END" } )
4539 $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "CHR" },
4540 $record->{ "CHR_BEG" },
4541 $record->{ "CHR_END" },
4542 $options->{ "flank" } );
4544 push @{ $AoA }, [ @{ $scores } ];
4547 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
4550 Maasha::UCSC::phastcons_normalize( $AoA );
4552 $AoA = [ [ Maasha::UCSC::phastcons_mean( $AoA ) ] ] if $options->{ "mean" };
4553 $AoA = [ [ Maasha::UCSC::phastcons_median( $AoA ) ] ] if $options->{ "median" };
4555 $AoA = Maasha::Matrix::matrix_flip( $AoA );
4557 $plot = Maasha::Plot::lineplot_simple( $AoA, $options, $BP_TMP );
4559 $fh = write_stream( $options->{ "data_out" } );
4561 print $fh "$_\n" foreach @{ $plot };
4567 sub script_analyze_bed
4569 # Martin A. Hansen, March 2008.
4571 # Analyze BED entries in stream.
4573 my ( $in, # handle to in stream
4574 $out, # handle to out stream
4575 $options, # options hash
4582 while ( $record = Maasha::Biopieces::get_record( $in ) )
4584 $record = Maasha::UCSC::bed_analyze( $record ) if $record->{ "REC_TYPE" } eq "BED";
4586 Maasha::Biopieces::put_record( $record, $out );
4591 sub script_analyze_vals
4593 # Martin A. Hansen, August 2007.
4595 # Analyze values for given keys in stream.
4597 my ( $in, # handle to in stream
4598 $out, # handle to out stream
4599 $options, # options hash
4604 my ( $record, $key, @keys, %key_hash, $analysis, $len );
4606 map { $key_hash{ $_ } = 1 } @{ $options->{ "keys" } };
4608 while ( $record = Maasha::Biopieces::get_record( $in ) )
4610 foreach $key ( keys %{ $record } )
4612 next if $options->{ "keys" } and not exists $key_hash{ $key };
4614 $analysis->{ $key }->{ "COUNT" }++;
4616 if ( Maasha::Calc::is_a_number( $record->{ $key } ) )
4618 $analysis->{ $key }->{ "TYPE" } = "num";
4619 $analysis->{ $key }->{ "SUM" } += $record->{ $key };
4620 $analysis->{ $key }->{ "MAX" } = $record->{ $key } if $record->{ $key } > $analysis->{ $key }->{ "MAX" } or not $analysis->{ $key }->{ "MAX" };
4621 $analysis->{ $key }->{ "MIN" } = $record->{ $key } if $record->{ $key } < $analysis->{ $key }->{ "MIN" } or not $analysis->{ $key }->{ "MIN" };
4625 $len = length $record->{ $key };
4627 $analysis->{ $key }->{ "TYPE" } = "alph";
4628 $analysis->{ $key }->{ "SUM" } += $len;
4629 $analysis->{ $key }->{ "MAX" } = $len if $len > $analysis->{ $key }->{ "MAX" } or not $analysis->{ $key }->{ "MAX" };
4630 $analysis->{ $key }->{ "MIN" } = $len if $len < $analysis->{ $key }->{ "MIM" } or not $analysis->{ $key }->{ "MIN" };
4634 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
4637 foreach $key ( keys %{ $analysis } )
4639 $analysis->{ $key }->{ "MEAN" } = sprintf "%.2f", $analysis->{ $key }->{ "SUM" } / $analysis->{ $key }->{ "COUNT" };
4640 $analysis->{ $key }->{ "SUM" } = sprintf "%.2f", $analysis->{ $key }->{ "SUM" };
4643 my ( $keys, $types, $counts, $mins, $maxs, $sums, $means );
4653 if ( $options->{ "keys" } ) {
4654 @keys = @{ $options->{ "keys" } };
4656 @keys = keys %{ $analysis };
4659 foreach $key ( @keys )
4661 $keys .= sprintf "% 15s", $key;
4662 $types .= sprintf "% 15s", $analysis->{ $key }->{ "TYPE" };
4663 $counts .= sprintf "% 15s", $analysis->{ $key }->{ "COUNT" };
4664 $mins .= sprintf "% 15s", $analysis->{ $key }->{ "MIN" };
4665 $maxs .= sprintf "% 15s", $analysis->{ $key }->{ "MAX" };
4666 $sums .= sprintf "% 15s", $analysis->{ $key }->{ "SUM" };
4667 $means .= sprintf "% 15s", $analysis->{ $key }->{ "MEAN" };
4670 print $out "$keys\n";
4671 print $out "$types\n";
4672 print $out "$counts\n";
4673 print $out "$mins\n";
4674 print $out "$maxs\n";
4675 print $out "$sums\n";
4676 print $out "$means\n";
4680 sub script_head_records
4682 # Martin A. Hansen, August 2007.
4684 # Display the first sequences in stream.
4686 my ( $in, # handle to in stream
4687 $out, # handle to out stream
4688 $options, # options hash
4693 my ( $record, $count );
4695 $options->{ "num" } ||= 10;
4699 while ( $record = Maasha::Biopieces::get_record( $in ) )
4703 Maasha::Biopieces::put_record( $record, $out );
4705 last if $count == $options->{ "num" };
4710 sub script_remove_keys
4712 # Martin A. Hansen, August 2007.
4714 # Remove keys from stream.
4716 my ( $in, # handle to in stream
4717 $out, # handle to out stream
4718 $options, # options hash
4723 my ( $record, $new_record );
4725 while ( $record = Maasha::Biopieces::get_record( $in ) )
4727 if ( $options->{ "keys" } )
4729 map { delete $record->{ $_ } } @{ $options->{ "keys" } };
4731 elsif ( $options->{ "save_keys" } )
4733 map { $new_record->{ $_ } = $record->{ $_ } if exists $record->{ $_ } } @{ $options->{ "save_keys" } };
4735 $record = $new_record;
4738 Maasha::Biopieces::put_record( $record, $out ) if keys %{ $record };
4743 sub script_remove_adaptor
4745 # Martin A. Hansen, August 2008.
4747 # Find and remove adaptor from sequences in the stream.
4749 my ( $in, # handle to in stream
4750 $out, # handle to out stream
4751 $options, # options hash
4756 my ( $record, $adaptor, $seq, $adaptor_len, $seq_len, $offset, $max_match, $max_mismatch, $pos );
4758 $options->{ "remove" } ||= "after";
4760 $max_mismatch = $options->{ "mismatches" } || 0;
4761 $offset = $options->{ "offset" };
4763 if ( not defined $offset ) {
4769 $adaptor = uc $options->{ "adaptor" };
4770 $adaptor_len = length $adaptor;
4772 while ( $record = Maasha::Biopieces::get_record( $in ) )
4774 if ( $record->{ "SEQ" } )
4776 $seq = uc $record->{ "SEQ" };
4777 $seq_len = length $seq;
4779 $pos = Maasha::Common::index_m( $seq, $adaptor, $seq_len, $adaptor_len, $offset, $max_mismatch );
4781 $record->{ "ADAPTOR_POS" } = $pos;
4783 if ( $pos >= 0 and $options->{ "remove" } ne "skip" )
4785 if ( $options->{ "remove" } eq "after" )
4787 $record->{ "SEQ" } = substr $record->{ "SEQ" }, 0, $pos;
4788 $record->{ "SEQ_LEN" } = $pos;
4792 $record->{ "SEQ" } = substr $record->{ "SEQ" }, $pos + $adaptor_len;
4793 $record->{ "SEQ_LEN" } = length $record->{ "SEQ" };
4797 Maasha::Biopieces::put_record( $record, $out );
4801 Maasha::Biopieces::put_record( $record, $out );
4807 sub script_remove_mysql_tables
4809 # Martin A. Hansen, November 2008.
4811 # Remove MySQL tables from values in stream.
4813 my ( $in, # handle to in stream
4814 $out, # handle to out stream
4815 $options, # options hash
4820 my ( $record, %table_hash, $dbh, $table );
4822 $options->{ "user" } ||= Maasha::UCSC::ucsc_get_user();
4823 $options->{ "password" } ||= Maasha::UCSC::ucsc_get_password();
4825 map { $table_hash{ $_ } = 1 } @{ $options->{ 'tables' } };
4827 while ( $record = Maasha::Biopieces::get_record( $in ) )
4829 map { $table_hash{ $record->{ $_ } } = 1 } @{ $options->{ 'keys' } };
4831 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ 'no_stream' };
4834 $dbh = Maasha::SQL::connect( $options->{ "database" }, $options->{ "user" }, $options->{ "password" } );
4836 foreach $table ( sort keys %table_hash )
4838 if ( Maasha::SQL::table_exists( $dbh, $table ) )
4840 print STDERR qq(Removing table "$table" from database "$options->{ 'database' }" ... ) if $options->{ 'verbose' };
4841 Maasha::SQL::delete_table( $dbh, $table );
4842 print STDERR "done.\n" if $options->{ 'verbose' };
4846 print STDERR qq(WARNING: table "$table" not found in database "$options->{ 'database' }\n");
4850 Maasha::SQL::disconnect( $dbh );
4854 sub script_remove_ucsc_tracks
4856 # Martin A. Hansen, November 2008.
4858 # Remove track from MySQL tables and config file.
4860 my ( $in, # handle to in stream
4861 $out, # handle to out stream
4862 $options, # options hash
4867 my ( $record, %track_hash, $fh_in, $fh_out, $track, @tracks, @new_tracks, $dbh );
4869 $options->{ 'user' } ||= Maasha::UCSC::ucsc_get_user();
4870 $options->{ 'password' } ||= Maasha::UCSC::ucsc_get_password();
4871 $options->{ 'config_file' } ||= "$ENV{ 'HOME' }/ucsc/my_tracks.ra";
4873 map { $track_hash{ $_ } = 1 } @{ $options->{ 'tracks' } };
4875 while ( $record = Maasha::Biopieces::get_record( $in ) )
4877 map { $track_hash{ $record->{ $_ } } = 1 } @{ $options->{ 'keys' } };
4879 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ 'no_stream' };
4882 $fh_in = Maasha::Filesys::file_read_open( $options->{ 'config_file' } );
4884 while ( $track = Maasha::UCSC::ucsc_config_entry_get( $fh_in ) ) {
4885 push @tracks, $track;
4890 foreach $track ( @tracks )
4892 if ( $track->{ 'database' } eq $options->{ 'database' } and exists $track_hash{ $track->{ 'track' } } ) {
4893 print STDERR qq(Removing track "$track->{ 'track' }" from config file.\n) if $options->{ 'verbose' };
4895 push @new_tracks, $track;
4899 rename "$options->{ 'config_file' }", "$options->{ 'config_file' }~";
4901 $fh_out = Maasha::Filesys::file_write_open( $options->{ 'config_file' } );
4903 map { Maasha::UCSC::ucsc_config_entry_put( $_, $fh_out ) } @new_tracks;
4907 # ---- locate track in database ----
4909 $dbh = Maasha::SQL::connect( $options->{ "database" }, $options->{ "user" }, $options->{ "password" } );
4911 foreach $track ( sort keys %track_hash )
4913 if ( Maasha::SQL::table_exists( $dbh, $track ) )
4915 print STDERR qq(Removing table "$track" from database "$options->{ 'database' }" ... ) if $options->{ 'verbose' };
4916 Maasha::SQL::delete_table( $dbh, $track );
4917 print STDERR "done.\n" if $options->{ 'verbose' };
4921 print STDERR qq(WARNING: table "$track" not found in database "$options->{ 'database' }\n");
4925 Maasha::SQL::disconnect( $dbh );
4927 Maasha::Common::run( "ucscMakeTracks.pl", "-b > /dev/null 2>&1" );
4931 sub script_rename_keys
4933 # Martin A. Hansen, August 2007.
4935 # Rename keys in stream.
4937 my ( $in, # handle to in stream
4938 $out, # handle to out stream
4939 $options, # options hash
4946 while ( $record = Maasha::Biopieces::get_record( $in ) )
4948 if ( exists $record->{ $options->{ "keys" }->[ 0 ] } )
4950 $record->{ $options->{ "keys" }->[ 1 ] } = $record->{ $options->{ "keys" }->[ 0 ] };
4952 delete $record->{ $options->{ "keys" }->[ 0 ] };
4955 Maasha::Biopieces::put_record( $record, $out );
4960 sub script_uniq_vals
4962 # Martin A. Hansen, August 2007.
4964 # Find unique values in stream.
4966 my ( $in, # handle to in stream
4967 $out, # handle to out stream
4968 $options, # options hash
4973 my ( %hash, $record );
4975 while ( $record = Maasha::Biopieces::get_record( $in ) )
4977 if ( $record->{ $options->{ "key" } } )
4979 if ( not $hash{ $record->{ $options->{ "key" } } } and not $options->{ "invert" } )
4981 Maasha::Biopieces::put_record( $record, $out );
4983 $hash{ $record->{ $options->{ "key" } } } = 1;
4985 elsif ( $hash{ $record->{ $options->{ "key" } } } and $options->{ "invert" } )
4987 Maasha::Biopieces::put_record( $record, $out );
4991 $hash{ $record->{ $options->{ "key" } } } = 1;
4996 Maasha::Biopieces::put_record( $record, $out );
5002 sub script_merge_vals
5004 # Martin A. Hansen, August 2007.
5006 # Rename keys in stream.
5008 my ( $in, # handle to in stream
5009 $out, # handle to out stream
5010 $options, # options hash
5015 my ( $record, @join, $i );
5017 $options->{ "delimit" } ||= '_';
5019 while ( $record = Maasha::Biopieces::get_record( $in ) )
5021 if ( exists $record->{ $options->{ "keys" }->[ 0 ] } )
5023 @join = $record->{ $options->{ "keys" }->[ 0 ] };
5025 for ( $i = 1; $i < @{ $options->{ "keys" } }; $i++ ) {
5026 push @join, $record->{ $options->{ "keys" }->[ $i ] } if exists $record->{ $options->{ "keys" }->[ $i ] };
5029 $record->{ $options->{ "keys" }->[ 0 ] } = join $options->{ "delimit" }, @join;
5032 Maasha::Biopieces::put_record( $record, $out );
5037 sub script_merge_records
5039 # Martin A. Hansen, July 2008.
5041 # Merges records in the stream based on identical values of two given keys.
5043 my ( $in, # handle to in stream
5044 $out, # handle to out stream
5045 $options, # options hash
5050 my ( $merge, $record, $file1, $file2, $fh1, $fh2, $key1, $key2, @keys1, @keys2, @vals1, @vals2,
5051 $num1, $num2, $num, $cmp, $i );
5053 $merge = $options->{ "merge" } || "AandB";
5055 $file1 = "$BP_TMP/merge_records1.tmp";
5056 $file2 = "$BP_TMP/merge_records2.tmp";
5058 $fh1 = Maasha::Common::write_open( $file1 );
5059 $fh2 = Maasha::Common::write_open( $file2 );
5061 $key1 = $options->{ "keys" }->[ 0 ];
5062 $key2 = $options->{ "keys" }->[ 1 ];
5064 $num = $key2 =~ s/n$//;
5068 while ( $record = Maasha::Biopieces::get_record( $in ) )
5070 if ( exists $record->{ $key1 } )
5073 @vals1 = $record->{ $key1 };
5075 delete $record->{ $key1 };
5077 map { push @keys1, $_; push @vals1, $record->{ $_ } } keys %{ $record };
5079 print $fh1 join( "\t", @vals1 ), "\n";
5083 elsif ( exists $record->{ $key2 } )
5086 @vals2 = $record->{ $key2 };
5088 delete $record->{ $key2 };
5090 map { push @keys2, $_; push @vals2, $record->{ $_ } } keys %{ $record };
5092 print $fh2 join( "\t", @vals2 ), "\n";
5103 Maasha::Common::run( "sort", "-k 1,1n $file1 > $file1.sort" ) and rename "$file1.sort", $file1;
5104 Maasha::Common::run( "sort", "-k 1,1n $file2 > $file2.sort" ) and rename "$file2.sort", $file2;
5108 Maasha::Common::run( "sort", "-k 1,1 $file1 > $file1.sort" ) and rename "$file1.sort", $file1;
5109 Maasha::Common::run( "sort", "-k 1,1 $file2 > $file2.sort" ) and rename "$file2.sort", $file2;
5112 $fh1 = Maasha::Common::read_open( $file1 );
5113 $fh2 = Maasha::Common::read_open( $file2 );
5115 @vals1 = Maasha::Common::get_fields( $fh1 );
5116 @vals2 = Maasha::Common::get_fields( $fh2 );
5118 while ( $num1 > 0 and $num2 > 0 )
5123 $cmp = $vals1[ 0 ] <=> $vals2[ 0 ];
5125 $cmp = $vals1[ 0 ] cmp $vals2[ 0 ];
5130 if ( $merge =~ /^(AorB|AnotB)$/ )
5132 for ( $i = 0; $i < @keys1; $i++ ) {
5133 $record->{ $keys1[ $i ] } = $vals1[ $i ];
5136 Maasha::Biopieces::put_record( $record, $out );
5139 @vals1 = Maasha::Common::get_fields( $fh1 );
5144 if ( $merge =~ /^(BorA|BnotA)$/ )
5146 for ( $i = 0; $i < @keys2; $i++ ) {
5147 $record->{ $keys2[ $i ] } = $vals2[ $i ];
5150 Maasha::Biopieces::put_record( $record, $out );
5153 @vals2 = Maasha::Common::get_fields( $fh2 );
5158 if ( $merge =~ /^(AandB|AorB|BorA)$/ )
5160 for ( $i = 0; $i < @keys1; $i++ ) {
5161 $record->{ $keys1[ $i ] } = $vals1[ $i ];
5164 for ( $i = 1; $i < @keys2; $i++ ) {
5165 $record->{ $keys2[ $i ] } = $vals2[ $i ];
5168 Maasha::Biopieces::put_record( $record, $out );
5171 @vals1 = Maasha::Common::get_fields( $fh1 );
5172 @vals2 = Maasha::Common::get_fields( $fh2 );
5184 if ( $num1 > 0 and $merge =~ /^(AorB|AnotB)$/ )
5188 for ( $i = 0; $i < @keys1; $i++ ) {
5189 $record->{ $keys1[ $i ] } = $vals1[ $i ];
5192 Maasha::Biopieces::put_record( $record, $out );
5195 if ( $num2 > 0 and $merge =~ /^(BorA|BnotA)$/ )
5199 for ( $i = 0; $i < @keys2; $i++ ) {
5200 $record->{ $keys2[ $i ] } = $vals2[ $i ];
5203 Maasha::Biopieces::put_record( $record, $out );
5210 # Martin A. Hansen, August 2007.
5212 # Grab for records in stream.
5214 my ( $in, # handle to in stream
5215 $out, # handle to out stream
5216 $options, # options hash
5221 my ( $keys, $vals_only, $keys_only, $invert, $patterns, $pattern, $regex, $record, $key, $op, $val, %lookup_hash, $found );
5223 $keys = $options->{ 'keys' };
5224 $vals_only = $options->{ 'vals_only' };
5225 $keys_only = $options->{ 'keys_only' };
5226 $invert = $options->{ 'invert' };
5228 if ( $options->{ 'patterns' } )
5230 $patterns = [ split ",", $options->{ 'patterns' } ];
5232 elsif ( -f $options->{ 'patterns_in' } )
5234 $patterns = Maasha::Patscan::read_patterns( $options->{ 'patterns_in' } );
5236 elsif ( $options->{ 'regex' } )
5238 if ( $options->{ 'case_insensitive' } ) {
5239 $regex = qr/$options->{ 'regex' }/i;
5241 $regex = qr/$options->{ 'regex' }/;
5244 elsif ( -f $options->{ 'exact_in' } )
5246 $patterns = Maasha::Patscan::read_patterns( $options->{ 'exact_in' } );
5248 map { $lookup_hash{ $_ } = 1 } @{ $patterns };
5252 elsif ( $options->{ 'eval' } )
5254 if ( $options->{ 'eval' } =~ /^([^><=! ]+)\s*(>=|<=|>|<|=|!=|eq|ne)\s*(.+)$/ )
5262 while ( $record = Maasha::Biopieces::get_record( $in ) )
5266 if ( %lookup_hash ) {
5267 $found = grab_lookup( \%lookup_hash, $record, $keys, $vals_only, $keys_only );
5268 } elsif ( $patterns ) {
5269 $found = grab_patterns( $patterns, $record, $keys, $vals_only, $keys_only );
5270 } elsif ( $regex ) {
5271 $found = grab_regex( $regex, $record, $keys, $vals_only, $keys_only );
5273 $found = grab_eval( $key, $op, $val, $record );
5276 if ( $found and not $invert ) {
5277 Maasha::Biopieces::put_record( $record, $out );
5278 } elsif ( not $found and $invert ) {
5279 Maasha::Biopieces::put_record( $record, $out );
5287 # Martin A. Hansen, August 2007.
5289 # Evaluate extression for records in stream.
5291 my ( $in, # handle to in stream
5292 $out, # handle to out stream
5293 $options, # options hash
5298 my ( $record, $eval_key, @keys, $eval_val );
5300 while ( $record = Maasha::Biopieces::get_record( $in ) )
5302 if ( $options->{ "eval" } )
5304 if ( $options->{ "eval" } =~ /^(\S+)\s*=\s*(.+)$/ )
5311 @keys = split /\s+|\+|-|\*|\/|\*\*/, $eval_val;
5313 @keys = grep { exists $record->{ $_ } } @keys;
5316 map { $eval_val =~ s/\Q$_\E/$record->{ $_ }/g } @keys;
5318 $record->{ $eval_key } = eval "$eval_val";
5319 Maasha::Common::error( qq(eval "$eval_key = $eval_val" failed -> $@) ) if $@;
5323 Maasha::Common::error( qq(Bad compute expression: "$options->{ 'eval' }"\n) );
5327 Maasha::Biopieces::put_record( $record, $out );
5334 # Martin A. Hansen, June 2008.
5338 my ( $in, # handle to in stream
5339 $out, # handle to out stream
5340 $options, # options hash
5345 my ( $record, $key, $A, $B, @rows, @matrix, $row, $i );
5347 while ( $record = Maasha::Biopieces::get_record( $in ) )
5351 foreach $key ( sort { $A = $a; $B = $b; $A =~ s/^V(\d+)$/$1/; $B =~ s/^V(\d+)$/$1/; $A <=> $B } keys %{ $record } )
5353 push @rows, $record->{ $key };
5357 push @matrix, [ @rows ];
5362 @matrix = Maasha::Matrix::matrix_flip( \@matrix );
5364 foreach $row ( @matrix )
5366 for ( $i = 0; $i < @{ $row }; $i++ ) {
5367 $record->{ "V$i" } = $row->[ $i ];
5370 Maasha::Biopieces::put_record( $record, $out );
5375 sub script_add_ident
5377 # Martin A. Hansen, May 2008.
5379 # Add a unique identifier to each record in stream.
5381 my ( $in, # handle to in stream
5382 $out, # handle to out stream
5383 $options, # options hash
5388 my ( $record, $key, $prefix, $i );
5390 $key = $options->{ "key" } || "ID";
5391 $prefix = $options->{ "prefix" } || "ID";
5395 while ( $record = Maasha::Biopieces::get_record( $in ) )
5397 $record->{ $key } = sprintf( "$prefix%08d", $i );
5399 Maasha::Biopieces::put_record( $record, $out );
5406 sub script_count_records
5408 # Martin A. Hansen, August 2007.
5410 # Count records in stream.
5412 my ( $in, # handle to in stream
5413 $out, # handle to out stream
5414 $options, # options hash
5419 my ( $record, $count, $result, $fh, $line );
5423 if ( $options->{ "no_stream" } )
5425 while ( $line = <$in> )
5429 $count++ if $line eq "---";
5434 while ( $record = Maasha::Biopieces::get_record( $in ) )
5436 Maasha::Biopieces::put_record( $record, $out );
5442 $result = { "RECORDS_COUNT" => $count };
5444 $fh = write_stream( $options->{ "data_out" } );
5446 Maasha::Biopieces::put_record( $result, $fh );
5452 sub script_random_records
5454 # Martin A. Hansen, August 2007.
5456 # Pick a number or random records from stream.
5458 my ( $in, # handle to in stream
5459 $out, # handle to out stream
5460 $options, # options hash
5465 my ( $record, $tmp_file, $fh_out, $fh_in, $count, $i, %rand_hash, $rand, $max );
5467 $options->{ "num" } ||= 10;
5469 $tmp_file = "$BP_TMP/random_records.tmp";
5471 $fh_out = Maasha::Common::write_open( $tmp_file );
5475 while ( $record = Maasha::Biopieces::get_record( $in ) )
5477 Maasha::Biopieces::put_record( $record, $fh_out );
5487 Maasha::Common::error( qq(Requested random records > records in stream) ) if $options->{ "num" } > $count;
5489 while ( $i < $options->{ "num" } )
5491 $rand = int( rand( $count ) );
5493 if ( not exists $rand_hash{ $rand } )
5495 $rand_hash{ $rand } = 1;
5497 $max = $rand if $rand > $max;
5503 $fh_in = Maasha::Common::read_open( $tmp_file );
5507 while ( $record = Maasha::Biopieces::get_record( $fh_in ) )
5509 Maasha::Biopieces::put_record( $record, $out ) if exists $rand_hash{ $count };
5511 last if $count == $max;
5522 sub script_sort_records
5524 # Martin A. Hansen, August 2007.
5526 # Sort to sort records according to keys.
5528 my ( $in, # handle to in stream
5529 $out, # handle to out stream
5530 $options, # options hash
5535 my ( @keys, $key, @sort_cmd, $sort_str, $sort_sub, @records, $record, $i );
5537 foreach $key ( @{ $options->{ "keys" } } )
5539 if ( $key =~ s/n$// ) {
5540 push @sort_cmd, qq(\$a->{ "$key" } <=> \$b->{ "$key" });
5542 push @sort_cmd, qq(\$a->{ "$key" } cmp \$b->{ "$key" });
5546 $sort_str = join " or ", @sort_cmd;
5547 $sort_sub = eval "sub { $sort_str }"; # NB security issue!
5549 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
5550 push @records, $record;
5553 @records = sort $sort_sub @records;
5555 if ( $options->{ "reverse" } )
5557 for ( $i = scalar @records - 1; $i >= 0; $i-- ) {
5558 Maasha::Biopieces::put_record( $records[ $i ], $out );
5563 for ( $i = 0; $i < scalar @records; $i++ ) {
5564 Maasha::Biopieces::put_record( $records[ $i ], $out );
5570 sub script_count_vals
5572 # Martin A. Hansen, August 2007.
5574 # Count records in stream.
5576 my ( $in, # handle to in stream
5577 $out, # handle to out stream
5578 $options, # options hash
5583 my ( $num, $record, %count_hash, @records, $tmp_file, $fh_out, $fh_in, $cache );
5585 $tmp_file = "$BP_TMP/count_cache.tmp";
5587 $fh_out = Maasha::Common::write_open( $tmp_file );
5592 while ( $record = Maasha::Biopieces::get_record( $in ) )
5594 map { $count_hash{ $_ }{ $record->{ $_ } }++ if exists $record->{ $_ } } @{ $options->{ "keys" } };
5596 push @records, $record;
5598 if ( scalar @records > 5_000_000 ) # too many records to hold in memory - use disk cache
5600 map { Maasha::Biopieces::put_record( $_, $fh_out ) } @records;
5607 print STDERR "verbose: records read $num\n" if ( $options->{ 'verbose' } and ( $num % 1_000_000 ) == 0 );
5618 $fh_in = Maasha::Common::read_open( $tmp_file );
5620 while ( $record = Maasha::Biopieces::get_record( $fh_in ) )
5622 map { $record->{ $_ . "_COUNT" } = $count_hash{ $_ }{ $record->{ $_ } } if exists $record->{ $_ } } @{ $options->{ "keys" } };
5624 Maasha::Biopieces::put_record( $record, $out );
5626 print STDERR "verbose: cache read $num\n" if ( $options->{ 'verbose' } and ( $num % 1_000_000 ) == 0 );
5634 foreach $record ( @records )
5636 map { $record->{ $_ . "_COUNT" } = $count_hash{ $_ }{ $record->{ $_ } } if exists $record->{ $_ } } @{ $options->{ "keys" } };
5638 Maasha::Biopieces::put_record( $record, $out );
5645 sub script_plot_histogram
5647 # Martin A. Hansen, September 2007.
5649 # Plot a simple histogram for a given key using GNU plot.
5651 my ( $in, # handle to in stream
5652 $out, # handle to out stream
5653 $options, # options hash
5658 my ( $record, %data_hash, $max, @data_list, $i, $result, $fh );
5660 $options->{ "title" } ||= "Histogram";
5661 $options->{ "sort" } ||= "num";
5663 while ( $record = Maasha::Biopieces::get_record( $in ) )
5665 $data_hash{ $record->{ $options->{ "key" } } }++ if defined $record->{ $options->{ "key" } };
5667 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
5670 if ( $options->{ "sort" } eq "num" ) {
5671 map { push @data_list, [ $_, $data_hash{ $_ } ] } sort { $a <=> $b } keys %data_hash;
5673 map { push @data_list, [ $_, $data_hash{ $_ } ] } sort keys %data_hash;
5676 $result = Maasha::Plot::histogram_simple( \@data_list, $options );
5678 $fh = write_stream( $options->{ "data_out" } );
5680 print $fh "$_\n" foreach @{ $result };
5686 sub script_plot_lendist
5688 # Martin A. Hansen, August 2007.
5690 # Plot length distribution using GNU plot.
5692 my ( $in, # handle to in stream
5693 $out, # handle to out stream
5694 $options, # options hash
5699 my ( $record, %data_hash, $max, @data_list, $i, $result, $fh );
5701 $options->{ "title" } ||= "Length Distribution";
5703 while ( $record = Maasha::Biopieces::get_record( $in ) )
5705 $data_hash{ $record->{ $options->{ "key" } } }++ if defined $record->{ $options->{ "key" } };
5707 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
5710 $max = Maasha::Calc::list_max( [ keys %data_hash ] );
5712 for ( $i = 0; $i < $max; $i++ ) {
5713 push @data_list, [ $i, $data_hash{ $i } || 0 ];
5716 $result = Maasha::Plot::histogram_lendist( \@data_list, $options );
5718 $fh = write_stream( $options->{ "data_out" } );
5720 print $fh "$_\n" foreach @{ $result };
5726 sub script_plot_chrdist
5728 # Martin A. Hansen, August 2007.
5730 # Plot chromosome distribution using GNU plot.
5732 my ( $in, # handle to in stream
5733 $out, # handle to out stream
5734 $options, # options hash
5739 my ( $record, %data_hash, @data_list, $elem, $sort_key, $count, $result, $fh );
5741 $options->{ "title" } ||= "Chromosome Distribution";
5743 while ( $record = Maasha::Biopieces::get_record( $in ) )
5745 if ( $record->{ "CHR" } ) { # generic
5746 $data_hash{ $record->{ "CHR" } }++;
5747 } elsif ( $record->{ "REC_TYPE" } eq "PATSCAN" and $record->{ "S_ID" } =~ /^chr/i ) { # patscan
5748 $data_hash{ $record->{ "S_ID" } }++;
5749 } elsif ( $record->{ "REC_TYPE" } eq "PSL" and $record->{ "S_ID" } =~ /^chr/i ) { # BLAT / PSL
5750 $data_hash{ $record->{ "S_ID" } }++;
5751 } elsif ( $record->{ "REC_TYPE" } eq "BLAST" and $record->{ "S_ID" } =~ /^chr/i ) { # BLAST
5752 $data_hash{ $record->{ "S_ID" } }++;
5755 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
5758 foreach $elem ( keys %data_hash )
5762 $sort_key =~ s/chr//i;
5764 $sort_key =~ s/^X(.*)/99$1/;
5765 $sort_key =~ s/^Y(.*)/99$1/;
5766 $sort_key =~ s/^Z(.*)/999$1/;
5767 $sort_key =~ s/^M(.*)/9999$1/;
5768 $sort_key =~ s/^U(.*)/99999$1/;
5770 $count = $sort_key =~ tr/_//;
5772 $sort_key =~ s/_.*/"999999" x $count/ex;
5774 push @data_list, [ $elem, $data_hash{ $elem }, $sort_key ];
5777 @data_list = sort { $a->[ 2 ] <=> $b->[ 2 ] } @data_list;
5779 $result = Maasha::Plot::histogram_chrdist( \@data_list, $options );
5781 $fh = write_stream( $options->{ "data_out" } );
5783 print $fh "$_\n" foreach @{ $result };
5789 sub script_plot_karyogram
5791 # Martin A. Hansen, August 2007.
5793 # Plot hits on karyogram.
5795 my ( $in, # handle to in stream
5796 $out, # handle to out stream
5797 $options, # options hash
5802 my ( %options, $record, @data, $fh, $result, %data_hash );
5804 $options->{ "genome" } ||= "human";
5805 $options->{ "feat_color" } ||= "black";
5807 while ( $record = Maasha::Biopieces::get_record( $in ) )
5809 if ( $record->{ "CHR" } and $record->{ "CHR_BEG" } and $record->{ "CHR_END" } )
5811 push @{ $data_hash{ $record->{ "CHR" } } }, [ $record->{ "CHR_BEG" }, $record->{ "CHR_END" }, $options->{ "feat_color" } ];
5814 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
5817 $result = Maasha::Plot::karyogram( \%data_hash, \%options );
5819 $fh = write_stream( $options->{ "data_out" } );
5827 sub script_plot_matches
5829 # Martin A. Hansen, August 2007.
5831 # Plot matches in 2D generating a dotplot.
5833 my ( $in, # handle to in stream
5834 $out, # handle to out stream
5835 $options, # options hash
5840 my ( $record, @data, $fh, $result, %data_hash );
5842 $options->{ "direction" } ||= "both";
5844 while ( $record = Maasha::Biopieces::get_record( $in ) )
5846 if ( defined $record->{ "Q_BEG" } and defined $record->{ "S_BEG" } and $record->{ "Q_END" } and $record->{ "S_END" } ) {
5847 push @data, $record;
5850 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
5853 $options->{ "title" } ||= "plot_matches";
5854 $options->{ "xlabel" } ||= $data[ 0 ]->{ "Q_ID" };
5855 $options->{ "ylabel" } ||= $data[ 0 ]->{ "S_ID" };
5857 $result = Maasha::Plot::dotplot_matches( \@data, $options, $BP_TMP );
5859 $fh = write_stream( $options->{ "data_out" } );
5861 print $fh "$_\n" foreach @{ $result };
5867 sub script_length_vals
5869 # Martin A. Hansen, August 2007.
5871 # Determine the length of the value for given keys.
5873 my ( $in, # handle to in stream
5874 $out, # handle to out stream
5875 $options, # options hash
5880 my ( $record, $key );
5882 while ( $record = Maasha::Biopieces::get_record( $in ) )
5884 foreach $key ( @{ $options->{ "keys" } } )
5886 if ( $record->{ $key } ) {
5887 $record->{ $key . "_LEN" } = length $record->{ $key };
5891 Maasha::Biopieces::put_record( $record, $out );
5898 # Martin A. Hansen, August 2007.
5900 # Calculates the sums for values of given keys.
5902 my ( $in, # handle to in stream
5903 $out, # handle to out stream
5904 $options, # options hash
5909 my ( $record, $key, %sum_hash, $fh );
5911 while ( $record = Maasha::Biopieces::get_record( $in ) )
5913 foreach $key ( @{ $options->{ "keys" } } )
5915 if ( $record->{ $key } ) {
5916 $sum_hash{ $key } += $record->{ $key };
5920 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
5923 $fh = write_stream( $options->{ "data_out" } );
5925 foreach $key ( @{ $options->{ "keys" } } ) {
5926 Maasha::Biopieces::put_record( { $key . "_SUM" => $sum_hash{ $key } || 0 } , $fh );
5933 sub script_mean_vals
5935 # Martin A. Hansen, August 2007.
5937 # Calculate the mean of values of given keys.
5939 my ( $in, # handle to in stream
5940 $out, # handle to out stream
5941 $options, # options hash
5946 my ( $record, $key, %sum_hash, %count_hash, $mean, $fh );
5948 while ( $record = Maasha::Biopieces::get_record( $in ) )
5950 foreach $key ( @{ $options->{ "keys" } } )
5952 if ( $record->{ $key } )
5954 $sum_hash{ $key } += $record->{ $key };
5955 $count_hash{ $key }++;
5959 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
5962 $fh = write_stream( $options->{ "data_out" } );
5964 foreach $key ( @{ $options->{ "keys" } } )
5966 if ( $count_hash{ $key } ) {
5967 $mean = sprintf( "%.2f", ( $sum_hash{ $key } / $count_hash{ $key } ) );
5972 Maasha::Biopieces::put_record( { $key . "_MEAN" => $mean } , $fh );
5979 sub script_median_vals
5981 # Martin A. Hansen, March 2008.
5983 # Calculate the median values of given keys.
5985 my ( $in, # handle to in stream
5986 $out, # handle to out stream
5987 $options, # options hash
5992 my ( $record, $key, %median_hash, $median, $fh );
5994 while ( $record = Maasha::Biopieces::get_record( $in ) )
5996 foreach $key ( @{ $options->{ "keys" } } ) {
5997 push @{ $median_hash{ $key } }, $record->{ $key } if defined $record->{ $key };
6000 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
6003 $fh = write_stream( $options->{ "data_out" } );
6005 foreach $key ( @{ $options->{ "keys" } } )
6007 if ( $median_hash{ $key } ) {
6008 $median = Maasha::Calc::median( $median_hash{ $key } );
6013 Maasha::Biopieces::put_record( { $key . "_MEDIAN" => $median } , $fh );
6022 # Martin A. Hansen, February 2008.
6024 # Determine the maximum values of given keys.
6026 my ( $in, # handle to in stream
6027 $out, # handle to out stream
6028 $options, # options hash
6033 my ( $record, $key, $fh, %max_hash, $max_record );
6035 while ( $record = Maasha::Biopieces::get_record( $in ) )
6037 foreach $key ( @{ $options->{ "keys" } } )
6039 if ( $record->{ $key } )
6041 $max_hash{ $key } = $record->{ $key } if $record->{ $key } > $max_hash{ $key };
6045 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
6048 $fh = write_stream( $options->{ "data_out" } );
6050 foreach $key ( @{ $options->{ "keys" } } )
6052 $max_record->{ $key . "_MAX" } = $max_hash{ $key };
6055 Maasha::Biopieces::put_record( $max_record, $fh );
6063 # Martin A. Hansen, February 2008.
6065 # Determine the minimum values of given keys.
6067 my ( $in, # handle to in stream
6068 $out, # handle to out stream
6069 $options, # options hash
6074 my ( $record, $key, $fh, %min_hash, $min_record );
6076 while ( $record = Maasha::Biopieces::get_record( $in ) )
6078 foreach $key ( @{ $options->{ "keys" } } )
6080 if ( defined $record->{ $key } )
6082 if ( exists $min_hash{ $key } ) {
6083 $min_hash{ $key } = $record->{ $key } if $record->{ $key } < $min_hash{ $key };
6085 $min_hash{ $key } = $record->{ $key };
6090 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
6093 $fh = write_stream( $options->{ "data_out" } );
6095 foreach $key ( @{ $options->{ "keys" } } )
6097 $min_record->{ $key . "_MIN" } = $min_hash{ $key };
6100 Maasha::Biopieces::put_record( $min_record, $fh );
6106 sub script_upload_to_ucsc
6108 # Martin A. Hansen, August 2007.
6110 # Calculate the mean of values of given keys.
6112 # This routine has developed into the most ugly hack. Do something!
6114 my ( $in, # handle to in stream
6115 $out, # handle to out stream
6116 $options, # options hash
6121 my ( $record, $file, $wib_file, $wig_file, $wib_dir, $fh_out, $i, $first, $format, $type, $columns, $append, $entry );
6123 $options->{ "short_label" } ||= $options->{ 'table' };
6124 $options->{ "long_label" } ||= $options->{ 'table' };
6125 $options->{ "group" } ||= $ENV{ "LOGNAME" };
6126 $options->{ "priority" } ||= 1;
6127 $options->{ "visibility" } ||= "pack";
6128 $options->{ "color" } ||= join( ",", int( rand( 255 ) ), int( rand( 255 ) ), int( rand( 255 ) ) );
6129 $options->{ "chunk_size" } ||= 10_000_000_000; # Due to 32-bit UCSC compilation really large tables cannot be loaded in one go.
6131 $file = "$BP_TMP/ucsc_upload.tmp";
6136 $fh_out = Maasha::Common::write_open( $file );
6138 while ( $record = Maasha::Biopieces::get_record( $in ) )
6140 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
6142 if ( $record->{ "REC_TYPE" } eq "fixed_step" )
6146 if ( $entry = Maasha::UCSC::Wiggle::biopiece2fixedstep( $record ) ) {
6147 Maasha::UCSC::Wiggle::fixedstep_entry_put( $entry, $fh_out );
6150 elsif ( $record->{ "REC_TYPE" } eq "PSL" )
6154 Maasha::UCSC::psl_put_header( $fh_out ) if $first;
6155 Maasha::UCSC::psl_put_entry( $record, $fh_out );
6159 elsif ( $record->{ "REC_TYPE" } eq "BED" and $record->{ "SEC_STRUCT" } )
6161 # chrom chromStart chromEnd name score strand size secStr conf
6165 print $fh_out join ( "\t",
6167 $record->{ "CHR_BEG" },
6168 $record->{ "CHR_END" } + 1,
6169 $record->{ "Q_ID" },
6170 $record->{ "SCORE" },
6171 $record->{ "STRAND" },
6172 $record->{ "SIZE" },
6173 $record->{ "SEC_STRUCT" },
6174 $record->{ "CONF" },
6177 elsif ( $record->{ "REC_TYPE" } eq "BED" )
6180 $columns = $record->{ "BED_COLS" };
6182 if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) {
6183 Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns, $options->{ 'check' } );
6186 elsif ( $record->{ "REC_TYPE" } eq "PATSCAN" and $record->{ "CHR" } )
6191 if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) {
6192 Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns, $options->{ 'check' } );
6195 elsif ( $record->{ "REC_TYPE" } eq "BLAST" and $record->{ "S_ID" } =~ /^chr/ )
6200 $record->{ "SCORE" } = $record->{ "BIT_SCORE" } * 1000;
6202 if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) {
6203 Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns, $options->{ 'check' } );
6206 elsif ( $record->{ "REC_TYPE" } eq "VMATCH" and $record->{ "S_ID" } =~ /^chr/i )
6211 if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) {
6212 Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns, $options->{ 'check' } );
6216 if ( $i == $options->{ "chunk_size" } )
6220 if ( $format eq "BED" ) {
6221 Maasha::UCSC::bed_upload_to_ucsc( $BP_TMP, $file, $options, $append );
6222 } elsif ( $format eq "PSL" ) {
6223 Maasha::UCSC::psl_upload_to_ucsc( $file, $options, $append );
6232 $fh_out = Maasha::Common::write_open( $file );
6240 if ( exists $options->{ "database" } and $options->{ "table" } )
6242 if ( $format eq "BED" )
6244 $type = "bed $columns";
6246 Maasha::UCSC::bed_upload_to_ucsc( $BP_TMP, $file, $options, $append );
6248 elsif ( $format eq "BED_SS" )
6250 $type = "type bed 6 +";
6252 Maasha::UCSC::bed_upload_to_ucsc( $BP_TMP, $file, $options, $append );
6254 elsif ( $format eq "PSL" )
6258 Maasha::UCSC::psl_upload_to_ucsc( $file, $options, $append );
6260 elsif ( $format eq "WIGGLE" )
6262 $options->{ "visibility" } = "full";
6264 $wig_file = "$options->{ 'table' }.wig";
6265 $wib_file = "$options->{ 'table' }.wib";
6267 $wib_dir = "$ENV{ 'HOME' }/ucsc/wib";
6269 Maasha::Common::dir_create_if_not_exists( $wib_dir );
6271 if ( $options->{ 'verbose' } ) {
6272 `cd $BP_TMP && wigEncode $file $wig_file $wib_file`;
6274 `cd $BP_TMP && wigEncode $file $wig_file $wib_file > /dev/null 2>&1`;
6277 Maasha::Common::run( "mv", "$BP_TMP/$wib_file $wib_dir" );
6285 Maasha::UCSC::wiggle_upload_to_ucsc( $BP_TMP, $wib_dir, $file, $options );
6290 Maasha::UCSC::ucsc_update_config( $options, $type );
6295 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
6300 # Martin A. Hansen, November 2009.
6302 # Uses keys from a lookup hash to search records. Optionally, a list of
6303 # keys can be given so the lookup is limited to these, also, flags
6304 # can be given to limit lookup to keys or vals only. Returns 1 if lookup
6305 # succeeded, else 0.
6307 my ( $lookup_hash, # hashref with patterns
6309 $keys, # list of keys - OPTIONAL
6310 $vals_only, # only vals flag - OPTIONAL
6311 $keys_only, # only keys flag - OPTIONAL
6318 map { return 1 if exists $lookup_hash->{ $record->{ $_ } } } @{ $keys };
6322 if ( not $vals_only ) {
6323 map { return 1 if exists $lookup_hash->{ $_ } } keys %{ $record };
6326 if ( not $keys_only ) {
6327 map { return 1 if exists $lookup_hash->{ $record->{ $_ } } } keys %{ $record };
6337 # Martin A. Hansen, November 2009.
6339 # Uses patterns to match records containing the pattern as a substring.
6340 # Returns 1 if the record is matched, else 0.
6342 my ( $patterns, # list of patterns
6344 $keys, # list of keys - OPTIONAL
6345 $vals_only, # only vals flag - OPTIONAL
6346 $keys_only, # only keys flag - OPTIONAL
6353 foreach $pattern ( @{ $patterns } )
6357 map { return 1 if index( $record->{ $_ }, $pattern ) >= 0 } @{ $keys };
6361 if ( not $vals_only ) {
6362 map { return 1 if index( $_, $pattern ) >= 0 } keys %{ $record };
6365 if ( not $keys_only ) {
6366 map { return 1 if index( $record->{ $_ }, $pattern ) >= 0 } keys %{ $record };
6377 # Martin A. Hansen, November 2009.
6379 # Uses regex to match records.
6380 # Returns 1 if the record is matched, else 0.
6382 my ( $regex, # regex to match
6384 $keys, # list of keys - OPTIONAL
6385 $vals_only, # only vals flag - OPTIONAL
6386 $keys_only, # only keys flag - OPTIONAL
6393 map { return 1 if $record->{ $_ } =~ /$regex/ } @{ $keys };
6397 if ( not $vals_only ) {
6398 map { return 1 if $_ =~ /$regex/ } keys %{ $record };
6401 if ( not $keys_only ) {
6402 map { return 1 if $record->{ $_ } =~ /$regex/ } keys %{ $record };
6412 # Martin A. Hansen, November 2009.
6414 # Test if the value of a given record key evaluates according
6415 # to a given operator. Returns 1 if eval is OK, else 0.
6417 my ( $key, # record key
6425 if ( defined $record->{ $key } )
6427 return 1 if ( $op eq "<" and $record->{ $key } < $val );
6428 return 1 if ( $op eq ">" and $record->{ $key } > $val );
6429 return 1 if ( $op eq ">=" and $record->{ $key } >= $val );
6430 return 1 if ( $op eq "<=" and $record->{ $key } <= $val );
6431 return 1 if ( $op eq "=" and $record->{ $key } == $val );
6432 return 1 if ( $op eq "!=" and $record->{ $key } != $val );
6433 return 1 if ( $op eq "eq" and $record->{ $key } eq $val );
6434 return 1 if ( $op eq "ne" and $record->{ $key } ne $val );
6441 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<