1 package Maasha::BioRun;
4 # Copyright (C) 2007-2009 Martin A. Hansen.
6 # This program is free software; you can redistribute it and/or
7 # modify it under the terms of the GNU General Public License
8 # as published by the Free Software Foundation; either version 2
9 # of the License, or (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 # http://www.gnu.org/copyleft/gpl.html
23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
26 # Routines that contains Biopieces which are run.
29 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
34 use Getopt::Long qw( :config bundling );
35 use Time::HiRes qw( gettimeofday );
36 use Storable qw( dclone );
37 use Maasha::Biopieces;
46 use Maasha::UCSC::BED;
47 use Maasha::UCSC::Wiggle;
56 use vars qw( @ISA @EXPORT_OK );
60 @ISA = qw( Exporter );
68 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> GLOBALS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
71 my ( $script, $BP_TMP );
73 $script = Maasha::Common::get_scriptname();
74 $BP_TMP = Maasha::Common::get_tmpdir();
77 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> RUN SCRIPT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
80 run_script( $script );
83 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SUBROUTINES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
88 # Martin A. Hansen, August 2007.
90 # Run a specific script.
92 my ( $script, # script name
97 my ( $t0, $t1, $options, $in, $out );
99 Maasha::Biopieces::log_biopiece();
101 $t0 = gettimeofday();
103 $options = get_options( $script );
105 $options->{ "SCRIPT" } = $script;
107 $script = "print_usage" if ( -t STDIN and keys %{ $options } <= 1 or $options->{ 'help' } );
109 $in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } );
110 $out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } );
112 if ( $script eq "print_usage" ) { script_print_usage( $in, $out, $options ) }
113 elsif ( $script eq "complexity_seq" ) { script_complexity_seq( $in, $out, $options ) }
114 elsif ( $script eq "get_genome_align" ) { script_get_genome_align( $in, $out, $options ) }
115 elsif ( $script eq "get_genome_phastcons" ) { script_get_genome_phastcons( $in, $out, $options ) }
116 elsif ( $script eq "remove_mysql_tables" ) { script_remove_mysql_tables( $in, $out, $options ) }
117 elsif ( $script eq "remove_ucsc_tracks" ) { script_remove_ucsc_tracks( $in, $out, $options ) }
119 close $in if defined $in;
122 $t1 = gettimeofday();
124 print STDERR "Program: $script" . ( " " x ( 25 - length( $script ) ) ) . sprintf( "Run time: %.4f\n", ( $t1 - $t0 ) ) if $options->{ 'verbose' };
130 # Martin A. Hansen, February 2008.
132 # Gets options from commandline and checks these vigerously.
134 my ( $script, # name of script
139 my ( %options, @options, $opt, @genomes, $real );
141 if ( $script eq "print_usage" )
147 elsif ( $script eq "get_genome_align" )
158 elsif ( $script eq "get_genome_phastcons" )
169 elsif ( $script eq "remove_mysql_tables" )
180 elsif ( $script eq "remove_ucsc_tracks" )
200 # print STDERR Dumper( \@options );
207 # print STDERR Dumper( \%options );
209 if ( -t STDIN && scalar( keys %options ) == 0 or $options{ "help" } ) {
210 return wantarray ? %options : \%options;
213 $options{ "cols" } = [ split ",", $options{ "cols" } ] if defined $options{ "cols" };
214 $options{ "keys" } = [ split ",", $options{ "keys" } ] if defined $options{ "keys" };
215 $options{ "no_keys" } = [ split ",", $options{ "no_keys" } ] if defined $options{ "no_keys" };
216 $options{ "save_keys" } = [ split ",", $options{ "save_keys" } ] if defined $options{ "save_keys" };
217 $options{ "quals" } = [ split ",", $options{ "quals" } ] if defined $options{ "quals" };
218 $options{ "feats" } = [ split ",", $options{ "feats" } ] if defined $options{ "feats" };
219 $options{ "frames" } = [ split ",", $options{ "frames" } ] if defined $options{ "frames" };
220 $options{ "samples" } = [ split ",", $options{ "samples" } ] if defined $options{ "samples" };
221 $options{ "tables" } = [ split ",", $options{ "tables" } ] if defined $options{ "tables" };
222 $options{ "tracks" } = [ split ",", $options{ "tracks" } ] if defined $options{ "tracks" };
224 # ---- check arguments ----
226 if ( $options{ 'data_in' } )
228 $options{ "files" } = Maasha::Biopieces::getopt_files( $options{ 'data_in' } );
230 Maasha::Common::error( qq(Argument to --data_in must be a valid file or fileglob expression) ) if scalar @{ $options{ "files" } } == 0;
233 map { Maasha::Common::error( qq(Argument to --cols must be a whole numbers - not "$_") ) if $_ !~ /^\d+$/ } @{ $options{ "cols" } } if $options{ "cols" };
235 # print STDERR Dumper( \%options );
237 $real = "beg|end|word_size|wrap|len|prefix_length|mismatches|offset|num|skip|cpus|window_size|step_size";
239 foreach $opt ( keys %options )
241 if ( $opt =~ /stream_in|pattern_in|exact_in/ and not -f $options{ $opt } )
243 Maasha::Common::error( qq(Argument to --$opt must be a valid file or fileglob expression - not "$options{ $opt }") );
245 elsif ( $opt =~ /$real/ and $options{ $opt } !~ /^\d+$/ )
247 Maasha::Common::error( qq(Argument to --$opt must be a whole number - not "$options{ $opt }") );
249 elsif ( $opt =~ /max_hits|max_hits|max_misses|dist|edit_dist|flank|gap|hamming_dist|priority/ and $options{ $opt } !~ /^-?\d+$/ )
251 Maasha::Common::error( qq(Argument to --$opt must be an integer - not "$options{ $opt }") );
253 elsif ( $opt =~ /identity|threshold/ and $options{ $opt } !~ /^-?(?:\d+(?:\.\d*)?|\.\d+)$/ )
255 Maasha::Common::error( qq(Argument to --$opt must be a decimal number - not "$options{ $opt }") );
257 elsif ( $opt =~ /e_val/ and $options{ $opt } !~ /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/ )
259 Maasha::Common::error( qq(Argument to --$opt must be a float - not "$options{ $opt }") );
261 elsif ( $opt =~ /strand/ and $options{ $opt } !~ /^(\+|-)$/ )
263 Maasha::Common::error( qq(Argument to --$opt must be "+" or "-" - not "$options{ $opt }") );
265 elsif ( $opt eq "genome" )
267 @genomes = Maasha::Common::ls_dirs( "$ENV{ 'BP_DATA' }/genomes" );
268 map { $_ =~ s/.*\/(.+)$/$1/ } @genomes;
270 if ( not grep { $_ =~ /^$options{ $opt }$/ } @genomes ) {
271 Maasha::Common::error( qq(Genome $options{ $opt } not found in "$ENV{ 'BP_DATA' }/genomes/") );
274 elsif ( $opt eq "terminal" and not $options{ $opt } =~ /^(svg|post|dumb|x11)/ )
276 Maasha::Common::error( qq(Bad --$opt argument "$options{ $opt }") );
278 elsif ( $opt eq "table" and $options{ $opt } =~ /(-|\.)/ )
280 Maasha::Common::error( qq(Character '$1' is not allowed in table name: $options{ $opt }) );
282 elsif ( $opt eq "merge" and $options{ $opt } !~ /^(AandB|AorB|BorA|AnotB|BnotA)$/ )
284 Maasha::Common::error( qq(Argument to --$opt must be AandB, AorB, BorA, AnotB, or BnotA - not "$options{ $opt }") );
288 Maasha::Common::error( qq(no --database specified) ) if $script eq "remove_ucsc_tracks" and not $options{ "database" };
289 Maasha::Common::error( qq(no --genome specified) ) if $script =~ /get_genome_align|get_genome_phastcons/ and not $options{ "genome" };
291 return wantarray ? %options : \%options;
295 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SCRIPTS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
298 sub script_print_usage
300 # Martin A. Hansen, January 2008.
302 # Retrieves usage information from file and
303 # prints this nicely formatted.
305 my ( $in, # handle to in stream
306 $out, # handle to out stream
307 $options, # options hash
312 my ( $file, $wiki, $lines );
314 if ( $options->{ 'data_in' } ) {
315 $file = $options->{ 'data_in' };
317 $file = join "", $ENV{ 'BP_DIR' }, "/bp_usage/", $options->{ 'SCRIPT' }, ".wiki";
320 $wiki = Maasha::Gwiki::gwiki_read( $file );
322 ( $wiki->[ 2 ], $wiki->[ 3 ], $wiki->[ 0 ], $wiki->[ 1 ] ) = ( $wiki->[ 0 ], $wiki->[ 1 ], $wiki->[ 2 ], $wiki->[ 3 ] );
324 if ( not $options->{ "help" } ) {
325 @{ $wiki } = grep { $_->[ 0 ]->{ 'SECTION' } =~ /Biopiece|summary|Usage|Options|Help/ } @{ $wiki };
328 $lines = Maasha::Gwiki::gwiki2ascii( $wiki );
330 print STDERR "$_\n" foreach @{ $lines };
336 sub script_complexity_seq
338 # Martin A. Hansen, May 2008.
340 # Generates an index calculated as the most common di-residue over
341 # the sequence length for all sequences in stream.
343 my ( $in, # handle to in stream
344 $out, # handle to out stream
349 my ( $record, $index );
351 while ( $record = Maasha::Biopieces::get_record( $in ) )
353 $record->{ "SEQ_COMPLEXITY" } = sprintf( "%.2f", Maasha::Seq::seq_complexity( $record->{ "SEQ" } ) ) if $record->{ "SEQ" };
355 Maasha::Biopieces::put_record( $record, $out );
360 sub script_get_genome_align
362 # Martin A. Hansen, April 2008.
364 # Gets a subalignment from a multiple genome alignment.
366 my ( $in, # handle to in stream
367 $out, # handle to out stream
368 $options, # options hash
373 my ( $record, $maf_track, $align, $align_num, $beg, $end, $len, $entry );
375 $options->{ "strand" } ||= "+";
379 $maf_track = Maasha::Config::maf_track( $options->{ "genome" } );
381 if ( $options->{ "chr" } and $options->{ "beg" } and ( $options->{ "end" } or $options->{ "len" } ) )
383 $beg = $options->{ "beg" } - 1;
385 if ( $options->{ "end" } ) {
386 $end = $options->{ "end" };
387 } elsif ( $options->{ "len" } ) {
388 $end = $beg + $options->{ "len" };
391 $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $options->{ "chr" }, $beg, $end, $options->{ "strand" } );
393 foreach $entry ( @{ $align } )
395 $entry->{ "CHR" } = $record->{ "CHR" };
396 $entry->{ "CHR_BEG" } = $record->{ "CHR_BEG" };
397 $entry->{ "CHR_END" } = $record->{ "CHR_END" };
398 $entry->{ "STRAND" } = $record->{ "STRAND" } || '+';
399 $entry->{ "Q_ID" } = $record->{ "Q_ID" };
400 $entry->{ "SCORE" } = $record->{ "SCORE" };
402 Maasha::Biopieces::put_record( $entry, $out );
406 while ( $record = Maasha::Biopieces::get_record( $in ) )
408 if ( $record->{ "REC_TYPE" } eq "BED" )
410 $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "CHR" }, $record->{ "CHR_BEG" }, $record->{ "CHR_END" }, $record->{ "STRAND" } );
412 elsif ( $record->{ "REC_TYPE" } eq "VMATCH" )
414 $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" } + 1, $record->{ "STRAND" } );
416 elsif ( $record->{ "REC_TYPE" } eq "PSL" )
418 $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $record->{ "STRAND" } );
420 elsif ( $record->{ "REC_TYPE" } eq "BLAST" )
422 $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $record->{ "STRAND" } );
425 foreach $entry ( @{ $align } )
427 $entry->{ "CHR" } = $record->{ "CHR" };
428 $entry->{ "CHR_BEG" } = $record->{ "CHR_BEG" };
429 $entry->{ "CHR_END" } = $record->{ "CHR_END" };
430 $entry->{ "STRAND" } = $record->{ "STRAND" };
431 $entry->{ "Q_ID" } = $record->{ "Q_ID" };
432 $entry->{ "SCORE" } = $record->{ "SCORE" };
434 Maasha::Biopieces::put_record( $entry, $out );
442 sub script_get_genome_phastcons
444 # Martin A. Hansen, February 2008.
446 # Get phastcons scores from genome intervals.
448 my ( $in, # handle to in stream
449 $out, # handle to out stream
450 $options, # options hash
455 my ( $phastcons_file, $phastcons_index, $index, $fh_phastcons, $scores, $record );
457 $options->{ "flank" } ||= 0;
459 $phastcons_file = Maasha::Config::genome_phastcons( $options->{ "genome" } );
460 $phastcons_index = Maasha::Config::genome_phastcons_index( $options->{ "genome" } );
462 $index = Maasha::UCSC::fixedstep_index_retrieve( $phastcons_index );
463 $fh_phastcons = Maasha::Common::read_open( $phastcons_file );
465 if ( defined $options->{ "chr" } and defined $options->{ "beg" } and ( defined $options->{ "end" } or defined $options->{ "len" } ) )
467 $options->{ "beg" } -= 1; # request is 1-based
468 $options->{ "end" } -= 1; # request is 1-based
470 if ( $options->{ "len" } ) {
471 $options->{ "end" } = $options->{ "beg" } + $options->{ "len" } - 1;
474 $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $options->{ "chr" }, $options->{ "beg" }, $options->{ "end" }, $options->{ "flank" } );
476 $record->{ "CHR" } = $options->{ "chr" };
477 $record->{ "CHR_BEG" } = $options->{ "beg" } - $options->{ "flank" };
478 $record->{ "CHR_END" } = $options->{ "end" } + $options->{ "flank" };
480 $record->{ "PHASTCONS" } = join ",", @{ $scores };
481 $record->{ "PHAST_COUNT" } = scalar @{ $scores }; # DEBUG
483 Maasha::Biopieces::put_record( $record, $out );
486 while ( $record = Maasha::Biopieces::get_record( $in ) )
488 if ( $record->{ "REC_TYPE" } eq "BED" )
490 $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "CHR" }, $record->{ "CHR_BEG" }, $record->{ "CHR_END" }, $options->{ "flank" } );
492 elsif ( $record->{ "REC_TYPE" } eq "PSL" )
494 $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $options->{ "flank" } );
496 elsif ( $record->{ "REC_TYPE" } eq "BLAST" )
498 $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $options->{ "flank" } );
501 $record->{ "PHASTCONS" } = join ",", @{ $scores } if @{ $scores };
502 # $record->{ "PHAST_COUNT" } = @{ $scores } if @{ $scores }; # DEBUG
504 Maasha::Biopieces::put_record( $record, $out );
507 close $fh_phastcons if $fh_phastcons;
511 sub script_remove_mysql_tables
513 # Martin A. Hansen, November 2008.
515 # Remove MySQL tables from values in stream.
517 my ( $in, # handle to in stream
518 $out, # handle to out stream
519 $options, # options hash
524 my ( $record, %table_hash, $dbh, $table );
526 $options->{ "user" } ||= Maasha::UCSC::ucsc_get_user();
527 $options->{ "password" } ||= Maasha::UCSC::ucsc_get_password();
529 map { $table_hash{ $_ } = 1 } @{ $options->{ 'tables' } };
531 while ( $record = Maasha::Biopieces::get_record( $in ) )
533 map { $table_hash{ $record->{ $_ } } = 1 } @{ $options->{ 'keys' } };
535 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ 'no_stream' };
538 $dbh = Maasha::SQL::connect( $options->{ "database" }, $options->{ "user" }, $options->{ "password" } );
540 foreach $table ( sort keys %table_hash )
542 if ( Maasha::SQL::table_exists( $dbh, $table ) )
544 print STDERR qq(Removing table "$table" from database "$options->{ 'database' }" ... ) if $options->{ 'verbose' };
545 Maasha::SQL::delete_table( $dbh, $table );
546 print STDERR "done.\n" if $options->{ 'verbose' };
550 print STDERR qq(WARNING: table "$table" not found in database "$options->{ 'database' }\n");
554 Maasha::SQL::disconnect( $dbh );
558 sub script_remove_ucsc_tracks
560 # Martin A. Hansen, November 2008.
562 # Remove track from MySQL tables and config file.
564 my ( $in, # handle to in stream
565 $out, # handle to out stream
566 $options, # options hash
571 my ( $record, %track_hash, $fh_in, $fh_out, $track, @tracks, @new_tracks, $dbh );
573 $options->{ 'user' } ||= Maasha::UCSC::ucsc_get_user();
574 $options->{ 'password' } ||= Maasha::UCSC::ucsc_get_password();
575 $options->{ 'config_file' } ||= "$ENV{ 'HOME' }/ucsc/my_tracks.ra";
577 map { $track_hash{ $_ } = 1 } @{ $options->{ 'tracks' } };
579 while ( $record = Maasha::Biopieces::get_record( $in ) )
581 map { $track_hash{ $record->{ $_ } } = 1 } @{ $options->{ 'keys' } };
583 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ 'no_stream' };
586 $fh_in = Maasha::Filesys::file_read_open( $options->{ 'config_file' } );
588 while ( $track = Maasha::UCSC::ucsc_config_entry_get( $fh_in ) ) {
589 push @tracks, $track;
594 foreach $track ( @tracks )
596 if ( $track->{ 'database' } eq $options->{ 'database' } and exists $track_hash{ $track->{ 'track' } } ) {
597 print STDERR qq(Removing track "$track->{ 'track' }" from config file.\n) if $options->{ 'verbose' };
599 push @new_tracks, $track;
603 rename "$options->{ 'config_file' }", "$options->{ 'config_file' }~";
605 $fh_out = Maasha::Filesys::file_write_open( $options->{ 'config_file' } );
607 map { Maasha::UCSC::ucsc_config_entry_put( $_, $fh_out ) } @new_tracks;
611 # ---- locate track in database ----
613 $dbh = Maasha::SQL::connect( $options->{ "database" }, $options->{ "user" }, $options->{ "password" } );
615 foreach $track ( sort keys %track_hash )
617 if ( Maasha::SQL::table_exists( $dbh, $track ) )
619 print STDERR qq(Removing table "$track" from database "$options->{ 'database' }" ... ) if $options->{ 'verbose' };
620 Maasha::SQL::delete_table( $dbh, $track );
621 print STDERR "done.\n" if $options->{ 'verbose' };
625 print STDERR qq(WARNING: table "$track" not found in database "$options->{ 'database' }\n");
629 Maasha::SQL::disconnect( $dbh );
631 Maasha::Common::run( "ucscMakeTracks.pl", "-b > /dev/null 2>&1" );
635 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<