1 package Maasha::BioRun;
4 # Copyright (C) 2007-2009 Martin A. Hansen.
6 # This program is free software; you can redistribute it and/or
7 # modify it under the terms of the GNU General Public License
8 # as published by the Free Software Foundation; either version 2
9 # of the License, or (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 # http://www.gnu.org/copyleft/gpl.html
23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
26 # Routines that contains Biopieces which are run.
29 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
34 use Getopt::Long qw( :config bundling );
35 use Time::HiRes qw( gettimeofday );
36 use Storable qw( dclone );
37 use Maasha::Biopieces;
43 use Maasha::Stockholm;
47 use Maasha::UCSC::BED;
48 use Maasha::UCSC::Wiggle;
57 use vars qw( @ISA @EXPORT_OK );
61 @ISA = qw( Exporter );
69 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> GLOBALS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
72 my ( $script, $BP_TMP );
74 $script = Maasha::Common::get_scriptname();
75 $BP_TMP = Maasha::Common::get_tmpdir();
78 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> RUN SCRIPT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
81 run_script( $script );
84 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SUBROUTINES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
89 # Martin A. Hansen, August 2007.
91 # Run a specific script.
93 my ( $script, # script name
98 my ( $t0, $t1, $options, $in, $out );
100 Maasha::Biopieces::log_biopiece();
102 $t0 = gettimeofday();
104 $options = get_options( $script );
106 $options->{ "SCRIPT" } = $script;
108 $script = "print_usage" if ( -t STDIN and keys %{ $options } <= 1 or $options->{ 'help' } );
110 $in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } );
111 $out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } );
113 if ( $script eq "print_usage" ) { script_print_usage( $in, $out, $options ) }
114 elsif ( $script eq "read_stockholm" ) { script_read_stockholm( $in, $out, $options ) }
115 elsif ( $script eq "read_phastcons" ) { script_read_phastcons( $in, $out, $options ) }
116 elsif ( $script eq "read_solid" ) { script_read_solid( $in, $out, $options ) }
117 elsif ( $script eq "read_mysql" ) { script_read_mysql( $in, $out, $options ) }
118 elsif ( $script eq "uppercase_seq" ) { script_uppercase_seq( $in, $out, $options ) }
119 elsif ( $script eq "complexity_seq" ) { script_complexity_seq( $in, $out, $options ) }
120 elsif ( $script eq "get_genome_align" ) { script_get_genome_align( $in, $out, $options ) }
121 elsif ( $script eq "get_genome_phastcons" ) { script_get_genome_phastcons( $in, $out, $options ) }
122 elsif ( $script eq "soap_seq" ) { script_soap_seq( $in, $out, $options ) }
123 elsif ( $script eq "remove_mysql_tables" ) { script_remove_mysql_tables( $in, $out, $options ) }
124 elsif ( $script eq "remove_ucsc_tracks" ) { script_remove_ucsc_tracks( $in, $out, $options ) }
125 elsif ( $script eq "upload_to_ucsc" ) { script_upload_to_ucsc( $in, $out, $options ) }
127 close $in if defined $in;
130 $t1 = gettimeofday();
132 print STDERR "Program: $script" . ( " " x ( 25 - length( $script ) ) ) . sprintf( "Run time: %.4f\n", ( $t1 - $t0 ) ) if $options->{ 'verbose' };
138 # Martin A. Hansen, February 2008.
140 # Gets options from commandline and checks these vigerously.
142 my ( $script, # name of script
147 my ( %options, @options, $opt, @genomes, $real );
149 if ( $script eq "print_usage" )
155 elsif ( $script eq "read_stockholm" )
162 elsif ( $script eq "read_phastcons" )
173 elsif ( $script eq "read_solid" )
181 elsif ( $script eq "read_mysql" )
190 elsif ( $script eq "get_genome_align" )
201 elsif ( $script eq "get_genome_phastcons" )
212 elsif ( $script eq "soap_seq" )
223 elsif ( $script eq "remove_mysql_tables" )
234 elsif ( $script eq "remove_ucsc_tracks" )
246 elsif ( $script eq "upload_to_ucsc" )
270 # print STDERR Dumper( \@options );
277 # print STDERR Dumper( \%options );
279 if ( -t STDIN && scalar( keys %options ) == 0 or $options{ "help" } ) {
280 return wantarray ? %options : \%options;
283 $options{ "cols" } = [ split ",", $options{ "cols" } ] if defined $options{ "cols" };
284 $options{ "keys" } = [ split ",", $options{ "keys" } ] if defined $options{ "keys" };
285 $options{ "no_keys" } = [ split ",", $options{ "no_keys" } ] if defined $options{ "no_keys" };
286 $options{ "save_keys" } = [ split ",", $options{ "save_keys" } ] if defined $options{ "save_keys" };
287 $options{ "quals" } = [ split ",", $options{ "quals" } ] if defined $options{ "quals" };
288 $options{ "feats" } = [ split ",", $options{ "feats" } ] if defined $options{ "feats" };
289 $options{ "frames" } = [ split ",", $options{ "frames" } ] if defined $options{ "frames" };
290 $options{ "samples" } = [ split ",", $options{ "samples" } ] if defined $options{ "samples" };
291 $options{ "tables" } = [ split ",", $options{ "tables" } ] if defined $options{ "tables" };
292 $options{ "tracks" } = [ split ",", $options{ "tracks" } ] if defined $options{ "tracks" };
294 # ---- check arguments ----
296 if ( $options{ 'data_in' } )
298 $options{ "files" } = Maasha::Biopieces::getopt_files( $options{ 'data_in' } );
300 Maasha::Common::error( qq(Argument to --data_in must be a valid file or fileglob expression) ) if scalar @{ $options{ "files" } } == 0;
303 map { Maasha::Common::error( qq(Argument to --cols must be a whole numbers - not "$_") ) if $_ !~ /^\d+$/ } @{ $options{ "cols" } } if $options{ "cols" };
305 # print STDERR Dumper( \%options );
307 $real = "beg|end|word_size|wrap|len|prefix_length|mismatches|offset|num|skip|cpus|window_size|step_size";
309 foreach $opt ( keys %options )
311 if ( $opt =~ /stream_in|pattern_in|exact_in/ and not -f $options{ $opt } )
313 Maasha::Common::error( qq(Argument to --$opt must be a valid file or fileglob expression - not "$options{ $opt }") );
315 elsif ( $opt =~ /$real/ and $options{ $opt } !~ /^\d+$/ )
317 Maasha::Common::error( qq(Argument to --$opt must be a whole number - not "$options{ $opt }") );
319 elsif ( $opt =~ /max_hits|max_hits|max_misses|dist|edit_dist|flank|gap|hamming_dist|priority/ and $options{ $opt } !~ /^-?\d+$/ )
321 Maasha::Common::error( qq(Argument to --$opt must be an integer - not "$options{ $opt }") );
323 elsif ( $opt =~ /identity|threshold/ and $options{ $opt } !~ /^-?(?:\d+(?:\.\d*)?|\.\d+)$/ )
325 Maasha::Common::error( qq(Argument to --$opt must be a decimal number - not "$options{ $opt }") );
327 elsif ( $opt =~ /e_val/ and $options{ $opt } !~ /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/ )
329 Maasha::Common::error( qq(Argument to --$opt must be a float - not "$options{ $opt }") );
331 elsif ( $opt =~ /strand/ and $options{ $opt } !~ /^(\+|-)$/ )
333 Maasha::Common::error( qq(Argument to --$opt must be "+" or "-" - not "$options{ $opt }") );
335 elsif ( $opt eq "genome" )
337 @genomes = Maasha::Common::ls_dirs( "$ENV{ 'BP_DATA' }/genomes" );
338 map { $_ =~ s/.*\/(.+)$/$1/ } @genomes;
340 if ( not grep { $_ =~ /^$options{ $opt }$/ } @genomes ) {
341 Maasha::Common::error( qq(Genome $options{ $opt } not found in "$ENV{ 'BP_DATA' }/genomes/") );
344 elsif ( $opt eq "terminal" and not $options{ $opt } =~ /^(svg|post|dumb|x11)/ )
346 Maasha::Common::error( qq(Bad --$opt argument "$options{ $opt }") );
348 elsif ( $opt eq "table" and $options{ $opt } =~ /(-|\.)/ )
350 Maasha::Common::error( qq(Character '$1' is not allowed in table name: $options{ $opt }) );
352 elsif ( $opt eq "merge" and $options{ $opt } !~ /^(AandB|AorB|BorA|AnotB|BnotA)$/ )
354 Maasha::Common::error( qq(Argument to --$opt must be AandB, AorB, BorA, AnotB, or BnotA - not "$options{ $opt }") );
358 Maasha::Common::error( qq(no --database specified) ) if $script eq "remove_ucsc_tracks" and not $options{ "database" };
359 Maasha::Common::error( qq(no --in_file or --genome specified) ) if $script eq "soap_seq" and not $options{ "genome" } and not $options{ "in_file" };
360 Maasha::Common::error( qq(both --in_file and --genome specified) ) if $script eq "soap_seq" and $options{ "genome" } and $options{ "in_file" };
361 Maasha::Common::error( qq(no --genome specified) ) if $script =~ /get_genome_align|get_genome_phastcons/ and not $options{ "genome" };
363 if ( $script eq "upload_to_ucsc" )
365 Maasha::Common::error( qq(no --database specified) ) if not $options{ "database" };
366 Maasha::Common::error( qq(no --table specified) ) if not $options{ "table" };
369 return wantarray ? %options : \%options;
373 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SCRIPTS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
376 sub script_print_usage
378 # Martin A. Hansen, January 2008.
380 # Retrieves usage information from file and
381 # prints this nicely formatted.
383 my ( $in, # handle to in stream
384 $out, # handle to out stream
385 $options, # options hash
390 my ( $file, $wiki, $lines );
392 if ( $options->{ 'data_in' } ) {
393 $file = $options->{ 'data_in' };
395 $file = join "", $ENV{ 'BP_DIR' }, "/bp_usage/", $options->{ 'SCRIPT' }, ".wiki";
398 $wiki = Maasha::Gwiki::gwiki_read( $file );
400 ( $wiki->[ 2 ], $wiki->[ 3 ], $wiki->[ 0 ], $wiki->[ 1 ] ) = ( $wiki->[ 0 ], $wiki->[ 1 ], $wiki->[ 2 ], $wiki->[ 3 ] );
402 if ( not $options->{ "help" } ) {
403 @{ $wiki } = grep { $_->[ 0 ]->{ 'SECTION' } =~ /Biopiece|summary|Usage|Options|Help/ } @{ $wiki };
406 $lines = Maasha::Gwiki::gwiki2ascii( $wiki );
408 print STDERR "$_\n" foreach @{ $lines };
414 sub script_read_stockholm
416 # Martin A. Hansen, August 2007.
418 # Read Stockholm format.
420 my ( $in, # handle to in stream
421 $out, # handle to out stream
422 $options, # options hash
427 my ( $data_in, $file, $num, $entry, $record, $record_anno, $record_align, $key, $seq );
429 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
430 Maasha::Biopieces::put_record( $record, $out );
435 foreach $file ( @{ $options->{ "files" } } )
437 $data_in = Maasha::Common::read_open( $file );
439 while ( $entry = Maasha::Stockholm::get_stockholm_entry( $data_in ) )
441 $record = Maasha::Stockholm::parse_stockholm_entry( $entry );
445 foreach $key ( keys %{ $record->{ "GF" } } ) {
446 $record_anno->{ $key } = $record->{ "GF" }->{ $key };
449 $record_anno->{ "ALIGN" } = $num;
451 Maasha::Biopieces::put_record( $record_anno, $out );
453 foreach $seq ( @{ $record->{ "ALIGN" } } )
458 SEQ_NAME => $seq->[ 0 ],
462 Maasha::Biopieces::put_record( $record_align, $out );
465 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
475 close $data_in if $data_in;
479 sub script_read_phastcons
481 # Martin A. Hansen, December 2007.
483 # Read PhastCons format.
485 my ( $in, # handle to in stream
486 $out, # handle to out stream
487 $options, # options hash
492 my ( $data_in, $file, $num, $entry, @records, $record );
494 $options->{ "min" } ||= 10;
495 $options->{ "dist" } ||= 25;
496 $options->{ "threshold" } ||= 0.8;
497 $options->{ "gap" } ||= 5;
499 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
500 Maasha::Biopieces::put_record( $record, $out );
505 foreach $file ( @{ $options->{ "files" } } )
507 $data_in = Maasha::Common::read_open( $file );
509 while ( $entry = Maasha::UCSC::fixedstep_get_entry( $data_in ) )
511 @records = Maasha::UCSC::phastcons_parse_entry( $entry, $options );
513 foreach $record ( @records )
515 $record->{ "REC_TYPE" } = "BED";
516 $record->{ "BED_LEN" } = $record->{ "CHR_END" } - $record->{ "CHR_BEG" } + 1;
518 Maasha::Biopieces::put_record( $record, $out );
520 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
531 close $data_in if $data_in;
535 sub script_read_solid
537 # Martin A. Hansen, April 2008.
539 # Read Solid sequence from file.
541 my ( $in, # handle to in stream
542 $out, # handle to out stream
543 $options, # options hash
548 my ( $record, $file, $data_in, $line, $num, $seq_name, $seq_cs, $seq_qual, @scores, @seqs, $i );
550 $options->{ "quality" } ||= 15;
552 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
553 Maasha::Biopieces::put_record( $record, $out );
558 foreach $file ( @{ $options->{ "files" } } )
560 $data_in = Maasha::Common::read_open( $file );
562 while ( $line = <$data_in> )
566 ( $seq_name, $seq_cs, $seq_qual ) = split /\t/, $line;
568 @scores = split /,/, $seq_qual;
569 @seqs = split //, Maasha::Solid::color_space2seq( $seq_cs );
571 for ( $i = 0; $i < @seqs; $i++ ) {
572 $seqs[ $i ] = lc $seqs[ $i ] if $scores[ $i ] < $options->{ "quality" };
577 SEQ_NAME => $seq_name,
579 SEQ_QUAL => join( ";", @scores ),
580 SEQ_LEN => length $seq_cs,
581 SEQ => join( "", @seqs ),
582 SCORE_MEAN => sprintf( "%.2f", Maasha::Calc::mean( \@scores ) ),
585 Maasha::Biopieces::put_record( $record, $out );
587 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
597 close $data_in if $data_in;
601 sub script_read_mysql
603 # Martin A. Hansen, May 2008.
605 # Read a MySQL query into stream.
607 my ( $in, # handle to in stream
608 $out, # handle to out stream
609 $options, # options hash
614 my ( $record, $dbh, $results );
616 $options->{ "user" } ||= Maasha::UCSC::ucsc_get_user();
617 $options->{ "password" } ||= Maasha::UCSC::ucsc_get_password();
619 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
620 Maasha::Biopieces::put_record( $record, $out );
623 $dbh = Maasha::SQL::connect( $options->{ "database" }, $options->{ "user" }, $options->{ "password" } );
625 $results = Maasha::SQL::query_hashref_list( $dbh, $options->{ "query" } );
627 Maasha::SQL::disconnect( $dbh );
629 map { Maasha::Biopieces::put_record( $_ ) } @{ $results };
633 sub script_complexity_seq
635 # Martin A. Hansen, May 2008.
637 # Generates an index calculated as the most common di-residue over
638 # the sequence length for all sequences in stream.
640 my ( $in, # handle to in stream
641 $out, # handle to out stream
646 my ( $record, $index );
648 while ( $record = Maasha::Biopieces::get_record( $in ) )
650 $record->{ "SEQ_COMPLEXITY" } = sprintf( "%.2f", Maasha::Seq::seq_complexity( $record->{ "SEQ" } ) ) if $record->{ "SEQ" };
652 Maasha::Biopieces::put_record( $record, $out );
657 sub script_get_genome_align
659 # Martin A. Hansen, April 2008.
661 # Gets a subalignment from a multiple genome alignment.
663 my ( $in, # handle to in stream
664 $out, # handle to out stream
665 $options, # options hash
670 my ( $record, $maf_track, $align, $align_num, $beg, $end, $len, $entry );
672 $options->{ "strand" } ||= "+";
676 $maf_track = Maasha::Config::maf_track( $options->{ "genome" } );
678 if ( $options->{ "chr" } and $options->{ "beg" } and ( $options->{ "end" } or $options->{ "len" } ) )
680 $beg = $options->{ "beg" } - 1;
682 if ( $options->{ "end" } ) {
683 $end = $options->{ "end" };
684 } elsif ( $options->{ "len" } ) {
685 $end = $beg + $options->{ "len" };
688 $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $options->{ "chr" }, $beg, $end, $options->{ "strand" } );
690 foreach $entry ( @{ $align } )
692 $entry->{ "CHR" } = $record->{ "CHR" };
693 $entry->{ "CHR_BEG" } = $record->{ "CHR_BEG" };
694 $entry->{ "CHR_END" } = $record->{ "CHR_END" };
695 $entry->{ "STRAND" } = $record->{ "STRAND" } || '+';
696 $entry->{ "Q_ID" } = $record->{ "Q_ID" };
697 $entry->{ "SCORE" } = $record->{ "SCORE" };
699 Maasha::Biopieces::put_record( $entry, $out );
703 while ( $record = Maasha::Biopieces::get_record( $in ) )
705 if ( $record->{ "REC_TYPE" } eq "BED" )
707 $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "CHR" }, $record->{ "CHR_BEG" }, $record->{ "CHR_END" }, $record->{ "STRAND" } );
709 elsif ( $record->{ "REC_TYPE" } eq "VMATCH" )
711 $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" } + 1, $record->{ "STRAND" } );
713 elsif ( $record->{ "REC_TYPE" } eq "PSL" )
715 $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $record->{ "STRAND" } );
717 elsif ( $record->{ "REC_TYPE" } eq "BLAST" )
719 $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $record->{ "STRAND" } );
722 foreach $entry ( @{ $align } )
724 $entry->{ "CHR" } = $record->{ "CHR" };
725 $entry->{ "CHR_BEG" } = $record->{ "CHR_BEG" };
726 $entry->{ "CHR_END" } = $record->{ "CHR_END" };
727 $entry->{ "STRAND" } = $record->{ "STRAND" };
728 $entry->{ "Q_ID" } = $record->{ "Q_ID" };
729 $entry->{ "SCORE" } = $record->{ "SCORE" };
731 Maasha::Biopieces::put_record( $entry, $out );
739 sub script_get_genome_phastcons
741 # Martin A. Hansen, February 2008.
743 # Get phastcons scores from genome intervals.
745 my ( $in, # handle to in stream
746 $out, # handle to out stream
747 $options, # options hash
752 my ( $phastcons_file, $phastcons_index, $index, $fh_phastcons, $scores, $record );
754 $options->{ "flank" } ||= 0;
756 $phastcons_file = Maasha::Config::genome_phastcons( $options->{ "genome" } );
757 $phastcons_index = Maasha::Config::genome_phastcons_index( $options->{ "genome" } );
759 $index = Maasha::UCSC::fixedstep_index_retrieve( $phastcons_index );
760 $fh_phastcons = Maasha::Common::read_open( $phastcons_file );
762 if ( defined $options->{ "chr" } and defined $options->{ "beg" } and ( defined $options->{ "end" } or defined $options->{ "len" } ) )
764 $options->{ "beg" } -= 1; # request is 1-based
765 $options->{ "end" } -= 1; # request is 1-based
767 if ( $options->{ "len" } ) {
768 $options->{ "end" } = $options->{ "beg" } + $options->{ "len" } - 1;
771 $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $options->{ "chr" }, $options->{ "beg" }, $options->{ "end" }, $options->{ "flank" } );
773 $record->{ "CHR" } = $options->{ "chr" };
774 $record->{ "CHR_BEG" } = $options->{ "beg" } - $options->{ "flank" };
775 $record->{ "CHR_END" } = $options->{ "end" } + $options->{ "flank" };
777 $record->{ "PHASTCONS" } = join ",", @{ $scores };
778 $record->{ "PHAST_COUNT" } = scalar @{ $scores }; # DEBUG
780 Maasha::Biopieces::put_record( $record, $out );
783 while ( $record = Maasha::Biopieces::get_record( $in ) )
785 if ( $record->{ "REC_TYPE" } eq "BED" )
787 $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "CHR" }, $record->{ "CHR_BEG" }, $record->{ "CHR_END" }, $options->{ "flank" } );
789 elsif ( $record->{ "REC_TYPE" } eq "PSL" )
791 $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $options->{ "flank" } );
793 elsif ( $record->{ "REC_TYPE" } eq "BLAST" )
795 $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $options->{ "flank" } );
798 $record->{ "PHASTCONS" } = join ",", @{ $scores } if @{ $scores };
799 # $record->{ "PHAST_COUNT" } = @{ $scores } if @{ $scores }; # DEBUG
801 Maasha::Biopieces::put_record( $record, $out );
804 close $fh_phastcons if $fh_phastcons;
810 # Martin A. Hansen, July 2008.
812 # soap sequences in stream against a given file or genome.
814 my ( $in, # handle to in stream
815 $out, # handle to out stream
816 $options, # options hash
821 my ( $genome, $tmp_in, $tmp_out, $fh_in, $fh_out, $record, $line, @fields, $entry, $count, $args );
823 $options->{ "seed_size" } ||= 10;
824 $options->{ "mismatches" } ||= 2;
825 $options->{ "gap_size" } ||= 0;
826 $options->{ "cpus" } ||= 1;
828 if ( $options->{ "genome" } ) {
829 $options->{ "in_file" } = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/fasta/$options->{ 'genome' }.fna";
832 $tmp_in = "$BP_TMP/soap_query.seq";
833 $tmp_out = "$BP_TMP/soap.result";
835 $fh_out = Maasha::Common::write_open( $tmp_in );
839 while ( $record = Maasha::Biopieces::get_record( $in ) )
841 if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) )
843 Maasha::Fasta::put_entry( $entry, $fh_out );
848 Maasha::Biopieces::put_record( $record, $out );
856 "-s $options->{ 'seed_size' }",
859 "-v $options->{ 'mismatches' }",
860 "-g $options->{ 'gap_size' }",
861 "-p $options->{ 'cpus' }",
862 "-d $options->{ 'in_file' }",
866 $args .= " > /dev/null 2>&1" if not $options->{ 'verbose' };
868 Maasha::Common::run( "soap", $args, 1 );
872 $fh_out = Maasha::Common::read_open( $tmp_out );
876 while ( $line = <$fh_out> )
880 @fields = split /\t/, $line;
882 $record->{ "REC_TYPE" } = "SOAP";
883 $record->{ "Q_ID" } = $fields[ 0 ];
884 $record->{ "SCORE" } = $fields[ 3 ];
885 $record->{ "STRAND" } = $fields[ 6 ];
886 $record->{ "S_ID" } = $fields[ 7 ];
887 $record->{ "S_BEG" } = $fields[ 8 ] - 1; # soap is 1-based
888 $record->{ "S_END" } = $fields[ 8 ] + $fields[ 5 ] - 2;
890 Maasha::Biopieces::put_record( $record, $out );
900 sub script_remove_mysql_tables
902 # Martin A. Hansen, November 2008.
904 # Remove MySQL tables from values in stream.
906 my ( $in, # handle to in stream
907 $out, # handle to out stream
908 $options, # options hash
913 my ( $record, %table_hash, $dbh, $table );
915 $options->{ "user" } ||= Maasha::UCSC::ucsc_get_user();
916 $options->{ "password" } ||= Maasha::UCSC::ucsc_get_password();
918 map { $table_hash{ $_ } = 1 } @{ $options->{ 'tables' } };
920 while ( $record = Maasha::Biopieces::get_record( $in ) )
922 map { $table_hash{ $record->{ $_ } } = 1 } @{ $options->{ 'keys' } };
924 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ 'no_stream' };
927 $dbh = Maasha::SQL::connect( $options->{ "database" }, $options->{ "user" }, $options->{ "password" } );
929 foreach $table ( sort keys %table_hash )
931 if ( Maasha::SQL::table_exists( $dbh, $table ) )
933 print STDERR qq(Removing table "$table" from database "$options->{ 'database' }" ... ) if $options->{ 'verbose' };
934 Maasha::SQL::delete_table( $dbh, $table );
935 print STDERR "done.\n" if $options->{ 'verbose' };
939 print STDERR qq(WARNING: table "$table" not found in database "$options->{ 'database' }\n");
943 Maasha::SQL::disconnect( $dbh );
947 sub script_remove_ucsc_tracks
949 # Martin A. Hansen, November 2008.
951 # Remove track from MySQL tables and config file.
953 my ( $in, # handle to in stream
954 $out, # handle to out stream
955 $options, # options hash
960 my ( $record, %track_hash, $fh_in, $fh_out, $track, @tracks, @new_tracks, $dbh );
962 $options->{ 'user' } ||= Maasha::UCSC::ucsc_get_user();
963 $options->{ 'password' } ||= Maasha::UCSC::ucsc_get_password();
964 $options->{ 'config_file' } ||= "$ENV{ 'HOME' }/ucsc/my_tracks.ra";
966 map { $track_hash{ $_ } = 1 } @{ $options->{ 'tracks' } };
968 while ( $record = Maasha::Biopieces::get_record( $in ) )
970 map { $track_hash{ $record->{ $_ } } = 1 } @{ $options->{ 'keys' } };
972 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ 'no_stream' };
975 $fh_in = Maasha::Filesys::file_read_open( $options->{ 'config_file' } );
977 while ( $track = Maasha::UCSC::ucsc_config_entry_get( $fh_in ) ) {
978 push @tracks, $track;
983 foreach $track ( @tracks )
985 if ( $track->{ 'database' } eq $options->{ 'database' } and exists $track_hash{ $track->{ 'track' } } ) {
986 print STDERR qq(Removing track "$track->{ 'track' }" from config file.\n) if $options->{ 'verbose' };
988 push @new_tracks, $track;
992 rename "$options->{ 'config_file' }", "$options->{ 'config_file' }~";
994 $fh_out = Maasha::Filesys::file_write_open( $options->{ 'config_file' } );
996 map { Maasha::UCSC::ucsc_config_entry_put( $_, $fh_out ) } @new_tracks;
1000 # ---- locate track in database ----
1002 $dbh = Maasha::SQL::connect( $options->{ "database" }, $options->{ "user" }, $options->{ "password" } );
1004 foreach $track ( sort keys %track_hash )
1006 if ( Maasha::SQL::table_exists( $dbh, $track ) )
1008 print STDERR qq(Removing table "$track" from database "$options->{ 'database' }" ... ) if $options->{ 'verbose' };
1009 Maasha::SQL::delete_table( $dbh, $track );
1010 print STDERR "done.\n" if $options->{ 'verbose' };
1014 print STDERR qq(WARNING: table "$track" not found in database "$options->{ 'database' }\n");
1018 Maasha::SQL::disconnect( $dbh );
1020 Maasha::Common::run( "ucscMakeTracks.pl", "-b > /dev/null 2>&1" );
1024 sub script_upload_to_ucsc
1026 # Martin A. Hansen, August 2007.
1028 # Calculate the mean of values of given keys.
1030 # This routine has developed into the most ugly hack. Do something!
1032 my ( $in, # handle to in stream
1033 $out, # handle to out stream
1034 $options, # options hash
1039 my ( $record, $file, $wib_file, $wig_file, $wib_dir, $fh_out, $i, $first, $format, $type, $columns, $append, $entry );
1041 $options->{ "short_label" } ||= $options->{ 'table' };
1042 $options->{ "long_label" } ||= $options->{ 'table' };
1043 $options->{ "group" } ||= $ENV{ "LOGNAME" };
1044 $options->{ "priority" } ||= 1;
1045 $options->{ "visibility" } ||= "pack";
1046 $options->{ "color" } ||= join( ",", int( rand( 255 ) ), int( rand( 255 ) ), int( rand( 255 ) ) );
1047 $options->{ "chunk_size" } ||= 10_000_000_000; # Due to 32-bit UCSC compilation really large tables cannot be loaded in one go.
1049 $file = "$BP_TMP/ucsc_upload.tmp";
1054 $fh_out = Maasha::Common::write_open( $file );
1056 while ( $record = Maasha::Biopieces::get_record( $in ) )
1058 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
1060 if ( $record->{ "REC_TYPE" } eq "fixed_step" )
1064 if ( $entry = Maasha::UCSC::Wiggle::biopiece2fixedstep( $record ) ) {
1065 Maasha::UCSC::Wiggle::fixedstep_entry_put( $entry, $fh_out );
1068 elsif ( $record->{ "REC_TYPE" } eq "PSL" )
1072 Maasha::UCSC::psl_put_header( $fh_out ) if $first;
1073 Maasha::UCSC::psl_put_entry( $record, $fh_out );
1077 elsif ( $record->{ "REC_TYPE" } eq "BED" and $record->{ "SEC_STRUCT" } )
1079 # chrom chromStart chromEnd name score strand size secStr conf
1083 print $fh_out join ( "\t",
1085 $record->{ "CHR_BEG" },
1086 $record->{ "CHR_END" } + 1,
1087 $record->{ "Q_ID" },
1088 $record->{ "SCORE" },
1089 $record->{ "STRAND" },
1090 $record->{ "SIZE" },
1091 $record->{ "SEC_STRUCT" },
1092 $record->{ "CONF" },
1095 elsif ( $record->{ "REC_TYPE" } eq "BED" )
1098 $columns = $record->{ "BED_COLS" };
1100 if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) {
1101 Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns, $options->{ 'check' } );
1104 elsif ( $record->{ "REC_TYPE" } eq "PATSCAN" and $record->{ "CHR" } )
1109 if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) {
1110 Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns, $options->{ 'check' } );
1113 elsif ( $record->{ "REC_TYPE" } eq "BLAST" and $record->{ "S_ID" } =~ /^chr/ )
1118 $record->{ "SCORE" } = $record->{ "BIT_SCORE" } * 1000;
1120 if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) {
1121 Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns, $options->{ 'check' } );
1124 elsif ( $record->{ "REC_TYPE" } eq "VMATCH" and $record->{ "S_ID" } =~ /^chr/i )
1129 if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) {
1130 Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns, $options->{ 'check' } );
1134 if ( $i == $options->{ "chunk_size" } )
1138 if ( $format eq "BED" ) {
1139 Maasha::UCSC::bed_upload_to_ucsc( $BP_TMP, $file, $options, $append );
1140 } elsif ( $format eq "PSL" ) {
1141 Maasha::UCSC::psl_upload_to_ucsc( $file, $options, $append );
1150 $fh_out = Maasha::Common::write_open( $file );
1158 if ( exists $options->{ "database" } and $options->{ "table" } )
1160 if ( $format eq "BED" )
1162 $type = "bed $columns";
1164 Maasha::UCSC::bed_upload_to_ucsc( $BP_TMP, $file, $options, $append );
1166 elsif ( $format eq "BED_SS" )
1168 $type = "type bed 6 +";
1170 Maasha::UCSC::bed_upload_to_ucsc( $BP_TMP, $file, $options, $append );
1172 elsif ( $format eq "PSL" )
1176 Maasha::UCSC::psl_upload_to_ucsc( $file, $options, $append );
1178 elsif ( $format eq "WIGGLE" )
1180 $options->{ "visibility" } = "full";
1182 $wig_file = "$options->{ 'table' }.wig";
1183 $wib_file = "$options->{ 'table' }.wib";
1185 $wib_dir = "$ENV{ 'HOME' }/ucsc/wib";
1187 Maasha::Common::dir_create_if_not_exists( $wib_dir );
1189 if ( $options->{ 'verbose' } ) {
1190 `cd $BP_TMP && wigEncode $file $wig_file $wib_file`;
1192 `cd $BP_TMP && wigEncode $file $wig_file $wib_file > /dev/null 2>&1`;
1195 Maasha::Common::run( "mv", "$BP_TMP/$wib_file $wib_dir" );
1203 Maasha::UCSC::wiggle_upload_to_ucsc( $BP_TMP, $wib_dir, $file, $options );
1208 Maasha::UCSC::ucsc_update_config( $options, $type );
1213 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<