1 package Maasha::BioRun;
4 # Copyright (C) 2007-2009 Martin A. Hansen.
6 # This program is free software; you can redistribute it and/or
7 # modify it under the terms of the GNU General Public License
8 # as published by the Free Software Foundation; either version 2
9 # of the License, or (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 # http://www.gnu.org/copyleft/gpl.html
23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
26 # Routines that contains Biopieces which are run.
29 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
34 use Getopt::Long qw( :config bundling );
35 use Time::HiRes qw( gettimeofday );
36 use Storable qw( dclone );
37 use Maasha::Biopieces;
43 use Maasha::Stockholm;
47 use Maasha::UCSC::BED;
48 use Maasha::UCSC::Wiggle;
57 use vars qw( @ISA @EXPORT_OK );
61 @ISA = qw( Exporter );
69 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> GLOBALS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
72 my ( $script, $BP_TMP );
74 $script = Maasha::Common::get_scriptname();
75 $BP_TMP = Maasha::Common::get_tmpdir();
78 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> RUN SCRIPT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
81 run_script( $script );
84 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SUBROUTINES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
89 # Martin A. Hansen, August 2007.
91 # Run a specific script.
93 my ( $script, # script name
98 my ( $t0, $t1, $options, $in, $out );
100 Maasha::Biopieces::log_biopiece();
102 $t0 = gettimeofday();
104 $options = get_options( $script );
106 $options->{ "SCRIPT" } = $script;
108 $script = "print_usage" if ( -t STDIN and keys %{ $options } <= 1 or $options->{ 'help' } );
110 $in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } );
111 $out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } );
113 if ( $script eq "print_usage" ) { script_print_usage( $in, $out, $options ) }
114 elsif ( $script eq "read_psl" ) { script_read_psl( $in, $out, $options ) }
115 elsif ( $script eq "read_stockholm" ) { script_read_stockholm( $in, $out, $options ) }
116 elsif ( $script eq "read_phastcons" ) { script_read_phastcons( $in, $out, $options ) }
117 elsif ( $script eq "read_soft" ) { script_read_soft( $in, $out, $options ) }
118 elsif ( $script eq "read_solexa" ) { script_read_solexa( $in, $out, $options ) }
119 elsif ( $script eq "read_solid" ) { script_read_solid( $in, $out, $options ) }
120 elsif ( $script eq "read_mysql" ) { script_read_mysql( $in, $out, $options ) }
121 elsif ( $script eq "uppercase_seq" ) { script_uppercase_seq( $in, $out, $options ) }
122 elsif ( $script eq "complexity_seq" ) { script_complexity_seq( $in, $out, $options ) }
123 elsif ( $script eq "get_genome_align" ) { script_get_genome_align( $in, $out, $options ) }
124 elsif ( $script eq "get_genome_phastcons" ) { script_get_genome_phastcons( $in, $out, $options ) }
125 elsif ( $script eq "soap_seq" ) { script_soap_seq( $in, $out, $options ) }
126 elsif ( $script eq "remove_mysql_tables" ) { script_remove_mysql_tables( $in, $out, $options ) }
127 elsif ( $script eq "remove_ucsc_tracks" ) { script_remove_ucsc_tracks( $in, $out, $options ) }
128 elsif ( $script eq "upload_to_ucsc" ) { script_upload_to_ucsc( $in, $out, $options ) }
130 close $in if defined $in;
133 $t1 = gettimeofday();
135 print STDERR "Program: $script" . ( " " x ( 25 - length( $script ) ) ) . sprintf( "Run time: %.4f\n", ( $t1 - $t0 ) ) if $options->{ 'verbose' };
141 # Martin A. Hansen, February 2008.
143 # Gets options from commandline and checks these vigerously.
145 my ( $script, # name of script
150 my ( %options, @options, $opt, @genomes, $real );
152 if ( $script eq "print_usage" )
158 elsif ( $script eq "read_psl" )
165 elsif ( $script eq "read_stockholm" )
172 elsif ( $script eq "read_phastcons" )
183 elsif ( $script eq "read_soft" )
191 elsif ( $script eq "read_solexa" )
200 elsif ( $script eq "read_solid" )
208 elsif ( $script eq "read_mysql" )
217 elsif ( $script eq "get_genome_align" )
228 elsif ( $script eq "get_genome_phastcons" )
239 elsif ( $script eq "soap_seq" )
250 elsif ( $script eq "remove_mysql_tables" )
261 elsif ( $script eq "remove_ucsc_tracks" )
273 elsif ( $script eq "upload_to_ucsc" )
297 # print STDERR Dumper( \@options );
304 # print STDERR Dumper( \%options );
306 if ( -t STDIN && scalar( keys %options ) == 0 or $options{ "help" } ) {
307 return wantarray ? %options : \%options;
310 $options{ "cols" } = [ split ",", $options{ "cols" } ] if defined $options{ "cols" };
311 $options{ "keys" } = [ split ",", $options{ "keys" } ] if defined $options{ "keys" };
312 $options{ "no_keys" } = [ split ",", $options{ "no_keys" } ] if defined $options{ "no_keys" };
313 $options{ "save_keys" } = [ split ",", $options{ "save_keys" } ] if defined $options{ "save_keys" };
314 $options{ "quals" } = [ split ",", $options{ "quals" } ] if defined $options{ "quals" };
315 $options{ "feats" } = [ split ",", $options{ "feats" } ] if defined $options{ "feats" };
316 $options{ "frames" } = [ split ",", $options{ "frames" } ] if defined $options{ "frames" };
317 $options{ "samples" } = [ split ",", $options{ "samples" } ] if defined $options{ "samples" };
318 $options{ "tables" } = [ split ",", $options{ "tables" } ] if defined $options{ "tables" };
319 $options{ "tracks" } = [ split ",", $options{ "tracks" } ] if defined $options{ "tracks" };
321 # ---- check arguments ----
323 if ( $options{ 'data_in' } )
325 $options{ "files" } = Maasha::Biopieces::getopt_files( $options{ 'data_in' } );
327 Maasha::Common::error( qq(Argument to --data_in must be a valid file or fileglob expression) ) if scalar @{ $options{ "files" } } == 0;
330 map { Maasha::Common::error( qq(Argument to --cols must be a whole numbers - not "$_") ) if $_ !~ /^\d+$/ } @{ $options{ "cols" } } if $options{ "cols" };
332 # print STDERR Dumper( \%options );
334 $real = "beg|end|word_size|wrap|len|prefix_length|mismatches|offset|num|skip|cpus|window_size|step_size";
336 foreach $opt ( keys %options )
338 if ( $opt =~ /stream_in|pattern_in|exact_in/ and not -f $options{ $opt } )
340 Maasha::Common::error( qq(Argument to --$opt must be a valid file or fileglob expression - not "$options{ $opt }") );
342 elsif ( $opt =~ /$real/ and $options{ $opt } !~ /^\d+$/ )
344 Maasha::Common::error( qq(Argument to --$opt must be a whole number - not "$options{ $opt }") );
346 elsif ( $opt =~ /max_hits|max_hits|max_misses|dist|edit_dist|flank|gap|hamming_dist|priority/ and $options{ $opt } !~ /^-?\d+$/ )
348 Maasha::Common::error( qq(Argument to --$opt must be an integer - not "$options{ $opt }") );
350 elsif ( $opt =~ /identity|threshold/ and $options{ $opt } !~ /^-?(?:\d+(?:\.\d*)?|\.\d+)$/ )
352 Maasha::Common::error( qq(Argument to --$opt must be a decimal number - not "$options{ $opt }") );
354 elsif ( $opt =~ /e_val/ and $options{ $opt } !~ /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/ )
356 Maasha::Common::error( qq(Argument to --$opt must be a float - not "$options{ $opt }") );
358 elsif ( $opt =~ /strand/ and $options{ $opt } !~ /^(\+|-)$/ )
360 Maasha::Common::error( qq(Argument to --$opt must be "+" or "-" - not "$options{ $opt }") );
362 elsif ( $opt eq "genome" )
364 @genomes = Maasha::Common::ls_dirs( "$ENV{ 'BP_DATA' }/genomes" );
365 map { $_ =~ s/.*\/(.+)$/$1/ } @genomes;
367 if ( not grep { $_ =~ /^$options{ $opt }$/ } @genomes ) {
368 Maasha::Common::error( qq(Genome $options{ $opt } not found in "$ENV{ 'BP_DATA' }/genomes/") );
371 elsif ( $opt eq "terminal" and not $options{ $opt } =~ /^(svg|post|dumb|x11)/ )
373 Maasha::Common::error( qq(Bad --$opt argument "$options{ $opt }") );
375 elsif ( $opt eq "table" and $options{ $opt } =~ /(-|\.)/ )
377 Maasha::Common::error( qq(Character '$1' is not allowed in table name: $options{ $opt }) );
379 elsif ( $opt eq "merge" and $options{ $opt } !~ /^(AandB|AorB|BorA|AnotB|BnotA)$/ )
381 Maasha::Common::error( qq(Argument to --$opt must be AandB, AorB, BorA, AnotB, or BnotA - not "$options{ $opt }") );
383 elsif ( $opt eq "format" and $script eq "read_solexa" and $options{ $opt } !~ /octal|decimal/ )
385 Maasha::Common::error( qq(Argument to --$opt must be octal or decimal - not "$options{ $opt }") );
389 Maasha::Common::error( qq(no --database specified) ) if $script eq "remove_ucsc_tracks" and not $options{ "database" };
390 Maasha::Common::error( qq(no --in_file or --genome specified) ) if $script eq "soap_seq" and not $options{ "genome" } and not $options{ "in_file" };
391 Maasha::Common::error( qq(both --in_file and --genome specified) ) if $script eq "soap_seq" and $options{ "genome" } and $options{ "in_file" };
392 Maasha::Common::error( qq(no --genome specified) ) if $script =~ /get_genome_align|get_genome_phastcons/ and not $options{ "genome" };
394 if ( $script eq "upload_to_ucsc" )
396 Maasha::Common::error( qq(no --database specified) ) if not $options{ "database" };
397 Maasha::Common::error( qq(no --table specified) ) if not $options{ "table" };
400 return wantarray ? %options : \%options;
404 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SCRIPTS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
407 sub script_print_usage
409 # Martin A. Hansen, January 2008.
411 # Retrieves usage information from file and
412 # prints this nicely formatted.
414 my ( $in, # handle to in stream
415 $out, # handle to out stream
416 $options, # options hash
421 my ( $file, $wiki, $lines );
423 if ( $options->{ 'data_in' } ) {
424 $file = $options->{ 'data_in' };
426 $file = join "", $ENV{ 'BP_DIR' }, "/bp_usage/", $options->{ 'SCRIPT' }, ".wiki";
429 $wiki = Maasha::Gwiki::gwiki_read( $file );
431 ( $wiki->[ 2 ], $wiki->[ 3 ], $wiki->[ 0 ], $wiki->[ 1 ] ) = ( $wiki->[ 0 ], $wiki->[ 1 ], $wiki->[ 2 ], $wiki->[ 3 ] );
433 if ( not $options->{ "help" } ) {
434 @{ $wiki } = grep { $_->[ 0 ]->{ 'SECTION' } =~ /Biopiece|summary|Usage|Options|Help/ } @{ $wiki };
437 $lines = Maasha::Gwiki::gwiki2ascii( $wiki );
439 print STDERR "$_\n" foreach @{ $lines };
447 # Martin A. Hansen, August 2007.
449 # Read psl table from stream or file.
451 my ( $in, # handle to in stream
452 $out, # handle to out stream
453 $options, # options hash
458 my ( $record, $file, $data_in, $num );
460 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
461 Maasha::Biopieces::put_record( $record, $out );
466 foreach $file ( @{ $options->{ "files" } } )
468 $data_in = Maasha::Common::read_open( $file );
470 while ( $record = Maasha::UCSC::psl_get_entry( $data_in ) )
472 Maasha::Biopieces::put_record( $record, $out );
474 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
484 sub script_read_stockholm
486 # Martin A. Hansen, August 2007.
488 # Read Stockholm format.
490 my ( $in, # handle to in stream
491 $out, # handle to out stream
492 $options, # options hash
497 my ( $data_in, $file, $num, $entry, $record, $record_anno, $record_align, $key, $seq );
499 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
500 Maasha::Biopieces::put_record( $record, $out );
505 foreach $file ( @{ $options->{ "files" } } )
507 $data_in = Maasha::Common::read_open( $file );
509 while ( $entry = Maasha::Stockholm::get_stockholm_entry( $data_in ) )
511 $record = Maasha::Stockholm::parse_stockholm_entry( $entry );
515 foreach $key ( keys %{ $record->{ "GF" } } ) {
516 $record_anno->{ $key } = $record->{ "GF" }->{ $key };
519 $record_anno->{ "ALIGN" } = $num;
521 Maasha::Biopieces::put_record( $record_anno, $out );
523 foreach $seq ( @{ $record->{ "ALIGN" } } )
528 SEQ_NAME => $seq->[ 0 ],
532 Maasha::Biopieces::put_record( $record_align, $out );
535 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
545 close $data_in if $data_in;
549 sub script_read_phastcons
551 # Martin A. Hansen, December 2007.
553 # Read PhastCons format.
555 my ( $in, # handle to in stream
556 $out, # handle to out stream
557 $options, # options hash
562 my ( $data_in, $file, $num, $entry, @records, $record );
564 $options->{ "min" } ||= 10;
565 $options->{ "dist" } ||= 25;
566 $options->{ "threshold" } ||= 0.8;
567 $options->{ "gap" } ||= 5;
569 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
570 Maasha::Biopieces::put_record( $record, $out );
575 foreach $file ( @{ $options->{ "files" } } )
577 $data_in = Maasha::Common::read_open( $file );
579 while ( $entry = Maasha::UCSC::fixedstep_get_entry( $data_in ) )
581 @records = Maasha::UCSC::phastcons_parse_entry( $entry, $options );
583 foreach $record ( @records )
585 $record->{ "REC_TYPE" } = "BED";
586 $record->{ "BED_LEN" } = $record->{ "CHR_END" } - $record->{ "CHR_BEG" } + 1;
588 Maasha::Biopieces::put_record( $record, $out );
590 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
601 close $data_in if $data_in;
607 # Martin A. Hansen, December 2007.
610 # http://www.ncbi.nlm.nih.gov/geo/info/soft2.html
612 my ( $in, # handle to in stream
613 $out, # handle to out stream
614 $options, # options hash
619 my ( $data_in, $file, $num, $records, $record, $soft_index, $fh, @platforms, $plat_table, @samples, $sample, $old_end, $skip );
621 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
622 Maasha::Biopieces::put_record( $record, $out );
627 foreach $file ( @{ $options->{ "files" } } )
629 print STDERR "Creating index for file: $file\n" if $options->{ "verbose" };
631 $soft_index = Maasha::NCBI::soft_index_file( $file );
633 $fh = Maasha::Common::read_open( $file );
635 @platforms = grep { $_->{ "SECTION" } =~ /PLATFORM/ } @{ $soft_index };
637 print STDERR "Getting platform tables for file: $file\n" if $options->{ "verbose" };
639 $plat_table = Maasha::NCBI::soft_get_platform( $fh, $platforms[ 0 ]->{ "LINE_BEG" }, $platforms[ -1 ]->{ "LINE_END" } );
641 @samples = grep { $_->{ "SECTION" } =~ /SAMPLE/ } @{ $soft_index };
643 $old_end = $platforms[ -1 ]->{ "LINE_END" };
645 foreach $sample ( @samples )
648 $skip = 1 if ( $options->{ "samples" } and grep { $sample->{ "SECTION" } !~ /$_/ } @{ $options->{ "samples" } } );
650 print STDERR "Getting samples for dataset: $sample->{ 'SECTION' }\n" if $options->{ "verbose" } and not $skip;
652 $records = Maasha::NCBI::soft_get_sample( $fh, $plat_table, $sample->{ "LINE_BEG" } - $old_end - 1, $sample->{ "LINE_END" } - $old_end - 1, $skip );
654 foreach $record ( @{ $records } )
656 Maasha::Biopieces::put_record( $record, $out );
658 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
663 $old_end = $sample->{ "LINE_END" };
671 close $data_in if $data_in;
676 sub script_read_solexa
678 # Martin A. Hansen, March 2008.
680 # Read Solexa sequence reads from file.
682 my ( $in, # handle to in stream
683 $out, # handle to out stream
684 $options, # options hash
689 my ( $record, $file, $data_in, $entry, $num, @seqs, @scores, $i );
691 $options->{ "format" } ||= "octal";
692 $options->{ "quality" } ||= 20;
694 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
695 Maasha::Biopieces::put_record( $record, $out );
700 foreach $file ( @{ $options->{ "files" } } )
702 $data_in = Maasha::Common::read_open( $file );
704 if ( $options->{ "format" } eq "octal" )
706 while ( $entry = Maasha::Solexa::solexa_get_entry_octal( $data_in ) )
708 $record = Maasha::Solexa::solexa2biopiece( $entry, $options->{ "quality" } );
710 Maasha::Biopieces::put_record( $record, $out );
712 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
719 while ( $entry = Maasha::Solexa::solexa_get_entry_decimal( $data_in ) )
721 $record = Maasha::Solexa::solexa2biopiece( $entry, $options->{ "quality" } );
723 Maasha::Biopieces::put_record( $record, $out );
725 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
736 close $data_in if $data_in;
740 sub script_read_solid
742 # Martin A. Hansen, April 2008.
744 # Read Solid sequence from file.
746 my ( $in, # handle to in stream
747 $out, # handle to out stream
748 $options, # options hash
753 my ( $record, $file, $data_in, $line, $num, $seq_name, $seq_cs, $seq_qual, @scores, @seqs, $i );
755 $options->{ "quality" } ||= 15;
757 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
758 Maasha::Biopieces::put_record( $record, $out );
763 foreach $file ( @{ $options->{ "files" } } )
765 $data_in = Maasha::Common::read_open( $file );
767 while ( $line = <$data_in> )
771 ( $seq_name, $seq_cs, $seq_qual ) = split /\t/, $line;
773 @scores = split /,/, $seq_qual;
774 @seqs = split //, Maasha::Solid::color_space2seq( $seq_cs );
776 for ( $i = 0; $i < @seqs; $i++ ) {
777 $seqs[ $i ] = lc $seqs[ $i ] if $scores[ $i ] < $options->{ "quality" };
782 SEQ_NAME => $seq_name,
784 SEQ_QUAL => join( ";", @scores ),
785 SEQ_LEN => length $seq_cs,
786 SEQ => join( "", @seqs ),
787 SCORE_MEAN => sprintf( "%.2f", Maasha::Calc::mean( \@scores ) ),
790 Maasha::Biopieces::put_record( $record, $out );
792 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
802 close $data_in if $data_in;
806 sub script_read_mysql
808 # Martin A. Hansen, May 2008.
810 # Read a MySQL query into stream.
812 my ( $in, # handle to in stream
813 $out, # handle to out stream
814 $options, # options hash
819 my ( $record, $dbh, $results );
821 $options->{ "user" } ||= Maasha::UCSC::ucsc_get_user();
822 $options->{ "password" } ||= Maasha::UCSC::ucsc_get_password();
824 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
825 Maasha::Biopieces::put_record( $record, $out );
828 $dbh = Maasha::SQL::connect( $options->{ "database" }, $options->{ "user" }, $options->{ "password" } );
830 $results = Maasha::SQL::query_hashref_list( $dbh, $options->{ "query" } );
832 Maasha::SQL::disconnect( $dbh );
834 map { Maasha::Biopieces::put_record( $_ ) } @{ $results };
838 sub script_complexity_seq
840 # Martin A. Hansen, May 2008.
842 # Generates an index calculated as the most common di-residue over
843 # the sequence length for all sequences in stream.
845 my ( $in, # handle to in stream
846 $out, # handle to out stream
851 my ( $record, $index );
853 while ( $record = Maasha::Biopieces::get_record( $in ) )
855 $record->{ "SEQ_COMPLEXITY" } = sprintf( "%.2f", Maasha::Seq::seq_complexity( $record->{ "SEQ" } ) ) if $record->{ "SEQ" };
857 Maasha::Biopieces::put_record( $record, $out );
862 sub script_get_genome_align
864 # Martin A. Hansen, April 2008.
866 # Gets a subalignment from a multiple genome alignment.
868 my ( $in, # handle to in stream
869 $out, # handle to out stream
870 $options, # options hash
875 my ( $record, $maf_track, $align, $align_num, $beg, $end, $len, $entry );
877 $options->{ "strand" } ||= "+";
881 $maf_track = Maasha::Config::maf_track( $options->{ "genome" } );
883 if ( $options->{ "chr" } and $options->{ "beg" } and ( $options->{ "end" } or $options->{ "len" } ) )
885 $beg = $options->{ "beg" } - 1;
887 if ( $options->{ "end" } ) {
888 $end = $options->{ "end" };
889 } elsif ( $options->{ "len" } ) {
890 $end = $beg + $options->{ "len" };
893 $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $options->{ "chr" }, $beg, $end, $options->{ "strand" } );
895 foreach $entry ( @{ $align } )
897 $entry->{ "CHR" } = $record->{ "CHR" };
898 $entry->{ "CHR_BEG" } = $record->{ "CHR_BEG" };
899 $entry->{ "CHR_END" } = $record->{ "CHR_END" };
900 $entry->{ "STRAND" } = $record->{ "STRAND" } || '+';
901 $entry->{ "Q_ID" } = $record->{ "Q_ID" };
902 $entry->{ "SCORE" } = $record->{ "SCORE" };
904 Maasha::Biopieces::put_record( $entry, $out );
908 while ( $record = Maasha::Biopieces::get_record( $in ) )
910 if ( $record->{ "REC_TYPE" } eq "BED" )
912 $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "CHR" }, $record->{ "CHR_BEG" }, $record->{ "CHR_END" }, $record->{ "STRAND" } );
914 elsif ( $record->{ "REC_TYPE" } eq "VMATCH" )
916 $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" } + 1, $record->{ "STRAND" } );
918 elsif ( $record->{ "REC_TYPE" } eq "PSL" )
920 $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $record->{ "STRAND" } );
922 elsif ( $record->{ "REC_TYPE" } eq "BLAST" )
924 $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $record->{ "STRAND" } );
927 foreach $entry ( @{ $align } )
929 $entry->{ "CHR" } = $record->{ "CHR" };
930 $entry->{ "CHR_BEG" } = $record->{ "CHR_BEG" };
931 $entry->{ "CHR_END" } = $record->{ "CHR_END" };
932 $entry->{ "STRAND" } = $record->{ "STRAND" };
933 $entry->{ "Q_ID" } = $record->{ "Q_ID" };
934 $entry->{ "SCORE" } = $record->{ "SCORE" };
936 Maasha::Biopieces::put_record( $entry, $out );
944 sub script_get_genome_phastcons
946 # Martin A. Hansen, February 2008.
948 # Get phastcons scores from genome intervals.
950 my ( $in, # handle to in stream
951 $out, # handle to out stream
952 $options, # options hash
957 my ( $phastcons_file, $phastcons_index, $index, $fh_phastcons, $scores, $record );
959 $options->{ "flank" } ||= 0;
961 $phastcons_file = Maasha::Config::genome_phastcons( $options->{ "genome" } );
962 $phastcons_index = Maasha::Config::genome_phastcons_index( $options->{ "genome" } );
964 $index = Maasha::UCSC::fixedstep_index_retrieve( $phastcons_index );
965 $fh_phastcons = Maasha::Common::read_open( $phastcons_file );
967 if ( defined $options->{ "chr" } and defined $options->{ "beg" } and ( defined $options->{ "end" } or defined $options->{ "len" } ) )
969 $options->{ "beg" } -= 1; # request is 1-based
970 $options->{ "end" } -= 1; # request is 1-based
972 if ( $options->{ "len" } ) {
973 $options->{ "end" } = $options->{ "beg" } + $options->{ "len" } - 1;
976 $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $options->{ "chr" }, $options->{ "beg" }, $options->{ "end" }, $options->{ "flank" } );
978 $record->{ "CHR" } = $options->{ "chr" };
979 $record->{ "CHR_BEG" } = $options->{ "beg" } - $options->{ "flank" };
980 $record->{ "CHR_END" } = $options->{ "end" } + $options->{ "flank" };
982 $record->{ "PHASTCONS" } = join ",", @{ $scores };
983 $record->{ "PHAST_COUNT" } = scalar @{ $scores }; # DEBUG
985 Maasha::Biopieces::put_record( $record, $out );
988 while ( $record = Maasha::Biopieces::get_record( $in ) )
990 if ( $record->{ "REC_TYPE" } eq "BED" )
992 $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "CHR" }, $record->{ "CHR_BEG" }, $record->{ "CHR_END" }, $options->{ "flank" } );
994 elsif ( $record->{ "REC_TYPE" } eq "PSL" )
996 $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $options->{ "flank" } );
998 elsif ( $record->{ "REC_TYPE" } eq "BLAST" )
1000 $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $options->{ "flank" } );
1003 $record->{ "PHASTCONS" } = join ",", @{ $scores } if @{ $scores };
1004 # $record->{ "PHAST_COUNT" } = @{ $scores } if @{ $scores }; # DEBUG
1006 Maasha::Biopieces::put_record( $record, $out );
1009 close $fh_phastcons if $fh_phastcons;
1015 # Martin A. Hansen, July 2008.
1017 # soap sequences in stream against a given file or genome.
1019 my ( $in, # handle to in stream
1020 $out, # handle to out stream
1021 $options, # options hash
1026 my ( $genome, $tmp_in, $tmp_out, $fh_in, $fh_out, $record, $line, @fields, $entry, $count, $args );
1028 $options->{ "seed_size" } ||= 10;
1029 $options->{ "mismatches" } ||= 2;
1030 $options->{ "gap_size" } ||= 0;
1031 $options->{ "cpus" } ||= 1;
1033 if ( $options->{ "genome" } ) {
1034 $options->{ "in_file" } = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/fasta/$options->{ 'genome' }.fna";
1037 $tmp_in = "$BP_TMP/soap_query.seq";
1038 $tmp_out = "$BP_TMP/soap.result";
1040 $fh_out = Maasha::Common::write_open( $tmp_in );
1044 while ( $record = Maasha::Biopieces::get_record( $in ) )
1046 if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) )
1048 Maasha::Fasta::put_entry( $entry, $fh_out );
1053 Maasha::Biopieces::put_record( $record, $out );
1061 "-s $options->{ 'seed_size' }",
1064 "-v $options->{ 'mismatches' }",
1065 "-g $options->{ 'gap_size' }",
1066 "-p $options->{ 'cpus' }",
1067 "-d $options->{ 'in_file' }",
1071 $args .= " > /dev/null 2>&1" if not $options->{ 'verbose' };
1073 Maasha::Common::run( "soap", $args, 1 );
1077 $fh_out = Maasha::Common::read_open( $tmp_out );
1081 while ( $line = <$fh_out> )
1085 @fields = split /\t/, $line;
1087 $record->{ "REC_TYPE" } = "SOAP";
1088 $record->{ "Q_ID" } = $fields[ 0 ];
1089 $record->{ "SCORE" } = $fields[ 3 ];
1090 $record->{ "STRAND" } = $fields[ 6 ];
1091 $record->{ "S_ID" } = $fields[ 7 ];
1092 $record->{ "S_BEG" } = $fields[ 8 ] - 1; # soap is 1-based
1093 $record->{ "S_END" } = $fields[ 8 ] + $fields[ 5 ] - 2;
1095 Maasha::Biopieces::put_record( $record, $out );
1105 sub script_remove_mysql_tables
1107 # Martin A. Hansen, November 2008.
1109 # Remove MySQL tables from values in stream.
1111 my ( $in, # handle to in stream
1112 $out, # handle to out stream
1113 $options, # options hash
1118 my ( $record, %table_hash, $dbh, $table );
1120 $options->{ "user" } ||= Maasha::UCSC::ucsc_get_user();
1121 $options->{ "password" } ||= Maasha::UCSC::ucsc_get_password();
1123 map { $table_hash{ $_ } = 1 } @{ $options->{ 'tables' } };
1125 while ( $record = Maasha::Biopieces::get_record( $in ) )
1127 map { $table_hash{ $record->{ $_ } } = 1 } @{ $options->{ 'keys' } };
1129 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ 'no_stream' };
1132 $dbh = Maasha::SQL::connect( $options->{ "database" }, $options->{ "user" }, $options->{ "password" } );
1134 foreach $table ( sort keys %table_hash )
1136 if ( Maasha::SQL::table_exists( $dbh, $table ) )
1138 print STDERR qq(Removing table "$table" from database "$options->{ 'database' }" ... ) if $options->{ 'verbose' };
1139 Maasha::SQL::delete_table( $dbh, $table );
1140 print STDERR "done.\n" if $options->{ 'verbose' };
1144 print STDERR qq(WARNING: table "$table" not found in database "$options->{ 'database' }\n");
1148 Maasha::SQL::disconnect( $dbh );
1152 sub script_remove_ucsc_tracks
1154 # Martin A. Hansen, November 2008.
1156 # Remove track from MySQL tables and config file.
1158 my ( $in, # handle to in stream
1159 $out, # handle to out stream
1160 $options, # options hash
1165 my ( $record, %track_hash, $fh_in, $fh_out, $track, @tracks, @new_tracks, $dbh );
1167 $options->{ 'user' } ||= Maasha::UCSC::ucsc_get_user();
1168 $options->{ 'password' } ||= Maasha::UCSC::ucsc_get_password();
1169 $options->{ 'config_file' } ||= "$ENV{ 'HOME' }/ucsc/my_tracks.ra";
1171 map { $track_hash{ $_ } = 1 } @{ $options->{ 'tracks' } };
1173 while ( $record = Maasha::Biopieces::get_record( $in ) )
1175 map { $track_hash{ $record->{ $_ } } = 1 } @{ $options->{ 'keys' } };
1177 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ 'no_stream' };
1180 $fh_in = Maasha::Filesys::file_read_open( $options->{ 'config_file' } );
1182 while ( $track = Maasha::UCSC::ucsc_config_entry_get( $fh_in ) ) {
1183 push @tracks, $track;
1188 foreach $track ( @tracks )
1190 if ( $track->{ 'database' } eq $options->{ 'database' } and exists $track_hash{ $track->{ 'track' } } ) {
1191 print STDERR qq(Removing track "$track->{ 'track' }" from config file.\n) if $options->{ 'verbose' };
1193 push @new_tracks, $track;
1197 rename "$options->{ 'config_file' }", "$options->{ 'config_file' }~";
1199 $fh_out = Maasha::Filesys::file_write_open( $options->{ 'config_file' } );
1201 map { Maasha::UCSC::ucsc_config_entry_put( $_, $fh_out ) } @new_tracks;
1205 # ---- locate track in database ----
1207 $dbh = Maasha::SQL::connect( $options->{ "database" }, $options->{ "user" }, $options->{ "password" } );
1209 foreach $track ( sort keys %track_hash )
1211 if ( Maasha::SQL::table_exists( $dbh, $track ) )
1213 print STDERR qq(Removing table "$track" from database "$options->{ 'database' }" ... ) if $options->{ 'verbose' };
1214 Maasha::SQL::delete_table( $dbh, $track );
1215 print STDERR "done.\n" if $options->{ 'verbose' };
1219 print STDERR qq(WARNING: table "$track" not found in database "$options->{ 'database' }\n");
1223 Maasha::SQL::disconnect( $dbh );
1225 Maasha::Common::run( "ucscMakeTracks.pl", "-b > /dev/null 2>&1" );
1229 sub script_upload_to_ucsc
1231 # Martin A. Hansen, August 2007.
1233 # Calculate the mean of values of given keys.
1235 # This routine has developed into the most ugly hack. Do something!
1237 my ( $in, # handle to in stream
1238 $out, # handle to out stream
1239 $options, # options hash
1244 my ( $record, $file, $wib_file, $wig_file, $wib_dir, $fh_out, $i, $first, $format, $type, $columns, $append, $entry );
1246 $options->{ "short_label" } ||= $options->{ 'table' };
1247 $options->{ "long_label" } ||= $options->{ 'table' };
1248 $options->{ "group" } ||= $ENV{ "LOGNAME" };
1249 $options->{ "priority" } ||= 1;
1250 $options->{ "visibility" } ||= "pack";
1251 $options->{ "color" } ||= join( ",", int( rand( 255 ) ), int( rand( 255 ) ), int( rand( 255 ) ) );
1252 $options->{ "chunk_size" } ||= 10_000_000_000; # Due to 32-bit UCSC compilation really large tables cannot be loaded in one go.
1254 $file = "$BP_TMP/ucsc_upload.tmp";
1259 $fh_out = Maasha::Common::write_open( $file );
1261 while ( $record = Maasha::Biopieces::get_record( $in ) )
1263 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
1265 if ( $record->{ "REC_TYPE" } eq "fixed_step" )
1269 if ( $entry = Maasha::UCSC::Wiggle::biopiece2fixedstep( $record ) ) {
1270 Maasha::UCSC::Wiggle::fixedstep_entry_put( $entry, $fh_out );
1273 elsif ( $record->{ "REC_TYPE" } eq "PSL" )
1277 Maasha::UCSC::psl_put_header( $fh_out ) if $first;
1278 Maasha::UCSC::psl_put_entry( $record, $fh_out );
1282 elsif ( $record->{ "REC_TYPE" } eq "BED" and $record->{ "SEC_STRUCT" } )
1284 # chrom chromStart chromEnd name score strand size secStr conf
1288 print $fh_out join ( "\t",
1290 $record->{ "CHR_BEG" },
1291 $record->{ "CHR_END" } + 1,
1292 $record->{ "Q_ID" },
1293 $record->{ "SCORE" },
1294 $record->{ "STRAND" },
1295 $record->{ "SIZE" },
1296 $record->{ "SEC_STRUCT" },
1297 $record->{ "CONF" },
1300 elsif ( $record->{ "REC_TYPE" } eq "BED" )
1303 $columns = $record->{ "BED_COLS" };
1305 if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) {
1306 Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns, $options->{ 'check' } );
1309 elsif ( $record->{ "REC_TYPE" } eq "PATSCAN" and $record->{ "CHR" } )
1314 if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) {
1315 Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns, $options->{ 'check' } );
1318 elsif ( $record->{ "REC_TYPE" } eq "BLAST" and $record->{ "S_ID" } =~ /^chr/ )
1323 $record->{ "SCORE" } = $record->{ "BIT_SCORE" } * 1000;
1325 if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) {
1326 Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns, $options->{ 'check' } );
1329 elsif ( $record->{ "REC_TYPE" } eq "VMATCH" and $record->{ "S_ID" } =~ /^chr/i )
1334 if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) {
1335 Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns, $options->{ 'check' } );
1339 if ( $i == $options->{ "chunk_size" } )
1343 if ( $format eq "BED" ) {
1344 Maasha::UCSC::bed_upload_to_ucsc( $BP_TMP, $file, $options, $append );
1345 } elsif ( $format eq "PSL" ) {
1346 Maasha::UCSC::psl_upload_to_ucsc( $file, $options, $append );
1355 $fh_out = Maasha::Common::write_open( $file );
1363 if ( exists $options->{ "database" } and $options->{ "table" } )
1365 if ( $format eq "BED" )
1367 $type = "bed $columns";
1369 Maasha::UCSC::bed_upload_to_ucsc( $BP_TMP, $file, $options, $append );
1371 elsif ( $format eq "BED_SS" )
1373 $type = "type bed 6 +";
1375 Maasha::UCSC::bed_upload_to_ucsc( $BP_TMP, $file, $options, $append );
1377 elsif ( $format eq "PSL" )
1381 Maasha::UCSC::psl_upload_to_ucsc( $file, $options, $append );
1383 elsif ( $format eq "WIGGLE" )
1385 $options->{ "visibility" } = "full";
1387 $wig_file = "$options->{ 'table' }.wig";
1388 $wib_file = "$options->{ 'table' }.wib";
1390 $wib_dir = "$ENV{ 'HOME' }/ucsc/wib";
1392 Maasha::Common::dir_create_if_not_exists( $wib_dir );
1394 if ( $options->{ 'verbose' } ) {
1395 `cd $BP_TMP && wigEncode $file $wig_file $wib_file`;
1397 `cd $BP_TMP && wigEncode $file $wig_file $wib_file > /dev/null 2>&1`;
1400 Maasha::Common::run( "mv", "$BP_TMP/$wib_file $wib_dir" );
1408 Maasha::UCSC::wiggle_upload_to_ucsc( $BP_TMP, $wib_dir, $file, $options );
1413 Maasha::UCSC::ucsc_update_config( $options, $type );
1418 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<