1 package Maasha::BioRun;
4 # Copyright (C) 2007-2009 Martin A. Hansen.
6 # This program is free software; you can redistribute it and/or
7 # modify it under the terms of the GNU General Public License
8 # as published by the Free Software Foundation; either version 2
9 # of the License, or (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 # http://www.gnu.org/copyleft/gpl.html
23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
26 # Routines that contains Biopieces which are run.
29 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
34 use Getopt::Long qw( :config bundling );
35 use Time::HiRes qw( gettimeofday );
36 use Storable qw( dclone );
37 use Maasha::Biopieces;
46 use Maasha::Stockholm;
52 use Maasha::UCSC::BED;
53 use Maasha::UCSC::Wiggle;
62 use vars qw( @ISA @EXPORT_OK );
66 @ISA = qw( Exporter );
74 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> GLOBALS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
77 my ( $script, $BP_TMP );
79 $script = Maasha::Common::get_scriptname();
80 $BP_TMP = Maasha::Common::get_tmpdir();
83 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> RUN SCRIPT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
86 run_script( $script );
89 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SUBROUTINES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
94 # Martin A. Hansen, August 2007.
96 # Run a specific script.
98 my ( $script, # script name
103 my ( $t0, $t1, $options, $in, $out );
105 Maasha::Biopieces::log_biopiece();
107 $t0 = gettimeofday();
109 $options = get_options( $script );
111 $options->{ "SCRIPT" } = $script;
113 $script = "print_usage" if ( -t STDIN and keys %{ $options } <= 1 or $options->{ 'help' } );
115 $in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } );
116 $out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } );
118 if ( $script eq "print_usage" ) { script_print_usage( $in, $out, $options ) }
119 elsif ( $script eq "read_psl" ) { script_read_psl( $in, $out, $options ) }
120 elsif ( $script eq "read_bed" ) { script_read_bed( $in, $out, $options ) }
121 elsif ( $script eq "read_fixedstep" ) { script_read_fixedstep( $in, $out, $options ) }
122 elsif ( $script eq "read_blast_tab" ) { script_read_blast_tab( $in, $out, $options ) }
123 elsif ( $script eq "read_embl" ) { script_read_embl( $in, $out, $options ) }
124 elsif ( $script eq "read_stockholm" ) { script_read_stockholm( $in, $out, $options ) }
125 elsif ( $script eq "read_phastcons" ) { script_read_phastcons( $in, $out, $options ) }
126 elsif ( $script eq "read_soft" ) { script_read_soft( $in, $out, $options ) }
127 elsif ( $script eq "read_gff" ) { script_read_gff( $in, $out, $options ) }
128 elsif ( $script eq "read_2bit" ) { script_read_2bit( $in, $out, $options ) }
129 elsif ( $script eq "read_solexa" ) { script_read_solexa( $in, $out, $options ) }
130 elsif ( $script eq "read_solid" ) { script_read_solid( $in, $out, $options ) }
131 elsif ( $script eq "read_mysql" ) { script_read_mysql( $in, $out, $options ) }
132 elsif ( $script eq "read_ucsc_config" ) { script_read_ucsc_config( $in, $out, $options ) }
133 elsif ( $script eq "uppercase_seq" ) { script_uppercase_seq( $in, $out, $options ) }
134 elsif ( $script eq "complexity_seq" ) { script_complexity_seq( $in, $out, $options ) }
135 elsif ( $script eq "get_genome_align" ) { script_get_genome_align( $in, $out, $options ) }
136 elsif ( $script eq "get_genome_phastcons" ) { script_get_genome_phastcons( $in, $out, $options ) }
137 elsif ( $script eq "soap_seq" ) { script_soap_seq( $in, $out, $options ) }
138 elsif ( $script eq "write_bed" ) { script_write_bed( $in, $out, $options ) }
139 elsif ( $script eq "write_psl" ) { script_write_psl( $in, $out, $options ) }
140 elsif ( $script eq "write_fixedstep" ) { script_write_fixedstep( $in, $out, $options ) }
141 elsif ( $script eq "write_2bit" ) { script_write_2bit( $in, $out, $options ) }
142 elsif ( $script eq "write_solid" ) { script_write_solid( $in, $out, $options ) }
143 elsif ( $script eq "write_ucsc_config" ) { script_write_ucsc_config( $in, $out, $options ) }
144 elsif ( $script eq "remove_mysql_tables" ) { script_remove_mysql_tables( $in, $out, $options ) }
145 elsif ( $script eq "remove_ucsc_tracks" ) { script_remove_ucsc_tracks( $in, $out, $options ) }
146 elsif ( $script eq "plot_matches" ) { script_plot_matches( $in, $out, $options ) }
147 elsif ( $script eq "plot_seqlogo" ) { script_plot_seqlogo( $in, $out, $options ) }
148 elsif ( $script eq "plot_phastcons_profiles" ) { script_plot_phastcons_profiles( $in, $out, $options ) }
149 elsif ( $script eq "upload_to_ucsc" ) { script_upload_to_ucsc( $in, $out, $options ) }
151 close $in if defined $in;
154 $t1 = gettimeofday();
156 print STDERR "Program: $script" . ( " " x ( 25 - length( $script ) ) ) . sprintf( "Run time: %.4f\n", ( $t1 - $t0 ) ) if $options->{ 'verbose' };
162 # Martin A. Hansen, February 2008.
164 # Gets options from commandline and checks these vigerously.
166 my ( $script, # name of script
171 my ( %options, @options, $opt, @genomes, $real );
173 if ( $script eq "print_usage" )
179 elsif ( $script eq "read_psl" )
186 elsif ( $script eq "read_bed" )
195 elsif ( $script eq "read_fixedstep" )
202 elsif ( $script eq "read_blast_tab" )
209 elsif ( $script eq "read_embl" )
219 elsif ( $script eq "read_stockholm" )
226 elsif ( $script eq "read_phastcons" )
237 elsif ( $script eq "read_soft" )
245 elsif ( $script eq "read_gff" )
252 elsif ( $script eq "read_2bit" )
260 elsif ( $script eq "read_solexa" )
269 elsif ( $script eq "read_solid" )
277 elsif ( $script eq "read_mysql" )
286 elsif ( $script eq "read_ucsc_config" )
293 elsif ( $script eq "get_genome_align" )
304 elsif ( $script eq "get_genome_phastcons" )
315 elsif ( $script eq "soap_seq" )
326 elsif ( $script eq "write_bed" )
336 elsif ( $script eq "write_psl" )
344 elsif ( $script eq "write_fixedstep" )
352 elsif ( $script eq "write_2bit" )
360 elsif ( $script eq "write_solid" )
369 elsif ( $script eq "write_ucsc_config" )
376 elsif ( $script eq "plot_seqlogo" )
383 elsif ( $script eq "plot_phastcons_profiles" )
398 elsif ( $script eq "remove_mysql_tables" )
409 elsif ( $script eq "remove_ucsc_tracks" )
421 elsif ( $script eq "plot_matches" )
433 elsif ( $script eq "upload_to_ucsc" )
457 # print STDERR Dumper( \@options );
464 # print STDERR Dumper( \%options );
466 if ( -t STDIN && scalar( keys %options ) == 0 or $options{ "help" } ) {
467 return wantarray ? %options : \%options;
470 $options{ "cols" } = [ split ",", $options{ "cols" } ] if defined $options{ "cols" };
471 $options{ "keys" } = [ split ",", $options{ "keys" } ] if defined $options{ "keys" };
472 $options{ "no_keys" } = [ split ",", $options{ "no_keys" } ] if defined $options{ "no_keys" };
473 $options{ "save_keys" } = [ split ",", $options{ "save_keys" } ] if defined $options{ "save_keys" };
474 $options{ "quals" } = [ split ",", $options{ "quals" } ] if defined $options{ "quals" };
475 $options{ "feats" } = [ split ",", $options{ "feats" } ] if defined $options{ "feats" };
476 $options{ "frames" } = [ split ",", $options{ "frames" } ] if defined $options{ "frames" };
477 $options{ "samples" } = [ split ",", $options{ "samples" } ] if defined $options{ "samples" };
478 $options{ "tables" } = [ split ",", $options{ "tables" } ] if defined $options{ "tables" };
479 $options{ "tracks" } = [ split ",", $options{ "tracks" } ] if defined $options{ "tracks" };
481 # ---- check arguments ----
483 if ( $options{ 'data_in' } )
485 $options{ "files" } = Maasha::Biopieces::getopt_files( $options{ 'data_in' } );
487 Maasha::Common::error( qq(Argument to --data_in must be a valid file or fileglob expression) ) if scalar @{ $options{ "files" } } == 0;
490 map { Maasha::Common::error( qq(Argument to --cols must be a whole numbers - not "$_") ) if $_ !~ /^\d+$/ } @{ $options{ "cols" } } if $options{ "cols" };
492 # print STDERR Dumper( \%options );
494 $real = "beg|end|word_size|wrap|len|prefix_length|mismatches|offset|num|skip|cpus|window_size|step_size";
496 foreach $opt ( keys %options )
498 if ( $opt =~ /stream_in|pattern_in|exact_in/ and not -f $options{ $opt } )
500 Maasha::Common::error( qq(Argument to --$opt must be a valid file or fileglob expression - not "$options{ $opt }") );
502 elsif ( $opt =~ /$real/ and $options{ $opt } !~ /^\d+$/ )
504 Maasha::Common::error( qq(Argument to --$opt must be a whole number - not "$options{ $opt }") );
506 elsif ( $opt =~ /max_hits|max_hits|max_misses|dist|edit_dist|flank|gap|hamming_dist|priority/ and $options{ $opt } !~ /^-?\d+$/ )
508 Maasha::Common::error( qq(Argument to --$opt must be an integer - not "$options{ $opt }") );
510 elsif ( $opt =~ /identity|threshold/ and $options{ $opt } !~ /^-?(?:\d+(?:\.\d*)?|\.\d+)$/ )
512 Maasha::Common::error( qq(Argument to --$opt must be a decimal number - not "$options{ $opt }") );
514 elsif ( $opt =~ /e_val/ and $options{ $opt } !~ /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/ )
516 Maasha::Common::error( qq(Argument to --$opt must be a float - not "$options{ $opt }") );
518 elsif ( $opt =~ /strand/ and $options{ $opt } !~ /^(\+|-)$/ )
520 Maasha::Common::error( qq(Argument to --$opt must be "+" or "-" - not "$options{ $opt }") );
522 elsif ( $opt eq "genome" )
524 @genomes = Maasha::Common::ls_dirs( "$ENV{ 'BP_DATA' }/genomes" );
525 map { $_ =~ s/.*\/(.+)$/$1/ } @genomes;
527 if ( not grep { $_ =~ /^$options{ $opt }$/ } @genomes ) {
528 Maasha::Common::error( qq(Genome $options{ $opt } not found in "$ENV{ 'BP_DATA' }/genomes/") );
531 elsif ( $opt eq "terminal" and not $options{ $opt } =~ /^(svg|post|dumb|x11)/ )
533 Maasha::Common::error( qq(Bad --$opt argument "$options{ $opt }") );
535 elsif ( $opt eq "table" and $options{ $opt } =~ /(-|\.)/ )
537 Maasha::Common::error( qq(Character '$1' is not allowed in table name: $options{ $opt }) );
539 elsif ( $opt eq "merge" and $options{ $opt } !~ /^(AandB|AorB|BorA|AnotB|BnotA)$/ )
541 Maasha::Common::error( qq(Argument to --$opt must be AandB, AorB, BorA, AnotB, or BnotA - not "$options{ $opt }") );
543 elsif ( $opt eq "format" and $script eq "read_solexa" and $options{ $opt } !~ /octal|decimal/ )
545 Maasha::Common::error( qq(Argument to --$opt must be octal or decimal - not "$options{ $opt }") );
549 Maasha::Common::error( qq(no --database specified) ) if $script eq "remove_ucsc_tracks" and not $options{ "database" };
550 Maasha::Common::error( qq(no --in_file or --genome specified) ) if $script eq "soap_seq" and not $options{ "genome" } and not $options{ "in_file" };
551 Maasha::Common::error( qq(both --in_file and --genome specified) ) if $script eq "soap_seq" and $options{ "genome" } and $options{ "in_file" };
552 Maasha::Common::error( qq(no --genome specified) ) if $script =~ /get_genome_align|get_genome_phastcons|plot_phastcons_profiles/ and not $options{ "genome" };
554 if ( $script eq "upload_to_ucsc" )
556 Maasha::Common::error( qq(no --database specified) ) if not $options{ "database" };
557 Maasha::Common::error( qq(no --table specified) ) if not $options{ "table" };
560 return wantarray ? %options : \%options;
564 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SCRIPTS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
567 sub script_print_usage
569 # Martin A. Hansen, January 2008.
571 # Retrieves usage information from file and
572 # prints this nicely formatted.
574 my ( $in, # handle to in stream
575 $out, # handle to out stream
576 $options, # options hash
581 my ( $file, $wiki, $lines );
583 if ( $options->{ 'data_in' } ) {
584 $file = $options->{ 'data_in' };
586 $file = join "", $ENV{ 'BP_DIR' }, "/bp_usage/", $options->{ 'SCRIPT' }, ".wiki";
589 $wiki = Maasha::Gwiki::gwiki_read( $file );
591 ( $wiki->[ 2 ], $wiki->[ 3 ], $wiki->[ 0 ], $wiki->[ 1 ] ) = ( $wiki->[ 0 ], $wiki->[ 1 ], $wiki->[ 2 ], $wiki->[ 3 ] );
593 if ( not $options->{ "help" } ) {
594 @{ $wiki } = grep { $_->[ 0 ]->{ 'SECTION' } =~ /Biopiece|summary|Usage|Options|Help/ } @{ $wiki };
597 $lines = Maasha::Gwiki::gwiki2ascii( $wiki );
599 print STDERR "$_\n" foreach @{ $lines };
607 # Martin A. Hansen, August 2007.
609 # Read psl table from stream or file.
611 my ( $in, # handle to in stream
612 $out, # handle to out stream
613 $options, # options hash
618 my ( $record, $file, $data_in, $num );
620 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
621 Maasha::Biopieces::put_record( $record, $out );
626 foreach $file ( @{ $options->{ "files" } } )
628 $data_in = Maasha::Common::read_open( $file );
630 while ( $record = Maasha::UCSC::psl_get_entry( $data_in ) )
632 Maasha::Biopieces::put_record( $record, $out );
634 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
646 # Martin A. Hansen, August 2007.
648 # Read bed table from stream or file.
650 my ( $in, # handle to in stream
651 $out, # handle to out stream
652 $options, # options hash
657 my ( $cols, $file, $record, $bed_entry, $data_in, $num );
659 $cols = $options->{ 'cols' }->[ 0 ];
661 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
662 Maasha::Biopieces::put_record( $record, $out );
667 foreach $file ( @{ $options->{ "files" } } )
669 $data_in = Maasha::Common::read_open( $file );
671 while ( $bed_entry = Maasha::UCSC::BED::bed_entry_get( $data_in, $cols, $options->{ 'check' } ) )
673 $record = Maasha::UCSC::BED::bed2biopiece( $bed_entry );
675 Maasha::Biopieces::put_record( $record, $out );
677 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
687 close $data_in if $data_in;
691 sub script_read_fixedstep
693 # Martin A. Hansen, Juli 2008.
695 # Read fixedstep wiggle format from stream or file.
697 my ( $in, # handle to in stream
698 $out, # handle to out stream
699 $options, # options hash
704 my ( $file, $record, $entry, $data_in, $num );
706 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
707 Maasha::Biopieces::put_record( $record, $out );
712 foreach $file ( @{ $options->{ "files" } } )
714 $data_in = Maasha::Common::read_open( $file );
716 while ( $entry = Maasha::UCSC::Wiggle::fixedstep_entry_get( $data_in ) )
718 $record = Maasha::UCSC::Wiggle::fixedstep2biopiece( $entry );
720 Maasha::Biopieces::put_record( $record, $out );
722 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
732 close $data_in if $data_in;
736 sub script_read_blast_tab
738 # Martin A. Hansen, September 2007.
740 # Read tabular BLAST output from NCBI blast run with -m8 or -m9.
742 my ( $in, # handle to in stream
743 $out, # handle to out stream
744 $options, # options hash
749 my ( $file, $line, @fields, $strand, $record, $data_in, $num );
751 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
752 Maasha::Biopieces::put_record( $record, $out );
757 foreach $file ( @{ $options->{ "files" } } )
759 $data_in = Maasha::Common::read_open( $file );
761 while ( $line = <$data_in> )
765 next if $line =~ /^#/;
767 @fields = split /\t/, $line;
769 $record->{ "REC_TYPE" } = "BLAST";
770 $record->{ "Q_ID" } = $fields[ 0 ];
771 $record->{ "S_ID" } = $fields[ 1 ];
772 $record->{ "IDENT" } = $fields[ 2 ];
773 $record->{ "ALIGN_LEN" } = $fields[ 3 ];
774 $record->{ "MISMATCHES" } = $fields[ 4 ];
775 $record->{ "GAPS" } = $fields[ 5 ];
776 $record->{ "Q_BEG" } = $fields[ 6 ] - 1; # BLAST is 1-based
777 $record->{ "Q_END" } = $fields[ 7 ] - 1; # BLAST is 1-based
778 $record->{ "S_BEG" } = $fields[ 8 ] - 1; # BLAST is 1-based
779 $record->{ "S_END" } = $fields[ 9 ] - 1; # BLAST is 1-based
780 $record->{ "E_VAL" } = $fields[ 10 ];
781 $record->{ "BIT_SCORE" } = $fields[ 11 ];
783 if ( $record->{ "S_BEG" } > $record->{ "S_END" } )
785 $record->{ "STRAND" } = '-';
787 ( $record->{ "S_BEG" }, $record->{ "S_END" } ) = ( $record->{ "S_END" }, $record->{ "S_BEG" } );
791 $record->{ "STRAND" } = '+';
794 Maasha::Biopieces::put_record( $record, $out );
796 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
806 close $data_in if $data_in;
812 # Martin A. Hansen, August 2007.
816 my ( $in, # handle to in stream
817 $out, # handle to out stream
818 $options, # options hash
823 my ( %options2, $file, $data_in, $num, $entry, $record );
825 map { $options2{ "keys" }{ $_ } = 1 } @{ $options->{ "keys" } };
826 map { $options2{ "feats" }{ $_ } = 1 } @{ $options->{ "feats" } };
827 map { $options2{ "quals" }{ $_ } = 1 } @{ $options->{ "quals" } };
829 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
830 Maasha::Biopieces::put_record( $record, $out );
835 foreach $file ( @{ $options->{ "files" } } )
837 $data_in = Maasha::Common::read_open( $file );
839 while ( $entry = Maasha::EMBL::get_embl_entry( $data_in ) )
841 $record = Maasha::EMBL::parse_embl_entry( $entry, \%options2 );
843 my ( $feat, $feat2, $qual, $qual_val, $record_copy );
845 $record_copy = dclone $record;
847 delete $record_copy->{ "FT" };
849 Maasha::Biopieces::put_record( $record_copy, $out );
851 delete $record_copy->{ "SEQ" };
853 foreach $feat ( keys %{ $record->{ "FT" } } )
855 $record_copy->{ "FEAT_TYPE" } = $feat;
857 foreach $feat2 ( @{ $record->{ "FT" }->{ $feat } } )
859 foreach $qual ( keys %{ $feat2 } )
861 $qual_val = join "; ", @{ $feat2->{ $qual } };
866 $record_copy->{ $qual } = $qual_val;
869 Maasha::Biopieces::put_record( $record_copy, $out );
873 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
883 close $data_in if $data_in;
887 sub script_read_stockholm
889 # Martin A. Hansen, August 2007.
891 # Read Stockholm format.
893 my ( $in, # handle to in stream
894 $out, # handle to out stream
895 $options, # options hash
900 my ( $data_in, $file, $num, $entry, $record, $record_anno, $record_align, $key, $seq );
902 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
903 Maasha::Biopieces::put_record( $record, $out );
908 foreach $file ( @{ $options->{ "files" } } )
910 $data_in = Maasha::Common::read_open( $file );
912 while ( $entry = Maasha::Stockholm::get_stockholm_entry( $data_in ) )
914 $record = Maasha::Stockholm::parse_stockholm_entry( $entry );
918 foreach $key ( keys %{ $record->{ "GF" } } ) {
919 $record_anno->{ $key } = $record->{ "GF" }->{ $key };
922 $record_anno->{ "ALIGN" } = $num;
924 Maasha::Biopieces::put_record( $record_anno, $out );
926 foreach $seq ( @{ $record->{ "ALIGN" } } )
931 SEQ_NAME => $seq->[ 0 ],
935 Maasha::Biopieces::put_record( $record_align, $out );
938 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
948 close $data_in if $data_in;
952 sub script_read_phastcons
954 # Martin A. Hansen, December 2007.
956 # Read PhastCons format.
958 my ( $in, # handle to in stream
959 $out, # handle to out stream
960 $options, # options hash
965 my ( $data_in, $file, $num, $entry, @records, $record );
967 $options->{ "min" } ||= 10;
968 $options->{ "dist" } ||= 25;
969 $options->{ "threshold" } ||= 0.8;
970 $options->{ "gap" } ||= 5;
972 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
973 Maasha::Biopieces::put_record( $record, $out );
978 foreach $file ( @{ $options->{ "files" } } )
980 $data_in = Maasha::Common::read_open( $file );
982 while ( $entry = Maasha::UCSC::fixedstep_get_entry( $data_in ) )
984 @records = Maasha::UCSC::phastcons_parse_entry( $entry, $options );
986 foreach $record ( @records )
988 $record->{ "REC_TYPE" } = "BED";
989 $record->{ "BED_LEN" } = $record->{ "CHR_END" } - $record->{ "CHR_BEG" } + 1;
991 Maasha::Biopieces::put_record( $record, $out );
993 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1004 close $data_in if $data_in;
1008 sub script_read_soft
1010 # Martin A. Hansen, December 2007.
1013 # http://www.ncbi.nlm.nih.gov/geo/info/soft2.html
1015 my ( $in, # handle to in stream
1016 $out, # handle to out stream
1017 $options, # options hash
1022 my ( $data_in, $file, $num, $records, $record, $soft_index, $fh, @platforms, $plat_table, @samples, $sample, $old_end, $skip );
1024 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1025 Maasha::Biopieces::put_record( $record, $out );
1030 foreach $file ( @{ $options->{ "files" } } )
1032 print STDERR "Creating index for file: $file\n" if $options->{ "verbose" };
1034 $soft_index = Maasha::NCBI::soft_index_file( $file );
1036 $fh = Maasha::Common::read_open( $file );
1038 @platforms = grep { $_->{ "SECTION" } =~ /PLATFORM/ } @{ $soft_index };
1040 print STDERR "Getting platform tables for file: $file\n" if $options->{ "verbose" };
1042 $plat_table = Maasha::NCBI::soft_get_platform( $fh, $platforms[ 0 ]->{ "LINE_BEG" }, $platforms[ -1 ]->{ "LINE_END" } );
1044 @samples = grep { $_->{ "SECTION" } =~ /SAMPLE/ } @{ $soft_index };
1046 $old_end = $platforms[ -1 ]->{ "LINE_END" };
1048 foreach $sample ( @samples )
1051 $skip = 1 if ( $options->{ "samples" } and grep { $sample->{ "SECTION" } !~ /$_/ } @{ $options->{ "samples" } } );
1053 print STDERR "Getting samples for dataset: $sample->{ 'SECTION' }\n" if $options->{ "verbose" } and not $skip;
1055 $records = Maasha::NCBI::soft_get_sample( $fh, $plat_table, $sample->{ "LINE_BEG" } - $old_end - 1, $sample->{ "LINE_END" } - $old_end - 1, $skip );
1057 foreach $record ( @{ $records } )
1059 Maasha::Biopieces::put_record( $record, $out );
1061 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1066 $old_end = $sample->{ "LINE_END" };
1074 close $data_in if $data_in;
1081 # Martin A. Hansen, February 2008.
1084 # http://www.ncbi.nlm.nih.gov/geo/info/soft2.html
1086 my ( $in, # handle to in stream
1087 $out, # handle to out stream
1088 $options, # options hash
1093 my ( $data_in, $file, $fh, $num, $record, $entry );
1095 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1096 Maasha::Biopieces::put_record( $record, $out );
1101 foreach $file ( @{ $options->{ "files" } } )
1103 $fh = Maasha::Common::read_open( $file );
1105 while ( $entry = Maasha::GFF::get_entry( $fh ) )
1107 Maasha::Biopieces::put_record( $entry, $out );
1109 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1119 close $data_in if $data_in;
1123 sub script_read_2bit
1125 # Martin A. Hansen, March 2008.
1127 # Read sequences from 2bit file.
1129 my ( $in, # handle to in stream
1130 $out, # handle to out stream
1131 $options, # options hash
1136 my ( $record, $file, $data_in, $mask, $toc, $line, $num );
1138 $mask = 1 if not $options->{ "no_mask" };
1140 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1141 Maasha::Biopieces::put_record( $record, $out );
1146 foreach $file ( @{ $options->{ "files" } } )
1148 $data_in = Maasha::Common::read_open( $file );
1150 $toc = Maasha::TwoBit::twobit_get_TOC( $data_in );
1152 foreach $line ( @{ $toc } )
1154 $record->{ "SEQ_NAME" } = $line->[ 0 ];
1155 $record->{ "SEQ" } = Maasha::TwoBit::twobit_get_seq( $data_in, $line->[ 1 ], undef, undef, $mask );
1156 $record->{ "SEQ_LEN" } = length $record->{ "SEQ" };
1158 Maasha::Biopieces::put_record( $record, $out );
1160 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1170 close $data_in if $data_in;
1174 sub script_read_solexa
1176 # Martin A. Hansen, March 2008.
1178 # Read Solexa sequence reads from file.
1180 my ( $in, # handle to in stream
1181 $out, # handle to out stream
1182 $options, # options hash
1187 my ( $record, $file, $data_in, $entry, $num, @seqs, @scores, $i );
1189 $options->{ "format" } ||= "octal";
1190 $options->{ "quality" } ||= 20;
1192 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1193 Maasha::Biopieces::put_record( $record, $out );
1198 foreach $file ( @{ $options->{ "files" } } )
1200 $data_in = Maasha::Common::read_open( $file );
1202 if ( $options->{ "format" } eq "octal" )
1204 while ( $entry = Maasha::Solexa::solexa_get_entry_octal( $data_in ) )
1206 $record = Maasha::Solexa::solexa2biopiece( $entry, $options->{ "quality" } );
1208 Maasha::Biopieces::put_record( $record, $out );
1210 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1217 while ( $entry = Maasha::Solexa::solexa_get_entry_decimal( $data_in ) )
1219 $record = Maasha::Solexa::solexa2biopiece( $entry, $options->{ "quality" } );
1221 Maasha::Biopieces::put_record( $record, $out );
1223 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1234 close $data_in if $data_in;
1238 sub script_read_solid
1240 # Martin A. Hansen, April 2008.
1242 # Read Solid sequence from file.
1244 my ( $in, # handle to in stream
1245 $out, # handle to out stream
1246 $options, # options hash
1251 my ( $record, $file, $data_in, $line, $num, $seq_name, $seq_cs, $seq_qual, @scores, @seqs, $i );
1253 $options->{ "quality" } ||= 15;
1255 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1256 Maasha::Biopieces::put_record( $record, $out );
1261 foreach $file ( @{ $options->{ "files" } } )
1263 $data_in = Maasha::Common::read_open( $file );
1265 while ( $line = <$data_in> )
1269 ( $seq_name, $seq_cs, $seq_qual ) = split /\t/, $line;
1271 @scores = split /,/, $seq_qual;
1272 @seqs = split //, Maasha::Solid::color_space2seq( $seq_cs );
1274 for ( $i = 0; $i < @seqs; $i++ ) {
1275 $seqs[ $i ] = lc $seqs[ $i ] if $scores[ $i ] < $options->{ "quality" };
1279 REC_TYPE => 'SOLID',
1280 SEQ_NAME => $seq_name,
1282 SEQ_QUAL => join( ";", @scores ),
1283 SEQ_LEN => length $seq_cs,
1284 SEQ => join( "", @seqs ),
1285 SCORE_MEAN => sprintf( "%.2f", Maasha::Calc::mean( \@scores ) ),
1288 Maasha::Biopieces::put_record( $record, $out );
1290 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1300 close $data_in if $data_in;
1304 sub script_read_mysql
1306 # Martin A. Hansen, May 2008.
1308 # Read a MySQL query into stream.
1310 my ( $in, # handle to in stream
1311 $out, # handle to out stream
1312 $options, # options hash
1317 my ( $record, $dbh, $results );
1319 $options->{ "user" } ||= Maasha::UCSC::ucsc_get_user();
1320 $options->{ "password" } ||= Maasha::UCSC::ucsc_get_password();
1322 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1323 Maasha::Biopieces::put_record( $record, $out );
1326 $dbh = Maasha::SQL::connect( $options->{ "database" }, $options->{ "user" }, $options->{ "password" } );
1328 $results = Maasha::SQL::query_hashref_list( $dbh, $options->{ "query" } );
1330 Maasha::SQL::disconnect( $dbh );
1332 map { Maasha::Biopieces::put_record( $_ ) } @{ $results };
1336 sub script_read_ucsc_config
1338 # Martin A. Hansen, November 2008.
1340 # Read track entries from UCSC Genome Browser '.ra' files.
1342 my ( $in, # handle to in stream
1343 $out, # handle to out stream
1344 $options, # options hash
1349 my ( $record, $file, $data_in, $entry, $num );
1351 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1352 Maasha::Biopieces::put_record( $record, $out );
1357 foreach $file ( @{ $options->{ "files" } } )
1359 $data_in = Maasha::Common::read_open( $file );
1361 while ( $record = Maasha::UCSC::ucsc_config_entry_get( $data_in ) )
1363 $record->{ 'REC_TYPE' } = "UCSC Config";
1365 Maasha::Biopieces::put_record( $record, $out );
1367 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1377 close $data_in if $data_in;
1381 sub script_complexity_seq
1383 # Martin A. Hansen, May 2008.
1385 # Generates an index calculated as the most common di-residue over
1386 # the sequence length for all sequences in stream.
1388 my ( $in, # handle to in stream
1389 $out, # handle to out stream
1394 my ( $record, $index );
1396 while ( $record = Maasha::Biopieces::get_record( $in ) )
1398 $record->{ "SEQ_COMPLEXITY" } = sprintf( "%.2f", Maasha::Seq::seq_complexity( $record->{ "SEQ" } ) ) if $record->{ "SEQ" };
1400 Maasha::Biopieces::put_record( $record, $out );
1405 sub script_get_genome_align
1407 # Martin A. Hansen, April 2008.
1409 # Gets a subalignment from a multiple genome alignment.
1411 my ( $in, # handle to in stream
1412 $out, # handle to out stream
1413 $options, # options hash
1418 my ( $record, $maf_track, $align, $align_num, $beg, $end, $len, $entry );
1420 $options->{ "strand" } ||= "+";
1424 $maf_track = Maasha::Config::maf_track( $options->{ "genome" } );
1426 if ( $options->{ "chr" } and $options->{ "beg" } and ( $options->{ "end" } or $options->{ "len" } ) )
1428 $beg = $options->{ "beg" } - 1;
1430 if ( $options->{ "end" } ) {
1431 $end = $options->{ "end" };
1432 } elsif ( $options->{ "len" } ) {
1433 $end = $beg + $options->{ "len" };
1436 $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $options->{ "chr" }, $beg, $end, $options->{ "strand" } );
1438 foreach $entry ( @{ $align } )
1440 $entry->{ "CHR" } = $record->{ "CHR" };
1441 $entry->{ "CHR_BEG" } = $record->{ "CHR_BEG" };
1442 $entry->{ "CHR_END" } = $record->{ "CHR_END" };
1443 $entry->{ "STRAND" } = $record->{ "STRAND" } || '+';
1444 $entry->{ "Q_ID" } = $record->{ "Q_ID" };
1445 $entry->{ "SCORE" } = $record->{ "SCORE" };
1447 Maasha::Biopieces::put_record( $entry, $out );
1451 while ( $record = Maasha::Biopieces::get_record( $in ) )
1453 if ( $record->{ "REC_TYPE" } eq "BED" )
1455 $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "CHR" }, $record->{ "CHR_BEG" }, $record->{ "CHR_END" }, $record->{ "STRAND" } );
1457 elsif ( $record->{ "REC_TYPE" } eq "VMATCH" )
1459 $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" } + 1, $record->{ "STRAND" } );
1461 elsif ( $record->{ "REC_TYPE" } eq "PSL" )
1463 $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $record->{ "STRAND" } );
1465 elsif ( $record->{ "REC_TYPE" } eq "BLAST" )
1467 $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $record->{ "STRAND" } );
1470 foreach $entry ( @{ $align } )
1472 $entry->{ "CHR" } = $record->{ "CHR" };
1473 $entry->{ "CHR_BEG" } = $record->{ "CHR_BEG" };
1474 $entry->{ "CHR_END" } = $record->{ "CHR_END" };
1475 $entry->{ "STRAND" } = $record->{ "STRAND" };
1476 $entry->{ "Q_ID" } = $record->{ "Q_ID" };
1477 $entry->{ "SCORE" } = $record->{ "SCORE" };
1479 Maasha::Biopieces::put_record( $entry, $out );
1487 sub script_get_genome_phastcons
1489 # Martin A. Hansen, February 2008.
1491 # Get phastcons scores from genome intervals.
1493 my ( $in, # handle to in stream
1494 $out, # handle to out stream
1495 $options, # options hash
1500 my ( $phastcons_file, $phastcons_index, $index, $fh_phastcons, $scores, $record );
1502 $options->{ "flank" } ||= 0;
1504 $phastcons_file = Maasha::Config::genome_phastcons( $options->{ "genome" } );
1505 $phastcons_index = Maasha::Config::genome_phastcons_index( $options->{ "genome" } );
1507 $index = Maasha::UCSC::fixedstep_index_retrieve( $phastcons_index );
1508 $fh_phastcons = Maasha::Common::read_open( $phastcons_file );
1510 if ( defined $options->{ "chr" } and defined $options->{ "beg" } and ( defined $options->{ "end" } or defined $options->{ "len" } ) )
1512 $options->{ "beg" } -= 1; # request is 1-based
1513 $options->{ "end" } -= 1; # request is 1-based
1515 if ( $options->{ "len" } ) {
1516 $options->{ "end" } = $options->{ "beg" } + $options->{ "len" } - 1;
1519 $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $options->{ "chr" }, $options->{ "beg" }, $options->{ "end" }, $options->{ "flank" } );
1521 $record->{ "CHR" } = $options->{ "chr" };
1522 $record->{ "CHR_BEG" } = $options->{ "beg" } - $options->{ "flank" };
1523 $record->{ "CHR_END" } = $options->{ "end" } + $options->{ "flank" };
1525 $record->{ "PHASTCONS" } = join ",", @{ $scores };
1526 $record->{ "PHAST_COUNT" } = scalar @{ $scores }; # DEBUG
1528 Maasha::Biopieces::put_record( $record, $out );
1531 while ( $record = Maasha::Biopieces::get_record( $in ) )
1533 if ( $record->{ "REC_TYPE" } eq "BED" )
1535 $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "CHR" }, $record->{ "CHR_BEG" }, $record->{ "CHR_END" }, $options->{ "flank" } );
1537 elsif ( $record->{ "REC_TYPE" } eq "PSL" )
1539 $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $options->{ "flank" } );
1541 elsif ( $record->{ "REC_TYPE" } eq "BLAST" )
1543 $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $options->{ "flank" } );
1546 $record->{ "PHASTCONS" } = join ",", @{ $scores } if @{ $scores };
1547 # $record->{ "PHAST_COUNT" } = @{ $scores } if @{ $scores }; # DEBUG
1549 Maasha::Biopieces::put_record( $record, $out );
1552 close $fh_phastcons if $fh_phastcons;
1558 # Martin A. Hansen, July 2008.
1560 # soap sequences in stream against a given file or genome.
1562 my ( $in, # handle to in stream
1563 $out, # handle to out stream
1564 $options, # options hash
1569 my ( $genome, $tmp_in, $tmp_out, $fh_in, $fh_out, $record, $line, @fields, $entry, $count, $args );
1571 $options->{ "seed_size" } ||= 10;
1572 $options->{ "mismatches" } ||= 2;
1573 $options->{ "gap_size" } ||= 0;
1574 $options->{ "cpus" } ||= 1;
1576 if ( $options->{ "genome" } ) {
1577 $options->{ "in_file" } = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/fasta/$options->{ 'genome' }.fna";
1580 $tmp_in = "$BP_TMP/soap_query.seq";
1581 $tmp_out = "$BP_TMP/soap.result";
1583 $fh_out = Maasha::Common::write_open( $tmp_in );
1587 while ( $record = Maasha::Biopieces::get_record( $in ) )
1589 if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) )
1591 Maasha::Fasta::put_entry( $entry, $fh_out );
1596 Maasha::Biopieces::put_record( $record, $out );
1604 "-s $options->{ 'seed_size' }",
1607 "-v $options->{ 'mismatches' }",
1608 "-g $options->{ 'gap_size' }",
1609 "-p $options->{ 'cpus' }",
1610 "-d $options->{ 'in_file' }",
1614 $args .= " > /dev/null 2>&1" if not $options->{ 'verbose' };
1616 Maasha::Common::run( "soap", $args, 1 );
1620 $fh_out = Maasha::Common::read_open( $tmp_out );
1624 while ( $line = <$fh_out> )
1628 @fields = split /\t/, $line;
1630 $record->{ "REC_TYPE" } = "SOAP";
1631 $record->{ "Q_ID" } = $fields[ 0 ];
1632 $record->{ "SCORE" } = $fields[ 3 ];
1633 $record->{ "STRAND" } = $fields[ 6 ];
1634 $record->{ "S_ID" } = $fields[ 7 ];
1635 $record->{ "S_BEG" } = $fields[ 8 ] - 1; # soap is 1-based
1636 $record->{ "S_END" } = $fields[ 8 ] + $fields[ 5 ] - 2;
1638 Maasha::Biopieces::put_record( $record, $out );
1648 sub script_write_bed
1650 # Martin A. Hansen, August 2007.
1652 # Write BED format for the UCSC genome browser using records in stream.
1654 my ( $in, # handle to in stream
1655 $out, # handle to out stream
1656 $options, # options hash
1661 my ( $cols, $fh, $record, $bed_entry, $new_record );
1663 $cols = $options->{ 'cols' }->[ 0 ];
1665 $fh = Maasha::Biopieces::write_stream( $options->{ 'data_out' }, $options->{ 'compress' } );
1667 while ( $record = Maasha::Biopieces::get_record( $in ) )
1669 $record = Maasha::UCSC::psl2record( $record ) if $record->{ 'tBaseInsert' }; # Dirty addition to allow Affy data from MySQL to be dumped
1671 if ( $bed_entry = Maasha::UCSC::BED::biopiece2bed( $record, $cols ) ) {
1672 Maasha::UCSC::BED::bed_entry_put( $bed_entry, $fh, $cols, $options->{ 'check' } );
1675 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ 'no_stream' };
1682 sub script_write_psl
1684 # Martin A. Hansen, August 2007.
1686 # Write PSL output from stream.
1688 my ( $in, # handle to in stream
1689 $out, # handle to out stream
1690 $options, # options hash
1695 my ( $fh, $record, @output, $first );
1699 $fh = Maasha::Biopieces::write_stream( $options->{ "data_out" }, $options->{ "compress" } );
1701 while ( $record = Maasha::Biopieces::get_record( $in ) )
1703 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
1705 if ( $record->{ "REC_TYPE" } and $record->{ "REC_TYPE" } eq "PSL" )
1707 Maasha::UCSC::psl_put_header( $fh ) if $first;
1708 Maasha::UCSC::psl_put_entry( $record, $fh );
1717 sub script_write_fixedstep
1719 # Martin A. Hansen, Juli 2008.
1721 # Write fixedStep entries from recrods in the stream.
1723 my ( $in, # handle to in stream
1724 $out, # handle to out stream
1725 $options, # options hash
1730 my ( $fh, $record, $entry );
1732 $fh = Maasha::Biopieces::write_stream( $options->{ "data_out" }, $options->{ "compress" } );
1734 while ( $record = Maasha::Biopieces::get_record( $in ) )
1736 if ( $entry = Maasha::UCSC::Wiggle::biopiece2fixedstep( $record ) ) {
1737 Maasha::UCSC::Wiggle::fixedstep_entry_put( $entry, $fh );
1740 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
1747 sub script_write_2bit
1749 # Martin A. Hansen, March 2008.
1751 # Write sequence entries from stream in 2bit format.
1753 my ( $in, # handle to in stream
1754 $out, # handle to out stream
1755 $options, # options hash
1760 my ( $record, $mask, $tmp_file, $fh_tmp, $fh_in, $fh_out, $entry );
1762 $mask = 1 if not $options->{ "no_mask" };
1764 $tmp_file = "$BP_TMP/write_2bit.fna";
1765 $fh_tmp = Maasha::Common::write_open( $tmp_file );
1767 $fh_out = Maasha::Biopieces::write_stream( $options->{ "data_out" } );
1769 while ( $record = Maasha::Biopieces::get_record( $in ) )
1771 if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) {
1772 Maasha::Fasta::put_entry( $entry, $fh_tmp );
1775 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
1780 $fh_in = Maasha::Common::read_open( $tmp_file );
1782 Maasha::TwoBit::fasta2twobit( $fh_in, $fh_out, $mask );
1791 sub script_write_solid
1793 # Martin A. Hansen, April 2008.
1795 # Write di-base encoded Solid sequence from entries in stream.
1797 my ( $in, # handle to in stream
1798 $out, # handle to out stream
1799 $options, # options hash
1804 my ( $record, $fh, $entry );
1806 $fh = Maasha::Biopieces::write_stream( $options->{ "data_out" }, $options->{ "compress" } );
1808 while ( $record = Maasha::Biopieces::get_record( $in ) )
1810 if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) )
1812 $entry->[ SEQ ] = Maasha::Solid::seq2color_space( uc $entry->[ SEQ ] );
1814 Maasha::Fasta::put_entry( $entry, $fh, $options->{ "wrap" } );
1817 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
1824 sub script_write_ucsc_config
1826 # Martin A. Hansen, November 2008.
1828 # Write UCSC Genome Broser configuration (.ra file type) from
1829 # records in the stream.
1831 my ( $in, # handle to in stream
1832 $out, # handle to out stream
1833 $options, # options hash
1838 my ( $record, $fh );
1840 $fh = Maasha::Biopieces::write_stream( $options->{ "data_out" } );
1842 while ( $record = Maasha::Biopieces::get_record( $in ) )
1844 Maasha::UCSC::ucsc_config_entry_put( $record, $fh ) if $record->{ "REC_TYPE" } eq "UCSC Config";
1846 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
1853 sub script_plot_seqlogo
1855 # Martin A. Hansen, August 2007.
1857 # Calculates and writes a sequence logo for alignments.
1859 my ( $in, # handle to in stream
1860 $out, # handle to out stream
1861 $options, # options hash
1866 my ( $record, @entries, $logo, $fh );
1868 while ( $record = Maasha::Biopieces::get_record( $in ) )
1870 if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) {
1871 push @entries, [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ];
1874 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
1877 $logo = Maasha::Plot::seq_logo( \@entries );
1879 $fh = Maasha::Biopieces::write_stream( $options->{ "data_out" } );
1887 sub script_plot_phastcons_profiles
1889 # Martin A. Hansen, January 2008.
1891 # Plots PhastCons profiles.
1893 my ( $in, # handle to in stream
1894 $out, # handle to out stream
1895 $options, # options hash
1900 my ( $phastcons_file, $phastcons_index, $index, $fh_phastcons, $record, $scores, $AoA, $plot, $fh );
1902 $options->{ "title" } ||= "PhastCons Profiles";
1904 $phastcons_file = Maasha::Config::genome_phastcons( $options->{ "genome" } );
1905 $phastcons_index = Maasha::Config::genome_phastcons_index( $options->{ "genome" } );
1907 $index = Maasha::UCSC::fixedstep_index_retrieve( $phastcons_index );
1908 $fh_phastcons = Maasha::Common::read_open( $phastcons_file );
1910 while ( $record = Maasha::Biopieces::get_record( $in ) )
1912 if ( $record->{ "CHR" } and $record->{ "CHR_BEG" } and $record->{ "CHR_END" } )
1914 $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "CHR" },
1915 $record->{ "CHR_BEG" },
1916 $record->{ "CHR_END" },
1917 $options->{ "flank" } );
1919 push @{ $AoA }, [ @{ $scores } ];
1922 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
1925 Maasha::UCSC::phastcons_normalize( $AoA );
1927 $AoA = [ [ Maasha::UCSC::phastcons_mean( $AoA ) ] ] if $options->{ "mean" };
1928 $AoA = [ [ Maasha::UCSC::phastcons_median( $AoA ) ] ] if $options->{ "median" };
1930 $AoA = Maasha::Matrix::matrix_flip( $AoA );
1932 $plot = Maasha::Plot::lineplot_simple( $AoA, $options, $BP_TMP );
1934 $fh = Maasha::Biopieces::write_stream( $options->{ "data_out" } );
1936 print $fh "$_\n" foreach @{ $plot };
1942 sub script_remove_mysql_tables
1944 # Martin A. Hansen, November 2008.
1946 # Remove MySQL tables from values in stream.
1948 my ( $in, # handle to in stream
1949 $out, # handle to out stream
1950 $options, # options hash
1955 my ( $record, %table_hash, $dbh, $table );
1957 $options->{ "user" } ||= Maasha::UCSC::ucsc_get_user();
1958 $options->{ "password" } ||= Maasha::UCSC::ucsc_get_password();
1960 map { $table_hash{ $_ } = 1 } @{ $options->{ 'tables' } };
1962 while ( $record = Maasha::Biopieces::get_record( $in ) )
1964 map { $table_hash{ $record->{ $_ } } = 1 } @{ $options->{ 'keys' } };
1966 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ 'no_stream' };
1969 $dbh = Maasha::SQL::connect( $options->{ "database" }, $options->{ "user" }, $options->{ "password" } );
1971 foreach $table ( sort keys %table_hash )
1973 if ( Maasha::SQL::table_exists( $dbh, $table ) )
1975 print STDERR qq(Removing table "$table" from database "$options->{ 'database' }" ... ) if $options->{ 'verbose' };
1976 Maasha::SQL::delete_table( $dbh, $table );
1977 print STDERR "done.\n" if $options->{ 'verbose' };
1981 print STDERR qq(WARNING: table "$table" not found in database "$options->{ 'database' }\n");
1985 Maasha::SQL::disconnect( $dbh );
1989 sub script_remove_ucsc_tracks
1991 # Martin A. Hansen, November 2008.
1993 # Remove track from MySQL tables and config file.
1995 my ( $in, # handle to in stream
1996 $out, # handle to out stream
1997 $options, # options hash
2002 my ( $record, %track_hash, $fh_in, $fh_out, $track, @tracks, @new_tracks, $dbh );
2004 $options->{ 'user' } ||= Maasha::UCSC::ucsc_get_user();
2005 $options->{ 'password' } ||= Maasha::UCSC::ucsc_get_password();
2006 $options->{ 'config_file' } ||= "$ENV{ 'HOME' }/ucsc/my_tracks.ra";
2008 map { $track_hash{ $_ } = 1 } @{ $options->{ 'tracks' } };
2010 while ( $record = Maasha::Biopieces::get_record( $in ) )
2012 map { $track_hash{ $record->{ $_ } } = 1 } @{ $options->{ 'keys' } };
2014 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ 'no_stream' };
2017 $fh_in = Maasha::Filesys::file_read_open( $options->{ 'config_file' } );
2019 while ( $track = Maasha::UCSC::ucsc_config_entry_get( $fh_in ) ) {
2020 push @tracks, $track;
2025 foreach $track ( @tracks )
2027 if ( $track->{ 'database' } eq $options->{ 'database' } and exists $track_hash{ $track->{ 'track' } } ) {
2028 print STDERR qq(Removing track "$track->{ 'track' }" from config file.\n) if $options->{ 'verbose' };
2030 push @new_tracks, $track;
2034 rename "$options->{ 'config_file' }", "$options->{ 'config_file' }~";
2036 $fh_out = Maasha::Filesys::file_write_open( $options->{ 'config_file' } );
2038 map { Maasha::UCSC::ucsc_config_entry_put( $_, $fh_out ) } @new_tracks;
2042 # ---- locate track in database ----
2044 $dbh = Maasha::SQL::connect( $options->{ "database" }, $options->{ "user" }, $options->{ "password" } );
2046 foreach $track ( sort keys %track_hash )
2048 if ( Maasha::SQL::table_exists( $dbh, $track ) )
2050 print STDERR qq(Removing table "$track" from database "$options->{ 'database' }" ... ) if $options->{ 'verbose' };
2051 Maasha::SQL::delete_table( $dbh, $track );
2052 print STDERR "done.\n" if $options->{ 'verbose' };
2056 print STDERR qq(WARNING: table "$track" not found in database "$options->{ 'database' }\n");
2060 Maasha::SQL::disconnect( $dbh );
2062 Maasha::Common::run( "ucscMakeTracks.pl", "-b > /dev/null 2>&1" );
2066 sub script_plot_matches
2068 # Martin A. Hansen, August 2007.
2070 # Plot matches in 2D generating a dotplot.
2072 my ( $in, # handle to in stream
2073 $out, # handle to out stream
2074 $options, # options hash
2079 my ( $record, @data, $fh, $result, %data_hash );
2081 $options->{ "direction" } ||= "both";
2083 while ( $record = Maasha::Biopieces::get_record( $in ) )
2085 if ( defined $record->{ "Q_BEG" } and defined $record->{ "S_BEG" } and $record->{ "Q_END" } and $record->{ "S_END" } ) {
2086 push @data, $record;
2089 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
2092 $options->{ "title" } ||= "plot_matches";
2093 $options->{ "xlabel" } ||= $data[ 0 ]->{ "Q_ID" };
2094 $options->{ "ylabel" } ||= $data[ 0 ]->{ "S_ID" };
2096 $result = Maasha::Plot::dotplot_matches( \@data, $options, $BP_TMP );
2098 $fh = Maasha::Biopieces::write_stream( $options->{ "data_out" } );
2100 print $fh "$_\n" foreach @{ $result };
2106 sub script_upload_to_ucsc
2108 # Martin A. Hansen, August 2007.
2110 # Calculate the mean of values of given keys.
2112 # This routine has developed into the most ugly hack. Do something!
2114 my ( $in, # handle to in stream
2115 $out, # handle to out stream
2116 $options, # options hash
2121 my ( $record, $file, $wib_file, $wig_file, $wib_dir, $fh_out, $i, $first, $format, $type, $columns, $append, $entry );
2123 $options->{ "short_label" } ||= $options->{ 'table' };
2124 $options->{ "long_label" } ||= $options->{ 'table' };
2125 $options->{ "group" } ||= $ENV{ "LOGNAME" };
2126 $options->{ "priority" } ||= 1;
2127 $options->{ "visibility" } ||= "pack";
2128 $options->{ "color" } ||= join( ",", int( rand( 255 ) ), int( rand( 255 ) ), int( rand( 255 ) ) );
2129 $options->{ "chunk_size" } ||= 10_000_000_000; # Due to 32-bit UCSC compilation really large tables cannot be loaded in one go.
2131 $file = "$BP_TMP/ucsc_upload.tmp";
2136 $fh_out = Maasha::Common::write_open( $file );
2138 while ( $record = Maasha::Biopieces::get_record( $in ) )
2140 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
2142 if ( $record->{ "REC_TYPE" } eq "fixed_step" )
2146 if ( $entry = Maasha::UCSC::Wiggle::biopiece2fixedstep( $record ) ) {
2147 Maasha::UCSC::Wiggle::fixedstep_entry_put( $entry, $fh_out );
2150 elsif ( $record->{ "REC_TYPE" } eq "PSL" )
2154 Maasha::UCSC::psl_put_header( $fh_out ) if $first;
2155 Maasha::UCSC::psl_put_entry( $record, $fh_out );
2159 elsif ( $record->{ "REC_TYPE" } eq "BED" and $record->{ "SEC_STRUCT" } )
2161 # chrom chromStart chromEnd name score strand size secStr conf
2165 print $fh_out join ( "\t",
2167 $record->{ "CHR_BEG" },
2168 $record->{ "CHR_END" } + 1,
2169 $record->{ "Q_ID" },
2170 $record->{ "SCORE" },
2171 $record->{ "STRAND" },
2172 $record->{ "SIZE" },
2173 $record->{ "SEC_STRUCT" },
2174 $record->{ "CONF" },
2177 elsif ( $record->{ "REC_TYPE" } eq "BED" )
2180 $columns = $record->{ "BED_COLS" };
2182 if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) {
2183 Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns, $options->{ 'check' } );
2186 elsif ( $record->{ "REC_TYPE" } eq "PATSCAN" and $record->{ "CHR" } )
2191 if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) {
2192 Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns, $options->{ 'check' } );
2195 elsif ( $record->{ "REC_TYPE" } eq "BLAST" and $record->{ "S_ID" } =~ /^chr/ )
2200 $record->{ "SCORE" } = $record->{ "BIT_SCORE" } * 1000;
2202 if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) {
2203 Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns, $options->{ 'check' } );
2206 elsif ( $record->{ "REC_TYPE" } eq "VMATCH" and $record->{ "S_ID" } =~ /^chr/i )
2211 if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) {
2212 Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns, $options->{ 'check' } );
2216 if ( $i == $options->{ "chunk_size" } )
2220 if ( $format eq "BED" ) {
2221 Maasha::UCSC::bed_upload_to_ucsc( $BP_TMP, $file, $options, $append );
2222 } elsif ( $format eq "PSL" ) {
2223 Maasha::UCSC::psl_upload_to_ucsc( $file, $options, $append );
2232 $fh_out = Maasha::Common::write_open( $file );
2240 if ( exists $options->{ "database" } and $options->{ "table" } )
2242 if ( $format eq "BED" )
2244 $type = "bed $columns";
2246 Maasha::UCSC::bed_upload_to_ucsc( $BP_TMP, $file, $options, $append );
2248 elsif ( $format eq "BED_SS" )
2250 $type = "type bed 6 +";
2252 Maasha::UCSC::bed_upload_to_ucsc( $BP_TMP, $file, $options, $append );
2254 elsif ( $format eq "PSL" )
2258 Maasha::UCSC::psl_upload_to_ucsc( $file, $options, $append );
2260 elsif ( $format eq "WIGGLE" )
2262 $options->{ "visibility" } = "full";
2264 $wig_file = "$options->{ 'table' }.wig";
2265 $wib_file = "$options->{ 'table' }.wib";
2267 $wib_dir = "$ENV{ 'HOME' }/ucsc/wib";
2269 Maasha::Common::dir_create_if_not_exists( $wib_dir );
2271 if ( $options->{ 'verbose' } ) {
2272 `cd $BP_TMP && wigEncode $file $wig_file $wib_file`;
2274 `cd $BP_TMP && wigEncode $file $wig_file $wib_file > /dev/null 2>&1`;
2277 Maasha::Common::run( "mv", "$BP_TMP/$wib_file $wib_dir" );
2285 Maasha::UCSC::wiggle_upload_to_ucsc( $BP_TMP, $wib_dir, $file, $options );
2290 Maasha::UCSC::ucsc_update_config( $options, $type );
2295 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<