1 package Maasha::BioRun;
4 # Copyright (C) 2007-2009 Martin A. Hansen.
6 # This program is free software; you can redistribute it and/or
7 # modify it under the terms of the GNU General Public License
8 # as published by the Free Software Foundation; either version 2
9 # of the License, or (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 # http://www.gnu.org/copyleft/gpl.html
23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
26 # Routines that contains Biopieces which are run.
29 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
34 use Getopt::Long qw( :config bundling );
35 use Time::HiRes qw( gettimeofday );
36 use Storable qw( dclone );
37 use Maasha::Biopieces;
46 use Maasha::Stockholm;
52 use Maasha::UCSC::BED;
53 use Maasha::UCSC::Wiggle;
62 use vars qw( @ISA @EXPORT_OK );
66 @ISA = qw( Exporter );
74 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> GLOBALS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
77 my ( $script, $BP_TMP );
79 $script = Maasha::Common::get_scriptname();
80 $BP_TMP = Maasha::Common::get_tmpdir();
83 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> RUN SCRIPT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
86 run_script( $script );
89 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SUBROUTINES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
94 # Martin A. Hansen, August 2007.
96 # Run a specific script.
98 my ( $script, # script name
103 my ( $t0, $t1, $options, $in, $out );
105 Maasha::Biopieces::log_biopiece();
107 $t0 = gettimeofday();
109 $options = get_options( $script );
111 $options->{ "SCRIPT" } = $script;
113 $script = "print_usage" if ( -t STDIN and keys %{ $options } <= 1 or $options->{ 'help' } );
115 $in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } );
116 $out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } );
118 if ( $script eq "print_usage" ) { script_print_usage( $in, $out, $options ) }
119 elsif ( $script eq "read_psl" ) { script_read_psl( $in, $out, $options ) }
120 elsif ( $script eq "read_bed" ) { script_read_bed( $in, $out, $options ) }
121 elsif ( $script eq "read_fixedstep" ) { script_read_fixedstep( $in, $out, $options ) }
122 elsif ( $script eq "read_blast_tab" ) { script_read_blast_tab( $in, $out, $options ) }
123 elsif ( $script eq "read_embl" ) { script_read_embl( $in, $out, $options ) }
124 elsif ( $script eq "read_stockholm" ) { script_read_stockholm( $in, $out, $options ) }
125 elsif ( $script eq "read_phastcons" ) { script_read_phastcons( $in, $out, $options ) }
126 elsif ( $script eq "read_soft" ) { script_read_soft( $in, $out, $options ) }
127 elsif ( $script eq "read_gff" ) { script_read_gff( $in, $out, $options ) }
128 elsif ( $script eq "read_2bit" ) { script_read_2bit( $in, $out, $options ) }
129 elsif ( $script eq "read_solexa" ) { script_read_solexa( $in, $out, $options ) }
130 elsif ( $script eq "read_solid" ) { script_read_solid( $in, $out, $options ) }
131 elsif ( $script eq "read_mysql" ) { script_read_mysql( $in, $out, $options ) }
132 elsif ( $script eq "read_ucsc_config" ) { script_read_ucsc_config( $in, $out, $options ) }
133 elsif ( $script eq "uppercase_seq" ) { script_uppercase_seq( $in, $out, $options ) }
134 elsif ( $script eq "complexity_seq" ) { script_complexity_seq( $in, $out, $options ) }
135 elsif ( $script eq "get_genome_align" ) { script_get_genome_align( $in, $out, $options ) }
136 elsif ( $script eq "get_genome_phastcons" ) { script_get_genome_phastcons( $in, $out, $options ) }
137 elsif ( $script eq "soap_seq" ) { script_soap_seq( $in, $out, $options ) }
138 elsif ( $script eq "write_bed" ) { script_write_bed( $in, $out, $options ) }
139 elsif ( $script eq "write_psl" ) { script_write_psl( $in, $out, $options ) }
140 elsif ( $script eq "write_fixedstep" ) { script_write_fixedstep( $in, $out, $options ) }
141 elsif ( $script eq "write_2bit" ) { script_write_2bit( $in, $out, $options ) }
142 elsif ( $script eq "write_solid" ) { script_write_solid( $in, $out, $options ) }
143 elsif ( $script eq "write_ucsc_config" ) { script_write_ucsc_config( $in, $out, $options ) }
144 elsif ( $script eq "remove_mysql_tables" ) { script_remove_mysql_tables( $in, $out, $options ) }
145 elsif ( $script eq "remove_ucsc_tracks" ) { script_remove_ucsc_tracks( $in, $out, $options ) }
146 elsif ( $script eq "plot_histogram" ) { script_plot_histogram( $in, $out, $options ) }
147 elsif ( $script eq "plot_lendist" ) { script_plot_lendist( $in, $out, $options ) }
148 elsif ( $script eq "plot_chrdist" ) { script_plot_chrdist( $in, $out, $options ) }
149 elsif ( $script eq "plot_karyogram" ) { script_plot_karyogram( $in, $out, $options ) }
150 elsif ( $script eq "plot_matches" ) { script_plot_matches( $in, $out, $options ) }
151 elsif ( $script eq "plot_seqlogo" ) { script_plot_seqlogo( $in, $out, $options ) }
152 elsif ( $script eq "plot_phastcons_profiles" ) { script_plot_phastcons_profiles( $in, $out, $options ) }
153 elsif ( $script eq "upload_to_ucsc" ) { script_upload_to_ucsc( $in, $out, $options ) }
155 close $in if defined $in;
158 $t1 = gettimeofday();
160 print STDERR "Program: $script" . ( " " x ( 25 - length( $script ) ) ) . sprintf( "Run time: %.4f\n", ( $t1 - $t0 ) ) if $options->{ 'verbose' };
166 # Martin A. Hansen, February 2008.
168 # Gets options from commandline and checks these vigerously.
170 my ( $script, # name of script
175 my ( %options, @options, $opt, @genomes, $real );
177 if ( $script eq "print_usage" )
183 elsif ( $script eq "read_psl" )
190 elsif ( $script eq "read_bed" )
199 elsif ( $script eq "read_fixedstep" )
206 elsif ( $script eq "read_blast_tab" )
213 elsif ( $script eq "read_embl" )
223 elsif ( $script eq "read_stockholm" )
230 elsif ( $script eq "read_phastcons" )
241 elsif ( $script eq "read_soft" )
249 elsif ( $script eq "read_gff" )
256 elsif ( $script eq "read_2bit" )
264 elsif ( $script eq "read_solexa" )
273 elsif ( $script eq "read_solid" )
281 elsif ( $script eq "read_mysql" )
290 elsif ( $script eq "read_ucsc_config" )
297 elsif ( $script eq "get_genome_align" )
308 elsif ( $script eq "get_genome_phastcons" )
319 elsif ( $script eq "soap_seq" )
330 elsif ( $script eq "write_bed" )
340 elsif ( $script eq "write_psl" )
348 elsif ( $script eq "write_fixedstep" )
356 elsif ( $script eq "write_2bit" )
364 elsif ( $script eq "write_solid" )
373 elsif ( $script eq "write_ucsc_config" )
380 elsif ( $script eq "plot_seqlogo" )
387 elsif ( $script eq "plot_phastcons_profiles" )
402 elsif ( $script eq "remove_mysql_tables" )
413 elsif ( $script eq "remove_ucsc_tracks" )
425 elsif ( $script eq "plot_histogram" )
438 elsif ( $script eq "plot_lendist" )
450 elsif ( $script eq "plot_chrdist" )
461 elsif ( $script eq "plot_karyogram" )
470 elsif ( $script eq "plot_matches" )
482 elsif ( $script eq "upload_to_ucsc" )
506 # print STDERR Dumper( \@options );
513 # print STDERR Dumper( \%options );
515 if ( -t STDIN && scalar( keys %options ) == 0 or $options{ "help" } ) {
516 return wantarray ? %options : \%options;
519 $options{ "cols" } = [ split ",", $options{ "cols" } ] if defined $options{ "cols" };
520 $options{ "keys" } = [ split ",", $options{ "keys" } ] if defined $options{ "keys" };
521 $options{ "no_keys" } = [ split ",", $options{ "no_keys" } ] if defined $options{ "no_keys" };
522 $options{ "save_keys" } = [ split ",", $options{ "save_keys" } ] if defined $options{ "save_keys" };
523 $options{ "quals" } = [ split ",", $options{ "quals" } ] if defined $options{ "quals" };
524 $options{ "feats" } = [ split ",", $options{ "feats" } ] if defined $options{ "feats" };
525 $options{ "frames" } = [ split ",", $options{ "frames" } ] if defined $options{ "frames" };
526 $options{ "samples" } = [ split ",", $options{ "samples" } ] if defined $options{ "samples" };
527 $options{ "tables" } = [ split ",", $options{ "tables" } ] if defined $options{ "tables" };
528 $options{ "tracks" } = [ split ",", $options{ "tracks" } ] if defined $options{ "tracks" };
530 # ---- check arguments ----
532 if ( $options{ 'data_in' } )
534 $options{ "files" } = Maasha::Biopieces::getopt_files( $options{ 'data_in' } );
536 Maasha::Common::error( qq(Argument to --data_in must be a valid file or fileglob expression) ) if scalar @{ $options{ "files" } } == 0;
539 map { Maasha::Common::error( qq(Argument to --cols must be a whole numbers - not "$_") ) if $_ !~ /^\d+$/ } @{ $options{ "cols" } } if $options{ "cols" };
541 # print STDERR Dumper( \%options );
543 $real = "beg|end|word_size|wrap|len|prefix_length|mismatches|offset|num|skip|cpus|window_size|step_size";
545 foreach $opt ( keys %options )
547 if ( $opt =~ /stream_in|pattern_in|exact_in/ and not -f $options{ $opt } )
549 Maasha::Common::error( qq(Argument to --$opt must be a valid file or fileglob expression - not "$options{ $opt }") );
551 elsif ( $opt =~ /$real/ and $options{ $opt } !~ /^\d+$/ )
553 Maasha::Common::error( qq(Argument to --$opt must be a whole number - not "$options{ $opt }") );
555 elsif ( $opt =~ /max_hits|max_hits|max_misses|dist|edit_dist|flank|gap|hamming_dist|priority/ and $options{ $opt } !~ /^-?\d+$/ )
557 Maasha::Common::error( qq(Argument to --$opt must be an integer - not "$options{ $opt }") );
559 elsif ( $opt =~ /identity|threshold/ and $options{ $opt } !~ /^-?(?:\d+(?:\.\d*)?|\.\d+)$/ )
561 Maasha::Common::error( qq(Argument to --$opt must be a decimal number - not "$options{ $opt }") );
563 elsif ( $opt =~ /e_val/ and $options{ $opt } !~ /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/ )
565 Maasha::Common::error( qq(Argument to --$opt must be a float - not "$options{ $opt }") );
567 elsif ( $opt =~ /strand/ and $options{ $opt } !~ /^(\+|-)$/ )
569 Maasha::Common::error( qq(Argument to --$opt must be "+" or "-" - not "$options{ $opt }") );
571 elsif ( $opt eq "genome" )
573 @genomes = Maasha::Common::ls_dirs( "$ENV{ 'BP_DATA' }/genomes" );
574 map { $_ =~ s/.*\/(.+)$/$1/ } @genomes;
576 if ( not grep { $_ =~ /^$options{ $opt }$/ } @genomes ) {
577 Maasha::Common::error( qq(Genome $options{ $opt } not found in "$ENV{ 'BP_DATA' }/genomes/") );
580 elsif ( $opt eq "terminal" and not $options{ $opt } =~ /^(svg|post|dumb|x11)/ )
582 Maasha::Common::error( qq(Bad --$opt argument "$options{ $opt }") );
584 elsif ( $opt eq "table" and $options{ $opt } =~ /(-|\.)/ )
586 Maasha::Common::error( qq(Character '$1' is not allowed in table name: $options{ $opt }) );
588 elsif ( $opt eq "merge" and $options{ $opt } !~ /^(AandB|AorB|BorA|AnotB|BnotA)$/ )
590 Maasha::Common::error( qq(Argument to --$opt must be AandB, AorB, BorA, AnotB, or BnotA - not "$options{ $opt }") );
592 elsif ( $opt eq "format" and $script eq "read_solexa" and $options{ $opt } !~ /octal|decimal/ )
594 Maasha::Common::error( qq(Argument to --$opt must be octal or decimal - not "$options{ $opt }") );
598 Maasha::Common::error( qq(no --database specified) ) if $script eq "remove_ucsc_tracks" and not $options{ "database" };
599 Maasha::Common::error( qq(no --in_file or --genome specified) ) if $script eq "soap_seq" and not $options{ "genome" } and not $options{ "in_file" };
600 Maasha::Common::error( qq(both --in_file and --genome specified) ) if $script eq "soap_seq" and $options{ "genome" } and $options{ "in_file" };
601 Maasha::Common::error( qq(no --genome specified) ) if $script =~ /get_genome_align|get_genome_phastcons|plot_phastcons_profiles|plot_karyogram/ and not $options{ "genome" };
602 Maasha::Common::error( qq(no --key specified) ) if $script =~ /plot_lendist|plot_histogram/ and not $options{ "key" };
604 if ( $script eq "upload_to_ucsc" )
606 Maasha::Common::error( qq(no --database specified) ) if not $options{ "database" };
607 Maasha::Common::error( qq(no --table specified) ) if not $options{ "table" };
610 return wantarray ? %options : \%options;
614 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SCRIPTS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
617 sub script_print_usage
619 # Martin A. Hansen, January 2008.
621 # Retrieves usage information from file and
622 # prints this nicely formatted.
624 my ( $in, # handle to in stream
625 $out, # handle to out stream
626 $options, # options hash
631 my ( $file, $wiki, $lines );
633 if ( $options->{ 'data_in' } ) {
634 $file = $options->{ 'data_in' };
636 $file = join "", $ENV{ 'BP_DIR' }, "/bp_usage/", $options->{ 'SCRIPT' }, ".wiki";
639 $wiki = Maasha::Gwiki::gwiki_read( $file );
641 ( $wiki->[ 2 ], $wiki->[ 3 ], $wiki->[ 0 ], $wiki->[ 1 ] ) = ( $wiki->[ 0 ], $wiki->[ 1 ], $wiki->[ 2 ], $wiki->[ 3 ] );
643 if ( not $options->{ "help" } ) {
644 @{ $wiki } = grep { $_->[ 0 ]->{ 'SECTION' } =~ /Biopiece|summary|Usage|Options|Help/ } @{ $wiki };
647 $lines = Maasha::Gwiki::gwiki2ascii( $wiki );
649 print STDERR "$_\n" foreach @{ $lines };
657 # Martin A. Hansen, August 2007.
659 # Read psl table from stream or file.
661 my ( $in, # handle to in stream
662 $out, # handle to out stream
663 $options, # options hash
668 my ( $record, $file, $data_in, $num );
670 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
671 Maasha::Biopieces::put_record( $record, $out );
676 foreach $file ( @{ $options->{ "files" } } )
678 $data_in = Maasha::Common::read_open( $file );
680 while ( $record = Maasha::UCSC::psl_get_entry( $data_in ) )
682 Maasha::Biopieces::put_record( $record, $out );
684 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
696 # Martin A. Hansen, August 2007.
698 # Read bed table from stream or file.
700 my ( $in, # handle to in stream
701 $out, # handle to out stream
702 $options, # options hash
707 my ( $cols, $file, $record, $bed_entry, $data_in, $num );
709 $cols = $options->{ 'cols' }->[ 0 ];
711 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
712 Maasha::Biopieces::put_record( $record, $out );
717 foreach $file ( @{ $options->{ "files" } } )
719 $data_in = Maasha::Common::read_open( $file );
721 while ( $bed_entry = Maasha::UCSC::BED::bed_entry_get( $data_in, $cols, $options->{ 'check' } ) )
723 $record = Maasha::UCSC::BED::bed2biopiece( $bed_entry );
725 Maasha::Biopieces::put_record( $record, $out );
727 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
737 close $data_in if $data_in;
741 sub script_read_fixedstep
743 # Martin A. Hansen, Juli 2008.
745 # Read fixedstep wiggle format from stream or file.
747 my ( $in, # handle to in stream
748 $out, # handle to out stream
749 $options, # options hash
754 my ( $file, $record, $entry, $data_in, $num );
756 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
757 Maasha::Biopieces::put_record( $record, $out );
762 foreach $file ( @{ $options->{ "files" } } )
764 $data_in = Maasha::Common::read_open( $file );
766 while ( $entry = Maasha::UCSC::Wiggle::fixedstep_entry_get( $data_in ) )
768 $record = Maasha::UCSC::Wiggle::fixedstep2biopiece( $entry );
770 Maasha::Biopieces::put_record( $record, $out );
772 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
782 close $data_in if $data_in;
786 sub script_read_blast_tab
788 # Martin A. Hansen, September 2007.
790 # Read tabular BLAST output from NCBI blast run with -m8 or -m9.
792 my ( $in, # handle to in stream
793 $out, # handle to out stream
794 $options, # options hash
799 my ( $file, $line, @fields, $strand, $record, $data_in, $num );
801 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
802 Maasha::Biopieces::put_record( $record, $out );
807 foreach $file ( @{ $options->{ "files" } } )
809 $data_in = Maasha::Common::read_open( $file );
811 while ( $line = <$data_in> )
815 next if $line =~ /^#/;
817 @fields = split /\t/, $line;
819 $record->{ "REC_TYPE" } = "BLAST";
820 $record->{ "Q_ID" } = $fields[ 0 ];
821 $record->{ "S_ID" } = $fields[ 1 ];
822 $record->{ "IDENT" } = $fields[ 2 ];
823 $record->{ "ALIGN_LEN" } = $fields[ 3 ];
824 $record->{ "MISMATCHES" } = $fields[ 4 ];
825 $record->{ "GAPS" } = $fields[ 5 ];
826 $record->{ "Q_BEG" } = $fields[ 6 ] - 1; # BLAST is 1-based
827 $record->{ "Q_END" } = $fields[ 7 ] - 1; # BLAST is 1-based
828 $record->{ "S_BEG" } = $fields[ 8 ] - 1; # BLAST is 1-based
829 $record->{ "S_END" } = $fields[ 9 ] - 1; # BLAST is 1-based
830 $record->{ "E_VAL" } = $fields[ 10 ];
831 $record->{ "BIT_SCORE" } = $fields[ 11 ];
833 if ( $record->{ "S_BEG" } > $record->{ "S_END" } )
835 $record->{ "STRAND" } = '-';
837 ( $record->{ "S_BEG" }, $record->{ "S_END" } ) = ( $record->{ "S_END" }, $record->{ "S_BEG" } );
841 $record->{ "STRAND" } = '+';
844 Maasha::Biopieces::put_record( $record, $out );
846 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
856 close $data_in if $data_in;
862 # Martin A. Hansen, August 2007.
866 my ( $in, # handle to in stream
867 $out, # handle to out stream
868 $options, # options hash
873 my ( %options2, $file, $data_in, $num, $entry, $record );
875 map { $options2{ "keys" }{ $_ } = 1 } @{ $options->{ "keys" } };
876 map { $options2{ "feats" }{ $_ } = 1 } @{ $options->{ "feats" } };
877 map { $options2{ "quals" }{ $_ } = 1 } @{ $options->{ "quals" } };
879 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
880 Maasha::Biopieces::put_record( $record, $out );
885 foreach $file ( @{ $options->{ "files" } } )
887 $data_in = Maasha::Common::read_open( $file );
889 while ( $entry = Maasha::EMBL::get_embl_entry( $data_in ) )
891 $record = Maasha::EMBL::parse_embl_entry( $entry, \%options2 );
893 my ( $feat, $feat2, $qual, $qual_val, $record_copy );
895 $record_copy = dclone $record;
897 delete $record_copy->{ "FT" };
899 Maasha::Biopieces::put_record( $record_copy, $out );
901 delete $record_copy->{ "SEQ" };
903 foreach $feat ( keys %{ $record->{ "FT" } } )
905 $record_copy->{ "FEAT_TYPE" } = $feat;
907 foreach $feat2 ( @{ $record->{ "FT" }->{ $feat } } )
909 foreach $qual ( keys %{ $feat2 } )
911 $qual_val = join "; ", @{ $feat2->{ $qual } };
916 $record_copy->{ $qual } = $qual_val;
919 Maasha::Biopieces::put_record( $record_copy, $out );
923 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
933 close $data_in if $data_in;
937 sub script_read_stockholm
939 # Martin A. Hansen, August 2007.
941 # Read Stockholm format.
943 my ( $in, # handle to in stream
944 $out, # handle to out stream
945 $options, # options hash
950 my ( $data_in, $file, $num, $entry, $record, $record_anno, $record_align, $key, $seq );
952 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
953 Maasha::Biopieces::put_record( $record, $out );
958 foreach $file ( @{ $options->{ "files" } } )
960 $data_in = Maasha::Common::read_open( $file );
962 while ( $entry = Maasha::Stockholm::get_stockholm_entry( $data_in ) )
964 $record = Maasha::Stockholm::parse_stockholm_entry( $entry );
968 foreach $key ( keys %{ $record->{ "GF" } } ) {
969 $record_anno->{ $key } = $record->{ "GF" }->{ $key };
972 $record_anno->{ "ALIGN" } = $num;
974 Maasha::Biopieces::put_record( $record_anno, $out );
976 foreach $seq ( @{ $record->{ "ALIGN" } } )
981 SEQ_NAME => $seq->[ 0 ],
985 Maasha::Biopieces::put_record( $record_align, $out );
988 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
998 close $data_in if $data_in;
1002 sub script_read_phastcons
1004 # Martin A. Hansen, December 2007.
1006 # Read PhastCons format.
1008 my ( $in, # handle to in stream
1009 $out, # handle to out stream
1010 $options, # options hash
1015 my ( $data_in, $file, $num, $entry, @records, $record );
1017 $options->{ "min" } ||= 10;
1018 $options->{ "dist" } ||= 25;
1019 $options->{ "threshold" } ||= 0.8;
1020 $options->{ "gap" } ||= 5;
1022 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1023 Maasha::Biopieces::put_record( $record, $out );
1028 foreach $file ( @{ $options->{ "files" } } )
1030 $data_in = Maasha::Common::read_open( $file );
1032 while ( $entry = Maasha::UCSC::fixedstep_get_entry( $data_in ) )
1034 @records = Maasha::UCSC::phastcons_parse_entry( $entry, $options );
1036 foreach $record ( @records )
1038 $record->{ "REC_TYPE" } = "BED";
1039 $record->{ "BED_LEN" } = $record->{ "CHR_END" } - $record->{ "CHR_BEG" } + 1;
1041 Maasha::Biopieces::put_record( $record, $out );
1043 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1054 close $data_in if $data_in;
1058 sub script_read_soft
1060 # Martin A. Hansen, December 2007.
1063 # http://www.ncbi.nlm.nih.gov/geo/info/soft2.html
1065 my ( $in, # handle to in stream
1066 $out, # handle to out stream
1067 $options, # options hash
1072 my ( $data_in, $file, $num, $records, $record, $soft_index, $fh, @platforms, $plat_table, @samples, $sample, $old_end, $skip );
1074 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1075 Maasha::Biopieces::put_record( $record, $out );
1080 foreach $file ( @{ $options->{ "files" } } )
1082 print STDERR "Creating index for file: $file\n" if $options->{ "verbose" };
1084 $soft_index = Maasha::NCBI::soft_index_file( $file );
1086 $fh = Maasha::Common::read_open( $file );
1088 @platforms = grep { $_->{ "SECTION" } =~ /PLATFORM/ } @{ $soft_index };
1090 print STDERR "Getting platform tables for file: $file\n" if $options->{ "verbose" };
1092 $plat_table = Maasha::NCBI::soft_get_platform( $fh, $platforms[ 0 ]->{ "LINE_BEG" }, $platforms[ -1 ]->{ "LINE_END" } );
1094 @samples = grep { $_->{ "SECTION" } =~ /SAMPLE/ } @{ $soft_index };
1096 $old_end = $platforms[ -1 ]->{ "LINE_END" };
1098 foreach $sample ( @samples )
1101 $skip = 1 if ( $options->{ "samples" } and grep { $sample->{ "SECTION" } !~ /$_/ } @{ $options->{ "samples" } } );
1103 print STDERR "Getting samples for dataset: $sample->{ 'SECTION' }\n" if $options->{ "verbose" } and not $skip;
1105 $records = Maasha::NCBI::soft_get_sample( $fh, $plat_table, $sample->{ "LINE_BEG" } - $old_end - 1, $sample->{ "LINE_END" } - $old_end - 1, $skip );
1107 foreach $record ( @{ $records } )
1109 Maasha::Biopieces::put_record( $record, $out );
1111 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1116 $old_end = $sample->{ "LINE_END" };
1124 close $data_in if $data_in;
1131 # Martin A. Hansen, February 2008.
1134 # http://www.ncbi.nlm.nih.gov/geo/info/soft2.html
1136 my ( $in, # handle to in stream
1137 $out, # handle to out stream
1138 $options, # options hash
1143 my ( $data_in, $file, $fh, $num, $record, $entry );
1145 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1146 Maasha::Biopieces::put_record( $record, $out );
1151 foreach $file ( @{ $options->{ "files" } } )
1153 $fh = Maasha::Common::read_open( $file );
1155 while ( $entry = Maasha::GFF::get_entry( $fh ) )
1157 Maasha::Biopieces::put_record( $entry, $out );
1159 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1169 close $data_in if $data_in;
1173 sub script_read_2bit
1175 # Martin A. Hansen, March 2008.
1177 # Read sequences from 2bit file.
1179 my ( $in, # handle to in stream
1180 $out, # handle to out stream
1181 $options, # options hash
1186 my ( $record, $file, $data_in, $mask, $toc, $line, $num );
1188 $mask = 1 if not $options->{ "no_mask" };
1190 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1191 Maasha::Biopieces::put_record( $record, $out );
1196 foreach $file ( @{ $options->{ "files" } } )
1198 $data_in = Maasha::Common::read_open( $file );
1200 $toc = Maasha::TwoBit::twobit_get_TOC( $data_in );
1202 foreach $line ( @{ $toc } )
1204 $record->{ "SEQ_NAME" } = $line->[ 0 ];
1205 $record->{ "SEQ" } = Maasha::TwoBit::twobit_get_seq( $data_in, $line->[ 1 ], undef, undef, $mask );
1206 $record->{ "SEQ_LEN" } = length $record->{ "SEQ" };
1208 Maasha::Biopieces::put_record( $record, $out );
1210 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1220 close $data_in if $data_in;
1224 sub script_read_solexa
1226 # Martin A. Hansen, March 2008.
1228 # Read Solexa sequence reads from file.
1230 my ( $in, # handle to in stream
1231 $out, # handle to out stream
1232 $options, # options hash
1237 my ( $record, $file, $data_in, $entry, $num, @seqs, @scores, $i );
1239 $options->{ "format" } ||= "octal";
1240 $options->{ "quality" } ||= 20;
1242 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1243 Maasha::Biopieces::put_record( $record, $out );
1248 foreach $file ( @{ $options->{ "files" } } )
1250 $data_in = Maasha::Common::read_open( $file );
1252 if ( $options->{ "format" } eq "octal" )
1254 while ( $entry = Maasha::Solexa::solexa_get_entry_octal( $data_in ) )
1256 $record = Maasha::Solexa::solexa2biopiece( $entry, $options->{ "quality" } );
1258 Maasha::Biopieces::put_record( $record, $out );
1260 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1267 while ( $entry = Maasha::Solexa::solexa_get_entry_decimal( $data_in ) )
1269 $record = Maasha::Solexa::solexa2biopiece( $entry, $options->{ "quality" } );
1271 Maasha::Biopieces::put_record( $record, $out );
1273 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1284 close $data_in if $data_in;
1288 sub script_read_solid
1290 # Martin A. Hansen, April 2008.
1292 # Read Solid sequence from file.
1294 my ( $in, # handle to in stream
1295 $out, # handle to out stream
1296 $options, # options hash
1301 my ( $record, $file, $data_in, $line, $num, $seq_name, $seq_cs, $seq_qual, @scores, @seqs, $i );
1303 $options->{ "quality" } ||= 15;
1305 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1306 Maasha::Biopieces::put_record( $record, $out );
1311 foreach $file ( @{ $options->{ "files" } } )
1313 $data_in = Maasha::Common::read_open( $file );
1315 while ( $line = <$data_in> )
1319 ( $seq_name, $seq_cs, $seq_qual ) = split /\t/, $line;
1321 @scores = split /,/, $seq_qual;
1322 @seqs = split //, Maasha::Solid::color_space2seq( $seq_cs );
1324 for ( $i = 0; $i < @seqs; $i++ ) {
1325 $seqs[ $i ] = lc $seqs[ $i ] if $scores[ $i ] < $options->{ "quality" };
1329 REC_TYPE => 'SOLID',
1330 SEQ_NAME => $seq_name,
1332 SEQ_QUAL => join( ";", @scores ),
1333 SEQ_LEN => length $seq_cs,
1334 SEQ => join( "", @seqs ),
1335 SCORE_MEAN => sprintf( "%.2f", Maasha::Calc::mean( \@scores ) ),
1338 Maasha::Biopieces::put_record( $record, $out );
1340 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1350 close $data_in if $data_in;
1354 sub script_read_mysql
1356 # Martin A. Hansen, May 2008.
1358 # Read a MySQL query into stream.
1360 my ( $in, # handle to in stream
1361 $out, # handle to out stream
1362 $options, # options hash
1367 my ( $record, $dbh, $results );
1369 $options->{ "user" } ||= Maasha::UCSC::ucsc_get_user();
1370 $options->{ "password" } ||= Maasha::UCSC::ucsc_get_password();
1372 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1373 Maasha::Biopieces::put_record( $record, $out );
1376 $dbh = Maasha::SQL::connect( $options->{ "database" }, $options->{ "user" }, $options->{ "password" } );
1378 $results = Maasha::SQL::query_hashref_list( $dbh, $options->{ "query" } );
1380 Maasha::SQL::disconnect( $dbh );
1382 map { Maasha::Biopieces::put_record( $_ ) } @{ $results };
1386 sub script_read_ucsc_config
1388 # Martin A. Hansen, November 2008.
1390 # Read track entries from UCSC Genome Browser '.ra' files.
1392 my ( $in, # handle to in stream
1393 $out, # handle to out stream
1394 $options, # options hash
1399 my ( $record, $file, $data_in, $entry, $num );
1401 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1402 Maasha::Biopieces::put_record( $record, $out );
1407 foreach $file ( @{ $options->{ "files" } } )
1409 $data_in = Maasha::Common::read_open( $file );
1411 while ( $record = Maasha::UCSC::ucsc_config_entry_get( $data_in ) )
1413 $record->{ 'REC_TYPE' } = "UCSC Config";
1415 Maasha::Biopieces::put_record( $record, $out );
1417 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1427 close $data_in if $data_in;
1431 sub script_complexity_seq
1433 # Martin A. Hansen, May 2008.
1435 # Generates an index calculated as the most common di-residue over
1436 # the sequence length for all sequences in stream.
1438 my ( $in, # handle to in stream
1439 $out, # handle to out stream
1444 my ( $record, $index );
1446 while ( $record = Maasha::Biopieces::get_record( $in ) )
1448 $record->{ "SEQ_COMPLEXITY" } = sprintf( "%.2f", Maasha::Seq::seq_complexity( $record->{ "SEQ" } ) ) if $record->{ "SEQ" };
1450 Maasha::Biopieces::put_record( $record, $out );
1455 sub script_get_genome_align
1457 # Martin A. Hansen, April 2008.
1459 # Gets a subalignment from a multiple genome alignment.
1461 my ( $in, # handle to in stream
1462 $out, # handle to out stream
1463 $options, # options hash
1468 my ( $record, $maf_track, $align, $align_num, $beg, $end, $len, $entry );
1470 $options->{ "strand" } ||= "+";
1474 $maf_track = Maasha::Config::maf_track( $options->{ "genome" } );
1476 if ( $options->{ "chr" } and $options->{ "beg" } and ( $options->{ "end" } or $options->{ "len" } ) )
1478 $beg = $options->{ "beg" } - 1;
1480 if ( $options->{ "end" } ) {
1481 $end = $options->{ "end" };
1482 } elsif ( $options->{ "len" } ) {
1483 $end = $beg + $options->{ "len" };
1486 $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $options->{ "chr" }, $beg, $end, $options->{ "strand" } );
1488 foreach $entry ( @{ $align } )
1490 $entry->{ "CHR" } = $record->{ "CHR" };
1491 $entry->{ "CHR_BEG" } = $record->{ "CHR_BEG" };
1492 $entry->{ "CHR_END" } = $record->{ "CHR_END" };
1493 $entry->{ "STRAND" } = $record->{ "STRAND" } || '+';
1494 $entry->{ "Q_ID" } = $record->{ "Q_ID" };
1495 $entry->{ "SCORE" } = $record->{ "SCORE" };
1497 Maasha::Biopieces::put_record( $entry, $out );
1501 while ( $record = Maasha::Biopieces::get_record( $in ) )
1503 if ( $record->{ "REC_TYPE" } eq "BED" )
1505 $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "CHR" }, $record->{ "CHR_BEG" }, $record->{ "CHR_END" }, $record->{ "STRAND" } );
1507 elsif ( $record->{ "REC_TYPE" } eq "VMATCH" )
1509 $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" } + 1, $record->{ "STRAND" } );
1511 elsif ( $record->{ "REC_TYPE" } eq "PSL" )
1513 $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $record->{ "STRAND" } );
1515 elsif ( $record->{ "REC_TYPE" } eq "BLAST" )
1517 $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $record->{ "STRAND" } );
1520 foreach $entry ( @{ $align } )
1522 $entry->{ "CHR" } = $record->{ "CHR" };
1523 $entry->{ "CHR_BEG" } = $record->{ "CHR_BEG" };
1524 $entry->{ "CHR_END" } = $record->{ "CHR_END" };
1525 $entry->{ "STRAND" } = $record->{ "STRAND" };
1526 $entry->{ "Q_ID" } = $record->{ "Q_ID" };
1527 $entry->{ "SCORE" } = $record->{ "SCORE" };
1529 Maasha::Biopieces::put_record( $entry, $out );
1537 sub script_get_genome_phastcons
1539 # Martin A. Hansen, February 2008.
1541 # Get phastcons scores from genome intervals.
1543 my ( $in, # handle to in stream
1544 $out, # handle to out stream
1545 $options, # options hash
1550 my ( $phastcons_file, $phastcons_index, $index, $fh_phastcons, $scores, $record );
1552 $options->{ "flank" } ||= 0;
1554 $phastcons_file = Maasha::Config::genome_phastcons( $options->{ "genome" } );
1555 $phastcons_index = Maasha::Config::genome_phastcons_index( $options->{ "genome" } );
1557 $index = Maasha::UCSC::fixedstep_index_retrieve( $phastcons_index );
1558 $fh_phastcons = Maasha::Common::read_open( $phastcons_file );
1560 if ( defined $options->{ "chr" } and defined $options->{ "beg" } and ( defined $options->{ "end" } or defined $options->{ "len" } ) )
1562 $options->{ "beg" } -= 1; # request is 1-based
1563 $options->{ "end" } -= 1; # request is 1-based
1565 if ( $options->{ "len" } ) {
1566 $options->{ "end" } = $options->{ "beg" } + $options->{ "len" } - 1;
1569 $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $options->{ "chr" }, $options->{ "beg" }, $options->{ "end" }, $options->{ "flank" } );
1571 $record->{ "CHR" } = $options->{ "chr" };
1572 $record->{ "CHR_BEG" } = $options->{ "beg" } - $options->{ "flank" };
1573 $record->{ "CHR_END" } = $options->{ "end" } + $options->{ "flank" };
1575 $record->{ "PHASTCONS" } = join ",", @{ $scores };
1576 $record->{ "PHAST_COUNT" } = scalar @{ $scores }; # DEBUG
1578 Maasha::Biopieces::put_record( $record, $out );
1581 while ( $record = Maasha::Biopieces::get_record( $in ) )
1583 if ( $record->{ "REC_TYPE" } eq "BED" )
1585 $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "CHR" }, $record->{ "CHR_BEG" }, $record->{ "CHR_END" }, $options->{ "flank" } );
1587 elsif ( $record->{ "REC_TYPE" } eq "PSL" )
1589 $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $options->{ "flank" } );
1591 elsif ( $record->{ "REC_TYPE" } eq "BLAST" )
1593 $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $options->{ "flank" } );
1596 $record->{ "PHASTCONS" } = join ",", @{ $scores } if @{ $scores };
1597 # $record->{ "PHAST_COUNT" } = @{ $scores } if @{ $scores }; # DEBUG
1599 Maasha::Biopieces::put_record( $record, $out );
1602 close $fh_phastcons if $fh_phastcons;
1608 # Martin A. Hansen, July 2008.
1610 # soap sequences in stream against a given file or genome.
1612 my ( $in, # handle to in stream
1613 $out, # handle to out stream
1614 $options, # options hash
1619 my ( $genome, $tmp_in, $tmp_out, $fh_in, $fh_out, $record, $line, @fields, $entry, $count, $args );
1621 $options->{ "seed_size" } ||= 10;
1622 $options->{ "mismatches" } ||= 2;
1623 $options->{ "gap_size" } ||= 0;
1624 $options->{ "cpus" } ||= 1;
1626 if ( $options->{ "genome" } ) {
1627 $options->{ "in_file" } = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/fasta/$options->{ 'genome' }.fna";
1630 $tmp_in = "$BP_TMP/soap_query.seq";
1631 $tmp_out = "$BP_TMP/soap.result";
1633 $fh_out = Maasha::Common::write_open( $tmp_in );
1637 while ( $record = Maasha::Biopieces::get_record( $in ) )
1639 if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) )
1641 Maasha::Fasta::put_entry( $entry, $fh_out );
1646 Maasha::Biopieces::put_record( $record, $out );
1654 "-s $options->{ 'seed_size' }",
1657 "-v $options->{ 'mismatches' }",
1658 "-g $options->{ 'gap_size' }",
1659 "-p $options->{ 'cpus' }",
1660 "-d $options->{ 'in_file' }",
1664 $args .= " > /dev/null 2>&1" if not $options->{ 'verbose' };
1666 Maasha::Common::run( "soap", $args, 1 );
1670 $fh_out = Maasha::Common::read_open( $tmp_out );
1674 while ( $line = <$fh_out> )
1678 @fields = split /\t/, $line;
1680 $record->{ "REC_TYPE" } = "SOAP";
1681 $record->{ "Q_ID" } = $fields[ 0 ];
1682 $record->{ "SCORE" } = $fields[ 3 ];
1683 $record->{ "STRAND" } = $fields[ 6 ];
1684 $record->{ "S_ID" } = $fields[ 7 ];
1685 $record->{ "S_BEG" } = $fields[ 8 ] - 1; # soap is 1-based
1686 $record->{ "S_END" } = $fields[ 8 ] + $fields[ 5 ] - 2;
1688 Maasha::Biopieces::put_record( $record, $out );
1698 sub script_write_bed
1700 # Martin A. Hansen, August 2007.
1702 # Write BED format for the UCSC genome browser using records in stream.
1704 my ( $in, # handle to in stream
1705 $out, # handle to out stream
1706 $options, # options hash
1711 my ( $cols, $fh, $record, $bed_entry, $new_record );
1713 $cols = $options->{ 'cols' }->[ 0 ];
1715 $fh = Maasha::Biopieces::write_stream( $options->{ 'data_out' }, $options->{ 'compress' } );
1717 while ( $record = Maasha::Biopieces::get_record( $in ) )
1719 $record = Maasha::UCSC::psl2record( $record ) if $record->{ 'tBaseInsert' }; # Dirty addition to allow Affy data from MySQL to be dumped
1721 if ( $bed_entry = Maasha::UCSC::BED::biopiece2bed( $record, $cols ) ) {
1722 Maasha::UCSC::BED::bed_entry_put( $bed_entry, $fh, $cols, $options->{ 'check' } );
1725 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ 'no_stream' };
1732 sub script_write_psl
1734 # Martin A. Hansen, August 2007.
1736 # Write PSL output from stream.
1738 my ( $in, # handle to in stream
1739 $out, # handle to out stream
1740 $options, # options hash
1745 my ( $fh, $record, @output, $first );
1749 $fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } );
1751 while ( $record = Maasha::Biopieces::get_record( $in ) )
1753 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
1755 if ( $record->{ "REC_TYPE" } and $record->{ "REC_TYPE" } eq "PSL" )
1757 Maasha::UCSC::psl_put_header( $fh ) if $first;
1758 Maasha::UCSC::psl_put_entry( $record, $fh );
1767 sub script_write_fixedstep
1769 # Martin A. Hansen, Juli 2008.
1771 # Write fixedStep entries from recrods in the stream.
1773 my ( $in, # handle to in stream
1774 $out, # handle to out stream
1775 $options, # options hash
1780 my ( $fh, $record, $entry );
1782 $fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } );
1784 while ( $record = Maasha::Biopieces::get_record( $in ) )
1786 if ( $entry = Maasha::UCSC::Wiggle::biopiece2fixedstep( $record ) ) {
1787 Maasha::UCSC::Wiggle::fixedstep_entry_put( $entry, $fh );
1790 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
1797 sub script_write_2bit
1799 # Martin A. Hansen, March 2008.
1801 # Write sequence entries from stream in 2bit format.
1803 my ( $in, # handle to in stream
1804 $out, # handle to out stream
1805 $options, # options hash
1810 my ( $record, $mask, $tmp_file, $fh_tmp, $fh_in, $fh_out, $entry );
1812 $mask = 1 if not $options->{ "no_mask" };
1814 $tmp_file = "$BP_TMP/write_2bit.fna";
1815 $fh_tmp = Maasha::Common::write_open( $tmp_file );
1817 $fh_out = write_stream( $options->{ "data_out" } );
1819 while ( $record = Maasha::Biopieces::get_record( $in ) )
1821 if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) {
1822 Maasha::Fasta::put_entry( $entry, $fh_tmp );
1825 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
1830 $fh_in = Maasha::Common::read_open( $tmp_file );
1832 Maasha::TwoBit::fasta2twobit( $fh_in, $fh_out, $mask );
1841 sub script_write_solid
1843 # Martin A. Hansen, April 2008.
1845 # Write di-base encoded Solid sequence from entries in stream.
1847 my ( $in, # handle to in stream
1848 $out, # handle to out stream
1849 $options, # options hash
1854 my ( $record, $fh, $entry );
1856 $fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } );
1858 while ( $record = Maasha::Biopieces::get_record( $in ) )
1860 if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) )
1862 $entry->[ SEQ ] = Maasha::Solid::seq2color_space( uc $entry->[ SEQ ] );
1864 Maasha::Fasta::put_entry( $entry, $fh, $options->{ "wrap" } );
1867 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
1874 sub script_write_ucsc_config
1876 # Martin A. Hansen, November 2008.
1878 # Write UCSC Genome Broser configuration (.ra file type) from
1879 # records in the stream.
1881 my ( $in, # handle to in stream
1882 $out, # handle to out stream
1883 $options, # options hash
1888 my ( $record, $fh );
1890 $fh = write_stream( $options->{ "data_out" } );
1892 while ( $record = Maasha::Biopieces::get_record( $in ) )
1894 Maasha::UCSC::ucsc_config_entry_put( $record, $fh ) if $record->{ "REC_TYPE" } eq "UCSC Config";
1896 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
1903 sub script_plot_seqlogo
1905 # Martin A. Hansen, August 2007.
1907 # Calculates and writes a sequence logo for alignments.
1909 my ( $in, # handle to in stream
1910 $out, # handle to out stream
1911 $options, # options hash
1916 my ( $record, @entries, $logo, $fh );
1918 while ( $record = Maasha::Biopieces::get_record( $in ) )
1920 if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) {
1921 push @entries, [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ];
1924 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
1927 $logo = Maasha::Plot::seq_logo( \@entries );
1929 $fh = write_stream( $options->{ "data_out" } );
1937 sub script_plot_phastcons_profiles
1939 # Martin A. Hansen, January 2008.
1941 # Plots PhastCons profiles.
1943 my ( $in, # handle to in stream
1944 $out, # handle to out stream
1945 $options, # options hash
1950 my ( $phastcons_file, $phastcons_index, $index, $fh_phastcons, $record, $scores, $AoA, $plot, $fh );
1952 $options->{ "title" } ||= "PhastCons Profiles";
1954 $phastcons_file = Maasha::Config::genome_phastcons( $options->{ "genome" } );
1955 $phastcons_index = Maasha::Config::genome_phastcons_index( $options->{ "genome" } );
1957 $index = Maasha::UCSC::fixedstep_index_retrieve( $phastcons_index );
1958 $fh_phastcons = Maasha::Common::read_open( $phastcons_file );
1960 while ( $record = Maasha::Biopieces::get_record( $in ) )
1962 if ( $record->{ "CHR" } and $record->{ "CHR_BEG" } and $record->{ "CHR_END" } )
1964 $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "CHR" },
1965 $record->{ "CHR_BEG" },
1966 $record->{ "CHR_END" },
1967 $options->{ "flank" } );
1969 push @{ $AoA }, [ @{ $scores } ];
1972 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
1975 Maasha::UCSC::phastcons_normalize( $AoA );
1977 $AoA = [ [ Maasha::UCSC::phastcons_mean( $AoA ) ] ] if $options->{ "mean" };
1978 $AoA = [ [ Maasha::UCSC::phastcons_median( $AoA ) ] ] if $options->{ "median" };
1980 $AoA = Maasha::Matrix::matrix_flip( $AoA );
1982 $plot = Maasha::Plot::lineplot_simple( $AoA, $options, $BP_TMP );
1984 $fh = write_stream( $options->{ "data_out" } );
1986 print $fh "$_\n" foreach @{ $plot };
1992 sub script_remove_mysql_tables
1994 # Martin A. Hansen, November 2008.
1996 # Remove MySQL tables from values in stream.
1998 my ( $in, # handle to in stream
1999 $out, # handle to out stream
2000 $options, # options hash
2005 my ( $record, %table_hash, $dbh, $table );
2007 $options->{ "user" } ||= Maasha::UCSC::ucsc_get_user();
2008 $options->{ "password" } ||= Maasha::UCSC::ucsc_get_password();
2010 map { $table_hash{ $_ } = 1 } @{ $options->{ 'tables' } };
2012 while ( $record = Maasha::Biopieces::get_record( $in ) )
2014 map { $table_hash{ $record->{ $_ } } = 1 } @{ $options->{ 'keys' } };
2016 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ 'no_stream' };
2019 $dbh = Maasha::SQL::connect( $options->{ "database" }, $options->{ "user" }, $options->{ "password" } );
2021 foreach $table ( sort keys %table_hash )
2023 if ( Maasha::SQL::table_exists( $dbh, $table ) )
2025 print STDERR qq(Removing table "$table" from database "$options->{ 'database' }" ... ) if $options->{ 'verbose' };
2026 Maasha::SQL::delete_table( $dbh, $table );
2027 print STDERR "done.\n" if $options->{ 'verbose' };
2031 print STDERR qq(WARNING: table "$table" not found in database "$options->{ 'database' }\n");
2035 Maasha::SQL::disconnect( $dbh );
2039 sub script_remove_ucsc_tracks
2041 # Martin A. Hansen, November 2008.
2043 # Remove track from MySQL tables and config file.
2045 my ( $in, # handle to in stream
2046 $out, # handle to out stream
2047 $options, # options hash
2052 my ( $record, %track_hash, $fh_in, $fh_out, $track, @tracks, @new_tracks, $dbh );
2054 $options->{ 'user' } ||= Maasha::UCSC::ucsc_get_user();
2055 $options->{ 'password' } ||= Maasha::UCSC::ucsc_get_password();
2056 $options->{ 'config_file' } ||= "$ENV{ 'HOME' }/ucsc/my_tracks.ra";
2058 map { $track_hash{ $_ } = 1 } @{ $options->{ 'tracks' } };
2060 while ( $record = Maasha::Biopieces::get_record( $in ) )
2062 map { $track_hash{ $record->{ $_ } } = 1 } @{ $options->{ 'keys' } };
2064 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ 'no_stream' };
2067 $fh_in = Maasha::Filesys::file_read_open( $options->{ 'config_file' } );
2069 while ( $track = Maasha::UCSC::ucsc_config_entry_get( $fh_in ) ) {
2070 push @tracks, $track;
2075 foreach $track ( @tracks )
2077 if ( $track->{ 'database' } eq $options->{ 'database' } and exists $track_hash{ $track->{ 'track' } } ) {
2078 print STDERR qq(Removing track "$track->{ 'track' }" from config file.\n) if $options->{ 'verbose' };
2080 push @new_tracks, $track;
2084 rename "$options->{ 'config_file' }", "$options->{ 'config_file' }~";
2086 $fh_out = Maasha::Filesys::file_write_open( $options->{ 'config_file' } );
2088 map { Maasha::UCSC::ucsc_config_entry_put( $_, $fh_out ) } @new_tracks;
2092 # ---- locate track in database ----
2094 $dbh = Maasha::SQL::connect( $options->{ "database" }, $options->{ "user" }, $options->{ "password" } );
2096 foreach $track ( sort keys %track_hash )
2098 if ( Maasha::SQL::table_exists( $dbh, $track ) )
2100 print STDERR qq(Removing table "$track" from database "$options->{ 'database' }" ... ) if $options->{ 'verbose' };
2101 Maasha::SQL::delete_table( $dbh, $track );
2102 print STDERR "done.\n" if $options->{ 'verbose' };
2106 print STDERR qq(WARNING: table "$track" not found in database "$options->{ 'database' }\n");
2110 Maasha::SQL::disconnect( $dbh );
2112 Maasha::Common::run( "ucscMakeTracks.pl", "-b > /dev/null 2>&1" );
2116 sub script_plot_histogram
2118 # Martin A. Hansen, September 2007.
2120 # Plot a simple histogram for a given key using GNU plot.
2122 my ( $in, # handle to in stream
2123 $out, # handle to out stream
2124 $options, # options hash
2129 my ( $record, %data_hash, $max, @data_list, $i, $result, $fh );
2131 $options->{ "title" } ||= "Histogram";
2132 $options->{ "sort" } ||= "num";
2134 while ( $record = Maasha::Biopieces::get_record( $in ) )
2136 $data_hash{ $record->{ $options->{ "key" } } }++ if defined $record->{ $options->{ "key" } };
2138 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
2141 if ( $options->{ "sort" } eq "num" ) {
2142 map { push @data_list, [ $_, $data_hash{ $_ } ] } sort { $a <=> $b } keys %data_hash;
2144 map { push @data_list, [ $_, $data_hash{ $_ } ] } sort keys %data_hash;
2147 $result = Maasha::Plot::histogram_simple( \@data_list, $options );
2149 $fh = write_stream( $options->{ "data_out" } );
2151 print $fh "$_\n" foreach @{ $result };
2157 sub script_plot_lendist
2159 # Martin A. Hansen, August 2007.
2161 # Plot length distribution using GNU plot.
2163 my ( $in, # handle to in stream
2164 $out, # handle to out stream
2165 $options, # options hash
2170 my ( $record, %data_hash, $max, @data_list, $i, $result, $fh );
2172 $options->{ "title" } ||= "Length Distribution";
2174 while ( $record = Maasha::Biopieces::get_record( $in ) )
2176 $data_hash{ $record->{ $options->{ "key" } } }++ if defined $record->{ $options->{ "key" } };
2178 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
2181 $max = Maasha::Calc::list_max( [ keys %data_hash ] );
2183 for ( $i = 0; $i < $max; $i++ ) {
2184 push @data_list, [ $i, $data_hash{ $i } || 0 ];
2187 $result = Maasha::Plot::histogram_lendist( \@data_list, $options );
2189 $fh = write_stream( $options->{ "data_out" } );
2191 print $fh "$_\n" foreach @{ $result };
2197 sub script_plot_chrdist
2199 # Martin A. Hansen, August 2007.
2201 # Plot chromosome distribution using GNU plot.
2203 my ( $in, # handle to in stream
2204 $out, # handle to out stream
2205 $options, # options hash
2210 my ( $record, %data_hash, @data_list, $elem, $sort_key, $count, $result, $fh );
2212 $options->{ "title" } ||= "Chromosome Distribution";
2214 while ( $record = Maasha::Biopieces::get_record( $in ) )
2216 if ( $record->{ "CHR" } ) { # generic
2217 $data_hash{ $record->{ "CHR" } }++;
2218 } elsif ( $record->{ "REC_TYPE" } eq "PATSCAN" and $record->{ "S_ID" } =~ /^chr/i ) { # patscan
2219 $data_hash{ $record->{ "S_ID" } }++;
2220 } elsif ( $record->{ "REC_TYPE" } eq "PSL" and $record->{ "S_ID" } =~ /^chr/i ) { # BLAT / PSL
2221 $data_hash{ $record->{ "S_ID" } }++;
2222 } elsif ( $record->{ "REC_TYPE" } eq "BLAST" and $record->{ "S_ID" } =~ /^chr/i ) { # BLAST
2223 $data_hash{ $record->{ "S_ID" } }++;
2226 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
2229 foreach $elem ( keys %data_hash )
2233 $sort_key =~ s/chr//i;
2235 $sort_key =~ s/^X(.*)/99$1/;
2236 $sort_key =~ s/^Y(.*)/99$1/;
2237 $sort_key =~ s/^Z(.*)/999$1/;
2238 $sort_key =~ s/^M(.*)/9999$1/;
2239 $sort_key =~ s/^U(.*)/99999$1/;
2241 $count = $sort_key =~ tr/_//;
2243 $sort_key =~ s/_.*/"999999" x $count/ex;
2245 push @data_list, [ $elem, $data_hash{ $elem }, $sort_key ];
2248 @data_list = sort { $a->[ 2 ] <=> $b->[ 2 ] } @data_list;
2250 $result = Maasha::Plot::histogram_chrdist( \@data_list, $options );
2252 $fh = write_stream( $options->{ "data_out" } );
2254 print $fh "$_\n" foreach @{ $result };
2260 sub script_plot_karyogram
2262 # Martin A. Hansen, August 2007.
2264 # Plot hits on karyogram.
2266 my ( $in, # handle to in stream
2267 $out, # handle to out stream
2268 $options, # options hash
2273 my ( %options, $record, @data, $fh, $result, %data_hash );
2275 $options->{ "genome" } ||= "human";
2276 $options->{ "feat_color" } ||= "black";
2278 while ( $record = Maasha::Biopieces::get_record( $in ) )
2280 if ( $record->{ "CHR" } and $record->{ "CHR_BEG" } and $record->{ "CHR_END" } )
2282 push @{ $data_hash{ $record->{ "CHR" } } }, [ $record->{ "CHR_BEG" }, $record->{ "CHR_END" }, $options->{ "feat_color" } ];
2285 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
2288 $result = Maasha::Plot::karyogram( \%data_hash, \%options );
2290 $fh = write_stream( $options->{ "data_out" } );
2298 sub script_plot_matches
2300 # Martin A. Hansen, August 2007.
2302 # Plot matches in 2D generating a dotplot.
2304 my ( $in, # handle to in stream
2305 $out, # handle to out stream
2306 $options, # options hash
2311 my ( $record, @data, $fh, $result, %data_hash );
2313 $options->{ "direction" } ||= "both";
2315 while ( $record = Maasha::Biopieces::get_record( $in ) )
2317 if ( defined $record->{ "Q_BEG" } and defined $record->{ "S_BEG" } and $record->{ "Q_END" } and $record->{ "S_END" } ) {
2318 push @data, $record;
2321 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
2324 $options->{ "title" } ||= "plot_matches";
2325 $options->{ "xlabel" } ||= $data[ 0 ]->{ "Q_ID" };
2326 $options->{ "ylabel" } ||= $data[ 0 ]->{ "S_ID" };
2328 $result = Maasha::Plot::dotplot_matches( \@data, $options, $BP_TMP );
2330 $fh = write_stream( $options->{ "data_out" } );
2332 print $fh "$_\n" foreach @{ $result };
2338 sub script_upload_to_ucsc
2340 # Martin A. Hansen, August 2007.
2342 # Calculate the mean of values of given keys.
2344 # This routine has developed into the most ugly hack. Do something!
2346 my ( $in, # handle to in stream
2347 $out, # handle to out stream
2348 $options, # options hash
2353 my ( $record, $file, $wib_file, $wig_file, $wib_dir, $fh_out, $i, $first, $format, $type, $columns, $append, $entry );
2355 $options->{ "short_label" } ||= $options->{ 'table' };
2356 $options->{ "long_label" } ||= $options->{ 'table' };
2357 $options->{ "group" } ||= $ENV{ "LOGNAME" };
2358 $options->{ "priority" } ||= 1;
2359 $options->{ "visibility" } ||= "pack";
2360 $options->{ "color" } ||= join( ",", int( rand( 255 ) ), int( rand( 255 ) ), int( rand( 255 ) ) );
2361 $options->{ "chunk_size" } ||= 10_000_000_000; # Due to 32-bit UCSC compilation really large tables cannot be loaded in one go.
2363 $file = "$BP_TMP/ucsc_upload.tmp";
2368 $fh_out = Maasha::Common::write_open( $file );
2370 while ( $record = Maasha::Biopieces::get_record( $in ) )
2372 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
2374 if ( $record->{ "REC_TYPE" } eq "fixed_step" )
2378 if ( $entry = Maasha::UCSC::Wiggle::biopiece2fixedstep( $record ) ) {
2379 Maasha::UCSC::Wiggle::fixedstep_entry_put( $entry, $fh_out );
2382 elsif ( $record->{ "REC_TYPE" } eq "PSL" )
2386 Maasha::UCSC::psl_put_header( $fh_out ) if $first;
2387 Maasha::UCSC::psl_put_entry( $record, $fh_out );
2391 elsif ( $record->{ "REC_TYPE" } eq "BED" and $record->{ "SEC_STRUCT" } )
2393 # chrom chromStart chromEnd name score strand size secStr conf
2397 print $fh_out join ( "\t",
2399 $record->{ "CHR_BEG" },
2400 $record->{ "CHR_END" } + 1,
2401 $record->{ "Q_ID" },
2402 $record->{ "SCORE" },
2403 $record->{ "STRAND" },
2404 $record->{ "SIZE" },
2405 $record->{ "SEC_STRUCT" },
2406 $record->{ "CONF" },
2409 elsif ( $record->{ "REC_TYPE" } eq "BED" )
2412 $columns = $record->{ "BED_COLS" };
2414 if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) {
2415 Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns, $options->{ 'check' } );
2418 elsif ( $record->{ "REC_TYPE" } eq "PATSCAN" and $record->{ "CHR" } )
2423 if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) {
2424 Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns, $options->{ 'check' } );
2427 elsif ( $record->{ "REC_TYPE" } eq "BLAST" and $record->{ "S_ID" } =~ /^chr/ )
2432 $record->{ "SCORE" } = $record->{ "BIT_SCORE" } * 1000;
2434 if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) {
2435 Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns, $options->{ 'check' } );
2438 elsif ( $record->{ "REC_TYPE" } eq "VMATCH" and $record->{ "S_ID" } =~ /^chr/i )
2443 if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) {
2444 Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns, $options->{ 'check' } );
2448 if ( $i == $options->{ "chunk_size" } )
2452 if ( $format eq "BED" ) {
2453 Maasha::UCSC::bed_upload_to_ucsc( $BP_TMP, $file, $options, $append );
2454 } elsif ( $format eq "PSL" ) {
2455 Maasha::UCSC::psl_upload_to_ucsc( $file, $options, $append );
2464 $fh_out = Maasha::Common::write_open( $file );
2472 if ( exists $options->{ "database" } and $options->{ "table" } )
2474 if ( $format eq "BED" )
2476 $type = "bed $columns";
2478 Maasha::UCSC::bed_upload_to_ucsc( $BP_TMP, $file, $options, $append );
2480 elsif ( $format eq "BED_SS" )
2482 $type = "type bed 6 +";
2484 Maasha::UCSC::bed_upload_to_ucsc( $BP_TMP, $file, $options, $append );
2486 elsif ( $format eq "PSL" )
2490 Maasha::UCSC::psl_upload_to_ucsc( $file, $options, $append );
2492 elsif ( $format eq "WIGGLE" )
2494 $options->{ "visibility" } = "full";
2496 $wig_file = "$options->{ 'table' }.wig";
2497 $wib_file = "$options->{ 'table' }.wib";
2499 $wib_dir = "$ENV{ 'HOME' }/ucsc/wib";
2501 Maasha::Common::dir_create_if_not_exists( $wib_dir );
2503 if ( $options->{ 'verbose' } ) {
2504 `cd $BP_TMP && wigEncode $file $wig_file $wib_file`;
2506 `cd $BP_TMP && wigEncode $file $wig_file $wib_file > /dev/null 2>&1`;
2509 Maasha::Common::run( "mv", "$BP_TMP/$wib_file $wib_dir" );
2517 Maasha::UCSC::wiggle_upload_to_ucsc( $BP_TMP, $wib_dir, $file, $options );
2522 Maasha::UCSC::ucsc_update_config( $options, $type );
2527 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<