1 package Maasha::BioRun;
4 # Copyright (C) 2007-2009 Martin A. Hansen.
6 # This program is free software; you can redistribute it and/or
7 # modify it under the terms of the GNU General Public License
8 # as published by the Free Software Foundation; either version 2
9 # of the License, or (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 # http://www.gnu.org/copyleft/gpl.html
23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
26 # Routines that contains Biopieces which are run.
29 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
34 use Getopt::Long qw( :config bundling );
35 use Time::HiRes qw( gettimeofday );
36 use Storable qw( dclone );
37 use Maasha::Biopieces;
46 use Maasha::Stockholm;
52 use Maasha::UCSC::BED;
53 use Maasha::UCSC::Wiggle;
62 use vars qw( @ISA @EXPORT_OK );
66 @ISA = qw( Exporter );
74 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> GLOBALS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
77 my ( $script, $BP_TMP );
79 $script = Maasha::Common::get_scriptname();
80 $BP_TMP = Maasha::Common::get_tmpdir();
83 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> RUN SCRIPT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
86 run_script( $script );
89 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SUBROUTINES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
94 # Martin A. Hansen, August 2007.
96 # Run a specific script.
98 my ( $script, # script name
103 my ( $t0, $t1, $options, $in, $out );
105 Maasha::Biopieces::log_biopiece();
107 $t0 = gettimeofday();
109 $options = get_options( $script );
111 $options->{ "SCRIPT" } = $script;
113 $script = "print_usage" if ( -t STDIN and keys %{ $options } <= 1 or $options->{ 'help' } );
115 $in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } );
116 $out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } );
118 if ( $script eq "print_usage" ) { script_print_usage( $in, $out, $options ) }
119 elsif ( $script eq "read_psl" ) { script_read_psl( $in, $out, $options ) }
120 elsif ( $script eq "read_bed" ) { script_read_bed( $in, $out, $options ) }
121 elsif ( $script eq "read_fixedstep" ) { script_read_fixedstep( $in, $out, $options ) }
122 elsif ( $script eq "read_blast_tab" ) { script_read_blast_tab( $in, $out, $options ) }
123 elsif ( $script eq "read_embl" ) { script_read_embl( $in, $out, $options ) }
124 elsif ( $script eq "read_stockholm" ) { script_read_stockholm( $in, $out, $options ) }
125 elsif ( $script eq "read_phastcons" ) { script_read_phastcons( $in, $out, $options ) }
126 elsif ( $script eq "read_soft" ) { script_read_soft( $in, $out, $options ) }
127 elsif ( $script eq "read_gff" ) { script_read_gff( $in, $out, $options ) }
128 elsif ( $script eq "read_2bit" ) { script_read_2bit( $in, $out, $options ) }
129 elsif ( $script eq "read_solexa" ) { script_read_solexa( $in, $out, $options ) }
130 elsif ( $script eq "read_solid" ) { script_read_solid( $in, $out, $options ) }
131 elsif ( $script eq "read_mysql" ) { script_read_mysql( $in, $out, $options ) }
132 elsif ( $script eq "read_ucsc_config" ) { script_read_ucsc_config( $in, $out, $options ) }
133 elsif ( $script eq "length_seq" ) { script_length_seq( $in, $out, $options ) }
134 elsif ( $script eq "uppercase_seq" ) { script_uppercase_seq( $in, $out, $options ) }
135 elsif ( $script eq "complexity_seq" ) { script_complexity_seq( $in, $out, $options ) }
136 elsif ( $script eq "oligo_freq" ) { script_oligo_freq( $in, $out, $options ) }
137 elsif ( $script eq "create_weight_matrix" ) { script_create_weight_matrix( $in, $out, $options ) }
138 elsif ( $script eq "remove_indels" ) { script_remove_indels( $in, $out, $options ) }
139 elsif ( $script eq "transliterate_seq" ) { script_transliterate_seq( $in, $out, $options ) }
140 elsif ( $script eq "transliterate_vals" ) { script_transliterate_vals( $in, $out, $options ) }
141 elsif ( $script eq "translate_seq" ) { script_translate_seq( $in, $out, $options ) }
142 elsif ( $script eq "get_genome_align" ) { script_get_genome_align( $in, $out, $options ) }
143 elsif ( $script eq "get_genome_phastcons" ) { script_get_genome_phastcons( $in, $out, $options ) }
144 elsif ( $script eq "fold_seq" ) { script_fold_seq( $in, $out, $options ) }
145 elsif ( $script eq "tile_seq" ) { script_tile_seq( $in, $out, $options ) }
146 elsif ( $script eq "invert_align" ) { script_invert_align( $in, $out, $options ) }
147 elsif ( $script eq "patscan_seq" ) { script_patscan_seq( $in, $out, $options ) }
148 elsif ( $script eq "soap_seq" ) { script_soap_seq( $in, $out, $options ) }
149 elsif ( $script eq "match_seq" ) { script_match_seq( $in, $out, $options ) }
150 elsif ( $script eq "create_vmatch_index" ) { script_create_vmatch_index( $in, $out, $options ) }
151 elsif ( $script eq "write_bed" ) { script_write_bed( $in, $out, $options ) }
152 elsif ( $script eq "write_psl" ) { script_write_psl( $in, $out, $options ) }
153 elsif ( $script eq "write_fixedstep" ) { script_write_fixedstep( $in, $out, $options ) }
154 elsif ( $script eq "write_2bit" ) { script_write_2bit( $in, $out, $options ) }
155 elsif ( $script eq "write_solid" ) { script_write_solid( $in, $out, $options ) }
156 elsif ( $script eq "write_ucsc_config" ) { script_write_ucsc_config( $in, $out, $options ) }
157 elsif ( $script eq "head_records" ) { script_head_records( $in, $out, $options ) }
158 elsif ( $script eq "remove_keys" ) { script_remove_keys( $in, $out, $options ) }
159 elsif ( $script eq "remove_adaptor" ) { script_remove_adaptor( $in, $out, $options ) }
160 elsif ( $script eq "remove_mysql_tables" ) { script_remove_mysql_tables( $in, $out, $options ) }
161 elsif ( $script eq "remove_ucsc_tracks" ) { script_remove_ucsc_tracks( $in, $out, $options ) }
162 elsif ( $script eq "rename_keys" ) { script_rename_keys( $in, $out, $options ) }
163 elsif ( $script eq "uniq_vals" ) { script_uniq_vals( $in, $out, $options ) }
164 elsif ( $script eq "merge_vals" ) { script_merge_vals( $in, $out, $options ) }
165 elsif ( $script eq "merge_records" ) { script_merge_records( $in, $out, $options ) }
166 elsif ( $script eq "compute" ) { script_compute( $in, $out, $options ) }
167 elsif ( $script eq "flip_tab" ) { script_flip_tab( $in, $out, $options ) }
168 elsif ( $script eq "sort_records" ) { script_sort_records( $in, $out, $options ) }
169 elsif ( $script eq "count_vals" ) { script_count_vals( $in, $out, $options ) }
170 elsif ( $script eq "plot_histogram" ) { script_plot_histogram( $in, $out, $options ) }
171 elsif ( $script eq "plot_lendist" ) { script_plot_lendist( $in, $out, $options ) }
172 elsif ( $script eq "plot_chrdist" ) { script_plot_chrdist( $in, $out, $options ) }
173 elsif ( $script eq "plot_karyogram" ) { script_plot_karyogram( $in, $out, $options ) }
174 elsif ( $script eq "plot_matches" ) { script_plot_matches( $in, $out, $options ) }
175 elsif ( $script eq "plot_seqlogo" ) { script_plot_seqlogo( $in, $out, $options ) }
176 elsif ( $script eq "plot_phastcons_profiles" ) { script_plot_phastcons_profiles( $in, $out, $options ) }
177 elsif ( $script eq "length_vals" ) { script_length_vals( $in, $out, $options ) }
178 elsif ( $script eq "sum_vals" ) { script_sum_vals( $in, $out, $options ) }
179 elsif ( $script eq "mean_vals" ) { script_mean_vals( $in, $out, $options ) }
180 elsif ( $script eq "median_vals" ) { script_median_vals( $in, $out, $options ) }
181 elsif ( $script eq "max_vals" ) { script_max_vals( $in, $out, $options ) }
182 elsif ( $script eq "min_vals" ) { script_min_vals( $in, $out, $options ) }
183 elsif ( $script eq "upload_to_ucsc" ) { script_upload_to_ucsc( $in, $out, $options ) }
185 close $in if defined $in;
188 $t1 = gettimeofday();
190 print STDERR "Program: $script" . ( " " x ( 25 - length( $script ) ) ) . sprintf( "Run time: %.4f\n", ( $t1 - $t0 ) ) if $options->{ 'verbose' };
196 # Martin A. Hansen, February 2008.
198 # Gets options from commandline and checks these vigerously.
200 my ( $script, # name of script
205 my ( %options, @options, $opt, @genomes, $real );
207 if ( $script eq "print_usage" )
213 elsif ( $script eq "read_psl" )
220 elsif ( $script eq "read_bed" )
229 elsif ( $script eq "read_fixedstep" )
236 elsif ( $script eq "read_blast_tab" )
243 elsif ( $script eq "read_embl" )
253 elsif ( $script eq "read_stockholm" )
260 elsif ( $script eq "read_phastcons" )
271 elsif ( $script eq "read_soft" )
279 elsif ( $script eq "read_gff" )
286 elsif ( $script eq "read_2bit" )
294 elsif ( $script eq "read_solexa" )
303 elsif ( $script eq "read_solid" )
311 elsif ( $script eq "read_mysql" )
320 elsif ( $script eq "read_ucsc_config" )
327 elsif ( $script eq "length_seq" )
334 elsif ( $script eq "oligo_freq" )
341 elsif ( $script eq "create_weight_matrix" )
347 elsif ( $script eq "transliterate_seq" )
355 elsif ( $script eq "transliterate_vals" )
364 elsif ( $script eq "translate_seq" )
370 elsif ( $script eq "get_genome_align" )
381 elsif ( $script eq "get_genome_phastcons" )
392 elsif ( $script eq "tile_seq" )
399 elsif ( $script eq "invert_align" )
405 elsif ( $script eq "patscan_seq" )
416 elsif ( $script eq "soap_seq" )
427 elsif ( $script eq "match_seq" )
434 elsif ( $script eq "create_vmatch_index" )
442 elsif ( $script eq "write_bed" )
452 elsif ( $script eq "write_psl" )
460 elsif ( $script eq "write_fixedstep" )
468 elsif ( $script eq "write_2bit" )
476 elsif ( $script eq "write_solid" )
485 elsif ( $script eq "write_ucsc_config" )
492 elsif ( $script eq "plot_seqlogo" )
499 elsif ( $script eq "plot_phastcons_profiles" )
514 elsif ( $script eq "head_records" )
520 elsif ( $script eq "remove_keys" )
527 elsif ( $script eq "remove_adaptor" )
536 elsif ( $script eq "remove_mysql_tables" )
547 elsif ( $script eq "remove_ucsc_tracks" )
559 elsif ( $script eq "rename_keys" )
565 elsif ( $script eq "uniq_vals" )
572 elsif ( $script eq "merge_vals" )
579 elsif ( $script eq "merge_records" )
586 elsif ( $script eq "compute" )
592 elsif ( $script eq "sort_records" )
599 elsif ( $script eq "count_vals" )
605 elsif ( $script eq "plot_histogram" )
618 elsif ( $script eq "plot_lendist" )
630 elsif ( $script eq "plot_chrdist" )
641 elsif ( $script eq "plot_karyogram" )
650 elsif ( $script eq "plot_matches" )
662 elsif ( $script eq "length_vals" )
668 elsif ( $script eq "sum_vals" )
676 elsif ( $script eq "mean_vals" )
684 elsif ( $script eq "median_vals" )
692 elsif ( $script eq "max_vals" )
700 elsif ( $script eq "min_vals" )
708 elsif ( $script eq "upload_to_ucsc" )
732 # print STDERR Dumper( \@options );
739 # print STDERR Dumper( \%options );
741 if ( -t STDIN && scalar( keys %options ) == 0 or $options{ "help" } ) {
742 return wantarray ? %options : \%options;
745 $options{ "cols" } = [ split ",", $options{ "cols" } ] if defined $options{ "cols" };
746 $options{ "keys" } = [ split ",", $options{ "keys" } ] if defined $options{ "keys" };
747 $options{ "no_keys" } = [ split ",", $options{ "no_keys" } ] if defined $options{ "no_keys" };
748 $options{ "save_keys" } = [ split ",", $options{ "save_keys" } ] if defined $options{ "save_keys" };
749 $options{ "quals" } = [ split ",", $options{ "quals" } ] if defined $options{ "quals" };
750 $options{ "feats" } = [ split ",", $options{ "feats" } ] if defined $options{ "feats" };
751 $options{ "frames" } = [ split ",", $options{ "frames" } ] if defined $options{ "frames" };
752 $options{ "samples" } = [ split ",", $options{ "samples" } ] if defined $options{ "samples" };
753 $options{ "tables" } = [ split ",", $options{ "tables" } ] if defined $options{ "tables" };
754 $options{ "tracks" } = [ split ",", $options{ "tracks" } ] if defined $options{ "tracks" };
756 # ---- check arguments ----
758 if ( $options{ 'data_in' } )
760 $options{ "files" } = Maasha::Biopieces::getopt_files( $options{ 'data_in' } );
762 Maasha::Common::error( qq(Argument to --data_in must be a valid file or fileglob expression) ) if scalar @{ $options{ "files" } } == 0;
765 map { Maasha::Common::error( qq(Argument to --cols must be a whole numbers - not "$_") ) if $_ !~ /^\d+$/ } @{ $options{ "cols" } } if $options{ "cols" };
767 # print STDERR Dumper( \%options );
769 $real = "beg|end|word_size|wrap|tile_size|len|prefix_length|mismatches|offset|num|skip|cpus|window_size|step_size";
771 foreach $opt ( keys %options )
773 if ( $opt =~ /stream_in|pattern_in|exact_in/ and not -f $options{ $opt } )
775 Maasha::Common::error( qq(Argument to --$opt must be a valid file or fileglob expression - not "$options{ $opt }") );
777 elsif ( $opt =~ /$real/ and $options{ $opt } !~ /^\d+$/ )
779 Maasha::Common::error( qq(Argument to --$opt must be a whole number - not "$options{ $opt }") );
781 elsif ( $opt =~ /max_hits|max_hits|max_misses|dist|edit_dist|flank|gap|hamming_dist|priority/ and $options{ $opt } !~ /^-?\d+$/ )
783 Maasha::Common::error( qq(Argument to --$opt must be an integer - not "$options{ $opt }") );
785 elsif ( $opt =~ /identity|threshold/ and $options{ $opt } !~ /^-?(?:\d+(?:\.\d*)?|\.\d+)$/ )
787 Maasha::Common::error( qq(Argument to --$opt must be a decimal number - not "$options{ $opt }") );
789 elsif ( $opt =~ /e_val/ and $options{ $opt } !~ /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/ )
791 Maasha::Common::error( qq(Argument to --$opt must be a float - not "$options{ $opt }") );
793 elsif ( $opt =~ /strand/ and $options{ $opt } !~ /^(\+|-)$/ )
795 Maasha::Common::error( qq(Argument to --$opt must be "+" or "-" - not "$options{ $opt }") );
797 elsif ( $opt eq "genome" )
799 @genomes = Maasha::Common::ls_dirs( "$ENV{ 'BP_DATA' }/genomes" );
800 map { $_ =~ s/.*\/(.+)$/$1/ } @genomes;
802 if ( not grep { $_ =~ /^$options{ $opt }$/ } @genomes ) {
803 Maasha::Common::error( qq(Genome $options{ $opt } not found in "$ENV{ 'BP_DATA' }/genomes/") );
806 elsif ( $opt eq "terminal" and not $options{ $opt } =~ /^(svg|post|dumb|x11)/ )
808 Maasha::Common::error( qq(Bad --$opt argument "$options{ $opt }") );
810 elsif ( $opt eq "table" and $options{ $opt } =~ /(-|\.)/ )
812 Maasha::Common::error( qq(Character '$1' is not allowed in table name: $options{ $opt }) );
814 elsif ( $opt eq "merge" and $options{ $opt } !~ /^(AandB|AorB|BorA|AnotB|BnotA)$/ )
816 Maasha::Common::error( qq(Argument to --$opt must be AandB, AorB, BorA, AnotB, or BnotA - not "$options{ $opt }") );
818 elsif ( $opt eq "format" and $script eq "read_solexa" and $options{ $opt } !~ /octal|decimal/ )
820 Maasha::Common::error( qq(Argument to --$opt must be octal or decimal - not "$options{ $opt }") );
822 elsif ( $opt eq "remove" and $script eq "remove_adaptor" and $options{ $opt } !~ /before|after|skip/ )
824 Maasha::Common::error( qq(Argument to --$opt must be before, after, or skip - not "$options{ $opt }") );
828 Maasha::Common::error( qq(no --database specified) ) if $script eq "remove_ucsc_tracks" and not $options{ "database" };
829 Maasha::Common::error( qq(no --index_name specified) ) if $script =~ /create_vmatch_index/ and not $options{ "index_name" };
830 Maasha::Common::error( qq(no --in_file or --genome specified) ) if $script eq "soap_seq" and not $options{ "genome" } and not $options{ "in_file" };
831 Maasha::Common::error( qq(both --in_file and --genome specified) ) if $script eq "soap_seq" and $options{ "genome" } and $options{ "in_file" };
832 Maasha::Common::error( qq(no --genome specified) ) if $script =~ /get_genome_align|get_genome_phastcons|plot_phastcons_profiles|plot_karyogram/ and not $options{ "genome" };
833 Maasha::Common::error( qq(no --key specified) ) if $script =~ /plot_lendist|plot_histogram/ and not $options{ "key" };
834 Maasha::Common::error( qq(no --keys speficied) ) if $script =~ /sort_records|count_vals|sum_vals|mean_vals|median_vals|length_vals/ and not $options{ "keys" };
836 if ( $script eq "upload_to_ucsc" )
838 Maasha::Common::error( qq(no --database specified) ) if not $options{ "database" };
839 Maasha::Common::error( qq(no --table specified) ) if not $options{ "table" };
842 return wantarray ? %options : \%options;
846 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SCRIPTS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
849 sub script_print_usage
851 # Martin A. Hansen, January 2008.
853 # Retrieves usage information from file and
854 # prints this nicely formatted.
856 my ( $in, # handle to in stream
857 $out, # handle to out stream
858 $options, # options hash
863 my ( $file, $wiki, $lines );
865 if ( $options->{ 'data_in' } ) {
866 $file = $options->{ 'data_in' };
868 $file = join "", $ENV{ 'BP_DIR' }, "/bp_usage/", $options->{ 'SCRIPT' }, ".wiki";
871 $wiki = Maasha::Gwiki::gwiki_read( $file );
873 ( $wiki->[ 2 ], $wiki->[ 3 ], $wiki->[ 0 ], $wiki->[ 1 ] ) = ( $wiki->[ 0 ], $wiki->[ 1 ], $wiki->[ 2 ], $wiki->[ 3 ] );
875 if ( not $options->{ "help" } ) {
876 @{ $wiki } = grep { $_->[ 0 ]->{ 'SECTION' } =~ /Biopiece|summary|Usage|Options|Help/ } @{ $wiki };
879 $lines = Maasha::Gwiki::gwiki2ascii( $wiki );
881 print STDERR "$_\n" foreach @{ $lines };
889 # Martin A. Hansen, August 2007.
891 # Read psl table from stream or file.
893 my ( $in, # handle to in stream
894 $out, # handle to out stream
895 $options, # options hash
900 my ( $record, $file, $data_in, $num );
902 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
903 Maasha::Biopieces::put_record( $record, $out );
908 foreach $file ( @{ $options->{ "files" } } )
910 $data_in = Maasha::Common::read_open( $file );
912 while ( $record = Maasha::UCSC::psl_get_entry( $data_in ) )
914 Maasha::Biopieces::put_record( $record, $out );
916 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
928 # Martin A. Hansen, August 2007.
930 # Read bed table from stream or file.
932 my ( $in, # handle to in stream
933 $out, # handle to out stream
934 $options, # options hash
939 my ( $cols, $file, $record, $bed_entry, $data_in, $num );
941 $cols = $options->{ 'cols' }->[ 0 ];
943 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
944 Maasha::Biopieces::put_record( $record, $out );
949 foreach $file ( @{ $options->{ "files" } } )
951 $data_in = Maasha::Common::read_open( $file );
953 while ( $bed_entry = Maasha::UCSC::BED::bed_entry_get( $data_in, $cols, $options->{ 'check' } ) )
955 $record = Maasha::UCSC::BED::bed2biopiece( $bed_entry );
957 Maasha::Biopieces::put_record( $record, $out );
959 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
969 close $data_in if $data_in;
973 sub script_read_fixedstep
975 # Martin A. Hansen, Juli 2008.
977 # Read fixedstep wiggle format from stream or file.
979 my ( $in, # handle to in stream
980 $out, # handle to out stream
981 $options, # options hash
986 my ( $file, $record, $entry, $data_in, $num );
988 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
989 Maasha::Biopieces::put_record( $record, $out );
994 foreach $file ( @{ $options->{ "files" } } )
996 $data_in = Maasha::Common::read_open( $file );
998 while ( $entry = Maasha::UCSC::Wiggle::fixedstep_entry_get( $data_in ) )
1000 $record = Maasha::UCSC::Wiggle::fixedstep2biopiece( $entry );
1002 Maasha::Biopieces::put_record( $record, $out );
1004 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1014 close $data_in if $data_in;
1018 sub script_read_blast_tab
1020 # Martin A. Hansen, September 2007.
1022 # Read tabular BLAST output from NCBI blast run with -m8 or -m9.
1024 my ( $in, # handle to in stream
1025 $out, # handle to out stream
1026 $options, # options hash
1031 my ( $file, $line, @fields, $strand, $record, $data_in, $num );
1033 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1034 Maasha::Biopieces::put_record( $record, $out );
1039 foreach $file ( @{ $options->{ "files" } } )
1041 $data_in = Maasha::Common::read_open( $file );
1043 while ( $line = <$data_in> )
1047 next if $line =~ /^#/;
1049 @fields = split /\t/, $line;
1051 $record->{ "REC_TYPE" } = "BLAST";
1052 $record->{ "Q_ID" } = $fields[ 0 ];
1053 $record->{ "S_ID" } = $fields[ 1 ];
1054 $record->{ "IDENT" } = $fields[ 2 ];
1055 $record->{ "ALIGN_LEN" } = $fields[ 3 ];
1056 $record->{ "MISMATCHES" } = $fields[ 4 ];
1057 $record->{ "GAPS" } = $fields[ 5 ];
1058 $record->{ "Q_BEG" } = $fields[ 6 ] - 1; # BLAST is 1-based
1059 $record->{ "Q_END" } = $fields[ 7 ] - 1; # BLAST is 1-based
1060 $record->{ "S_BEG" } = $fields[ 8 ] - 1; # BLAST is 1-based
1061 $record->{ "S_END" } = $fields[ 9 ] - 1; # BLAST is 1-based
1062 $record->{ "E_VAL" } = $fields[ 10 ];
1063 $record->{ "BIT_SCORE" } = $fields[ 11 ];
1065 if ( $record->{ "S_BEG" } > $record->{ "S_END" } )
1067 $record->{ "STRAND" } = '-';
1069 ( $record->{ "S_BEG" }, $record->{ "S_END" } ) = ( $record->{ "S_END" }, $record->{ "S_BEG" } );
1073 $record->{ "STRAND" } = '+';
1076 Maasha::Biopieces::put_record( $record, $out );
1078 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1088 close $data_in if $data_in;
1092 sub script_read_embl
1094 # Martin A. Hansen, August 2007.
1098 my ( $in, # handle to in stream
1099 $out, # handle to out stream
1100 $options, # options hash
1105 my ( %options2, $file, $data_in, $num, $entry, $record );
1107 map { $options2{ "keys" }{ $_ } = 1 } @{ $options->{ "keys" } };
1108 map { $options2{ "feats" }{ $_ } = 1 } @{ $options->{ "feats" } };
1109 map { $options2{ "quals" }{ $_ } = 1 } @{ $options->{ "quals" } };
1111 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1112 Maasha::Biopieces::put_record( $record, $out );
1117 foreach $file ( @{ $options->{ "files" } } )
1119 $data_in = Maasha::Common::read_open( $file );
1121 while ( $entry = Maasha::EMBL::get_embl_entry( $data_in ) )
1123 $record = Maasha::EMBL::parse_embl_entry( $entry, \%options2 );
1125 my ( $feat, $feat2, $qual, $qual_val, $record_copy );
1127 $record_copy = dclone $record;
1129 delete $record_copy->{ "FT" };
1131 Maasha::Biopieces::put_record( $record_copy, $out );
1133 delete $record_copy->{ "SEQ" };
1135 foreach $feat ( keys %{ $record->{ "FT" } } )
1137 $record_copy->{ "FEAT_TYPE" } = $feat;
1139 foreach $feat2 ( @{ $record->{ "FT" }->{ $feat } } )
1141 foreach $qual ( keys %{ $feat2 } )
1143 $qual_val = join "; ", @{ $feat2->{ $qual } };
1148 $record_copy->{ $qual } = $qual_val;
1151 Maasha::Biopieces::put_record( $record_copy, $out );
1155 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1165 close $data_in if $data_in;
1169 sub script_read_stockholm
1171 # Martin A. Hansen, August 2007.
1173 # Read Stockholm format.
1175 my ( $in, # handle to in stream
1176 $out, # handle to out stream
1177 $options, # options hash
1182 my ( $data_in, $file, $num, $entry, $record, $record_anno, $record_align, $key, $seq );
1184 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1185 Maasha::Biopieces::put_record( $record, $out );
1190 foreach $file ( @{ $options->{ "files" } } )
1192 $data_in = Maasha::Common::read_open( $file );
1194 while ( $entry = Maasha::Stockholm::get_stockholm_entry( $data_in ) )
1196 $record = Maasha::Stockholm::parse_stockholm_entry( $entry );
1200 foreach $key ( keys %{ $record->{ "GF" } } ) {
1201 $record_anno->{ $key } = $record->{ "GF" }->{ $key };
1204 $record_anno->{ "ALIGN" } = $num;
1206 Maasha::Biopieces::put_record( $record_anno, $out );
1208 foreach $seq ( @{ $record->{ "ALIGN" } } )
1210 undef $record_align;
1213 SEQ_NAME => $seq->[ 0 ],
1217 Maasha::Biopieces::put_record( $record_align, $out );
1220 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1230 close $data_in if $data_in;
1234 sub script_read_phastcons
1236 # Martin A. Hansen, December 2007.
1238 # Read PhastCons format.
1240 my ( $in, # handle to in stream
1241 $out, # handle to out stream
1242 $options, # options hash
1247 my ( $data_in, $file, $num, $entry, @records, $record );
1249 $options->{ "min" } ||= 10;
1250 $options->{ "dist" } ||= 25;
1251 $options->{ "threshold" } ||= 0.8;
1252 $options->{ "gap" } ||= 5;
1254 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1255 Maasha::Biopieces::put_record( $record, $out );
1260 foreach $file ( @{ $options->{ "files" } } )
1262 $data_in = Maasha::Common::read_open( $file );
1264 while ( $entry = Maasha::UCSC::fixedstep_get_entry( $data_in ) )
1266 @records = Maasha::UCSC::phastcons_parse_entry( $entry, $options );
1268 foreach $record ( @records )
1270 $record->{ "REC_TYPE" } = "BED";
1271 $record->{ "BED_LEN" } = $record->{ "CHR_END" } - $record->{ "CHR_BEG" } + 1;
1273 Maasha::Biopieces::put_record( $record, $out );
1275 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1286 close $data_in if $data_in;
1290 sub script_read_soft
1292 # Martin A. Hansen, December 2007.
1295 # http://www.ncbi.nlm.nih.gov/geo/info/soft2.html
1297 my ( $in, # handle to in stream
1298 $out, # handle to out stream
1299 $options, # options hash
1304 my ( $data_in, $file, $num, $records, $record, $soft_index, $fh, @platforms, $plat_table, @samples, $sample, $old_end, $skip );
1306 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1307 Maasha::Biopieces::put_record( $record, $out );
1312 foreach $file ( @{ $options->{ "files" } } )
1314 print STDERR "Creating index for file: $file\n" if $options->{ "verbose" };
1316 $soft_index = Maasha::NCBI::soft_index_file( $file );
1318 $fh = Maasha::Common::read_open( $file );
1320 @platforms = grep { $_->{ "SECTION" } =~ /PLATFORM/ } @{ $soft_index };
1322 print STDERR "Getting platform tables for file: $file\n" if $options->{ "verbose" };
1324 $plat_table = Maasha::NCBI::soft_get_platform( $fh, $platforms[ 0 ]->{ "LINE_BEG" }, $platforms[ -1 ]->{ "LINE_END" } );
1326 @samples = grep { $_->{ "SECTION" } =~ /SAMPLE/ } @{ $soft_index };
1328 $old_end = $platforms[ -1 ]->{ "LINE_END" };
1330 foreach $sample ( @samples )
1333 $skip = 1 if ( $options->{ "samples" } and grep { $sample->{ "SECTION" } !~ /$_/ } @{ $options->{ "samples" } } );
1335 print STDERR "Getting samples for dataset: $sample->{ 'SECTION' }\n" if $options->{ "verbose" } and not $skip;
1337 $records = Maasha::NCBI::soft_get_sample( $fh, $plat_table, $sample->{ "LINE_BEG" } - $old_end - 1, $sample->{ "LINE_END" } - $old_end - 1, $skip );
1339 foreach $record ( @{ $records } )
1341 Maasha::Biopieces::put_record( $record, $out );
1343 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1348 $old_end = $sample->{ "LINE_END" };
1356 close $data_in if $data_in;
1363 # Martin A. Hansen, February 2008.
1366 # http://www.ncbi.nlm.nih.gov/geo/info/soft2.html
1368 my ( $in, # handle to in stream
1369 $out, # handle to out stream
1370 $options, # options hash
1375 my ( $data_in, $file, $fh, $num, $record, $entry );
1377 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1378 Maasha::Biopieces::put_record( $record, $out );
1383 foreach $file ( @{ $options->{ "files" } } )
1385 $fh = Maasha::Common::read_open( $file );
1387 while ( $entry = Maasha::GFF::get_entry( $fh ) )
1389 Maasha::Biopieces::put_record( $entry, $out );
1391 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1401 close $data_in if $data_in;
1405 sub script_read_2bit
1407 # Martin A. Hansen, March 2008.
1409 # Read sequences from 2bit file.
1411 my ( $in, # handle to in stream
1412 $out, # handle to out stream
1413 $options, # options hash
1418 my ( $record, $file, $data_in, $mask, $toc, $line, $num );
1420 $mask = 1 if not $options->{ "no_mask" };
1422 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1423 Maasha::Biopieces::put_record( $record, $out );
1428 foreach $file ( @{ $options->{ "files" } } )
1430 $data_in = Maasha::Common::read_open( $file );
1432 $toc = Maasha::TwoBit::twobit_get_TOC( $data_in );
1434 foreach $line ( @{ $toc } )
1436 $record->{ "SEQ_NAME" } = $line->[ 0 ];
1437 $record->{ "SEQ" } = Maasha::TwoBit::twobit_get_seq( $data_in, $line->[ 1 ], undef, undef, $mask );
1438 $record->{ "SEQ_LEN" } = length $record->{ "SEQ" };
1440 Maasha::Biopieces::put_record( $record, $out );
1442 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1452 close $data_in if $data_in;
1456 sub script_read_solexa
1458 # Martin A. Hansen, March 2008.
1460 # Read Solexa sequence reads from file.
1462 my ( $in, # handle to in stream
1463 $out, # handle to out stream
1464 $options, # options hash
1469 my ( $record, $file, $data_in, $entry, $num, @seqs, @scores, $i );
1471 $options->{ "format" } ||= "octal";
1472 $options->{ "quality" } ||= 20;
1474 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1475 Maasha::Biopieces::put_record( $record, $out );
1480 foreach $file ( @{ $options->{ "files" } } )
1482 $data_in = Maasha::Common::read_open( $file );
1484 if ( $options->{ "format" } eq "octal" )
1486 while ( $entry = Maasha::Solexa::solexa_get_entry_octal( $data_in ) )
1488 $record = Maasha::Solexa::solexa2biopiece( $entry, $options->{ "quality" } );
1490 Maasha::Biopieces::put_record( $record, $out );
1492 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1499 while ( $entry = Maasha::Solexa::solexa_get_entry_decimal( $data_in ) )
1501 $record = Maasha::Solexa::solexa2biopiece( $entry, $options->{ "quality" } );
1503 Maasha::Biopieces::put_record( $record, $out );
1505 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1516 close $data_in if $data_in;
1520 sub script_read_solid
1522 # Martin A. Hansen, April 2008.
1524 # Read Solid sequence from file.
1526 my ( $in, # handle to in stream
1527 $out, # handle to out stream
1528 $options, # options hash
1533 my ( $record, $file, $data_in, $line, $num, $seq_name, $seq_cs, $seq_qual, @scores, @seqs, $i );
1535 $options->{ "quality" } ||= 15;
1537 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1538 Maasha::Biopieces::put_record( $record, $out );
1543 foreach $file ( @{ $options->{ "files" } } )
1545 $data_in = Maasha::Common::read_open( $file );
1547 while ( $line = <$data_in> )
1551 ( $seq_name, $seq_cs, $seq_qual ) = split /\t/, $line;
1553 @scores = split /,/, $seq_qual;
1554 @seqs = split //, Maasha::Solid::color_space2seq( $seq_cs );
1556 for ( $i = 0; $i < @seqs; $i++ ) {
1557 $seqs[ $i ] = lc $seqs[ $i ] if $scores[ $i ] < $options->{ "quality" };
1561 REC_TYPE => 'SOLID',
1562 SEQ_NAME => $seq_name,
1564 SEQ_QUAL => join( ";", @scores ),
1565 SEQ_LEN => length $seq_cs,
1566 SEQ => join( "", @seqs ),
1567 SCORE_MEAN => sprintf( "%.2f", Maasha::Calc::mean( \@scores ) ),
1570 Maasha::Biopieces::put_record( $record, $out );
1572 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1582 close $data_in if $data_in;
1586 sub script_read_mysql
1588 # Martin A. Hansen, May 2008.
1590 # Read a MySQL query into stream.
1592 my ( $in, # handle to in stream
1593 $out, # handle to out stream
1594 $options, # options hash
1599 my ( $record, $dbh, $results );
1601 $options->{ "user" } ||= Maasha::UCSC::ucsc_get_user();
1602 $options->{ "password" } ||= Maasha::UCSC::ucsc_get_password();
1604 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1605 Maasha::Biopieces::put_record( $record, $out );
1608 $dbh = Maasha::SQL::connect( $options->{ "database" }, $options->{ "user" }, $options->{ "password" } );
1610 $results = Maasha::SQL::query_hashref_list( $dbh, $options->{ "query" } );
1612 Maasha::SQL::disconnect( $dbh );
1614 map { Maasha::Biopieces::put_record( $_ ) } @{ $results };
1618 sub script_read_ucsc_config
1620 # Martin A. Hansen, November 2008.
1622 # Read track entries from UCSC Genome Browser '.ra' files.
1624 my ( $in, # handle to in stream
1625 $out, # handle to out stream
1626 $options, # options hash
1631 my ( $record, $file, $data_in, $entry, $num );
1633 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1634 Maasha::Biopieces::put_record( $record, $out );
1639 foreach $file ( @{ $options->{ "files" } } )
1641 $data_in = Maasha::Common::read_open( $file );
1643 while ( $record = Maasha::UCSC::ucsc_config_entry_get( $data_in ) )
1645 $record->{ 'REC_TYPE' } = "UCSC Config";
1647 Maasha::Biopieces::put_record( $record, $out );
1649 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1659 close $data_in if $data_in;
1663 sub script_length_seq
1665 # Martin A. Hansen, August 2007.
1667 # Determine the length of sequences in stream.
1669 my ( $in, # handle to in stream
1670 $out, # handle to out stream
1671 $options, # options hash
1676 my ( $record, $total );
1678 while ( $record = Maasha::Biopieces::get_record( $in ) )
1680 if ( $record->{ "SEQ" } )
1682 $record->{ "SEQ_LEN" } = length $record->{ "SEQ" };
1683 $total += $record->{ "SEQ_LEN" };
1686 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
1689 Maasha::Biopieces::put_record( { TOTAL_SEQ_LEN => $total }, $out );
1693 sub script_complexity_seq
1695 # Martin A. Hansen, May 2008.
1697 # Generates an index calculated as the most common di-residue over
1698 # the sequence length for all sequences in stream.
1700 my ( $in, # handle to in stream
1701 $out, # handle to out stream
1706 my ( $record, $index );
1708 while ( $record = Maasha::Biopieces::get_record( $in ) )
1710 $record->{ "SEQ_COMPLEXITY" } = sprintf( "%.2f", Maasha::Seq::seq_complexity( $record->{ "SEQ" } ) ) if $record->{ "SEQ" };
1712 Maasha::Biopieces::put_record( $record, $out );
1717 sub script_oligo_freq
1719 # Martin A. Hansen, August 2007.
1721 # Determine the length of sequences in stream.
1723 my ( $in, # handle to in stream
1724 $out, # handle to out stream
1725 $options, # options hash
1730 my ( $record, %oligos, @freq_table );
1732 $options->{ "word_size" } ||= 7;
1734 while ( $record = Maasha::Biopieces::get_record( $in ) )
1736 if ( $record->{ "SEQ" } )
1738 map { $oligos{ $_ }++ } Maasha::Seq::seq2oligos( \$record->{ "SEQ" }, $options->{ "word_size" } );
1740 if ( not $options->{ "all" } )
1742 @freq_table = Maasha::Seq::oligo_freq( \%oligos );
1744 map { Maasha::Biopieces::put_record( $_, $out ) } @freq_table;
1750 Maasha::Biopieces::put_record( $record, $out );
1753 if ( $options->{ "all" } )
1755 @freq_table = Maasha::Seq::oligo_freq( \%oligos );
1757 map { Maasha::Biopieces::put_record( $_, $out ) } @freq_table;
1762 sub script_create_weight_matrix
1764 # Martin A. Hansen, August 2007.
1766 # Creates a weight matrix from an alignmnet.
1768 my ( $in, # handle to in stream
1769 $out, # handle to out stream
1770 $options, # options hash
1775 my ( $record, $count, $i, $res, %freq_hash, %res_hash, $freq );
1779 while ( $record = Maasha::Biopieces::get_record( $in ) )
1781 if ( $record->{ "SEQ" } )
1783 for ( $i = 0; $i < length $record->{ "SEQ" }; $i++ )
1785 $res = substr $record->{ "SEQ" }, $i, 1;
1787 $freq_hash{ $i }{ $res }++;
1788 $res_hash{ $res } = 1;
1795 Maasha::Biopieces::put_record( $record, $out );
1799 foreach $res ( sort keys %res_hash )
1803 $record->{ "V0" } = $res;
1805 for ( $i = 0; $i < keys %freq_hash; $i++ )
1807 $freq = $freq_hash{ $i }{ $res } || 0;
1809 if ( $options->{ "percent" } ) {
1810 $freq = sprintf( "%.0f", 100 * $freq / $count ) if $freq > 0;
1813 $record->{ "V" . ( $i + 1 ) } = $freq;
1816 Maasha::Biopieces::put_record( $record, $out );
1821 sub script_remove_indels
1823 # Martin A. Hansen, August 2007.
1825 # Remove indels from sequences in stream.
1827 my ( $in, # handle to in stream
1828 $out, # handle to out stream
1835 while ( $record = Maasha::Biopieces::get_record( $in ) )
1837 $record->{ 'SEQ' } =~ tr/-~.//d if $record->{ "SEQ" };
1839 Maasha::Biopieces::put_record( $record, $out );
1844 sub script_transliterate_seq
1846 # Martin A. Hansen, August 2007.
1848 # Transliterate chars from sequence in record.
1850 my ( $in, # handle to in stream
1851 $out, # handle to out stream
1852 $options, # options hash
1857 my ( $record, $search, $replace, $delete );
1859 $search = $options->{ "search" } || "";
1860 $replace = $options->{ "replace" } || "";
1861 $delete = $options->{ "delete" } || "";
1863 while ( $record = Maasha::Biopieces::get_record( $in ) )
1865 if ( $record->{ "SEQ" } )
1867 if ( $search and $replace ) {
1868 eval "\$record->{ 'SEQ' } =~ tr/$search/$replace/";
1869 } elsif ( $delete ) {
1870 eval "\$record->{ 'SEQ' } =~ tr/$delete//d";
1874 Maasha::Biopieces::put_record( $record, $out );
1879 sub script_transliterate_vals
1881 # Martin A. Hansen, April 2008.
1883 # Transliterate chars from values in record.
1885 my ( $in, # handle to in stream
1886 $out, # handle to out stream
1887 $options, # options hash
1892 my ( $record, $search, $replace, $delete, $key );
1894 $search = $options->{ "search" } || "";
1895 $replace = $options->{ "replace" } || "";
1896 $delete = $options->{ "delete" } || "";
1898 while ( $record = Maasha::Biopieces::get_record( $in ) )
1900 foreach $key ( @{ $options->{ "keys" } } )
1902 if ( exists $record->{ $key } )
1904 if ( $search and $replace ) {
1905 eval "\$record->{ $key } =~ tr/$search/$replace/";
1906 } elsif ( $delete ) {
1907 eval "\$record->{ $key } =~ tr/$delete//d";
1912 Maasha::Biopieces::put_record( $record, $out );
1917 sub script_translate_seq
1919 # Martin A. Hansen, February 2008.
1921 # Translate DNA sequence into protein sequence.
1923 my ( $in, # handle to in stream
1924 $out, # handle to out stream
1925 $options, # options hash
1930 my ( $record, $frame, %new_record );
1932 $options->{ "frames" } ||= [ 1, 2, 3, -1, -2, -3 ];
1934 while ( $record = Maasha::Biopieces::get_record( $in ) )
1936 if ( $record->{ "SEQ" } )
1938 if ( Maasha::Seq::seq_guess_type( $record->{ "SEQ" } ) eq "dna" )
1940 foreach $frame ( @{ $options->{ "frames" } } )
1942 %new_record = %{ $record };
1944 $new_record{ "SEQ" } = Maasha::Seq::translate( $record->{ "SEQ" }, $frame );
1945 $new_record{ "SEQ_LEN" } = length $new_record{ "SEQ" };
1946 $new_record{ "FRAME" } = $frame;
1948 Maasha::Biopieces::put_record( \%new_record, $out );
1954 Maasha::Biopieces::put_record( $record, $out );
1960 sub script_get_genome_align
1962 # Martin A. Hansen, April 2008.
1964 # Gets a subalignment from a multiple genome alignment.
1966 my ( $in, # handle to in stream
1967 $out, # handle to out stream
1968 $options, # options hash
1973 my ( $record, $maf_track, $align, $align_num, $beg, $end, $len, $entry );
1975 $options->{ "strand" } ||= "+";
1979 $maf_track = Maasha::Config::maf_track( $options->{ "genome" } );
1981 if ( $options->{ "chr" } and $options->{ "beg" } and ( $options->{ "end" } or $options->{ "len" } ) )
1983 $beg = $options->{ "beg" } - 1;
1985 if ( $options->{ "end" } ) {
1986 $end = $options->{ "end" };
1987 } elsif ( $options->{ "len" } ) {
1988 $end = $beg + $options->{ "len" };
1991 $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $options->{ "chr" }, $beg, $end, $options->{ "strand" } );
1993 foreach $entry ( @{ $align } )
1995 $entry->{ "CHR" } = $record->{ "CHR" };
1996 $entry->{ "CHR_BEG" } = $record->{ "CHR_BEG" };
1997 $entry->{ "CHR_END" } = $record->{ "CHR_END" };
1998 $entry->{ "STRAND" } = $record->{ "STRAND" } || '+';
1999 $entry->{ "Q_ID" } = $record->{ "Q_ID" };
2000 $entry->{ "SCORE" } = $record->{ "SCORE" };
2002 Maasha::Biopieces::put_record( $entry, $out );
2006 while ( $record = Maasha::Biopieces::get_record( $in ) )
2008 if ( $record->{ "REC_TYPE" } eq "BED" )
2010 $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "CHR" }, $record->{ "CHR_BEG" }, $record->{ "CHR_END" }, $record->{ "STRAND" } );
2012 elsif ( $record->{ "REC_TYPE" } eq "VMATCH" )
2014 $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" } + 1, $record->{ "STRAND" } );
2016 elsif ( $record->{ "REC_TYPE" } eq "PSL" )
2018 $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $record->{ "STRAND" } );
2020 elsif ( $record->{ "REC_TYPE" } eq "BLAST" )
2022 $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $record->{ "STRAND" } );
2025 foreach $entry ( @{ $align } )
2027 $entry->{ "CHR" } = $record->{ "CHR" };
2028 $entry->{ "CHR_BEG" } = $record->{ "CHR_BEG" };
2029 $entry->{ "CHR_END" } = $record->{ "CHR_END" };
2030 $entry->{ "STRAND" } = $record->{ "STRAND" };
2031 $entry->{ "Q_ID" } = $record->{ "Q_ID" };
2032 $entry->{ "SCORE" } = $record->{ "SCORE" };
2034 Maasha::Biopieces::put_record( $entry, $out );
2042 sub script_get_genome_phastcons
2044 # Martin A. Hansen, February 2008.
2046 # Get phastcons scores from genome intervals.
2048 my ( $in, # handle to in stream
2049 $out, # handle to out stream
2050 $options, # options hash
2055 my ( $phastcons_file, $phastcons_index, $index, $fh_phastcons, $scores, $record );
2057 $options->{ "flank" } ||= 0;
2059 $phastcons_file = Maasha::Config::genome_phastcons( $options->{ "genome" } );
2060 $phastcons_index = Maasha::Config::genome_phastcons_index( $options->{ "genome" } );
2062 $index = Maasha::UCSC::fixedstep_index_retrieve( $phastcons_index );
2063 $fh_phastcons = Maasha::Common::read_open( $phastcons_file );
2065 if ( defined $options->{ "chr" } and defined $options->{ "beg" } and ( defined $options->{ "end" } or defined $options->{ "len" } ) )
2067 $options->{ "beg" } -= 1; # request is 1-based
2068 $options->{ "end" } -= 1; # request is 1-based
2070 if ( $options->{ "len" } ) {
2071 $options->{ "end" } = $options->{ "beg" } + $options->{ "len" } - 1;
2074 $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $options->{ "chr" }, $options->{ "beg" }, $options->{ "end" }, $options->{ "flank" } );
2076 $record->{ "CHR" } = $options->{ "chr" };
2077 $record->{ "CHR_BEG" } = $options->{ "beg" } - $options->{ "flank" };
2078 $record->{ "CHR_END" } = $options->{ "end" } + $options->{ "flank" };
2080 $record->{ "PHASTCONS" } = join ",", @{ $scores };
2081 $record->{ "PHAST_COUNT" } = scalar @{ $scores }; # DEBUG
2083 Maasha::Biopieces::put_record( $record, $out );
2086 while ( $record = Maasha::Biopieces::get_record( $in ) )
2088 if ( $record->{ "REC_TYPE" } eq "BED" )
2090 $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "CHR" }, $record->{ "CHR_BEG" }, $record->{ "CHR_END" }, $options->{ "flank" } );
2092 elsif ( $record->{ "REC_TYPE" } eq "PSL" )
2094 $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $options->{ "flank" } );
2096 elsif ( $record->{ "REC_TYPE" } eq "BLAST" )
2098 $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $options->{ "flank" } );
2101 $record->{ "PHASTCONS" } = join ",", @{ $scores } if @{ $scores };
2102 # $record->{ "PHAST_COUNT" } = @{ $scores } if @{ $scores }; # DEBUG
2104 Maasha::Biopieces::put_record( $record, $out );
2107 close $fh_phastcons if $fh_phastcons;
2113 # Martin A. Hansen, December 2007.
2115 # Folds sequences in stream into secondary structures.
2117 my ( $in, # handle to in stream
2118 $out, # handle to out stream
2123 my ( $record, $type, $struct, $index );
2125 while ( $record = Maasha::Biopieces::get_record( $in ) )
2127 if ( $record->{ "SEQ" } )
2130 $type = Maasha::Seq::seq_guess_type( $record->{ "SEQ" } );
2133 if ( $type ne "protein" )
2135 ( $struct, $index ) = Maasha::Seq::fold_struct_rnafold( $record->{ "SEQ" } );
2136 $record->{ "SEC_STRUCT" } = $struct;
2137 $record->{ "FREE_ENERGY" } = $index;
2138 $record->{ "SCORE" } = abs int $index;
2139 $record->{ "SIZE" } = length $struct;
2140 $record->{ "CONF" } = "1," x $record->{ "SIZE" };
2144 Maasha::Biopieces::put_record( $record, $out );
2151 # Martin A. Hansen, February 2008.
2153 # Using the first sequence in stream as reference, tile
2154 # all subsequent sequences based on pairwise alignments.
2156 my ( $in, # handle to in stream
2157 $out, # handle to out stream
2158 $options, # options hash
2163 my ( $record, $first, $ref_entry, @entries );
2167 while ( $record = Maasha::Biopieces::get_record( $in ) )
2169 if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } )
2173 $ref_entry = [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ];
2179 push @entries, [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ];
2184 Maasha::Biopieces::put_record( $record, $out );
2188 @entries = Maasha::Align::align_tile( $ref_entry, \@entries, $options );
2190 map { Maasha::Biopieces::put_record( { SEQ_NAME => $_->[ SEQ_NAME ], SEQ => $_->[ SEQ ] }, $out ) } @entries;
2194 sub script_invert_align
2196 # Martin A. Hansen, February 2008.
2198 # Inverts an alignment showing only non-mathing residues
2199 # using the first sequence as reference.
2201 my ( $in, # handle to in stream
2202 $out, # handle to out stream
2203 $options, # options hash
2208 my ( $record, @entries );
2210 while ( $record = Maasha::Biopieces::get_record( $in ) )
2212 if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } )
2214 push @entries, [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ];
2218 Maasha::Biopieces::put_record( $record, $out );
2222 Maasha::Align::align_invert( \@entries, $options->{ "soft" } );
2224 map { Maasha::Biopieces::put_record( { SEQ_NAME => $_->[ SEQ_NAME ], SEQ => $_->[ SEQ ] }, $out ) } @entries;
2228 sub script_patscan_seq
2230 # Martin A. Hansen, August 2007.
2232 # Locates patterns in sequences using scan_for_matches.
2234 my ( $in, # handle to in stream
2235 $out, # handle to out stream
2236 $options, # options hash
2241 my ( $genome_file, @args, $arg, $type, $seq_file, $pat_file, $out_file, $fh_in, $fh_out, $record, $patterns, $pattern, $entry, $result, %head_hash, $i );
2243 if ( $options->{ "patterns" } ) {
2244 $patterns = Maasha::Patscan::parse_patterns( $options->{ "patterns" } );
2245 } elsif ( -f $options->{ "patterns_in" } ) {
2246 $patterns = Maasha::Patscan::read_patterns( $options->{ "patterns_in" } );
2249 $genome_file = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/fasta/$options->{ 'genome' }.fna" if $options->{ 'genome' };
2251 push @args, "-c" if $options->{ "comp" };
2252 push @args, "-m $options->{ 'max_hits' }" if $options->{ 'max_hits' };
2253 push @args, "-n $options->{ 'max_misses' }" if $options->{ 'max_hits' };
2255 $seq_file = "$BP_TMP/patscan.seq";
2256 $pat_file = "$BP_TMP/patscan.pat";
2257 $out_file = "$BP_TMP/patscan.out";
2259 $fh_out = Maasha::Common::write_open( $seq_file );
2263 while ( $record = Maasha::Biopieces::get_record( $in ) )
2265 if ( $record->{ "SEQ" } and $record->{ "SEQ_NAME" } )
2267 $type = Maasha::Seq::seq_guess_type( $record->{ "SEQ" } ) if not $type;
2269 Maasha::Fasta::put_entry( [ $i, $record->{ "SEQ" } ], $fh_out );
2271 $head_hash{ $i } = $record->{ "SEQ_NAME" };
2279 $arg = join " ", @args;
2280 $arg .= " -p" if $type eq "protein";
2282 foreach $pattern ( @{ $patterns } )
2284 $fh_out = Maasha::Common::write_open( $pat_file );
2286 print $fh_out "$pattern\n";
2290 if ( $options->{ 'genome' } ) {
2291 `scan_for_matches $arg $pat_file < $genome_file > $out_file`;
2292 # Maasha::Common::run( "scan_for_matches", "$arg $pat_file < $genome_file > $out_file" );
2294 `scan_for_matches $arg $pat_file < $seq_file > $out_file`;
2295 # Maasha::Common::run( "scan_for_matches", "$arg $pat_file < $seq_file > $out_file" );
2298 $fh_in = Maasha::Common::read_open( $out_file );
2300 while ( $entry = Maasha::Fasta::get_entry( $fh_in ) )
2302 $result = Maasha::Patscan::parse_scan_result( $entry, $pattern );
2304 if ( $options->{ 'genome' } )
2306 $result->{ "CHR" } = $result->{ "S_ID" };
2307 $result->{ "CHR_BEG" } = $result->{ "S_BEG" };
2308 $result->{ "CHR_END" } = $result->{ "S_END" };
2310 delete $result->{ "S_ID" };
2311 delete $result->{ "S_BEG" };
2312 delete $result->{ "S_END" };
2316 $result->{ "S_ID" } = $head_hash{ $result->{ "S_ID" } };
2319 Maasha::Biopieces::put_record( $result, $out );
2333 # Martin A. Hansen, July 2008.
2335 # soap sequences in stream against a given file or genome.
2337 my ( $in, # handle to in stream
2338 $out, # handle to out stream
2339 $options, # options hash
2344 my ( $genome, $tmp_in, $tmp_out, $fh_in, $fh_out, $record, $line, @fields, $entry, $count, $args );
2346 $options->{ "seed_size" } ||= 10;
2347 $options->{ "mismatches" } ||= 2;
2348 $options->{ "gap_size" } ||= 0;
2349 $options->{ "cpus" } ||= 1;
2351 if ( $options->{ "genome" } ) {
2352 $options->{ "in_file" } = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/fasta/$options->{ 'genome' }.fna";
2355 $tmp_in = "$BP_TMP/soap_query.seq";
2356 $tmp_out = "$BP_TMP/soap.result";
2358 $fh_out = Maasha::Common::write_open( $tmp_in );
2362 while ( $record = Maasha::Biopieces::get_record( $in ) )
2364 if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) )
2366 Maasha::Fasta::put_entry( $entry, $fh_out );
2371 Maasha::Biopieces::put_record( $record, $out );
2379 "-s $options->{ 'seed_size' }",
2382 "-v $options->{ 'mismatches' }",
2383 "-g $options->{ 'gap_size' }",
2384 "-p $options->{ 'cpus' }",
2385 "-d $options->{ 'in_file' }",
2389 $args .= " > /dev/null 2>&1" if not $options->{ 'verbose' };
2391 Maasha::Common::run( "soap", $args, 1 );
2395 $fh_out = Maasha::Common::read_open( $tmp_out );
2399 while ( $line = <$fh_out> )
2403 @fields = split /\t/, $line;
2405 $record->{ "REC_TYPE" } = "SOAP";
2406 $record->{ "Q_ID" } = $fields[ 0 ];
2407 $record->{ "SCORE" } = $fields[ 3 ];
2408 $record->{ "STRAND" } = $fields[ 6 ];
2409 $record->{ "S_ID" } = $fields[ 7 ];
2410 $record->{ "S_BEG" } = $fields[ 8 ] - 1; # soap is 1-based
2411 $record->{ "S_END" } = $fields[ 8 ] + $fields[ 5 ] - 2;
2413 Maasha::Biopieces::put_record( $record, $out );
2423 sub script_match_seq
2425 # Martin A. Hansen, August 2007.
2427 # BLATs sequences in stream against a given genome.
2429 my ( $in, # handle to in stream
2430 $out, # handle to out stream
2431 $options, # options hash
2436 my ( $record, @entries, $results );
2438 $options->{ "word_size" } ||= 20;
2439 $options->{ "direction" } ||= "both";
2441 while ( $record = Maasha::Biopieces::get_record( $in ) )
2443 if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) {
2444 push @entries, [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ];
2447 Maasha::Biopieces::put_record( $record, $out );
2450 if ( @entries == 1 )
2452 $results = Maasha::Match::match_mummer( [ $entries[ 0 ] ], [ $entries[ 0 ] ], $options, $BP_TMP );
2454 map { Maasha::Biopieces::put_record( $_, $out ) } @{ $results };
2456 elsif ( @entries == 2 )
2458 $results = Maasha::Match::match_mummer( [ $entries[ 0 ] ], [ $entries[ 1 ] ], $options, $BP_TMP );
2460 map { Maasha::Biopieces::put_record( $_, $out ) } @{ $results };
2465 sub script_create_vmatch_index
2467 # Martin A. Hansen, January 2008.
2469 # Create a vmatch index from sequences in the stream.
2471 my ( $in, # handle to in stream
2472 $out, # handle to out stream
2473 $options, # options hash
2478 my ( $record, $file_tmp, $fh_tmp, $type, $entry );
2480 if ( $options->{ "index_name" } )
2482 $file_tmp = $options->{ 'index_name' };
2483 $fh_tmp = Maasha::Common::write_open( $file_tmp );
2486 while ( $record = Maasha::Biopieces::get_record( $in ) )
2488 if ( $options->{ "index_name" } and $entry = Maasha::Fasta::biopiece2fasta( $record ) )
2490 Maasha::Fasta::put_entry( $entry, $fh_tmp );
2492 $type = Maasha::Seq::seq_guess_type( $entry->[ SEQ ] ) if not defined $type;
2495 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
2498 if ( $options->{ "index_name" } )
2502 if ( $type eq "protein" ) {
2503 Maasha::Common::run( "mkvtree", "-db $file_tmp -protein -pl $options->{ 'prefix_length' } -allout -indexname $file_tmp > /dev/null 2>&1" );
2505 Maasha::Common::run( "mkvtree", "-db $file_tmp -dna -pl $options->{ 'prefix_length' } -allout -indexname $file_tmp > /dev/null 2>&1" );
2513 sub script_write_bed
2515 # Martin A. Hansen, August 2007.
2517 # Write BED format for the UCSC genome browser using records in stream.
2519 my ( $in, # handle to in stream
2520 $out, # handle to out stream
2521 $options, # options hash
2526 my ( $cols, $fh, $record, $bed_entry, $new_record );
2528 $cols = $options->{ 'cols' }->[ 0 ];
2530 $fh = Maasha::Biopieces::write_stream( $options->{ 'data_out' }, $options->{ 'compress' } );
2532 while ( $record = Maasha::Biopieces::get_record( $in ) )
2534 $record = Maasha::UCSC::psl2record( $record ) if $record->{ 'tBaseInsert' }; # Dirty addition to allow Affy data from MySQL to be dumped
2536 if ( $bed_entry = Maasha::UCSC::BED::biopiece2bed( $record, $cols ) ) {
2537 Maasha::UCSC::BED::bed_entry_put( $bed_entry, $fh, $cols, $options->{ 'check' } );
2540 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ 'no_stream' };
2547 sub script_write_psl
2549 # Martin A. Hansen, August 2007.
2551 # Write PSL output from stream.
2553 my ( $in, # handle to in stream
2554 $out, # handle to out stream
2555 $options, # options hash
2560 my ( $fh, $record, @output, $first );
2564 $fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } );
2566 while ( $record = Maasha::Biopieces::get_record( $in ) )
2568 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
2570 if ( $record->{ "REC_TYPE" } and $record->{ "REC_TYPE" } eq "PSL" )
2572 Maasha::UCSC::psl_put_header( $fh ) if $first;
2573 Maasha::UCSC::psl_put_entry( $record, $fh );
2582 sub script_write_fixedstep
2584 # Martin A. Hansen, Juli 2008.
2586 # Write fixedStep entries from recrods in the stream.
2588 my ( $in, # handle to in stream
2589 $out, # handle to out stream
2590 $options, # options hash
2595 my ( $fh, $record, $entry );
2597 $fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } );
2599 while ( $record = Maasha::Biopieces::get_record( $in ) )
2601 if ( $entry = Maasha::UCSC::Wiggle::biopiece2fixedstep( $record ) ) {
2602 Maasha::UCSC::Wiggle::fixedstep_entry_put( $entry, $fh );
2605 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
2612 sub script_write_2bit
2614 # Martin A. Hansen, March 2008.
2616 # Write sequence entries from stream in 2bit format.
2618 my ( $in, # handle to in stream
2619 $out, # handle to out stream
2620 $options, # options hash
2625 my ( $record, $mask, $tmp_file, $fh_tmp, $fh_in, $fh_out, $entry );
2627 $mask = 1 if not $options->{ "no_mask" };
2629 $tmp_file = "$BP_TMP/write_2bit.fna";
2630 $fh_tmp = Maasha::Common::write_open( $tmp_file );
2632 $fh_out = write_stream( $options->{ "data_out" } );
2634 while ( $record = Maasha::Biopieces::get_record( $in ) )
2636 if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) {
2637 Maasha::Fasta::put_entry( $entry, $fh_tmp );
2640 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
2645 $fh_in = Maasha::Common::read_open( $tmp_file );
2647 Maasha::TwoBit::fasta2twobit( $fh_in, $fh_out, $mask );
2656 sub script_write_solid
2658 # Martin A. Hansen, April 2008.
2660 # Write di-base encoded Solid sequence from entries in stream.
2662 my ( $in, # handle to in stream
2663 $out, # handle to out stream
2664 $options, # options hash
2669 my ( $record, $fh, $entry );
2671 $fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } );
2673 while ( $record = Maasha::Biopieces::get_record( $in ) )
2675 if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) )
2677 $entry->[ SEQ ] = Maasha::Solid::seq2color_space( uc $entry->[ SEQ ] );
2679 Maasha::Fasta::put_entry( $entry, $fh, $options->{ "wrap" } );
2682 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
2689 sub script_write_ucsc_config
2691 # Martin A. Hansen, November 2008.
2693 # Write UCSC Genome Broser configuration (.ra file type) from
2694 # records in the stream.
2696 my ( $in, # handle to in stream
2697 $out, # handle to out stream
2698 $options, # options hash
2703 my ( $record, $fh );
2705 $fh = write_stream( $options->{ "data_out" } );
2707 while ( $record = Maasha::Biopieces::get_record( $in ) )
2709 Maasha::UCSC::ucsc_config_entry_put( $record, $fh ) if $record->{ "REC_TYPE" } eq "UCSC Config";
2711 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
2718 sub script_plot_seqlogo
2720 # Martin A. Hansen, August 2007.
2722 # Calculates and writes a sequence logo for alignments.
2724 my ( $in, # handle to in stream
2725 $out, # handle to out stream
2726 $options, # options hash
2731 my ( $record, @entries, $logo, $fh );
2733 while ( $record = Maasha::Biopieces::get_record( $in ) )
2735 if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) {
2736 push @entries, [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ];
2739 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
2742 $logo = Maasha::Plot::seq_logo( \@entries );
2744 $fh = write_stream( $options->{ "data_out" } );
2752 sub script_plot_phastcons_profiles
2754 # Martin A. Hansen, January 2008.
2756 # Plots PhastCons profiles.
2758 my ( $in, # handle to in stream
2759 $out, # handle to out stream
2760 $options, # options hash
2765 my ( $phastcons_file, $phastcons_index, $index, $fh_phastcons, $record, $scores, $AoA, $plot, $fh );
2767 $options->{ "title" } ||= "PhastCons Profiles";
2769 $phastcons_file = Maasha::Config::genome_phastcons( $options->{ "genome" } );
2770 $phastcons_index = Maasha::Config::genome_phastcons_index( $options->{ "genome" } );
2772 $index = Maasha::UCSC::fixedstep_index_retrieve( $phastcons_index );
2773 $fh_phastcons = Maasha::Common::read_open( $phastcons_file );
2775 while ( $record = Maasha::Biopieces::get_record( $in ) )
2777 if ( $record->{ "CHR" } and $record->{ "CHR_BEG" } and $record->{ "CHR_END" } )
2779 $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "CHR" },
2780 $record->{ "CHR_BEG" },
2781 $record->{ "CHR_END" },
2782 $options->{ "flank" } );
2784 push @{ $AoA }, [ @{ $scores } ];
2787 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
2790 Maasha::UCSC::phastcons_normalize( $AoA );
2792 $AoA = [ [ Maasha::UCSC::phastcons_mean( $AoA ) ] ] if $options->{ "mean" };
2793 $AoA = [ [ Maasha::UCSC::phastcons_median( $AoA ) ] ] if $options->{ "median" };
2795 $AoA = Maasha::Matrix::matrix_flip( $AoA );
2797 $plot = Maasha::Plot::lineplot_simple( $AoA, $options, $BP_TMP );
2799 $fh = write_stream( $options->{ "data_out" } );
2801 print $fh "$_\n" foreach @{ $plot };
2807 sub script_head_records
2809 # Martin A. Hansen, August 2007.
2811 # Display the first sequences in stream.
2813 my ( $in, # handle to in stream
2814 $out, # handle to out stream
2815 $options, # options hash
2820 my ( $record, $count );
2822 $options->{ "num" } ||= 10;
2826 while ( $record = Maasha::Biopieces::get_record( $in ) )
2830 Maasha::Biopieces::put_record( $record, $out );
2832 last if $count == $options->{ "num" };
2837 sub script_remove_keys
2839 # Martin A. Hansen, August 2007.
2841 # Remove keys from stream.
2843 my ( $in, # handle to in stream
2844 $out, # handle to out stream
2845 $options, # options hash
2850 my ( $record, $new_record );
2852 while ( $record = Maasha::Biopieces::get_record( $in ) )
2854 if ( $options->{ "keys" } )
2856 map { delete $record->{ $_ } } @{ $options->{ "keys" } };
2858 elsif ( $options->{ "save_keys" } )
2860 map { $new_record->{ $_ } = $record->{ $_ } if exists $record->{ $_ } } @{ $options->{ "save_keys" } };
2862 $record = $new_record;
2865 Maasha::Biopieces::put_record( $record, $out ) if keys %{ $record };
2870 sub script_remove_adaptor
2872 # Martin A. Hansen, August 2008.
2874 # Find and remove adaptor from sequences in the stream.
2876 my ( $in, # handle to in stream
2877 $out, # handle to out stream
2878 $options, # options hash
2883 my ( $record, $adaptor, $seq, $adaptor_len, $seq_len, $offset, $max_match, $max_mismatch, $pos );
2885 $options->{ "remove" } ||= "after";
2887 $max_mismatch = $options->{ "mismatches" } || 0;
2888 $offset = $options->{ "offset" };
2890 if ( not defined $offset ) {
2896 $adaptor = uc $options->{ "adaptor" };
2897 $adaptor_len = length $adaptor;
2899 while ( $record = Maasha::Biopieces::get_record( $in ) )
2901 if ( $record->{ "SEQ" } )
2903 $seq = uc $record->{ "SEQ" };
2904 $seq_len = length $seq;
2906 $pos = Maasha::Common::index_m( $seq, $adaptor, $seq_len, $adaptor_len, $offset, $max_mismatch );
2908 $record->{ "ADAPTOR_POS" } = $pos;
2910 if ( $pos >= 0 and $options->{ "remove" } ne "skip" )
2912 if ( $options->{ "remove" } eq "after" )
2914 $record->{ "SEQ" } = substr $record->{ "SEQ" }, 0, $pos;
2915 $record->{ "SEQ_LEN" } = $pos;
2919 $record->{ "SEQ" } = substr $record->{ "SEQ" }, $pos + $adaptor_len;
2920 $record->{ "SEQ_LEN" } = length $record->{ "SEQ" };
2924 Maasha::Biopieces::put_record( $record, $out );
2928 Maasha::Biopieces::put_record( $record, $out );
2934 sub script_remove_mysql_tables
2936 # Martin A. Hansen, November 2008.
2938 # Remove MySQL tables from values in stream.
2940 my ( $in, # handle to in stream
2941 $out, # handle to out stream
2942 $options, # options hash
2947 my ( $record, %table_hash, $dbh, $table );
2949 $options->{ "user" } ||= Maasha::UCSC::ucsc_get_user();
2950 $options->{ "password" } ||= Maasha::UCSC::ucsc_get_password();
2952 map { $table_hash{ $_ } = 1 } @{ $options->{ 'tables' } };
2954 while ( $record = Maasha::Biopieces::get_record( $in ) )
2956 map { $table_hash{ $record->{ $_ } } = 1 } @{ $options->{ 'keys' } };
2958 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ 'no_stream' };
2961 $dbh = Maasha::SQL::connect( $options->{ "database" }, $options->{ "user" }, $options->{ "password" } );
2963 foreach $table ( sort keys %table_hash )
2965 if ( Maasha::SQL::table_exists( $dbh, $table ) )
2967 print STDERR qq(Removing table "$table" from database "$options->{ 'database' }" ... ) if $options->{ 'verbose' };
2968 Maasha::SQL::delete_table( $dbh, $table );
2969 print STDERR "done.\n" if $options->{ 'verbose' };
2973 print STDERR qq(WARNING: table "$table" not found in database "$options->{ 'database' }\n");
2977 Maasha::SQL::disconnect( $dbh );
2981 sub script_remove_ucsc_tracks
2983 # Martin A. Hansen, November 2008.
2985 # Remove track from MySQL tables and config file.
2987 my ( $in, # handle to in stream
2988 $out, # handle to out stream
2989 $options, # options hash
2994 my ( $record, %track_hash, $fh_in, $fh_out, $track, @tracks, @new_tracks, $dbh );
2996 $options->{ 'user' } ||= Maasha::UCSC::ucsc_get_user();
2997 $options->{ 'password' } ||= Maasha::UCSC::ucsc_get_password();
2998 $options->{ 'config_file' } ||= "$ENV{ 'HOME' }/ucsc/my_tracks.ra";
3000 map { $track_hash{ $_ } = 1 } @{ $options->{ 'tracks' } };
3002 while ( $record = Maasha::Biopieces::get_record( $in ) )
3004 map { $track_hash{ $record->{ $_ } } = 1 } @{ $options->{ 'keys' } };
3006 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ 'no_stream' };
3009 $fh_in = Maasha::Filesys::file_read_open( $options->{ 'config_file' } );
3011 while ( $track = Maasha::UCSC::ucsc_config_entry_get( $fh_in ) ) {
3012 push @tracks, $track;
3017 foreach $track ( @tracks )
3019 if ( $track->{ 'database' } eq $options->{ 'database' } and exists $track_hash{ $track->{ 'track' } } ) {
3020 print STDERR qq(Removing track "$track->{ 'track' }" from config file.\n) if $options->{ 'verbose' };
3022 push @new_tracks, $track;
3026 rename "$options->{ 'config_file' }", "$options->{ 'config_file' }~";
3028 $fh_out = Maasha::Filesys::file_write_open( $options->{ 'config_file' } );
3030 map { Maasha::UCSC::ucsc_config_entry_put( $_, $fh_out ) } @new_tracks;
3034 # ---- locate track in database ----
3036 $dbh = Maasha::SQL::connect( $options->{ "database" }, $options->{ "user" }, $options->{ "password" } );
3038 foreach $track ( sort keys %track_hash )
3040 if ( Maasha::SQL::table_exists( $dbh, $track ) )
3042 print STDERR qq(Removing table "$track" from database "$options->{ 'database' }" ... ) if $options->{ 'verbose' };
3043 Maasha::SQL::delete_table( $dbh, $track );
3044 print STDERR "done.\n" if $options->{ 'verbose' };
3048 print STDERR qq(WARNING: table "$track" not found in database "$options->{ 'database' }\n");
3052 Maasha::SQL::disconnect( $dbh );
3054 Maasha::Common::run( "ucscMakeTracks.pl", "-b > /dev/null 2>&1" );
3058 sub script_rename_keys
3060 # Martin A. Hansen, August 2007.
3062 # Rename keys in stream.
3064 my ( $in, # handle to in stream
3065 $out, # handle to out stream
3066 $options, # options hash
3073 while ( $record = Maasha::Biopieces::get_record( $in ) )
3075 if ( exists $record->{ $options->{ "keys" }->[ 0 ] } )
3077 $record->{ $options->{ "keys" }->[ 1 ] } = $record->{ $options->{ "keys" }->[ 0 ] };
3079 delete $record->{ $options->{ "keys" }->[ 0 ] };
3082 Maasha::Biopieces::put_record( $record, $out );
3087 sub script_uniq_vals
3089 # Martin A. Hansen, August 2007.
3091 # Find unique values in stream.
3093 my ( $in, # handle to in stream
3094 $out, # handle to out stream
3095 $options, # options hash
3100 my ( %hash, $record );
3102 while ( $record = Maasha::Biopieces::get_record( $in ) )
3104 if ( $record->{ $options->{ "key" } } )
3106 if ( not $hash{ $record->{ $options->{ "key" } } } and not $options->{ "invert" } )
3108 Maasha::Biopieces::put_record( $record, $out );
3110 $hash{ $record->{ $options->{ "key" } } } = 1;
3112 elsif ( $hash{ $record->{ $options->{ "key" } } } and $options->{ "invert" } )
3114 Maasha::Biopieces::put_record( $record, $out );
3118 $hash{ $record->{ $options->{ "key" } } } = 1;
3123 Maasha::Biopieces::put_record( $record, $out );
3129 sub script_merge_vals
3131 # Martin A. Hansen, August 2007.
3133 # Rename keys in stream.
3135 my ( $in, # handle to in stream
3136 $out, # handle to out stream
3137 $options, # options hash
3142 my ( $record, @join, $i );
3144 $options->{ "delimit" } ||= '_';
3146 while ( $record = Maasha::Biopieces::get_record( $in ) )
3148 if ( exists $record->{ $options->{ "keys" }->[ 0 ] } )
3150 @join = $record->{ $options->{ "keys" }->[ 0 ] };
3152 for ( $i = 1; $i < @{ $options->{ "keys" } }; $i++ ) {
3153 push @join, $record->{ $options->{ "keys" }->[ $i ] } if exists $record->{ $options->{ "keys" }->[ $i ] };
3156 $record->{ $options->{ "keys" }->[ 0 ] } = join $options->{ "delimit" }, @join;
3159 Maasha::Biopieces::put_record( $record, $out );
3164 sub script_merge_records
3166 # Martin A. Hansen, July 2008.
3168 # Merges records in the stream based on identical values of two given keys.
3170 my ( $in, # handle to in stream
3171 $out, # handle to out stream
3172 $options, # options hash
3177 my ( $merge, $record, $file1, $file2, $fh1, $fh2, $key1, $key2, @keys1, @keys2, @vals1, @vals2,
3178 $num1, $num2, $num, $cmp, $i );
3180 $merge = $options->{ "merge" } || "AandB";
3182 $file1 = "$BP_TMP/merge_records1.tmp";
3183 $file2 = "$BP_TMP/merge_records2.tmp";
3185 $fh1 = Maasha::Common::write_open( $file1 );
3186 $fh2 = Maasha::Common::write_open( $file2 );
3188 $key1 = $options->{ "keys" }->[ 0 ];
3189 $key2 = $options->{ "keys" }->[ 1 ];
3191 $num = $key2 =~ s/n$//;
3195 while ( $record = Maasha::Biopieces::get_record( $in ) )
3197 if ( exists $record->{ $key1 } )
3200 @vals1 = $record->{ $key1 };
3202 delete $record->{ $key1 };
3204 map { push @keys1, $_; push @vals1, $record->{ $_ } } keys %{ $record };
3206 print $fh1 join( "\t", @vals1 ), "\n";
3210 elsif ( exists $record->{ $key2 } )
3213 @vals2 = $record->{ $key2 };
3215 delete $record->{ $key2 };
3217 map { push @keys2, $_; push @vals2, $record->{ $_ } } keys %{ $record };
3219 print $fh2 join( "\t", @vals2 ), "\n";
3230 Maasha::Common::run( "sort", "-k 1,1n $file1 > $file1.sort" ) and rename "$file1.sort", $file1;
3231 Maasha::Common::run( "sort", "-k 1,1n $file2 > $file2.sort" ) and rename "$file2.sort", $file2;
3235 Maasha::Common::run( "sort", "-k 1,1 $file1 > $file1.sort" ) and rename "$file1.sort", $file1;
3236 Maasha::Common::run( "sort", "-k 1,1 $file2 > $file2.sort" ) and rename "$file2.sort", $file2;
3239 $fh1 = Maasha::Common::read_open( $file1 );
3240 $fh2 = Maasha::Common::read_open( $file2 );
3242 @vals1 = Maasha::Common::get_fields( $fh1 );
3243 @vals2 = Maasha::Common::get_fields( $fh2 );
3245 while ( $num1 > 0 and $num2 > 0 )
3250 $cmp = $vals1[ 0 ] <=> $vals2[ 0 ];
3252 $cmp = $vals1[ 0 ] cmp $vals2[ 0 ];
3257 if ( $merge =~ /^(AorB|AnotB)$/ )
3259 for ( $i = 0; $i < @keys1; $i++ ) {
3260 $record->{ $keys1[ $i ] } = $vals1[ $i ];
3263 Maasha::Biopieces::put_record( $record, $out );
3266 @vals1 = Maasha::Common::get_fields( $fh1 );
3271 if ( $merge =~ /^(BorA|BnotA)$/ )
3273 for ( $i = 0; $i < @keys2; $i++ ) {
3274 $record->{ $keys2[ $i ] } = $vals2[ $i ];
3277 Maasha::Biopieces::put_record( $record, $out );
3280 @vals2 = Maasha::Common::get_fields( $fh2 );
3285 if ( $merge =~ /^(AandB|AorB|BorA)$/ )
3287 for ( $i = 0; $i < @keys1; $i++ ) {
3288 $record->{ $keys1[ $i ] } = $vals1[ $i ];
3291 for ( $i = 1; $i < @keys2; $i++ ) {
3292 $record->{ $keys2[ $i ] } = $vals2[ $i ];
3295 Maasha::Biopieces::put_record( $record, $out );
3298 @vals1 = Maasha::Common::get_fields( $fh1 );
3299 @vals2 = Maasha::Common::get_fields( $fh2 );
3311 if ( $num1 > 0 and $merge =~ /^(AorB|AnotB)$/ )
3315 for ( $i = 0; $i < @keys1; $i++ ) {
3316 $record->{ $keys1[ $i ] } = $vals1[ $i ];
3319 Maasha::Biopieces::put_record( $record, $out );
3322 if ( $num2 > 0 and $merge =~ /^(BorA|BnotA)$/ )
3326 for ( $i = 0; $i < @keys2; $i++ ) {
3327 $record->{ $keys2[ $i ] } = $vals2[ $i ];
3330 Maasha::Biopieces::put_record( $record, $out );
3337 # Martin A. Hansen, August 2007.
3339 # Evaluate extression for records in stream.
3341 my ( $in, # handle to in stream
3342 $out, # handle to out stream
3343 $options, # options hash
3348 my ( $record, $eval_key, @keys, $eval_val );
3350 while ( $record = Maasha::Biopieces::get_record( $in ) )
3352 if ( $options->{ "eval" } )
3354 if ( $options->{ "eval" } =~ /^(\S+)\s*=\s*(.+)$/ )
3361 @keys = split /\s+|\+|-|\*|\/|\*\*/, $eval_val;
3363 @keys = grep { exists $record->{ $_ } } @keys;
3366 map { $eval_val =~ s/\Q$_\E/$record->{ $_ }/g } @keys;
3368 $record->{ $eval_key } = eval "$eval_val";
3369 Maasha::Common::error( qq(eval "$eval_key = $eval_val" failed -> $@) ) if $@;
3373 Maasha::Common::error( qq(Bad compute expression: "$options->{ 'eval' }"\n) );
3377 Maasha::Biopieces::put_record( $record, $out );
3384 # Martin A. Hansen, June 2008.
3388 my ( $in, # handle to in stream
3389 $out, # handle to out stream
3390 $options, # options hash
3395 my ( $record, $key, $A, $B, @rows, @matrix, $row, $i );
3397 while ( $record = Maasha::Biopieces::get_record( $in ) )
3401 foreach $key ( sort { $A = $a; $B = $b; $A =~ s/^V(\d+)$/$1/; $B =~ s/^V(\d+)$/$1/; $A <=> $B } keys %{ $record } )
3403 push @rows, $record->{ $key };
3407 push @matrix, [ @rows ];
3412 @matrix = Maasha::Matrix::matrix_flip( \@matrix );
3414 foreach $row ( @matrix )
3416 for ( $i = 0; $i < @{ $row }; $i++ ) {
3417 $record->{ "V$i" } = $row->[ $i ];
3420 Maasha::Biopieces::put_record( $record, $out );
3425 sub script_sort_records
3427 # Martin A. Hansen, August 2007.
3429 # Sort to sort records according to keys.
3431 my ( $in, # handle to in stream
3432 $out, # handle to out stream
3433 $options, # options hash
3438 my ( @keys, $key, @sort_cmd, $sort_str, $sort_sub, @records, $record, $i );
3440 foreach $key ( @{ $options->{ "keys" } } )
3442 if ( $key =~ s/n$// ) {
3443 push @sort_cmd, qq(\$a->{ "$key" } <=> \$b->{ "$key" });
3445 push @sort_cmd, qq(\$a->{ "$key" } cmp \$b->{ "$key" });
3449 $sort_str = join " or ", @sort_cmd;
3450 $sort_sub = eval "sub { $sort_str }"; # NB security issue!
3452 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
3453 push @records, $record;
3456 @records = sort $sort_sub @records;
3458 if ( $options->{ "reverse" } )
3460 for ( $i = scalar @records - 1; $i >= 0; $i-- ) {
3461 Maasha::Biopieces::put_record( $records[ $i ], $out );
3466 for ( $i = 0; $i < scalar @records; $i++ ) {
3467 Maasha::Biopieces::put_record( $records[ $i ], $out );
3473 sub script_count_vals
3475 # Martin A. Hansen, August 2007.
3477 # Count records in stream.
3479 my ( $in, # handle to in stream
3480 $out, # handle to out stream
3481 $options, # options hash
3486 my ( $num, $record, %count_hash, @records, $tmp_file, $fh_out, $fh_in, $cache );
3488 $tmp_file = "$BP_TMP/count_cache.tmp";
3490 $fh_out = Maasha::Common::write_open( $tmp_file );
3495 while ( $record = Maasha::Biopieces::get_record( $in ) )
3497 map { $count_hash{ $_ }{ $record->{ $_ } }++ if exists $record->{ $_ } } @{ $options->{ "keys" } };
3499 push @records, $record;
3501 if ( scalar @records > 5_000_000 ) # too many records to hold in memory - use disk cache
3503 map { Maasha::Biopieces::put_record( $_, $fh_out ) } @records;
3510 print STDERR "verbose: records read $num\n" if ( $options->{ 'verbose' } and ( $num % 1_000_000 ) == 0 );
3521 $fh_in = Maasha::Common::read_open( $tmp_file );
3523 while ( $record = Maasha::Biopieces::get_record( $fh_in ) )
3525 map { $record->{ $_ . "_COUNT" } = $count_hash{ $_ }{ $record->{ $_ } } if exists $record->{ $_ } } @{ $options->{ "keys" } };
3527 Maasha::Biopieces::put_record( $record, $out );
3529 print STDERR "verbose: cache read $num\n" if ( $options->{ 'verbose' } and ( $num % 1_000_000 ) == 0 );
3537 foreach $record ( @records )
3539 map { $record->{ $_ . "_COUNT" } = $count_hash{ $_ }{ $record->{ $_ } } if exists $record->{ $_ } } @{ $options->{ "keys" } };
3541 Maasha::Biopieces::put_record( $record, $out );
3548 sub script_plot_histogram
3550 # Martin A. Hansen, September 2007.
3552 # Plot a simple histogram for a given key using GNU plot.
3554 my ( $in, # handle to in stream
3555 $out, # handle to out stream
3556 $options, # options hash
3561 my ( $record, %data_hash, $max, @data_list, $i, $result, $fh );
3563 $options->{ "title" } ||= "Histogram";
3564 $options->{ "sort" } ||= "num";
3566 while ( $record = Maasha::Biopieces::get_record( $in ) )
3568 $data_hash{ $record->{ $options->{ "key" } } }++ if defined $record->{ $options->{ "key" } };
3570 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
3573 if ( $options->{ "sort" } eq "num" ) {
3574 map { push @data_list, [ $_, $data_hash{ $_ } ] } sort { $a <=> $b } keys %data_hash;
3576 map { push @data_list, [ $_, $data_hash{ $_ } ] } sort keys %data_hash;
3579 $result = Maasha::Plot::histogram_simple( \@data_list, $options );
3581 $fh = write_stream( $options->{ "data_out" } );
3583 print $fh "$_\n" foreach @{ $result };
3589 sub script_plot_lendist
3591 # Martin A. Hansen, August 2007.
3593 # Plot length distribution using GNU plot.
3595 my ( $in, # handle to in stream
3596 $out, # handle to out stream
3597 $options, # options hash
3602 my ( $record, %data_hash, $max, @data_list, $i, $result, $fh );
3604 $options->{ "title" } ||= "Length Distribution";
3606 while ( $record = Maasha::Biopieces::get_record( $in ) )
3608 $data_hash{ $record->{ $options->{ "key" } } }++ if defined $record->{ $options->{ "key" } };
3610 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
3613 $max = Maasha::Calc::list_max( [ keys %data_hash ] );
3615 for ( $i = 0; $i < $max; $i++ ) {
3616 push @data_list, [ $i, $data_hash{ $i } || 0 ];
3619 $result = Maasha::Plot::histogram_lendist( \@data_list, $options );
3621 $fh = write_stream( $options->{ "data_out" } );
3623 print $fh "$_\n" foreach @{ $result };
3629 sub script_plot_chrdist
3631 # Martin A. Hansen, August 2007.
3633 # Plot chromosome distribution using GNU plot.
3635 my ( $in, # handle to in stream
3636 $out, # handle to out stream
3637 $options, # options hash
3642 my ( $record, %data_hash, @data_list, $elem, $sort_key, $count, $result, $fh );
3644 $options->{ "title" } ||= "Chromosome Distribution";
3646 while ( $record = Maasha::Biopieces::get_record( $in ) )
3648 if ( $record->{ "CHR" } ) { # generic
3649 $data_hash{ $record->{ "CHR" } }++;
3650 } elsif ( $record->{ "REC_TYPE" } eq "PATSCAN" and $record->{ "S_ID" } =~ /^chr/i ) { # patscan
3651 $data_hash{ $record->{ "S_ID" } }++;
3652 } elsif ( $record->{ "REC_TYPE" } eq "PSL" and $record->{ "S_ID" } =~ /^chr/i ) { # BLAT / PSL
3653 $data_hash{ $record->{ "S_ID" } }++;
3654 } elsif ( $record->{ "REC_TYPE" } eq "BLAST" and $record->{ "S_ID" } =~ /^chr/i ) { # BLAST
3655 $data_hash{ $record->{ "S_ID" } }++;
3658 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
3661 foreach $elem ( keys %data_hash )
3665 $sort_key =~ s/chr//i;
3667 $sort_key =~ s/^X(.*)/99$1/;
3668 $sort_key =~ s/^Y(.*)/99$1/;
3669 $sort_key =~ s/^Z(.*)/999$1/;
3670 $sort_key =~ s/^M(.*)/9999$1/;
3671 $sort_key =~ s/^U(.*)/99999$1/;
3673 $count = $sort_key =~ tr/_//;
3675 $sort_key =~ s/_.*/"999999" x $count/ex;
3677 push @data_list, [ $elem, $data_hash{ $elem }, $sort_key ];
3680 @data_list = sort { $a->[ 2 ] <=> $b->[ 2 ] } @data_list;
3682 $result = Maasha::Plot::histogram_chrdist( \@data_list, $options );
3684 $fh = write_stream( $options->{ "data_out" } );
3686 print $fh "$_\n" foreach @{ $result };
3692 sub script_plot_karyogram
3694 # Martin A. Hansen, August 2007.
3696 # Plot hits on karyogram.
3698 my ( $in, # handle to in stream
3699 $out, # handle to out stream
3700 $options, # options hash
3705 my ( %options, $record, @data, $fh, $result, %data_hash );
3707 $options->{ "genome" } ||= "human";
3708 $options->{ "feat_color" } ||= "black";
3710 while ( $record = Maasha::Biopieces::get_record( $in ) )
3712 if ( $record->{ "CHR" } and $record->{ "CHR_BEG" } and $record->{ "CHR_END" } )
3714 push @{ $data_hash{ $record->{ "CHR" } } }, [ $record->{ "CHR_BEG" }, $record->{ "CHR_END" }, $options->{ "feat_color" } ];
3717 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
3720 $result = Maasha::Plot::karyogram( \%data_hash, \%options );
3722 $fh = write_stream( $options->{ "data_out" } );
3730 sub script_plot_matches
3732 # Martin A. Hansen, August 2007.
3734 # Plot matches in 2D generating a dotplot.
3736 my ( $in, # handle to in stream
3737 $out, # handle to out stream
3738 $options, # options hash
3743 my ( $record, @data, $fh, $result, %data_hash );
3745 $options->{ "direction" } ||= "both";
3747 while ( $record = Maasha::Biopieces::get_record( $in ) )
3749 if ( defined $record->{ "Q_BEG" } and defined $record->{ "S_BEG" } and $record->{ "Q_END" } and $record->{ "S_END" } ) {
3750 push @data, $record;
3753 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
3756 $options->{ "title" } ||= "plot_matches";
3757 $options->{ "xlabel" } ||= $data[ 0 ]->{ "Q_ID" };
3758 $options->{ "ylabel" } ||= $data[ 0 ]->{ "S_ID" };
3760 $result = Maasha::Plot::dotplot_matches( \@data, $options, $BP_TMP );
3762 $fh = write_stream( $options->{ "data_out" } );
3764 print $fh "$_\n" foreach @{ $result };
3770 sub script_length_vals
3772 # Martin A. Hansen, August 2007.
3774 # Determine the length of the value for given keys.
3776 my ( $in, # handle to in stream
3777 $out, # handle to out stream
3778 $options, # options hash
3783 my ( $record, $key );
3785 while ( $record = Maasha::Biopieces::get_record( $in ) )
3787 foreach $key ( @{ $options->{ "keys" } } )
3789 if ( $record->{ $key } ) {
3790 $record->{ $key . "_LEN" } = length $record->{ $key };
3794 Maasha::Biopieces::put_record( $record, $out );
3801 # Martin A. Hansen, August 2007.
3803 # Calculates the sums for values of given keys.
3805 my ( $in, # handle to in stream
3806 $out, # handle to out stream
3807 $options, # options hash
3812 my ( $record, $key, %sum_hash, $fh );
3814 while ( $record = Maasha::Biopieces::get_record( $in ) )
3816 foreach $key ( @{ $options->{ "keys" } } )
3818 if ( $record->{ $key } ) {
3819 $sum_hash{ $key } += $record->{ $key };
3823 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
3826 $fh = write_stream( $options->{ "data_out" } );
3828 foreach $key ( @{ $options->{ "keys" } } ) {
3829 Maasha::Biopieces::put_record( { $key . "_SUM" => $sum_hash{ $key } || 0 } , $fh );
3836 sub script_mean_vals
3838 # Martin A. Hansen, August 2007.
3840 # Calculate the mean of values of given keys.
3842 my ( $in, # handle to in stream
3843 $out, # handle to out stream
3844 $options, # options hash
3849 my ( $record, $key, %sum_hash, %count_hash, $mean, $fh );
3851 while ( $record = Maasha::Biopieces::get_record( $in ) )
3853 foreach $key ( @{ $options->{ "keys" } } )
3855 if ( $record->{ $key } )
3857 $sum_hash{ $key } += $record->{ $key };
3858 $count_hash{ $key }++;
3862 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
3865 $fh = write_stream( $options->{ "data_out" } );
3867 foreach $key ( @{ $options->{ "keys" } } )
3869 if ( $count_hash{ $key } ) {
3870 $mean = sprintf( "%.2f", ( $sum_hash{ $key } / $count_hash{ $key } ) );
3875 Maasha::Biopieces::put_record( { $key . "_MEAN" => $mean } , $fh );
3882 sub script_median_vals
3884 # Martin A. Hansen, March 2008.
3886 # Calculate the median values of given keys.
3888 my ( $in, # handle to in stream
3889 $out, # handle to out stream
3890 $options, # options hash
3895 my ( $record, $key, %median_hash, $median, $fh );
3897 while ( $record = Maasha::Biopieces::get_record( $in ) )
3899 foreach $key ( @{ $options->{ "keys" } } ) {
3900 push @{ $median_hash{ $key } }, $record->{ $key } if defined $record->{ $key };
3903 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
3906 $fh = write_stream( $options->{ "data_out" } );
3908 foreach $key ( @{ $options->{ "keys" } } )
3910 if ( $median_hash{ $key } ) {
3911 $median = Maasha::Calc::median( $median_hash{ $key } );
3916 Maasha::Biopieces::put_record( { $key . "_MEDIAN" => $median } , $fh );
3925 # Martin A. Hansen, February 2008.
3927 # Determine the maximum values of given keys.
3929 my ( $in, # handle to in stream
3930 $out, # handle to out stream
3931 $options, # options hash
3936 my ( $record, $key, $fh, %max_hash, $max_record );
3938 while ( $record = Maasha::Biopieces::get_record( $in ) )
3940 foreach $key ( @{ $options->{ "keys" } } )
3942 if ( $record->{ $key } )
3944 $max_hash{ $key } = $record->{ $key } if $record->{ $key } > $max_hash{ $key };
3948 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
3951 $fh = write_stream( $options->{ "data_out" } );
3953 foreach $key ( @{ $options->{ "keys" } } )
3955 $max_record->{ $key . "_MAX" } = $max_hash{ $key };
3958 Maasha::Biopieces::put_record( $max_record, $fh );
3966 # Martin A. Hansen, February 2008.
3968 # Determine the minimum values of given keys.
3970 my ( $in, # handle to in stream
3971 $out, # handle to out stream
3972 $options, # options hash
3977 my ( $record, $key, $fh, %min_hash, $min_record );
3979 while ( $record = Maasha::Biopieces::get_record( $in ) )
3981 foreach $key ( @{ $options->{ "keys" } } )
3983 if ( defined $record->{ $key } )
3985 if ( exists $min_hash{ $key } ) {
3986 $min_hash{ $key } = $record->{ $key } if $record->{ $key } < $min_hash{ $key };
3988 $min_hash{ $key } = $record->{ $key };
3993 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
3996 $fh = write_stream( $options->{ "data_out" } );
3998 foreach $key ( @{ $options->{ "keys" } } )
4000 $min_record->{ $key . "_MIN" } = $min_hash{ $key };
4003 Maasha::Biopieces::put_record( $min_record, $fh );
4009 sub script_upload_to_ucsc
4011 # Martin A. Hansen, August 2007.
4013 # Calculate the mean of values of given keys.
4015 # This routine has developed into the most ugly hack. Do something!
4017 my ( $in, # handle to in stream
4018 $out, # handle to out stream
4019 $options, # options hash
4024 my ( $record, $file, $wib_file, $wig_file, $wib_dir, $fh_out, $i, $first, $format, $type, $columns, $append, $entry );
4026 $options->{ "short_label" } ||= $options->{ 'table' };
4027 $options->{ "long_label" } ||= $options->{ 'table' };
4028 $options->{ "group" } ||= $ENV{ "LOGNAME" };
4029 $options->{ "priority" } ||= 1;
4030 $options->{ "visibility" } ||= "pack";
4031 $options->{ "color" } ||= join( ",", int( rand( 255 ) ), int( rand( 255 ) ), int( rand( 255 ) ) );
4032 $options->{ "chunk_size" } ||= 10_000_000_000; # Due to 32-bit UCSC compilation really large tables cannot be loaded in one go.
4034 $file = "$BP_TMP/ucsc_upload.tmp";
4039 $fh_out = Maasha::Common::write_open( $file );
4041 while ( $record = Maasha::Biopieces::get_record( $in ) )
4043 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
4045 if ( $record->{ "REC_TYPE" } eq "fixed_step" )
4049 if ( $entry = Maasha::UCSC::Wiggle::biopiece2fixedstep( $record ) ) {
4050 Maasha::UCSC::Wiggle::fixedstep_entry_put( $entry, $fh_out );
4053 elsif ( $record->{ "REC_TYPE" } eq "PSL" )
4057 Maasha::UCSC::psl_put_header( $fh_out ) if $first;
4058 Maasha::UCSC::psl_put_entry( $record, $fh_out );
4062 elsif ( $record->{ "REC_TYPE" } eq "BED" and $record->{ "SEC_STRUCT" } )
4064 # chrom chromStart chromEnd name score strand size secStr conf
4068 print $fh_out join ( "\t",
4070 $record->{ "CHR_BEG" },
4071 $record->{ "CHR_END" } + 1,
4072 $record->{ "Q_ID" },
4073 $record->{ "SCORE" },
4074 $record->{ "STRAND" },
4075 $record->{ "SIZE" },
4076 $record->{ "SEC_STRUCT" },
4077 $record->{ "CONF" },
4080 elsif ( $record->{ "REC_TYPE" } eq "BED" )
4083 $columns = $record->{ "BED_COLS" };
4085 if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) {
4086 Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns, $options->{ 'check' } );
4089 elsif ( $record->{ "REC_TYPE" } eq "PATSCAN" and $record->{ "CHR" } )
4094 if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) {
4095 Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns, $options->{ 'check' } );
4098 elsif ( $record->{ "REC_TYPE" } eq "BLAST" and $record->{ "S_ID" } =~ /^chr/ )
4103 $record->{ "SCORE" } = $record->{ "BIT_SCORE" } * 1000;
4105 if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) {
4106 Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns, $options->{ 'check' } );
4109 elsif ( $record->{ "REC_TYPE" } eq "VMATCH" and $record->{ "S_ID" } =~ /^chr/i )
4114 if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) {
4115 Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns, $options->{ 'check' } );
4119 if ( $i == $options->{ "chunk_size" } )
4123 if ( $format eq "BED" ) {
4124 Maasha::UCSC::bed_upload_to_ucsc( $BP_TMP, $file, $options, $append );
4125 } elsif ( $format eq "PSL" ) {
4126 Maasha::UCSC::psl_upload_to_ucsc( $file, $options, $append );
4135 $fh_out = Maasha::Common::write_open( $file );
4143 if ( exists $options->{ "database" } and $options->{ "table" } )
4145 if ( $format eq "BED" )
4147 $type = "bed $columns";
4149 Maasha::UCSC::bed_upload_to_ucsc( $BP_TMP, $file, $options, $append );
4151 elsif ( $format eq "BED_SS" )
4153 $type = "type bed 6 +";
4155 Maasha::UCSC::bed_upload_to_ucsc( $BP_TMP, $file, $options, $append );
4157 elsif ( $format eq "PSL" )
4161 Maasha::UCSC::psl_upload_to_ucsc( $file, $options, $append );
4163 elsif ( $format eq "WIGGLE" )
4165 $options->{ "visibility" } = "full";
4167 $wig_file = "$options->{ 'table' }.wig";
4168 $wib_file = "$options->{ 'table' }.wib";
4170 $wib_dir = "$ENV{ 'HOME' }/ucsc/wib";
4172 Maasha::Common::dir_create_if_not_exists( $wib_dir );
4174 if ( $options->{ 'verbose' } ) {
4175 `cd $BP_TMP && wigEncode $file $wig_file $wib_file`;
4177 `cd $BP_TMP && wigEncode $file $wig_file $wib_file > /dev/null 2>&1`;
4180 Maasha::Common::run( "mv", "$BP_TMP/$wib_file $wib_dir" );
4188 Maasha::UCSC::wiggle_upload_to_ucsc( $BP_TMP, $wib_dir, $file, $options );
4193 Maasha::UCSC::ucsc_update_config( $options, $type );
4198 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<