1 package Maasha::BioRun;
4 # Copyright (C) 2007-2009 Martin A. Hansen.
6 # This program is free software; you can redistribute it and/or
7 # modify it under the terms of the GNU General Public License
8 # as published by the Free Software Foundation; either version 2
9 # of the License, or (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 # http://www.gnu.org/copyleft/gpl.html
23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
26 # Routines that contains Biopieces which are run.
29 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
34 use Getopt::Long qw( :config bundling );
35 use Time::HiRes qw( gettimeofday );
36 use Storable qw( dclone );
37 use Maasha::Biopieces;
46 use Maasha::Stockholm;
52 use Maasha::UCSC::BED;
53 use Maasha::UCSC::Wiggle;
62 use vars qw( @ISA @EXPORT_OK );
66 @ISA = qw( Exporter );
74 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> GLOBALS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
77 my ( $script, $BP_TMP );
79 $script = Maasha::Common::get_scriptname();
80 $BP_TMP = Maasha::Common::get_tmpdir();
83 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> RUN SCRIPT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
86 run_script( $script );
89 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SUBROUTINES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
94 # Martin A. Hansen, August 2007.
96 # Run a specific script.
98 my ( $script, # script name
103 my ( $t0, $t1, $options, $in, $out );
105 Maasha::Biopieces::log_biopiece();
107 $t0 = gettimeofday();
109 $options = get_options( $script );
111 $options->{ "SCRIPT" } = $script;
113 $script = "print_usage" if ( -t STDIN and keys %{ $options } <= 1 or $options->{ 'help' } );
115 $in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } );
116 $out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } );
118 if ( $script eq "print_usage" ) { script_print_usage( $in, $out, $options ) }
119 elsif ( $script eq "read_psl" ) { script_read_psl( $in, $out, $options ) }
120 elsif ( $script eq "read_bed" ) { script_read_bed( $in, $out, $options ) }
121 elsif ( $script eq "read_fixedstep" ) { script_read_fixedstep( $in, $out, $options ) }
122 elsif ( $script eq "read_blast_tab" ) { script_read_blast_tab( $in, $out, $options ) }
123 elsif ( $script eq "read_embl" ) { script_read_embl( $in, $out, $options ) }
124 elsif ( $script eq "read_stockholm" ) { script_read_stockholm( $in, $out, $options ) }
125 elsif ( $script eq "read_phastcons" ) { script_read_phastcons( $in, $out, $options ) }
126 elsif ( $script eq "read_soft" ) { script_read_soft( $in, $out, $options ) }
127 elsif ( $script eq "read_gff" ) { script_read_gff( $in, $out, $options ) }
128 elsif ( $script eq "read_2bit" ) { script_read_2bit( $in, $out, $options ) }
129 elsif ( $script eq "read_solexa" ) { script_read_solexa( $in, $out, $options ) }
130 elsif ( $script eq "read_solid" ) { script_read_solid( $in, $out, $options ) }
131 elsif ( $script eq "read_mysql" ) { script_read_mysql( $in, $out, $options ) }
132 elsif ( $script eq "read_ucsc_config" ) { script_read_ucsc_config( $in, $out, $options ) }
133 elsif ( $script eq "length_seq" ) { script_length_seq( $in, $out, $options ) }
134 elsif ( $script eq "uppercase_seq" ) { script_uppercase_seq( $in, $out, $options ) }
135 elsif ( $script eq "complexity_seq" ) { script_complexity_seq( $in, $out, $options ) }
136 elsif ( $script eq "oligo_freq" ) { script_oligo_freq( $in, $out, $options ) }
137 elsif ( $script eq "create_weight_matrix" ) { script_create_weight_matrix( $in, $out, $options ) }
138 elsif ( $script eq "remove_indels" ) { script_remove_indels( $in, $out, $options ) }
139 elsif ( $script eq "transliterate_seq" ) { script_transliterate_seq( $in, $out, $options ) }
140 elsif ( $script eq "transliterate_vals" ) { script_transliterate_vals( $in, $out, $options ) }
141 elsif ( $script eq "translate_seq" ) { script_translate_seq( $in, $out, $options ) }
142 elsif ( $script eq "get_genome_align" ) { script_get_genome_align( $in, $out, $options ) }
143 elsif ( $script eq "get_genome_phastcons" ) { script_get_genome_phastcons( $in, $out, $options ) }
144 elsif ( $script eq "fold_seq" ) { script_fold_seq( $in, $out, $options ) }
145 elsif ( $script eq "tile_seq" ) { script_tile_seq( $in, $out, $options ) }
146 elsif ( $script eq "invert_align" ) { script_invert_align( $in, $out, $options ) }
147 elsif ( $script eq "patscan_seq" ) { script_patscan_seq( $in, $out, $options ) }
148 elsif ( $script eq "soap_seq" ) { script_soap_seq( $in, $out, $options ) }
149 elsif ( $script eq "match_seq" ) { script_match_seq( $in, $out, $options ) }
150 elsif ( $script eq "create_vmatch_index" ) { script_create_vmatch_index( $in, $out, $options ) }
151 elsif ( $script eq "write_bed" ) { script_write_bed( $in, $out, $options ) }
152 elsif ( $script eq "write_psl" ) { script_write_psl( $in, $out, $options ) }
153 elsif ( $script eq "write_fixedstep" ) { script_write_fixedstep( $in, $out, $options ) }
154 elsif ( $script eq "write_2bit" ) { script_write_2bit( $in, $out, $options ) }
155 elsif ( $script eq "write_solid" ) { script_write_solid( $in, $out, $options ) }
156 elsif ( $script eq "write_ucsc_config" ) { script_write_ucsc_config( $in, $out, $options ) }
157 elsif ( $script eq "head_records" ) { script_head_records( $in, $out, $options ) }
158 elsif ( $script eq "remove_keys" ) { script_remove_keys( $in, $out, $options ) }
159 elsif ( $script eq "remove_adaptor" ) { script_remove_adaptor( $in, $out, $options ) }
160 elsif ( $script eq "remove_mysql_tables" ) { script_remove_mysql_tables( $in, $out, $options ) }
161 elsif ( $script eq "remove_ucsc_tracks" ) { script_remove_ucsc_tracks( $in, $out, $options ) }
162 elsif ( $script eq "rename_keys" ) { script_rename_keys( $in, $out, $options ) }
163 elsif ( $script eq "uniq_vals" ) { script_uniq_vals( $in, $out, $options ) }
164 elsif ( $script eq "merge_vals" ) { script_merge_vals( $in, $out, $options ) }
165 elsif ( $script eq "merge_records" ) { script_merge_records( $in, $out, $options ) }
166 elsif ( $script eq "compute" ) { script_compute( $in, $out, $options ) }
167 elsif ( $script eq "flip_tab" ) { script_flip_tab( $in, $out, $options ) }
168 elsif ( $script eq "count_records" ) { script_count_records( $in, $out, $options ) }
169 elsif ( $script eq "sort_records" ) { script_sort_records( $in, $out, $options ) }
170 elsif ( $script eq "count_vals" ) { script_count_vals( $in, $out, $options ) }
171 elsif ( $script eq "plot_histogram" ) { script_plot_histogram( $in, $out, $options ) }
172 elsif ( $script eq "plot_lendist" ) { script_plot_lendist( $in, $out, $options ) }
173 elsif ( $script eq "plot_chrdist" ) { script_plot_chrdist( $in, $out, $options ) }
174 elsif ( $script eq "plot_karyogram" ) { script_plot_karyogram( $in, $out, $options ) }
175 elsif ( $script eq "plot_matches" ) { script_plot_matches( $in, $out, $options ) }
176 elsif ( $script eq "plot_seqlogo" ) { script_plot_seqlogo( $in, $out, $options ) }
177 elsif ( $script eq "plot_phastcons_profiles" ) { script_plot_phastcons_profiles( $in, $out, $options ) }
178 elsif ( $script eq "length_vals" ) { script_length_vals( $in, $out, $options ) }
179 elsif ( $script eq "sum_vals" ) { script_sum_vals( $in, $out, $options ) }
180 elsif ( $script eq "mean_vals" ) { script_mean_vals( $in, $out, $options ) }
181 elsif ( $script eq "median_vals" ) { script_median_vals( $in, $out, $options ) }
182 elsif ( $script eq "max_vals" ) { script_max_vals( $in, $out, $options ) }
183 elsif ( $script eq "min_vals" ) { script_min_vals( $in, $out, $options ) }
184 elsif ( $script eq "upload_to_ucsc" ) { script_upload_to_ucsc( $in, $out, $options ) }
186 close $in if defined $in;
189 $t1 = gettimeofday();
191 print STDERR "Program: $script" . ( " " x ( 25 - length( $script ) ) ) . sprintf( "Run time: %.4f\n", ( $t1 - $t0 ) ) if $options->{ 'verbose' };
197 # Martin A. Hansen, February 2008.
199 # Gets options from commandline and checks these vigerously.
201 my ( $script, # name of script
206 my ( %options, @options, $opt, @genomes, $real );
208 if ( $script eq "print_usage" )
214 elsif ( $script eq "read_psl" )
221 elsif ( $script eq "read_bed" )
230 elsif ( $script eq "read_fixedstep" )
237 elsif ( $script eq "read_blast_tab" )
244 elsif ( $script eq "read_embl" )
254 elsif ( $script eq "read_stockholm" )
261 elsif ( $script eq "read_phastcons" )
272 elsif ( $script eq "read_soft" )
280 elsif ( $script eq "read_gff" )
287 elsif ( $script eq "read_2bit" )
295 elsif ( $script eq "read_solexa" )
304 elsif ( $script eq "read_solid" )
312 elsif ( $script eq "read_mysql" )
321 elsif ( $script eq "read_ucsc_config" )
328 elsif ( $script eq "length_seq" )
335 elsif ( $script eq "oligo_freq" )
342 elsif ( $script eq "create_weight_matrix" )
348 elsif ( $script eq "transliterate_seq" )
356 elsif ( $script eq "transliterate_vals" )
365 elsif ( $script eq "translate_seq" )
371 elsif ( $script eq "get_genome_align" )
382 elsif ( $script eq "get_genome_phastcons" )
393 elsif ( $script eq "tile_seq" )
400 elsif ( $script eq "invert_align" )
406 elsif ( $script eq "patscan_seq" )
417 elsif ( $script eq "soap_seq" )
428 elsif ( $script eq "match_seq" )
435 elsif ( $script eq "create_vmatch_index" )
443 elsif ( $script eq "write_bed" )
453 elsif ( $script eq "write_psl" )
461 elsif ( $script eq "write_fixedstep" )
469 elsif ( $script eq "write_2bit" )
477 elsif ( $script eq "write_solid" )
486 elsif ( $script eq "write_ucsc_config" )
493 elsif ( $script eq "plot_seqlogo" )
500 elsif ( $script eq "plot_phastcons_profiles" )
515 elsif ( $script eq "head_records" )
521 elsif ( $script eq "remove_keys" )
528 elsif ( $script eq "remove_adaptor" )
537 elsif ( $script eq "remove_mysql_tables" )
548 elsif ( $script eq "remove_ucsc_tracks" )
560 elsif ( $script eq "rename_keys" )
566 elsif ( $script eq "uniq_vals" )
573 elsif ( $script eq "merge_vals" )
580 elsif ( $script eq "merge_records" )
587 elsif ( $script eq "compute" )
593 elsif ( $script eq "count_records" )
600 elsif ( $script eq "sort_records" )
607 elsif ( $script eq "count_vals" )
613 elsif ( $script eq "plot_histogram" )
626 elsif ( $script eq "plot_lendist" )
638 elsif ( $script eq "plot_chrdist" )
649 elsif ( $script eq "plot_karyogram" )
658 elsif ( $script eq "plot_matches" )
670 elsif ( $script eq "length_vals" )
676 elsif ( $script eq "sum_vals" )
684 elsif ( $script eq "mean_vals" )
692 elsif ( $script eq "median_vals" )
700 elsif ( $script eq "max_vals" )
708 elsif ( $script eq "min_vals" )
716 elsif ( $script eq "upload_to_ucsc" )
740 # print STDERR Dumper( \@options );
747 # print STDERR Dumper( \%options );
749 if ( -t STDIN && scalar( keys %options ) == 0 or $options{ "help" } ) {
750 return wantarray ? %options : \%options;
753 $options{ "cols" } = [ split ",", $options{ "cols" } ] if defined $options{ "cols" };
754 $options{ "keys" } = [ split ",", $options{ "keys" } ] if defined $options{ "keys" };
755 $options{ "no_keys" } = [ split ",", $options{ "no_keys" } ] if defined $options{ "no_keys" };
756 $options{ "save_keys" } = [ split ",", $options{ "save_keys" } ] if defined $options{ "save_keys" };
757 $options{ "quals" } = [ split ",", $options{ "quals" } ] if defined $options{ "quals" };
758 $options{ "feats" } = [ split ",", $options{ "feats" } ] if defined $options{ "feats" };
759 $options{ "frames" } = [ split ",", $options{ "frames" } ] if defined $options{ "frames" };
760 $options{ "samples" } = [ split ",", $options{ "samples" } ] if defined $options{ "samples" };
761 $options{ "tables" } = [ split ",", $options{ "tables" } ] if defined $options{ "tables" };
762 $options{ "tracks" } = [ split ",", $options{ "tracks" } ] if defined $options{ "tracks" };
764 # ---- check arguments ----
766 if ( $options{ 'data_in' } )
768 $options{ "files" } = Maasha::Biopieces::getopt_files( $options{ 'data_in' } );
770 Maasha::Common::error( qq(Argument to --data_in must be a valid file or fileglob expression) ) if scalar @{ $options{ "files" } } == 0;
773 map { Maasha::Common::error( qq(Argument to --cols must be a whole numbers - not "$_") ) if $_ !~ /^\d+$/ } @{ $options{ "cols" } } if $options{ "cols" };
775 # print STDERR Dumper( \%options );
777 $real = "beg|end|word_size|wrap|tile_size|len|prefix_length|mismatches|offset|num|skip|cpus|window_size|step_size";
779 foreach $opt ( keys %options )
781 if ( $opt =~ /stream_in|pattern_in|exact_in/ and not -f $options{ $opt } )
783 Maasha::Common::error( qq(Argument to --$opt must be a valid file or fileglob expression - not "$options{ $opt }") );
785 elsif ( $opt =~ /$real/ and $options{ $opt } !~ /^\d+$/ )
787 Maasha::Common::error( qq(Argument to --$opt must be a whole number - not "$options{ $opt }") );
789 elsif ( $opt =~ /max_hits|max_hits|max_misses|dist|edit_dist|flank|gap|hamming_dist|priority/ and $options{ $opt } !~ /^-?\d+$/ )
791 Maasha::Common::error( qq(Argument to --$opt must be an integer - not "$options{ $opt }") );
793 elsif ( $opt =~ /identity|threshold/ and $options{ $opt } !~ /^-?(?:\d+(?:\.\d*)?|\.\d+)$/ )
795 Maasha::Common::error( qq(Argument to --$opt must be a decimal number - not "$options{ $opt }") );
797 elsif ( $opt =~ /e_val/ and $options{ $opt } !~ /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/ )
799 Maasha::Common::error( qq(Argument to --$opt must be a float - not "$options{ $opt }") );
801 elsif ( $opt =~ /strand/ and $options{ $opt } !~ /^(\+|-)$/ )
803 Maasha::Common::error( qq(Argument to --$opt must be "+" or "-" - not "$options{ $opt }") );
805 elsif ( $opt eq "genome" )
807 @genomes = Maasha::Common::ls_dirs( "$ENV{ 'BP_DATA' }/genomes" );
808 map { $_ =~ s/.*\/(.+)$/$1/ } @genomes;
810 if ( not grep { $_ =~ /^$options{ $opt }$/ } @genomes ) {
811 Maasha::Common::error( qq(Genome $options{ $opt } not found in "$ENV{ 'BP_DATA' }/genomes/") );
814 elsif ( $opt eq "terminal" and not $options{ $opt } =~ /^(svg|post|dumb|x11)/ )
816 Maasha::Common::error( qq(Bad --$opt argument "$options{ $opt }") );
818 elsif ( $opt eq "table" and $options{ $opt } =~ /(-|\.)/ )
820 Maasha::Common::error( qq(Character '$1' is not allowed in table name: $options{ $opt }) );
822 elsif ( $opt eq "merge" and $options{ $opt } !~ /^(AandB|AorB|BorA|AnotB|BnotA)$/ )
824 Maasha::Common::error( qq(Argument to --$opt must be AandB, AorB, BorA, AnotB, or BnotA - not "$options{ $opt }") );
826 elsif ( $opt eq "format" and $script eq "read_solexa" and $options{ $opt } !~ /octal|decimal/ )
828 Maasha::Common::error( qq(Argument to --$opt must be octal or decimal - not "$options{ $opt }") );
830 elsif ( $opt eq "remove" and $script eq "remove_adaptor" and $options{ $opt } !~ /before|after|skip/ )
832 Maasha::Common::error( qq(Argument to --$opt must be before, after, or skip - not "$options{ $opt }") );
836 Maasha::Common::error( qq(no --database specified) ) if $script eq "remove_ucsc_tracks" and not $options{ "database" };
837 Maasha::Common::error( qq(no --index_name specified) ) if $script =~ /create_vmatch_index/ and not $options{ "index_name" };
838 Maasha::Common::error( qq(no --in_file or --genome specified) ) if $script eq "soap_seq" and not $options{ "genome" } and not $options{ "in_file" };
839 Maasha::Common::error( qq(both --in_file and --genome specified) ) if $script eq "soap_seq" and $options{ "genome" } and $options{ "in_file" };
840 Maasha::Common::error( qq(no --genome specified) ) if $script =~ /get_genome_align|get_genome_phastcons|plot_phastcons_profiles|plot_karyogram/ and not $options{ "genome" };
841 Maasha::Common::error( qq(no --key specified) ) if $script =~ /plot_lendist|plot_histogram/ and not $options{ "key" };
842 Maasha::Common::error( qq(no --keys speficied) ) if $script =~ /sort_records|count_vals|sum_vals|mean_vals|median_vals|length_vals/ and not $options{ "keys" };
844 if ( $script eq "upload_to_ucsc" )
846 Maasha::Common::error( qq(no --database specified) ) if not $options{ "database" };
847 Maasha::Common::error( qq(no --table specified) ) if not $options{ "table" };
850 return wantarray ? %options : \%options;
854 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SCRIPTS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
857 sub script_print_usage
859 # Martin A. Hansen, January 2008.
861 # Retrieves usage information from file and
862 # prints this nicely formatted.
864 my ( $in, # handle to in stream
865 $out, # handle to out stream
866 $options, # options hash
871 my ( $file, $wiki, $lines );
873 if ( $options->{ 'data_in' } ) {
874 $file = $options->{ 'data_in' };
876 $file = join "", $ENV{ 'BP_DIR' }, "/bp_usage/", $options->{ 'SCRIPT' }, ".wiki";
879 $wiki = Maasha::Gwiki::gwiki_read( $file );
881 ( $wiki->[ 2 ], $wiki->[ 3 ], $wiki->[ 0 ], $wiki->[ 1 ] ) = ( $wiki->[ 0 ], $wiki->[ 1 ], $wiki->[ 2 ], $wiki->[ 3 ] );
883 if ( not $options->{ "help" } ) {
884 @{ $wiki } = grep { $_->[ 0 ]->{ 'SECTION' } =~ /Biopiece|summary|Usage|Options|Help/ } @{ $wiki };
887 $lines = Maasha::Gwiki::gwiki2ascii( $wiki );
889 print STDERR "$_\n" foreach @{ $lines };
897 # Martin A. Hansen, August 2007.
899 # Read psl table from stream or file.
901 my ( $in, # handle to in stream
902 $out, # handle to out stream
903 $options, # options hash
908 my ( $record, $file, $data_in, $num );
910 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
911 Maasha::Biopieces::put_record( $record, $out );
916 foreach $file ( @{ $options->{ "files" } } )
918 $data_in = Maasha::Common::read_open( $file );
920 while ( $record = Maasha::UCSC::psl_get_entry( $data_in ) )
922 Maasha::Biopieces::put_record( $record, $out );
924 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
936 # Martin A. Hansen, August 2007.
938 # Read bed table from stream or file.
940 my ( $in, # handle to in stream
941 $out, # handle to out stream
942 $options, # options hash
947 my ( $cols, $file, $record, $bed_entry, $data_in, $num );
949 $cols = $options->{ 'cols' }->[ 0 ];
951 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
952 Maasha::Biopieces::put_record( $record, $out );
957 foreach $file ( @{ $options->{ "files" } } )
959 $data_in = Maasha::Common::read_open( $file );
961 while ( $bed_entry = Maasha::UCSC::BED::bed_entry_get( $data_in, $cols, $options->{ 'check' } ) )
963 $record = Maasha::UCSC::BED::bed2biopiece( $bed_entry );
965 Maasha::Biopieces::put_record( $record, $out );
967 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
977 close $data_in if $data_in;
981 sub script_read_fixedstep
983 # Martin A. Hansen, Juli 2008.
985 # Read fixedstep wiggle format from stream or file.
987 my ( $in, # handle to in stream
988 $out, # handle to out stream
989 $options, # options hash
994 my ( $file, $record, $entry, $data_in, $num );
996 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
997 Maasha::Biopieces::put_record( $record, $out );
1002 foreach $file ( @{ $options->{ "files" } } )
1004 $data_in = Maasha::Common::read_open( $file );
1006 while ( $entry = Maasha::UCSC::Wiggle::fixedstep_entry_get( $data_in ) )
1008 $record = Maasha::UCSC::Wiggle::fixedstep2biopiece( $entry );
1010 Maasha::Biopieces::put_record( $record, $out );
1012 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1022 close $data_in if $data_in;
1026 sub script_read_blast_tab
1028 # Martin A. Hansen, September 2007.
1030 # Read tabular BLAST output from NCBI blast run with -m8 or -m9.
1032 my ( $in, # handle to in stream
1033 $out, # handle to out stream
1034 $options, # options hash
1039 my ( $file, $line, @fields, $strand, $record, $data_in, $num );
1041 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1042 Maasha::Biopieces::put_record( $record, $out );
1047 foreach $file ( @{ $options->{ "files" } } )
1049 $data_in = Maasha::Common::read_open( $file );
1051 while ( $line = <$data_in> )
1055 next if $line =~ /^#/;
1057 @fields = split /\t/, $line;
1059 $record->{ "REC_TYPE" } = "BLAST";
1060 $record->{ "Q_ID" } = $fields[ 0 ];
1061 $record->{ "S_ID" } = $fields[ 1 ];
1062 $record->{ "IDENT" } = $fields[ 2 ];
1063 $record->{ "ALIGN_LEN" } = $fields[ 3 ];
1064 $record->{ "MISMATCHES" } = $fields[ 4 ];
1065 $record->{ "GAPS" } = $fields[ 5 ];
1066 $record->{ "Q_BEG" } = $fields[ 6 ] - 1; # BLAST is 1-based
1067 $record->{ "Q_END" } = $fields[ 7 ] - 1; # BLAST is 1-based
1068 $record->{ "S_BEG" } = $fields[ 8 ] - 1; # BLAST is 1-based
1069 $record->{ "S_END" } = $fields[ 9 ] - 1; # BLAST is 1-based
1070 $record->{ "E_VAL" } = $fields[ 10 ];
1071 $record->{ "BIT_SCORE" } = $fields[ 11 ];
1073 if ( $record->{ "S_BEG" } > $record->{ "S_END" } )
1075 $record->{ "STRAND" } = '-';
1077 ( $record->{ "S_BEG" }, $record->{ "S_END" } ) = ( $record->{ "S_END" }, $record->{ "S_BEG" } );
1081 $record->{ "STRAND" } = '+';
1084 Maasha::Biopieces::put_record( $record, $out );
1086 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1096 close $data_in if $data_in;
1100 sub script_read_embl
1102 # Martin A. Hansen, August 2007.
1106 my ( $in, # handle to in stream
1107 $out, # handle to out stream
1108 $options, # options hash
1113 my ( %options2, $file, $data_in, $num, $entry, $record );
1115 map { $options2{ "keys" }{ $_ } = 1 } @{ $options->{ "keys" } };
1116 map { $options2{ "feats" }{ $_ } = 1 } @{ $options->{ "feats" } };
1117 map { $options2{ "quals" }{ $_ } = 1 } @{ $options->{ "quals" } };
1119 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1120 Maasha::Biopieces::put_record( $record, $out );
1125 foreach $file ( @{ $options->{ "files" } } )
1127 $data_in = Maasha::Common::read_open( $file );
1129 while ( $entry = Maasha::EMBL::get_embl_entry( $data_in ) )
1131 $record = Maasha::EMBL::parse_embl_entry( $entry, \%options2 );
1133 my ( $feat, $feat2, $qual, $qual_val, $record_copy );
1135 $record_copy = dclone $record;
1137 delete $record_copy->{ "FT" };
1139 Maasha::Biopieces::put_record( $record_copy, $out );
1141 delete $record_copy->{ "SEQ" };
1143 foreach $feat ( keys %{ $record->{ "FT" } } )
1145 $record_copy->{ "FEAT_TYPE" } = $feat;
1147 foreach $feat2 ( @{ $record->{ "FT" }->{ $feat } } )
1149 foreach $qual ( keys %{ $feat2 } )
1151 $qual_val = join "; ", @{ $feat2->{ $qual } };
1156 $record_copy->{ $qual } = $qual_val;
1159 Maasha::Biopieces::put_record( $record_copy, $out );
1163 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1173 close $data_in if $data_in;
1177 sub script_read_stockholm
1179 # Martin A. Hansen, August 2007.
1181 # Read Stockholm format.
1183 my ( $in, # handle to in stream
1184 $out, # handle to out stream
1185 $options, # options hash
1190 my ( $data_in, $file, $num, $entry, $record, $record_anno, $record_align, $key, $seq );
1192 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1193 Maasha::Biopieces::put_record( $record, $out );
1198 foreach $file ( @{ $options->{ "files" } } )
1200 $data_in = Maasha::Common::read_open( $file );
1202 while ( $entry = Maasha::Stockholm::get_stockholm_entry( $data_in ) )
1204 $record = Maasha::Stockholm::parse_stockholm_entry( $entry );
1208 foreach $key ( keys %{ $record->{ "GF" } } ) {
1209 $record_anno->{ $key } = $record->{ "GF" }->{ $key };
1212 $record_anno->{ "ALIGN" } = $num;
1214 Maasha::Biopieces::put_record( $record_anno, $out );
1216 foreach $seq ( @{ $record->{ "ALIGN" } } )
1218 undef $record_align;
1221 SEQ_NAME => $seq->[ 0 ],
1225 Maasha::Biopieces::put_record( $record_align, $out );
1228 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1238 close $data_in if $data_in;
1242 sub script_read_phastcons
1244 # Martin A. Hansen, December 2007.
1246 # Read PhastCons format.
1248 my ( $in, # handle to in stream
1249 $out, # handle to out stream
1250 $options, # options hash
1255 my ( $data_in, $file, $num, $entry, @records, $record );
1257 $options->{ "min" } ||= 10;
1258 $options->{ "dist" } ||= 25;
1259 $options->{ "threshold" } ||= 0.8;
1260 $options->{ "gap" } ||= 5;
1262 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1263 Maasha::Biopieces::put_record( $record, $out );
1268 foreach $file ( @{ $options->{ "files" } } )
1270 $data_in = Maasha::Common::read_open( $file );
1272 while ( $entry = Maasha::UCSC::fixedstep_get_entry( $data_in ) )
1274 @records = Maasha::UCSC::phastcons_parse_entry( $entry, $options );
1276 foreach $record ( @records )
1278 $record->{ "REC_TYPE" } = "BED";
1279 $record->{ "BED_LEN" } = $record->{ "CHR_END" } - $record->{ "CHR_BEG" } + 1;
1281 Maasha::Biopieces::put_record( $record, $out );
1283 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1294 close $data_in if $data_in;
1298 sub script_read_soft
1300 # Martin A. Hansen, December 2007.
1303 # http://www.ncbi.nlm.nih.gov/geo/info/soft2.html
1305 my ( $in, # handle to in stream
1306 $out, # handle to out stream
1307 $options, # options hash
1312 my ( $data_in, $file, $num, $records, $record, $soft_index, $fh, @platforms, $plat_table, @samples, $sample, $old_end, $skip );
1314 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1315 Maasha::Biopieces::put_record( $record, $out );
1320 foreach $file ( @{ $options->{ "files" } } )
1322 print STDERR "Creating index for file: $file\n" if $options->{ "verbose" };
1324 $soft_index = Maasha::NCBI::soft_index_file( $file );
1326 $fh = Maasha::Common::read_open( $file );
1328 @platforms = grep { $_->{ "SECTION" } =~ /PLATFORM/ } @{ $soft_index };
1330 print STDERR "Getting platform tables for file: $file\n" if $options->{ "verbose" };
1332 $plat_table = Maasha::NCBI::soft_get_platform( $fh, $platforms[ 0 ]->{ "LINE_BEG" }, $platforms[ -1 ]->{ "LINE_END" } );
1334 @samples = grep { $_->{ "SECTION" } =~ /SAMPLE/ } @{ $soft_index };
1336 $old_end = $platforms[ -1 ]->{ "LINE_END" };
1338 foreach $sample ( @samples )
1341 $skip = 1 if ( $options->{ "samples" } and grep { $sample->{ "SECTION" } !~ /$_/ } @{ $options->{ "samples" } } );
1343 print STDERR "Getting samples for dataset: $sample->{ 'SECTION' }\n" if $options->{ "verbose" } and not $skip;
1345 $records = Maasha::NCBI::soft_get_sample( $fh, $plat_table, $sample->{ "LINE_BEG" } - $old_end - 1, $sample->{ "LINE_END" } - $old_end - 1, $skip );
1347 foreach $record ( @{ $records } )
1349 Maasha::Biopieces::put_record( $record, $out );
1351 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1356 $old_end = $sample->{ "LINE_END" };
1364 close $data_in if $data_in;
1371 # Martin A. Hansen, February 2008.
1374 # http://www.ncbi.nlm.nih.gov/geo/info/soft2.html
1376 my ( $in, # handle to in stream
1377 $out, # handle to out stream
1378 $options, # options hash
1383 my ( $data_in, $file, $fh, $num, $record, $entry );
1385 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1386 Maasha::Biopieces::put_record( $record, $out );
1391 foreach $file ( @{ $options->{ "files" } } )
1393 $fh = Maasha::Common::read_open( $file );
1395 while ( $entry = Maasha::GFF::get_entry( $fh ) )
1397 Maasha::Biopieces::put_record( $entry, $out );
1399 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1409 close $data_in if $data_in;
1413 sub script_read_2bit
1415 # Martin A. Hansen, March 2008.
1417 # Read sequences from 2bit file.
1419 my ( $in, # handle to in stream
1420 $out, # handle to out stream
1421 $options, # options hash
1426 my ( $record, $file, $data_in, $mask, $toc, $line, $num );
1428 $mask = 1 if not $options->{ "no_mask" };
1430 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1431 Maasha::Biopieces::put_record( $record, $out );
1436 foreach $file ( @{ $options->{ "files" } } )
1438 $data_in = Maasha::Common::read_open( $file );
1440 $toc = Maasha::TwoBit::twobit_get_TOC( $data_in );
1442 foreach $line ( @{ $toc } )
1444 $record->{ "SEQ_NAME" } = $line->[ 0 ];
1445 $record->{ "SEQ" } = Maasha::TwoBit::twobit_get_seq( $data_in, $line->[ 1 ], undef, undef, $mask );
1446 $record->{ "SEQ_LEN" } = length $record->{ "SEQ" };
1448 Maasha::Biopieces::put_record( $record, $out );
1450 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1460 close $data_in if $data_in;
1464 sub script_read_solexa
1466 # Martin A. Hansen, March 2008.
1468 # Read Solexa sequence reads from file.
1470 my ( $in, # handle to in stream
1471 $out, # handle to out stream
1472 $options, # options hash
1477 my ( $record, $file, $data_in, $entry, $num, @seqs, @scores, $i );
1479 $options->{ "format" } ||= "octal";
1480 $options->{ "quality" } ||= 20;
1482 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1483 Maasha::Biopieces::put_record( $record, $out );
1488 foreach $file ( @{ $options->{ "files" } } )
1490 $data_in = Maasha::Common::read_open( $file );
1492 if ( $options->{ "format" } eq "octal" )
1494 while ( $entry = Maasha::Solexa::solexa_get_entry_octal( $data_in ) )
1496 $record = Maasha::Solexa::solexa2biopiece( $entry, $options->{ "quality" } );
1498 Maasha::Biopieces::put_record( $record, $out );
1500 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1507 while ( $entry = Maasha::Solexa::solexa_get_entry_decimal( $data_in ) )
1509 $record = Maasha::Solexa::solexa2biopiece( $entry, $options->{ "quality" } );
1511 Maasha::Biopieces::put_record( $record, $out );
1513 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1524 close $data_in if $data_in;
1528 sub script_read_solid
1530 # Martin A. Hansen, April 2008.
1532 # Read Solid sequence from file.
1534 my ( $in, # handle to in stream
1535 $out, # handle to out stream
1536 $options, # options hash
1541 my ( $record, $file, $data_in, $line, $num, $seq_name, $seq_cs, $seq_qual, @scores, @seqs, $i );
1543 $options->{ "quality" } ||= 15;
1545 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1546 Maasha::Biopieces::put_record( $record, $out );
1551 foreach $file ( @{ $options->{ "files" } } )
1553 $data_in = Maasha::Common::read_open( $file );
1555 while ( $line = <$data_in> )
1559 ( $seq_name, $seq_cs, $seq_qual ) = split /\t/, $line;
1561 @scores = split /,/, $seq_qual;
1562 @seqs = split //, Maasha::Solid::color_space2seq( $seq_cs );
1564 for ( $i = 0; $i < @seqs; $i++ ) {
1565 $seqs[ $i ] = lc $seqs[ $i ] if $scores[ $i ] < $options->{ "quality" };
1569 REC_TYPE => 'SOLID',
1570 SEQ_NAME => $seq_name,
1572 SEQ_QUAL => join( ";", @scores ),
1573 SEQ_LEN => length $seq_cs,
1574 SEQ => join( "", @seqs ),
1575 SCORE_MEAN => sprintf( "%.2f", Maasha::Calc::mean( \@scores ) ),
1578 Maasha::Biopieces::put_record( $record, $out );
1580 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1590 close $data_in if $data_in;
1594 sub script_read_mysql
1596 # Martin A. Hansen, May 2008.
1598 # Read a MySQL query into stream.
1600 my ( $in, # handle to in stream
1601 $out, # handle to out stream
1602 $options, # options hash
1607 my ( $record, $dbh, $results );
1609 $options->{ "user" } ||= Maasha::UCSC::ucsc_get_user();
1610 $options->{ "password" } ||= Maasha::UCSC::ucsc_get_password();
1612 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1613 Maasha::Biopieces::put_record( $record, $out );
1616 $dbh = Maasha::SQL::connect( $options->{ "database" }, $options->{ "user" }, $options->{ "password" } );
1618 $results = Maasha::SQL::query_hashref_list( $dbh, $options->{ "query" } );
1620 Maasha::SQL::disconnect( $dbh );
1622 map { Maasha::Biopieces::put_record( $_ ) } @{ $results };
1626 sub script_read_ucsc_config
1628 # Martin A. Hansen, November 2008.
1630 # Read track entries from UCSC Genome Browser '.ra' files.
1632 my ( $in, # handle to in stream
1633 $out, # handle to out stream
1634 $options, # options hash
1639 my ( $record, $file, $data_in, $entry, $num );
1641 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1642 Maasha::Biopieces::put_record( $record, $out );
1647 foreach $file ( @{ $options->{ "files" } } )
1649 $data_in = Maasha::Common::read_open( $file );
1651 while ( $record = Maasha::UCSC::ucsc_config_entry_get( $data_in ) )
1653 $record->{ 'REC_TYPE' } = "UCSC Config";
1655 Maasha::Biopieces::put_record( $record, $out );
1657 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1667 close $data_in if $data_in;
1671 sub script_length_seq
1673 # Martin A. Hansen, August 2007.
1675 # Determine the length of sequences in stream.
1677 my ( $in, # handle to in stream
1678 $out, # handle to out stream
1679 $options, # options hash
1684 my ( $record, $total );
1686 while ( $record = Maasha::Biopieces::get_record( $in ) )
1688 if ( $record->{ "SEQ" } )
1690 $record->{ "SEQ_LEN" } = length $record->{ "SEQ" };
1691 $total += $record->{ "SEQ_LEN" };
1694 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
1697 Maasha::Biopieces::put_record( { TOTAL_SEQ_LEN => $total }, $out );
1701 sub script_complexity_seq
1703 # Martin A. Hansen, May 2008.
1705 # Generates an index calculated as the most common di-residue over
1706 # the sequence length for all sequences in stream.
1708 my ( $in, # handle to in stream
1709 $out, # handle to out stream
1714 my ( $record, $index );
1716 while ( $record = Maasha::Biopieces::get_record( $in ) )
1718 $record->{ "SEQ_COMPLEXITY" } = sprintf( "%.2f", Maasha::Seq::seq_complexity( $record->{ "SEQ" } ) ) if $record->{ "SEQ" };
1720 Maasha::Biopieces::put_record( $record, $out );
1725 sub script_oligo_freq
1727 # Martin A. Hansen, August 2007.
1729 # Determine the length of sequences in stream.
1731 my ( $in, # handle to in stream
1732 $out, # handle to out stream
1733 $options, # options hash
1738 my ( $record, %oligos, @freq_table );
1740 $options->{ "word_size" } ||= 7;
1742 while ( $record = Maasha::Biopieces::get_record( $in ) )
1744 if ( $record->{ "SEQ" } )
1746 map { $oligos{ $_ }++ } Maasha::Seq::seq2oligos( \$record->{ "SEQ" }, $options->{ "word_size" } );
1748 if ( not $options->{ "all" } )
1750 @freq_table = Maasha::Seq::oligo_freq( \%oligos );
1752 map { Maasha::Biopieces::put_record( $_, $out ) } @freq_table;
1758 Maasha::Biopieces::put_record( $record, $out );
1761 if ( $options->{ "all" } )
1763 @freq_table = Maasha::Seq::oligo_freq( \%oligos );
1765 map { Maasha::Biopieces::put_record( $_, $out ) } @freq_table;
1770 sub script_create_weight_matrix
1772 # Martin A. Hansen, August 2007.
1774 # Creates a weight matrix from an alignmnet.
1776 my ( $in, # handle to in stream
1777 $out, # handle to out stream
1778 $options, # options hash
1783 my ( $record, $count, $i, $res, %freq_hash, %res_hash, $freq );
1787 while ( $record = Maasha::Biopieces::get_record( $in ) )
1789 if ( $record->{ "SEQ" } )
1791 for ( $i = 0; $i < length $record->{ "SEQ" }; $i++ )
1793 $res = substr $record->{ "SEQ" }, $i, 1;
1795 $freq_hash{ $i }{ $res }++;
1796 $res_hash{ $res } = 1;
1803 Maasha::Biopieces::put_record( $record, $out );
1807 foreach $res ( sort keys %res_hash )
1811 $record->{ "V0" } = $res;
1813 for ( $i = 0; $i < keys %freq_hash; $i++ )
1815 $freq = $freq_hash{ $i }{ $res } || 0;
1817 if ( $options->{ "percent" } ) {
1818 $freq = sprintf( "%.0f", 100 * $freq / $count ) if $freq > 0;
1821 $record->{ "V" . ( $i + 1 ) } = $freq;
1824 Maasha::Biopieces::put_record( $record, $out );
1829 sub script_remove_indels
1831 # Martin A. Hansen, August 2007.
1833 # Remove indels from sequences in stream.
1835 my ( $in, # handle to in stream
1836 $out, # handle to out stream
1843 while ( $record = Maasha::Biopieces::get_record( $in ) )
1845 $record->{ 'SEQ' } =~ tr/-~.//d if $record->{ "SEQ" };
1847 Maasha::Biopieces::put_record( $record, $out );
1852 sub script_transliterate_seq
1854 # Martin A. Hansen, August 2007.
1856 # Transliterate chars from sequence in record.
1858 my ( $in, # handle to in stream
1859 $out, # handle to out stream
1860 $options, # options hash
1865 my ( $record, $search, $replace, $delete );
1867 $search = $options->{ "search" } || "";
1868 $replace = $options->{ "replace" } || "";
1869 $delete = $options->{ "delete" } || "";
1871 while ( $record = Maasha::Biopieces::get_record( $in ) )
1873 if ( $record->{ "SEQ" } )
1875 if ( $search and $replace ) {
1876 eval "\$record->{ 'SEQ' } =~ tr/$search/$replace/";
1877 } elsif ( $delete ) {
1878 eval "\$record->{ 'SEQ' } =~ tr/$delete//d";
1882 Maasha::Biopieces::put_record( $record, $out );
1887 sub script_transliterate_vals
1889 # Martin A. Hansen, April 2008.
1891 # Transliterate chars from values in record.
1893 my ( $in, # handle to in stream
1894 $out, # handle to out stream
1895 $options, # options hash
1900 my ( $record, $search, $replace, $delete, $key );
1902 $search = $options->{ "search" } || "";
1903 $replace = $options->{ "replace" } || "";
1904 $delete = $options->{ "delete" } || "";
1906 while ( $record = Maasha::Biopieces::get_record( $in ) )
1908 foreach $key ( @{ $options->{ "keys" } } )
1910 if ( exists $record->{ $key } )
1912 if ( $search and $replace ) {
1913 eval "\$record->{ $key } =~ tr/$search/$replace/";
1914 } elsif ( $delete ) {
1915 eval "\$record->{ $key } =~ tr/$delete//d";
1920 Maasha::Biopieces::put_record( $record, $out );
1925 sub script_translate_seq
1927 # Martin A. Hansen, February 2008.
1929 # Translate DNA sequence into protein sequence.
1931 my ( $in, # handle to in stream
1932 $out, # handle to out stream
1933 $options, # options hash
1938 my ( $record, $frame, %new_record );
1940 $options->{ "frames" } ||= [ 1, 2, 3, -1, -2, -3 ];
1942 while ( $record = Maasha::Biopieces::get_record( $in ) )
1944 if ( $record->{ "SEQ" } )
1946 if ( Maasha::Seq::seq_guess_type( $record->{ "SEQ" } ) eq "dna" )
1948 foreach $frame ( @{ $options->{ "frames" } } )
1950 %new_record = %{ $record };
1952 $new_record{ "SEQ" } = Maasha::Seq::translate( $record->{ "SEQ" }, $frame );
1953 $new_record{ "SEQ_LEN" } = length $new_record{ "SEQ" };
1954 $new_record{ "FRAME" } = $frame;
1956 Maasha::Biopieces::put_record( \%new_record, $out );
1962 Maasha::Biopieces::put_record( $record, $out );
1968 sub script_get_genome_align
1970 # Martin A. Hansen, April 2008.
1972 # Gets a subalignment from a multiple genome alignment.
1974 my ( $in, # handle to in stream
1975 $out, # handle to out stream
1976 $options, # options hash
1981 my ( $record, $maf_track, $align, $align_num, $beg, $end, $len, $entry );
1983 $options->{ "strand" } ||= "+";
1987 $maf_track = Maasha::Config::maf_track( $options->{ "genome" } );
1989 if ( $options->{ "chr" } and $options->{ "beg" } and ( $options->{ "end" } or $options->{ "len" } ) )
1991 $beg = $options->{ "beg" } - 1;
1993 if ( $options->{ "end" } ) {
1994 $end = $options->{ "end" };
1995 } elsif ( $options->{ "len" } ) {
1996 $end = $beg + $options->{ "len" };
1999 $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $options->{ "chr" }, $beg, $end, $options->{ "strand" } );
2001 foreach $entry ( @{ $align } )
2003 $entry->{ "CHR" } = $record->{ "CHR" };
2004 $entry->{ "CHR_BEG" } = $record->{ "CHR_BEG" };
2005 $entry->{ "CHR_END" } = $record->{ "CHR_END" };
2006 $entry->{ "STRAND" } = $record->{ "STRAND" } || '+';
2007 $entry->{ "Q_ID" } = $record->{ "Q_ID" };
2008 $entry->{ "SCORE" } = $record->{ "SCORE" };
2010 Maasha::Biopieces::put_record( $entry, $out );
2014 while ( $record = Maasha::Biopieces::get_record( $in ) )
2016 if ( $record->{ "REC_TYPE" } eq "BED" )
2018 $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "CHR" }, $record->{ "CHR_BEG" }, $record->{ "CHR_END" }, $record->{ "STRAND" } );
2020 elsif ( $record->{ "REC_TYPE" } eq "VMATCH" )
2022 $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" } + 1, $record->{ "STRAND" } );
2024 elsif ( $record->{ "REC_TYPE" } eq "PSL" )
2026 $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $record->{ "STRAND" } );
2028 elsif ( $record->{ "REC_TYPE" } eq "BLAST" )
2030 $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $record->{ "STRAND" } );
2033 foreach $entry ( @{ $align } )
2035 $entry->{ "CHR" } = $record->{ "CHR" };
2036 $entry->{ "CHR_BEG" } = $record->{ "CHR_BEG" };
2037 $entry->{ "CHR_END" } = $record->{ "CHR_END" };
2038 $entry->{ "STRAND" } = $record->{ "STRAND" };
2039 $entry->{ "Q_ID" } = $record->{ "Q_ID" };
2040 $entry->{ "SCORE" } = $record->{ "SCORE" };
2042 Maasha::Biopieces::put_record( $entry, $out );
2050 sub script_get_genome_phastcons
2052 # Martin A. Hansen, February 2008.
2054 # Get phastcons scores from genome intervals.
2056 my ( $in, # handle to in stream
2057 $out, # handle to out stream
2058 $options, # options hash
2063 my ( $phastcons_file, $phastcons_index, $index, $fh_phastcons, $scores, $record );
2065 $options->{ "flank" } ||= 0;
2067 $phastcons_file = Maasha::Config::genome_phastcons( $options->{ "genome" } );
2068 $phastcons_index = Maasha::Config::genome_phastcons_index( $options->{ "genome" } );
2070 $index = Maasha::UCSC::fixedstep_index_retrieve( $phastcons_index );
2071 $fh_phastcons = Maasha::Common::read_open( $phastcons_file );
2073 if ( defined $options->{ "chr" } and defined $options->{ "beg" } and ( defined $options->{ "end" } or defined $options->{ "len" } ) )
2075 $options->{ "beg" } -= 1; # request is 1-based
2076 $options->{ "end" } -= 1; # request is 1-based
2078 if ( $options->{ "len" } ) {
2079 $options->{ "end" } = $options->{ "beg" } + $options->{ "len" } - 1;
2082 $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $options->{ "chr" }, $options->{ "beg" }, $options->{ "end" }, $options->{ "flank" } );
2084 $record->{ "CHR" } = $options->{ "chr" };
2085 $record->{ "CHR_BEG" } = $options->{ "beg" } - $options->{ "flank" };
2086 $record->{ "CHR_END" } = $options->{ "end" } + $options->{ "flank" };
2088 $record->{ "PHASTCONS" } = join ",", @{ $scores };
2089 $record->{ "PHAST_COUNT" } = scalar @{ $scores }; # DEBUG
2091 Maasha::Biopieces::put_record( $record, $out );
2094 while ( $record = Maasha::Biopieces::get_record( $in ) )
2096 if ( $record->{ "REC_TYPE" } eq "BED" )
2098 $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "CHR" }, $record->{ "CHR_BEG" }, $record->{ "CHR_END" }, $options->{ "flank" } );
2100 elsif ( $record->{ "REC_TYPE" } eq "PSL" )
2102 $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $options->{ "flank" } );
2104 elsif ( $record->{ "REC_TYPE" } eq "BLAST" )
2106 $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $options->{ "flank" } );
2109 $record->{ "PHASTCONS" } = join ",", @{ $scores } if @{ $scores };
2110 # $record->{ "PHAST_COUNT" } = @{ $scores } if @{ $scores }; # DEBUG
2112 Maasha::Biopieces::put_record( $record, $out );
2115 close $fh_phastcons if $fh_phastcons;
2121 # Martin A. Hansen, December 2007.
2123 # Folds sequences in stream into secondary structures.
2125 my ( $in, # handle to in stream
2126 $out, # handle to out stream
2131 my ( $record, $type, $struct, $index );
2133 while ( $record = Maasha::Biopieces::get_record( $in ) )
2135 if ( $record->{ "SEQ" } )
2138 $type = Maasha::Seq::seq_guess_type( $record->{ "SEQ" } );
2141 if ( $type ne "protein" )
2143 ( $struct, $index ) = Maasha::Seq::fold_struct_rnafold( $record->{ "SEQ" } );
2144 $record->{ "SEC_STRUCT" } = $struct;
2145 $record->{ "FREE_ENERGY" } = $index;
2146 $record->{ "SCORE" } = abs int $index;
2147 $record->{ "SIZE" } = length $struct;
2148 $record->{ "CONF" } = "1," x $record->{ "SIZE" };
2152 Maasha::Biopieces::put_record( $record, $out );
2159 # Martin A. Hansen, February 2008.
2161 # Using the first sequence in stream as reference, tile
2162 # all subsequent sequences based on pairwise alignments.
2164 my ( $in, # handle to in stream
2165 $out, # handle to out stream
2166 $options, # options hash
2171 my ( $record, $first, $ref_entry, @entries );
2175 while ( $record = Maasha::Biopieces::get_record( $in ) )
2177 if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } )
2181 $ref_entry = [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ];
2187 push @entries, [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ];
2192 Maasha::Biopieces::put_record( $record, $out );
2196 @entries = Maasha::Align::align_tile( $ref_entry, \@entries, $options );
2198 map { Maasha::Biopieces::put_record( { SEQ_NAME => $_->[ SEQ_NAME ], SEQ => $_->[ SEQ ] }, $out ) } @entries;
2202 sub script_invert_align
2204 # Martin A. Hansen, February 2008.
2206 # Inverts an alignment showing only non-mathing residues
2207 # using the first sequence as reference.
2209 my ( $in, # handle to in stream
2210 $out, # handle to out stream
2211 $options, # options hash
2216 my ( $record, @entries );
2218 while ( $record = Maasha::Biopieces::get_record( $in ) )
2220 if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } )
2222 push @entries, [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ];
2226 Maasha::Biopieces::put_record( $record, $out );
2230 Maasha::Align::align_invert( \@entries, $options->{ "soft" } );
2232 map { Maasha::Biopieces::put_record( { SEQ_NAME => $_->[ SEQ_NAME ], SEQ => $_->[ SEQ ] }, $out ) } @entries;
2236 sub script_patscan_seq
2238 # Martin A. Hansen, August 2007.
2240 # Locates patterns in sequences using scan_for_matches.
2242 my ( $in, # handle to in stream
2243 $out, # handle to out stream
2244 $options, # options hash
2249 my ( $genome_file, @args, $arg, $type, $seq_file, $pat_file, $out_file, $fh_in, $fh_out, $record, $patterns, $pattern, $entry, $result, %head_hash, $i );
2251 if ( $options->{ "patterns" } ) {
2252 $patterns = Maasha::Patscan::parse_patterns( $options->{ "patterns" } );
2253 } elsif ( -f $options->{ "patterns_in" } ) {
2254 $patterns = Maasha::Patscan::read_patterns( $options->{ "patterns_in" } );
2257 $genome_file = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/fasta/$options->{ 'genome' }.fna" if $options->{ 'genome' };
2259 push @args, "-c" if $options->{ "comp" };
2260 push @args, "-m $options->{ 'max_hits' }" if $options->{ 'max_hits' };
2261 push @args, "-n $options->{ 'max_misses' }" if $options->{ 'max_hits' };
2263 $seq_file = "$BP_TMP/patscan.seq";
2264 $pat_file = "$BP_TMP/patscan.pat";
2265 $out_file = "$BP_TMP/patscan.out";
2267 $fh_out = Maasha::Common::write_open( $seq_file );
2271 while ( $record = Maasha::Biopieces::get_record( $in ) )
2273 if ( $record->{ "SEQ" } and $record->{ "SEQ_NAME" } )
2275 $type = Maasha::Seq::seq_guess_type( $record->{ "SEQ" } ) if not $type;
2277 Maasha::Fasta::put_entry( [ $i, $record->{ "SEQ" } ], $fh_out );
2279 $head_hash{ $i } = $record->{ "SEQ_NAME" };
2287 $arg = join " ", @args;
2288 $arg .= " -p" if $type eq "protein";
2290 foreach $pattern ( @{ $patterns } )
2292 $fh_out = Maasha::Common::write_open( $pat_file );
2294 print $fh_out "$pattern\n";
2298 if ( $options->{ 'genome' } ) {
2299 `scan_for_matches $arg $pat_file < $genome_file > $out_file`;
2300 # Maasha::Common::run( "scan_for_matches", "$arg $pat_file < $genome_file > $out_file" );
2302 `scan_for_matches $arg $pat_file < $seq_file > $out_file`;
2303 # Maasha::Common::run( "scan_for_matches", "$arg $pat_file < $seq_file > $out_file" );
2306 $fh_in = Maasha::Common::read_open( $out_file );
2308 while ( $entry = Maasha::Fasta::get_entry( $fh_in ) )
2310 $result = Maasha::Patscan::parse_scan_result( $entry, $pattern );
2312 if ( $options->{ 'genome' } )
2314 $result->{ "CHR" } = $result->{ "S_ID" };
2315 $result->{ "CHR_BEG" } = $result->{ "S_BEG" };
2316 $result->{ "CHR_END" } = $result->{ "S_END" };
2318 delete $result->{ "S_ID" };
2319 delete $result->{ "S_BEG" };
2320 delete $result->{ "S_END" };
2324 $result->{ "S_ID" } = $head_hash{ $result->{ "S_ID" } };
2327 Maasha::Biopieces::put_record( $result, $out );
2341 # Martin A. Hansen, July 2008.
2343 # soap sequences in stream against a given file or genome.
2345 my ( $in, # handle to in stream
2346 $out, # handle to out stream
2347 $options, # options hash
2352 my ( $genome, $tmp_in, $tmp_out, $fh_in, $fh_out, $record, $line, @fields, $entry, $count, $args );
2354 $options->{ "seed_size" } ||= 10;
2355 $options->{ "mismatches" } ||= 2;
2356 $options->{ "gap_size" } ||= 0;
2357 $options->{ "cpus" } ||= 1;
2359 if ( $options->{ "genome" } ) {
2360 $options->{ "in_file" } = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/fasta/$options->{ 'genome' }.fna";
2363 $tmp_in = "$BP_TMP/soap_query.seq";
2364 $tmp_out = "$BP_TMP/soap.result";
2366 $fh_out = Maasha::Common::write_open( $tmp_in );
2370 while ( $record = Maasha::Biopieces::get_record( $in ) )
2372 if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) )
2374 Maasha::Fasta::put_entry( $entry, $fh_out );
2379 Maasha::Biopieces::put_record( $record, $out );
2387 "-s $options->{ 'seed_size' }",
2390 "-v $options->{ 'mismatches' }",
2391 "-g $options->{ 'gap_size' }",
2392 "-p $options->{ 'cpus' }",
2393 "-d $options->{ 'in_file' }",
2397 $args .= " > /dev/null 2>&1" if not $options->{ 'verbose' };
2399 Maasha::Common::run( "soap", $args, 1 );
2403 $fh_out = Maasha::Common::read_open( $tmp_out );
2407 while ( $line = <$fh_out> )
2411 @fields = split /\t/, $line;
2413 $record->{ "REC_TYPE" } = "SOAP";
2414 $record->{ "Q_ID" } = $fields[ 0 ];
2415 $record->{ "SCORE" } = $fields[ 3 ];
2416 $record->{ "STRAND" } = $fields[ 6 ];
2417 $record->{ "S_ID" } = $fields[ 7 ];
2418 $record->{ "S_BEG" } = $fields[ 8 ] - 1; # soap is 1-based
2419 $record->{ "S_END" } = $fields[ 8 ] + $fields[ 5 ] - 2;
2421 Maasha::Biopieces::put_record( $record, $out );
2431 sub script_match_seq
2433 # Martin A. Hansen, August 2007.
2435 # BLATs sequences in stream against a given genome.
2437 my ( $in, # handle to in stream
2438 $out, # handle to out stream
2439 $options, # options hash
2444 my ( $record, @entries, $results );
2446 $options->{ "word_size" } ||= 20;
2447 $options->{ "direction" } ||= "both";
2449 while ( $record = Maasha::Biopieces::get_record( $in ) )
2451 if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) {
2452 push @entries, [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ];
2455 Maasha::Biopieces::put_record( $record, $out );
2458 if ( @entries == 1 )
2460 $results = Maasha::Match::match_mummer( [ $entries[ 0 ] ], [ $entries[ 0 ] ], $options, $BP_TMP );
2462 map { Maasha::Biopieces::put_record( $_, $out ) } @{ $results };
2464 elsif ( @entries == 2 )
2466 $results = Maasha::Match::match_mummer( [ $entries[ 0 ] ], [ $entries[ 1 ] ], $options, $BP_TMP );
2468 map { Maasha::Biopieces::put_record( $_, $out ) } @{ $results };
2473 sub script_create_vmatch_index
2475 # Martin A. Hansen, January 2008.
2477 # Create a vmatch index from sequences in the stream.
2479 my ( $in, # handle to in stream
2480 $out, # handle to out stream
2481 $options, # options hash
2486 my ( $record, $file_tmp, $fh_tmp, $type, $entry );
2488 if ( $options->{ "index_name" } )
2490 $file_tmp = $options->{ 'index_name' };
2491 $fh_tmp = Maasha::Common::write_open( $file_tmp );
2494 while ( $record = Maasha::Biopieces::get_record( $in ) )
2496 if ( $options->{ "index_name" } and $entry = Maasha::Fasta::biopiece2fasta( $record ) )
2498 Maasha::Fasta::put_entry( $entry, $fh_tmp );
2500 $type = Maasha::Seq::seq_guess_type( $entry->[ SEQ ] ) if not defined $type;
2503 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
2506 if ( $options->{ "index_name" } )
2510 if ( $type eq "protein" ) {
2511 Maasha::Common::run( "mkvtree", "-db $file_tmp -protein -pl $options->{ 'prefix_length' } -allout -indexname $file_tmp > /dev/null 2>&1" );
2513 Maasha::Common::run( "mkvtree", "-db $file_tmp -dna -pl $options->{ 'prefix_length' } -allout -indexname $file_tmp > /dev/null 2>&1" );
2521 sub script_write_bed
2523 # Martin A. Hansen, August 2007.
2525 # Write BED format for the UCSC genome browser using records in stream.
2527 my ( $in, # handle to in stream
2528 $out, # handle to out stream
2529 $options, # options hash
2534 my ( $cols, $fh, $record, $bed_entry, $new_record );
2536 $cols = $options->{ 'cols' }->[ 0 ];
2538 $fh = Maasha::Biopieces::write_stream( $options->{ 'data_out' }, $options->{ 'compress' } );
2540 while ( $record = Maasha::Biopieces::get_record( $in ) )
2542 $record = Maasha::UCSC::psl2record( $record ) if $record->{ 'tBaseInsert' }; # Dirty addition to allow Affy data from MySQL to be dumped
2544 if ( $bed_entry = Maasha::UCSC::BED::biopiece2bed( $record, $cols ) ) {
2545 Maasha::UCSC::BED::bed_entry_put( $bed_entry, $fh, $cols, $options->{ 'check' } );
2548 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ 'no_stream' };
2555 sub script_write_psl
2557 # Martin A. Hansen, August 2007.
2559 # Write PSL output from stream.
2561 my ( $in, # handle to in stream
2562 $out, # handle to out stream
2563 $options, # options hash
2568 my ( $fh, $record, @output, $first );
2572 $fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } );
2574 while ( $record = Maasha::Biopieces::get_record( $in ) )
2576 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
2578 if ( $record->{ "REC_TYPE" } and $record->{ "REC_TYPE" } eq "PSL" )
2580 Maasha::UCSC::psl_put_header( $fh ) if $first;
2581 Maasha::UCSC::psl_put_entry( $record, $fh );
2590 sub script_write_fixedstep
2592 # Martin A. Hansen, Juli 2008.
2594 # Write fixedStep entries from recrods in the stream.
2596 my ( $in, # handle to in stream
2597 $out, # handle to out stream
2598 $options, # options hash
2603 my ( $fh, $record, $entry );
2605 $fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } );
2607 while ( $record = Maasha::Biopieces::get_record( $in ) )
2609 if ( $entry = Maasha::UCSC::Wiggle::biopiece2fixedstep( $record ) ) {
2610 Maasha::UCSC::Wiggle::fixedstep_entry_put( $entry, $fh );
2613 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
2620 sub script_write_2bit
2622 # Martin A. Hansen, March 2008.
2624 # Write sequence entries from stream in 2bit format.
2626 my ( $in, # handle to in stream
2627 $out, # handle to out stream
2628 $options, # options hash
2633 my ( $record, $mask, $tmp_file, $fh_tmp, $fh_in, $fh_out, $entry );
2635 $mask = 1 if not $options->{ "no_mask" };
2637 $tmp_file = "$BP_TMP/write_2bit.fna";
2638 $fh_tmp = Maasha::Common::write_open( $tmp_file );
2640 $fh_out = write_stream( $options->{ "data_out" } );
2642 while ( $record = Maasha::Biopieces::get_record( $in ) )
2644 if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) {
2645 Maasha::Fasta::put_entry( $entry, $fh_tmp );
2648 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
2653 $fh_in = Maasha::Common::read_open( $tmp_file );
2655 Maasha::TwoBit::fasta2twobit( $fh_in, $fh_out, $mask );
2664 sub script_write_solid
2666 # Martin A. Hansen, April 2008.
2668 # Write di-base encoded Solid sequence from entries in stream.
2670 my ( $in, # handle to in stream
2671 $out, # handle to out stream
2672 $options, # options hash
2677 my ( $record, $fh, $entry );
2679 $fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } );
2681 while ( $record = Maasha::Biopieces::get_record( $in ) )
2683 if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) )
2685 $entry->[ SEQ ] = Maasha::Solid::seq2color_space( uc $entry->[ SEQ ] );
2687 Maasha::Fasta::put_entry( $entry, $fh, $options->{ "wrap" } );
2690 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
2697 sub script_write_ucsc_config
2699 # Martin A. Hansen, November 2008.
2701 # Write UCSC Genome Broser configuration (.ra file type) from
2702 # records in the stream.
2704 my ( $in, # handle to in stream
2705 $out, # handle to out stream
2706 $options, # options hash
2711 my ( $record, $fh );
2713 $fh = write_stream( $options->{ "data_out" } );
2715 while ( $record = Maasha::Biopieces::get_record( $in ) )
2717 Maasha::UCSC::ucsc_config_entry_put( $record, $fh ) if $record->{ "REC_TYPE" } eq "UCSC Config";
2719 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
2726 sub script_plot_seqlogo
2728 # Martin A. Hansen, August 2007.
2730 # Calculates and writes a sequence logo for alignments.
2732 my ( $in, # handle to in stream
2733 $out, # handle to out stream
2734 $options, # options hash
2739 my ( $record, @entries, $logo, $fh );
2741 while ( $record = Maasha::Biopieces::get_record( $in ) )
2743 if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) {
2744 push @entries, [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ];
2747 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
2750 $logo = Maasha::Plot::seq_logo( \@entries );
2752 $fh = write_stream( $options->{ "data_out" } );
2760 sub script_plot_phastcons_profiles
2762 # Martin A. Hansen, January 2008.
2764 # Plots PhastCons profiles.
2766 my ( $in, # handle to in stream
2767 $out, # handle to out stream
2768 $options, # options hash
2773 my ( $phastcons_file, $phastcons_index, $index, $fh_phastcons, $record, $scores, $AoA, $plot, $fh );
2775 $options->{ "title" } ||= "PhastCons Profiles";
2777 $phastcons_file = Maasha::Config::genome_phastcons( $options->{ "genome" } );
2778 $phastcons_index = Maasha::Config::genome_phastcons_index( $options->{ "genome" } );
2780 $index = Maasha::UCSC::fixedstep_index_retrieve( $phastcons_index );
2781 $fh_phastcons = Maasha::Common::read_open( $phastcons_file );
2783 while ( $record = Maasha::Biopieces::get_record( $in ) )
2785 if ( $record->{ "CHR" } and $record->{ "CHR_BEG" } and $record->{ "CHR_END" } )
2787 $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "CHR" },
2788 $record->{ "CHR_BEG" },
2789 $record->{ "CHR_END" },
2790 $options->{ "flank" } );
2792 push @{ $AoA }, [ @{ $scores } ];
2795 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
2798 Maasha::UCSC::phastcons_normalize( $AoA );
2800 $AoA = [ [ Maasha::UCSC::phastcons_mean( $AoA ) ] ] if $options->{ "mean" };
2801 $AoA = [ [ Maasha::UCSC::phastcons_median( $AoA ) ] ] if $options->{ "median" };
2803 $AoA = Maasha::Matrix::matrix_flip( $AoA );
2805 $plot = Maasha::Plot::lineplot_simple( $AoA, $options, $BP_TMP );
2807 $fh = write_stream( $options->{ "data_out" } );
2809 print $fh "$_\n" foreach @{ $plot };
2815 sub script_head_records
2817 # Martin A. Hansen, August 2007.
2819 # Display the first sequences in stream.
2821 my ( $in, # handle to in stream
2822 $out, # handle to out stream
2823 $options, # options hash
2828 my ( $record, $count );
2830 $options->{ "num" } ||= 10;
2834 while ( $record = Maasha::Biopieces::get_record( $in ) )
2838 Maasha::Biopieces::put_record( $record, $out );
2840 last if $count == $options->{ "num" };
2845 sub script_remove_keys
2847 # Martin A. Hansen, August 2007.
2849 # Remove keys from stream.
2851 my ( $in, # handle to in stream
2852 $out, # handle to out stream
2853 $options, # options hash
2858 my ( $record, $new_record );
2860 while ( $record = Maasha::Biopieces::get_record( $in ) )
2862 if ( $options->{ "keys" } )
2864 map { delete $record->{ $_ } } @{ $options->{ "keys" } };
2866 elsif ( $options->{ "save_keys" } )
2868 map { $new_record->{ $_ } = $record->{ $_ } if exists $record->{ $_ } } @{ $options->{ "save_keys" } };
2870 $record = $new_record;
2873 Maasha::Biopieces::put_record( $record, $out ) if keys %{ $record };
2878 sub script_remove_adaptor
2880 # Martin A. Hansen, August 2008.
2882 # Find and remove adaptor from sequences in the stream.
2884 my ( $in, # handle to in stream
2885 $out, # handle to out stream
2886 $options, # options hash
2891 my ( $record, $adaptor, $seq, $adaptor_len, $seq_len, $offset, $max_match, $max_mismatch, $pos );
2893 $options->{ "remove" } ||= "after";
2895 $max_mismatch = $options->{ "mismatches" } || 0;
2896 $offset = $options->{ "offset" };
2898 if ( not defined $offset ) {
2904 $adaptor = uc $options->{ "adaptor" };
2905 $adaptor_len = length $adaptor;
2907 while ( $record = Maasha::Biopieces::get_record( $in ) )
2909 if ( $record->{ "SEQ" } )
2911 $seq = uc $record->{ "SEQ" };
2912 $seq_len = length $seq;
2914 $pos = Maasha::Common::index_m( $seq, $adaptor, $seq_len, $adaptor_len, $offset, $max_mismatch );
2916 $record->{ "ADAPTOR_POS" } = $pos;
2918 if ( $pos >= 0 and $options->{ "remove" } ne "skip" )
2920 if ( $options->{ "remove" } eq "after" )
2922 $record->{ "SEQ" } = substr $record->{ "SEQ" }, 0, $pos;
2923 $record->{ "SEQ_LEN" } = $pos;
2927 $record->{ "SEQ" } = substr $record->{ "SEQ" }, $pos + $adaptor_len;
2928 $record->{ "SEQ_LEN" } = length $record->{ "SEQ" };
2932 Maasha::Biopieces::put_record( $record, $out );
2936 Maasha::Biopieces::put_record( $record, $out );
2942 sub script_remove_mysql_tables
2944 # Martin A. Hansen, November 2008.
2946 # Remove MySQL tables from values in stream.
2948 my ( $in, # handle to in stream
2949 $out, # handle to out stream
2950 $options, # options hash
2955 my ( $record, %table_hash, $dbh, $table );
2957 $options->{ "user" } ||= Maasha::UCSC::ucsc_get_user();
2958 $options->{ "password" } ||= Maasha::UCSC::ucsc_get_password();
2960 map { $table_hash{ $_ } = 1 } @{ $options->{ 'tables' } };
2962 while ( $record = Maasha::Biopieces::get_record( $in ) )
2964 map { $table_hash{ $record->{ $_ } } = 1 } @{ $options->{ 'keys' } };
2966 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ 'no_stream' };
2969 $dbh = Maasha::SQL::connect( $options->{ "database" }, $options->{ "user" }, $options->{ "password" } );
2971 foreach $table ( sort keys %table_hash )
2973 if ( Maasha::SQL::table_exists( $dbh, $table ) )
2975 print STDERR qq(Removing table "$table" from database "$options->{ 'database' }" ... ) if $options->{ 'verbose' };
2976 Maasha::SQL::delete_table( $dbh, $table );
2977 print STDERR "done.\n" if $options->{ 'verbose' };
2981 print STDERR qq(WARNING: table "$table" not found in database "$options->{ 'database' }\n");
2985 Maasha::SQL::disconnect( $dbh );
2989 sub script_remove_ucsc_tracks
2991 # Martin A. Hansen, November 2008.
2993 # Remove track from MySQL tables and config file.
2995 my ( $in, # handle to in stream
2996 $out, # handle to out stream
2997 $options, # options hash
3002 my ( $record, %track_hash, $fh_in, $fh_out, $track, @tracks, @new_tracks, $dbh );
3004 $options->{ 'user' } ||= Maasha::UCSC::ucsc_get_user();
3005 $options->{ 'password' } ||= Maasha::UCSC::ucsc_get_password();
3006 $options->{ 'config_file' } ||= "$ENV{ 'HOME' }/ucsc/my_tracks.ra";
3008 map { $track_hash{ $_ } = 1 } @{ $options->{ 'tracks' } };
3010 while ( $record = Maasha::Biopieces::get_record( $in ) )
3012 map { $track_hash{ $record->{ $_ } } = 1 } @{ $options->{ 'keys' } };
3014 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ 'no_stream' };
3017 $fh_in = Maasha::Filesys::file_read_open( $options->{ 'config_file' } );
3019 while ( $track = Maasha::UCSC::ucsc_config_entry_get( $fh_in ) ) {
3020 push @tracks, $track;
3025 foreach $track ( @tracks )
3027 if ( $track->{ 'database' } eq $options->{ 'database' } and exists $track_hash{ $track->{ 'track' } } ) {
3028 print STDERR qq(Removing track "$track->{ 'track' }" from config file.\n) if $options->{ 'verbose' };
3030 push @new_tracks, $track;
3034 rename "$options->{ 'config_file' }", "$options->{ 'config_file' }~";
3036 $fh_out = Maasha::Filesys::file_write_open( $options->{ 'config_file' } );
3038 map { Maasha::UCSC::ucsc_config_entry_put( $_, $fh_out ) } @new_tracks;
3042 # ---- locate track in database ----
3044 $dbh = Maasha::SQL::connect( $options->{ "database" }, $options->{ "user" }, $options->{ "password" } );
3046 foreach $track ( sort keys %track_hash )
3048 if ( Maasha::SQL::table_exists( $dbh, $track ) )
3050 print STDERR qq(Removing table "$track" from database "$options->{ 'database' }" ... ) if $options->{ 'verbose' };
3051 Maasha::SQL::delete_table( $dbh, $track );
3052 print STDERR "done.\n" if $options->{ 'verbose' };
3056 print STDERR qq(WARNING: table "$track" not found in database "$options->{ 'database' }\n");
3060 Maasha::SQL::disconnect( $dbh );
3062 Maasha::Common::run( "ucscMakeTracks.pl", "-b > /dev/null 2>&1" );
3066 sub script_rename_keys
3068 # Martin A. Hansen, August 2007.
3070 # Rename keys in stream.
3072 my ( $in, # handle to in stream
3073 $out, # handle to out stream
3074 $options, # options hash
3081 while ( $record = Maasha::Biopieces::get_record( $in ) )
3083 if ( exists $record->{ $options->{ "keys" }->[ 0 ] } )
3085 $record->{ $options->{ "keys" }->[ 1 ] } = $record->{ $options->{ "keys" }->[ 0 ] };
3087 delete $record->{ $options->{ "keys" }->[ 0 ] };
3090 Maasha::Biopieces::put_record( $record, $out );
3095 sub script_uniq_vals
3097 # Martin A. Hansen, August 2007.
3099 # Find unique values in stream.
3101 my ( $in, # handle to in stream
3102 $out, # handle to out stream
3103 $options, # options hash
3108 my ( %hash, $record );
3110 while ( $record = Maasha::Biopieces::get_record( $in ) )
3112 if ( $record->{ $options->{ "key" } } )
3114 if ( not $hash{ $record->{ $options->{ "key" } } } and not $options->{ "invert" } )
3116 Maasha::Biopieces::put_record( $record, $out );
3118 $hash{ $record->{ $options->{ "key" } } } = 1;
3120 elsif ( $hash{ $record->{ $options->{ "key" } } } and $options->{ "invert" } )
3122 Maasha::Biopieces::put_record( $record, $out );
3126 $hash{ $record->{ $options->{ "key" } } } = 1;
3131 Maasha::Biopieces::put_record( $record, $out );
3137 sub script_merge_vals
3139 # Martin A. Hansen, August 2007.
3141 # Rename keys in stream.
3143 my ( $in, # handle to in stream
3144 $out, # handle to out stream
3145 $options, # options hash
3150 my ( $record, @join, $i );
3152 $options->{ "delimit" } ||= '_';
3154 while ( $record = Maasha::Biopieces::get_record( $in ) )
3156 if ( exists $record->{ $options->{ "keys" }->[ 0 ] } )
3158 @join = $record->{ $options->{ "keys" }->[ 0 ] };
3160 for ( $i = 1; $i < @{ $options->{ "keys" } }; $i++ ) {
3161 push @join, $record->{ $options->{ "keys" }->[ $i ] } if exists $record->{ $options->{ "keys" }->[ $i ] };
3164 $record->{ $options->{ "keys" }->[ 0 ] } = join $options->{ "delimit" }, @join;
3167 Maasha::Biopieces::put_record( $record, $out );
3172 sub script_merge_records
3174 # Martin A. Hansen, July 2008.
3176 # Merges records in the stream based on identical values of two given keys.
3178 my ( $in, # handle to in stream
3179 $out, # handle to out stream
3180 $options, # options hash
3185 my ( $merge, $record, $file1, $file2, $fh1, $fh2, $key1, $key2, @keys1, @keys2, @vals1, @vals2,
3186 $num1, $num2, $num, $cmp, $i );
3188 $merge = $options->{ "merge" } || "AandB";
3190 $file1 = "$BP_TMP/merge_records1.tmp";
3191 $file2 = "$BP_TMP/merge_records2.tmp";
3193 $fh1 = Maasha::Common::write_open( $file1 );
3194 $fh2 = Maasha::Common::write_open( $file2 );
3196 $key1 = $options->{ "keys" }->[ 0 ];
3197 $key2 = $options->{ "keys" }->[ 1 ];
3199 $num = $key2 =~ s/n$//;
3203 while ( $record = Maasha::Biopieces::get_record( $in ) )
3205 if ( exists $record->{ $key1 } )
3208 @vals1 = $record->{ $key1 };
3210 delete $record->{ $key1 };
3212 map { push @keys1, $_; push @vals1, $record->{ $_ } } keys %{ $record };
3214 print $fh1 join( "\t", @vals1 ), "\n";
3218 elsif ( exists $record->{ $key2 } )
3221 @vals2 = $record->{ $key2 };
3223 delete $record->{ $key2 };
3225 map { push @keys2, $_; push @vals2, $record->{ $_ } } keys %{ $record };
3227 print $fh2 join( "\t", @vals2 ), "\n";
3238 Maasha::Common::run( "sort", "-k 1,1n $file1 > $file1.sort" ) and rename "$file1.sort", $file1;
3239 Maasha::Common::run( "sort", "-k 1,1n $file2 > $file2.sort" ) and rename "$file2.sort", $file2;
3243 Maasha::Common::run( "sort", "-k 1,1 $file1 > $file1.sort" ) and rename "$file1.sort", $file1;
3244 Maasha::Common::run( "sort", "-k 1,1 $file2 > $file2.sort" ) and rename "$file2.sort", $file2;
3247 $fh1 = Maasha::Common::read_open( $file1 );
3248 $fh2 = Maasha::Common::read_open( $file2 );
3250 @vals1 = Maasha::Common::get_fields( $fh1 );
3251 @vals2 = Maasha::Common::get_fields( $fh2 );
3253 while ( $num1 > 0 and $num2 > 0 )
3258 $cmp = $vals1[ 0 ] <=> $vals2[ 0 ];
3260 $cmp = $vals1[ 0 ] cmp $vals2[ 0 ];
3265 if ( $merge =~ /^(AorB|AnotB)$/ )
3267 for ( $i = 0; $i < @keys1; $i++ ) {
3268 $record->{ $keys1[ $i ] } = $vals1[ $i ];
3271 Maasha::Biopieces::put_record( $record, $out );
3274 @vals1 = Maasha::Common::get_fields( $fh1 );
3279 if ( $merge =~ /^(BorA|BnotA)$/ )
3281 for ( $i = 0; $i < @keys2; $i++ ) {
3282 $record->{ $keys2[ $i ] } = $vals2[ $i ];
3285 Maasha::Biopieces::put_record( $record, $out );
3288 @vals2 = Maasha::Common::get_fields( $fh2 );
3293 if ( $merge =~ /^(AandB|AorB|BorA)$/ )
3295 for ( $i = 0; $i < @keys1; $i++ ) {
3296 $record->{ $keys1[ $i ] } = $vals1[ $i ];
3299 for ( $i = 1; $i < @keys2; $i++ ) {
3300 $record->{ $keys2[ $i ] } = $vals2[ $i ];
3303 Maasha::Biopieces::put_record( $record, $out );
3306 @vals1 = Maasha::Common::get_fields( $fh1 );
3307 @vals2 = Maasha::Common::get_fields( $fh2 );
3319 if ( $num1 > 0 and $merge =~ /^(AorB|AnotB)$/ )
3323 for ( $i = 0; $i < @keys1; $i++ ) {
3324 $record->{ $keys1[ $i ] } = $vals1[ $i ];
3327 Maasha::Biopieces::put_record( $record, $out );
3330 if ( $num2 > 0 and $merge =~ /^(BorA|BnotA)$/ )
3334 for ( $i = 0; $i < @keys2; $i++ ) {
3335 $record->{ $keys2[ $i ] } = $vals2[ $i ];
3338 Maasha::Biopieces::put_record( $record, $out );
3345 # Martin A. Hansen, August 2007.
3347 # Evaluate extression for records in stream.
3349 my ( $in, # handle to in stream
3350 $out, # handle to out stream
3351 $options, # options hash
3356 my ( $record, $eval_key, @keys, $eval_val );
3358 while ( $record = Maasha::Biopieces::get_record( $in ) )
3360 if ( $options->{ "eval" } )
3362 if ( $options->{ "eval" } =~ /^(\S+)\s*=\s*(.+)$/ )
3369 @keys = split /\s+|\+|-|\*|\/|\*\*/, $eval_val;
3371 @keys = grep { exists $record->{ $_ } } @keys;
3374 map { $eval_val =~ s/\Q$_\E/$record->{ $_ }/g } @keys;
3376 $record->{ $eval_key } = eval "$eval_val";
3377 Maasha::Common::error( qq(eval "$eval_key = $eval_val" failed -> $@) ) if $@;
3381 Maasha::Common::error( qq(Bad compute expression: "$options->{ 'eval' }"\n) );
3385 Maasha::Biopieces::put_record( $record, $out );
3392 # Martin A. Hansen, June 2008.
3396 my ( $in, # handle to in stream
3397 $out, # handle to out stream
3398 $options, # options hash
3403 my ( $record, $key, $A, $B, @rows, @matrix, $row, $i );
3405 while ( $record = Maasha::Biopieces::get_record( $in ) )
3409 foreach $key ( sort { $A = $a; $B = $b; $A =~ s/^V(\d+)$/$1/; $B =~ s/^V(\d+)$/$1/; $A <=> $B } keys %{ $record } )
3411 push @rows, $record->{ $key };
3415 push @matrix, [ @rows ];
3420 @matrix = Maasha::Matrix::matrix_flip( \@matrix );
3422 foreach $row ( @matrix )
3424 for ( $i = 0; $i < @{ $row }; $i++ ) {
3425 $record->{ "V$i" } = $row->[ $i ];
3428 Maasha::Biopieces::put_record( $record, $out );
3433 sub script_count_records
3435 # Martin A. Hansen, August 2007.
3437 # Count records in stream.
3439 my ( $in, # handle to in stream
3440 $out, # handle to out stream
3441 $options, # options hash
3446 my ( $record, $count, $result, $fh, $line );
3450 if ( $options->{ "no_stream" } )
3452 while ( $line = <$in> )
3456 $count++ if $line eq "---";
3461 while ( $record = Maasha::Biopieces::get_record( $in ) )
3463 Maasha::Biopieces::put_record( $record, $out );
3469 $result = { "RECORDS_COUNT" => $count };
3471 $fh = write_stream( $options->{ "data_out" } );
3473 Maasha::Biopieces::put_record( $result, $fh );
3479 sub script_sort_records
3481 # Martin A. Hansen, August 2007.
3483 # Sort to sort records according to keys.
3485 my ( $in, # handle to in stream
3486 $out, # handle to out stream
3487 $options, # options hash
3492 my ( @keys, $key, @sort_cmd, $sort_str, $sort_sub, @records, $record, $i );
3494 foreach $key ( @{ $options->{ "keys" } } )
3496 if ( $key =~ s/n$// ) {
3497 push @sort_cmd, qq(\$a->{ "$key" } <=> \$b->{ "$key" });
3499 push @sort_cmd, qq(\$a->{ "$key" } cmp \$b->{ "$key" });
3503 $sort_str = join " or ", @sort_cmd;
3504 $sort_sub = eval "sub { $sort_str }"; # NB security issue!
3506 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
3507 push @records, $record;
3510 @records = sort $sort_sub @records;
3512 if ( $options->{ "reverse" } )
3514 for ( $i = scalar @records - 1; $i >= 0; $i-- ) {
3515 Maasha::Biopieces::put_record( $records[ $i ], $out );
3520 for ( $i = 0; $i < scalar @records; $i++ ) {
3521 Maasha::Biopieces::put_record( $records[ $i ], $out );
3527 sub script_count_vals
3529 # Martin A. Hansen, August 2007.
3531 # Count records in stream.
3533 my ( $in, # handle to in stream
3534 $out, # handle to out stream
3535 $options, # options hash
3540 my ( $num, $record, %count_hash, @records, $tmp_file, $fh_out, $fh_in, $cache );
3542 $tmp_file = "$BP_TMP/count_cache.tmp";
3544 $fh_out = Maasha::Common::write_open( $tmp_file );
3549 while ( $record = Maasha::Biopieces::get_record( $in ) )
3551 map { $count_hash{ $_ }{ $record->{ $_ } }++ if exists $record->{ $_ } } @{ $options->{ "keys" } };
3553 push @records, $record;
3555 if ( scalar @records > 5_000_000 ) # too many records to hold in memory - use disk cache
3557 map { Maasha::Biopieces::put_record( $_, $fh_out ) } @records;
3564 print STDERR "verbose: records read $num\n" if ( $options->{ 'verbose' } and ( $num % 1_000_000 ) == 0 );
3575 $fh_in = Maasha::Common::read_open( $tmp_file );
3577 while ( $record = Maasha::Biopieces::get_record( $fh_in ) )
3579 map { $record->{ $_ . "_COUNT" } = $count_hash{ $_ }{ $record->{ $_ } } if exists $record->{ $_ } } @{ $options->{ "keys" } };
3581 Maasha::Biopieces::put_record( $record, $out );
3583 print STDERR "verbose: cache read $num\n" if ( $options->{ 'verbose' } and ( $num % 1_000_000 ) == 0 );
3591 foreach $record ( @records )
3593 map { $record->{ $_ . "_COUNT" } = $count_hash{ $_ }{ $record->{ $_ } } if exists $record->{ $_ } } @{ $options->{ "keys" } };
3595 Maasha::Biopieces::put_record( $record, $out );
3602 sub script_plot_histogram
3604 # Martin A. Hansen, September 2007.
3606 # Plot a simple histogram for a given key using GNU plot.
3608 my ( $in, # handle to in stream
3609 $out, # handle to out stream
3610 $options, # options hash
3615 my ( $record, %data_hash, $max, @data_list, $i, $result, $fh );
3617 $options->{ "title" } ||= "Histogram";
3618 $options->{ "sort" } ||= "num";
3620 while ( $record = Maasha::Biopieces::get_record( $in ) )
3622 $data_hash{ $record->{ $options->{ "key" } } }++ if defined $record->{ $options->{ "key" } };
3624 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
3627 if ( $options->{ "sort" } eq "num" ) {
3628 map { push @data_list, [ $_, $data_hash{ $_ } ] } sort { $a <=> $b } keys %data_hash;
3630 map { push @data_list, [ $_, $data_hash{ $_ } ] } sort keys %data_hash;
3633 $result = Maasha::Plot::histogram_simple( \@data_list, $options );
3635 $fh = write_stream( $options->{ "data_out" } );
3637 print $fh "$_\n" foreach @{ $result };
3643 sub script_plot_lendist
3645 # Martin A. Hansen, August 2007.
3647 # Plot length distribution using GNU plot.
3649 my ( $in, # handle to in stream
3650 $out, # handle to out stream
3651 $options, # options hash
3656 my ( $record, %data_hash, $max, @data_list, $i, $result, $fh );
3658 $options->{ "title" } ||= "Length Distribution";
3660 while ( $record = Maasha::Biopieces::get_record( $in ) )
3662 $data_hash{ $record->{ $options->{ "key" } } }++ if defined $record->{ $options->{ "key" } };
3664 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
3667 $max = Maasha::Calc::list_max( [ keys %data_hash ] );
3669 for ( $i = 0; $i < $max; $i++ ) {
3670 push @data_list, [ $i, $data_hash{ $i } || 0 ];
3673 $result = Maasha::Plot::histogram_lendist( \@data_list, $options );
3675 $fh = write_stream( $options->{ "data_out" } );
3677 print $fh "$_\n" foreach @{ $result };
3683 sub script_plot_chrdist
3685 # Martin A. Hansen, August 2007.
3687 # Plot chromosome distribution using GNU plot.
3689 my ( $in, # handle to in stream
3690 $out, # handle to out stream
3691 $options, # options hash
3696 my ( $record, %data_hash, @data_list, $elem, $sort_key, $count, $result, $fh );
3698 $options->{ "title" } ||= "Chromosome Distribution";
3700 while ( $record = Maasha::Biopieces::get_record( $in ) )
3702 if ( $record->{ "CHR" } ) { # generic
3703 $data_hash{ $record->{ "CHR" } }++;
3704 } elsif ( $record->{ "REC_TYPE" } eq "PATSCAN" and $record->{ "S_ID" } =~ /^chr/i ) { # patscan
3705 $data_hash{ $record->{ "S_ID" } }++;
3706 } elsif ( $record->{ "REC_TYPE" } eq "PSL" and $record->{ "S_ID" } =~ /^chr/i ) { # BLAT / PSL
3707 $data_hash{ $record->{ "S_ID" } }++;
3708 } elsif ( $record->{ "REC_TYPE" } eq "BLAST" and $record->{ "S_ID" } =~ /^chr/i ) { # BLAST
3709 $data_hash{ $record->{ "S_ID" } }++;
3712 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
3715 foreach $elem ( keys %data_hash )
3719 $sort_key =~ s/chr//i;
3721 $sort_key =~ s/^X(.*)/99$1/;
3722 $sort_key =~ s/^Y(.*)/99$1/;
3723 $sort_key =~ s/^Z(.*)/999$1/;
3724 $sort_key =~ s/^M(.*)/9999$1/;
3725 $sort_key =~ s/^U(.*)/99999$1/;
3727 $count = $sort_key =~ tr/_//;
3729 $sort_key =~ s/_.*/"999999" x $count/ex;
3731 push @data_list, [ $elem, $data_hash{ $elem }, $sort_key ];
3734 @data_list = sort { $a->[ 2 ] <=> $b->[ 2 ] } @data_list;
3736 $result = Maasha::Plot::histogram_chrdist( \@data_list, $options );
3738 $fh = write_stream( $options->{ "data_out" } );
3740 print $fh "$_\n" foreach @{ $result };
3746 sub script_plot_karyogram
3748 # Martin A. Hansen, August 2007.
3750 # Plot hits on karyogram.
3752 my ( $in, # handle to in stream
3753 $out, # handle to out stream
3754 $options, # options hash
3759 my ( %options, $record, @data, $fh, $result, %data_hash );
3761 $options->{ "genome" } ||= "human";
3762 $options->{ "feat_color" } ||= "black";
3764 while ( $record = Maasha::Biopieces::get_record( $in ) )
3766 if ( $record->{ "CHR" } and $record->{ "CHR_BEG" } and $record->{ "CHR_END" } )
3768 push @{ $data_hash{ $record->{ "CHR" } } }, [ $record->{ "CHR_BEG" }, $record->{ "CHR_END" }, $options->{ "feat_color" } ];
3771 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
3774 $result = Maasha::Plot::karyogram( \%data_hash, \%options );
3776 $fh = write_stream( $options->{ "data_out" } );
3784 sub script_plot_matches
3786 # Martin A. Hansen, August 2007.
3788 # Plot matches in 2D generating a dotplot.
3790 my ( $in, # handle to in stream
3791 $out, # handle to out stream
3792 $options, # options hash
3797 my ( $record, @data, $fh, $result, %data_hash );
3799 $options->{ "direction" } ||= "both";
3801 while ( $record = Maasha::Biopieces::get_record( $in ) )
3803 if ( defined $record->{ "Q_BEG" } and defined $record->{ "S_BEG" } and $record->{ "Q_END" } and $record->{ "S_END" } ) {
3804 push @data, $record;
3807 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
3810 $options->{ "title" } ||= "plot_matches";
3811 $options->{ "xlabel" } ||= $data[ 0 ]->{ "Q_ID" };
3812 $options->{ "ylabel" } ||= $data[ 0 ]->{ "S_ID" };
3814 $result = Maasha::Plot::dotplot_matches( \@data, $options, $BP_TMP );
3816 $fh = write_stream( $options->{ "data_out" } );
3818 print $fh "$_\n" foreach @{ $result };
3824 sub script_length_vals
3826 # Martin A. Hansen, August 2007.
3828 # Determine the length of the value for given keys.
3830 my ( $in, # handle to in stream
3831 $out, # handle to out stream
3832 $options, # options hash
3837 my ( $record, $key );
3839 while ( $record = Maasha::Biopieces::get_record( $in ) )
3841 foreach $key ( @{ $options->{ "keys" } } )
3843 if ( $record->{ $key } ) {
3844 $record->{ $key . "_LEN" } = length $record->{ $key };
3848 Maasha::Biopieces::put_record( $record, $out );
3855 # Martin A. Hansen, August 2007.
3857 # Calculates the sums for values of given keys.
3859 my ( $in, # handle to in stream
3860 $out, # handle to out stream
3861 $options, # options hash
3866 my ( $record, $key, %sum_hash, $fh );
3868 while ( $record = Maasha::Biopieces::get_record( $in ) )
3870 foreach $key ( @{ $options->{ "keys" } } )
3872 if ( $record->{ $key } ) {
3873 $sum_hash{ $key } += $record->{ $key };
3877 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
3880 $fh = write_stream( $options->{ "data_out" } );
3882 foreach $key ( @{ $options->{ "keys" } } ) {
3883 Maasha::Biopieces::put_record( { $key . "_SUM" => $sum_hash{ $key } || 0 } , $fh );
3890 sub script_mean_vals
3892 # Martin A. Hansen, August 2007.
3894 # Calculate the mean of values of given keys.
3896 my ( $in, # handle to in stream
3897 $out, # handle to out stream
3898 $options, # options hash
3903 my ( $record, $key, %sum_hash, %count_hash, $mean, $fh );
3905 while ( $record = Maasha::Biopieces::get_record( $in ) )
3907 foreach $key ( @{ $options->{ "keys" } } )
3909 if ( $record->{ $key } )
3911 $sum_hash{ $key } += $record->{ $key };
3912 $count_hash{ $key }++;
3916 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
3919 $fh = write_stream( $options->{ "data_out" } );
3921 foreach $key ( @{ $options->{ "keys" } } )
3923 if ( $count_hash{ $key } ) {
3924 $mean = sprintf( "%.2f", ( $sum_hash{ $key } / $count_hash{ $key } ) );
3929 Maasha::Biopieces::put_record( { $key . "_MEAN" => $mean } , $fh );
3936 sub script_median_vals
3938 # Martin A. Hansen, March 2008.
3940 # Calculate the median values of given keys.
3942 my ( $in, # handle to in stream
3943 $out, # handle to out stream
3944 $options, # options hash
3949 my ( $record, $key, %median_hash, $median, $fh );
3951 while ( $record = Maasha::Biopieces::get_record( $in ) )
3953 foreach $key ( @{ $options->{ "keys" } } ) {
3954 push @{ $median_hash{ $key } }, $record->{ $key } if defined $record->{ $key };
3957 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
3960 $fh = write_stream( $options->{ "data_out" } );
3962 foreach $key ( @{ $options->{ "keys" } } )
3964 if ( $median_hash{ $key } ) {
3965 $median = Maasha::Calc::median( $median_hash{ $key } );
3970 Maasha::Biopieces::put_record( { $key . "_MEDIAN" => $median } , $fh );
3979 # Martin A. Hansen, February 2008.
3981 # Determine the maximum values of given keys.
3983 my ( $in, # handle to in stream
3984 $out, # handle to out stream
3985 $options, # options hash
3990 my ( $record, $key, $fh, %max_hash, $max_record );
3992 while ( $record = Maasha::Biopieces::get_record( $in ) )
3994 foreach $key ( @{ $options->{ "keys" } } )
3996 if ( $record->{ $key } )
3998 $max_hash{ $key } = $record->{ $key } if $record->{ $key } > $max_hash{ $key };
4002 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
4005 $fh = write_stream( $options->{ "data_out" } );
4007 foreach $key ( @{ $options->{ "keys" } } )
4009 $max_record->{ $key . "_MAX" } = $max_hash{ $key };
4012 Maasha::Biopieces::put_record( $max_record, $fh );
4020 # Martin A. Hansen, February 2008.
4022 # Determine the minimum values of given keys.
4024 my ( $in, # handle to in stream
4025 $out, # handle to out stream
4026 $options, # options hash
4031 my ( $record, $key, $fh, %min_hash, $min_record );
4033 while ( $record = Maasha::Biopieces::get_record( $in ) )
4035 foreach $key ( @{ $options->{ "keys" } } )
4037 if ( defined $record->{ $key } )
4039 if ( exists $min_hash{ $key } ) {
4040 $min_hash{ $key } = $record->{ $key } if $record->{ $key } < $min_hash{ $key };
4042 $min_hash{ $key } = $record->{ $key };
4047 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
4050 $fh = write_stream( $options->{ "data_out" } );
4052 foreach $key ( @{ $options->{ "keys" } } )
4054 $min_record->{ $key . "_MIN" } = $min_hash{ $key };
4057 Maasha::Biopieces::put_record( $min_record, $fh );
4063 sub script_upload_to_ucsc
4065 # Martin A. Hansen, August 2007.
4067 # Calculate the mean of values of given keys.
4069 # This routine has developed into the most ugly hack. Do something!
4071 my ( $in, # handle to in stream
4072 $out, # handle to out stream
4073 $options, # options hash
4078 my ( $record, $file, $wib_file, $wig_file, $wib_dir, $fh_out, $i, $first, $format, $type, $columns, $append, $entry );
4080 $options->{ "short_label" } ||= $options->{ 'table' };
4081 $options->{ "long_label" } ||= $options->{ 'table' };
4082 $options->{ "group" } ||= $ENV{ "LOGNAME" };
4083 $options->{ "priority" } ||= 1;
4084 $options->{ "visibility" } ||= "pack";
4085 $options->{ "color" } ||= join( ",", int( rand( 255 ) ), int( rand( 255 ) ), int( rand( 255 ) ) );
4086 $options->{ "chunk_size" } ||= 10_000_000_000; # Due to 32-bit UCSC compilation really large tables cannot be loaded in one go.
4088 $file = "$BP_TMP/ucsc_upload.tmp";
4093 $fh_out = Maasha::Common::write_open( $file );
4095 while ( $record = Maasha::Biopieces::get_record( $in ) )
4097 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
4099 if ( $record->{ "REC_TYPE" } eq "fixed_step" )
4103 if ( $entry = Maasha::UCSC::Wiggle::biopiece2fixedstep( $record ) ) {
4104 Maasha::UCSC::Wiggle::fixedstep_entry_put( $entry, $fh_out );
4107 elsif ( $record->{ "REC_TYPE" } eq "PSL" )
4111 Maasha::UCSC::psl_put_header( $fh_out ) if $first;
4112 Maasha::UCSC::psl_put_entry( $record, $fh_out );
4116 elsif ( $record->{ "REC_TYPE" } eq "BED" and $record->{ "SEC_STRUCT" } )
4118 # chrom chromStart chromEnd name score strand size secStr conf
4122 print $fh_out join ( "\t",
4124 $record->{ "CHR_BEG" },
4125 $record->{ "CHR_END" } + 1,
4126 $record->{ "Q_ID" },
4127 $record->{ "SCORE" },
4128 $record->{ "STRAND" },
4129 $record->{ "SIZE" },
4130 $record->{ "SEC_STRUCT" },
4131 $record->{ "CONF" },
4134 elsif ( $record->{ "REC_TYPE" } eq "BED" )
4137 $columns = $record->{ "BED_COLS" };
4139 if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) {
4140 Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns, $options->{ 'check' } );
4143 elsif ( $record->{ "REC_TYPE" } eq "PATSCAN" and $record->{ "CHR" } )
4148 if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) {
4149 Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns, $options->{ 'check' } );
4152 elsif ( $record->{ "REC_TYPE" } eq "BLAST" and $record->{ "S_ID" } =~ /^chr/ )
4157 $record->{ "SCORE" } = $record->{ "BIT_SCORE" } * 1000;
4159 if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) {
4160 Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns, $options->{ 'check' } );
4163 elsif ( $record->{ "REC_TYPE" } eq "VMATCH" and $record->{ "S_ID" } =~ /^chr/i )
4168 if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) {
4169 Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns, $options->{ 'check' } );
4173 if ( $i == $options->{ "chunk_size" } )
4177 if ( $format eq "BED" ) {
4178 Maasha::UCSC::bed_upload_to_ucsc( $BP_TMP, $file, $options, $append );
4179 } elsif ( $format eq "PSL" ) {
4180 Maasha::UCSC::psl_upload_to_ucsc( $file, $options, $append );
4189 $fh_out = Maasha::Common::write_open( $file );
4197 if ( exists $options->{ "database" } and $options->{ "table" } )
4199 if ( $format eq "BED" )
4201 $type = "bed $columns";
4203 Maasha::UCSC::bed_upload_to_ucsc( $BP_TMP, $file, $options, $append );
4205 elsif ( $format eq "BED_SS" )
4207 $type = "type bed 6 +";
4209 Maasha::UCSC::bed_upload_to_ucsc( $BP_TMP, $file, $options, $append );
4211 elsif ( $format eq "PSL" )
4215 Maasha::UCSC::psl_upload_to_ucsc( $file, $options, $append );
4217 elsif ( $format eq "WIGGLE" )
4219 $options->{ "visibility" } = "full";
4221 $wig_file = "$options->{ 'table' }.wig";
4222 $wib_file = "$options->{ 'table' }.wib";
4224 $wib_dir = "$ENV{ 'HOME' }/ucsc/wib";
4226 Maasha::Common::dir_create_if_not_exists( $wib_dir );
4228 if ( $options->{ 'verbose' } ) {
4229 `cd $BP_TMP && wigEncode $file $wig_file $wib_file`;
4231 `cd $BP_TMP && wigEncode $file $wig_file $wib_file > /dev/null 2>&1`;
4234 Maasha::Common::run( "mv", "$BP_TMP/$wib_file $wib_dir" );
4242 Maasha::UCSC::wiggle_upload_to_ucsc( $BP_TMP, $wib_dir, $file, $options );
4247 Maasha::UCSC::ucsc_update_config( $options, $type );
4252 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<