1 package Maasha::BioRun;
4 # Copyright (C) 2007-2009 Martin A. Hansen.
6 # This program is free software; you can redistribute it and/or
7 # modify it under the terms of the GNU General Public License
8 # as published by the Free Software Foundation; either version 2
9 # of the License, or (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 # http://www.gnu.org/copyleft/gpl.html
23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
26 # Routines that contains Biopieces which are run.
29 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
34 use Getopt::Long qw( :config bundling );
35 use Time::HiRes qw( gettimeofday );
36 use Storable qw( dclone );
37 use Maasha::Biopieces;
46 use Maasha::Stockholm;
52 use Maasha::UCSC::BED;
53 use Maasha::UCSC::Wiggle;
62 use vars qw( @ISA @EXPORT_OK );
66 @ISA = qw( Exporter );
74 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> GLOBALS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
77 my ( $script, $BP_TMP );
79 $script = Maasha::Common::get_scriptname();
80 $BP_TMP = Maasha::Common::get_tmpdir();
83 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> RUN SCRIPT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
86 run_script( $script );
89 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SUBROUTINES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
94 # Martin A. Hansen, August 2007.
96 # Run a specific script.
98 my ( $script, # script name
103 my ( $t0, $t1, $options, $in, $out );
105 Maasha::Biopieces::log_biopiece();
107 $t0 = gettimeofday();
109 $options = get_options( $script );
111 $options->{ "SCRIPT" } = $script;
113 $script = "print_usage" if ( -t STDIN and keys %{ $options } <= 1 or $options->{ 'help' } );
115 $in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } );
116 $out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } );
118 if ( $script eq "print_usage" ) { script_print_usage( $in, $out, $options ) }
119 elsif ( $script eq "read_psl" ) { script_read_psl( $in, $out, $options ) }
120 elsif ( $script eq "read_bed" ) { script_read_bed( $in, $out, $options ) }
121 elsif ( $script eq "read_fixedstep" ) { script_read_fixedstep( $in, $out, $options ) }
122 elsif ( $script eq "read_blast_tab" ) { script_read_blast_tab( $in, $out, $options ) }
123 elsif ( $script eq "read_embl" ) { script_read_embl( $in, $out, $options ) }
124 elsif ( $script eq "read_stockholm" ) { script_read_stockholm( $in, $out, $options ) }
125 elsif ( $script eq "read_phastcons" ) { script_read_phastcons( $in, $out, $options ) }
126 elsif ( $script eq "read_soft" ) { script_read_soft( $in, $out, $options ) }
127 elsif ( $script eq "read_gff" ) { script_read_gff( $in, $out, $options ) }
128 elsif ( $script eq "read_2bit" ) { script_read_2bit( $in, $out, $options ) }
129 elsif ( $script eq "read_solexa" ) { script_read_solexa( $in, $out, $options ) }
130 elsif ( $script eq "read_solid" ) { script_read_solid( $in, $out, $options ) }
131 elsif ( $script eq "read_mysql" ) { script_read_mysql( $in, $out, $options ) }
132 elsif ( $script eq "read_ucsc_config" ) { script_read_ucsc_config( $in, $out, $options ) }
133 elsif ( $script eq "uppercase_seq" ) { script_uppercase_seq( $in, $out, $options ) }
134 elsif ( $script eq "complexity_seq" ) { script_complexity_seq( $in, $out, $options ) }
135 elsif ( $script eq "get_genome_align" ) { script_get_genome_align( $in, $out, $options ) }
136 elsif ( $script eq "get_genome_phastcons" ) { script_get_genome_phastcons( $in, $out, $options ) }
137 elsif ( $script eq "soap_seq" ) { script_soap_seq( $in, $out, $options ) }
138 elsif ( $script eq "write_bed" ) { script_write_bed( $in, $out, $options ) }
139 elsif ( $script eq "write_psl" ) { script_write_psl( $in, $out, $options ) }
140 elsif ( $script eq "write_fixedstep" ) { script_write_fixedstep( $in, $out, $options ) }
141 elsif ( $script eq "write_2bit" ) { script_write_2bit( $in, $out, $options ) }
142 elsif ( $script eq "write_solid" ) { script_write_solid( $in, $out, $options ) }
143 elsif ( $script eq "write_ucsc_config" ) { script_write_ucsc_config( $in, $out, $options ) }
144 elsif ( $script eq "remove_mysql_tables" ) { script_remove_mysql_tables( $in, $out, $options ) }
145 elsif ( $script eq "remove_ucsc_tracks" ) { script_remove_ucsc_tracks( $in, $out, $options ) }
146 elsif ( $script eq "plot_histogram" ) { script_plot_histogram( $in, $out, $options ) }
147 elsif ( $script eq "plot_lendist" ) { script_plot_lendist( $in, $out, $options ) }
148 elsif ( $script eq "plot_karyogram" ) { script_plot_karyogram( $in, $out, $options ) }
149 elsif ( $script eq "plot_matches" ) { script_plot_matches( $in, $out, $options ) }
150 elsif ( $script eq "plot_seqlogo" ) { script_plot_seqlogo( $in, $out, $options ) }
151 elsif ( $script eq "plot_phastcons_profiles" ) { script_plot_phastcons_profiles( $in, $out, $options ) }
152 elsif ( $script eq "upload_to_ucsc" ) { script_upload_to_ucsc( $in, $out, $options ) }
154 close $in if defined $in;
157 $t1 = gettimeofday();
159 print STDERR "Program: $script" . ( " " x ( 25 - length( $script ) ) ) . sprintf( "Run time: %.4f\n", ( $t1 - $t0 ) ) if $options->{ 'verbose' };
165 # Martin A. Hansen, February 2008.
167 # Gets options from commandline and checks these vigerously.
169 my ( $script, # name of script
174 my ( %options, @options, $opt, @genomes, $real );
176 if ( $script eq "print_usage" )
182 elsif ( $script eq "read_psl" )
189 elsif ( $script eq "read_bed" )
198 elsif ( $script eq "read_fixedstep" )
205 elsif ( $script eq "read_blast_tab" )
212 elsif ( $script eq "read_embl" )
222 elsif ( $script eq "read_stockholm" )
229 elsif ( $script eq "read_phastcons" )
240 elsif ( $script eq "read_soft" )
248 elsif ( $script eq "read_gff" )
255 elsif ( $script eq "read_2bit" )
263 elsif ( $script eq "read_solexa" )
272 elsif ( $script eq "read_solid" )
280 elsif ( $script eq "read_mysql" )
289 elsif ( $script eq "read_ucsc_config" )
296 elsif ( $script eq "get_genome_align" )
307 elsif ( $script eq "get_genome_phastcons" )
318 elsif ( $script eq "soap_seq" )
329 elsif ( $script eq "write_bed" )
339 elsif ( $script eq "write_psl" )
347 elsif ( $script eq "write_fixedstep" )
355 elsif ( $script eq "write_2bit" )
363 elsif ( $script eq "write_solid" )
372 elsif ( $script eq "write_ucsc_config" )
379 elsif ( $script eq "plot_seqlogo" )
386 elsif ( $script eq "plot_phastcons_profiles" )
401 elsif ( $script eq "remove_mysql_tables" )
412 elsif ( $script eq "remove_ucsc_tracks" )
424 elsif ( $script eq "plot_histogram" )
437 elsif ( $script eq "plot_lendist" )
449 elsif ( $script eq "plot_karyogram" )
458 elsif ( $script eq "plot_matches" )
470 elsif ( $script eq "upload_to_ucsc" )
494 # print STDERR Dumper( \@options );
501 # print STDERR Dumper( \%options );
503 if ( -t STDIN && scalar( keys %options ) == 0 or $options{ "help" } ) {
504 return wantarray ? %options : \%options;
507 $options{ "cols" } = [ split ",", $options{ "cols" } ] if defined $options{ "cols" };
508 $options{ "keys" } = [ split ",", $options{ "keys" } ] if defined $options{ "keys" };
509 $options{ "no_keys" } = [ split ",", $options{ "no_keys" } ] if defined $options{ "no_keys" };
510 $options{ "save_keys" } = [ split ",", $options{ "save_keys" } ] if defined $options{ "save_keys" };
511 $options{ "quals" } = [ split ",", $options{ "quals" } ] if defined $options{ "quals" };
512 $options{ "feats" } = [ split ",", $options{ "feats" } ] if defined $options{ "feats" };
513 $options{ "frames" } = [ split ",", $options{ "frames" } ] if defined $options{ "frames" };
514 $options{ "samples" } = [ split ",", $options{ "samples" } ] if defined $options{ "samples" };
515 $options{ "tables" } = [ split ",", $options{ "tables" } ] if defined $options{ "tables" };
516 $options{ "tracks" } = [ split ",", $options{ "tracks" } ] if defined $options{ "tracks" };
518 # ---- check arguments ----
520 if ( $options{ 'data_in' } )
522 $options{ "files" } = Maasha::Biopieces::getopt_files( $options{ 'data_in' } );
524 Maasha::Common::error( qq(Argument to --data_in must be a valid file or fileglob expression) ) if scalar @{ $options{ "files" } } == 0;
527 map { Maasha::Common::error( qq(Argument to --cols must be a whole numbers - not "$_") ) if $_ !~ /^\d+$/ } @{ $options{ "cols" } } if $options{ "cols" };
529 # print STDERR Dumper( \%options );
531 $real = "beg|end|word_size|wrap|len|prefix_length|mismatches|offset|num|skip|cpus|window_size|step_size";
533 foreach $opt ( keys %options )
535 if ( $opt =~ /stream_in|pattern_in|exact_in/ and not -f $options{ $opt } )
537 Maasha::Common::error( qq(Argument to --$opt must be a valid file or fileglob expression - not "$options{ $opt }") );
539 elsif ( $opt =~ /$real/ and $options{ $opt } !~ /^\d+$/ )
541 Maasha::Common::error( qq(Argument to --$opt must be a whole number - not "$options{ $opt }") );
543 elsif ( $opt =~ /max_hits|max_hits|max_misses|dist|edit_dist|flank|gap|hamming_dist|priority/ and $options{ $opt } !~ /^-?\d+$/ )
545 Maasha::Common::error( qq(Argument to --$opt must be an integer - not "$options{ $opt }") );
547 elsif ( $opt =~ /identity|threshold/ and $options{ $opt } !~ /^-?(?:\d+(?:\.\d*)?|\.\d+)$/ )
549 Maasha::Common::error( qq(Argument to --$opt must be a decimal number - not "$options{ $opt }") );
551 elsif ( $opt =~ /e_val/ and $options{ $opt } !~ /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/ )
553 Maasha::Common::error( qq(Argument to --$opt must be a float - not "$options{ $opt }") );
555 elsif ( $opt =~ /strand/ and $options{ $opt } !~ /^(\+|-)$/ )
557 Maasha::Common::error( qq(Argument to --$opt must be "+" or "-" - not "$options{ $opt }") );
559 elsif ( $opt eq "genome" )
561 @genomes = Maasha::Common::ls_dirs( "$ENV{ 'BP_DATA' }/genomes" );
562 map { $_ =~ s/.*\/(.+)$/$1/ } @genomes;
564 if ( not grep { $_ =~ /^$options{ $opt }$/ } @genomes ) {
565 Maasha::Common::error( qq(Genome $options{ $opt } not found in "$ENV{ 'BP_DATA' }/genomes/") );
568 elsif ( $opt eq "terminal" and not $options{ $opt } =~ /^(svg|post|dumb|x11)/ )
570 Maasha::Common::error( qq(Bad --$opt argument "$options{ $opt }") );
572 elsif ( $opt eq "table" and $options{ $opt } =~ /(-|\.)/ )
574 Maasha::Common::error( qq(Character '$1' is not allowed in table name: $options{ $opt }) );
576 elsif ( $opt eq "merge" and $options{ $opt } !~ /^(AandB|AorB|BorA|AnotB|BnotA)$/ )
578 Maasha::Common::error( qq(Argument to --$opt must be AandB, AorB, BorA, AnotB, or BnotA - not "$options{ $opt }") );
580 elsif ( $opt eq "format" and $script eq "read_solexa" and $options{ $opt } !~ /octal|decimal/ )
582 Maasha::Common::error( qq(Argument to --$opt must be octal or decimal - not "$options{ $opt }") );
586 Maasha::Common::error( qq(no --database specified) ) if $script eq "remove_ucsc_tracks" and not $options{ "database" };
587 Maasha::Common::error( qq(no --in_file or --genome specified) ) if $script eq "soap_seq" and not $options{ "genome" } and not $options{ "in_file" };
588 Maasha::Common::error( qq(both --in_file and --genome specified) ) if $script eq "soap_seq" and $options{ "genome" } and $options{ "in_file" };
589 Maasha::Common::error( qq(no --genome specified) ) if $script =~ /get_genome_align|get_genome_phastcons|plot_phastcons_profiles|plot_karyogram/ and not $options{ "genome" };
590 Maasha::Common::error( qq(no --key specified) ) if $script =~ /plot_lendist|plot_histogram/ and not $options{ "key" };
592 if ( $script eq "upload_to_ucsc" )
594 Maasha::Common::error( qq(no --database specified) ) if not $options{ "database" };
595 Maasha::Common::error( qq(no --table specified) ) if not $options{ "table" };
598 return wantarray ? %options : \%options;
602 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SCRIPTS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
605 sub script_print_usage
607 # Martin A. Hansen, January 2008.
609 # Retrieves usage information from file and
610 # prints this nicely formatted.
612 my ( $in, # handle to in stream
613 $out, # handle to out stream
614 $options, # options hash
619 my ( $file, $wiki, $lines );
621 if ( $options->{ 'data_in' } ) {
622 $file = $options->{ 'data_in' };
624 $file = join "", $ENV{ 'BP_DIR' }, "/bp_usage/", $options->{ 'SCRIPT' }, ".wiki";
627 $wiki = Maasha::Gwiki::gwiki_read( $file );
629 ( $wiki->[ 2 ], $wiki->[ 3 ], $wiki->[ 0 ], $wiki->[ 1 ] ) = ( $wiki->[ 0 ], $wiki->[ 1 ], $wiki->[ 2 ], $wiki->[ 3 ] );
631 if ( not $options->{ "help" } ) {
632 @{ $wiki } = grep { $_->[ 0 ]->{ 'SECTION' } =~ /Biopiece|summary|Usage|Options|Help/ } @{ $wiki };
635 $lines = Maasha::Gwiki::gwiki2ascii( $wiki );
637 print STDERR "$_\n" foreach @{ $lines };
645 # Martin A. Hansen, August 2007.
647 # Read psl table from stream or file.
649 my ( $in, # handle to in stream
650 $out, # handle to out stream
651 $options, # options hash
656 my ( $record, $file, $data_in, $num );
658 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
659 Maasha::Biopieces::put_record( $record, $out );
664 foreach $file ( @{ $options->{ "files" } } )
666 $data_in = Maasha::Common::read_open( $file );
668 while ( $record = Maasha::UCSC::psl_get_entry( $data_in ) )
670 Maasha::Biopieces::put_record( $record, $out );
672 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
684 # Martin A. Hansen, August 2007.
686 # Read bed table from stream or file.
688 my ( $in, # handle to in stream
689 $out, # handle to out stream
690 $options, # options hash
695 my ( $cols, $file, $record, $bed_entry, $data_in, $num );
697 $cols = $options->{ 'cols' }->[ 0 ];
699 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
700 Maasha::Biopieces::put_record( $record, $out );
705 foreach $file ( @{ $options->{ "files" } } )
707 $data_in = Maasha::Common::read_open( $file );
709 while ( $bed_entry = Maasha::UCSC::BED::bed_entry_get( $data_in, $cols, $options->{ 'check' } ) )
711 $record = Maasha::UCSC::BED::bed2biopiece( $bed_entry );
713 Maasha::Biopieces::put_record( $record, $out );
715 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
725 close $data_in if $data_in;
729 sub script_read_fixedstep
731 # Martin A. Hansen, Juli 2008.
733 # Read fixedstep wiggle format from stream or file.
735 my ( $in, # handle to in stream
736 $out, # handle to out stream
737 $options, # options hash
742 my ( $file, $record, $entry, $data_in, $num );
744 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
745 Maasha::Biopieces::put_record( $record, $out );
750 foreach $file ( @{ $options->{ "files" } } )
752 $data_in = Maasha::Common::read_open( $file );
754 while ( $entry = Maasha::UCSC::Wiggle::fixedstep_entry_get( $data_in ) )
756 $record = Maasha::UCSC::Wiggle::fixedstep2biopiece( $entry );
758 Maasha::Biopieces::put_record( $record, $out );
760 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
770 close $data_in if $data_in;
774 sub script_read_blast_tab
776 # Martin A. Hansen, September 2007.
778 # Read tabular BLAST output from NCBI blast run with -m8 or -m9.
780 my ( $in, # handle to in stream
781 $out, # handle to out stream
782 $options, # options hash
787 my ( $file, $line, @fields, $strand, $record, $data_in, $num );
789 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
790 Maasha::Biopieces::put_record( $record, $out );
795 foreach $file ( @{ $options->{ "files" } } )
797 $data_in = Maasha::Common::read_open( $file );
799 while ( $line = <$data_in> )
803 next if $line =~ /^#/;
805 @fields = split /\t/, $line;
807 $record->{ "REC_TYPE" } = "BLAST";
808 $record->{ "Q_ID" } = $fields[ 0 ];
809 $record->{ "S_ID" } = $fields[ 1 ];
810 $record->{ "IDENT" } = $fields[ 2 ];
811 $record->{ "ALIGN_LEN" } = $fields[ 3 ];
812 $record->{ "MISMATCHES" } = $fields[ 4 ];
813 $record->{ "GAPS" } = $fields[ 5 ];
814 $record->{ "Q_BEG" } = $fields[ 6 ] - 1; # BLAST is 1-based
815 $record->{ "Q_END" } = $fields[ 7 ] - 1; # BLAST is 1-based
816 $record->{ "S_BEG" } = $fields[ 8 ] - 1; # BLAST is 1-based
817 $record->{ "S_END" } = $fields[ 9 ] - 1; # BLAST is 1-based
818 $record->{ "E_VAL" } = $fields[ 10 ];
819 $record->{ "BIT_SCORE" } = $fields[ 11 ];
821 if ( $record->{ "S_BEG" } > $record->{ "S_END" } )
823 $record->{ "STRAND" } = '-';
825 ( $record->{ "S_BEG" }, $record->{ "S_END" } ) = ( $record->{ "S_END" }, $record->{ "S_BEG" } );
829 $record->{ "STRAND" } = '+';
832 Maasha::Biopieces::put_record( $record, $out );
834 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
844 close $data_in if $data_in;
850 # Martin A. Hansen, August 2007.
854 my ( $in, # handle to in stream
855 $out, # handle to out stream
856 $options, # options hash
861 my ( %options2, $file, $data_in, $num, $entry, $record );
863 map { $options2{ "keys" }{ $_ } = 1 } @{ $options->{ "keys" } };
864 map { $options2{ "feats" }{ $_ } = 1 } @{ $options->{ "feats" } };
865 map { $options2{ "quals" }{ $_ } = 1 } @{ $options->{ "quals" } };
867 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
868 Maasha::Biopieces::put_record( $record, $out );
873 foreach $file ( @{ $options->{ "files" } } )
875 $data_in = Maasha::Common::read_open( $file );
877 while ( $entry = Maasha::EMBL::get_embl_entry( $data_in ) )
879 $record = Maasha::EMBL::parse_embl_entry( $entry, \%options2 );
881 my ( $feat, $feat2, $qual, $qual_val, $record_copy );
883 $record_copy = dclone $record;
885 delete $record_copy->{ "FT" };
887 Maasha::Biopieces::put_record( $record_copy, $out );
889 delete $record_copy->{ "SEQ" };
891 foreach $feat ( keys %{ $record->{ "FT" } } )
893 $record_copy->{ "FEAT_TYPE" } = $feat;
895 foreach $feat2 ( @{ $record->{ "FT" }->{ $feat } } )
897 foreach $qual ( keys %{ $feat2 } )
899 $qual_val = join "; ", @{ $feat2->{ $qual } };
904 $record_copy->{ $qual } = $qual_val;
907 Maasha::Biopieces::put_record( $record_copy, $out );
911 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
921 close $data_in if $data_in;
925 sub script_read_stockholm
927 # Martin A. Hansen, August 2007.
929 # Read Stockholm format.
931 my ( $in, # handle to in stream
932 $out, # handle to out stream
933 $options, # options hash
938 my ( $data_in, $file, $num, $entry, $record, $record_anno, $record_align, $key, $seq );
940 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
941 Maasha::Biopieces::put_record( $record, $out );
946 foreach $file ( @{ $options->{ "files" } } )
948 $data_in = Maasha::Common::read_open( $file );
950 while ( $entry = Maasha::Stockholm::get_stockholm_entry( $data_in ) )
952 $record = Maasha::Stockholm::parse_stockholm_entry( $entry );
956 foreach $key ( keys %{ $record->{ "GF" } } ) {
957 $record_anno->{ $key } = $record->{ "GF" }->{ $key };
960 $record_anno->{ "ALIGN" } = $num;
962 Maasha::Biopieces::put_record( $record_anno, $out );
964 foreach $seq ( @{ $record->{ "ALIGN" } } )
969 SEQ_NAME => $seq->[ 0 ],
973 Maasha::Biopieces::put_record( $record_align, $out );
976 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
986 close $data_in if $data_in;
990 sub script_read_phastcons
992 # Martin A. Hansen, December 2007.
994 # Read PhastCons format.
996 my ( $in, # handle to in stream
997 $out, # handle to out stream
998 $options, # options hash
1003 my ( $data_in, $file, $num, $entry, @records, $record );
1005 $options->{ "min" } ||= 10;
1006 $options->{ "dist" } ||= 25;
1007 $options->{ "threshold" } ||= 0.8;
1008 $options->{ "gap" } ||= 5;
1010 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1011 Maasha::Biopieces::put_record( $record, $out );
1016 foreach $file ( @{ $options->{ "files" } } )
1018 $data_in = Maasha::Common::read_open( $file );
1020 while ( $entry = Maasha::UCSC::fixedstep_get_entry( $data_in ) )
1022 @records = Maasha::UCSC::phastcons_parse_entry( $entry, $options );
1024 foreach $record ( @records )
1026 $record->{ "REC_TYPE" } = "BED";
1027 $record->{ "BED_LEN" } = $record->{ "CHR_END" } - $record->{ "CHR_BEG" } + 1;
1029 Maasha::Biopieces::put_record( $record, $out );
1031 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1042 close $data_in if $data_in;
1046 sub script_read_soft
1048 # Martin A. Hansen, December 2007.
1051 # http://www.ncbi.nlm.nih.gov/geo/info/soft2.html
1053 my ( $in, # handle to in stream
1054 $out, # handle to out stream
1055 $options, # options hash
1060 my ( $data_in, $file, $num, $records, $record, $soft_index, $fh, @platforms, $plat_table, @samples, $sample, $old_end, $skip );
1062 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1063 Maasha::Biopieces::put_record( $record, $out );
1068 foreach $file ( @{ $options->{ "files" } } )
1070 print STDERR "Creating index for file: $file\n" if $options->{ "verbose" };
1072 $soft_index = Maasha::NCBI::soft_index_file( $file );
1074 $fh = Maasha::Common::read_open( $file );
1076 @platforms = grep { $_->{ "SECTION" } =~ /PLATFORM/ } @{ $soft_index };
1078 print STDERR "Getting platform tables for file: $file\n" if $options->{ "verbose" };
1080 $plat_table = Maasha::NCBI::soft_get_platform( $fh, $platforms[ 0 ]->{ "LINE_BEG" }, $platforms[ -1 ]->{ "LINE_END" } );
1082 @samples = grep { $_->{ "SECTION" } =~ /SAMPLE/ } @{ $soft_index };
1084 $old_end = $platforms[ -1 ]->{ "LINE_END" };
1086 foreach $sample ( @samples )
1089 $skip = 1 if ( $options->{ "samples" } and grep { $sample->{ "SECTION" } !~ /$_/ } @{ $options->{ "samples" } } );
1091 print STDERR "Getting samples for dataset: $sample->{ 'SECTION' }\n" if $options->{ "verbose" } and not $skip;
1093 $records = Maasha::NCBI::soft_get_sample( $fh, $plat_table, $sample->{ "LINE_BEG" } - $old_end - 1, $sample->{ "LINE_END" } - $old_end - 1, $skip );
1095 foreach $record ( @{ $records } )
1097 Maasha::Biopieces::put_record( $record, $out );
1099 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1104 $old_end = $sample->{ "LINE_END" };
1112 close $data_in if $data_in;
1119 # Martin A. Hansen, February 2008.
1122 # http://www.ncbi.nlm.nih.gov/geo/info/soft2.html
1124 my ( $in, # handle to in stream
1125 $out, # handle to out stream
1126 $options, # options hash
1131 my ( $data_in, $file, $fh, $num, $record, $entry );
1133 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1134 Maasha::Biopieces::put_record( $record, $out );
1139 foreach $file ( @{ $options->{ "files" } } )
1141 $fh = Maasha::Common::read_open( $file );
1143 while ( $entry = Maasha::GFF::get_entry( $fh ) )
1145 Maasha::Biopieces::put_record( $entry, $out );
1147 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1157 close $data_in if $data_in;
1161 sub script_read_2bit
1163 # Martin A. Hansen, March 2008.
1165 # Read sequences from 2bit file.
1167 my ( $in, # handle to in stream
1168 $out, # handle to out stream
1169 $options, # options hash
1174 my ( $record, $file, $data_in, $mask, $toc, $line, $num );
1176 $mask = 1 if not $options->{ "no_mask" };
1178 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1179 Maasha::Biopieces::put_record( $record, $out );
1184 foreach $file ( @{ $options->{ "files" } } )
1186 $data_in = Maasha::Common::read_open( $file );
1188 $toc = Maasha::TwoBit::twobit_get_TOC( $data_in );
1190 foreach $line ( @{ $toc } )
1192 $record->{ "SEQ_NAME" } = $line->[ 0 ];
1193 $record->{ "SEQ" } = Maasha::TwoBit::twobit_get_seq( $data_in, $line->[ 1 ], undef, undef, $mask );
1194 $record->{ "SEQ_LEN" } = length $record->{ "SEQ" };
1196 Maasha::Biopieces::put_record( $record, $out );
1198 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1208 close $data_in if $data_in;
1212 sub script_read_solexa
1214 # Martin A. Hansen, March 2008.
1216 # Read Solexa sequence reads from file.
1218 my ( $in, # handle to in stream
1219 $out, # handle to out stream
1220 $options, # options hash
1225 my ( $record, $file, $data_in, $entry, $num, @seqs, @scores, $i );
1227 $options->{ "format" } ||= "octal";
1228 $options->{ "quality" } ||= 20;
1230 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1231 Maasha::Biopieces::put_record( $record, $out );
1236 foreach $file ( @{ $options->{ "files" } } )
1238 $data_in = Maasha::Common::read_open( $file );
1240 if ( $options->{ "format" } eq "octal" )
1242 while ( $entry = Maasha::Solexa::solexa_get_entry_octal( $data_in ) )
1244 $record = Maasha::Solexa::solexa2biopiece( $entry, $options->{ "quality" } );
1246 Maasha::Biopieces::put_record( $record, $out );
1248 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1255 while ( $entry = Maasha::Solexa::solexa_get_entry_decimal( $data_in ) )
1257 $record = Maasha::Solexa::solexa2biopiece( $entry, $options->{ "quality" } );
1259 Maasha::Biopieces::put_record( $record, $out );
1261 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1272 close $data_in if $data_in;
1276 sub script_read_solid
1278 # Martin A. Hansen, April 2008.
1280 # Read Solid sequence from file.
1282 my ( $in, # handle to in stream
1283 $out, # handle to out stream
1284 $options, # options hash
1289 my ( $record, $file, $data_in, $line, $num, $seq_name, $seq_cs, $seq_qual, @scores, @seqs, $i );
1291 $options->{ "quality" } ||= 15;
1293 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1294 Maasha::Biopieces::put_record( $record, $out );
1299 foreach $file ( @{ $options->{ "files" } } )
1301 $data_in = Maasha::Common::read_open( $file );
1303 while ( $line = <$data_in> )
1307 ( $seq_name, $seq_cs, $seq_qual ) = split /\t/, $line;
1309 @scores = split /,/, $seq_qual;
1310 @seqs = split //, Maasha::Solid::color_space2seq( $seq_cs );
1312 for ( $i = 0; $i < @seqs; $i++ ) {
1313 $seqs[ $i ] = lc $seqs[ $i ] if $scores[ $i ] < $options->{ "quality" };
1317 REC_TYPE => 'SOLID',
1318 SEQ_NAME => $seq_name,
1320 SEQ_QUAL => join( ";", @scores ),
1321 SEQ_LEN => length $seq_cs,
1322 SEQ => join( "", @seqs ),
1323 SCORE_MEAN => sprintf( "%.2f", Maasha::Calc::mean( \@scores ) ),
1326 Maasha::Biopieces::put_record( $record, $out );
1328 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1338 close $data_in if $data_in;
1342 sub script_read_mysql
1344 # Martin A. Hansen, May 2008.
1346 # Read a MySQL query into stream.
1348 my ( $in, # handle to in stream
1349 $out, # handle to out stream
1350 $options, # options hash
1355 my ( $record, $dbh, $results );
1357 $options->{ "user" } ||= Maasha::UCSC::ucsc_get_user();
1358 $options->{ "password" } ||= Maasha::UCSC::ucsc_get_password();
1360 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1361 Maasha::Biopieces::put_record( $record, $out );
1364 $dbh = Maasha::SQL::connect( $options->{ "database" }, $options->{ "user" }, $options->{ "password" } );
1366 $results = Maasha::SQL::query_hashref_list( $dbh, $options->{ "query" } );
1368 Maasha::SQL::disconnect( $dbh );
1370 map { Maasha::Biopieces::put_record( $_ ) } @{ $results };
1374 sub script_read_ucsc_config
1376 # Martin A. Hansen, November 2008.
1378 # Read track entries from UCSC Genome Browser '.ra' files.
1380 my ( $in, # handle to in stream
1381 $out, # handle to out stream
1382 $options, # options hash
1387 my ( $record, $file, $data_in, $entry, $num );
1389 while ( $record = Maasha::Biopieces::get_record( $in ) ) {
1390 Maasha::Biopieces::put_record( $record, $out );
1395 foreach $file ( @{ $options->{ "files" } } )
1397 $data_in = Maasha::Common::read_open( $file );
1399 while ( $record = Maasha::UCSC::ucsc_config_entry_get( $data_in ) )
1401 $record->{ 'REC_TYPE' } = "UCSC Config";
1403 Maasha::Biopieces::put_record( $record, $out );
1405 goto NUM if $options->{ "num" } and $num == $options->{ "num" };
1415 close $data_in if $data_in;
1419 sub script_complexity_seq
1421 # Martin A. Hansen, May 2008.
1423 # Generates an index calculated as the most common di-residue over
1424 # the sequence length for all sequences in stream.
1426 my ( $in, # handle to in stream
1427 $out, # handle to out stream
1432 my ( $record, $index );
1434 while ( $record = Maasha::Biopieces::get_record( $in ) )
1436 $record->{ "SEQ_COMPLEXITY" } = sprintf( "%.2f", Maasha::Seq::seq_complexity( $record->{ "SEQ" } ) ) if $record->{ "SEQ" };
1438 Maasha::Biopieces::put_record( $record, $out );
1443 sub script_get_genome_align
1445 # Martin A. Hansen, April 2008.
1447 # Gets a subalignment from a multiple genome alignment.
1449 my ( $in, # handle to in stream
1450 $out, # handle to out stream
1451 $options, # options hash
1456 my ( $record, $maf_track, $align, $align_num, $beg, $end, $len, $entry );
1458 $options->{ "strand" } ||= "+";
1462 $maf_track = Maasha::Config::maf_track( $options->{ "genome" } );
1464 if ( $options->{ "chr" } and $options->{ "beg" } and ( $options->{ "end" } or $options->{ "len" } ) )
1466 $beg = $options->{ "beg" } - 1;
1468 if ( $options->{ "end" } ) {
1469 $end = $options->{ "end" };
1470 } elsif ( $options->{ "len" } ) {
1471 $end = $beg + $options->{ "len" };
1474 $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $options->{ "chr" }, $beg, $end, $options->{ "strand" } );
1476 foreach $entry ( @{ $align } )
1478 $entry->{ "CHR" } = $record->{ "CHR" };
1479 $entry->{ "CHR_BEG" } = $record->{ "CHR_BEG" };
1480 $entry->{ "CHR_END" } = $record->{ "CHR_END" };
1481 $entry->{ "STRAND" } = $record->{ "STRAND" } || '+';
1482 $entry->{ "Q_ID" } = $record->{ "Q_ID" };
1483 $entry->{ "SCORE" } = $record->{ "SCORE" };
1485 Maasha::Biopieces::put_record( $entry, $out );
1489 while ( $record = Maasha::Biopieces::get_record( $in ) )
1491 if ( $record->{ "REC_TYPE" } eq "BED" )
1493 $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "CHR" }, $record->{ "CHR_BEG" }, $record->{ "CHR_END" }, $record->{ "STRAND" } );
1495 elsif ( $record->{ "REC_TYPE" } eq "VMATCH" )
1497 $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" } + 1, $record->{ "STRAND" } );
1499 elsif ( $record->{ "REC_TYPE" } eq "PSL" )
1501 $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $record->{ "STRAND" } );
1503 elsif ( $record->{ "REC_TYPE" } eq "BLAST" )
1505 $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $record->{ "STRAND" } );
1508 foreach $entry ( @{ $align } )
1510 $entry->{ "CHR" } = $record->{ "CHR" };
1511 $entry->{ "CHR_BEG" } = $record->{ "CHR_BEG" };
1512 $entry->{ "CHR_END" } = $record->{ "CHR_END" };
1513 $entry->{ "STRAND" } = $record->{ "STRAND" };
1514 $entry->{ "Q_ID" } = $record->{ "Q_ID" };
1515 $entry->{ "SCORE" } = $record->{ "SCORE" };
1517 Maasha::Biopieces::put_record( $entry, $out );
1525 sub script_get_genome_phastcons
1527 # Martin A. Hansen, February 2008.
1529 # Get phastcons scores from genome intervals.
1531 my ( $in, # handle to in stream
1532 $out, # handle to out stream
1533 $options, # options hash
1538 my ( $phastcons_file, $phastcons_index, $index, $fh_phastcons, $scores, $record );
1540 $options->{ "flank" } ||= 0;
1542 $phastcons_file = Maasha::Config::genome_phastcons( $options->{ "genome" } );
1543 $phastcons_index = Maasha::Config::genome_phastcons_index( $options->{ "genome" } );
1545 $index = Maasha::UCSC::fixedstep_index_retrieve( $phastcons_index );
1546 $fh_phastcons = Maasha::Common::read_open( $phastcons_file );
1548 if ( defined $options->{ "chr" } and defined $options->{ "beg" } and ( defined $options->{ "end" } or defined $options->{ "len" } ) )
1550 $options->{ "beg" } -= 1; # request is 1-based
1551 $options->{ "end" } -= 1; # request is 1-based
1553 if ( $options->{ "len" } ) {
1554 $options->{ "end" } = $options->{ "beg" } + $options->{ "len" } - 1;
1557 $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $options->{ "chr" }, $options->{ "beg" }, $options->{ "end" }, $options->{ "flank" } );
1559 $record->{ "CHR" } = $options->{ "chr" };
1560 $record->{ "CHR_BEG" } = $options->{ "beg" } - $options->{ "flank" };
1561 $record->{ "CHR_END" } = $options->{ "end" } + $options->{ "flank" };
1563 $record->{ "PHASTCONS" } = join ",", @{ $scores };
1564 $record->{ "PHAST_COUNT" } = scalar @{ $scores }; # DEBUG
1566 Maasha::Biopieces::put_record( $record, $out );
1569 while ( $record = Maasha::Biopieces::get_record( $in ) )
1571 if ( $record->{ "REC_TYPE" } eq "BED" )
1573 $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "CHR" }, $record->{ "CHR_BEG" }, $record->{ "CHR_END" }, $options->{ "flank" } );
1575 elsif ( $record->{ "REC_TYPE" } eq "PSL" )
1577 $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $options->{ "flank" } );
1579 elsif ( $record->{ "REC_TYPE" } eq "BLAST" )
1581 $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $options->{ "flank" } );
1584 $record->{ "PHASTCONS" } = join ",", @{ $scores } if @{ $scores };
1585 # $record->{ "PHAST_COUNT" } = @{ $scores } if @{ $scores }; # DEBUG
1587 Maasha::Biopieces::put_record( $record, $out );
1590 close $fh_phastcons if $fh_phastcons;
1596 # Martin A. Hansen, July 2008.
1598 # soap sequences in stream against a given file or genome.
1600 my ( $in, # handle to in stream
1601 $out, # handle to out stream
1602 $options, # options hash
1607 my ( $genome, $tmp_in, $tmp_out, $fh_in, $fh_out, $record, $line, @fields, $entry, $count, $args );
1609 $options->{ "seed_size" } ||= 10;
1610 $options->{ "mismatches" } ||= 2;
1611 $options->{ "gap_size" } ||= 0;
1612 $options->{ "cpus" } ||= 1;
1614 if ( $options->{ "genome" } ) {
1615 $options->{ "in_file" } = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/fasta/$options->{ 'genome' }.fna";
1618 $tmp_in = "$BP_TMP/soap_query.seq";
1619 $tmp_out = "$BP_TMP/soap.result";
1621 $fh_out = Maasha::Common::write_open( $tmp_in );
1625 while ( $record = Maasha::Biopieces::get_record( $in ) )
1627 if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) )
1629 Maasha::Fasta::put_entry( $entry, $fh_out );
1634 Maasha::Biopieces::put_record( $record, $out );
1642 "-s $options->{ 'seed_size' }",
1645 "-v $options->{ 'mismatches' }",
1646 "-g $options->{ 'gap_size' }",
1647 "-p $options->{ 'cpus' }",
1648 "-d $options->{ 'in_file' }",
1652 $args .= " > /dev/null 2>&1" if not $options->{ 'verbose' };
1654 Maasha::Common::run( "soap", $args, 1 );
1658 $fh_out = Maasha::Common::read_open( $tmp_out );
1662 while ( $line = <$fh_out> )
1666 @fields = split /\t/, $line;
1668 $record->{ "REC_TYPE" } = "SOAP";
1669 $record->{ "Q_ID" } = $fields[ 0 ];
1670 $record->{ "SCORE" } = $fields[ 3 ];
1671 $record->{ "STRAND" } = $fields[ 6 ];
1672 $record->{ "S_ID" } = $fields[ 7 ];
1673 $record->{ "S_BEG" } = $fields[ 8 ] - 1; # soap is 1-based
1674 $record->{ "S_END" } = $fields[ 8 ] + $fields[ 5 ] - 2;
1676 Maasha::Biopieces::put_record( $record, $out );
1686 sub script_write_bed
1688 # Martin A. Hansen, August 2007.
1690 # Write BED format for the UCSC genome browser using records in stream.
1692 my ( $in, # handle to in stream
1693 $out, # handle to out stream
1694 $options, # options hash
1699 my ( $cols, $fh, $record, $bed_entry, $new_record );
1701 $cols = $options->{ 'cols' }->[ 0 ];
1703 $fh = Maasha::Biopieces::write_stream( $options->{ 'data_out' }, $options->{ 'compress' } );
1705 while ( $record = Maasha::Biopieces::get_record( $in ) )
1707 $record = Maasha::UCSC::psl2record( $record ) if $record->{ 'tBaseInsert' }; # Dirty addition to allow Affy data from MySQL to be dumped
1709 if ( $bed_entry = Maasha::UCSC::BED::biopiece2bed( $record, $cols ) ) {
1710 Maasha::UCSC::BED::bed_entry_put( $bed_entry, $fh, $cols, $options->{ 'check' } );
1713 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ 'no_stream' };
1720 sub script_write_psl
1722 # Martin A. Hansen, August 2007.
1724 # Write PSL output from stream.
1726 my ( $in, # handle to in stream
1727 $out, # handle to out stream
1728 $options, # options hash
1733 my ( $fh, $record, @output, $first );
1737 $fh = Maasha::Biopieces::write_stream( $options->{ "data_out" }, $options->{ "compress" } );
1739 while ( $record = Maasha::Biopieces::get_record( $in ) )
1741 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
1743 if ( $record->{ "REC_TYPE" } and $record->{ "REC_TYPE" } eq "PSL" )
1745 Maasha::UCSC::psl_put_header( $fh ) if $first;
1746 Maasha::UCSC::psl_put_entry( $record, $fh );
1755 sub script_write_fixedstep
1757 # Martin A. Hansen, Juli 2008.
1759 # Write fixedStep entries from recrods in the stream.
1761 my ( $in, # handle to in stream
1762 $out, # handle to out stream
1763 $options, # options hash
1768 my ( $fh, $record, $entry );
1770 $fh = Maasha::Biopieces::write_stream( $options->{ "data_out" }, $options->{ "compress" } );
1772 while ( $record = Maasha::Biopieces::get_record( $in ) )
1774 if ( $entry = Maasha::UCSC::Wiggle::biopiece2fixedstep( $record ) ) {
1775 Maasha::UCSC::Wiggle::fixedstep_entry_put( $entry, $fh );
1778 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
1785 sub script_write_2bit
1787 # Martin A. Hansen, March 2008.
1789 # Write sequence entries from stream in 2bit format.
1791 my ( $in, # handle to in stream
1792 $out, # handle to out stream
1793 $options, # options hash
1798 my ( $record, $mask, $tmp_file, $fh_tmp, $fh_in, $fh_out, $entry );
1800 $mask = 1 if not $options->{ "no_mask" };
1802 $tmp_file = "$BP_TMP/write_2bit.fna";
1803 $fh_tmp = Maasha::Common::write_open( $tmp_file );
1805 $fh_out = Maasha::Biopieces::write_stream( $options->{ "data_out" } );
1807 while ( $record = Maasha::Biopieces::get_record( $in ) )
1809 if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) {
1810 Maasha::Fasta::put_entry( $entry, $fh_tmp );
1813 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
1818 $fh_in = Maasha::Common::read_open( $tmp_file );
1820 Maasha::TwoBit::fasta2twobit( $fh_in, $fh_out, $mask );
1829 sub script_write_solid
1831 # Martin A. Hansen, April 2008.
1833 # Write di-base encoded Solid sequence from entries in stream.
1835 my ( $in, # handle to in stream
1836 $out, # handle to out stream
1837 $options, # options hash
1842 my ( $record, $fh, $entry );
1844 $fh = Maasha::Biopieces::write_stream( $options->{ "data_out" }, $options->{ "compress" } );
1846 while ( $record = Maasha::Biopieces::get_record( $in ) )
1848 if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) )
1850 $entry->[ SEQ ] = Maasha::Solid::seq2color_space( uc $entry->[ SEQ ] );
1852 Maasha::Fasta::put_entry( $entry, $fh, $options->{ "wrap" } );
1855 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
1862 sub script_write_ucsc_config
1864 # Martin A. Hansen, November 2008.
1866 # Write UCSC Genome Broser configuration (.ra file type) from
1867 # records in the stream.
1869 my ( $in, # handle to in stream
1870 $out, # handle to out stream
1871 $options, # options hash
1876 my ( $record, $fh );
1878 $fh = Maasha::Biopieces::write_stream( $options->{ "data_out" } );
1880 while ( $record = Maasha::Biopieces::get_record( $in ) )
1882 Maasha::UCSC::ucsc_config_entry_put( $record, $fh ) if $record->{ "REC_TYPE" } eq "UCSC Config";
1884 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
1891 sub script_plot_seqlogo
1893 # Martin A. Hansen, August 2007.
1895 # Calculates and writes a sequence logo for alignments.
1897 my ( $in, # handle to in stream
1898 $out, # handle to out stream
1899 $options, # options hash
1904 my ( $record, @entries, $logo, $fh );
1906 while ( $record = Maasha::Biopieces::get_record( $in ) )
1908 if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) {
1909 push @entries, [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ];
1912 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
1915 $logo = Maasha::Plot::seq_logo( \@entries );
1917 $fh = Maasha::Biopieces::write_stream( $options->{ "data_out" } );
1925 sub script_plot_phastcons_profiles
1927 # Martin A. Hansen, January 2008.
1929 # Plots PhastCons profiles.
1931 my ( $in, # handle to in stream
1932 $out, # handle to out stream
1933 $options, # options hash
1938 my ( $phastcons_file, $phastcons_index, $index, $fh_phastcons, $record, $scores, $AoA, $plot, $fh );
1940 $options->{ "title" } ||= "PhastCons Profiles";
1942 $phastcons_file = Maasha::Config::genome_phastcons( $options->{ "genome" } );
1943 $phastcons_index = Maasha::Config::genome_phastcons_index( $options->{ "genome" } );
1945 $index = Maasha::UCSC::fixedstep_index_retrieve( $phastcons_index );
1946 $fh_phastcons = Maasha::Common::read_open( $phastcons_file );
1948 while ( $record = Maasha::Biopieces::get_record( $in ) )
1950 if ( $record->{ "CHR" } and $record->{ "CHR_BEG" } and $record->{ "CHR_END" } )
1952 $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "CHR" },
1953 $record->{ "CHR_BEG" },
1954 $record->{ "CHR_END" },
1955 $options->{ "flank" } );
1957 push @{ $AoA }, [ @{ $scores } ];
1960 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
1963 Maasha::UCSC::phastcons_normalize( $AoA );
1965 $AoA = [ [ Maasha::UCSC::phastcons_mean( $AoA ) ] ] if $options->{ "mean" };
1966 $AoA = [ [ Maasha::UCSC::phastcons_median( $AoA ) ] ] if $options->{ "median" };
1968 $AoA = Maasha::Matrix::matrix_flip( $AoA );
1970 $plot = Maasha::Plot::lineplot_simple( $AoA, $options, $BP_TMP );
1972 $fh = Maasha::Biopieces::write_stream( $options->{ "data_out" } );
1974 print $fh "$_\n" foreach @{ $plot };
1980 sub script_remove_mysql_tables
1982 # Martin A. Hansen, November 2008.
1984 # Remove MySQL tables from values in stream.
1986 my ( $in, # handle to in stream
1987 $out, # handle to out stream
1988 $options, # options hash
1993 my ( $record, %table_hash, $dbh, $table );
1995 $options->{ "user" } ||= Maasha::UCSC::ucsc_get_user();
1996 $options->{ "password" } ||= Maasha::UCSC::ucsc_get_password();
1998 map { $table_hash{ $_ } = 1 } @{ $options->{ 'tables' } };
2000 while ( $record = Maasha::Biopieces::get_record( $in ) )
2002 map { $table_hash{ $record->{ $_ } } = 1 } @{ $options->{ 'keys' } };
2004 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ 'no_stream' };
2007 $dbh = Maasha::SQL::connect( $options->{ "database" }, $options->{ "user" }, $options->{ "password" } );
2009 foreach $table ( sort keys %table_hash )
2011 if ( Maasha::SQL::table_exists( $dbh, $table ) )
2013 print STDERR qq(Removing table "$table" from database "$options->{ 'database' }" ... ) if $options->{ 'verbose' };
2014 Maasha::SQL::delete_table( $dbh, $table );
2015 print STDERR "done.\n" if $options->{ 'verbose' };
2019 print STDERR qq(WARNING: table "$table" not found in database "$options->{ 'database' }\n");
2023 Maasha::SQL::disconnect( $dbh );
2027 sub script_remove_ucsc_tracks
2029 # Martin A. Hansen, November 2008.
2031 # Remove track from MySQL tables and config file.
2033 my ( $in, # handle to in stream
2034 $out, # handle to out stream
2035 $options, # options hash
2040 my ( $record, %track_hash, $fh_in, $fh_out, $track, @tracks, @new_tracks, $dbh );
2042 $options->{ 'user' } ||= Maasha::UCSC::ucsc_get_user();
2043 $options->{ 'password' } ||= Maasha::UCSC::ucsc_get_password();
2044 $options->{ 'config_file' } ||= "$ENV{ 'HOME' }/ucsc/my_tracks.ra";
2046 map { $track_hash{ $_ } = 1 } @{ $options->{ 'tracks' } };
2048 while ( $record = Maasha::Biopieces::get_record( $in ) )
2050 map { $track_hash{ $record->{ $_ } } = 1 } @{ $options->{ 'keys' } };
2052 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ 'no_stream' };
2055 $fh_in = Maasha::Filesys::file_read_open( $options->{ 'config_file' } );
2057 while ( $track = Maasha::UCSC::ucsc_config_entry_get( $fh_in ) ) {
2058 push @tracks, $track;
2063 foreach $track ( @tracks )
2065 if ( $track->{ 'database' } eq $options->{ 'database' } and exists $track_hash{ $track->{ 'track' } } ) {
2066 print STDERR qq(Removing track "$track->{ 'track' }" from config file.\n) if $options->{ 'verbose' };
2068 push @new_tracks, $track;
2072 rename "$options->{ 'config_file' }", "$options->{ 'config_file' }~";
2074 $fh_out = Maasha::Filesys::file_write_open( $options->{ 'config_file' } );
2076 map { Maasha::UCSC::ucsc_config_entry_put( $_, $fh_out ) } @new_tracks;
2080 # ---- locate track in database ----
2082 $dbh = Maasha::SQL::connect( $options->{ "database" }, $options->{ "user" }, $options->{ "password" } );
2084 foreach $track ( sort keys %track_hash )
2086 if ( Maasha::SQL::table_exists( $dbh, $track ) )
2088 print STDERR qq(Removing table "$track" from database "$options->{ 'database' }" ... ) if $options->{ 'verbose' };
2089 Maasha::SQL::delete_table( $dbh, $track );
2090 print STDERR "done.\n" if $options->{ 'verbose' };
2094 print STDERR qq(WARNING: table "$track" not found in database "$options->{ 'database' }\n");
2098 Maasha::SQL::disconnect( $dbh );
2100 Maasha::Common::run( "ucscMakeTracks.pl", "-b > /dev/null 2>&1" );
2104 sub script_plot_histogram
2106 # Martin A. Hansen, September 2007.
2108 # Plot a simple histogram for a given key using GNU plot.
2110 my ( $in, # handle to in stream
2111 $out, # handle to out stream
2112 $options, # options hash
2117 my ( $record, %data_hash, $max, @data_list, $i, $result, $fh );
2119 $options->{ "title" } ||= "Histogram";
2120 $options->{ "sort" } ||= "num";
2122 while ( $record = Maasha::Biopieces::get_record( $in ) )
2124 $data_hash{ $record->{ $options->{ "key" } } }++ if defined $record->{ $options->{ "key" } };
2126 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
2129 if ( $options->{ "sort" } eq "num" ) {
2130 map { push @data_list, [ $_, $data_hash{ $_ } ] } sort { $a <=> $b } keys %data_hash;
2132 map { push @data_list, [ $_, $data_hash{ $_ } ] } sort keys %data_hash;
2135 $result = Maasha::Plot::histogram_simple( \@data_list, $options );
2137 $fh = Maasha::Biopieces::write_stream( $options->{ "data_out" } );
2139 print $fh "$_\n" foreach @{ $result };
2145 sub script_plot_lendist
2147 # Martin A. Hansen, August 2007.
2149 # Plot length distribution using GNU plot.
2151 my ( $in, # handle to in stream
2152 $out, # handle to out stream
2153 $options, # options hash
2158 my ( $record, %data_hash, $max, @data_list, $i, $result, $fh );
2160 $options->{ "title" } ||= "Length Distribution";
2162 while ( $record = Maasha::Biopieces::get_record( $in ) )
2164 $data_hash{ $record->{ $options->{ "key" } } }++ if defined $record->{ $options->{ "key" } };
2166 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
2169 $max = Maasha::Calc::list_max( [ keys %data_hash ] );
2171 for ( $i = 0; $i < $max; $i++ ) {
2172 push @data_list, [ $i, $data_hash{ $i } || 0 ];
2175 $result = Maasha::Plot::histogram_lendist( \@data_list, $options );
2177 $fh = Maasha::Biopieces::write_stream( $options->{ "data_out" } );
2179 print $fh "$_\n" foreach @{ $result };
2185 sub script_plot_karyogram
2187 # Martin A. Hansen, August 2007.
2189 # Plot hits on karyogram.
2191 my ( $in, # handle to in stream
2192 $out, # handle to out stream
2193 $options, # options hash
2198 my ( %options, $record, @data, $fh, $result, %data_hash );
2200 $options->{ "genome" } ||= "human";
2201 $options->{ "feat_color" } ||= "black";
2203 while ( $record = Maasha::Biopieces::get_record( $in ) )
2205 if ( $record->{ "CHR" } and $record->{ "CHR_BEG" } and $record->{ "CHR_END" } )
2207 push @{ $data_hash{ $record->{ "CHR" } } }, [ $record->{ "CHR_BEG" }, $record->{ "CHR_END" }, $options->{ "feat_color" } ];
2210 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
2213 $result = Maasha::Plot::karyogram( \%data_hash, \%options );
2215 $fh = Maasha::Biopieces::write_stream( $options->{ "data_out" } );
2223 sub script_plot_matches
2225 # Martin A. Hansen, August 2007.
2227 # Plot matches in 2D generating a dotplot.
2229 my ( $in, # handle to in stream
2230 $out, # handle to out stream
2231 $options, # options hash
2236 my ( $record, @data, $fh, $result, %data_hash );
2238 $options->{ "direction" } ||= "both";
2240 while ( $record = Maasha::Biopieces::get_record( $in ) )
2242 if ( defined $record->{ "Q_BEG" } and defined $record->{ "S_BEG" } and $record->{ "Q_END" } and $record->{ "S_END" } ) {
2243 push @data, $record;
2246 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
2249 $options->{ "title" } ||= "plot_matches";
2250 $options->{ "xlabel" } ||= $data[ 0 ]->{ "Q_ID" };
2251 $options->{ "ylabel" } ||= $data[ 0 ]->{ "S_ID" };
2253 $result = Maasha::Plot::dotplot_matches( \@data, $options, $BP_TMP );
2255 $fh = Maasha::Biopieces::write_stream( $options->{ "data_out" } );
2257 print $fh "$_\n" foreach @{ $result };
2263 sub script_upload_to_ucsc
2265 # Martin A. Hansen, August 2007.
2267 # Calculate the mean of values of given keys.
2269 # This routine has developed into the most ugly hack. Do something!
2271 my ( $in, # handle to in stream
2272 $out, # handle to out stream
2273 $options, # options hash
2278 my ( $record, $file, $wib_file, $wig_file, $wib_dir, $fh_out, $i, $first, $format, $type, $columns, $append, $entry );
2280 $options->{ "short_label" } ||= $options->{ 'table' };
2281 $options->{ "long_label" } ||= $options->{ 'table' };
2282 $options->{ "group" } ||= $ENV{ "LOGNAME" };
2283 $options->{ "priority" } ||= 1;
2284 $options->{ "visibility" } ||= "pack";
2285 $options->{ "color" } ||= join( ",", int( rand( 255 ) ), int( rand( 255 ) ), int( rand( 255 ) ) );
2286 $options->{ "chunk_size" } ||= 10_000_000_000; # Due to 32-bit UCSC compilation really large tables cannot be loaded in one go.
2288 $file = "$BP_TMP/ucsc_upload.tmp";
2293 $fh_out = Maasha::Common::write_open( $file );
2295 while ( $record = Maasha::Biopieces::get_record( $in ) )
2297 Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
2299 if ( $record->{ "REC_TYPE" } eq "fixed_step" )
2303 if ( $entry = Maasha::UCSC::Wiggle::biopiece2fixedstep( $record ) ) {
2304 Maasha::UCSC::Wiggle::fixedstep_entry_put( $entry, $fh_out );
2307 elsif ( $record->{ "REC_TYPE" } eq "PSL" )
2311 Maasha::UCSC::psl_put_header( $fh_out ) if $first;
2312 Maasha::UCSC::psl_put_entry( $record, $fh_out );
2316 elsif ( $record->{ "REC_TYPE" } eq "BED" and $record->{ "SEC_STRUCT" } )
2318 # chrom chromStart chromEnd name score strand size secStr conf
2322 print $fh_out join ( "\t",
2324 $record->{ "CHR_BEG" },
2325 $record->{ "CHR_END" } + 1,
2326 $record->{ "Q_ID" },
2327 $record->{ "SCORE" },
2328 $record->{ "STRAND" },
2329 $record->{ "SIZE" },
2330 $record->{ "SEC_STRUCT" },
2331 $record->{ "CONF" },
2334 elsif ( $record->{ "REC_TYPE" } eq "BED" )
2337 $columns = $record->{ "BED_COLS" };
2339 if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) {
2340 Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns, $options->{ 'check' } );
2343 elsif ( $record->{ "REC_TYPE" } eq "PATSCAN" and $record->{ "CHR" } )
2348 if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) {
2349 Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns, $options->{ 'check' } );
2352 elsif ( $record->{ "REC_TYPE" } eq "BLAST" and $record->{ "S_ID" } =~ /^chr/ )
2357 $record->{ "SCORE" } = $record->{ "BIT_SCORE" } * 1000;
2359 if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) {
2360 Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns, $options->{ 'check' } );
2363 elsif ( $record->{ "REC_TYPE" } eq "VMATCH" and $record->{ "S_ID" } =~ /^chr/i )
2368 if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) {
2369 Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns, $options->{ 'check' } );
2373 if ( $i == $options->{ "chunk_size" } )
2377 if ( $format eq "BED" ) {
2378 Maasha::UCSC::bed_upload_to_ucsc( $BP_TMP, $file, $options, $append );
2379 } elsif ( $format eq "PSL" ) {
2380 Maasha::UCSC::psl_upload_to_ucsc( $file, $options, $append );
2389 $fh_out = Maasha::Common::write_open( $file );
2397 if ( exists $options->{ "database" } and $options->{ "table" } )
2399 if ( $format eq "BED" )
2401 $type = "bed $columns";
2403 Maasha::UCSC::bed_upload_to_ucsc( $BP_TMP, $file, $options, $append );
2405 elsif ( $format eq "BED_SS" )
2407 $type = "type bed 6 +";
2409 Maasha::UCSC::bed_upload_to_ucsc( $BP_TMP, $file, $options, $append );
2411 elsif ( $format eq "PSL" )
2415 Maasha::UCSC::psl_upload_to_ucsc( $file, $options, $append );
2417 elsif ( $format eq "WIGGLE" )
2419 $options->{ "visibility" } = "full";
2421 $wig_file = "$options->{ 'table' }.wig";
2422 $wib_file = "$options->{ 'table' }.wib";
2424 $wib_dir = "$ENV{ 'HOME' }/ucsc/wib";
2426 Maasha::Common::dir_create_if_not_exists( $wib_dir );
2428 if ( $options->{ 'verbose' } ) {
2429 `cd $BP_TMP && wigEncode $file $wig_file $wib_file`;
2431 `cd $BP_TMP && wigEncode $file $wig_file $wib_file > /dev/null 2>&1`;
2434 Maasha::Common::run( "mv", "$BP_TMP/$wib_file $wib_dir" );
2442 Maasha::UCSC::wiggle_upload_to_ucsc( $BP_TMP, $wib_dir, $file, $options );
2447 Maasha::UCSC::ucsc_update_config( $options, $type );
2452 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<