-#!/usr/bin/env perl
+#!/usr/bin/env perl -w
+
+# Copyright (C) 2007-2009 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# BLAST sequences in the stream against a specified database or genome.
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
-use warnings;
use strict;
+use Maasha::Biopieces;
+use Maasha::Common;
+use Maasha::Seq;
+use Maasha::Fasta;
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+my ( $run_time_beg, $run_time_end, $options, $in, $out, $tmp_dir, $tmp_in, $tmp_out, $q_type, $s_type, $record, $entry,
+ $fh_in, $fh_out, $program );
+
+$options = Maasha::Biopieces::parse_options(
+ [
+ { long => 'database', short => 'd', type => 'file', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+ { long => 'genome', short => 'g', type => 'genome', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+ { long => 'program', short => 'p', type => 'string', mandatory => 'no', default => undef, allowed => 'blastn,blastp,tblastn,blastx,tblastx', disallowed => undef },
+ { long => 'e_val', short => 'e', type => 'float', mandatory => 'no', default => 10, allowed => undef, disallowed => undef },
+ { long => 'filter', short => 'f', type => 'string', mandatory => 'no', default => 'no', allowed => 'yes,no', disallowed => undef },
+ { long => 'cpus', short => 'c', type => 'uint', mandatory => 'no', default => 1, allowed => undef, disallowed => 0 },
+ ]
+);
+
+Maasha::Common::error( qq(both --database and --genome specified) ) if $options->{ "genome" } and $options->{ "database" };
+Maasha::Common::error( qq(no --database or --genome specified) ) if not $options->{ "genome" } and not $options->{ "database" };
+
+$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } );
+$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } );
+
+$options->{ "database" } = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/blast/$options->{ 'genome' }.fna" if $options->{ 'genome' };
+
+$tmp_dir = Maasha::Biopieces::get_tmpdir();
+$tmp_in = "$tmp_dir/blast_query.seq";
+$tmp_out = "$tmp_dir/blast.result";
+
+$fh_out = Maasha::Filesys::file_write_open( $tmp_in );
+
+while ( $record = Maasha::Biopieces::get_record( $in ) )
+{
+ if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) )
+ {
+ $q_type = Maasha::Seq::seq_guess_type( $record->{ 'SEQ' } ) if not $q_type;
+
+ Maasha::Fasta::put_entry( $entry, $fh_out );
+ }
+
+ Maasha::Biopieces::put_record( $record, $out );
+}
+
+close $fh_out;
+
+$s_type = guess_database_type( $options->{ 'database' } );
+
+$program = $options->{ 'program' } || guess_program( $q_type, $s_type );
+
+if ( $options->{ 'filter' } eq 'yes' ) {
+ $options->{ 'filter' } = 'T';
+} else {
+ $options->{ 'filter' } = 'F';
+}
+
+if ( $options->{ 'verbose' } )
+{
+ Maasha::Common::run(
+ "blastall",
+ join( " ",
+ "-p $program",
+ "-e $options->{ 'e_val' }",
+ "-a $options->{ 'cpus' }",
+ "-m 8",
+ "-i $tmp_in",
+ "-d $options->{ 'database' }",
+ "-F $options->{ 'filter' }",
+ "-o $tmp_out",
+ ),
+ 1
+ );
+}
+else
+{
+ Maasha::Common::run(
+ "blastall",
+ join( " ",
+ "-p $program",
+ "-e $options->{ 'e_val' }",
+ "-a $options->{ 'cpus' }",
+ "-m 8",
+ "-i $tmp_in",
+ "-d $options->{ 'database' }",
+ "-F $options->{ 'filter' }",
+ "-o $tmp_out",
+ "> /dev/null 2>&1"
+ ),
+ 1
+ );
+}
+
+# unlink $tmp_in;
+
+$fh_in = Maasha::Filesys::file_read_open( $tmp_out );
+
+while ( $entry = get_tab_entry( $fh_in ) )
+{
+ $record = blast_tab2biopiece( $entry );
+
+ Maasha::Biopieces::put_record( $record, $out );
+}
+
+close $fh_out;
+
+# unlink $tmp_out;
+
+# Maasha::Filesys::dir_remove( $tmp_dir );
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SUBROUTINES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+sub guess_database_type
+{
+ # Martin A. Hansen, May 2009
+
+ # Guess the type of BLAST database from the database
+ # filename assuming that it is a protein database if
+ # a .phr file exists.
+
+ # Returns string;
+ if ( -f $options->{ 'database' } . ".phr" ) {
+ return "protein";
+ } else {
+ return "nucleotide";
+ }
+}
+
+
+sub guess_program
+{
+ # Martin A. Hansen, May 2009
+
+ # Guess what BLAST program to use based on the
+ # sequence type of the query and subject sequences.
+
+ my ( $q_type, # query sequence type
+ $s_type, # subject sequence type
+ ) = @_;
+
+ # Returns string.
+
+ my ( $program );
+
+ if ( $q_type ne "protein" and $s_type ne "protein" ) {
+ $program = "blastn";
+ } elsif ( $q_type eq "protein" and $s_type eq "protein" ) {
+ $program = "blastp";
+ } elsif ( $q_type ne "protein" and $s_type eq "protein" ) {
+ $program = "blastx";
+ } elsif ( $q_type eq "protein" and $s_type ne "protein" ) {
+ $program = "tblastn";
+ }
+
+ return $program;
+}
+
+
+
+sub get_tab_entry
+{
+ # Martin A. Hansen, May 2009.
+
+ # Get the next tabular entry from a filehandle to a BLAST file.
+
+ my ( $fh, # filehandle
+ ) = @_;
+
+ # Returns a list
+
+ my ( $line, @fields );
+
+ while ( $line = <$fh> )
+ {
+ next if $line =~ /^#/;
+
+ @fields = split /\s+/, $line;
+
+ use Data::Dumper;
+
+ return wantarray ? @fields : \@fields;
+ }
+
+ return undef;
+}
+
+
+sub blast_tab2biopiece
+{
+ # Martin A. Hansen, May 2009.
+
+ # Get the next BLAST tabular entry and convert it to
+ # a biopiece record that is returned.
+
+ my ( $entry, # BLAST tabular entry
+ ) = @_;
+
+ # Returns a hashref.
+
+ my ( %record );
+
+ $record{ "REC_TYPE" } = "BLAST";
+ $record{ "Q_ID" } = $entry->[ 0 ];
+ $record{ "S_ID" } = $entry->[ 1 ];
+ $record{ "IDENT" } = $entry->[ 2 ];
+ $record{ "ALIGN_LEN" } = $entry->[ 3 ];
+ $record{ "MISMATCHES" } = $entry->[ 4 ];
+ $record{ "GAPS" } = $entry->[ 5 ];
+ $record{ "Q_BEG" } = $entry->[ 6 ] - 1; # BLAST is 1-based
+ $record{ "Q_END" } = $entry->[ 7 ] - 1; # BLAST is 1-based
+ $record{ "S_BEG" } = $entry->[ 8 ] - 1; # BLAST is 1-based
+ $record{ "S_END" } = $entry->[ 9 ] - 1; # BLAST is 1-based
+ $record{ "E_VAL" } = $entry->[ 10 ];
+ $record{ "BIT_SCORE" } = $entry->[ 11 ];
+
+ if ( $record{ "S_BEG" } > $record{ "S_END" } )
+ {
+ $record{ "STRAND" } = '-';
+
+ ( $record{ "S_BEG" }, $record{ "S_END" } ) = ( $record{ "S_END" }, $record{ "S_BEG" } );
+ }
+ else
+ {
+ $record{ "STRAND" } = '+';
+ }
+
+ return wantarray ? %record : \%record;
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+BEGIN
+{
+ $run_time_beg = Maasha::Biopieces::run_time();
+
+ Maasha::Biopieces::log_biopiece();
+}
+
+END
+{
+ Maasha::Biopieces::close_stream( $in );
+ Maasha::Biopieces::close_stream( $out );
+
+ $run_time_end = Maasha::Biopieces::run_time();
+
+ Maasha::Biopieces::run_time_print( $run_time_beg, $run_time_end, $options );
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+__END__
-use Maasha::BioRun;
-#!/usr/bin/env perl
+#!/usr/bin/env perl -w
+
+# Copyright (C) 2007-2009 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# Create a BLAST database from sequences in stream for use with [blast_seq].
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
-use warnings;
use strict;
+use Maasha::Seq;
+use Maasha::Fasta;
+use Maasha::Biopieces;
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+my ( $run_time_beg, $run_time_end, $options, $in, $out, $record, $fh, $entry, $seq_type, $path );
+
+$options = Maasha::Biopieces::parse_options(
+ [
+ { long => 'no_stream', short => 'x', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+ { long => 'database', short => 'd', type => 'file', mandatory => 'yes', default => undef, allowed => undef, disallowed => undef },
+ ]
+);
+
+$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } );
+$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } );
+
+$path = $options->{ "database" };
+
+$fh = Maasha::Common::write_open( $path );
+
+while ( $record = Maasha::Biopieces::get_record( $in ) )
+{
+ Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
+
+ if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) )
+ {
+ $seq_type = Maasha::Seq::seq_guess_type( $record->{ 'SEQ' } ) if not $seq_type;
+
+ Maasha::Fasta::put_entry( $entry, $fh );
+ }
+}
+
+close $fh;
+
+if ( $seq_type eq "protein" ) {
+ Maasha::Common::run( "formatdb", "-p T -i $path -t $options->{ 'database' } -l /dev/null" );
+} else {
+ Maasha::Common::run( "formatdb", "-p F -i $path -t $options->{ 'database' } -l /dev/null" );
+}
+
+unlink $path;
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+BEGIN
+{
+ $run_time_beg = Maasha::Biopieces::run_time();
+
+ Maasha::Biopieces::log_biopiece();
+}
+
+END
+{
+ Maasha::Biopieces::close_stream( $in );
+ Maasha::Biopieces::close_stream( $out );
+
+ $run_time_end = Maasha::Biopieces::run_time();
+
+ Maasha::Biopieces::run_time_print( $run_time_beg, $run_time_end, $options );
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+__END__
-use Maasha::BioRun;
-#!/usr/bin/env perl
+#!/usr/bin/env perl -w
+
+# Copyright (C) 2007-2009 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# Split the values of a key into new key/value pairs.
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
-use warnings;
use strict;
+use Maasha::Biopieces;
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+my ( $run_time_beg, $run_time_end, $options, $in, $out, $record, @vals, $i );
+
+$options = Maasha::Biopieces::parse_options(
+ [
+ { long => 'key', short => 'k', type => 'string', mandatory => 'yes', default => undef, allowed => undef, disallowed => undef },
+ { long => 'keys', short => 'K', type => 'list', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+ { long => 'delimit', short => 'd', type => 'string', mandatory => 'no', default => '_', allowed => undef, disallowed => undef },
+ ]
+);
+
+$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } );
+$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } );
+
+while ( $record = Maasha::Biopieces::get_record( $in ) )
+{
+ if ( exists $options->{ 'key' } and exists $record->{ $options->{ 'key' } } )
+ {
+ @vals = split /$options->{ 'delimit' }/, $record->{ $options->{ 'key' } };
+
+ if ( scalar @vals > 1 )
+ {
+ for ( $i = 0; $i < @vals; $i++ )
+ {
+ if ( defined $options->{ "keys" } and defined $options->{ "keys" }->[ $i ] ) {
+ $record->{ $options->{ "keys" }->[ $i ] } = $vals[ $i ];
+ } else {
+ $record->{ $options->{ 'key' } . "_$i" } = $vals[ $i ];
+ }
+ }
+ }
+ }
+
+ Maasha::Biopieces::put_record( $record, $out );
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+BEGIN
+{
+ $run_time_beg = Maasha::Biopieces::run_time();
+
+ Maasha::Biopieces::log_biopiece();
+}
+
+
+END
+{
+ Maasha::Biopieces::close_stream( $in );
+ Maasha::Biopieces::close_stream( $out );
+
+ $run_time_end = Maasha::Biopieces::run_time();
+
+ Maasha::Biopieces::run_time_print( $run_time_beg, $run_time_end, $options );
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+__END__
-use Maasha::BioRun;
-#!/usr/bin/env perl
+#!/usr/bin/env perl -w
+
+# Copyright (C) 2007-2009 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# Format a genome creating specified indexes.
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
-use warnings;
use strict;
+use Maasha::Fasta;
+use Maasha::Biopieces;
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+my ( $run_time_beg, $run_time_end, $default, $formats, $options, $in, $out, $record, $data_out, $entry,
+ $genome, $dir, $fasta_dir, $phastcons_dir, $fh_out, $vals, $format, $tmp_dir );
+
+$tmp_dir = Maasha::Biopieces::get_tmpdir();
+$default = $ENV{ 'BP_DATA' };
+$formats = 'fasta,blast,vmatch,phastcons';
+
+$options = Maasha::Biopieces::parse_options(
+ [
+ { long => 'no_stream', short => 'x', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+ { long => 'dir', short => 'd', type => 'dir!', mandatory => 'no', default => $default, allowed => undef, disallowed => undef },
+ { long => 'genome', short => 'g', type => 'string', mandatory => 'yes', default => undef, allowed => undef, disallowed => undef },
+ { long => 'formats', short => 'f', type => 'list', mandatory => 'yes', default => undef, allowed => $formats, disallowed => '0' },
+ ]
+);
+
+$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } );
+$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } );
+
+$dir = $options->{ 'dir' };
+$genome = $options->{ 'genome' };
+
+Maasha::Filesys::dir_create_if_not_exists( "$dir/genomes" );
+Maasha::Filesys::dir_create_if_not_exists( "$dir/genomes/$genome" );
+
+if ( grep { $_ =~ /fasta|blast|vmatch/i } @{ $options->{ "formats" } } )
+{
+ if ( -f "$dir/genomes/$genome/fasta/$genome.fna" )
+ {
+ $fasta_dir = "$dir/genomes/$genome/fasta";
+ }
+ else
+ {
+ Maasha::Filesys::dir_create_if_not_exists( "$dir/genomes/$genome/fasta" );
+
+ $fasta_dir = "$dir/genomes/$genome/fasta";
+
+ $fh_out = Maasha::Filesys::file_write_open( "$fasta_dir/$genome.fna" );
+ }
+}
+elsif ( grep { $_ =~ /phastcons/i } @{ $options->{ "formats" } } )
+{
+ Maasha::Filesys::dir_create_if_not_exists( "$dir/genomes/$genome/phastcons" );
+
+ $phastcons_dir = "$dir/genomes/$genome/phastcons";
+
+ $fh_out = Maasha::Filesys::file_write_open( "$phastcons_dir/$genome.pp" );
+}
+
+while ( $record = Maasha::Biopieces::get_record( $in ) )
+{
+ if ( $fh_out and $entry = Maasha::Fasta::biopiece2fasta( $record ) )
+ {
+ Maasha::Fasta::put_entry( $entry, $fh_out );
+ }
+ elsif ( $fh_out and $record->{ "CHR" } and $record->{ "CHR_BEG" } and $record->{ "STEP" } and $record->{ "VALS" } )
+ {
+ print $fh_out "fixedStep chrom=$record->{ 'CHR' } start=$record->{ 'CHR_BEG' } step=$record->{ 'STEP' }\n";
+
+ $vals = $record->{ 'VALS' };
+
+ $vals =~ tr/,/\n/;
+
+ print $fh_out "$vals\n";
+ }
+
+ Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
+}
+
+foreach $format ( @{ $options->{ 'formats' } } )
+{
+ if ( $format =~ /^fasta$/i ) { Maasha::Fasta::fasta_index( "$fasta_dir/$genome.fna", "$dir/genomes/$genome/fasta/$genome.index" ) }
+ elsif ( $format =~ /^blast$/i ) { Maasha::NCBI::blast_index( "$genome.fna", $fasta_dir, "$dir/genomes/$genome/blast", "dna", $genome ) }
+ elsif ( $format =~ /^blat$/i ) { print STDERR "BLAT FORMAT NOT IMPLEMENTED" }
+ elsif ( $format =~ /^vmatch$/i ) { Maasha::Match::vmatch_index( "$genome.fna", $fasta_dir, "$dir/genomes/$genome/vmatch", $tmp_dir ) }
+ elsif ( $format =~ /^phastcons$/i ) { Maasha::UCSC::phastcons_index( "$genome.pp", $phastcons_dir ) }
+}
+
+close $fh_out if $fh_out;
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+BEGIN
+{
+ $run_time_beg = Maasha::Biopieces::run_time();
+
+ Maasha::Biopieces::log_biopiece();
+}
+
+END
+{
+ Maasha::Biopieces::close_stream( $in );
+ Maasha::Biopieces::close_stream( $out );
+
+ $run_time_end = Maasha::Biopieces::run_time();
+
+ Maasha::Biopieces::run_time_print( $run_time_beg, $run_time_end, $options );
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
-use Maasha::BioRun;
+__END__
-#!/usr/bin/env perl
+#!/usr/bin/env perl -w
+
+# Copyright (C) 2007-2009 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# Extract subsequences from a genome sequence.
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
-use warnings;
use strict;
+use Maasha::Biopieces;
+use Maasha::Filesys;
+use Maasha::Seq;
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+my ( $run_time_beg, $run_time_end, $options, $in, $out, $record, $genome_file, $index_file, $fh, $genome,
+ $beg, $end, $len, $index_beg, $index_len, @begs, @lens, $index, $seq, $i, %lookup_hash );
+
+$options = Maasha::Biopieces::parse_options(
+ [
+ { long => 'genome', short => 'g', type => 'genome', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+ { long => 'chr', short => 'c', type => 'string', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+ { long => 'beg', short => 'b', type => 'uint', mandatory => 'no', default => undef, allowed => undef, disallowed => 0 },
+ { long => 'end', short => 'e', type => 'uint', mandatory => 'no', default => undef, allowed => undef, disallowed => 0 },
+ { long => 'len', short => 'l', type => 'uint', mandatory => 'no', default => undef, allowed => undef, disallowed => 0 },
+ { long => 'flank', short => 'f', type => 'uint', mandatory => 'no', default => 0, allowed => undef, disallowed => undef },
+ { long => 'mask', short => 'm', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+ { long => 'splice', short => 's', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+ ]
+);
+
+$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } );
+$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } );
+
+if ( $options->{ "genome" } )
+{
+ $genome = $options->{ "genome" };
+
+ $genome_file = "$ENV{ 'BP_DATA' }/genomes/$genome/fasta/$genome.fna";
+ $index_file = "$ENV{ 'BP_DATA' }/genomes/$genome/fasta/$genome.index";
+
+ $fh = Maasha::Filesys::file_read_open( $genome_file );
+ $index = Maasha::Fasta::index_retrieve( $index_file );
+
+ shift @{ $index }; # Get rid of the file size info
+
+ map { $lookup_hash{ $_->[ 0 ] } = [ $_->[ 1 ], $_->[ 2 ] ] } @{ $index };
+
+ if ( exists $lookup_hash{ $options->{ "chr" } } and defined $options->{ "beg" } and ( defined $options->{ "end" } or defined $options->{ "len" } ) )
+ {
+ ( $index_beg, $index_len ) = @{ $lookup_hash{ $options->{ "chr" } } };
+
+ $beg = $index_beg + $options->{ "beg" } - 1;
+
+ if ( $options->{ "len" } ) {
+ $len = $options->{ "len" };
+ } elsif ( $options->{ "end" } ) {
+ $len = ( $options->{ "end" } - $options->{ "beg" } + 1 );
+ }
+
+ $beg -= $options->{ "flank" };
+ $len += 2 * $options->{ "flank" };
+
+ if ( $beg <= $index_beg )
+ {
+ $len -= $index_beg - $beg;
+ $beg = $index_beg;
+ }
+
+ $len = $index_beg + $index_len - $beg if $beg + $len > $index_beg + $index_len;
+
+ next if $beg > $index_beg + $index_len;
+
+ $record->{ "CHR" } = $options->{ "chr" };
+ $record->{ "CHR_BEG" } = $beg - $index_beg;
+ $record->{ "CHR_END" } = $record->{ "CHR_BEG" } + $len - 1;
+
+ $record->{ "SEQ" } = Maasha::Filesys::file_read( $fh, $beg, $len );
+ $record->{ "SEQ_LEN" } = $len;
+
+ Maasha::Biopieces::put_record( $record, $out );
+ }
+}
+
+
+while ( $record = Maasha::Biopieces::get_record( $in ) )
+{
+ if ( $options->{ "genome" } and not $record->{ "SEQ" } )
+ {
+ if ( $record->{ "REC_TYPE" } eq "BED" and exists $lookup_hash{ $record->{ "CHR" } } )
+ {
+ ( $index_beg, $index_len ) = @{ $lookup_hash{ $record->{ "CHR" } } };
+
+ $beg = $record->{ "CHR_BEG" } + $index_beg;
+ $len = $record->{ "CHR_END" } - $record->{ "CHR_BEG" } + 1;
+ }
+ elsif ( $record->{ "REC_TYPE" } eq "PSL" and exists $lookup_hash{ $record->{ "S_ID" } } )
+ {
+ ( $index_beg, $index_len ) = @{ $lookup_hash{ $record->{ "S_ID" } } };
+
+ $beg = $record->{ "S_BEG" } + $index_beg;
+ $len = $record->{ "S_END" } - $record->{ "S_BEG" } + 1;
+ }
+ elsif ( $record->{ "REC_TYPE" } eq "BLAST" and exists $lookup_hash{ $record->{ "S_ID" } } )
+ {
+ ( $index_beg, $index_len ) = @{ $lookup_hash{ $record->{ "S_ID" } } };
+
+ $beg = $record->{ "S_BEG" } + $index_beg;
+ $len = $record->{ "S_END" } - $record->{ "S_BEG" } + 1;
+ }
+
+ $beg -= $options->{ "flank" };
+ $len += 2 * $options->{ "flank" };
+
+ if ( $beg <= $index_beg )
+ {
+ $len -= $index_beg - $beg;
+ $beg = $index_beg;
+ }
+
+ $len = $index_beg + $index_len - $beg if $beg + $len > $index_beg + $index_len;
+
+ next if $beg > $index_beg + $index_len;
+
+ $record->{ "CHR_BEG" } = $beg - $index_beg;
+ $record->{ "CHR_END" } = $record->{ "CHR_BEG" } + $len - 1;
+
+ $record->{ "SEQ" } = Maasha::Filesys::file_read( $fh, $beg, $len );
+
+ if ( $record->{ "STRAND" } and $record->{ "STRAND" } eq "-" )
+ {
+ Maasha::Seq::dna_comp( \$record->{ "SEQ" } );
+ $record->{ "SEQ" } = reverse $record->{ "SEQ" };
+ }
+
+ if ( $options->{ "mask" } )
+ {
+ if ( $record->{ "BLOCK_COUNT" } > 1 ) # uppercase hit block segments and lowercase the rest.
+ {
+ $record->{ "SEQ" } = lc $record->{ "SEQ" };
+
+ @begs = split ",", $record->{ "Q_BEGS" };
+ @lens = split ",", $record->{ "BLOCK_LENS" };
+
+ for ( $i = 0; $i < @begs; $i++ ) {
+ substr $record->{ "SEQ" }, $begs[ $i ], $lens[ $i ], uc substr $record->{ "SEQ" }, $begs[ $i ], $lens[ $i ];
+ }
+ }
+ }
+ elsif ( $options->{ "splice" } )
+ {
+ if ( $record->{ "BLOCK_COUNT" } > 1 ) # splice block sequences
+ {
+ $seq = "";
+ @begs = split ",", $record->{ "Q_BEGS" };
+ @lens = split ",", $record->{ "BLOCK_LENS" };
+
+ for ( $i = 0; $i < @begs; $i++ ) {
+ $seq .= substr $record->{ "SEQ" }, $begs[ $i ], $lens[ $i ];
+ }
+
+ $record->{ "SEQ" } = $seq;
+ }
+ }
+
+ $record->{ "SEQ_LEN" } = length $record->{ "SEQ" };
+ }
+
+ Maasha::Biopieces::put_record( $record, $out );
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+BEGIN
+{
+ $run_time_beg = Maasha::Biopieces::run_time();
+
+ Maasha::Biopieces::log_biopiece();
+}
+
+
+END
+{
+ Maasha::Biopieces::close_stream( $in );
+ Maasha::Biopieces::close_stream( $out );
+
+ $run_time_end = Maasha::Biopieces::run_time();
+
+ Maasha::Biopieces::run_time_print( $run_time_beg, $run_time_end, $options );
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+__END__
-use Maasha::BioRun;
-#!/usr/bin/env perl
+#!/usr/bin/env perl -w
+
+# Copyright (C) 2007-2009 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# Grab records in stream.
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
-use warnings;
use strict;
+use Maasha::Biopieces;
+use Maasha::Common;
+use Maasha::Patscan;
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+my ( $run_time_beg, $run_time_end, $options, $in, $out, $record, $keys, $vals_only, $keys_only, $invert,
+ $patterns, $regex, %lookup_hash, $key, $op, $val, $found );
+
+$options = Maasha::Biopieces::parse_options(
+ [
+ { long => 'patterns', short => 'p', type => 'list', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+ { long => 'patterns_in', short => 'P', type => 'file!', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+ { long => 'regex', short => 'r', type => 'string', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+ { long => 'eval', short => 'e', type => 'string', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+ { long => 'exact_in', short => 'E', type => 'file!', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+ { long => 'invert', short => 'i', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+ { long => 'case_insensitive', short => 'c', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+ { long => 'keys', short => 'k', type => 'list', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+ { long => 'keys_only', short => 'K', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+ { long => 'vals_only', short => 'V', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+ ]
+);
+
+$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } );
+$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } );
+
+$keys = $options->{ 'keys' };
+$vals_only = $options->{ 'vals_only' };
+$keys_only = $options->{ 'keys_only' };
+$invert = $options->{ 'invert' };
+
+if ( $options->{ 'patterns' } )
+{
+ $patterns = $options->{ 'patterns' };
+}
+elsif ( defined $options->{ 'patterns_in' } and -f $options->{ 'patterns_in' } )
+{
+ $patterns = Maasha::Patscan::read_patterns( $options->{ 'patterns_in' } );
+}
+elsif ( $options->{ 'regex' } )
+{
+ if ( $options->{ 'case_insensitive' } ) {
+ $regex = qr/$options->{ 'regex' }/i;
+ } else {
+ $regex = qr/$options->{ 'regex' }/;
+ }
+}
+elsif ( defined $options->{ 'exact_in' } and -f $options->{ 'exact_in' } )
+{
+ $patterns = Maasha::Patscan::read_patterns( $options->{ 'exact_in' } );
+
+ map { $lookup_hash{ $_ } = 1 } @{ $patterns };
+
+ undef $patterns;
+}
+elsif ( $options->{ 'eval' } )
+{
+ if ( $options->{ 'eval' } =~ /^([^><=! ]+)\s*(>=|<=|>|<|==|=|!=|eq|ne)\s*(.+)$/ )
+ {
+ $key = $1;
+ $op = $2;
+ $val = $3;
+ }
+ else
+ {
+ Maasha::Common::error( qq(Bad eval string: $options->{ 'eval' }) );
+ }
+}
+
+while ( $record = Maasha::Biopieces::get_record( $in ) )
+{
+ $found = 0;
+
+ if ( %lookup_hash ) {
+ $found = grab_lookup( \%lookup_hash, $record, $keys, $vals_only, $keys_only );
+ } elsif ( $patterns ) {
+ $found = grab_patterns( $patterns, $record, $keys, $vals_only, $keys_only );
+ } elsif ( $regex ) {
+ $found = grab_regex( $regex, $record, $keys, $vals_only, $keys_only );
+ } elsif ( $op ) {
+ $found = grab_eval( $key, $op, $val, $record );
+ }
+
+ if ( $found and not $invert ) {
+ Maasha::Biopieces::put_record( $record, $out );
+ } elsif ( not $found and $invert ) {
+ Maasha::Biopieces::put_record( $record, $out );
+ }
+}
+
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SUBROUTINES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+sub grab_lookup
+{
+ # Martin A. Hansen, November 2009.
+
+ # Uses keys from a lookup hash to search records. Optionally, a list of
+ # keys can be given so the lookup is limited to these, also, flags
+ # can be given to limit lookup to keys or vals only. Returns 1 if lookup
+ # succeeded, else 0.
+
+ my ( $lookup_hash, # hashref with patterns
+ $record, # hashref
+ $keys, # list of keys - OPTIONAL
+ $vals_only, # only vals flag - OPTIONAL
+ $keys_only, # only keys flag - OPTIONAL
+ ) = @_;
+
+ # Returns boolean.
+
+ if ( $keys )
+ {
+ map { return 1 if exists $lookup_hash->{ $record->{ $_ } } } @{ $keys };
+ }
+ else
+ {
+ if ( not $vals_only ) {
+ map { return 1 if exists $lookup_hash->{ $_ } } keys %{ $record };
+ }
+
+ if ( not $keys_only ) {
+ map { return 1 if exists $lookup_hash->{ $record->{ $_ } } } keys %{ $record };
+ }
+ }
+
+ return 0;
+}
+
+
+sub grab_patterns
+{
+ # Martin A. Hansen, November 2009.
+
+ # Uses patterns to match records containing the pattern as a substring.
+ # Returns 1 if the record is matched, else 0.
+
+ my ( $patterns, # list of patterns
+ $record, # hashref
+ $keys, # list of keys - OPTIONAL
+ $vals_only, # only vals flag - OPTIONAL
+ $keys_only, # only keys flag - OPTIONAL
+ ) = @_;
+
+ # Returns boolean.
+
+ my ( $pattern );
+
+ foreach $pattern ( @{ $patterns } )
+ {
+ if ( $keys )
+ {
+ map { return 1 if index( $record->{ $_ }, $pattern ) >= 0 } @{ $keys };
+ }
+ else
+ {
+ if ( not $vals_only ) {
+ map { return 1 if index( $_, $pattern ) >= 0 } keys %{ $record };
+ }
+
+ if ( not $keys_only ) {
+ map { return 1 if index( $record->{ $_ }, $pattern ) >= 0 } keys %{ $record };
+ }
+ }
+ }
+
+ return 0;
+}
+
+
+sub grab_regex
+{
+ # Martin A. Hansen, November 2009.
+
+ # Uses regex to match records.
+ # Returns 1 if the record is matched, else 0.
+
+ my ( $regex, # regex to match
+ $record, # hashref
+ $keys, # list of keys - OPTIONAL
+ $vals_only, # only vals flag - OPTIONAL
+ $keys_only, # only keys flag - OPTIONAL
+ ) = @_;
+
+ # Returns boolean.
+
+ if ( $keys )
+ {
+ map { return 1 if $record->{ $_ } =~ /$regex/ } @{ $keys };
+ }
+ else
+ {
+ if ( not $vals_only ) {
+ map { return 1 if $_ =~ /$regex/ } keys %{ $record };
+ }
+
+ if ( not $keys_only ) {
+ map { return 1 if $record->{ $_ } =~ /$regex/ } keys %{ $record };
+ }
+ }
+
+ return 0;
+}
+
+
+sub grab_eval
+{
+ # Martin A. Hansen, November 2009.
+
+ # Test if the value of a given record key evaluates according
+ # to a given operator. Returns 1 if eval is OK, else 0.
+
+ my ( $key, # record key
+ $op, # operator
+ $val, # value
+ $record, # hashref
+ ) = @_;
+
+ # Returns boolean.
+
+ if ( defined $record->{ $key } )
+ {
+ return 1 if ( $op eq "<" and $record->{ $key } < $val );
+ return 1 if ( $op eq ">" and $record->{ $key } > $val );
+ return 1 if ( $op eq ">=" and $record->{ $key } >= $val );
+ return 1 if ( $op eq "<=" and $record->{ $key } <= $val );
+ return 1 if ( $op eq "=" and $record->{ $key } == $val );
+ return 1 if ( $op eq "==" and $record->{ $key } == $val );
+ return 1 if ( $op eq "!=" and $record->{ $key } != $val );
+ return 1 if ( $op eq "eq" and $record->{ $key } eq $val );
+ return 1 if ( $op eq "ne" and $record->{ $key } ne $val );
+ }
+
+ return 0;
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+BEGIN
+{
+ $run_time_beg = Maasha::Biopieces::run_time();
+
+ Maasha::Biopieces::log_biopiece();
+}
+
+
+END
+{
+ Maasha::Biopieces::close_stream( $in );
+ Maasha::Biopieces::close_stream( $out );
+
+ $run_time_end = Maasha::Biopieces::run_time();
+
+ Maasha::Biopieces::run_time_print( $run_time_beg, $run_time_end, $options );
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+__END__
-use Maasha::BioRun;
-#!/usr/bin/env perl
+#!/usr/bin/env perl -w
+
+# Copyright (C) 2007-2009 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# Print summary for all Biopices.
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
-use warnings;
use strict;
+use Maasha::Common;
+use Maasha::Biopieces;
+use Maasha::Gwiki;
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+my ( $run_time_beg, $run_time_end, $options, @files, $file, $wiki, $program, $summary );
+
+$options = Maasha::Biopieces::parse_options();
+
+@files = Maasha::Filesys::ls_files( "$ENV{ 'BP_DIR' }/bp_usage" );
+
+foreach $file ( sort @files )
+{
+ if ( $file =~ /\/([a-z0-9_]+)\.wiki$/ )
+ {
+ $program = $1;
+
+ $wiki = Maasha::Gwiki::gwiki_read( $file );
+
+ $summary = $wiki->[ 0 ]->[ 0 ]->{ 'TEXT' };
+ $summary =~ s/^#summary\s+//;
+
+ printf( "%-30s%s\n", $program, $summary );
+ }
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+BEGIN
+{
+ $run_time_beg = Maasha::Biopieces::run_time();
+
+ Maasha::Biopieces::log_biopiece();
+}
+
+END
+{
+ $run_time_end = Maasha::Biopieces::run_time();
+
+ Maasha::Biopieces::run_time_print( $run_time_beg, $run_time_end, $options );
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+__END__
+
-use Maasha::BioRun;
-#!/usr/bin/env perl
+#!/usr/bin/env perl -w
+
+# Copyright (C) 2007-2009 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# Display available genomes.
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
-use warnings;
use strict;
+use Maasha::Common;
+use Maasha::Biopieces;
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+my ( $run_time_beg, $run_time_end, $options, @genomes, $genome, @formats, $format, %hash, %found, @row );
+
+$options = Maasha::Biopieces::parse_options();
+
+@genomes = Maasha::Filesys::ls_dirs( "$ENV{ 'BP_DATA' }/genomes" );
+
+foreach $genome ( @genomes )
+{
+ next if $genome =~ /\.$/;
+
+ @formats = Maasha::Filesys::ls_dirs( $genome );
+
+ foreach $format ( @formats )
+ {
+ if ( $format =~ /\/([^\/]+)\/(\w+)$/ )
+ {
+ $hash{ $1 }{ $2 } = 1;
+
+ $found{ $2 } = 1;
+ }
+ }
+}
+
+@row = "Genome";
+
+map { push @row, $_ } sort keys %found;
+
+print join( "\t", @row ), "\n";
+
+foreach $genome ( sort keys %hash )
+{
+ @row = $genome;
+
+ foreach $format ( sort keys %found )
+ {
+ if ( exists $hash{ $genome }{ $format } ) {
+ push @row, "yes";
+ } else {
+ push @row, "no";
+ }
+ }
+
+ print join( "\t", @row ), "\n";
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+BEGIN
+{
+ $run_time_beg = Maasha::Biopieces::run_time();
+
+ Maasha::Biopieces::log_biopiece();
+}
+
+END
+{
+ $run_time_end = Maasha::Biopieces::run_time();
+
+ Maasha::Biopieces::run_time_print( $run_time_beg, $run_time_end, $options );
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+__END__
+
-use Maasha::BioRun;
-#!/usr/bin/env perl
+#!/usr/bin/env perl -w
+
+# Copyright (C) 2007-2009 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# Read FASTA entries from one or more files.
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
-use warnings;
use strict;
+use Maasha::Common;
+use Maasha::Fasta;
+use Maasha::Biopieces;
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+my ( $run_time_beg, $run_time_end, $options, $in, $out, $record, $data_in, $num, $entry );
+
+$options = Maasha::Biopieces::parse_options(
+ [
+ { long => 'data_in', short => 'i', type => 'files!', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+ { long => 'num', short => 'n', type => 'uint', mandatory => 'no', default => undef, allowed => undef, disallowed => '0' },
+ ]
+);
+
+$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } );
+$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } );
+
+while ( $record = Maasha::Biopieces::get_record( $in ) ) {
+ Maasha::Biopieces::put_record( $record, $out );
+}
+
+if ( $options->{ 'data_in' } )
+{
+ $data_in = Maasha::Common::read_open_multi( $options->{ 'data_in' } );
+
+ $num = 1;
+
+ while ( $entry = Maasha::Fasta::get_entry( $data_in ) )
+ {
+ if ( $record = Maasha::Fasta::fasta2biopiece( $entry ) ) {
+ Maasha::Biopieces::put_record( $record, $out );
+ }
+
+ last if $options->{ "num" } and $num == $options->{ "num" };
+
+ $num++;
+ }
+
+ close $data_in;
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+BEGIN
+{
+ $run_time_beg = Maasha::Biopieces::run_time();
+
+ Maasha::Biopieces::log_biopiece();
+}
+
+END
+{
+ Maasha::Biopieces::close_stream( $in );
+ Maasha::Biopieces::close_stream( $out );
+
+ $run_time_end = Maasha::Biopieces::run_time();
+
+ Maasha::Biopieces::run_time_print( $run_time_beg, $run_time_end, $options );
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
-use Maasha::BioRun;
+__END__
-#!/usr/bin/env perl
+#!/usr/bin/env perl -w
+
+# Copyright (C) 2007-2009 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# Read tabular data.
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
-use warnings;
use strict;
+use Maasha::Common;
+use Maasha::Fasta;
+use Maasha::Biopieces;
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+my ( $run_time_beg, $run_time_end, $options, $in, $out, $record, $data_in, $num, $skip, $line, @fields, @fields2, $i );
+
+$options = Maasha::Biopieces::parse_options(
+ [
+ { long => 'data_in', short => 'i', type => 'files!', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+ { long => 'delimit', short => 'd', type => 'string', mandatory => 'no', default => '\s+', allowed => undef, disallowed => undef },
+ { long => 'cols', short => 'c', type => 'list', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+ { long => 'keys', short => 'k', type => 'list', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+ { long => 'skip', short => 's', type => 'uint', mandatory => 'no', default => '0', allowed => undef, disallowed => undef },
+ { long => 'num', short => 'n', type => 'uint', mandatory => 'no', default => undef, allowed => undef, disallowed => '0' },
+ ]
+);
+
+$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } );
+$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } );
+
+while ( $record = Maasha::Biopieces::get_record( $in ) ) {
+ Maasha::Biopieces::put_record( $record, $out );
+}
+
+if ( $options->{ 'data_in' } )
+{
+ $data_in = Maasha::Common::read_open_multi( $options->{ 'data_in' } );
+
+ $num = 1;
+ $skip = $options->{ 'skip' };
+
+ while ( $line = <$data_in> )
+ {
+ if ( $skip )
+ {
+ $skip--;
+ next;
+ }
+
+ next if $line =~ /^#|^$/;
+
+ chomp $line;
+
+ undef $record;
+ undef @fields2;
+
+ @fields = split /$options->{'delimit'}/, $line;
+
+ if ( $options->{ "cols" } ) {
+ map { push @fields2, $fields[ $_ ] } @{ $options->{ "cols" } };
+ } else {
+ @fields2 = @fields;
+ }
+
+ for ( $i = 0; $i < @fields2; $i++ )
+ {
+ if ( $options->{ "keys" }->[ $i ] ) {
+ $record->{ $options->{ "keys" }->[ $i ] } = $fields2[ $i ];
+ } else {
+ $record->{ "V" . $i } = $fields2[ $i ];
+ }
+ }
+
+ Maasha::Biopieces::put_record( $record, $out );
+
+ last if $options->{ "num" } and $num == $options->{ "num" };
+
+ $num++;
+ }
+
+ close $data_in;
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+BEGIN
+{
+ $run_time_beg = Maasha::Biopieces::run_time();
+
+ Maasha::Biopieces::log_biopiece();
+}
+
+END
+{
+ Maasha::Biopieces::close_stream( $in );
+ Maasha::Biopieces::close_stream( $out );
+
+ $run_time_end = Maasha::Biopieces::run_time();
+
+ Maasha::Biopieces::run_time_print( $run_time_beg, $run_time_end, $options );
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+__END__
+
+
+
-use Maasha::BioRun;
#!/usr/bin/env perl -w
+# Copyright (C) 2007-2009 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# Split the values of a key into new key/value pairs.
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
use strict;
use Maasha::Biopieces;
my ( $run_time_beg, $run_time_end, $options, $in, $out, $record, @vals, $i );
-$run_time_beg = Maasha::Biopieces::run_time();
-
-Maasha::Biopieces::log_biopiece();
-
$options = Maasha::Biopieces::parse_options(
[
{ long => 'key', short => 'k', type => 'string', mandatory => 'yes', default => undef, allowed => undef, disallowed => undef },
if ( exists $options->{ 'key' } and exists $record->{ $options->{ 'key' } } )
{
@vals = split /$options->{ 'delimit' }/, $record->{ $options->{ 'key' } };
-
- for ( $i = 0; $i < @vals; $i++ )
+
+ if ( scalar @vals > 1 )
{
- if ( defined $options->{ "keys" } and defined $options->{ "keys" }->[ $i ] ) {
- $record->{ $options->{ "keys" }->[ $i ] } = $vals[ $i ];
- } else {
- $record->{ $options->{ 'key' } . "_$i" } = $vals[ $i ];
+ for ( $i = 0; $i < @vals; $i++ )
+ {
+ if ( defined $options->{ "keys" } and defined $options->{ "keys" }->[ $i ] ) {
+ $record->{ $options->{ "keys" }->[ $i ] } = $vals[ $i ];
+ } else {
+ $record->{ $options->{ 'key' } . "_$i" } = $vals[ $i ];
+ }
}
}
}
Maasha::Biopieces::put_record( $record, $out );
}
-close $in;
-close $out;
-$run_time_end = Maasha::Biopieces::run_time();
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+BEGIN
+{
+ $run_time_beg = Maasha::Biopieces::run_time();
+
+ Maasha::Biopieces::log_biopiece();
+}
+
+
+END
+{
+ Maasha::Biopieces::close_stream( $in );
+ Maasha::Biopieces::close_stream( $out );
+
+ $run_time_end = Maasha::Biopieces::run_time();
-Maasha::Biopieces::run_time_print( $run_time_beg, $run_time_end, $options );
+ Maasha::Biopieces::run_time_print( $run_time_beg, $run_time_end, $options );
+}
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-#!/usr/bin/env perl
+#!/usr/bin/env perl -w
+
+# Copyright (C) 2007-2009 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# Uppercases sequences in stream.
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
-use warnings;
use strict;
+use Maasha::Common;
+use Maasha::Fasta;
+use Maasha::Biopieces;
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+my ( $run_time_beg, $run_time_end, $options, $in, $out, $record );
+
+$options = Maasha::Biopieces::parse_options();
+
+$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } );
+$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } );
+
+while ( $record = Maasha::Biopieces::get_record( $in ) )
+{
+ $record->{ "SEQ" } = uc $record->{ "SEQ" } if $record->{ "SEQ" };
+
+ Maasha::Biopieces::put_record( $record, $out );
+}
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+BEGIN
+{
+ $run_time_beg = Maasha::Biopieces::run_time();
+
+ Maasha::Biopieces::log_biopiece();
+}
+
+END
+{
+ Maasha::Biopieces::close_stream( $in );
+ Maasha::Biopieces::close_stream( $out );
+
+ $run_time_end = Maasha::Biopieces::run_time();
+
+ Maasha::Biopieces::run_time_print( $run_time_beg, $run_time_end, $options );
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
-use Maasha::BioRun;
+__END__
-#!/usr/bin/env perl
+#!/usr/bin/env perl -w
+
+# Copyright (C) 2007-2009 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# Vmatch sequences in the stream against a specified database.
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
-use warnings;
use strict;
+use Data::Dumper;
+use Maasha::Biopieces;
+use Maasha::Common;
+use Maasha::Filesys;
+use Maasha::Seq;
+use Maasha::Fasta;
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+my ( $run_time_beg, $run_time_end, $options, $in, $out, @index_files, $tmp_dir, $query_file, $fh_out, $fh_in, $record, $entry,
+ $vmatch_args, $result_file, $count_hash );
+
+$options = Maasha::Biopieces::parse_options(
+ [
+ { long => 'genome', short => 'g', type => 'genome', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+ { long => 'index_name', short => 'i', type => 'file', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+ { long => 'count', short => 'c', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+ { long => 'max_hits', short => 'm', type => 'uint', mandatory => 'no', default => undef, allowed => undef, disallowed => 0 },
+ { long => 'hamming_dist', short => 'h', type => 'uint', mandatory => 'no', default => undef, allowed => undef, disallowed => 0 },
+ { long => 'edit_dist', short => 'e', type => 'uint', mandatory => 'no', default => undef, allowed => undef, disallowed => 0 },
+ ]
+);
+
+$options->{ 'count' } = 1 if $options->{ 'max_hits' };
+
+Maasha::Common::error( qq(both --index_hame and --genome specified) ) if $options->{ "genome" } and $options->{ "index_name" };
+Maasha::Common::error( qq(no --index_name or --genome specified) ) if not $options->{ "genome" } and not $options->{ "index_name" };
+
+$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } );
+$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } );
+
+@index_files = get_index_files( $options );
+
+$tmp_dir = Maasha::Biopieces::get_tmpdir();
+
+$query_file = "$tmp_dir/query.seq";
+$result_file = "$tmp_dir/vmatch.out";
+
+$fh_out = Maasha::Filesys::file_write_open( $query_file );
+
+while ( $record = Maasha::Biopieces::get_record( $in ) )
+{
+ if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) )
+ {
+ next if length $record->{ 'SEQ' } < 12; # assuming that the index is created for 12 as minimum length
+
+ Maasha::Fasta::put_entry( $entry, $fh_out, 100 );
+ }
+
+ Maasha::Biopieces::put_record( $record, $out ); # must this be here?
+}
+
+close $fh_out;
+
+$vmatch_args = vmatch_args( $options, $query_file );
+
+map { Maasha::Common::run( "vmatch", "$vmatch_args $_ >> $result_file" ) } @index_files;
+
+unlink $query_file;
+
+$fh_in = Maasha::Filesys::file_read_open( $result_file );
+
+if ( $options->{ "count" } )
+{
+ $count_hash = vmatch_count_hits( $result_file ) if ( $options->{ "count" } );
+
+ if ( $options->{ "max_hits" } )
+ {
+ while ( $record = vmatch_get_entry( $fh_in ) )
+ {
+ $record->{ 'SCORE' } = $count_hash->{ $record->{ 'Q_ID' } };
+
+ next if $record->{ 'SCORE' } > $options->{ 'max_hits' };
+
+ Maasha::Biopieces::put_record( $record, $out );
+ }
+
+ }
+ else
+ {
+ while ( $record = vmatch_get_entry( $fh_in ) )
+ {
+ $record->{ 'SCORE' } = $count_hash->{ $record->{ 'Q_ID' } };
+
+ Maasha::Biopieces::put_record( $record, $out );
+ }
+ }
+}
+else
+{
+ while ( $record = vmatch_get_entry( $fh_in ) ) {
+ Maasha::Biopieces::put_record( $record, $out );
+ }
+}
+
+close $fh_in;
+
+unlink $result_file;
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SUBROUTINES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+sub get_index_files
+{
+ # Martin A. Hansen, May 2009.
+
+ # Get the index_files from the index_name or genome options.
+
+ my ( $options, # hashref
+ ) = @_;
+
+ # Returns a list.
+
+ my ( @index_files, %hash );
+
+ if ( $options->{ "index_name" } )
+ {
+ @index_files = $options->{ "index_name" };
+ }
+ else
+ {
+ @index_files = Maasha::Filesys::ls_files( "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/vmatch" );
+
+ map { $_ =~ /^(.+)\.[a-z1]{3,4}$/; $hash{ $1 } = 1 } @index_files;
+
+ @index_files = sort keys %hash;
+ }
+
+ return wantarray ? @index_files : \@index_files;
+}
+
+
+sub vmatch_args
+{
+ # Martin A. Hansen, May 2009.
+
+ # Given an option hashref and query file, compiles the arguments for running
+ # Vmatch on the commandline.
+
+ my ( $options, # hashref
+ $query_file, # path to query file
+ ) = @_;
+
+ # Returns a string.
+
+ my ( $vmatch_args );
+
+ $vmatch_args = "-complete -d -p -showdesc 100 -q $query_file";
+
+ $vmatch_args .= " -h " . $options->{ "hamming_dist" } if $options->{ "hamming_dist" };
+ $vmatch_args .= " -e " . $options->{ "edit_dist" } if $options->{ "edit_dist" };
+
+ return $vmatch_args;
+}
+
+
+sub vmatch_count_hits
+{
+ # Martin A. Hansen, April 2008.
+
+ # Given a Vmatch result files, count duplications based
+ # on q_id. The counts are returned in a hash.
+
+ my ( $file, # vmatch result file
+ ) = @_;
+
+ # Returns a list.
+
+ my ( $fh_in, $line, @fields, %count_hash );
+
+ $fh_in = Maasha::Filesys::file_read_open( $file );
+
+ while ( $line = <$fh_in> )
+ {
+ chomp $line;
+
+ next if $line =~ /^#/;
+
+ @fields = split " ", $line;
+
+ $count_hash{ $fields[ 5 ] }++;
+ }
+
+ close $fh_in;
+
+ return wantarray ? %count_hash : \%count_hash;
+}
+
+
+sub vmatch_get_entry
+{
+ # Martin A. Hansen, January 2008.
+
+ # Parses vmatch output records.
+
+ my ( $fh, # file handle to vmatch result file.
+ ) = @_;
+
+ # Returns a hash.
+
+ my ( $line, @fields, %record );
+
+ while ( $line = <$fh> )
+ {
+ chomp $line;
+
+ next if $line =~ /^#/;
+ $line =~ s/^\s+//;
+
+ @fields = split " ", $line;
+
+ $record{ "REC_TYPE" } = "VMATCH";
+
+ $record{ "S_LEN" } = $fields[ 0 ];
+ $record{ "S_ID" } = $fields[ 1 ];
+ $record{ "S_BEG" } = $fields[ 2 ];
+
+ if ( $fields[ 3 ] eq "D" ) {
+ $record{ "STRAND" } = "+";
+ } else {
+ $record{ "STRAND" } = "-";
+ }
+
+ $record{ "Q_LEN" } = $fields[ 4 ];
+ $record{ "Q_ID" } = $fields[ 5 ];
+ $record{ "Q_BEG" } = $fields[ 6 ];
+ $record{ "MATCH_DIST" } = $fields[ 7 ];
+ $record{ "E_VAL" } = $fields[ 8 ];
+ $record{ "SCORE" } = $fields[ 9 ];
+ $record{ "IDENT" } = $fields[ 10 ];
+
+ $record{ "Q_END" } = $record{ "Q_BEG" } + $record{ "Q_LEN" } - 1;
+ $record{ "S_END" } = $record{ "S_BEG" } + $record{ "S_LEN" } - 1;
+
+ return wantarray ? %record : \%record;
+ }
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+BEGIN
+{
+ $run_time_beg = Maasha::Biopieces::run_time();
+
+ Maasha::Biopieces::log_biopiece();
+}
+
+END
+{
+ Maasha::Biopieces::close_stream( $in );
+ Maasha::Biopieces::close_stream( $out );
+
+ $run_time_end = Maasha::Biopieces::run_time();
+
+ Maasha::Biopieces::run_time_print( $run_time_beg, $run_time_end, $options );
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+__END__
-use Maasha::BioRun;
-#!/usr/bin/env perl
+#!/usr/bin/env perl -w
+
+# Copyright (C) 2007-2009 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# Write tabular BLAST output from the stream.
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
-use warnings;
use strict;
+use Maasha::Fasta;
+use Maasha::Biopieces;
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+my ( $run_time_beg, $run_time_end, $options, $in, $out, $record, $first, $entry, $data_out );
+
+$options = Maasha::Biopieces::parse_options(
+ [
+ { long => 'no_stream', short => 'x', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+ { long => 'data_out', short => 'o', type => 'file', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+ { long => 'comment', short => 'c', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+ { long => 'compress', short => 'Z', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+ ]
+);
+
+$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } );
+$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } );
+$data_out = Maasha::Biopieces::write_stream( $options->{ "data_out" }, $options->{ "compress" } ) ;
+
+$first = 1;
+
+while ( $record = Maasha::Biopieces::get_record( $in ) )
+{
+ if ( $entry = biopiece2blast_tab( $record ) )
+ {
+ if ( $options->{ "comment" } and $first )
+ {
+ print $data_out "# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score\n";
+
+ $first = 0;
+ }
+
+ print $data_out join( "\t", @{ $entry } ), "\n";
+ }
+
+ Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
+}
+
+close $data_out;
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+sub biopiece2blast_tab
+{
+ # Martin A. Hansen, May 2009.
+
+ # Convert a Biopiece record to a BLAST tabular entry.
+ # Note that we correct positions because BLAST is 1-based.
+
+ my ( $record, # hashref
+ ) = @_;
+
+ # Returns a list.
+
+ my ( @fields, $s_beg, $s_end );
+
+ return undef if not defined $record->{ 'REC_TYPE' } or not $record->{ 'REC_TYPE' } eq "BLAST";
+
+ if ( $record->{ "STRAND" } eq '+' )
+ {
+ $s_beg = $record->{ "S_BEG" };
+ $s_end = $record->{ "S_END" };
+ }
+ else
+ {
+ $s_beg = $record->{ "S_END" };
+ $s_end = $record->{ "S_BEG" };
+ }
+
+ $fields[ 0 ] = $record->{ "Q_ID" };
+ $fields[ 1 ] = $record->{ "S_ID" };
+ $fields[ 2 ] = $record->{ "IDENT" };
+ $fields[ 3 ] = $record->{ "ALIGN_LEN" };
+ $fields[ 4 ] = $record->{ "MISMATCHES" };
+ $fields[ 5 ] = $record->{ "GAPS" };
+ $fields[ 6 ] = $record->{ "Q_BEG" } + 1;
+ $fields[ 7 ] = $record->{ "Q_END" } + 1;
+ $fields[ 8 ] = $s_beg + 1;
+ $fields[ 9 ] = $s_end + 1;
+ $fields[ 10 ] = $record->{ "E_VAL" };
+ $fields[ 11 ] = $record->{ "BIT_SCORE" };
+
+ return wantarray ? @fields : \@fields;
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+BEGIN
+{
+ $run_time_beg = Maasha::Biopieces::run_time();
+
+ Maasha::Biopieces::log_biopiece();
+}
+
+END
+{
+ Maasha::Biopieces::close_stream( $in );
+ Maasha::Biopieces::close_stream( $out );
+
+ $run_time_end = Maasha::Biopieces::run_time();
+
+ Maasha::Biopieces::run_time_print( $run_time_beg, $run_time_end, $options );
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
-use Maasha::BioRun;
+__END__
-#!/usr/bin/env perl
+#!/usr/bin/env perl -w
+
+# Copyright (C) 2007-2009 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# Write sequences from stream in FASTA format.
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
-use warnings;
use strict;
+use Maasha::Fasta;
+use Maasha::Biopieces;
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+my ( $run_time_beg, $run_time_end, $options, $in, $out, $record, $data_out, $entry );
+
+$options = Maasha::Biopieces::parse_options(
+ [
+ { long => 'no_stream', short => 'x', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+ { long => 'data_out', short => 'o', type => 'file', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+ { long => 'wrap', short => 'w', type => 'uint', mandatory => 'no', default => undef, allowed => undef, disallowed => '0' },
+ { long => 'compress', short => 'Z', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+ ]
+);
+
+$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } );
+$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } );
+
+$data_out = Maasha::Biopieces::write_stream( $options->{ "data_out" }, $options->{ "compress" } );
+
+while ( $record = Maasha::Biopieces::get_record( $in ) )
+{
+ if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) {
+ Maasha::Fasta::put_entry( $entry, $data_out, $options->{ "wrap" } );
+ }
+
+ Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
+}
+
+close $data_out;
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+BEGIN
+{
+ $run_time_beg = Maasha::Biopieces::run_time();
+
+ Maasha::Biopieces::log_biopiece();
+}
+
+END
+{
+ Maasha::Biopieces::close_stream( $in );
+ Maasha::Biopieces::close_stream( $out );
+
+ $run_time_end = Maasha::Biopieces::run_time();
+
+ Maasha::Biopieces::run_time_print( $run_time_beg, $run_time_end, $options );
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
-use Maasha::BioRun;
+__END__
-#!/usr/bin/env perl
+#!/usr/bin/env perl -w
+
+# Copyright (C) 2007-2009 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# Write tabular output from the stream.
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
-use warnings;
use strict;
+use Maasha::Fasta;
+use Maasha::Biopieces;
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+my ( $run_time_beg, $run_time_end, $options, $in, $out, $record, $data_out, @vals, $ok, $key, @keys, $A, $B, %no_keys, $sort_keys, $check_sort );
+
+$options = Maasha::Biopieces::parse_options(
+ [
+ { long => 'no_stream', short => 'x', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+ { long => 'data_out', short => 'o', type => 'file', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+ { long => 'comment', short => 'c', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+ { long => 'delimit', short => 'd', type => 'string', mandatory => 'no', default => "\t", allowed => undef, disallowed => undef },
+ { long => 'keys', short => 'k', type => 'list', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+ { long => 'no_keys', short => 'K', type => 'list', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+ { long => 'compress', short => 'Z', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+ ]
+);
+
+$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } );
+$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } );
+$data_out = Maasha::Biopieces::write_stream( $options->{ "data_out" }, $options->{ "compress" } );
+
+map { $no_keys{ $_ } = 1 } @{ $options->{ "no_keys" } };
+
+while ( $record = Maasha::Biopieces::get_record( $in ) )
+{
+ undef @vals;
+ $ok = 1;
+
+ if ( $options->{ "keys" } )
+ {
+ map { $ok = 0 if not exists $record->{ $_ } } @{ $options->{ "keys" } };
+
+ if ( $ok )
+ {
+ foreach $key ( @{ $options->{ "keys" } } )
+ {
+ if ( exists $record->{ $key } )
+ {
+ push @keys, $key if $options->{ "comment" };
+ push @vals, $record->{ $key };
+ }
+ }
+ }
+ }
+ else
+ {
+ if ( not $check_sort )
+ {
+ $sort_keys = 1;
+
+ map { $sort_keys = 0 if $_ !~ /^V(\d+)$/ } keys %{ $record };
+
+ $check_sort = 1;
+ }
+
+ if ( $sort_keys )
+ {
+ foreach $key ( sort { $A = $a; $B = $b; $A =~ s/^V(\d+)$/$1/; $B =~ s/^V(\d+)$/$1/; $A <=> $B } keys %{ $record } )
+ {
+ next if exists $no_keys{ $key };
+
+ push @keys, $key if $options->{ "comment" };
+ push @vals, $record->{ $key };
+ }
+ }
+ else
+ {
+ foreach $key ( keys %{ $record } )
+ {
+ next if exists $no_keys{ $key };
+
+ push @keys, $key if $options->{ "comment" };
+ push @vals, $record->{ $key };
+ }
+
+ }
+ }
+
+ if ( @keys and $options->{ "comment" } )
+ {
+ print $data_out "#", join( $options->{ "delimit" }, @keys ), "\n";
+
+ delete $options->{ "comment" };
+ }
+
+ print $data_out join( $options->{ "delimit" }, @vals ), "\n" if @vals;
+
+ Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
+}
+
+close $data_out;
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+BEGIN
+{
+ $run_time_beg = Maasha::Biopieces::run_time();
+
+ Maasha::Biopieces::log_biopiece();
+}
+
+END
+{
+ Maasha::Biopieces::close_stream( $in );
+ Maasha::Biopieces::close_stream( $out );
+
+ $run_time_end = Maasha::Biopieces::run_time();
+
+ Maasha::Biopieces::run_time_print( $run_time_beg, $run_time_end, $options );
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
-use Maasha::BioRun;
+__END__
$options->{ "SCRIPT" } = $script;
- if ( $script ne "list_biopieces" and $script ne "list_genomes" ) {
- $script = "print_usage" if ( -t STDIN and keys %{ $options } <= 1 or $options->{ 'help' } );
- }
+ $script = "print_usage" if ( -t STDIN and keys %{ $options } <= 1 or $options->{ 'help' } );
$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } );
$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } );
if ( $script eq "print_usage" ) { script_print_usage( $in, $out, $options ) }
- elsif ( $script eq "list_biopieces" ) { script_list_biopieces( $in, $out, $options ) }
- elsif ( $script eq "list_genomes" ) { script_list_genomes( $in, $out, $options ) }
- elsif ( $script eq "read_fasta" ) { script_read_fasta( $in, $out, $options ) }
- elsif ( $script eq "read_tab" ) { script_read_tab( $in, $out, $options ) }
elsif ( $script eq "read_psl" ) { script_read_psl( $in, $out, $options ) }
elsif ( $script eq "read_bed" ) { script_read_bed( $in, $out, $options ) }
elsif ( $script eq "read_fixedstep" ) { script_read_fixedstep( $in, $out, $options ) }
elsif ( $script eq "read_mysql" ) { script_read_mysql( $in, $out, $options ) }
elsif ( $script eq "read_ucsc_config" ) { script_read_ucsc_config( $in, $out, $options ) }
elsif ( $script eq "assemble_tag_contigs" ) { script_assemble_tag_contigs( $in, $out, $options ) }
- elsif ( $script eq "format_genome" ) { script_format_genome( $in, $out, $options ) }
elsif ( $script eq "length_seq" ) { script_length_seq( $in, $out, $options ) }
elsif ( $script eq "uppercase_seq" ) { script_uppercase_seq( $in, $out, $options ) }
elsif ( $script eq "shuffle_seq" ) { script_shuffle_seq( $in, $out, $options ) }
elsif ( $script eq "transliterate_vals" ) { script_transliterate_vals( $in, $out, $options ) }
elsif ( $script eq "translate_seq" ) { script_translate_seq( $in, $out, $options ) }
elsif ( $script eq "extract_seq" ) { script_extract_seq( $in, $out, $options ) }
- elsif ( $script eq "get_genome_seq" ) { script_get_genome_seq( $in, $out, $options ) }
elsif ( $script eq "get_genome_align" ) { script_get_genome_align( $in, $out, $options ) }
elsif ( $script eq "get_genome_phastcons" ) { script_get_genome_phastcons( $in, $out, $options ) }
elsif ( $script eq "fold_seq" ) { script_fold_seq( $in, $out, $options ) }
elsif ( $script eq "tile_seq" ) { script_tile_seq( $in, $out, $options ) }
elsif ( $script eq "invert_align" ) { script_invert_align( $in, $out, $options ) }
elsif ( $script eq "patscan_seq" ) { script_patscan_seq( $in, $out, $options ) }
- elsif ( $script eq "create_blast_db" ) { script_create_blast_db( $in, $out, $options ) }
- elsif ( $script eq "blast_seq" ) { script_blast_seq( $in, $out, $options ) }
elsif ( $script eq "blat_seq" ) { script_blat_seq( $in, $out, $options ) }
elsif ( $script eq "soap_seq" ) { script_soap_seq( $in, $out, $options ) }
elsif ( $script eq "match_seq" ) { script_match_seq( $in, $out, $options ) }
elsif ( $script eq "create_vmatch_index" ) { script_create_vmatch_index( $in, $out, $options ) }
- elsif ( $script eq "vmatch_seq" ) { script_vmatch_seq( $in, $out, $options ) }
- elsif ( $script eq "write_fasta" ) { script_write_fasta( $in, $out, $options ) }
elsif ( $script eq "write_align" ) { script_write_align( $in, $out, $options ) }
- elsif ( $script eq "write_blast" ) { script_write_blast( $in, $out, $options ) }
- elsif ( $script eq "write_tab" ) { script_write_tab( $in, $out, $options ) }
elsif ( $script eq "write_bed" ) { script_write_bed( $in, $out, $options ) }
elsif ( $script eq "write_psl" ) { script_write_psl( $in, $out, $options ) }
elsif ( $script eq "write_fixedstep" ) { script_write_fixedstep( $in, $out, $options ) }
elsif ( $script eq "uniq_vals" ) { script_uniq_vals( $in, $out, $options ) }
elsif ( $script eq "merge_vals" ) { script_merge_vals( $in, $out, $options ) }
elsif ( $script eq "merge_records" ) { script_merge_records( $in, $out, $options ) }
- elsif ( $script eq "grab" ) { script_grab( $in, $out, $options ) }
elsif ( $script eq "compute" ) { script_compute( $in, $out, $options ) }
elsif ( $script eq "flip_tab" ) { script_flip_tab( $in, $out, $options ) }
elsif ( $script eq "add_ident" ) { script_add_ident( $in, $out, $options ) }
data_in|i=s
);
}
- elsif ( $script eq "read_fasta" )
- {
- @options = qw(
- data_in|i=s
- num|n=s
- );
- }
- elsif ( $script eq "read_tab" )
- {
- @options = qw(
- data_in|i=s
- delimit|d=s
- cols|c=s
- keys|k=s
- skip|s=s
- num|n=s
- );
- }
elsif ( $script eq "read_psl" )
{
@options = qw(
check|C
);
}
- elsif ( $script eq "format_genome" )
- {
- @options = qw(
- no_stream|x
- dir|d=s
- genome|g=s
- formats|f=s
- );
- }
elsif ( $script eq "length_seq" )
{
@options = qw(
len|l=s
);
}
- elsif ( $script eq "get_genome_seq" )
- {
- @options = qw(
- genome|g=s
- chr|c=s
- beg|b=s
- end|e=s
- len|l=s
- flank|f=s
- mask|m
- splice|s
- );
- }
elsif ( $script eq "get_genome_align" )
{
@options = qw(
genome|g=s
);
}
- elsif ( $script eq "create_blast_db" )
- {
- @options = qw(
- no_stream|x
- database|d=s
- );
- }
- elsif ( $script eq "blast_seq" )
- {
- @options = qw(
- database|d=s
- genome|g=s
- program|p=s
- e_val|e=f
- filter|f
- cpus|c=s
- no_filter|F
- );
- }
elsif ( $script eq "blat_seq" )
{
@options = qw(
no_stream|x
);
}
- elsif ( $script eq "vmatch_seq" )
- {
- @options = qw(
- genome|g=s
- index_name|i=s
- count|c
- max_hits|m=s
- hamming_dist|h=s
- edit_dist|e=s
- );
- }
- elsif ( $script eq "write_fasta" )
- {
- @options = qw(
- wrap|w=s
- no_stream|x
- data_out|o=s
- compress|Z
- );
- }
elsif ( $script eq "write_align" )
{
@options = qw(
data_out|o=s
);
}
- elsif ( $script eq "write_blast" )
- {
- @options = qw(
- no_stream|x
- data_out|o=s
- comment|c
- compress|Z
- );
- }
- elsif ( $script eq "write_tab" )
- {
- @options = qw(
- no_stream|x
- data_out|o=s
- delimit|d=s
- keys|k=s
- no_keys|K=s
- comment|c
- compress|Z
- );
- }
elsif ( $script eq "write_bed" )
{
@options = qw(
merge|m=s
);
}
- elsif ( $script eq "grab" )
- {
- @options = qw(
- patterns|p=s
- patterns_in|P=s
- regex|r=s
- eval|e=s
- exact_in|E=s
- invert|i
- case_insensitive|c
- keys|k=s
- keys_only|K
- vals_only|V
- );
- }
elsif ( $script eq "compute" )
{
@options = qw(
$options{ "quals" } = [ split ",", $options{ "quals" } ] if defined $options{ "quals" };
$options{ "feats" } = [ split ",", $options{ "feats" } ] if defined $options{ "feats" };
$options{ "frames" } = [ split ",", $options{ "frames" } ] if defined $options{ "frames" };
- $options{ "formats" } = [ split ",", $options{ "formats" } ] if defined $options{ "formats" };
$options{ "samples" } = [ split ",", $options{ "samples" } ] if defined $options{ "samples" };
$options{ "tables" } = [ split ",", $options{ "tables" } ] if defined $options{ "tables" };
$options{ "tracks" } = [ split ",", $options{ "tracks" } ] if defined $options{ "tracks" };
{
Maasha::Common::error( qq(Argument to --$opt must be "+" or "-" - not "$options{ $opt }") );
}
- elsif ( $opt eq "genome" and $script ne "format_genome" )
+ elsif ( $opt eq "genome" )
{
@genomes = Maasha::Common::ls_dirs( "$ENV{ 'BP_DATA' }/genomes" );
map { $_ =~ s/.*\/(.+)$/$1/ } @genomes;
}
Maasha::Common::error( qq(no --database specified) ) if $script eq "remove_ucsc_tracks" and not $options{ "database" };
- Maasha::Common::error( qq(no --database specified) ) if $script eq "create_blast_db" and not $options{ "database" };
Maasha::Common::error( qq(no --index_name specified) ) if $script =~ /create_vmatch_index/ and not $options{ "index_name" };
- Maasha::Common::error( qq(no --database or --genome specified) ) if $script eq "blast_seq" and not $options{ "genome" } and not $options{ "database" };
- Maasha::Common::error( qq(both --database and --genome specified) ) if $script eq "blast_seq" and $options{ "genome" } and $options{ "database" };
- Maasha::Common::error( qq(no --index_name or --genome specified) ) if $script eq "vmatch_seq" and not $options{ "genome" } and not $options{ "index_name" };
- Maasha::Common::error( qq(both --index and --genome specified) ) if $script eq "vmatch_seq" and $options{ "genome" } and $options{ "index_name" };
Maasha::Common::error( qq(no --in_file or --genome specified) ) if $script eq "soap_seq" and not $options{ "genome" } and not $options{ "in_file" };
Maasha::Common::error( qq(both --in_file and --genome specified) ) if $script eq "soap_seq" and $options{ "genome" } and $options{ "in_file" };
- Maasha::Common::error( qq(no --genome specified) ) if $script =~ /format_genome|get_genome_seq|get_genome_align|get_genome_phastcons|blat_seq|plot_phastcons_profiles|plot_karyogram/ and not $options{ "genome" };
+ Maasha::Common::error( qq(no --genome specified) ) if $script =~ /get_genome_align|get_genome_phastcons|blat_seq|plot_phastcons_profiles|plot_karyogram/ and not $options{ "genome" };
Maasha::Common::error( qq(no --key specified) ) if $script =~ /plot_lendist|plot_histogram/ and not $options{ "key" };
Maasha::Common::error( qq(no --keys speficied) ) if $script =~ /sort_records|count_vals|sum_vals|mean_vals|median_vals|length_vals/ and not $options{ "keys" };
}
-sub script_list_biopieces
-{
- # Martin A. Hansen, January 2008.
-
- # Prints the summary from the usage for each of the biopieces.
-
- my ( $in, # handle to in stream
- $out, # handle to out stream
- $options, # options hash
- ) = @_;
-
- # Returns nothing.
-
- my ( @files, $file, $wiki, $program, $summary );
-
- @files = Maasha::Common::ls_files( "$ENV{ 'BP_DIR' }/bp_usage" );
-
- foreach $file ( sort @files )
- {
- if ( $file =~ /\/([a-z0-9_]+)\.wiki$/ )
- {
- $program = $1;
-
- $wiki = Maasha::Gwiki::gwiki_read( $file );
-
- $summary = $wiki->[ 0 ]->[ 0 ]->{ 'TEXT' };
- $summary =~ s/^#summary\s+//;
-
- printf( "%-30s%s\n", $program, $summary );
- }
- }
-
- exit;
-}
-
-
-sub script_list_genomes
-{
- # Martin A. Hansen, January 2008.
-
- # Prints the synopsis from the usage for each of the biopieces.
-
- my ( $in, # handle to in stream
- $out, # handle to out stream
- $options, # options hash
- ) = @_;
-
- # Returns nothing.
-
- my ( @genomes, $genome, @formats, $format, %hash, %found, @row );
-
- @genomes = Maasha::Common::ls_dirs( "$ENV{ 'BP_DATA' }/genomes" );
-
- foreach $genome ( @genomes )
- {
- next if $genome =~ /\.$/;
-
- @formats = Maasha::Common::ls_dirs( $genome );
-
- foreach $format ( @formats )
- {
- if ( $format =~ /\/([^\/]+)\/(\w+)$/ )
- {
- $hash{ $1 }{ $2 } = 1;
-
- $found{ $2 } = 1;
- }
- }
- }
-
- @row = "Genome";
-
- map { push @row, $_ } sort keys %found;
-
- print join( "\t", @row ), "\n";
-
- foreach $genome ( sort keys %hash )
- {
- @row = $genome;
-
- foreach $format ( sort keys %found )
- {
- if ( exists $hash{ $genome }{ $format } ) {
- push @row, "yes";
- } else {
- push @row, "no";
- }
- }
-
- print join( "\t", @row ), "\n";
- }
-}
-
-
-sub script_read_fasta
-{
- # Martin A. Hansen, August 2007.
-
- # Read sequences from FASTA file.
-
- my ( $in, # handle to in stream
- $out, # handle to out stream
- $options, # options hash
- ) = @_;
-
- # Returns nothing.
-
- my ( $record, $file, $data_in, $entry, $num );
-
- while ( $record = Maasha::Biopieces::get_record( $in ) ) {
- Maasha::Biopieces::put_record( $record, $out );
- }
-
- $num = 1;
-
- foreach $file ( @{ $options->{ "files" } } )
- {
- $data_in = Maasha::Common::read_open( $file );
-
- while ( $entry = Maasha::Fasta::get_entry( $data_in ) )
- {
- if ( defined $entry->[ SEQ_NAME ] and $entry->[ SEQ ] )
- {
- $record = {
- SEQ_NAME => $entry->[ SEQ_NAME ],
- SEQ => $entry->[ SEQ ],
- SEQ_LEN => length $entry->[ SEQ ],
- };
-
- Maasha::Biopieces::put_record( $record, $out );
- }
-
- goto NUM if $options->{ "num" } and $num == $options->{ "num" };
-
- $num++;
- }
-
- close $data_in;
- }
-
- NUM:
-
- close $data_in if $data_in;
-}
-
-
-sub script_read_tab
-{
- # Martin A. Hansen, August 2007.
-
- # Read table or table columns from stream or file.
-
- my ( $in, # handle to in stream
- $out, # handle to out stream
- $options, # options hash
- ) = @_;
-
- # Returns nothing.
-
- my ( $file, $line, @fields, @fields2, $i, $record, $data_in, $skip, $num );
-
- $options->{ 'delimit' } ||= '\s+';
-
- while ( $record = Maasha::Biopieces::get_record( $in ) ) {
- Maasha::Biopieces::put_record( $record, $out );
- }
-
- $skip = $options->{ 'skip' } ||= 0;
- $num = 1;
-
- foreach $file ( @{ $options->{ "files" } } )
- {
- $data_in = Maasha::Common::read_open( $file );
-
- while ( $line = <$data_in> )
- {
- if ( $skip )
- {
- $skip--;
- next;
- }
-
- next if $line =~ /^#|^$/;
-
- chomp $line;
-
- undef $record;
- undef @fields2;
-
- @fields = split /$options->{'delimit'}/, $line;
-
- if ( $options->{ "cols" } ) {
- map { push @fields2, $fields[ $_ ] } @{ $options->{ "cols" } };
- } else {
- @fields2 = @fields;
- }
-
- for ( $i = 0; $i < @fields2; $i++ )
- {
- if ( $options->{ "keys" }->[ $i ] ) {
- $record->{ $options->{ "keys" }->[ $i ] } = $fields2[ $i ];
- } else {
- $record->{ "V" . $i } = $fields2[ $i ];
- }
- }
-
- Maasha::Biopieces::put_record( $record, $out );
-
- goto NUM if $options->{ "num" } and $num == $options->{ "num" };
-
- $num++;
- }
-
- close $data_in;
- }
-
- NUM:
-
- close $data_in if $data_in;
-}
-
-
sub script_read_psl
{
# Martin A. Hansen, August 2007.
}
-sub script_format_genome
+sub script_length_seq
{
- # Martin A. Hansen, Juli 2008.
+ # Martin A. Hansen, August 2007.
- # Format a genome to speficed formats.
+ # Determine the length of sequences in stream.
my ( $in, # handle to in stream
$out, # handle to out stream
# Returns nothing.
- my ( $dir, $genome, $fasta_dir, $phastcons_dir, $vals, $fh_out, $record, $format, $index, $entry );
-
- $dir = $options->{ 'dir' } || $ENV{ 'BP_DATA' };
- $genome = $options->{ 'genome' };
-
- Maasha::Common::error( "Directory: $dir does not exist" ) if not -d $dir;
- Maasha::Common::dir_create_if_not_exists( "$dir/genomes" );
- Maasha::Common::dir_create_if_not_exists( "$dir/genomes/$genome" );
+ my ( $record, $total );
- if ( grep { $_ =~ /fasta|blast|vmatch/i } @{ $options->{ "formats" } } )
+ while ( $record = Maasha::Biopieces::get_record( $in ) )
{
- if ( -f "$dir/genomes/$genome/fasta/$genome.fna" )
- {
- $fasta_dir = "$dir/genomes/$genome/fasta";
- }
- else
- {
- Maasha::Common::dir_create_if_not_exists( "$dir/genomes/$genome/fasta" );
-
- $fasta_dir = "$dir/genomes/$genome/fasta";
-
- $fh_out = Maasha::Common::write_open( "$fasta_dir/$genome.fna" );
- }
- }
- elsif ( grep { $_ =~ /phastcons/i } @{ $options->{ "formats" } } )
- {
- Maasha::Common::dir_create_if_not_exists( "$dir/genomes/$genome/phastcons" );
-
- $phastcons_dir = "$dir/genomes/$genome/phastcons";
-
- $fh_out = Maasha::Common::write_open( "$phastcons_dir/$genome.pp" );
- }
-
- while ( $record = Maasha::Biopieces::get_record( $in ) )
- {
- if ( $fh_out and $entry = Maasha::Fasta::biopiece2fasta( $record ) )
- {
- Maasha::Fasta::put_entry( $entry, $fh_out, $options->{ "wrap" } );
- }
- elsif ( $fh_out and $record->{ "CHR" } and $record->{ "CHR_BEG" } and $record->{ "STEP" } and $record->{ "VALS" } )
- {
- print $fh_out "fixedStep chrom=$record->{ 'CHR' } start=$record->{ 'CHR_BEG' } step=$record->{ 'STEP' }\n";
-
- $vals = $record->{ 'VALS' };
-
- $vals =~ tr/,/\n/;
-
- print $fh_out "$vals\n";
- }
-
- Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
- }
-
- foreach $format ( @{ $options->{ 'formats' } } )
- {
- if ( $format =~ /^fasta$/i ) { Maasha::Fasta::fasta_index( "$fasta_dir/$genome.fna", "$dir/genomes/$genome/fasta/$genome.index" ) }
- elsif ( $format =~ /^blast$/i ) { Maasha::NCBI::blast_index( "$genome.fna", $fasta_dir, "$dir/genomes/$genome/blast", "dna", $genome ) }
- elsif ( $format =~ /^blat$/i ) { print STDERR "BLAT FORMAT NOT IMPLEMENTED" }
- elsif ( $format =~ /^vmatch$/i ) { Maasha::Match::vmatch_index( "$genome.fna", $fasta_dir, "$dir/genomes/$genome/vmatch", $BP_TMP ) }
- elsif ( $format =~ /^phastcons$/i ) { Maasha::UCSC::phastcons_index( "$genome.pp", $phastcons_dir ) }
- }
-
- close $fh_out if $fh_out;
-}
-
-
-sub script_length_seq
-{
- # Martin A. Hansen, August 2007.
-
- # Determine the length of sequences in stream.
-
- my ( $in, # handle to in stream
- $out, # handle to out stream
- $options, # options hash
- ) = @_;
-
- # Returns nothing.
-
- my ( $record, $total );
-
- while ( $record = Maasha::Biopieces::get_record( $in ) )
- {
- if ( $record->{ "SEQ" } )
+ if ( $record->{ "SEQ" } )
{
$record->{ "SEQ_LEN" } = length $record->{ "SEQ" };
$total += $record->{ "SEQ_LEN" };
}
-sub script_uppercase_seq
-{
- # Martin A. Hansen, August 2007.
-
- # Uppercases sequences in stream.
-
- my ( $in, # handle to in stream
- $out, # handle to out stream
- ) = @_;
-
- # Returns nothing.
-
- my ( $record );
-
- while ( $record = Maasha::Biopieces::get_record( $in ) )
- {
- $record->{ "SEQ" } = uc $record->{ "SEQ" } if $record->{ "SEQ" };
-
- Maasha::Biopieces::put_record( $record, $out );
- }
-}
-
-
sub script_shuffle_seq
{
# Martin A. Hansen, December 2007.
# Maasha::Common::run( "bed2fixedstep", "< $bed_file > $fixedstep_file" );
-
$fh_in = Maasha::Filesys::file_read_open( $fixedstep_file );
while ( $fixedstep_entry = Maasha::UCSC::Wiggle::fixedstep_entry_get( $fh_in ) )
}
-sub script_get_genome_seq
-{
- # Martin A. Hansen, December 2007.
-
- # Gets a subsequence from a genome.
-
- my ( $in, # handle to in stream
- $out, # handle to out stream
- $options, # options hash
- ) = @_;
-
- # Returns nothing.
-
- my ( $record, $genome, $genome_file, $index_file, $index, $fh, $index_head, $index_beg, $index_len, $beg, $len, %lookup_hash, @begs, @lens, $i, $seq );
-
- $options->{ "flank" } ||= 0;
-
- if ( $options->{ "genome" } )
- {
- $genome = $options->{ "genome" };
-
- $genome_file = "$ENV{ 'BP_DATA' }/genomes/$genome/fasta/$genome.fna";
- $index_file = "$ENV{ 'BP_DATA' }/genomes/$genome/fasta/$genome.index";
-
- $fh = Maasha::Common::read_open( $genome_file );
- $index = Maasha::Fasta::index_retrieve( $index_file );
-
- shift @{ $index }; # Get rid of the file size info
-
- map { $lookup_hash{ $_->[ 0 ] } = [ $_->[ 1 ], $_->[ 2 ] ] } @{ $index };
-
- if ( exists $lookup_hash{ $options->{ "chr" } } and defined $options->{ "beg" } and ( defined $options->{ "end" } or defined $options->{ "len" } ) )
- {
- ( $index_beg, $index_len ) = @{ $lookup_hash{ $options->{ "chr" } } };
-
- $beg = $index_beg + $options->{ "beg" } - 1;
-
- if ( $options->{ "len" } ) {
- $len = $options->{ "len" };
- } elsif ( $options->{ "end" } ) {
- $len = ( $options->{ "end" } - $options->{ "beg" } + 1 );
- }
-
- $beg -= $options->{ "flank" };
- $len += 2 * $options->{ "flank" };
-
- if ( $beg <= $index_beg )
- {
- $len -= $index_beg - $beg;
- $beg = $index_beg;
- }
-
- $len = $index_beg + $index_len - $beg if $beg + $len > $index_beg + $index_len;
-
- next if $beg > $index_beg + $index_len;
-
- $record->{ "CHR" } = $options->{ "chr" };
- $record->{ "CHR_BEG" } = $beg - $index_beg;
- $record->{ "CHR_END" } = $record->{ "CHR_BEG" } + $len - 1;
-
- $record->{ "SEQ" } = Maasha::Common::file_read( $fh, $beg, $len );
- $record->{ "SEQ_LEN" } = $len;
-
- Maasha::Biopieces::put_record( $record, $out );
- }
- }
-
- while ( $record = Maasha::Biopieces::get_record( $in ) )
- {
- if ( $options->{ "genome" } and not $record->{ "SEQ" } )
- {
- if ( $record->{ "REC_TYPE" } eq "BED" and exists $lookup_hash{ $record->{ "CHR" } } )
- {
- ( $index_beg, $index_len ) = @{ $lookup_hash{ $record->{ "CHR" } } };
-
- $beg = $record->{ "CHR_BEG" } + $index_beg;
- $len = $record->{ "CHR_END" } - $record->{ "CHR_BEG" } + 1;
- }
- elsif ( $record->{ "REC_TYPE" } eq "PSL" and exists $lookup_hash{ $record->{ "S_ID" } } )
- {
- ( $index_beg, $index_len ) = @{ $lookup_hash{ $record->{ "S_ID" } } };
-
- $beg = $record->{ "S_BEG" } + $index_beg;
- $len = $record->{ "S_END" } - $record->{ "S_BEG" } + 1;
- }
- elsif ( $record->{ "REC_TYPE" } eq "BLAST" and exists $lookup_hash{ $record->{ "S_ID" } } )
- {
- ( $index_beg, $index_len ) = @{ $lookup_hash{ $record->{ "S_ID" } } };
-
- $beg = $record->{ "S_BEG" } + $index_beg;
- $len = $record->{ "S_END" } - $record->{ "S_BEG" } + 1;
- }
-
- $beg -= $options->{ "flank" };
- $len += 2 * $options->{ "flank" };
-
- if ( $beg <= $index_beg )
- {
- $len -= $index_beg - $beg;
- $beg = $index_beg;
- }
-
- $len = $index_beg + $index_len - $beg if $beg + $len > $index_beg + $index_len;
-
- next if $beg > $index_beg + $index_len;
-
- $record->{ "CHR_BEG" } = $beg - $index_beg;
- $record->{ "CHR_END" } = $record->{ "CHR_BEG" } + $len - 1;
-
- $record->{ "SEQ" } = Maasha::Common::file_read( $fh, $beg, $len );
-
- if ( $record->{ "STRAND" } and $record->{ "STRAND" } eq "-" )
- {
- Maasha::Seq::dna_comp( \$record->{ "SEQ" } );
- $record->{ "SEQ" } = reverse $record->{ "SEQ" };
- }
-
- if ( $options->{ "mask" } )
- {
- if ( $record->{ "BLOCK_COUNT" } > 1 ) # uppercase hit block segments and lowercase the rest.
- {
- $record->{ "SEQ" } = lc $record->{ "SEQ" };
-
- @begs = split ",", $record->{ "Q_BEGS" };
- @lens = split ",", $record->{ "BLOCK_LENS" };
-
- for ( $i = 0; $i < @begs; $i++ ) {
- substr $record->{ "SEQ" }, $begs[ $i ], $lens[ $i ], uc substr $record->{ "SEQ" }, $begs[ $i ], $lens[ $i ];
- }
- }
- }
- elsif ( $options->{ "splice" } )
- {
- if ( $record->{ "BLOCK_COUNT" } > 1 ) # splice block sequences
- {
- $seq = "";
- @begs = split ",", $record->{ "Q_BEGS" };
- @lens = split ",", $record->{ "BLOCK_LENS" };
-
- for ( $i = 0; $i < @begs; $i++ ) {
- $seq .= substr $record->{ "SEQ" }, $begs[ $i ], $lens[ $i ];
- }
-
- $record->{ "SEQ" } = $seq;
- }
- }
-
- $record->{ "SEQ_LEN" } = length $record->{ "SEQ" };
- }
-
- Maasha::Biopieces::put_record( $record, $out );
- }
-
- close $fh if $fh;
-}
-
-
sub script_get_genome_align
{
# Martin A. Hansen, April 2008.
}
-sub script_create_blast_db
-{
- # Martin A. Hansen, September 2007.
-
- # Creates a NCBI BLAST database with formatdb
-
- my ( $in, # handle to in stream
- $out, # handle to out stream
- $options, # options hash
- ) = @_;
-
- # Returns nothing.
-
- my ( $fh, $seq_type, $path, $record, $entry );
-
- $path = $options->{ "database" };
-
- $fh = Maasha::Common::write_open( $path );
-
- while ( $record = Maasha::Biopieces::get_record( $in ) )
- {
- Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
-
- if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) )
- {
- $seq_type = Maasha::Seq::seq_guess_type( $entry->[ SEQ ] ) if not $seq_type;
-
- Maasha::Fasta::put_entry( $entry, $fh );
- }
- }
-
- close $fh;
-
- if ( $seq_type eq "protein" ) {
- Maasha::Common::run( "formatdb", "-p T -i $path -t $options->{ 'database' }" );
- } else {
- Maasha::Common::run( "formatdb", "-p F -i $path -t $options->{ 'database' }" );
- }
-
- unlink $path;
-}
-
-
-sub script_blast_seq
-{
- # Martin A. Hansen, September 2007.
-
- # BLASTs sequences in stream against a given database.
-
- my ( $in, # handle to in stream
- $out, # handle to out stream
- $options, # options hash
- ) = @_;
-
- # Returns nothing.
-
- my ( $genome, $q_type, $s_type, $tmp_in, $tmp_out, $fh_in, $fh_out, $record, $line, @fields, $entry );
-
- $options->{ "e_val" } = 10 if not defined $options->{ "e_val" };
- $options->{ "filter" } = "F";
- $options->{ "filter" } = "T" if $options->{ "filter" };
- $options->{ "cpus" } ||= 1;
-
- $options->{ "database" } = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/blast/$options->{ 'genome' }.fna" if $options->{ 'genome' };
-
- $tmp_in = "$BP_TMP/blast_query.seq";
- $tmp_out = "$BP_TMP/blast.result";
-
- $fh_out = Maasha::Filesys::file_write_open( $tmp_in );
-
- while ( $record = Maasha::Biopieces::get_record( $in ) )
- {
- if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) )
- {
- $q_type = Maasha::Seq::seq_guess_type( $entry->[ SEQ ] ) if not $q_type;
-
- Maasha::Fasta::put_entry( $entry, $fh_out );
- }
-
- Maasha::Biopieces::put_record( $record, $out );
- }
-
- close $fh_out;
-
- if ( -f $options->{ 'database' } . ".phr" ) {
- $s_type = "protein";
- } else {
- $s_type = "nucleotide";
- }
-
- if ( not $options->{ 'program' } )
- {
- if ( $q_type ne "protein" and $s_type ne "protein" ) {
- $options->{ 'program' } = "blastn";
- } elsif ( $q_type eq "protein" and $s_type eq "protein" ) {
- $options->{ 'program' } = "blastp";
- } elsif ( $q_type ne "protein" and $s_type eq "protein" ) {
- $options->{ 'program' } = "blastx";
- } elsif ( $q_type eq "protein" and $s_type ne "protein" ) {
- $options->{ 'program' } = "tblastn";
- }
- }
-
- if ( $options->{ 'verbose' } )
- {
- Maasha::Common::run(
- "blastall",
- join( " ",
- "-p $options->{ 'program' }",
- "-e $options->{ 'e_val' }",
- "-a $options->{ 'cpus' }",
- "-m 8",
- "-i $tmp_in",
- "-d $options->{ 'database' }",
- "-F $options->{ 'filter' }",
- "-o $tmp_out",
- ),
- 1
- );
- }
- else
- {
- Maasha::Common::run(
- "blastall",
- join( " ",
- "-p $options->{ 'program' }",
- "-e $options->{ 'e_val' }",
- "-a $options->{ 'cpus' }",
- "-m 8",
- "-i $tmp_in",
- "-d $options->{ 'database' }",
- "-F $options->{ 'filter' }",
- "-o $tmp_out",
- "> /dev/null 2>&1"
- ),
- 1
- );
- }
-
- unlink $tmp_in;
-
- $fh_out = Maasha::Filesys::file_read_open( $tmp_out );
-
- undef $record;
-
- while ( $line = <$fh_out> )
- {
- chomp $line;
-
- next if $line =~ /^#/;
-
- @fields = split /\s+/, $line;
-
- $record->{ "REC_TYPE" } = "BLAST";
- $record->{ "Q_ID" } = $fields[ 0 ];
- $record->{ "S_ID" } = $fields[ 1 ];
- $record->{ "IDENT" } = $fields[ 2 ];
- $record->{ "ALIGN_LEN" } = $fields[ 3 ];
- $record->{ "MISMATCHES" } = $fields[ 4 ];
- $record->{ "GAPS" } = $fields[ 5 ];
- $record->{ "Q_BEG" } = $fields[ 6 ] - 1; # BLAST is 1-based
- $record->{ "Q_END" } = $fields[ 7 ] - 1; # BLAST is 1-based
- $record->{ "S_BEG" } = $fields[ 8 ] - 1; # BLAST is 1-based
- $record->{ "S_END" } = $fields[ 9 ] - 1; # BLAST is 1-based
- $record->{ "E_VAL" } = $fields[ 10 ];
- $record->{ "BIT_SCORE" } = $fields[ 11 ];
-
- if ( $record->{ "S_BEG" } > $record->{ "S_END" } )
- {
- $record->{ "STRAND" } = '-';
-
- ( $record->{ "S_BEG" }, $record->{ "S_END" } ) = ( $record->{ "S_END" }, $record->{ "S_BEG" } );
- }
- else
- {
- $record->{ "STRAND" } = '+';
- }
-
- Maasha::Biopieces::put_record( $record, $out );
- }
-
- close $fh_out;
-
- unlink $tmp_out;
-}
-
-
sub script_blat_seq
{
# Martin A. Hansen, August 2007.
{
$args = join( " ",
"-s $options->{ 'seed_size' }",
- "-r 2",
- "-a $tmp_in",
- "-v $options->{ 'mismatches' }",
- "-g $options->{ 'gap_size' }",
- "-p $options->{ 'cpus' }",
- "-d $options->{ 'in_file' }",
- "-o $tmp_out",
- );
-
- $args .= " > /dev/null 2>&1" if not $options->{ 'verbose' };
-
- Maasha::Common::run( "soap", $args, 1 );
-
- unlink $tmp_in;
-
- $fh_out = Maasha::Common::read_open( $tmp_out );
-
- undef $record;
-
- while ( $line = <$fh_out> )
- {
- chomp $line;
-
- @fields = split /\t/, $line;
-
- $record->{ "REC_TYPE" } = "SOAP";
- $record->{ "Q_ID" } = $fields[ 0 ];
- $record->{ "SCORE" } = $fields[ 3 ];
- $record->{ "STRAND" } = $fields[ 6 ];
- $record->{ "S_ID" } = $fields[ 7 ];
- $record->{ "S_BEG" } = $fields[ 8 ] - 1; # soap is 1-based
- $record->{ "S_END" } = $fields[ 8 ] + $fields[ 5 ] - 2;
-
- Maasha::Biopieces::put_record( $record, $out );
- }
-
- close $fh_out;
- }
-
- unlink $tmp_out;
-}
-
-
-sub script_match_seq
-{
- # Martin A. Hansen, August 2007.
-
- # BLATs sequences in stream against a given genome.
-
- my ( $in, # handle to in stream
- $out, # handle to out stream
- $options, # options hash
- ) = @_;
-
- # Returns nothing.
-
- my ( $record, @entries, $results );
-
- $options->{ "word_size" } ||= 20;
- $options->{ "direction" } ||= "both";
-
- while ( $record = Maasha::Biopieces::get_record( $in ) )
- {
- if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) {
- push @entries, [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ];
- }
-
- Maasha::Biopieces::put_record( $record, $out );
- }
-
- if ( @entries == 1 )
- {
- $results = Maasha::Match::match_mummer( [ $entries[ 0 ] ], [ $entries[ 0 ] ], $options, $BP_TMP );
-
- map { Maasha::Biopieces::put_record( $_, $out ) } @{ $results };
- }
- elsif ( @entries == 2 )
- {
- $results = Maasha::Match::match_mummer( [ $entries[ 0 ] ], [ $entries[ 1 ] ], $options, $BP_TMP );
-
- map { Maasha::Biopieces::put_record( $_, $out ) } @{ $results };
- }
-}
-
-
-sub script_create_vmatch_index
-{
- # Martin A. Hansen, January 2008.
-
- # Create a vmatch index from sequences in the stream.
-
- my ( $in, # handle to in stream
- $out, # handle to out stream
- $options, # options hash
- ) = @_;
-
- # Returns nothing.
-
- my ( $record, $file_tmp, $fh_tmp, $type, $entry );
-
- if ( $options->{ "index_name" } )
- {
- $file_tmp = $options->{ 'index_name' };
- $fh_tmp = Maasha::Common::write_open( $file_tmp );
- }
-
- while ( $record = Maasha::Biopieces::get_record( $in ) )
- {
- if ( $options->{ "index_name" } and $entry = Maasha::Fasta::biopiece2fasta( $record ) )
- {
- Maasha::Fasta::put_entry( $entry, $fh_tmp );
-
- $type = Maasha::Seq::seq_guess_type( $entry->[ SEQ ] ) if not defined $type;
- }
-
- Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
- }
-
- if ( $options->{ "index_name" } )
- {
- close $fh_tmp;
-
- if ( $type eq "protein" ) {
- Maasha::Common::run( "mkvtree", "-db $file_tmp -protein -pl $options->{ 'prefix_length' } -allout -indexname $file_tmp > /dev/null 2>&1" );
- } else {
- Maasha::Common::run( "mkvtree", "-db $file_tmp -dna -pl $options->{ 'prefix_length' } -allout -indexname $file_tmp > /dev/null 2>&1" );
- }
-
- unlink $file_tmp;
- }
-}
-
-
-sub script_vmatch_seq
-{
- # Martin A. Hansen, August 2007.
-
- # Vmatches sequences in stream against a given genome.
-
- my ( $in, # handle to in stream
- $out, # handle to out stream
- $options, # options hash
- ) = @_;
-
- # Returns nothing.
-
- my ( @index_files, @records, $result_file, $fh_in, $record, %hash );
-
- $options->{ 'count' } = 1 if $options->{ 'max_hits' };
-
- if ( $options->{ "index_name" } )
- {
- @index_files = $options->{ "index_name" };
- }
- else
- {
- @index_files = Maasha::Common::ls_files( "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/vmatch" );
-
- map { $_ =~ /^(.+)\.[a-z1]{3,4}$/; $hash{ $1 } = 1 } @index_files;
-
- @index_files = sort keys %hash;
- }
-
- while ( $record = Maasha::Biopieces::get_record( $in ) )
- {
- push @records, $record;
-
- Maasha::Biopieces::put_record( $record, $out );
- }
-
- $result_file = Maasha::Match::match_vmatch( $BP_TMP, \@records, \@index_files, $options );
-
- undef @records;
-
- $fh_in = Maasha::Common::read_open( $result_file );
-
- while ( $record = Maasha::Match::vmatch_get_entry( $fh_in ) ) {
- Maasha::Biopieces::put_record( $record, $out );
- }
-
- close $fh_in;
+ "-r 2",
+ "-a $tmp_in",
+ "-v $options->{ 'mismatches' }",
+ "-g $options->{ 'gap_size' }",
+ "-p $options->{ 'cpus' }",
+ "-d $options->{ 'in_file' }",
+ "-o $tmp_out",
+ );
- unlink $result_file;
-}
+ $args .= " > /dev/null 2>&1" if not $options->{ 'verbose' };
+ Maasha::Common::run( "soap", $args, 1 );
-sub script_write_fasta
-{
- # Martin A. Hansen, August 2007.
+ unlink $tmp_in;
- # Write FASTA entries from sequences in stream.
+ $fh_out = Maasha::Common::read_open( $tmp_out );
- my ( $in, # handle to in stream
- $out, # handle to out stream
- $options, # options hash
- ) = @_;
+ undef $record;
- # Returns nothing.
+ while ( $line = <$fh_out> )
+ {
+ chomp $line;
- my ( $record, $fh, $entry );
+ @fields = split /\t/, $line;
- $fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } );
+ $record->{ "REC_TYPE" } = "SOAP";
+ $record->{ "Q_ID" } = $fields[ 0 ];
+ $record->{ "SCORE" } = $fields[ 3 ];
+ $record->{ "STRAND" } = $fields[ 6 ];
+ $record->{ "S_ID" } = $fields[ 7 ];
+ $record->{ "S_BEG" } = $fields[ 8 ] - 1; # soap is 1-based
+ $record->{ "S_END" } = $fields[ 8 ] + $fields[ 5 ] - 2;
- while ( $record = Maasha::Biopieces::get_record( $in ) )
- {
- if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) {
- Maasha::Fasta::put_entry( $entry, $fh, $options->{ "wrap" } );
+ Maasha::Biopieces::put_record( $record, $out );
}
- Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
+ close $fh_out;
}
- close $fh;
+ unlink $tmp_out;
}
-sub script_write_align
+sub script_match_seq
{
# Martin A. Hansen, August 2007.
- # Write pretty alignments aligned sequences in stream.
+ # BLATs sequences in stream against a given genome.
my ( $in, # handle to in stream
$out, # handle to out stream
# Returns nothing.
- my ( $fh, $record, @entries );
+ my ( $record, @entries, $results );
- $fh = write_stream( $options->{ "data_out" } ) ;
+ $options->{ "word_size" } ||= 20;
+ $options->{ "direction" } ||= "both";
while ( $record = Maasha::Biopieces::get_record( $in ) )
{
push @entries, [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ];
}
- Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
+ Maasha::Biopieces::put_record( $record, $out );
}
- if ( scalar( @entries ) == 2 ) {
- Maasha::Align::align_print_pairwise( $entries[ 0 ], $entries[ 1 ], $fh, $options->{ "wrap" } );
- } elsif ( scalar ( @entries ) > 2 ) {
- Maasha::Align::align_print_multi( \@entries, $fh, $options->{ "wrap" }, $options->{ "no_ruler" }, $options->{ "no_consensus" } );
+ if ( @entries == 1 )
+ {
+ $results = Maasha::Match::match_mummer( [ $entries[ 0 ] ], [ $entries[ 0 ] ], $options, $BP_TMP );
+
+ map { Maasha::Biopieces::put_record( $_, $out ) } @{ $results };
}
+ elsif ( @entries == 2 )
+ {
+ $results = Maasha::Match::match_mummer( [ $entries[ 0 ] ], [ $entries[ 1 ] ], $options, $BP_TMP );
- close $fh if $fh;
+ map { Maasha::Biopieces::put_record( $_, $out ) } @{ $results };
+ }
}
-sub script_write_blast
+sub script_create_vmatch_index
{
- # Martin A. Hansen, November 2007.
+ # Martin A. Hansen, January 2008.
- # Write data in blast table format (-m8 and 9).
+ # Create a vmatch index from sequences in the stream.
my ( $in, # handle to in stream
$out, # handle to out stream
# Returns nothing.
- my ( $fh, $record, $first );
-
- $fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } ) ;
+ my ( $record, $file_tmp, $fh_tmp, $type, $entry );
- $first = 1;
+ if ( $options->{ "index_name" } )
+ {
+ $file_tmp = $options->{ 'index_name' };
+ $fh_tmp = Maasha::Common::write_open( $file_tmp );
+ }
while ( $record = Maasha::Biopieces::get_record( $in ) )
{
- if ( $record->{ "REC_TYPE" } eq "BLAST" )
+ if ( $options->{ "index_name" } and $entry = Maasha::Fasta::biopiece2fasta( $record ) )
{
- if ( $options->{ "comment" } and $first )
- {
- print "# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score\n";
-
- $first = 0;
- }
-
- if ( $record->{ "STRAND" } eq "-" ) {
- ( $record->{ "S_BEG" }, $record->{ "S_END" } ) = ( $record->{ "S_END" }, $record->{ "S_BEG" } );
- }
+ Maasha::Fasta::put_entry( $entry, $fh_tmp );
- print $fh join( "\t",
- $record->{ "Q_ID" },
- $record->{ "S_ID" },
- $record->{ "IDENT" },
- $record->{ "ALIGN_LEN" },
- $record->{ "MISMATCHES" },
- $record->{ "GAPS" },
- $record->{ "Q_BEG" } + 1,
- $record->{ "Q_END" } + 1,
- $record->{ "S_BEG" } + 1,
- $record->{ "S_END" } + 1,
- $record->{ "E_VAL" },
- $record->{ "BIT_SCORE" }
- ), "\n";
+ $type = Maasha::Seq::seq_guess_type( $entry->[ SEQ ] ) if not defined $type;
}
Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
}
- close $fh;
+ if ( $options->{ "index_name" } )
+ {
+ close $fh_tmp;
+
+ if ( $type eq "protein" ) {
+ Maasha::Common::run( "mkvtree", "-db $file_tmp -protein -pl $options->{ 'prefix_length' } -allout -indexname $file_tmp > /dev/null 2>&1" );
+ } else {
+ Maasha::Common::run( "mkvtree", "-db $file_tmp -dna -pl $options->{ 'prefix_length' } -allout -indexname $file_tmp > /dev/null 2>&1" );
+ }
+
+ unlink $file_tmp;
+ }
}
-sub script_write_tab
+sub script_write_align
{
# Martin A. Hansen, August 2007.
- # Write data as table.
+ # Write pretty alignments aligned sequences in stream.
my ( $in, # handle to in stream
$out, # handle to out stream
# Returns nothing.
- my ( $fh, $record, $key, @keys, @vals, $ok, %no_keys, $A, $B );
-
- $options->{ "delimit" } ||= "\t";
-
- map { $no_keys{ $_ } = 1 } @{ $options->{ "no_keys" } };
+ my ( $fh, $record, @entries );
- $fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } );
+ $fh = write_stream( $options->{ "data_out" } ) ;
while ( $record = Maasha::Biopieces::get_record( $in ) )
{
- undef @vals;
- $ok = 1;
-
- if ( $options->{ "keys" } )
- {
- map { $ok = 0 if not exists $record->{ $_ } } @{ $options->{ "keys" } };
-
- if ( $ok )
- {
- foreach $key ( @{ $options->{ "keys" } } )
- {
- if ( exists $record->{ $key } )
- {
- push @keys, $key if $options->{ "comment" };
- push @vals, $record->{ $key };
- }
- }
- }
- }
- else
- {
- foreach $key ( sort { $A = $a; $B = $b; $A =~ s/^V(\d+)$/$1/; $B =~ s/^V(\d+)$/$1/; $A <=> $B } keys %{ $record } )
- {
- next if exists $no_keys{ $key };
-
- push @keys, $key if $options->{ "comment" };
- push @vals, $record->{ $key };
- }
- }
-
- if ( @keys and $options->{ "comment" } )
- {
- print $fh "#", join( $options->{ "delimit" }, @keys ), "\n";
-
- delete $options->{ "comment" };
+ if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) {
+ push @entries, [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ];
}
- print $fh join( $options->{ "delimit" }, @vals ), "\n" if @vals;
-
Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" };
}
- close $fh;
+ if ( scalar( @entries ) == 2 ) {
+ Maasha::Align::align_print_pairwise( $entries[ 0 ], $entries[ 1 ], $fh, $options->{ "wrap" } );
+ } elsif ( scalar ( @entries ) > 2 ) {
+ Maasha::Align::align_print_multi( \@entries, $fh, $options->{ "wrap" }, $options->{ "no_ruler" }, $options->{ "no_consensus" } );
+ }
+
+ close $fh if $fh;
}
$cols = $options->{ 'cols' }->[ 0 ];
- $fh = write_stream( $options->{ 'data_out' }, $options->{ 'compress' } );
+ $fh = Maasha::Biopieces::write_stream( $options->{ 'data_out' }, $options->{ 'compress' } );
while ( $record = Maasha::Biopieces::get_record( $in ) )
{
}
-sub script_grab
-{
- # Martin A. Hansen, August 2007.
-
- # Grab for records in stream.
-
- my ( $in, # handle to in stream
- $out, # handle to out stream
- $options, # options hash
- ) = @_;
-
- # Returns nothing.
-
- my ( $keys, $vals_only, $keys_only, $invert, $patterns, $pattern, $regex, $record, $key, $op, $val, %lookup_hash, $found );
-
- $keys = $options->{ 'keys' };
- $vals_only = $options->{ 'vals_only' };
- $keys_only = $options->{ 'keys_only' };
- $invert = $options->{ 'invert' };
-
- if ( $options->{ 'patterns' } )
- {
- $patterns = [ split ",", $options->{ 'patterns' } ];
- }
- elsif ( -f $options->{ 'patterns_in' } )
- {
- $patterns = Maasha::Patscan::read_patterns( $options->{ 'patterns_in' } );
- }
- elsif ( $options->{ 'regex' } )
- {
- if ( $options->{ 'case_insensitive' } ) {
- $regex = qr/$options->{ 'regex' }/i;
- } else {
- $regex = qr/$options->{ 'regex' }/;
- }
- }
- elsif ( -f $options->{ 'exact_in' } )
- {
- $patterns = Maasha::Patscan::read_patterns( $options->{ 'exact_in' } );
-
- map { $lookup_hash{ $_ } = 1 } @{ $patterns };
-
- undef $patterns;
- }
- elsif ( $options->{ 'eval' } )
- {
- if ( $options->{ 'eval' } =~ /^([^><=! ]+)\s*(>=|<=|>|<|=|!=|eq|ne)\s*(.+)$/ )
- {
- $key = $1;
- $op = $2;
- $val = $3;
- }
- }
-
- while ( $record = Maasha::Biopieces::get_record( $in ) )
- {
- $found = 0;
-
- if ( %lookup_hash ) {
- $found = grab_lookup( \%lookup_hash, $record, $keys, $vals_only, $keys_only );
- } elsif ( $patterns ) {
- $found = grab_patterns( $patterns, $record, $keys, $vals_only, $keys_only );
- } elsif ( $regex ) {
- $found = grab_regex( $regex, $record, $keys, $vals_only, $keys_only );
- } elsif ( $op ) {
- $found = grab_eval( $key, $op, $val, $record );
- }
-
- if ( $found and not $invert ) {
- Maasha::Biopieces::put_record( $record, $out );
- } elsif ( not $found and $invert ) {
- Maasha::Biopieces::put_record( $record, $out );
- }
- }
-}
-
-
sub script_compute
{
# Martin A. Hansen, August 2007.
}
-# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-
-
-sub grab_lookup
-{
- # Martin A. Hansen, November 2009.
-
- # Uses keys from a lookup hash to search records. Optionally, a list of
- # keys can be given so the lookup is limited to these, also, flags
- # can be given to limit lookup to keys or vals only. Returns 1 if lookup
- # succeeded, else 0.
-
- my ( $lookup_hash, # hashref with patterns
- $record, # hashref
- $keys, # list of keys - OPTIONAL
- $vals_only, # only vals flag - OPTIONAL
- $keys_only, # only keys flag - OPTIONAL
- ) = @_;
-
- # Returns boolean.
-
- if ( $keys )
- {
- map { return 1 if exists $lookup_hash->{ $record->{ $_ } } } @{ $keys };
- }
- else
- {
- if ( not $vals_only ) {
- map { return 1 if exists $lookup_hash->{ $_ } } keys %{ $record };
- }
-
- if ( not $keys_only ) {
- map { return 1 if exists $lookup_hash->{ $record->{ $_ } } } keys %{ $record };
- }
- }
-
- return 0;
-}
-
-
-sub grab_patterns
-{
- # Martin A. Hansen, November 2009.
-
- # Uses patterns to match records containing the pattern as a substring.
- # Returns 1 if the record is matched, else 0.
-
- my ( $patterns, # list of patterns
- $record, # hashref
- $keys, # list of keys - OPTIONAL
- $vals_only, # only vals flag - OPTIONAL
- $keys_only, # only keys flag - OPTIONAL
- ) = @_;
-
- # Returns boolean.
-
- my ( $pattern );
-
- foreach $pattern ( @{ $patterns } )
- {
- if ( $keys )
- {
- map { return 1 if index( $record->{ $_ }, $pattern ) >= 0 } @{ $keys };
- }
- else
- {
- if ( not $vals_only ) {
- map { return 1 if index( $_, $pattern ) >= 0 } keys %{ $record };
- }
-
- if ( not $keys_only ) {
- map { return 1 if index( $record->{ $_ }, $pattern ) >= 0 } keys %{ $record };
- }
- }
- }
-
- return 0;
-}
-
-
-sub grab_regex
-{
- # Martin A. Hansen, November 2009.
-
- # Uses regex to match records.
- # Returns 1 if the record is matched, else 0.
-
- my ( $regex, # regex to match
- $record, # hashref
- $keys, # list of keys - OPTIONAL
- $vals_only, # only vals flag - OPTIONAL
- $keys_only, # only keys flag - OPTIONAL
- ) = @_;
-
- # Returns boolean.
-
- if ( $keys )
- {
- map { return 1 if $record->{ $_ } =~ /$regex/ } @{ $keys };
- }
- else
- {
- if ( not $vals_only ) {
- map { return 1 if $_ =~ /$regex/ } keys %{ $record };
- }
-
- if ( not $keys_only ) {
- map { return 1 if $record->{ $_ } =~ /$regex/ } keys %{ $record };
- }
- }
-
- return 0;
-}
-
-
-sub grab_eval
-{
- # Martin A. Hansen, November 2009.
-
- # Test if the value of a given record key evaluates according
- # to a given operator. Returns 1 if eval is OK, else 0.
-
- my ( $key, # record key
- $op, # operator
- $val, # value
- $record, # hashref
- ) = @_;
-
- # Returns boolean.
-
- if ( defined $record->{ $key } )
- {
- return 1 if ( $op eq "<" and $record->{ $key } < $val );
- return 1 if ( $op eq ">" and $record->{ $key } > $val );
- return 1 if ( $op eq ">=" and $record->{ $key } >= $val );
- return 1 if ( $op eq "<=" and $record->{ $key } <= $val );
- return 1 if ( $op eq "=" and $record->{ $key } == $val );
- return 1 if ( $op eq "!=" and $record->{ $key } != $val );
- return 1 if ( $op eq "eq" and $record->{ $key } eq $val );
- return 1 if ( $op eq "ne" and $record->{ $key } ne $val );
- }
-
- return 0;
-}
-
-
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1;
__END__
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+use Getopt::Long qw( :config bundling );
+use Data::Dumper;
+use Maasha::Match;
+use Maasha::Common;
+use Maasha::Filesys;
use vars qw( @ISA @EXPORT_OK );
require Exporter;
$user = Maasha::Common::get_user();
$script = Maasha::Common::get_scriptname();
- $fh_global = Maasha::Common::append_open( "$ENV{ 'BP_LOG' }/biopieces.log" );
- $fh_local = Maasha::Common::append_open( "$ENV{ 'HOME' }/.biopieces.log" );
+ $fh_global = Maasha::Filesys::file_append_open( "$ENV{ 'BP_LOG' }/biopieces.log" );
+ $fh_local = Maasha::Filesys::file_append_open( "$ENV{ 'HOME' }/.biopieces.log" );
print $fh_global "$time_stamp\t$user\t$script ", join( " ", @ARGV ), "\n";
print $fh_local "$time_stamp\t$user\t$script ", join( " ", @ARGV ), "\n";
# Opens a stream to STDIN or a file,
- my ( $path, # path - OPTIONAL
+ my ( $file, # file - OPTIONAL
) = @_;
# Returns filehandle.
my ( $fh );
if ( not -t STDIN ) {
- $fh = Maasha::Common::read_stdin();
- } elsif ( not $path ) {
-# Maasha::Common::error( qq(no data stream) );
+ $fh = Maasha::Filesys::stdin_read();
+ } elsif ( not $file ) {
+ # Maasha::Common::error( qq(no data stream) );
} else {
- $fh = Maasha::Common::read_open( $path );
+ $fh = Maasha::Filesys::file_read_open( $file );
}
-# $fh->autoflush(1) if $fh; # Disable file buffer for debugging.
-
return $fh;
}
my ( $fh );
if ( $path ) {
- $fh = Maasha::Common::write_open( $path, $gzip );
+ $fh = Maasha::Filesys::file_write_open( $path, $gzip );
} else {
- $fh = Maasha::Common::write_stdout();
+ $fh = Maasha::Filesys::stdout_write();
}
return $fh;
}
+sub close_stream
+{
+ # Martin A. Hansen, May 2009.
+
+ # Close stream if open.
+
+ my ( $fh, # filehandle
+ ) = @_;
+
+ # Returns nothing.
+
+ close $fh if defined $fh;
+}
+
+
sub get_record
{
# Martin A. Hansen, July 2007.
my ( $block, @lines, $line, $key, $value, %record );
+ return if not defined $fh;
+
local $/ = "\n---\n";
$block = <$fh>;
- chomp $block;
-
return if not defined $block;
+ chomp $block;
+
@lines = split "\n", $block;
foreach $line ( @lines )
}
+sub parse_options
+{
+ # Martin A. Hansen, May 2009
+
+ #
+
+ my ( $arg_list, # data structure with argument description
+ ) = @_;
+
+ # Returns hashref.
+
+ my ( $arg, @list, $options );
+
+ # ---- Adding the mandatory arguments to the arg_list ----
+
+ push @{ $arg_list }, (
+
+ { long => 'help', short => '?', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+ { long => 'stream_in', short => 'I', type => 'file!', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+ { long => 'stream_out', short => 'O', type => 'file', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+ { long => 'verbose', short => 'v', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+ );
+
+ check_duplicates_args( $arg_list );
+
+ # ---- Compiling options list ----
+
+ foreach $arg ( @{ $arg_list } )
+ {
+ if ( $arg->{ 'type' } eq 'flag' ) {
+ push @list, "$arg->{ 'long' }|$arg->{ 'short' }";
+ } else {
+ push @list, "$arg->{ 'long' }|$arg->{ 'short' }=s";
+ }
+ }
+
+ # ---- Parsing options from @ARGV ----
+
+ $options = {};
+
+ Getopt::Long::GetOptions( $options, @list );
+
+ # print Dumper( $options );
+
+ check_print_usage( $options );
+
+ # ---- Expanding and checking options ----
+
+ foreach $arg ( @{ $arg_list } )
+ {
+ check_mandatory( $arg, $options );
+ set_default( $arg, $options );
+ check_uint( $arg, $options );
+ check_int( $arg, $options );
+ set_list( $arg, $options );
+ check_dir( $arg, $options );
+ check_file( $arg, $options );
+ set_files( $arg, $options );
+ check_files( $arg, $options );
+ check_allowed( $arg, $options );
+ check_disallowed( $arg, $options );
+ }
+
+ # print Dumper( $options );
+
+ # return wantarray ? $options : %{ $options }; # WTF! Someone changed the behaviour of wantarray???
+
+ return $options;
+}
+
+
+sub check_duplicates_args
+{
+ # Martin A. Hansen, May 2009
+
+ # Check if there are duplicate long or short arguments,
+ # and raise an error if so.
+
+ my ( $arg_list, # List of argument hashrefs,
+ ) = @_;
+
+ # Returns nothing.
+
+ my ( $arg, %check_hash );
+
+ foreach $arg ( @{ $arg_list } )
+ {
+ Maasha::Common::error( qq(Duplicate long argument: $arg->{ 'long' }) ) if exists $check_hash{ $arg->{ 'long' } };
+ Maasha::Common::error( qq(Duplicate short argument: $arg->{ 'short' }) ) if exists $check_hash{ $arg->{ 'short' } };
+
+ $check_hash{ $arg->{ 'long' } } = 1;
+ $check_hash{ $arg->{ 'short' } } = 1;
+ }
+}
+
+
+sub check_print_usage
+{
+ # Martin A. Hansen, May 2009.
+
+ # Check if we need to print usage and print usage
+ # and exit if that is the case.
+
+ my ( $options, # option hash
+ ) = @_;
+
+ # Returns nothing.
+
+ my ( $script, $wiki );
+
+ $script = Maasha::Common::get_scriptname();
+
+ if ( $script ne 'print_wiki' )
+ {
+ # if ( exists $options{ 'help' } or not ( exists $options{ 'stream_in' } or exists $options{ 'data_in' } or not -t STDIN ) )
+ if ( exists $options->{ 'help' } or ( scalar keys %{ $options } == 0 or exists $options->{ 'data_in' } or not -t STDIN ) )
+ {
+ $wiki = $ENV{ 'BP_DIR' } . "/bp_usage/$script.wiki";
+
+ if ( exists $options->{ 'help' } ) {
+ `print_wiki --data_in=$wiki --help`;
+ } elsif ( $script =~ /^(list_biopieces|list_genomes)$/ ) {
+ return;
+ } else {
+ `print_wiki --data_in=$wiki`;
+ }
+
+ exit;
+ }
+ }
+}
+
+
+sub check_mandatory
+{
+ # Martin A. Hansen, May 2009.
+
+ # Check if mandatory arguments are set and raises an error if not.
+
+ my ( $arg, # hashref
+ $options, # options hash
+ ) = @_;
+
+ # Returns nothing.
+
+ if ( $arg->{ 'mandatory' } eq 'yes' and not defined $options->{ $arg->{ 'long' } } ) {
+ Maasha::Common::error( qq(Argument --$arg->{ 'long' } is mandatory) );
+ }
+}
+
+
+sub set_default
+{
+ # Martin A. Hansen, May 2009.
+
+ # Set default values in option hash.
+
+ my ( $arg, # hashref
+ $options, # options hash
+ ) = @_;
+
+ # Returns nothing.
+
+ if ( not defined $options->{ $arg->{ 'long' } } ) {
+ $options->{ $arg->{ 'long' } } = $arg->{ 'default' }
+ }
+}
+
+
+sub check_uint
+{
+ # Martin A. Hansen, May 2009.
+
+ # Check if value to argument is an unsigned integer and
+ # raises an error if not.
+
+ my ( $arg, # hashref
+ $options, # options hash
+ ) = @_;
+
+ # Returns nothing.
+
+ if ( $arg->{ 'type' } eq 'uint' and defined $options->{ $arg->{ 'long' } } )
+ {
+ if ( $options->{ $arg->{ 'long' } } !~ /^\d+$/ ) {
+ Maasha::Common::error( qq(Argument --$arg->{ 'long' } must be an unsigned integer - not $options->{ $arg->{ 'long' } }) );
+ }
+ }
+}
+
+
+sub check_int
+{
+ # Martin A. Hansen, May 2009.
+
+ # Check if value to argument is an integer and
+ # raises an error if not.
+
+ my ( $arg, # hashref
+ $options, # options hash
+ ) = @_;
+
+ # Returns nothing.
+
+ if ( $arg->{ 'type' } eq 'int' and defined $options{ $arg->{ 'long' } } )
+ {
+ if ( $options->{ $arg->{ 'long' } } !~ /^-?\d+$/ ) {
+ Maasha::Common::error( qq(Argument --$arg->{ 'long' } must be an integer - not $options->{ $arg->{ 'long' } }) );
+ }
+ }
+}
+
+
+sub set_list
+{
+ # Martin A. Hansen, May 2009.
+
+ # Splits an argument of type 'list' into a list that is put
+ # in the options hash.
+
+ my ( $arg, # hashref
+ $options, # options hash
+ ) = @_;
+
+ # Returns nothing.
+
+ if ( $arg->{ 'type' } eq 'list' and defined $options->{ $arg->{ 'long' } } ) {
+ $options->{ $arg->{ 'long' } } = [ split /,/, $options->{ $arg->{ 'long' } } ];
+ }
+}
+
+
+sub check_dir
+{
+ # Martin A. Hansen, May 2009.
+
+ # Check if an argument of type 'dir!' truly is a directory and
+ # raises an error if not.
+
+ my ( $arg, # hashref
+ $options, # options hash
+ ) = @_;
+
+ # Returns nothing.
+
+ if ( $arg->{ 'type' } eq 'dir!' and defined $options->{ $arg->{ 'long' } } )
+ {
+ if ( not -d $options->{ $arg->{ 'long' } } ) {
+ Maasha::Common::error( qq(No such directory: "$options->{ $arg->{ 'long' } }") );
+ }
+ }
+}
+
+
+sub check_file
+{
+ # Martin A. Hansen, May 2009.
+
+ # Check if an argument of type 'file!' truly is a file and
+ # raises an error if not.
+
+ my ( $arg, # hashref
+ $options, # options hash
+ ) = @_;
+
+ # Returns nothing.
+
+ if ( $arg->{ 'type' } eq 'file!' and defined $options->{ $arg->{ 'long' } } )
+ {
+ if ( not -f $options->{ $arg->{ 'long' } } ) {
+ Maasha::Common::error( qq(No such file: "$options->{ $arg->{ 'long' } }") );
+ }
+ }
+}
+
+
+sub set_files
+{
+ # Martin A. Hansen, May 2009.
+
+ # Split the argument to 'files' into a list that is put on the options hash.
+
+ my ( $arg, # hashref
+ $options, # options hash
+ ) = @_;
+
+ # Returns nothing.
+
+ if ( $arg->{ 'type' } eq 'files' and defined $options->{ $arg->{ 'long' } } ) {
+ $options->{ $arg->{ 'long' } } = [ split /,/, $options->{ $arg->{ 'long' } } ];
+ }
+}
+
+
+sub check_files
+{
+ # Martin A. Hansen, May 2009.
+
+ # Split the argument to 'files!' and check if each file do exists before adding
+ # the file list to the options hash.
+
+ my ( $arg, # hashref
+ $options, # options hash
+ ) = @_;
+
+ # Returns nothing.
+
+ my ( $elem, @files );
+
+ if ( $arg->{ 'type' } eq 'files!' and defined $options->{ $arg->{ 'long' } } )
+ {
+ foreach $elem ( split /,/, $options->{ $arg->{ 'long' } } )
+ {
+ if ( -f $elem ) {
+ push @files, $elem;
+ } elsif ( $elem =~ /\*/ ) {
+ push @files, glob( $elem );
+ }
+ }
+
+ if ( scalar @files == 0 ) {
+ Maasha::Common::error( qq(Argument to --$arg->{ 'long' } must be a valid file or fileglob expression - not $options->{ $arg->{ 'long' } }) );
+ }
+
+ $options->{ $arg->{ 'long' } } = [ @files ];
+ }
+}
+
+
+sub check_allowed
+{
+ # Martin A. Hansen, May 2009.
+
+ # Check if all values to all arguement are allowed and raise an
+ # error if not.
+
+ my ( $arg, # hashref
+ $options, # options hash
+ ) = @_;
+
+ # Returns nothing.
+
+ my ( $elem );
+
+ if ( defined $arg->{ 'allowed' } and defined $options->{ $arg->{ 'long' } } )
+ {
+ map { $val_hash{ $_ } = 1 } split /,/, $arg->{ 'allowed' };
+
+ if ( $arg->{ 'type' } =~ /^(list|files|files!)$/ )
+ {
+ foreach $elem ( @{ $options->{ $arg->{ 'long' } } } )
+ {
+ if ( not exists $val_hash{ $elem } ) {
+ Maasha::Common::error( qq(Argument to --$arg->{ 'long' } $elem is not allowed) );
+ }
+ }
+ }
+ else
+ {
+ if ( not exists $val_hash{ $options->{ $arg->{ 'long' } } } ) {
+ Maasha::Common::error( qq(Argument to --$arg->{ 'long' } $options->{ $arg->{ 'long' } } is not allowed) );
+ }
+ }
+ }
+}
+
+
+sub check_disallowed
+{
+ # Martin A. Hansen, May 2009.
+
+ # Check if any values to all arguemnts are disallowed and raise an error if so.
+
+ my ( $arg, # hashref
+ $options, # options hash
+ ) = @_;
+
+ # Returns nothing.
+
+ my ( $val, %val_hash );
+
+ if ( defined $arg->{ 'disallowed' } and defined $options->{ $arg->{ 'long' } } )
+ {
+ foreach $val ( split /,/, $arg->{ 'disallowed' } )
+ {
+ if ( $options->{ $arg->{ 'long' } } eq $val ) {
+ Maasha::Common::error( qq(Argument to --$arg->{ 'long' } $val is disallowed) );
+ }
+ }
+ }
+}
+
+
sub getopt_files
{
# Martin A. Hansen, November 2007.
# print STDERR "signal->$sig<-\n";
+ my $script = Maasha::Common::get_scriptname();
+
chomp $sig;
sleep 1;
- if ( -d $BP_TMP )
+# if ( -d $BP_TMP )
{
if ( $sig =~ /MAASHA_ERROR/ ) {
print STDERR "\nProgram '$script' had an error" . " - Please wait for temporary data to be removed\n";
$curr_pid = Maasha::Common::get_processid();
- @dirs = Maasha::Common::ls_dirs( $tmpdir );
+ @dirs = Maasha::Filesys::ls_dirs( $tmpdir );
foreach $dir ( @dirs )
{
if ( not Maasha::Common::process_running( $pid ) )
{
# print STDERR "Removing stale dir: $dir\n";
- Maasha::Common::dir_remove( $dir );
+ Maasha::Filesys::dir_remove( $dir );
}
elsif ( $pid == $curr_pid )
{
# print STDERR "Removing current dir: $dir\n";
- Maasha::Common::dir_remove( $dir );
+ Maasha::Filesys::dir_remove( $dir );
}
}
}
}
+sub run_time
+{
+ # Martin A. Hansen, May 2009.
+
+ # Returns a precision timestamp for calculating
+ # run time.
+
+ return Maasha::Common::get_time_hires();
+}
+
+
+sub run_time_print
+{
+ # Martin A. Hansen, May 2009
+
+ # Print the run time to STDERR for the current script if
+ # the verbose switch is set in the option hash.
+
+ my ( $t0, # run time begin
+ $t1, # run time end
+ $options, # options hash
+ ) = @_;
+
+ # Returns nothing
+
+ my $script = Maasha::Common::get_scriptname();
+
+ print STDERR "Program: $script" . ( " " x ( 25 - length( $script ) ) ) . sprintf( "Run time: %.4f\n", ( $t1 - $t0 ) ) if $options->{ 'verbose' };
+
+}
+
+
+sub get_tmpdir
+{
+ # Martin A. Hansen, April 2008.
+
+ # Create a temporary directory based on
+ # $ENV{ 'BP_TMP' } and sessionid.
+
+ # this thing is a really bad solution and needs to be removed.
+
+ # Returns a path.
+
+ my ( $user, $sid, $pid, $path );
+
+ Maasha::Common::error( qq(no BP_TMP set in %ENV) ) if not -d $ENV{ 'BP_TMP' };
+
+ $user = Maasha::Common::get_user();
+ $sid = Maasha::Common::get_sessionid();
+ $pid = Maasha::Common::get_processid();
+
+ $path = "$ENV{ 'BP_TMP' }/" . join( "_", $user, $sid, $pid, "bp_tmp" );
+
+ Maasha::Filesys::dir_create( $path );
+
+ return $path;
+}
+
+
END
{
clean_tmp();
package Maasha::Blast;
-# Copyright (C) 2007 Martin A. Hansen.
+# Copyright (C) 2007-2009 Martin A. Hansen.
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
use Data::Dumper;
use Storable;
use IO::File;
+use Time::HiRes qw( gettimeofday );
use Maasha::Config;
use Exporter;
{
# Martin A. Hansen, January 2004.
- # read opens a file and returns a filehandle.
+ # Read opens a file and returns a filehandle.
my ( $path, # full path to file
) = @_;
my ( $fh, $type );
- $type = `file $path` if $path;
+ $type = `file $path` if -f $path;
if ( $type =~ /gzip compressed/ ) {
$fh = new IO::File "zcat $path|" or Maasha::Common::error( qq(Could not read-open file "$path": $!) );
}
+sub read_open_multi
+{
+ # Martin A. Hansen, May 2009.
+
+ # Cats a number of files and returns a filehandle.
+
+ my ( $files, # full path to file
+ ) = @_;
+
+ # returns filehandle
+
+ my ( $file, $fh, $type, %type_hash, $file_string );
+
+ foreach $file ( @{ $files } )
+ {
+ Maasha::Common::error( qq(No such file: $file) ) if not -f $file;
+
+ $type = `file $file`;
+
+ if ( $type =~ /gzip compressed/ ) {
+ $type_hash{ 'gzip' } = 1;
+ } else {
+ $type_hash{ 'ascii' } = 1;
+ }
+ }
+
+ Maasha::Common::error( qq(Mixture of zipped and unzipped files) ) if scalar keys %type_hash > 1;
+
+ $file_string = join " ", @{ $files };
+
+ if ( $type =~ /gzip compressed/ ) {
+ $fh = new IO::File "zcat $file_string|" or Maasha::Common::error( qq(Could not open pipe: $!) );
+ } else {
+ $fh = new IO::File "cat $file_string|" or Maasha::Common::error( qq(Could not open pipe: $!) );
+ }
+
+ return $fh;
+}
+
+
sub write_open
{
# Martin A. Hansen, January 2004.
}
-sub read_stdin
-{
- # Martin A. Hansen, July 2007.
-
- # Returns a filehandle to STDIN
-
- my ( $fh );
-
- $fh = new IO::File "<&STDIN" or Maasha::Common::error( qq(Could not read from STDIN: $!) );
-
- return $fh;
-}
-
-
-sub write_stdout
-{
- # Martin A. Hansen, July 2007.
-
- # Returns a filehandle to STDOUT
-
- my ( $fh );
-
- $fh = new IO::File ">&STDOUT" or Maasha::Common::error( qq(Could not write to STDOUT: $!) );
-
- return $fh;
-}
-
-
sub file_store
{
# Martin A. Hansen, December 2004.
}
-sub dir_create
-{
- # Martin A. Hansen, July 2007.
-
- # Creates a directory.
-
- my ( $path, # full path to dir
- ) = @_;
-
- # Returns nothing.
-
- if ( -d $path ) {
- Maasha::Common::error( qq(Directory already exists "$path": $!) );
- } else {
- mkdir $path or Maasha::Common::error( qq(Could not create directory "$path": $!) );
- }
-}
-
-
-sub dir_create_if_not_exists
-{
- # Martin A. Hansen, May 2008.
-
- # Creates a directory if it does not already exists.
-
- my ( $path, # full path to dir
- ) = @_;
-
- # Returns nothing.
-
- if ( not -d $path ) {
- mkdir $path or Maasha::Common::error( qq(Could not create directory "$path": $!) );
- }
-}
-
-
-sub dir_remove
-{
- # Martin A. Hansen, April 2008.
-
- # Removes a directory recursively.
-
- my ( $path, # directory
- ) = @_;
-
- Maasha::Common::run( "rm", "-rf $path" ) if -d $path;
-}
-
-
-sub ls_dirs
-{
- # Martin A. Hansen, June 2007.
-
- # returns all dirs in a given directory.
-
- my ( $path, # full path to directory
- ) = @_;
-
- # returns a list of filenames.
-
- my ( $dh, @dirs );
-
- $dh = open_dir( $path );
-
- @dirs = read_dir( $dh );
- @dirs = grep { -d "$path/$_" } @dirs;
-
- map { $_ = "$path/$_" } @dirs;
-
- close $dh;
-
- return wantarray ? @dirs : \@dirs;
-}
-
-
-sub ls_files
-{
- # Martin A. Hansen, June 2007.
-
- # returns all files in a given directory.
-
- my ( $path, # full path to directory
- ) = @_;
-
- # returns a list of filenames.
-
- my ( $dh, @files );
-
- $dh = open_dir( $path );
-
- @files = read_dir( $dh );
- @files = grep { -f "$path/$_" } @files;
-
- map { $_ = "$path/$_" } @files;
-
- close $dh;
-
- return wantarray ? @files : \@files;
-}
-
-
-sub open_dir
-{
- # Martin A. Hansen, June 2007.
-
- # open a directory and returns a directory handle
-
- use IO::Dir;
-
- my ( $path, # full path to directory
- ) = @_;
-
- # returns object
-
- my $dh;
-
- $dh = IO::Dir->new( $path ) or Maasha::Common::error( qq(Could not open dir "$path": $!) );
-
- return $dh;
-}
-
-
-sub read_dir
-{
- # Martin A. Hansen, June 2007.
-
- # read all files and directories from a directory.
-
- my ( $dh, # directory handle object
- ) = @_;
-
- # returns list
-
- my ( $elem, @elems );
-
- while ( defined( $elem = $dh->read ) ) {
- push @elems, $elem;
- }
-
- return wantarray ? @elems : \@elems;
-}
-
-
sub read_args
{
# Martin A. Hansen, December 2006
}
+sub get_time_hires
+{
+ # Martin A. Hansen, May 2008.
+
+ # Get current time in high resolution.
+
+ # Returns a float.
+
+ return gettimeofday();
+}
+
+
sub get_processid
{
# Martin A. Hansen, July 2008.
$path = "$ENV{ 'BP_TMP' }/" . join( "_", $user, $sid, $pid, "bp_tmp" );
- Maasha::Common::dir_create( $path );
+ Maasha::Filesys::dir_create( $path );
return $path;
}
}
-sub file_read
-{
- # Martin A. Hansen, December 2004.
-
- # given a file, a seek beg position and
- # length, returns the corresponding string.
-
- my ( $fh, # file handle to file
- $beg, # read start in file
- $len, # read length of block
- ) = @_;
-
- # returns string
-
- my ( $string );
-
- Maasha::Common::error( qq(Negative length: $len) ) if $len < 0;
-
- sysseek $fh, $beg, 0;
- sysread $fh, $string, $len;
-
- return $string;
-}
-
-
-sub file_size
-{
- # Martin A. Hansen, March 2007
-
- # returns the file size for a given file
-
- my ( $path, # full path to file
- ) = @_;
-
- # returns integer
-
- my $file_size = ( stat ( $path ) )[ 7 ];
-
- return $file_size;
-}
-
-
sub run
{
# Martin A. Hansen, April 2007.
package Maasha::Fasta;
-# Copyright (C) 2006 Martin A. Hansen.
+# Copyright (C) 2006-2009 Martin A. Hansen.
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
@ISA = qw( Exporter );
use constant {
- HEAD => 0,
+ SEQ_NAME => 0,
SEQ => 1,
};
# Returns nothing.
- Maasha::Common::error( qq(FASTA entry has no header) ) if not defined $entry->[ HEAD ];
+ Maasha::Common::error( qq(FASTA entry has no header) ) if not defined $entry->[ SEQ_NAME ];
Maasha::Common::error( qq(FASTA entry has no sequence) ) if not defined $entry->[ SEQ ];
if ( $wrap ) {
}
if ( defined $fh ) {
- print $fh ">$entry->[ HEAD ]\n$entry->[ SEQ ]\n";
+ print $fh ">$entry->[ SEQ_NAME ]\n$entry->[ SEQ ]\n";
} else {
- print ">$entry->[ HEAD ]\n$entry->[ SEQ ]\n";
+ print ">$entry->[ SEQ_NAME ]\n$entry->[ SEQ ]\n";
}
}
$fh = Maasha::Common::read_open( $path );
while ( $entry = get_entry( $fh ) ) {
- push @list, $entry->[ HEAD ];
+ push @list, $entry->[ SEQ_NAME ];
}
close $fh;
while ( $entry = get_entry( $fh ) )
{
- warn qq(WARNING: header->$entry->[ HEAD ] alread exists in index) if exists $hash{ $entry->[ HEAD ] };
+ warn qq(WARNING: header->$entry->[ SEQ_NAME ] alread exists in index) if exists $hash{ $entry->[ SEQ_NAME ] };
- $beg += $len + 2 + length $entry->[ HEAD ];
+ $beg += $len + 2 + length $entry->[ SEQ_NAME ];
$len = length $entry->[ SEQ ];
- push @index, [ $entry->[ HEAD ], $beg, $len ];
+ push @index, [ $entry->[ SEQ_NAME ], $beg, $len ];
- $hash{ $entry->[ HEAD ] } = 1;
+ $hash{ $entry->[ SEQ_NAME ] } = 1;
$beg++;
}
}
+sub fasta2biopiece
+{
+ # Martin A. Hansen, May 2009
+
+ # Convert a FASTA record to a Biopiece record.
+
+ my ( $entry, # FASTA entry
+ ) = @_;
+
+ # Returns a hashref.
+
+ my ( $record );
+
+ if ( defined $entry->[ SEQ_NAME ] and $entry->[ SEQ ] )
+ {
+ $record = {
+ SEQ_NAME => $entry->[ SEQ_NAME ],
+ SEQ => $entry->[ SEQ ],
+ SEQ_LEN => length $entry->[ SEQ ],
+ };
+ }
+
+ return wantarray ? %{ $record } : $record;
+}
+
+
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
}
+sub stdin_read
+{
+ # Martin A. Hansen, July 2007.
+
+ # Returns a filehandle to STDIN
+
+ my ( $fh );
+
+ $fh = new IO::File "<&STDIN" or Maasha::Common::error( qq(Could not read from STDIN: $!) );
+
+ return $fh;
+}
+
+
+sub stdout_write
+{
+ # Martin A. Hansen, July 2007.
+
+ # Returns a filehandle to STDOUT
+
+ my ( $fh );
+
+ $fh = new IO::File ">&STDOUT" or Maasha::Common::error( qq(Could not write to STDOUT: $!) );
+
+ return $fh;
+}
+
+
+sub file_read
+{
+ # Martin A. Hansen, December 2004.
+
+ # given a file, a seek beg position and
+ # length, returns the corresponding string.
+
+ my ( $fh, # file handle to file
+ $beg, # read start in file
+ $len, # read length of block
+ ) = @_;
+
+ # returns string
+
+ my ( $string );
+
+ Maasha::Common::error( qq(Negative length: $len) ) if $len < 0;
+
+ sysseek $fh, $beg, 0;
+ sysread $fh, $string, $len;
+
+ return $string;
+}
+
+
sub file_copy
{
# Martin A. Hansen, November 2008.
}
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DIRECTORIES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+sub dir_create
+{
+ # Martin A. Hansen, July 2007.
+
+ # Creates a directory.
+
+ my ( $path, # full path to dir
+ ) = @_;
+
+ # Returns nothing.
+
+ if ( -d $path ) {
+ Maasha::Common::error( qq(Directory already exists "$path": $!) );
+ } else {
+ mkdir $path or Maasha::Common::error( qq(Could not create directory "$path": $!) );
+ }
+}
+
+
+sub dir_create_if_not_exists
+{
+ # Martin A. Hansen, May 2008.
+
+ # Creates a directory if it does not already exists.
+
+ my ( $path, # full path to dir
+ ) = @_;
+
+ # Returns nothing.
+
+ if ( not -d $path ) {
+ mkdir $path or Maasha::Common::error( qq(Could not create directory "$path": $!) );
+ }
+}
+
+
+sub dir_remove
+{
+ # Martin A. Hansen, April 2008.
+
+ # Removes a directory recursively.
+
+ my ( $path, # directory
+ ) = @_;
+
+ Maasha::Common::run( "rm", "-rf $path" ) if -d $path;
+}
+
+
+sub ls_dirs
+{
+ # Martin A. Hansen, June 2007.
+
+ # returns all dirs in a given directory.
+
+ my ( $path, # full path to directory
+ ) = @_;
+
+ # returns a list of filenames.
+
+ my ( $dh, @dirs );
+
+ $dh = open_dir( $path );
+
+ @dirs = read_dir( $dh );
+ @dirs = grep { -d "$path/$_" } @dirs;
+
+ map { $_ = "$path/$_" } @dirs;
+
+ close $dh;
+
+ return wantarray ? @dirs : \@dirs;
+}
+
+
+sub ls_files
+{
+ # Martin A. Hansen, June 2007.
+
+ # returns all files in a given directory.
+
+ my ( $path, # full path to directory
+ ) = @_;
+
+ # returns a list of filenames.
+
+ my ( $dh, @files );
+
+ $dh = open_dir( $path );
+
+ @files = read_dir( $dh );
+ @files = grep { -f "$path/$_" } @files;
+
+ map { $_ = "$path/$_" } @files;
+
+ close $dh;
+
+ return wantarray ? @files : \@files;
+}
+
+
+sub open_dir
+{
+ # Martin A. Hansen, June 2007.
+
+ # open a directory and returns a directory handle
+
+ use IO::Dir;
+
+ my ( $path, # full path to directory
+ ) = @_;
+
+ # returns object
+
+ my $dh;
+
+ $dh = IO::Dir->new( $path ) or Maasha::Common::error( qq(Could not open dir "$path": $!) );
+
+ return $dh;
+}
+
+
+sub read_dir
+{
+ # Martin A. Hansen, June 2007.
+
+ # read all files and directories from a directory.
+
+ my ( $dh, # directory handle object
+ ) = @_;
+
+ # returns list
+
+ my ( $elem, @elems );
+
+ while ( defined( $elem = $dh->read ) ) {
+ push @elems, $elem;
+ }
+
+ return wantarray ? @elems : \@elems;
+}
+
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
use Data::Dumper;
use Term::ANSIColor;
use Maasha::Common;
+use Maasha::Filesys;
use vars qw ( @ISA @EXPORT );
@ISA = qw( Exporter );
my ( $fh, @lines, $i, $c, $section, $paragraph, @block, @output );
- $fh = Maasha::Common::read_open( $file );
+ $fh = Maasha::Filesys::file_read_open( $file );
@lines = <$fh>;
$c = $i;
- while ( $lines[ $c ] !~ /^\s*$/ )
+ while ( defined $lines[ $c ] and $lines[ $c ] !~ /^\s*$/ )
{
$paragraph .= " $lines[ $c ]";
use Data::Dumper;
use Storable qw( dclone );
use Maasha::Common;
+use Maasha::Filesys;
use Maasha::Fasta;
use Maasha::Seq;
use Maasha::Biopieces;
Maasha::Common::run( "mummer", "$arg $file_in1 $file_in2 > $file_out 2>/dev/null" );
- $fh = Maasha::Common::read_open( $file_out );
+ $fh = Maasha::Filesys::file_read_open( $file_out );
while ( $line = <$fh> )
{
}
-sub match_vmatch
-{
- # Martin A. Hansen, April 2008.
-
- # Vmatches a list of records against a list of index files and the full
- # path to the result file is returned.
-
- my ( $tmp_dir, # directory in where to save temp files
- $records, # list of records
- $index_files, # list of index files
- $options, # argument hash
- ) = @_;
-
- # Returns a string.
-
- my ( $query_file, $result_file, @result_files, $fh_in, $fh_out, $line, @fields, $i, $record, $vmatch_args, @index_names, @seq_names, $count_list, $entry );
-
- $query_file = "$tmp_dir/query.seq";
- $result_file = "$tmp_dir/vmatch.out";
-
- $fh_out = Maasha::Common::write_open( $query_file );
-
- foreach $record ( @{ $records } )
- {
- if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) )
- {
- next if length $entry->[ SEQ ] < 12; # assuming that the index is created for 12 as minimum length
-
- push @seq_names, $entry->[ SEQ_NAME ];
-
- Maasha::Fasta::put_entry( $entry, $fh_out, 100 );
- }
- }
-
- close $fh_out;
-
- if ( $options->{ 'genome' } ) {
- $vmatch_args = "-complete -d -p -q $query_file";
- } else {
- $vmatch_args = "-complete -d -p -showdesc 100 -q $query_file";
- }
-
- $vmatch_args .= " -h " . $options->{ "hamming_dist" } if $options->{ "hamming_dist" };
- $vmatch_args .= " -e " . $options->{ "edit_dist" } if $options->{ "edit_dist" };
-
- for ( $i = 0; $i < @{ $index_files }; $i++ )
- {
- Maasha::Common::run( "vmatch", "$vmatch_args $index_files->[ $i ] > $result_file.$i" );
-
- push @result_files, "$result_file.$i";
- }
-
- unlink $query_file;
-
- $count_list = vmatch_count_hits( \@result_files ) if ( $options->{ "count" } );
-
- $fh_out = Maasha::Common::write_open( $result_file );
-
- for ( $i = 0; $i < @{ $index_files }; $i++ )
- {
- $index_files->[ $i ] =~ s/.+\/(.+)\.fna$/$1/ if $options->{ 'genome' };
-
- $fh_in = Maasha::Common::read_open( "$result_file.$i" );
-
- while ( $line = <$fh_in> )
- {
- chomp $line;
-
- next if $line =~ /^#/;
-
- @fields = split " ", $line;
-
- next if $options->{ "max_hits" } and $count_list->[ $fields[ 5 ] ] > $options->{ 'max_hits' };
-
- $fields[ 1 ] = $index_files->[ $i ]; # S_ID
- $fields[ 9 ] = int $fields[ 9 ];
- $fields[ 9 ] = $count_list->[ $fields[ 5 ] ] if $options->{ "count" }; # SCORE
- $fields[ 5 ] = $seq_names[ $fields[ 5 ] ]; # Q_ID
-
- print $fh_out join( "\t", @fields ), "\n";
- }
-
- close $fh_in;
-
- unlink "$result_file.$i";
- }
-
- close $fh_out;
-
- return $result_file;
-}
-
-
-sub vmatch_count_hits
-{
- # Martin A. Hansen, April 2008.
-
- # Given a list of Vmatch result file, count duplications based
- # on q_id. The counts are returned in a list where the list index
- # corresponds to the q_id index in the query file.
-
- my ( $files, # vmatch result files
- ) = @_;
-
- # Returns a list.
-
- my ( $file, $fh_in, $line, @fields, @count_list );
-
- foreach $file ( @{ $files } )
- {
- $fh_in = Maasha::Common::read_open( $file );
-
- while ( $line = <$fh_in> )
- {
- chomp $line;
-
- next if $line =~ /^#/;
-
- @fields = split " ", $line;
-
- $count_list[ $fields[ 5 ] ]++;
- }
-
- close $fh_in;
- }
-
- return wantarray ? @count_list : \@count_list;
-}
-
-
-sub vmatch_get_entry
-{
- # Martin A. Hansen, January 2008.
-
- # Parses vmatch output records.
-
- my ( $fh, # file handle to vmatch result file.
- ) = @_;
-
- # Returns a hash.
-
- my ( $line, @fields, %record );
-
- while ( $line = <$fh> )
- {
- chomp $line;
-
- next if $line =~ /^#/;
-
- @fields = split "\t", $line;
-
- $record{ "REC_TYPE" } = "VMATCH";
-
- $record{ "S_LEN" } = $fields[ 0 ];
- $record{ "S_ID" } = $fields[ 1 ];
- $record{ "S_BEG" } = $fields[ 2 ];
-
- if ( $fields[ 3 ] eq "D" ) {
- $record{ "STRAND" } = "+";
- } else {
- $record{ "STRAND" } = "-";
- }
-
- $record{ "Q_LEN" } = $fields[ 4 ];
- $record{ "Q_ID" } = $fields[ 5 ];
- $record{ "Q_BEG" } = $fields[ 6 ];
- $record{ "MATCH_DIST" } = $fields[ 7 ];
- $record{ "E_VAL" } = $fields[ 8 ];
- $record{ "SCORE" } = $fields[ 9 ];
- $record{ "IDENT" } = $fields[ 10 ];
-
- $record{ "Q_END" } = $record{ "Q_BEG" } + $record{ "Q_LEN" } - 1;
- $record{ "S_END" } = $record{ "S_BEG" } + $record{ "S_LEN" } - 1;
-
- return wantarray ? %record : \%record;
- }
-}
-
-
sub vmatch_index
{
# Martin A. Hansen, July 2008.
my ( $fh, $entry, $tmp_file );
- Maasha::Common::dir_create_if_not_exists( $dst_dir );
+ Maasha::Filesys::dir_create_if_not_exists( $dst_dir );
# if ( Maasha::Common::file_size( $file ) < 200_000_000 )
# {
# }
# else
# {
- $fh = Maasha::Common::read_open( "$src_dir/$file" );
+ $fh = Maasha::Filesys::file_read_open( "$src_dir/$file" );
while ( $entry = Maasha::Fasta::get_entry( $fh ) )
{
$tmp_file = $entry->[ SEQ_NAME ] . ".fna";
- Maasha::Fasta::put_entries( [ $entry ], "$tmp_dir/$tmp_file" )
+ Maasha::Fasta::put_entries( [ $entry ], "$tmp_dir/$tmp_file" );
- &Maasha::Common::run( "mkvtree", "-db $tmp_dir/$tmp_file -dna -pl -allout -indexname $dst_dir/$tmp_file > /dev/null 3>&1" );
+ Maasha::Common::run( "mkvtree", "-db $tmp_dir/$tmp_file -dna -pl -allout -indexname $dst_dir/$tmp_file > /dev/null 3>&1" );
unlink "$tmp_dir/$tmp_file";
}