From: martinahansen Date: Sat, 23 May 2009 19:51:55 +0000 (+0000) Subject: rewrite begin X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=9854397f83abe8d5a8c614b799ef3107d3f85f80;p=biopieces.git rewrite begin git-svn-id: http://biopieces.googlecode.com/svn/trunk@381 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/bp_bin/blast_seq b/bp_bin/blast_seq index fdf5bd2..bcb9df7 100755 --- a/bp_bin/blast_seq +++ b/bp_bin/blast_seq @@ -1,6 +1,291 @@ -#!/usr/bin/env perl +#!/usr/bin/env perl -w + +# Copyright (C) 2007-2009 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# BLAST sequences in the stream against a specified database or genome. + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + -use warnings; use strict; +use Maasha::Biopieces; +use Maasha::Common; +use Maasha::Seq; +use Maasha::Fasta; + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +my ( $run_time_beg, $run_time_end, $options, $in, $out, $tmp_dir, $tmp_in, $tmp_out, $q_type, $s_type, $record, $entry, + $fh_in, $fh_out, $program ); + +$options = Maasha::Biopieces::parse_options( + [ + { long => 'database', short => 'd', type => 'file', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'genome', short => 'g', type => 'genome', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'program', short => 'p', type => 'string', mandatory => 'no', default => undef, allowed => 'blastn,blastp,tblastn,blastx,tblastx', disallowed => undef }, + { long => 'e_val', short => 'e', type => 'float', mandatory => 'no', default => 10, allowed => undef, disallowed => undef }, + { long => 'filter', short => 'f', type => 'string', mandatory => 'no', default => 'no', allowed => 'yes,no', disallowed => undef }, + { long => 'cpus', short => 'c', type => 'uint', mandatory => 'no', default => 1, allowed => undef, disallowed => 0 }, + ] +); + +Maasha::Common::error( qq(both --database and --genome specified) ) if $options->{ "genome" } and $options->{ "database" }; +Maasha::Common::error( qq(no --database or --genome specified) ) if not $options->{ "genome" } and not $options->{ "database" }; + +$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } ); +$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } ); + +$options->{ "database" } = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/blast/$options->{ 'genome' }.fna" if $options->{ 'genome' }; + +$tmp_dir = Maasha::Biopieces::get_tmpdir(); +$tmp_in = "$tmp_dir/blast_query.seq"; +$tmp_out = "$tmp_dir/blast.result"; + +$fh_out = Maasha::Filesys::file_write_open( $tmp_in ); + +while ( $record = Maasha::Biopieces::get_record( $in ) ) +{ + if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) + { + $q_type = Maasha::Seq::seq_guess_type( $record->{ 'SEQ' } ) if not $q_type; + + Maasha::Fasta::put_entry( $entry, $fh_out ); + } + + Maasha::Biopieces::put_record( $record, $out ); +} + +close $fh_out; + +$s_type = guess_database_type( $options->{ 'database' } ); + +$program = $options->{ 'program' } || guess_program( $q_type, $s_type ); + +if ( $options->{ 'filter' } eq 'yes' ) { + $options->{ 'filter' } = 'T'; +} else { + $options->{ 'filter' } = 'F'; +} + +if ( $options->{ 'verbose' } ) +{ + Maasha::Common::run( + "blastall", + join( " ", + "-p $program", + "-e $options->{ 'e_val' }", + "-a $options->{ 'cpus' }", + "-m 8", + "-i $tmp_in", + "-d $options->{ 'database' }", + "-F $options->{ 'filter' }", + "-o $tmp_out", + ), + 1 + ); +} +else +{ + Maasha::Common::run( + "blastall", + join( " ", + "-p $program", + "-e $options->{ 'e_val' }", + "-a $options->{ 'cpus' }", + "-m 8", + "-i $tmp_in", + "-d $options->{ 'database' }", + "-F $options->{ 'filter' }", + "-o $tmp_out", + "> /dev/null 2>&1" + ), + 1 + ); +} + +# unlink $tmp_in; + +$fh_in = Maasha::Filesys::file_read_open( $tmp_out ); + +while ( $entry = get_tab_entry( $fh_in ) ) +{ + $record = blast_tab2biopiece( $entry ); + + Maasha::Biopieces::put_record( $record, $out ); +} + +close $fh_out; + +# unlink $tmp_out; + +# Maasha::Filesys::dir_remove( $tmp_dir ); + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SUBROUTINES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +sub guess_database_type +{ + # Martin A. Hansen, May 2009 + + # Guess the type of BLAST database from the database + # filename assuming that it is a protein database if + # a .phr file exists. + + # Returns string; + if ( -f $options->{ 'database' } . ".phr" ) { + return "protein"; + } else { + return "nucleotide"; + } +} + + +sub guess_program +{ + # Martin A. Hansen, May 2009 + + # Guess what BLAST program to use based on the + # sequence type of the query and subject sequences. + + my ( $q_type, # query sequence type + $s_type, # subject sequence type + ) = @_; + + # Returns string. + + my ( $program ); + + if ( $q_type ne "protein" and $s_type ne "protein" ) { + $program = "blastn"; + } elsif ( $q_type eq "protein" and $s_type eq "protein" ) { + $program = "blastp"; + } elsif ( $q_type ne "protein" and $s_type eq "protein" ) { + $program = "blastx"; + } elsif ( $q_type eq "protein" and $s_type ne "protein" ) { + $program = "tblastn"; + } + + return $program; +} + + + +sub get_tab_entry +{ + # Martin A. Hansen, May 2009. + + # Get the next tabular entry from a filehandle to a BLAST file. + + my ( $fh, # filehandle + ) = @_; + + # Returns a list + + my ( $line, @fields ); + + while ( $line = <$fh> ) + { + next if $line =~ /^#/; + + @fields = split /\s+/, $line; + + use Data::Dumper; + + return wantarray ? @fields : \@fields; + } + + return undef; +} + + +sub blast_tab2biopiece +{ + # Martin A. Hansen, May 2009. + + # Get the next BLAST tabular entry and convert it to + # a biopiece record that is returned. + + my ( $entry, # BLAST tabular entry + ) = @_; + + # Returns a hashref. + + my ( %record ); + + $record{ "REC_TYPE" } = "BLAST"; + $record{ "Q_ID" } = $entry->[ 0 ]; + $record{ "S_ID" } = $entry->[ 1 ]; + $record{ "IDENT" } = $entry->[ 2 ]; + $record{ "ALIGN_LEN" } = $entry->[ 3 ]; + $record{ "MISMATCHES" } = $entry->[ 4 ]; + $record{ "GAPS" } = $entry->[ 5 ]; + $record{ "Q_BEG" } = $entry->[ 6 ] - 1; # BLAST is 1-based + $record{ "Q_END" } = $entry->[ 7 ] - 1; # BLAST is 1-based + $record{ "S_BEG" } = $entry->[ 8 ] - 1; # BLAST is 1-based + $record{ "S_END" } = $entry->[ 9 ] - 1; # BLAST is 1-based + $record{ "E_VAL" } = $entry->[ 10 ]; + $record{ "BIT_SCORE" } = $entry->[ 11 ]; + + if ( $record{ "S_BEG" } > $record{ "S_END" } ) + { + $record{ "STRAND" } = '-'; + + ( $record{ "S_BEG" }, $record{ "S_END" } ) = ( $record{ "S_END" }, $record{ "S_BEG" } ); + } + else + { + $record{ "STRAND" } = '+'; + } + + return wantarray ? %record : \%record; +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +BEGIN +{ + $run_time_beg = Maasha::Biopieces::run_time(); + + Maasha::Biopieces::log_biopiece(); +} + +END +{ + Maasha::Biopieces::close_stream( $in ); + Maasha::Biopieces::close_stream( $out ); + + $run_time_end = Maasha::Biopieces::run_time(); + + Maasha::Biopieces::run_time_print( $run_time_beg, $run_time_end, $options ); +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +__END__ -use Maasha::BioRun; diff --git a/bp_bin/create_blast_db b/bp_bin/create_blast_db index fdf5bd2..79bd310 100755 --- a/bp_bin/create_blast_db +++ b/bp_bin/create_blast_db @@ -1,6 +1,102 @@ -#!/usr/bin/env perl +#!/usr/bin/env perl -w + +# Copyright (C) 2007-2009 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# Create a BLAST database from sequences in stream for use with [blast_seq]. + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + -use warnings; use strict; +use Maasha::Seq; +use Maasha::Fasta; +use Maasha::Biopieces; + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +my ( $run_time_beg, $run_time_end, $options, $in, $out, $record, $fh, $entry, $seq_type, $path ); + +$options = Maasha::Biopieces::parse_options( + [ + { long => 'no_stream', short => 'x', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'database', short => 'd', type => 'file', mandatory => 'yes', default => undef, allowed => undef, disallowed => undef }, + ] +); + +$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } ); +$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } ); + +$path = $options->{ "database" }; + +$fh = Maasha::Common::write_open( $path ); + +while ( $record = Maasha::Biopieces::get_record( $in ) ) +{ + Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" }; + + if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) + { + $seq_type = Maasha::Seq::seq_guess_type( $record->{ 'SEQ' } ) if not $seq_type; + + Maasha::Fasta::put_entry( $entry, $fh ); + } +} + +close $fh; + +if ( $seq_type eq "protein" ) { + Maasha::Common::run( "formatdb", "-p T -i $path -t $options->{ 'database' } -l /dev/null" ); +} else { + Maasha::Common::run( "formatdb", "-p F -i $path -t $options->{ 'database' } -l /dev/null" ); +} + +unlink $path; + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +BEGIN +{ + $run_time_beg = Maasha::Biopieces::run_time(); + + Maasha::Biopieces::log_biopiece(); +} + +END +{ + Maasha::Biopieces::close_stream( $in ); + Maasha::Biopieces::close_stream( $out ); + + $run_time_end = Maasha::Biopieces::run_time(); + + Maasha::Biopieces::run_time_print( $run_time_beg, $run_time_end, $options ); +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +__END__ -use Maasha::BioRun; diff --git a/bp_bin/extract_seq b/bp_bin/extract_seq index fdf5bd2..bdd8104 100755 --- a/bp_bin/extract_seq +++ b/bp_bin/extract_seq @@ -1,6 +1,98 @@ -#!/usr/bin/env perl +#!/usr/bin/env perl -w + +# Copyright (C) 2007-2009 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# Split the values of a key into new key/value pairs. + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + -use warnings; use strict; +use Maasha::Biopieces; + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +my ( $run_time_beg, $run_time_end, $options, $in, $out, $record, @vals, $i ); + +$options = Maasha::Biopieces::parse_options( + [ + { long => 'key', short => 'k', type => 'string', mandatory => 'yes', default => undef, allowed => undef, disallowed => undef }, + { long => 'keys', short => 'K', type => 'list', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'delimit', short => 'd', type => 'string', mandatory => 'no', default => '_', allowed => undef, disallowed => undef }, + ] +); + +$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } ); +$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } ); + +while ( $record = Maasha::Biopieces::get_record( $in ) ) +{ + if ( exists $options->{ 'key' } and exists $record->{ $options->{ 'key' } } ) + { + @vals = split /$options->{ 'delimit' }/, $record->{ $options->{ 'key' } }; + + if ( scalar @vals > 1 ) + { + for ( $i = 0; $i < @vals; $i++ ) + { + if ( defined $options->{ "keys" } and defined $options->{ "keys" }->[ $i ] ) { + $record->{ $options->{ "keys" }->[ $i ] } = $vals[ $i ]; + } else { + $record->{ $options->{ 'key' } . "_$i" } = $vals[ $i ]; + } + } + } + } + + Maasha::Biopieces::put_record( $record, $out ); +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +BEGIN +{ + $run_time_beg = Maasha::Biopieces::run_time(); + + Maasha::Biopieces::log_biopiece(); +} + + +END +{ + Maasha::Biopieces::close_stream( $in ); + Maasha::Biopieces::close_stream( $out ); + + $run_time_end = Maasha::Biopieces::run_time(); + + Maasha::Biopieces::run_time_print( $run_time_beg, $run_time_end, $options ); +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +__END__ -use Maasha::BioRun; diff --git a/bp_bin/format_genome b/bp_bin/format_genome index fdf5bd2..5e5b8de 100755 --- a/bp_bin/format_genome +++ b/bp_bin/format_genome @@ -1,6 +1,142 @@ -#!/usr/bin/env perl +#!/usr/bin/env perl -w + +# Copyright (C) 2007-2009 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# Format a genome creating specified indexes. + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + -use warnings; use strict; +use Maasha::Fasta; +use Maasha::Biopieces; + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +my ( $run_time_beg, $run_time_end, $default, $formats, $options, $in, $out, $record, $data_out, $entry, + $genome, $dir, $fasta_dir, $phastcons_dir, $fh_out, $vals, $format, $tmp_dir ); + +$tmp_dir = Maasha::Biopieces::get_tmpdir(); +$default = $ENV{ 'BP_DATA' }; +$formats = 'fasta,blast,vmatch,phastcons'; + +$options = Maasha::Biopieces::parse_options( + [ + { long => 'no_stream', short => 'x', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'dir', short => 'd', type => 'dir!', mandatory => 'no', default => $default, allowed => undef, disallowed => undef }, + { long => 'genome', short => 'g', type => 'string', mandatory => 'yes', default => undef, allowed => undef, disallowed => undef }, + { long => 'formats', short => 'f', type => 'list', mandatory => 'yes', default => undef, allowed => $formats, disallowed => '0' }, + ] +); + +$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } ); +$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } ); + +$dir = $options->{ 'dir' }; +$genome = $options->{ 'genome' }; + +Maasha::Filesys::dir_create_if_not_exists( "$dir/genomes" ); +Maasha::Filesys::dir_create_if_not_exists( "$dir/genomes/$genome" ); + +if ( grep { $_ =~ /fasta|blast|vmatch/i } @{ $options->{ "formats" } } ) +{ + if ( -f "$dir/genomes/$genome/fasta/$genome.fna" ) + { + $fasta_dir = "$dir/genomes/$genome/fasta"; + } + else + { + Maasha::Filesys::dir_create_if_not_exists( "$dir/genomes/$genome/fasta" ); + + $fasta_dir = "$dir/genomes/$genome/fasta"; + + $fh_out = Maasha::Filesys::file_write_open( "$fasta_dir/$genome.fna" ); + } +} +elsif ( grep { $_ =~ /phastcons/i } @{ $options->{ "formats" } } ) +{ + Maasha::Filesys::dir_create_if_not_exists( "$dir/genomes/$genome/phastcons" ); + + $phastcons_dir = "$dir/genomes/$genome/phastcons"; + + $fh_out = Maasha::Filesys::file_write_open( "$phastcons_dir/$genome.pp" ); +} + +while ( $record = Maasha::Biopieces::get_record( $in ) ) +{ + if ( $fh_out and $entry = Maasha::Fasta::biopiece2fasta( $record ) ) + { + Maasha::Fasta::put_entry( $entry, $fh_out ); + } + elsif ( $fh_out and $record->{ "CHR" } and $record->{ "CHR_BEG" } and $record->{ "STEP" } and $record->{ "VALS" } ) + { + print $fh_out "fixedStep chrom=$record->{ 'CHR' } start=$record->{ 'CHR_BEG' } step=$record->{ 'STEP' }\n"; + + $vals = $record->{ 'VALS' }; + + $vals =~ tr/,/\n/; + + print $fh_out "$vals\n"; + } + + Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" }; +} + +foreach $format ( @{ $options->{ 'formats' } } ) +{ + if ( $format =~ /^fasta$/i ) { Maasha::Fasta::fasta_index( "$fasta_dir/$genome.fna", "$dir/genomes/$genome/fasta/$genome.index" ) } + elsif ( $format =~ /^blast$/i ) { Maasha::NCBI::blast_index( "$genome.fna", $fasta_dir, "$dir/genomes/$genome/blast", "dna", $genome ) } + elsif ( $format =~ /^blat$/i ) { print STDERR "BLAT FORMAT NOT IMPLEMENTED" } + elsif ( $format =~ /^vmatch$/i ) { Maasha::Match::vmatch_index( "$genome.fna", $fasta_dir, "$dir/genomes/$genome/vmatch", $tmp_dir ) } + elsif ( $format =~ /^phastcons$/i ) { Maasha::UCSC::phastcons_index( "$genome.pp", $phastcons_dir ) } +} + +close $fh_out if $fh_out; + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +BEGIN +{ + $run_time_beg = Maasha::Biopieces::run_time(); + + Maasha::Biopieces::log_biopiece(); +} + +END +{ + Maasha::Biopieces::close_stream( $in ); + Maasha::Biopieces::close_stream( $out ); + + $run_time_end = Maasha::Biopieces::run_time(); + + Maasha::Biopieces::run_time_print( $run_time_beg, $run_time_end, $options ); +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + -use Maasha::BioRun; +__END__ diff --git a/bp_bin/get_genome_seq b/bp_bin/get_genome_seq index fdf5bd2..cde18f9 100755 --- a/bp_bin/get_genome_seq +++ b/bp_bin/get_genome_seq @@ -1,6 +1,221 @@ -#!/usr/bin/env perl +#!/usr/bin/env perl -w + +# Copyright (C) 2007-2009 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# Extract subsequences from a genome sequence. + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + -use warnings; use strict; +use Maasha::Biopieces; +use Maasha::Filesys; +use Maasha::Seq; + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +my ( $run_time_beg, $run_time_end, $options, $in, $out, $record, $genome_file, $index_file, $fh, $genome, + $beg, $end, $len, $index_beg, $index_len, @begs, @lens, $index, $seq, $i, %lookup_hash ); + +$options = Maasha::Biopieces::parse_options( + [ + { long => 'genome', short => 'g', type => 'genome', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'chr', short => 'c', type => 'string', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'beg', short => 'b', type => 'uint', mandatory => 'no', default => undef, allowed => undef, disallowed => 0 }, + { long => 'end', short => 'e', type => 'uint', mandatory => 'no', default => undef, allowed => undef, disallowed => 0 }, + { long => 'len', short => 'l', type => 'uint', mandatory => 'no', default => undef, allowed => undef, disallowed => 0 }, + { long => 'flank', short => 'f', type => 'uint', mandatory => 'no', default => 0, allowed => undef, disallowed => undef }, + { long => 'mask', short => 'm', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'splice', short => 's', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + ] +); + +$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } ); +$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } ); + +if ( $options->{ "genome" } ) +{ + $genome = $options->{ "genome" }; + + $genome_file = "$ENV{ 'BP_DATA' }/genomes/$genome/fasta/$genome.fna"; + $index_file = "$ENV{ 'BP_DATA' }/genomes/$genome/fasta/$genome.index"; + + $fh = Maasha::Filesys::file_read_open( $genome_file ); + $index = Maasha::Fasta::index_retrieve( $index_file ); + + shift @{ $index }; # Get rid of the file size info + + map { $lookup_hash{ $_->[ 0 ] } = [ $_->[ 1 ], $_->[ 2 ] ] } @{ $index }; + + if ( exists $lookup_hash{ $options->{ "chr" } } and defined $options->{ "beg" } and ( defined $options->{ "end" } or defined $options->{ "len" } ) ) + { + ( $index_beg, $index_len ) = @{ $lookup_hash{ $options->{ "chr" } } }; + + $beg = $index_beg + $options->{ "beg" } - 1; + + if ( $options->{ "len" } ) { + $len = $options->{ "len" }; + } elsif ( $options->{ "end" } ) { + $len = ( $options->{ "end" } - $options->{ "beg" } + 1 ); + } + + $beg -= $options->{ "flank" }; + $len += 2 * $options->{ "flank" }; + + if ( $beg <= $index_beg ) + { + $len -= $index_beg - $beg; + $beg = $index_beg; + } + + $len = $index_beg + $index_len - $beg if $beg + $len > $index_beg + $index_len; + + next if $beg > $index_beg + $index_len; + + $record->{ "CHR" } = $options->{ "chr" }; + $record->{ "CHR_BEG" } = $beg - $index_beg; + $record->{ "CHR_END" } = $record->{ "CHR_BEG" } + $len - 1; + + $record->{ "SEQ" } = Maasha::Filesys::file_read( $fh, $beg, $len ); + $record->{ "SEQ_LEN" } = $len; + + Maasha::Biopieces::put_record( $record, $out ); + } +} + + +while ( $record = Maasha::Biopieces::get_record( $in ) ) +{ + if ( $options->{ "genome" } and not $record->{ "SEQ" } ) + { + if ( $record->{ "REC_TYPE" } eq "BED" and exists $lookup_hash{ $record->{ "CHR" } } ) + { + ( $index_beg, $index_len ) = @{ $lookup_hash{ $record->{ "CHR" } } }; + + $beg = $record->{ "CHR_BEG" } + $index_beg; + $len = $record->{ "CHR_END" } - $record->{ "CHR_BEG" } + 1; + } + elsif ( $record->{ "REC_TYPE" } eq "PSL" and exists $lookup_hash{ $record->{ "S_ID" } } ) + { + ( $index_beg, $index_len ) = @{ $lookup_hash{ $record->{ "S_ID" } } }; + + $beg = $record->{ "S_BEG" } + $index_beg; + $len = $record->{ "S_END" } - $record->{ "S_BEG" } + 1; + } + elsif ( $record->{ "REC_TYPE" } eq "BLAST" and exists $lookup_hash{ $record->{ "S_ID" } } ) + { + ( $index_beg, $index_len ) = @{ $lookup_hash{ $record->{ "S_ID" } } }; + + $beg = $record->{ "S_BEG" } + $index_beg; + $len = $record->{ "S_END" } - $record->{ "S_BEG" } + 1; + } + + $beg -= $options->{ "flank" }; + $len += 2 * $options->{ "flank" }; + + if ( $beg <= $index_beg ) + { + $len -= $index_beg - $beg; + $beg = $index_beg; + } + + $len = $index_beg + $index_len - $beg if $beg + $len > $index_beg + $index_len; + + next if $beg > $index_beg + $index_len; + + $record->{ "CHR_BEG" } = $beg - $index_beg; + $record->{ "CHR_END" } = $record->{ "CHR_BEG" } + $len - 1; + + $record->{ "SEQ" } = Maasha::Filesys::file_read( $fh, $beg, $len ); + + if ( $record->{ "STRAND" } and $record->{ "STRAND" } eq "-" ) + { + Maasha::Seq::dna_comp( \$record->{ "SEQ" } ); + $record->{ "SEQ" } = reverse $record->{ "SEQ" }; + } + + if ( $options->{ "mask" } ) + { + if ( $record->{ "BLOCK_COUNT" } > 1 ) # uppercase hit block segments and lowercase the rest. + { + $record->{ "SEQ" } = lc $record->{ "SEQ" }; + + @begs = split ",", $record->{ "Q_BEGS" }; + @lens = split ",", $record->{ "BLOCK_LENS" }; + + for ( $i = 0; $i < @begs; $i++ ) { + substr $record->{ "SEQ" }, $begs[ $i ], $lens[ $i ], uc substr $record->{ "SEQ" }, $begs[ $i ], $lens[ $i ]; + } + } + } + elsif ( $options->{ "splice" } ) + { + if ( $record->{ "BLOCK_COUNT" } > 1 ) # splice block sequences + { + $seq = ""; + @begs = split ",", $record->{ "Q_BEGS" }; + @lens = split ",", $record->{ "BLOCK_LENS" }; + + for ( $i = 0; $i < @begs; $i++ ) { + $seq .= substr $record->{ "SEQ" }, $begs[ $i ], $lens[ $i ]; + } + + $record->{ "SEQ" } = $seq; + } + } + + $record->{ "SEQ_LEN" } = length $record->{ "SEQ" }; + } + + Maasha::Biopieces::put_record( $record, $out ); +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +BEGIN +{ + $run_time_beg = Maasha::Biopieces::run_time(); + + Maasha::Biopieces::log_biopiece(); +} + + +END +{ + Maasha::Biopieces::close_stream( $in ); + Maasha::Biopieces::close_stream( $out ); + + $run_time_end = Maasha::Biopieces::run_time(); + + Maasha::Biopieces::run_time_print( $run_time_beg, $run_time_end, $options ); +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +__END__ -use Maasha::BioRun; diff --git a/bp_bin/grab b/bp_bin/grab index fdf5bd2..6d2aabf 100755 --- a/bp_bin/grab +++ b/bp_bin/grab @@ -1,6 +1,298 @@ -#!/usr/bin/env perl +#!/usr/bin/env perl -w + +# Copyright (C) 2007-2009 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# Grab records in stream. + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + -use warnings; use strict; +use Maasha::Biopieces; +use Maasha::Common; +use Maasha::Patscan; + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +my ( $run_time_beg, $run_time_end, $options, $in, $out, $record, $keys, $vals_only, $keys_only, $invert, + $patterns, $regex, %lookup_hash, $key, $op, $val, $found ); + +$options = Maasha::Biopieces::parse_options( + [ + { long => 'patterns', short => 'p', type => 'list', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'patterns_in', short => 'P', type => 'file!', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'regex', short => 'r', type => 'string', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'eval', short => 'e', type => 'string', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'exact_in', short => 'E', type => 'file!', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'invert', short => 'i', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'case_insensitive', short => 'c', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'keys', short => 'k', type => 'list', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'keys_only', short => 'K', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'vals_only', short => 'V', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + ] +); + +$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } ); +$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } ); + +$keys = $options->{ 'keys' }; +$vals_only = $options->{ 'vals_only' }; +$keys_only = $options->{ 'keys_only' }; +$invert = $options->{ 'invert' }; + +if ( $options->{ 'patterns' } ) +{ + $patterns = $options->{ 'patterns' }; +} +elsif ( defined $options->{ 'patterns_in' } and -f $options->{ 'patterns_in' } ) +{ + $patterns = Maasha::Patscan::read_patterns( $options->{ 'patterns_in' } ); +} +elsif ( $options->{ 'regex' } ) +{ + if ( $options->{ 'case_insensitive' } ) { + $regex = qr/$options->{ 'regex' }/i; + } else { + $regex = qr/$options->{ 'regex' }/; + } +} +elsif ( defined $options->{ 'exact_in' } and -f $options->{ 'exact_in' } ) +{ + $patterns = Maasha::Patscan::read_patterns( $options->{ 'exact_in' } ); + + map { $lookup_hash{ $_ } = 1 } @{ $patterns }; + + undef $patterns; +} +elsif ( $options->{ 'eval' } ) +{ + if ( $options->{ 'eval' } =~ /^([^><=! ]+)\s*(>=|<=|>|<|==|=|!=|eq|ne)\s*(.+)$/ ) + { + $key = $1; + $op = $2; + $val = $3; + } + else + { + Maasha::Common::error( qq(Bad eval string: $options->{ 'eval' }) ); + } +} + +while ( $record = Maasha::Biopieces::get_record( $in ) ) +{ + $found = 0; + + if ( %lookup_hash ) { + $found = grab_lookup( \%lookup_hash, $record, $keys, $vals_only, $keys_only ); + } elsif ( $patterns ) { + $found = grab_patterns( $patterns, $record, $keys, $vals_only, $keys_only ); + } elsif ( $regex ) { + $found = grab_regex( $regex, $record, $keys, $vals_only, $keys_only ); + } elsif ( $op ) { + $found = grab_eval( $key, $op, $val, $record ); + } + + if ( $found and not $invert ) { + Maasha::Biopieces::put_record( $record, $out ); + } elsif ( not $found and $invert ) { + Maasha::Biopieces::put_record( $record, $out ); + } +} + + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SUBROUTINES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +sub grab_lookup +{ + # Martin A. Hansen, November 2009. + + # Uses keys from a lookup hash to search records. Optionally, a list of + # keys can be given so the lookup is limited to these, also, flags + # can be given to limit lookup to keys or vals only. Returns 1 if lookup + # succeeded, else 0. + + my ( $lookup_hash, # hashref with patterns + $record, # hashref + $keys, # list of keys - OPTIONAL + $vals_only, # only vals flag - OPTIONAL + $keys_only, # only keys flag - OPTIONAL + ) = @_; + + # Returns boolean. + + if ( $keys ) + { + map { return 1 if exists $lookup_hash->{ $record->{ $_ } } } @{ $keys }; + } + else + { + if ( not $vals_only ) { + map { return 1 if exists $lookup_hash->{ $_ } } keys %{ $record }; + } + + if ( not $keys_only ) { + map { return 1 if exists $lookup_hash->{ $record->{ $_ } } } keys %{ $record }; + } + } + + return 0; +} + + +sub grab_patterns +{ + # Martin A. Hansen, November 2009. + + # Uses patterns to match records containing the pattern as a substring. + # Returns 1 if the record is matched, else 0. + + my ( $patterns, # list of patterns + $record, # hashref + $keys, # list of keys - OPTIONAL + $vals_only, # only vals flag - OPTIONAL + $keys_only, # only keys flag - OPTIONAL + ) = @_; + + # Returns boolean. + + my ( $pattern ); + + foreach $pattern ( @{ $patterns } ) + { + if ( $keys ) + { + map { return 1 if index( $record->{ $_ }, $pattern ) >= 0 } @{ $keys }; + } + else + { + if ( not $vals_only ) { + map { return 1 if index( $_, $pattern ) >= 0 } keys %{ $record }; + } + + if ( not $keys_only ) { + map { return 1 if index( $record->{ $_ }, $pattern ) >= 0 } keys %{ $record }; + } + } + } + + return 0; +} + + +sub grab_regex +{ + # Martin A. Hansen, November 2009. + + # Uses regex to match records. + # Returns 1 if the record is matched, else 0. + + my ( $regex, # regex to match + $record, # hashref + $keys, # list of keys - OPTIONAL + $vals_only, # only vals flag - OPTIONAL + $keys_only, # only keys flag - OPTIONAL + ) = @_; + + # Returns boolean. + + if ( $keys ) + { + map { return 1 if $record->{ $_ } =~ /$regex/ } @{ $keys }; + } + else + { + if ( not $vals_only ) { + map { return 1 if $_ =~ /$regex/ } keys %{ $record }; + } + + if ( not $keys_only ) { + map { return 1 if $record->{ $_ } =~ /$regex/ } keys %{ $record }; + } + } + + return 0; +} + + +sub grab_eval +{ + # Martin A. Hansen, November 2009. + + # Test if the value of a given record key evaluates according + # to a given operator. Returns 1 if eval is OK, else 0. + + my ( $key, # record key + $op, # operator + $val, # value + $record, # hashref + ) = @_; + + # Returns boolean. + + if ( defined $record->{ $key } ) + { + return 1 if ( $op eq "<" and $record->{ $key } < $val ); + return 1 if ( $op eq ">" and $record->{ $key } > $val ); + return 1 if ( $op eq ">=" and $record->{ $key } >= $val ); + return 1 if ( $op eq "<=" and $record->{ $key } <= $val ); + return 1 if ( $op eq "=" and $record->{ $key } == $val ); + return 1 if ( $op eq "==" and $record->{ $key } == $val ); + return 1 if ( $op eq "!=" and $record->{ $key } != $val ); + return 1 if ( $op eq "eq" and $record->{ $key } eq $val ); + return 1 if ( $op eq "ne" and $record->{ $key } ne $val ); + } + + return 0; +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +BEGIN +{ + $run_time_beg = Maasha::Biopieces::run_time(); + + Maasha::Biopieces::log_biopiece(); +} + + +END +{ + Maasha::Biopieces::close_stream( $in ); + Maasha::Biopieces::close_stream( $out ); + + $run_time_end = Maasha::Biopieces::run_time(); + + Maasha::Biopieces::run_time_print( $run_time_beg, $run_time_end, $options ); +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +__END__ -use Maasha::BioRun; diff --git a/bp_bin/list_biopieces b/bp_bin/list_biopieces index fdf5bd2..b00e5af 100755 --- a/bp_bin/list_biopieces +++ b/bp_bin/list_biopieces @@ -1,6 +1,83 @@ -#!/usr/bin/env perl +#!/usr/bin/env perl -w + +# Copyright (C) 2007-2009 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# Print summary for all Biopices. + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + -use warnings; use strict; +use Maasha::Common; +use Maasha::Biopieces; +use Maasha::Gwiki; + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +my ( $run_time_beg, $run_time_end, $options, @files, $file, $wiki, $program, $summary ); + +$options = Maasha::Biopieces::parse_options(); + +@files = Maasha::Filesys::ls_files( "$ENV{ 'BP_DIR' }/bp_usage" ); + +foreach $file ( sort @files ) +{ + if ( $file =~ /\/([a-z0-9_]+)\.wiki$/ ) + { + $program = $1; + + $wiki = Maasha::Gwiki::gwiki_read( $file ); + + $summary = $wiki->[ 0 ]->[ 0 ]->{ 'TEXT' }; + $summary =~ s/^#summary\s+//; + + printf( "%-30s%s\n", $program, $summary ); + } +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +BEGIN +{ + $run_time_beg = Maasha::Biopieces::run_time(); + + Maasha::Biopieces::log_biopiece(); +} + +END +{ + $run_time_end = Maasha::Biopieces::run_time(); + + Maasha::Biopieces::run_time_print( $run_time_beg, $run_time_end, $options ); +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +__END__ + -use Maasha::BioRun; diff --git a/bp_bin/list_genomes b/bp_bin/list_genomes index fdf5bd2..b0e77a6 100755 --- a/bp_bin/list_genomes +++ b/bp_bin/list_genomes @@ -1,6 +1,106 @@ -#!/usr/bin/env perl +#!/usr/bin/env perl -w + +# Copyright (C) 2007-2009 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# Display available genomes. + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + -use warnings; use strict; +use Maasha::Common; +use Maasha::Biopieces; + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +my ( $run_time_beg, $run_time_end, $options, @genomes, $genome, @formats, $format, %hash, %found, @row ); + +$options = Maasha::Biopieces::parse_options(); + +@genomes = Maasha::Filesys::ls_dirs( "$ENV{ 'BP_DATA' }/genomes" ); + +foreach $genome ( @genomes ) +{ + next if $genome =~ /\.$/; + + @formats = Maasha::Filesys::ls_dirs( $genome ); + + foreach $format ( @formats ) + { + if ( $format =~ /\/([^\/]+)\/(\w+)$/ ) + { + $hash{ $1 }{ $2 } = 1; + + $found{ $2 } = 1; + } + } +} + +@row = "Genome"; + +map { push @row, $_ } sort keys %found; + +print join( "\t", @row ), "\n"; + +foreach $genome ( sort keys %hash ) +{ + @row = $genome; + + foreach $format ( sort keys %found ) + { + if ( exists $hash{ $genome }{ $format } ) { + push @row, "yes"; + } else { + push @row, "no"; + } + } + + print join( "\t", @row ), "\n"; +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +BEGIN +{ + $run_time_beg = Maasha::Biopieces::run_time(); + + Maasha::Biopieces::log_biopiece(); +} + +END +{ + $run_time_end = Maasha::Biopieces::run_time(); + + Maasha::Biopieces::run_time_print( $run_time_beg, $run_time_end, $options ); +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +__END__ + -use Maasha::BioRun; diff --git a/bp_bin/read_fasta b/bp_bin/read_fasta index fdf5bd2..092e0c5 100755 --- a/bp_bin/read_fasta +++ b/bp_bin/read_fasta @@ -1,6 +1,99 @@ -#!/usr/bin/env perl +#!/usr/bin/env perl -w + +# Copyright (C) 2007-2009 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# Read FASTA entries from one or more files. + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + -use warnings; use strict; +use Maasha::Common; +use Maasha::Fasta; +use Maasha::Biopieces; + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +my ( $run_time_beg, $run_time_end, $options, $in, $out, $record, $data_in, $num, $entry ); + +$options = Maasha::Biopieces::parse_options( + [ + { long => 'data_in', short => 'i', type => 'files!', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'num', short => 'n', type => 'uint', mandatory => 'no', default => undef, allowed => undef, disallowed => '0' }, + ] +); + +$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } ); +$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } ); + +while ( $record = Maasha::Biopieces::get_record( $in ) ) { + Maasha::Biopieces::put_record( $record, $out ); +} + +if ( $options->{ 'data_in' } ) +{ + $data_in = Maasha::Common::read_open_multi( $options->{ 'data_in' } ); + + $num = 1; + + while ( $entry = Maasha::Fasta::get_entry( $data_in ) ) + { + if ( $record = Maasha::Fasta::fasta2biopiece( $entry ) ) { + Maasha::Biopieces::put_record( $record, $out ); + } + + last if $options->{ "num" } and $num == $options->{ "num" }; + + $num++; + } + + close $data_in; +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +BEGIN +{ + $run_time_beg = Maasha::Biopieces::run_time(); + + Maasha::Biopieces::log_biopiece(); +} + +END +{ + Maasha::Biopieces::close_stream( $in ); + Maasha::Biopieces::close_stream( $out ); + + $run_time_end = Maasha::Biopieces::run_time(); + + Maasha::Biopieces::run_time_print( $run_time_beg, $run_time_end, $options ); +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + -use Maasha::BioRun; +__END__ diff --git a/bp_bin/read_tab b/bp_bin/read_tab index fdf5bd2..b6c82ab 100755 --- a/bp_bin/read_tab +++ b/bp_bin/read_tab @@ -1,6 +1,136 @@ -#!/usr/bin/env perl +#!/usr/bin/env perl -w + +# Copyright (C) 2007-2009 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# Read tabular data. + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + -use warnings; use strict; +use Maasha::Common; +use Maasha::Fasta; +use Maasha::Biopieces; + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +my ( $run_time_beg, $run_time_end, $options, $in, $out, $record, $data_in, $num, $skip, $line, @fields, @fields2, $i ); + +$options = Maasha::Biopieces::parse_options( + [ + { long => 'data_in', short => 'i', type => 'files!', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'delimit', short => 'd', type => 'string', mandatory => 'no', default => '\s+', allowed => undef, disallowed => undef }, + { long => 'cols', short => 'c', type => 'list', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'keys', short => 'k', type => 'list', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'skip', short => 's', type => 'uint', mandatory => 'no', default => '0', allowed => undef, disallowed => undef }, + { long => 'num', short => 'n', type => 'uint', mandatory => 'no', default => undef, allowed => undef, disallowed => '0' }, + ] +); + +$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } ); +$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } ); + +while ( $record = Maasha::Biopieces::get_record( $in ) ) { + Maasha::Biopieces::put_record( $record, $out ); +} + +if ( $options->{ 'data_in' } ) +{ + $data_in = Maasha::Common::read_open_multi( $options->{ 'data_in' } ); + + $num = 1; + $skip = $options->{ 'skip' }; + + while ( $line = <$data_in> ) + { + if ( $skip ) + { + $skip--; + next; + } + + next if $line =~ /^#|^$/; + + chomp $line; + + undef $record; + undef @fields2; + + @fields = split /$options->{'delimit'}/, $line; + + if ( $options->{ "cols" } ) { + map { push @fields2, $fields[ $_ ] } @{ $options->{ "cols" } }; + } else { + @fields2 = @fields; + } + + for ( $i = 0; $i < @fields2; $i++ ) + { + if ( $options->{ "keys" }->[ $i ] ) { + $record->{ $options->{ "keys" }->[ $i ] } = $fields2[ $i ]; + } else { + $record->{ "V" . $i } = $fields2[ $i ]; + } + } + + Maasha::Biopieces::put_record( $record, $out ); + + last if $options->{ "num" } and $num == $options->{ "num" }; + + $num++; + } + + close $data_in; +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +BEGIN +{ + $run_time_beg = Maasha::Biopieces::run_time(); + + Maasha::Biopieces::log_biopiece(); +} + +END +{ + Maasha::Biopieces::close_stream( $in ); + Maasha::Biopieces::close_stream( $out ); + + $run_time_end = Maasha::Biopieces::run_time(); + + Maasha::Biopieces::run_time_print( $run_time_beg, $run_time_end, $options ); +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +__END__ + + + -use Maasha::BioRun; diff --git a/bp_bin/split_vals b/bp_bin/split_vals index fdb06ce..bdd8104 100755 --- a/bp_bin/split_vals +++ b/bp_bin/split_vals @@ -1,5 +1,31 @@ #!/usr/bin/env perl -w +# Copyright (C) 2007-2009 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# Split the values of a key into new key/value pairs. + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + use strict; use Maasha::Biopieces; @@ -9,10 +35,6 @@ use Maasha::Biopieces; my ( $run_time_beg, $run_time_end, $options, $in, $out, $record, @vals, $i ); -$run_time_beg = Maasha::Biopieces::run_time(); - -Maasha::Biopieces::log_biopiece(); - $options = Maasha::Biopieces::parse_options( [ { long => 'key', short => 'k', type => 'string', mandatory => 'yes', default => undef, allowed => undef, disallowed => undef }, @@ -29,13 +51,16 @@ while ( $record = Maasha::Biopieces::get_record( $in ) ) if ( exists $options->{ 'key' } and exists $record->{ $options->{ 'key' } } ) { @vals = split /$options->{ 'delimit' }/, $record->{ $options->{ 'key' } }; - - for ( $i = 0; $i < @vals; $i++ ) + + if ( scalar @vals > 1 ) { - if ( defined $options->{ "keys" } and defined $options->{ "keys" }->[ $i ] ) { - $record->{ $options->{ "keys" }->[ $i ] } = $vals[ $i ]; - } else { - $record->{ $options->{ 'key' } . "_$i" } = $vals[ $i ]; + for ( $i = 0; $i < @vals; $i++ ) + { + if ( defined $options->{ "keys" } and defined $options->{ "keys" }->[ $i ] ) { + $record->{ $options->{ "keys" }->[ $i ] } = $vals[ $i ]; + } else { + $record->{ $options->{ 'key' } . "_$i" } = $vals[ $i ]; + } } } } @@ -43,12 +68,27 @@ while ( $record = Maasha::Biopieces::get_record( $in ) ) Maasha::Biopieces::put_record( $record, $out ); } -close $in; -close $out; -$run_time_end = Maasha::Biopieces::run_time(); +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +BEGIN +{ + $run_time_beg = Maasha::Biopieces::run_time(); + + Maasha::Biopieces::log_biopiece(); +} + + +END +{ + Maasha::Biopieces::close_stream( $in ); + Maasha::Biopieces::close_stream( $out ); + + $run_time_end = Maasha::Biopieces::run_time(); -Maasha::Biopieces::run_time_print( $run_time_beg, $run_time_end, $options ); + Maasha::Biopieces::run_time_print( $run_time_beg, $run_time_end, $options ); +} # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< diff --git a/bp_bin/uppercase_seq b/bp_bin/uppercase_seq index fdf5bd2..774cfe7 100755 --- a/bp_bin/uppercase_seq +++ b/bp_bin/uppercase_seq @@ -1,6 +1,76 @@ -#!/usr/bin/env perl +#!/usr/bin/env perl -w + +# Copyright (C) 2007-2009 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# Uppercases sequences in stream. + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + -use warnings; use strict; +use Maasha::Common; +use Maasha::Fasta; +use Maasha::Biopieces; + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +my ( $run_time_beg, $run_time_end, $options, $in, $out, $record ); + +$options = Maasha::Biopieces::parse_options(); + +$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } ); +$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } ); + +while ( $record = Maasha::Biopieces::get_record( $in ) ) +{ + $record->{ "SEQ" } = uc $record->{ "SEQ" } if $record->{ "SEQ" }; + + Maasha::Biopieces::put_record( $record, $out ); +} + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +BEGIN +{ + $run_time_beg = Maasha::Biopieces::run_time(); + + Maasha::Biopieces::log_biopiece(); +} + +END +{ + Maasha::Biopieces::close_stream( $in ); + Maasha::Biopieces::close_stream( $out ); + + $run_time_end = Maasha::Biopieces::run_time(); + + Maasha::Biopieces::run_time_print( $run_time_beg, $run_time_end, $options ); +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + -use Maasha::BioRun; +__END__ diff --git a/bp_bin/vmatch_seq b/bp_bin/vmatch_seq index fdf5bd2..96d4263 100755 --- a/bp_bin/vmatch_seq +++ b/bp_bin/vmatch_seq @@ -1,6 +1,297 @@ -#!/usr/bin/env perl +#!/usr/bin/env perl -w + +# Copyright (C) 2007-2009 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# Vmatch sequences in the stream against a specified database. + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + -use warnings; use strict; +use Data::Dumper; +use Maasha::Biopieces; +use Maasha::Common; +use Maasha::Filesys; +use Maasha::Seq; +use Maasha::Fasta; + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +my ( $run_time_beg, $run_time_end, $options, $in, $out, @index_files, $tmp_dir, $query_file, $fh_out, $fh_in, $record, $entry, + $vmatch_args, $result_file, $count_hash ); + +$options = Maasha::Biopieces::parse_options( + [ + { long => 'genome', short => 'g', type => 'genome', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'index_name', short => 'i', type => 'file', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'count', short => 'c', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'max_hits', short => 'm', type => 'uint', mandatory => 'no', default => undef, allowed => undef, disallowed => 0 }, + { long => 'hamming_dist', short => 'h', type => 'uint', mandatory => 'no', default => undef, allowed => undef, disallowed => 0 }, + { long => 'edit_dist', short => 'e', type => 'uint', mandatory => 'no', default => undef, allowed => undef, disallowed => 0 }, + ] +); + +$options->{ 'count' } = 1 if $options->{ 'max_hits' }; + +Maasha::Common::error( qq(both --index_hame and --genome specified) ) if $options->{ "genome" } and $options->{ "index_name" }; +Maasha::Common::error( qq(no --index_name or --genome specified) ) if not $options->{ "genome" } and not $options->{ "index_name" }; + +$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } ); +$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } ); + +@index_files = get_index_files( $options ); + +$tmp_dir = Maasha::Biopieces::get_tmpdir(); + +$query_file = "$tmp_dir/query.seq"; +$result_file = "$tmp_dir/vmatch.out"; + +$fh_out = Maasha::Filesys::file_write_open( $query_file ); + +while ( $record = Maasha::Biopieces::get_record( $in ) ) +{ + if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) + { + next if length $record->{ 'SEQ' } < 12; # assuming that the index is created for 12 as minimum length + + Maasha::Fasta::put_entry( $entry, $fh_out, 100 ); + } + + Maasha::Biopieces::put_record( $record, $out ); # must this be here? +} + +close $fh_out; + +$vmatch_args = vmatch_args( $options, $query_file ); + +map { Maasha::Common::run( "vmatch", "$vmatch_args $_ >> $result_file" ) } @index_files; + +unlink $query_file; + +$fh_in = Maasha::Filesys::file_read_open( $result_file ); + +if ( $options->{ "count" } ) +{ + $count_hash = vmatch_count_hits( $result_file ) if ( $options->{ "count" } ); + + if ( $options->{ "max_hits" } ) + { + while ( $record = vmatch_get_entry( $fh_in ) ) + { + $record->{ 'SCORE' } = $count_hash->{ $record->{ 'Q_ID' } }; + + next if $record->{ 'SCORE' } > $options->{ 'max_hits' }; + + Maasha::Biopieces::put_record( $record, $out ); + } + + } + else + { + while ( $record = vmatch_get_entry( $fh_in ) ) + { + $record->{ 'SCORE' } = $count_hash->{ $record->{ 'Q_ID' } }; + + Maasha::Biopieces::put_record( $record, $out ); + } + } +} +else +{ + while ( $record = vmatch_get_entry( $fh_in ) ) { + Maasha::Biopieces::put_record( $record, $out ); + } +} + +close $fh_in; + +unlink $result_file; + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SUBROUTINES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +sub get_index_files +{ + # Martin A. Hansen, May 2009. + + # Get the index_files from the index_name or genome options. + + my ( $options, # hashref + ) = @_; + + # Returns a list. + + my ( @index_files, %hash ); + + if ( $options->{ "index_name" } ) + { + @index_files = $options->{ "index_name" }; + } + else + { + @index_files = Maasha::Filesys::ls_files( "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/vmatch" ); + + map { $_ =~ /^(.+)\.[a-z1]{3,4}$/; $hash{ $1 } = 1 } @index_files; + + @index_files = sort keys %hash; + } + + return wantarray ? @index_files : \@index_files; +} + + +sub vmatch_args +{ + # Martin A. Hansen, May 2009. + + # Given an option hashref and query file, compiles the arguments for running + # Vmatch on the commandline. + + my ( $options, # hashref + $query_file, # path to query file + ) = @_; + + # Returns a string. + + my ( $vmatch_args ); + + $vmatch_args = "-complete -d -p -showdesc 100 -q $query_file"; + + $vmatch_args .= " -h " . $options->{ "hamming_dist" } if $options->{ "hamming_dist" }; + $vmatch_args .= " -e " . $options->{ "edit_dist" } if $options->{ "edit_dist" }; + + return $vmatch_args; +} + + +sub vmatch_count_hits +{ + # Martin A. Hansen, April 2008. + + # Given a Vmatch result files, count duplications based + # on q_id. The counts are returned in a hash. + + my ( $file, # vmatch result file + ) = @_; + + # Returns a list. + + my ( $fh_in, $line, @fields, %count_hash ); + + $fh_in = Maasha::Filesys::file_read_open( $file ); + + while ( $line = <$fh_in> ) + { + chomp $line; + + next if $line =~ /^#/; + + @fields = split " ", $line; + + $count_hash{ $fields[ 5 ] }++; + } + + close $fh_in; + + return wantarray ? %count_hash : \%count_hash; +} + + +sub vmatch_get_entry +{ + # Martin A. Hansen, January 2008. + + # Parses vmatch output records. + + my ( $fh, # file handle to vmatch result file. + ) = @_; + + # Returns a hash. + + my ( $line, @fields, %record ); + + while ( $line = <$fh> ) + { + chomp $line; + + next if $line =~ /^#/; + $line =~ s/^\s+//; + + @fields = split " ", $line; + + $record{ "REC_TYPE" } = "VMATCH"; + + $record{ "S_LEN" } = $fields[ 0 ]; + $record{ "S_ID" } = $fields[ 1 ]; + $record{ "S_BEG" } = $fields[ 2 ]; + + if ( $fields[ 3 ] eq "D" ) { + $record{ "STRAND" } = "+"; + } else { + $record{ "STRAND" } = "-"; + } + + $record{ "Q_LEN" } = $fields[ 4 ]; + $record{ "Q_ID" } = $fields[ 5 ]; + $record{ "Q_BEG" } = $fields[ 6 ]; + $record{ "MATCH_DIST" } = $fields[ 7 ]; + $record{ "E_VAL" } = $fields[ 8 ]; + $record{ "SCORE" } = $fields[ 9 ]; + $record{ "IDENT" } = $fields[ 10 ]; + + $record{ "Q_END" } = $record{ "Q_BEG" } + $record{ "Q_LEN" } - 1; + $record{ "S_END" } = $record{ "S_BEG" } + $record{ "S_LEN" } - 1; + + return wantarray ? %record : \%record; + } +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +BEGIN +{ + $run_time_beg = Maasha::Biopieces::run_time(); + + Maasha::Biopieces::log_biopiece(); +} + +END +{ + Maasha::Biopieces::close_stream( $in ); + Maasha::Biopieces::close_stream( $out ); + + $run_time_end = Maasha::Biopieces::run_time(); + + Maasha::Biopieces::run_time_print( $run_time_beg, $run_time_end, $options ); +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +__END__ -use Maasha::BioRun; diff --git a/bp_bin/write_blast b/bp_bin/write_blast index fdf5bd2..f72f718 100755 --- a/bp_bin/write_blast +++ b/bp_bin/write_blast @@ -1,6 +1,145 @@ -#!/usr/bin/env perl +#!/usr/bin/env perl -w + +# Copyright (C) 2007-2009 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# Write tabular BLAST output from the stream. + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + -use warnings; use strict; +use Maasha::Fasta; +use Maasha::Biopieces; + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +my ( $run_time_beg, $run_time_end, $options, $in, $out, $record, $first, $entry, $data_out ); + +$options = Maasha::Biopieces::parse_options( + [ + { long => 'no_stream', short => 'x', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'data_out', short => 'o', type => 'file', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'comment', short => 'c', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'compress', short => 'Z', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + ] +); + +$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } ); +$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } ); +$data_out = Maasha::Biopieces::write_stream( $options->{ "data_out" }, $options->{ "compress" } ) ; + +$first = 1; + +while ( $record = Maasha::Biopieces::get_record( $in ) ) +{ + if ( $entry = biopiece2blast_tab( $record ) ) + { + if ( $options->{ "comment" } and $first ) + { + print $data_out "# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score\n"; + + $first = 0; + } + + print $data_out join( "\t", @{ $entry } ), "\n"; + } + + Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" }; +} + +close $data_out; + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +sub biopiece2blast_tab +{ + # Martin A. Hansen, May 2009. + + # Convert a Biopiece record to a BLAST tabular entry. + # Note that we correct positions because BLAST is 1-based. + + my ( $record, # hashref + ) = @_; + + # Returns a list. + + my ( @fields, $s_beg, $s_end ); + + return undef if not defined $record->{ 'REC_TYPE' } or not $record->{ 'REC_TYPE' } eq "BLAST"; + + if ( $record->{ "STRAND" } eq '+' ) + { + $s_beg = $record->{ "S_BEG" }; + $s_end = $record->{ "S_END" }; + } + else + { + $s_beg = $record->{ "S_END" }; + $s_end = $record->{ "S_BEG" }; + } + + $fields[ 0 ] = $record->{ "Q_ID" }; + $fields[ 1 ] = $record->{ "S_ID" }; + $fields[ 2 ] = $record->{ "IDENT" }; + $fields[ 3 ] = $record->{ "ALIGN_LEN" }; + $fields[ 4 ] = $record->{ "MISMATCHES" }; + $fields[ 5 ] = $record->{ "GAPS" }; + $fields[ 6 ] = $record->{ "Q_BEG" } + 1; + $fields[ 7 ] = $record->{ "Q_END" } + 1; + $fields[ 8 ] = $s_beg + 1; + $fields[ 9 ] = $s_end + 1; + $fields[ 10 ] = $record->{ "E_VAL" }; + $fields[ 11 ] = $record->{ "BIT_SCORE" }; + + return wantarray ? @fields : \@fields; +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +BEGIN +{ + $run_time_beg = Maasha::Biopieces::run_time(); + + Maasha::Biopieces::log_biopiece(); +} + +END +{ + Maasha::Biopieces::close_stream( $in ); + Maasha::Biopieces::close_stream( $out ); + + $run_time_end = Maasha::Biopieces::run_time(); + + Maasha::Biopieces::run_time_print( $run_time_beg, $run_time_end, $options ); +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + -use Maasha::BioRun; +__END__ diff --git a/bp_bin/write_fasta b/bp_bin/write_fasta index fdf5bd2..51cfec0 100755 --- a/bp_bin/write_fasta +++ b/bp_bin/write_fasta @@ -1,6 +1,89 @@ -#!/usr/bin/env perl +#!/usr/bin/env perl -w + +# Copyright (C) 2007-2009 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# Write sequences from stream in FASTA format. + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + -use warnings; use strict; +use Maasha::Fasta; +use Maasha::Biopieces; + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +my ( $run_time_beg, $run_time_end, $options, $in, $out, $record, $data_out, $entry ); + +$options = Maasha::Biopieces::parse_options( + [ + { long => 'no_stream', short => 'x', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'data_out', short => 'o', type => 'file', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'wrap', short => 'w', type => 'uint', mandatory => 'no', default => undef, allowed => undef, disallowed => '0' }, + { long => 'compress', short => 'Z', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + ] +); + +$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } ); +$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } ); + +$data_out = Maasha::Biopieces::write_stream( $options->{ "data_out" }, $options->{ "compress" } ); + +while ( $record = Maasha::Biopieces::get_record( $in ) ) +{ + if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) { + Maasha::Fasta::put_entry( $entry, $data_out, $options->{ "wrap" } ); + } + + Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" }; +} + +close $data_out; + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +BEGIN +{ + $run_time_beg = Maasha::Biopieces::run_time(); + + Maasha::Biopieces::log_biopiece(); +} + +END +{ + Maasha::Biopieces::close_stream( $in ); + Maasha::Biopieces::close_stream( $out ); + + $run_time_end = Maasha::Biopieces::run_time(); + + Maasha::Biopieces::run_time_print( $run_time_beg, $run_time_end, $options ); +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + -use Maasha::BioRun; +__END__ diff --git a/bp_bin/write_tab b/bp_bin/write_tab index fdf5bd2..bc6fbd9 100755 --- a/bp_bin/write_tab +++ b/bp_bin/write_tab @@ -1,6 +1,151 @@ -#!/usr/bin/env perl +#!/usr/bin/env perl -w + +# Copyright (C) 2007-2009 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# Write tabular output from the stream. + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + -use warnings; use strict; +use Maasha::Fasta; +use Maasha::Biopieces; + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +my ( $run_time_beg, $run_time_end, $options, $in, $out, $record, $data_out, @vals, $ok, $key, @keys, $A, $B, %no_keys, $sort_keys, $check_sort ); + +$options = Maasha::Biopieces::parse_options( + [ + { long => 'no_stream', short => 'x', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'data_out', short => 'o', type => 'file', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'comment', short => 'c', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'delimit', short => 'd', type => 'string', mandatory => 'no', default => "\t", allowed => undef, disallowed => undef }, + { long => 'keys', short => 'k', type => 'list', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'no_keys', short => 'K', type => 'list', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'compress', short => 'Z', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + ] +); + +$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } ); +$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } ); +$data_out = Maasha::Biopieces::write_stream( $options->{ "data_out" }, $options->{ "compress" } ); + +map { $no_keys{ $_ } = 1 } @{ $options->{ "no_keys" } }; + +while ( $record = Maasha::Biopieces::get_record( $in ) ) +{ + undef @vals; + $ok = 1; + + if ( $options->{ "keys" } ) + { + map { $ok = 0 if not exists $record->{ $_ } } @{ $options->{ "keys" } }; + + if ( $ok ) + { + foreach $key ( @{ $options->{ "keys" } } ) + { + if ( exists $record->{ $key } ) + { + push @keys, $key if $options->{ "comment" }; + push @vals, $record->{ $key }; + } + } + } + } + else + { + if ( not $check_sort ) + { + $sort_keys = 1; + + map { $sort_keys = 0 if $_ !~ /^V(\d+)$/ } keys %{ $record }; + + $check_sort = 1; + } + + if ( $sort_keys ) + { + foreach $key ( sort { $A = $a; $B = $b; $A =~ s/^V(\d+)$/$1/; $B =~ s/^V(\d+)$/$1/; $A <=> $B } keys %{ $record } ) + { + next if exists $no_keys{ $key }; + + push @keys, $key if $options->{ "comment" }; + push @vals, $record->{ $key }; + } + } + else + { + foreach $key ( keys %{ $record } ) + { + next if exists $no_keys{ $key }; + + push @keys, $key if $options->{ "comment" }; + push @vals, $record->{ $key }; + } + + } + } + + if ( @keys and $options->{ "comment" } ) + { + print $data_out "#", join( $options->{ "delimit" }, @keys ), "\n"; + + delete $options->{ "comment" }; + } + + print $data_out join( $options->{ "delimit" }, @vals ), "\n" if @vals; + + Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" }; +} + +close $data_out; + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +BEGIN +{ + $run_time_beg = Maasha::Biopieces::run_time(); + + Maasha::Biopieces::log_biopiece(); +} + +END +{ + Maasha::Biopieces::close_stream( $in ); + Maasha::Biopieces::close_stream( $out ); + + $run_time_end = Maasha::Biopieces::run_time(); + + Maasha::Biopieces::run_time_print( $run_time_beg, $run_time_end, $options ); +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + -use Maasha::BioRun; +__END__ diff --git a/code_perl/Maasha/BioRun.pm b/code_perl/Maasha/BioRun.pm index 0f8cf30..60b7717 100644 --- a/code_perl/Maasha/BioRun.pm +++ b/code_perl/Maasha/BioRun.pm @@ -110,18 +110,12 @@ sub run_script $options->{ "SCRIPT" } = $script; - if ( $script ne "list_biopieces" and $script ne "list_genomes" ) { - $script = "print_usage" if ( -t STDIN and keys %{ $options } <= 1 or $options->{ 'help' } ); - } + $script = "print_usage" if ( -t STDIN and keys %{ $options } <= 1 or $options->{ 'help' } ); $in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } ); $out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } ); if ( $script eq "print_usage" ) { script_print_usage( $in, $out, $options ) } - elsif ( $script eq "list_biopieces" ) { script_list_biopieces( $in, $out, $options ) } - elsif ( $script eq "list_genomes" ) { script_list_genomes( $in, $out, $options ) } - elsif ( $script eq "read_fasta" ) { script_read_fasta( $in, $out, $options ) } - elsif ( $script eq "read_tab" ) { script_read_tab( $in, $out, $options ) } elsif ( $script eq "read_psl" ) { script_read_psl( $in, $out, $options ) } elsif ( $script eq "read_bed" ) { script_read_bed( $in, $out, $options ) } elsif ( $script eq "read_fixedstep" ) { script_read_fixedstep( $in, $out, $options ) } @@ -137,7 +131,6 @@ sub run_script elsif ( $script eq "read_mysql" ) { script_read_mysql( $in, $out, $options ) } elsif ( $script eq "read_ucsc_config" ) { script_read_ucsc_config( $in, $out, $options ) } elsif ( $script eq "assemble_tag_contigs" ) { script_assemble_tag_contigs( $in, $out, $options ) } - elsif ( $script eq "format_genome" ) { script_format_genome( $in, $out, $options ) } elsif ( $script eq "length_seq" ) { script_length_seq( $in, $out, $options ) } elsif ( $script eq "uppercase_seq" ) { script_uppercase_seq( $in, $out, $options ) } elsif ( $script eq "shuffle_seq" ) { script_shuffle_seq( $in, $out, $options ) } @@ -155,7 +148,6 @@ sub run_script elsif ( $script eq "transliterate_vals" ) { script_transliterate_vals( $in, $out, $options ) } elsif ( $script eq "translate_seq" ) { script_translate_seq( $in, $out, $options ) } elsif ( $script eq "extract_seq" ) { script_extract_seq( $in, $out, $options ) } - elsif ( $script eq "get_genome_seq" ) { script_get_genome_seq( $in, $out, $options ) } elsif ( $script eq "get_genome_align" ) { script_get_genome_align( $in, $out, $options ) } elsif ( $script eq "get_genome_phastcons" ) { script_get_genome_phastcons( $in, $out, $options ) } elsif ( $script eq "fold_seq" ) { script_fold_seq( $in, $out, $options ) } @@ -165,17 +157,11 @@ sub run_script elsif ( $script eq "tile_seq" ) { script_tile_seq( $in, $out, $options ) } elsif ( $script eq "invert_align" ) { script_invert_align( $in, $out, $options ) } elsif ( $script eq "patscan_seq" ) { script_patscan_seq( $in, $out, $options ) } - elsif ( $script eq "create_blast_db" ) { script_create_blast_db( $in, $out, $options ) } - elsif ( $script eq "blast_seq" ) { script_blast_seq( $in, $out, $options ) } elsif ( $script eq "blat_seq" ) { script_blat_seq( $in, $out, $options ) } elsif ( $script eq "soap_seq" ) { script_soap_seq( $in, $out, $options ) } elsif ( $script eq "match_seq" ) { script_match_seq( $in, $out, $options ) } elsif ( $script eq "create_vmatch_index" ) { script_create_vmatch_index( $in, $out, $options ) } - elsif ( $script eq "vmatch_seq" ) { script_vmatch_seq( $in, $out, $options ) } - elsif ( $script eq "write_fasta" ) { script_write_fasta( $in, $out, $options ) } elsif ( $script eq "write_align" ) { script_write_align( $in, $out, $options ) } - elsif ( $script eq "write_blast" ) { script_write_blast( $in, $out, $options ) } - elsif ( $script eq "write_tab" ) { script_write_tab( $in, $out, $options ) } elsif ( $script eq "write_bed" ) { script_write_bed( $in, $out, $options ) } elsif ( $script eq "write_psl" ) { script_write_psl( $in, $out, $options ) } elsif ( $script eq "write_fixedstep" ) { script_write_fixedstep( $in, $out, $options ) } @@ -191,7 +177,6 @@ sub run_script elsif ( $script eq "uniq_vals" ) { script_uniq_vals( $in, $out, $options ) } elsif ( $script eq "merge_vals" ) { script_merge_vals( $in, $out, $options ) } elsif ( $script eq "merge_records" ) { script_merge_records( $in, $out, $options ) } - elsif ( $script eq "grab" ) { script_grab( $in, $out, $options ) } elsif ( $script eq "compute" ) { script_compute( $in, $out, $options ) } elsif ( $script eq "flip_tab" ) { script_flip_tab( $in, $out, $options ) } elsif ( $script eq "add_ident" ) { script_add_ident( $in, $out, $options ) } @@ -244,24 +229,6 @@ sub get_options data_in|i=s ); } - elsif ( $script eq "read_fasta" ) - { - @options = qw( - data_in|i=s - num|n=s - ); - } - elsif ( $script eq "read_tab" ) - { - @options = qw( - data_in|i=s - delimit|d=s - cols|c=s - keys|k=s - skip|s=s - num|n=s - ); - } elsif ( $script eq "read_psl" ) { @options = qw( @@ -388,15 +355,6 @@ sub get_options check|C ); } - elsif ( $script eq "format_genome" ) - { - @options = qw( - no_stream|x - dir|d=s - genome|g=s - formats|f=s - ); - } elsif ( $script eq "length_seq" ) { @options = qw( @@ -448,19 +406,6 @@ sub get_options len|l=s ); } - elsif ( $script eq "get_genome_seq" ) - { - @options = qw( - genome|g=s - chr|c=s - beg|b=s - end|e=s - len|l=s - flank|f=s - mask|m - splice|s - ); - } elsif ( $script eq "get_genome_align" ) { @options = qw( @@ -521,25 +466,6 @@ sub get_options genome|g=s ); } - elsif ( $script eq "create_blast_db" ) - { - @options = qw( - no_stream|x - database|d=s - ); - } - elsif ( $script eq "blast_seq" ) - { - @options = qw( - database|d=s - genome|g=s - program|p=s - e_val|e=f - filter|f - cpus|c=s - no_filter|F - ); - } elsif ( $script eq "blat_seq" ) { @options = qw( @@ -578,26 +504,6 @@ sub get_options no_stream|x ); } - elsif ( $script eq "vmatch_seq" ) - { - @options = qw( - genome|g=s - index_name|i=s - count|c - max_hits|m=s - hamming_dist|h=s - edit_dist|e=s - ); - } - elsif ( $script eq "write_fasta" ) - { - @options = qw( - wrap|w=s - no_stream|x - data_out|o=s - compress|Z - ); - } elsif ( $script eq "write_align" ) { @options = qw( @@ -608,27 +514,6 @@ sub get_options data_out|o=s ); } - elsif ( $script eq "write_blast" ) - { - @options = qw( - no_stream|x - data_out|o=s - comment|c - compress|Z - ); - } - elsif ( $script eq "write_tab" ) - { - @options = qw( - no_stream|x - data_out|o=s - delimit|d=s - keys|k=s - no_keys|K=s - comment|c - compress|Z - ); - } elsif ( $script eq "write_bed" ) { @options = qw( @@ -780,21 +665,6 @@ sub get_options merge|m=s ); } - elsif ( $script eq "grab" ) - { - @options = qw( - patterns|p=s - patterns_in|P=s - regex|r=s - eval|e=s - exact_in|E=s - invert|i - case_insensitive|c - keys|k=s - keys_only|K - vals_only|V - ); - } elsif ( $script eq "compute" ) { @options = qw( @@ -981,7 +851,6 @@ sub get_options $options{ "quals" } = [ split ",", $options{ "quals" } ] if defined $options{ "quals" }; $options{ "feats" } = [ split ",", $options{ "feats" } ] if defined $options{ "feats" }; $options{ "frames" } = [ split ",", $options{ "frames" } ] if defined $options{ "frames" }; - $options{ "formats" } = [ split ",", $options{ "formats" } ] if defined $options{ "formats" }; $options{ "samples" } = [ split ",", $options{ "samples" } ] if defined $options{ "samples" }; $options{ "tables" } = [ split ",", $options{ "tables" } ] if defined $options{ "tables" }; $options{ "tracks" } = [ split ",", $options{ "tracks" } ] if defined $options{ "tracks" }; @@ -1027,7 +896,7 @@ sub get_options { Maasha::Common::error( qq(Argument to --$opt must be "+" or "-" - not "$options{ $opt }") ); } - elsif ( $opt eq "genome" and $script ne "format_genome" ) + elsif ( $opt eq "genome" ) { @genomes = Maasha::Common::ls_dirs( "$ENV{ 'BP_DATA' }/genomes" ); map { $_ =~ s/.*\/(.+)$/$1/ } @genomes; @@ -1059,15 +928,10 @@ sub get_options } Maasha::Common::error( qq(no --database specified) ) if $script eq "remove_ucsc_tracks" and not $options{ "database" }; - Maasha::Common::error( qq(no --database specified) ) if $script eq "create_blast_db" and not $options{ "database" }; Maasha::Common::error( qq(no --index_name specified) ) if $script =~ /create_vmatch_index/ and not $options{ "index_name" }; - Maasha::Common::error( qq(no --database or --genome specified) ) if $script eq "blast_seq" and not $options{ "genome" } and not $options{ "database" }; - Maasha::Common::error( qq(both --database and --genome specified) ) if $script eq "blast_seq" and $options{ "genome" } and $options{ "database" }; - Maasha::Common::error( qq(no --index_name or --genome specified) ) if $script eq "vmatch_seq" and not $options{ "genome" } and not $options{ "index_name" }; - Maasha::Common::error( qq(both --index and --genome specified) ) if $script eq "vmatch_seq" and $options{ "genome" } and $options{ "index_name" }; Maasha::Common::error( qq(no --in_file or --genome specified) ) if $script eq "soap_seq" and not $options{ "genome" } and not $options{ "in_file" }; Maasha::Common::error( qq(both --in_file and --genome specified) ) if $script eq "soap_seq" and $options{ "genome" } and $options{ "in_file" }; - Maasha::Common::error( qq(no --genome specified) ) if $script =~ /format_genome|get_genome_seq|get_genome_align|get_genome_phastcons|blat_seq|plot_phastcons_profiles|plot_karyogram/ and not $options{ "genome" }; + Maasha::Common::error( qq(no --genome specified) ) if $script =~ /get_genome_align|get_genome_phastcons|blat_seq|plot_phastcons_profiles|plot_karyogram/ and not $options{ "genome" }; Maasha::Common::error( qq(no --key specified) ) if $script =~ /plot_lendist|plot_histogram/ and not $options{ "key" }; Maasha::Common::error( qq(no --keys speficied) ) if $script =~ /sort_records|count_vals|sum_vals|mean_vals|median_vals|length_vals/ and not $options{ "keys" }; @@ -1122,228 +986,6 @@ sub script_print_usage } -sub script_list_biopieces -{ - # Martin A. Hansen, January 2008. - - # Prints the summary from the usage for each of the biopieces. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( @files, $file, $wiki, $program, $summary ); - - @files = Maasha::Common::ls_files( "$ENV{ 'BP_DIR' }/bp_usage" ); - - foreach $file ( sort @files ) - { - if ( $file =~ /\/([a-z0-9_]+)\.wiki$/ ) - { - $program = $1; - - $wiki = Maasha::Gwiki::gwiki_read( $file ); - - $summary = $wiki->[ 0 ]->[ 0 ]->{ 'TEXT' }; - $summary =~ s/^#summary\s+//; - - printf( "%-30s%s\n", $program, $summary ); - } - } - - exit; -} - - -sub script_list_genomes -{ - # Martin A. Hansen, January 2008. - - # Prints the synopsis from the usage for each of the biopieces. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( @genomes, $genome, @formats, $format, %hash, %found, @row ); - - @genomes = Maasha::Common::ls_dirs( "$ENV{ 'BP_DATA' }/genomes" ); - - foreach $genome ( @genomes ) - { - next if $genome =~ /\.$/; - - @formats = Maasha::Common::ls_dirs( $genome ); - - foreach $format ( @formats ) - { - if ( $format =~ /\/([^\/]+)\/(\w+)$/ ) - { - $hash{ $1 }{ $2 } = 1; - - $found{ $2 } = 1; - } - } - } - - @row = "Genome"; - - map { push @row, $_ } sort keys %found; - - print join( "\t", @row ), "\n"; - - foreach $genome ( sort keys %hash ) - { - @row = $genome; - - foreach $format ( sort keys %found ) - { - if ( exists $hash{ $genome }{ $format } ) { - push @row, "yes"; - } else { - push @row, "no"; - } - } - - print join( "\t", @row ), "\n"; - } -} - - -sub script_read_fasta -{ - # Martin A. Hansen, August 2007. - - # Read sequences from FASTA file. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $file, $data_in, $entry, $num ); - - while ( $record = Maasha::Biopieces::get_record( $in ) ) { - Maasha::Biopieces::put_record( $record, $out ); - } - - $num = 1; - - foreach $file ( @{ $options->{ "files" } } ) - { - $data_in = Maasha::Common::read_open( $file ); - - while ( $entry = Maasha::Fasta::get_entry( $data_in ) ) - { - if ( defined $entry->[ SEQ_NAME ] and $entry->[ SEQ ] ) - { - $record = { - SEQ_NAME => $entry->[ SEQ_NAME ], - SEQ => $entry->[ SEQ ], - SEQ_LEN => length $entry->[ SEQ ], - }; - - Maasha::Biopieces::put_record( $record, $out ); - } - - goto NUM if $options->{ "num" } and $num == $options->{ "num" }; - - $num++; - } - - close $data_in; - } - - NUM: - - close $data_in if $data_in; -} - - -sub script_read_tab -{ - # Martin A. Hansen, August 2007. - - # Read table or table columns from stream or file. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $file, $line, @fields, @fields2, $i, $record, $data_in, $skip, $num ); - - $options->{ 'delimit' } ||= '\s+'; - - while ( $record = Maasha::Biopieces::get_record( $in ) ) { - Maasha::Biopieces::put_record( $record, $out ); - } - - $skip = $options->{ 'skip' } ||= 0; - $num = 1; - - foreach $file ( @{ $options->{ "files" } } ) - { - $data_in = Maasha::Common::read_open( $file ); - - while ( $line = <$data_in> ) - { - if ( $skip ) - { - $skip--; - next; - } - - next if $line =~ /^#|^$/; - - chomp $line; - - undef $record; - undef @fields2; - - @fields = split /$options->{'delimit'}/, $line; - - if ( $options->{ "cols" } ) { - map { push @fields2, $fields[ $_ ] } @{ $options->{ "cols" } }; - } else { - @fields2 = @fields; - } - - for ( $i = 0; $i < @fields2; $i++ ) - { - if ( $options->{ "keys" }->[ $i ] ) { - $record->{ $options->{ "keys" }->[ $i ] } = $fields2[ $i ]; - } else { - $record->{ "V" . $i } = $fields2[ $i ]; - } - } - - Maasha::Biopieces::put_record( $record, $out ); - - goto NUM if $options->{ "num" } and $num == $options->{ "num" }; - - $num++; - } - - close $data_in; - } - - NUM: - - close $data_in if $data_in; -} - - sub script_read_psl { # Martin A. Hansen, August 2007. @@ -2184,11 +1826,11 @@ sub script_assemble_tag_contigs } -sub script_format_genome +sub script_length_seq { - # Martin A. Hansen, Juli 2008. + # Martin A. Hansen, August 2007. - # Format a genome to speficed formats. + # Determine the length of sequences in stream. my ( $in, # handle to in stream $out, # handle to out stream @@ -2197,90 +1839,11 @@ sub script_format_genome # Returns nothing. - my ( $dir, $genome, $fasta_dir, $phastcons_dir, $vals, $fh_out, $record, $format, $index, $entry ); - - $dir = $options->{ 'dir' } || $ENV{ 'BP_DATA' }; - $genome = $options->{ 'genome' }; - - Maasha::Common::error( "Directory: $dir does not exist" ) if not -d $dir; - Maasha::Common::dir_create_if_not_exists( "$dir/genomes" ); - Maasha::Common::dir_create_if_not_exists( "$dir/genomes/$genome" ); + my ( $record, $total ); - if ( grep { $_ =~ /fasta|blast|vmatch/i } @{ $options->{ "formats" } } ) + while ( $record = Maasha::Biopieces::get_record( $in ) ) { - if ( -f "$dir/genomes/$genome/fasta/$genome.fna" ) - { - $fasta_dir = "$dir/genomes/$genome/fasta"; - } - else - { - Maasha::Common::dir_create_if_not_exists( "$dir/genomes/$genome/fasta" ); - - $fasta_dir = "$dir/genomes/$genome/fasta"; - - $fh_out = Maasha::Common::write_open( "$fasta_dir/$genome.fna" ); - } - } - elsif ( grep { $_ =~ /phastcons/i } @{ $options->{ "formats" } } ) - { - Maasha::Common::dir_create_if_not_exists( "$dir/genomes/$genome/phastcons" ); - - $phastcons_dir = "$dir/genomes/$genome/phastcons"; - - $fh_out = Maasha::Common::write_open( "$phastcons_dir/$genome.pp" ); - } - - while ( $record = Maasha::Biopieces::get_record( $in ) ) - { - if ( $fh_out and $entry = Maasha::Fasta::biopiece2fasta( $record ) ) - { - Maasha::Fasta::put_entry( $entry, $fh_out, $options->{ "wrap" } ); - } - elsif ( $fh_out and $record->{ "CHR" } and $record->{ "CHR_BEG" } and $record->{ "STEP" } and $record->{ "VALS" } ) - { - print $fh_out "fixedStep chrom=$record->{ 'CHR' } start=$record->{ 'CHR_BEG' } step=$record->{ 'STEP' }\n"; - - $vals = $record->{ 'VALS' }; - - $vals =~ tr/,/\n/; - - print $fh_out "$vals\n"; - } - - Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" }; - } - - foreach $format ( @{ $options->{ 'formats' } } ) - { - if ( $format =~ /^fasta$/i ) { Maasha::Fasta::fasta_index( "$fasta_dir/$genome.fna", "$dir/genomes/$genome/fasta/$genome.index" ) } - elsif ( $format =~ /^blast$/i ) { Maasha::NCBI::blast_index( "$genome.fna", $fasta_dir, "$dir/genomes/$genome/blast", "dna", $genome ) } - elsif ( $format =~ /^blat$/i ) { print STDERR "BLAT FORMAT NOT IMPLEMENTED" } - elsif ( $format =~ /^vmatch$/i ) { Maasha::Match::vmatch_index( "$genome.fna", $fasta_dir, "$dir/genomes/$genome/vmatch", $BP_TMP ) } - elsif ( $format =~ /^phastcons$/i ) { Maasha::UCSC::phastcons_index( "$genome.pp", $phastcons_dir ) } - } - - close $fh_out if $fh_out; -} - - -sub script_length_seq -{ - # Martin A. Hansen, August 2007. - - # Determine the length of sequences in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $total ); - - while ( $record = Maasha::Biopieces::get_record( $in ) ) - { - if ( $record->{ "SEQ" } ) + if ( $record->{ "SEQ" } ) { $record->{ "SEQ_LEN" } = length $record->{ "SEQ" }; $total += $record->{ "SEQ_LEN" }; @@ -2293,29 +1856,6 @@ sub script_length_seq } -sub script_uppercase_seq -{ - # Martin A. Hansen, August 2007. - - # Uppercases sequences in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - ) = @_; - - # Returns nothing. - - my ( $record ); - - while ( $record = Maasha::Biopieces::get_record( $in ) ) - { - $record->{ "SEQ" } = uc $record->{ "SEQ" } if $record->{ "SEQ" }; - - Maasha::Biopieces::put_record( $record, $out ); - } -} - - sub script_shuffle_seq { # Martin A. Hansen, December 2007. @@ -2645,7 +2185,6 @@ sub script_calc_fixedstep # Maasha::Common::run( "bed2fixedstep", "< $bed_file > $fixedstep_file" ); - $fh_in = Maasha::Filesys::file_read_open( $fixedstep_file ); while ( $fixedstep_entry = Maasha::UCSC::Wiggle::fixedstep_entry_get( $fh_in ) ) @@ -2925,163 +2464,6 @@ sub script_extract_seq } -sub script_get_genome_seq -{ - # Martin A. Hansen, December 2007. - - # Gets a subsequence from a genome. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $genome, $genome_file, $index_file, $index, $fh, $index_head, $index_beg, $index_len, $beg, $len, %lookup_hash, @begs, @lens, $i, $seq ); - - $options->{ "flank" } ||= 0; - - if ( $options->{ "genome" } ) - { - $genome = $options->{ "genome" }; - - $genome_file = "$ENV{ 'BP_DATA' }/genomes/$genome/fasta/$genome.fna"; - $index_file = "$ENV{ 'BP_DATA' }/genomes/$genome/fasta/$genome.index"; - - $fh = Maasha::Common::read_open( $genome_file ); - $index = Maasha::Fasta::index_retrieve( $index_file ); - - shift @{ $index }; # Get rid of the file size info - - map { $lookup_hash{ $_->[ 0 ] } = [ $_->[ 1 ], $_->[ 2 ] ] } @{ $index }; - - if ( exists $lookup_hash{ $options->{ "chr" } } and defined $options->{ "beg" } and ( defined $options->{ "end" } or defined $options->{ "len" } ) ) - { - ( $index_beg, $index_len ) = @{ $lookup_hash{ $options->{ "chr" } } }; - - $beg = $index_beg + $options->{ "beg" } - 1; - - if ( $options->{ "len" } ) { - $len = $options->{ "len" }; - } elsif ( $options->{ "end" } ) { - $len = ( $options->{ "end" } - $options->{ "beg" } + 1 ); - } - - $beg -= $options->{ "flank" }; - $len += 2 * $options->{ "flank" }; - - if ( $beg <= $index_beg ) - { - $len -= $index_beg - $beg; - $beg = $index_beg; - } - - $len = $index_beg + $index_len - $beg if $beg + $len > $index_beg + $index_len; - - next if $beg > $index_beg + $index_len; - - $record->{ "CHR" } = $options->{ "chr" }; - $record->{ "CHR_BEG" } = $beg - $index_beg; - $record->{ "CHR_END" } = $record->{ "CHR_BEG" } + $len - 1; - - $record->{ "SEQ" } = Maasha::Common::file_read( $fh, $beg, $len ); - $record->{ "SEQ_LEN" } = $len; - - Maasha::Biopieces::put_record( $record, $out ); - } - } - - while ( $record = Maasha::Biopieces::get_record( $in ) ) - { - if ( $options->{ "genome" } and not $record->{ "SEQ" } ) - { - if ( $record->{ "REC_TYPE" } eq "BED" and exists $lookup_hash{ $record->{ "CHR" } } ) - { - ( $index_beg, $index_len ) = @{ $lookup_hash{ $record->{ "CHR" } } }; - - $beg = $record->{ "CHR_BEG" } + $index_beg; - $len = $record->{ "CHR_END" } - $record->{ "CHR_BEG" } + 1; - } - elsif ( $record->{ "REC_TYPE" } eq "PSL" and exists $lookup_hash{ $record->{ "S_ID" } } ) - { - ( $index_beg, $index_len ) = @{ $lookup_hash{ $record->{ "S_ID" } } }; - - $beg = $record->{ "S_BEG" } + $index_beg; - $len = $record->{ "S_END" } - $record->{ "S_BEG" } + 1; - } - elsif ( $record->{ "REC_TYPE" } eq "BLAST" and exists $lookup_hash{ $record->{ "S_ID" } } ) - { - ( $index_beg, $index_len ) = @{ $lookup_hash{ $record->{ "S_ID" } } }; - - $beg = $record->{ "S_BEG" } + $index_beg; - $len = $record->{ "S_END" } - $record->{ "S_BEG" } + 1; - } - - $beg -= $options->{ "flank" }; - $len += 2 * $options->{ "flank" }; - - if ( $beg <= $index_beg ) - { - $len -= $index_beg - $beg; - $beg = $index_beg; - } - - $len = $index_beg + $index_len - $beg if $beg + $len > $index_beg + $index_len; - - next if $beg > $index_beg + $index_len; - - $record->{ "CHR_BEG" } = $beg - $index_beg; - $record->{ "CHR_END" } = $record->{ "CHR_BEG" } + $len - 1; - - $record->{ "SEQ" } = Maasha::Common::file_read( $fh, $beg, $len ); - - if ( $record->{ "STRAND" } and $record->{ "STRAND" } eq "-" ) - { - Maasha::Seq::dna_comp( \$record->{ "SEQ" } ); - $record->{ "SEQ" } = reverse $record->{ "SEQ" }; - } - - if ( $options->{ "mask" } ) - { - if ( $record->{ "BLOCK_COUNT" } > 1 ) # uppercase hit block segments and lowercase the rest. - { - $record->{ "SEQ" } = lc $record->{ "SEQ" }; - - @begs = split ",", $record->{ "Q_BEGS" }; - @lens = split ",", $record->{ "BLOCK_LENS" }; - - for ( $i = 0; $i < @begs; $i++ ) { - substr $record->{ "SEQ" }, $begs[ $i ], $lens[ $i ], uc substr $record->{ "SEQ" }, $begs[ $i ], $lens[ $i ]; - } - } - } - elsif ( $options->{ "splice" } ) - { - if ( $record->{ "BLOCK_COUNT" } > 1 ) # splice block sequences - { - $seq = ""; - @begs = split ",", $record->{ "Q_BEGS" }; - @lens = split ",", $record->{ "BLOCK_LENS" }; - - for ( $i = 0; $i < @begs; $i++ ) { - $seq .= substr $record->{ "SEQ" }, $begs[ $i ], $lens[ $i ]; - } - - $record->{ "SEQ" } = $seq; - } - } - - $record->{ "SEQ_LEN" } = length $record->{ "SEQ" }; - } - - Maasha::Biopieces::put_record( $record, $out ); - } - - close $fh if $fh; -} - - sub script_get_genome_align { # Martin A. Hansen, April 2008. @@ -3592,193 +2974,6 @@ sub script_patscan_seq } -sub script_create_blast_db -{ - # Martin A. Hansen, September 2007. - - # Creates a NCBI BLAST database with formatdb - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $fh, $seq_type, $path, $record, $entry ); - - $path = $options->{ "database" }; - - $fh = Maasha::Common::write_open( $path ); - - while ( $record = Maasha::Biopieces::get_record( $in ) ) - { - Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" }; - - if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) - { - $seq_type = Maasha::Seq::seq_guess_type( $entry->[ SEQ ] ) if not $seq_type; - - Maasha::Fasta::put_entry( $entry, $fh ); - } - } - - close $fh; - - if ( $seq_type eq "protein" ) { - Maasha::Common::run( "formatdb", "-p T -i $path -t $options->{ 'database' }" ); - } else { - Maasha::Common::run( "formatdb", "-p F -i $path -t $options->{ 'database' }" ); - } - - unlink $path; -} - - -sub script_blast_seq -{ - # Martin A. Hansen, September 2007. - - # BLASTs sequences in stream against a given database. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $genome, $q_type, $s_type, $tmp_in, $tmp_out, $fh_in, $fh_out, $record, $line, @fields, $entry ); - - $options->{ "e_val" } = 10 if not defined $options->{ "e_val" }; - $options->{ "filter" } = "F"; - $options->{ "filter" } = "T" if $options->{ "filter" }; - $options->{ "cpus" } ||= 1; - - $options->{ "database" } = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/blast/$options->{ 'genome' }.fna" if $options->{ 'genome' }; - - $tmp_in = "$BP_TMP/blast_query.seq"; - $tmp_out = "$BP_TMP/blast.result"; - - $fh_out = Maasha::Filesys::file_write_open( $tmp_in ); - - while ( $record = Maasha::Biopieces::get_record( $in ) ) - { - if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) - { - $q_type = Maasha::Seq::seq_guess_type( $entry->[ SEQ ] ) if not $q_type; - - Maasha::Fasta::put_entry( $entry, $fh_out ); - } - - Maasha::Biopieces::put_record( $record, $out ); - } - - close $fh_out; - - if ( -f $options->{ 'database' } . ".phr" ) { - $s_type = "protein"; - } else { - $s_type = "nucleotide"; - } - - if ( not $options->{ 'program' } ) - { - if ( $q_type ne "protein" and $s_type ne "protein" ) { - $options->{ 'program' } = "blastn"; - } elsif ( $q_type eq "protein" and $s_type eq "protein" ) { - $options->{ 'program' } = "blastp"; - } elsif ( $q_type ne "protein" and $s_type eq "protein" ) { - $options->{ 'program' } = "blastx"; - } elsif ( $q_type eq "protein" and $s_type ne "protein" ) { - $options->{ 'program' } = "tblastn"; - } - } - - if ( $options->{ 'verbose' } ) - { - Maasha::Common::run( - "blastall", - join( " ", - "-p $options->{ 'program' }", - "-e $options->{ 'e_val' }", - "-a $options->{ 'cpus' }", - "-m 8", - "-i $tmp_in", - "-d $options->{ 'database' }", - "-F $options->{ 'filter' }", - "-o $tmp_out", - ), - 1 - ); - } - else - { - Maasha::Common::run( - "blastall", - join( " ", - "-p $options->{ 'program' }", - "-e $options->{ 'e_val' }", - "-a $options->{ 'cpus' }", - "-m 8", - "-i $tmp_in", - "-d $options->{ 'database' }", - "-F $options->{ 'filter' }", - "-o $tmp_out", - "> /dev/null 2>&1" - ), - 1 - ); - } - - unlink $tmp_in; - - $fh_out = Maasha::Filesys::file_read_open( $tmp_out ); - - undef $record; - - while ( $line = <$fh_out> ) - { - chomp $line; - - next if $line =~ /^#/; - - @fields = split /\s+/, $line; - - $record->{ "REC_TYPE" } = "BLAST"; - $record->{ "Q_ID" } = $fields[ 0 ]; - $record->{ "S_ID" } = $fields[ 1 ]; - $record->{ "IDENT" } = $fields[ 2 ]; - $record->{ "ALIGN_LEN" } = $fields[ 3 ]; - $record->{ "MISMATCHES" } = $fields[ 4 ]; - $record->{ "GAPS" } = $fields[ 5 ]; - $record->{ "Q_BEG" } = $fields[ 6 ] - 1; # BLAST is 1-based - $record->{ "Q_END" } = $fields[ 7 ] - 1; # BLAST is 1-based - $record->{ "S_BEG" } = $fields[ 8 ] - 1; # BLAST is 1-based - $record->{ "S_END" } = $fields[ 9 ] - 1; # BLAST is 1-based - $record->{ "E_VAL" } = $fields[ 10 ]; - $record->{ "BIT_SCORE" } = $fields[ 11 ]; - - if ( $record->{ "S_BEG" } > $record->{ "S_END" } ) - { - $record->{ "STRAND" } = '-'; - - ( $record->{ "S_BEG" }, $record->{ "S_END" } ) = ( $record->{ "S_END" }, $record->{ "S_BEG" } ); - } - else - { - $record->{ "STRAND" } = '+'; - } - - Maasha::Biopieces::put_record( $record, $out ); - } - - close $fh_out; - - unlink $tmp_out; -} - - sub script_blat_seq { # Martin A. Hansen, August 2007. @@ -3892,227 +3087,54 @@ sub script_soap_seq { $args = join( " ", "-s $options->{ 'seed_size' }", - "-r 2", - "-a $tmp_in", - "-v $options->{ 'mismatches' }", - "-g $options->{ 'gap_size' }", - "-p $options->{ 'cpus' }", - "-d $options->{ 'in_file' }", - "-o $tmp_out", - ); - - $args .= " > /dev/null 2>&1" if not $options->{ 'verbose' }; - - Maasha::Common::run( "soap", $args, 1 ); - - unlink $tmp_in; - - $fh_out = Maasha::Common::read_open( $tmp_out ); - - undef $record; - - while ( $line = <$fh_out> ) - { - chomp $line; - - @fields = split /\t/, $line; - - $record->{ "REC_TYPE" } = "SOAP"; - $record->{ "Q_ID" } = $fields[ 0 ]; - $record->{ "SCORE" } = $fields[ 3 ]; - $record->{ "STRAND" } = $fields[ 6 ]; - $record->{ "S_ID" } = $fields[ 7 ]; - $record->{ "S_BEG" } = $fields[ 8 ] - 1; # soap is 1-based - $record->{ "S_END" } = $fields[ 8 ] + $fields[ 5 ] - 2; - - Maasha::Biopieces::put_record( $record, $out ); - } - - close $fh_out; - } - - unlink $tmp_out; -} - - -sub script_match_seq -{ - # Martin A. Hansen, August 2007. - - # BLATs sequences in stream against a given genome. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, @entries, $results ); - - $options->{ "word_size" } ||= 20; - $options->{ "direction" } ||= "both"; - - while ( $record = Maasha::Biopieces::get_record( $in ) ) - { - if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) { - push @entries, [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ]; - } - - Maasha::Biopieces::put_record( $record, $out ); - } - - if ( @entries == 1 ) - { - $results = Maasha::Match::match_mummer( [ $entries[ 0 ] ], [ $entries[ 0 ] ], $options, $BP_TMP ); - - map { Maasha::Biopieces::put_record( $_, $out ) } @{ $results }; - } - elsif ( @entries == 2 ) - { - $results = Maasha::Match::match_mummer( [ $entries[ 0 ] ], [ $entries[ 1 ] ], $options, $BP_TMP ); - - map { Maasha::Biopieces::put_record( $_, $out ) } @{ $results }; - } -} - - -sub script_create_vmatch_index -{ - # Martin A. Hansen, January 2008. - - # Create a vmatch index from sequences in the stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $file_tmp, $fh_tmp, $type, $entry ); - - if ( $options->{ "index_name" } ) - { - $file_tmp = $options->{ 'index_name' }; - $fh_tmp = Maasha::Common::write_open( $file_tmp ); - } - - while ( $record = Maasha::Biopieces::get_record( $in ) ) - { - if ( $options->{ "index_name" } and $entry = Maasha::Fasta::biopiece2fasta( $record ) ) - { - Maasha::Fasta::put_entry( $entry, $fh_tmp ); - - $type = Maasha::Seq::seq_guess_type( $entry->[ SEQ ] ) if not defined $type; - } - - Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" }; - } - - if ( $options->{ "index_name" } ) - { - close $fh_tmp; - - if ( $type eq "protein" ) { - Maasha::Common::run( "mkvtree", "-db $file_tmp -protein -pl $options->{ 'prefix_length' } -allout -indexname $file_tmp > /dev/null 2>&1" ); - } else { - Maasha::Common::run( "mkvtree", "-db $file_tmp -dna -pl $options->{ 'prefix_length' } -allout -indexname $file_tmp > /dev/null 2>&1" ); - } - - unlink $file_tmp; - } -} - - -sub script_vmatch_seq -{ - # Martin A. Hansen, August 2007. - - # Vmatches sequences in stream against a given genome. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( @index_files, @records, $result_file, $fh_in, $record, %hash ); - - $options->{ 'count' } = 1 if $options->{ 'max_hits' }; - - if ( $options->{ "index_name" } ) - { - @index_files = $options->{ "index_name" }; - } - else - { - @index_files = Maasha::Common::ls_files( "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/vmatch" ); - - map { $_ =~ /^(.+)\.[a-z1]{3,4}$/; $hash{ $1 } = 1 } @index_files; - - @index_files = sort keys %hash; - } - - while ( $record = Maasha::Biopieces::get_record( $in ) ) - { - push @records, $record; - - Maasha::Biopieces::put_record( $record, $out ); - } - - $result_file = Maasha::Match::match_vmatch( $BP_TMP, \@records, \@index_files, $options ); - - undef @records; - - $fh_in = Maasha::Common::read_open( $result_file ); - - while ( $record = Maasha::Match::vmatch_get_entry( $fh_in ) ) { - Maasha::Biopieces::put_record( $record, $out ); - } - - close $fh_in; + "-r 2", + "-a $tmp_in", + "-v $options->{ 'mismatches' }", + "-g $options->{ 'gap_size' }", + "-p $options->{ 'cpus' }", + "-d $options->{ 'in_file' }", + "-o $tmp_out", + ); - unlink $result_file; -} + $args .= " > /dev/null 2>&1" if not $options->{ 'verbose' }; + Maasha::Common::run( "soap", $args, 1 ); -sub script_write_fasta -{ - # Martin A. Hansen, August 2007. + unlink $tmp_in; - # Write FASTA entries from sequences in stream. + $fh_out = Maasha::Common::read_open( $tmp_out ); - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; + undef $record; - # Returns nothing. + while ( $line = <$fh_out> ) + { + chomp $line; - my ( $record, $fh, $entry ); + @fields = split /\t/, $line; - $fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } ); + $record->{ "REC_TYPE" } = "SOAP"; + $record->{ "Q_ID" } = $fields[ 0 ]; + $record->{ "SCORE" } = $fields[ 3 ]; + $record->{ "STRAND" } = $fields[ 6 ]; + $record->{ "S_ID" } = $fields[ 7 ]; + $record->{ "S_BEG" } = $fields[ 8 ] - 1; # soap is 1-based + $record->{ "S_END" } = $fields[ 8 ] + $fields[ 5 ] - 2; - while ( $record = Maasha::Biopieces::get_record( $in ) ) - { - if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) { - Maasha::Fasta::put_entry( $entry, $fh, $options->{ "wrap" } ); + Maasha::Biopieces::put_record( $record, $out ); } - Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" }; + close $fh_out; } - close $fh; + unlink $tmp_out; } -sub script_write_align +sub script_match_seq { # Martin A. Hansen, August 2007. - # Write pretty alignments aligned sequences in stream. + # BLATs sequences in stream against a given genome. my ( $in, # handle to in stream $out, # handle to out stream @@ -4121,9 +3143,10 @@ sub script_write_align # Returns nothing. - my ( $fh, $record, @entries ); + my ( $record, @entries, $results ); - $fh = write_stream( $options->{ "data_out" } ) ; + $options->{ "word_size" } ||= 20; + $options->{ "direction" } ||= "both"; while ( $record = Maasha::Biopieces::get_record( $in ) ) { @@ -4131,24 +3154,29 @@ sub script_write_align push @entries, [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ]; } - Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" }; + Maasha::Biopieces::put_record( $record, $out ); } - if ( scalar( @entries ) == 2 ) { - Maasha::Align::align_print_pairwise( $entries[ 0 ], $entries[ 1 ], $fh, $options->{ "wrap" } ); - } elsif ( scalar ( @entries ) > 2 ) { - Maasha::Align::align_print_multi( \@entries, $fh, $options->{ "wrap" }, $options->{ "no_ruler" }, $options->{ "no_consensus" } ); + if ( @entries == 1 ) + { + $results = Maasha::Match::match_mummer( [ $entries[ 0 ] ], [ $entries[ 0 ] ], $options, $BP_TMP ); + + map { Maasha::Biopieces::put_record( $_, $out ) } @{ $results }; } + elsif ( @entries == 2 ) + { + $results = Maasha::Match::match_mummer( [ $entries[ 0 ] ], [ $entries[ 1 ] ], $options, $BP_TMP ); - close $fh if $fh; + map { Maasha::Biopieces::put_record( $_, $out ) } @{ $results }; + } } -sub script_write_blast +sub script_create_vmatch_index { - # Martin A. Hansen, November 2007. + # Martin A. Hansen, January 2008. - # Write data in blast table format (-m8 and 9). + # Create a vmatch index from sequences in the stream. my ( $in, # handle to in stream $out, # handle to out stream @@ -4157,55 +3185,46 @@ sub script_write_blast # Returns nothing. - my ( $fh, $record, $first ); - - $fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } ) ; + my ( $record, $file_tmp, $fh_tmp, $type, $entry ); - $first = 1; + if ( $options->{ "index_name" } ) + { + $file_tmp = $options->{ 'index_name' }; + $fh_tmp = Maasha::Common::write_open( $file_tmp ); + } while ( $record = Maasha::Biopieces::get_record( $in ) ) { - if ( $record->{ "REC_TYPE" } eq "BLAST" ) + if ( $options->{ "index_name" } and $entry = Maasha::Fasta::biopiece2fasta( $record ) ) { - if ( $options->{ "comment" } and $first ) - { - print "# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score\n"; - - $first = 0; - } - - if ( $record->{ "STRAND" } eq "-" ) { - ( $record->{ "S_BEG" }, $record->{ "S_END" } ) = ( $record->{ "S_END" }, $record->{ "S_BEG" } ); - } + Maasha::Fasta::put_entry( $entry, $fh_tmp ); - print $fh join( "\t", - $record->{ "Q_ID" }, - $record->{ "S_ID" }, - $record->{ "IDENT" }, - $record->{ "ALIGN_LEN" }, - $record->{ "MISMATCHES" }, - $record->{ "GAPS" }, - $record->{ "Q_BEG" } + 1, - $record->{ "Q_END" } + 1, - $record->{ "S_BEG" } + 1, - $record->{ "S_END" } + 1, - $record->{ "E_VAL" }, - $record->{ "BIT_SCORE" } - ), "\n"; + $type = Maasha::Seq::seq_guess_type( $entry->[ SEQ ] ) if not defined $type; } Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" }; } - close $fh; + if ( $options->{ "index_name" } ) + { + close $fh_tmp; + + if ( $type eq "protein" ) { + Maasha::Common::run( "mkvtree", "-db $file_tmp -protein -pl $options->{ 'prefix_length' } -allout -indexname $file_tmp > /dev/null 2>&1" ); + } else { + Maasha::Common::run( "mkvtree", "-db $file_tmp -dna -pl $options->{ 'prefix_length' } -allout -indexname $file_tmp > /dev/null 2>&1" ); + } + + unlink $file_tmp; + } } -sub script_write_tab +sub script_write_align { # Martin A. Hansen, August 2007. - # Write data as table. + # Write pretty alignments aligned sequences in stream. my ( $in, # handle to in stream $out, # handle to out stream @@ -4214,59 +3233,26 @@ sub script_write_tab # Returns nothing. - my ( $fh, $record, $key, @keys, @vals, $ok, %no_keys, $A, $B ); - - $options->{ "delimit" } ||= "\t"; - - map { $no_keys{ $_ } = 1 } @{ $options->{ "no_keys" } }; + my ( $fh, $record, @entries ); - $fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } ); + $fh = write_stream( $options->{ "data_out" } ) ; while ( $record = Maasha::Biopieces::get_record( $in ) ) { - undef @vals; - $ok = 1; - - if ( $options->{ "keys" } ) - { - map { $ok = 0 if not exists $record->{ $_ } } @{ $options->{ "keys" } }; - - if ( $ok ) - { - foreach $key ( @{ $options->{ "keys" } } ) - { - if ( exists $record->{ $key } ) - { - push @keys, $key if $options->{ "comment" }; - push @vals, $record->{ $key }; - } - } - } - } - else - { - foreach $key ( sort { $A = $a; $B = $b; $A =~ s/^V(\d+)$/$1/; $B =~ s/^V(\d+)$/$1/; $A <=> $B } keys %{ $record } ) - { - next if exists $no_keys{ $key }; - - push @keys, $key if $options->{ "comment" }; - push @vals, $record->{ $key }; - } - } - - if ( @keys and $options->{ "comment" } ) - { - print $fh "#", join( $options->{ "delimit" }, @keys ), "\n"; - - delete $options->{ "comment" }; + if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) { + push @entries, [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ]; } - print $fh join( $options->{ "delimit" }, @vals ), "\n" if @vals; - Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" }; } - close $fh; + if ( scalar( @entries ) == 2 ) { + Maasha::Align::align_print_pairwise( $entries[ 0 ], $entries[ 1 ], $fh, $options->{ "wrap" } ); + } elsif ( scalar ( @entries ) > 2 ) { + Maasha::Align::align_print_multi( \@entries, $fh, $options->{ "wrap" }, $options->{ "no_ruler" }, $options->{ "no_consensus" } ); + } + + close $fh if $fh; } @@ -4287,7 +3273,7 @@ sub script_write_bed $cols = $options->{ 'cols' }->[ 0 ]; - $fh = write_stream( $options->{ 'data_out' }, $options->{ 'compress' } ); + $fh = Maasha::Biopieces::write_stream( $options->{ 'data_out' }, $options->{ 'compress' } ); while ( $record = Maasha::Biopieces::get_record( $in ) ) { @@ -5205,83 +4191,6 @@ sub script_merge_records } -sub script_grab -{ - # Martin A. Hansen, August 2007. - - # Grab for records in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $keys, $vals_only, $keys_only, $invert, $patterns, $pattern, $regex, $record, $key, $op, $val, %lookup_hash, $found ); - - $keys = $options->{ 'keys' }; - $vals_only = $options->{ 'vals_only' }; - $keys_only = $options->{ 'keys_only' }; - $invert = $options->{ 'invert' }; - - if ( $options->{ 'patterns' } ) - { - $patterns = [ split ",", $options->{ 'patterns' } ]; - } - elsif ( -f $options->{ 'patterns_in' } ) - { - $patterns = Maasha::Patscan::read_patterns( $options->{ 'patterns_in' } ); - } - elsif ( $options->{ 'regex' } ) - { - if ( $options->{ 'case_insensitive' } ) { - $regex = qr/$options->{ 'regex' }/i; - } else { - $regex = qr/$options->{ 'regex' }/; - } - } - elsif ( -f $options->{ 'exact_in' } ) - { - $patterns = Maasha::Patscan::read_patterns( $options->{ 'exact_in' } ); - - map { $lookup_hash{ $_ } = 1 } @{ $patterns }; - - undef $patterns; - } - elsif ( $options->{ 'eval' } ) - { - if ( $options->{ 'eval' } =~ /^([^><=! ]+)\s*(>=|<=|>|<|=|!=|eq|ne)\s*(.+)$/ ) - { - $key = $1; - $op = $2; - $val = $3; - } - } - - while ( $record = Maasha::Biopieces::get_record( $in ) ) - { - $found = 0; - - if ( %lookup_hash ) { - $found = grab_lookup( \%lookup_hash, $record, $keys, $vals_only, $keys_only ); - } elsif ( $patterns ) { - $found = grab_patterns( $patterns, $record, $keys, $vals_only, $keys_only ); - } elsif ( $regex ) { - $found = grab_regex( $regex, $record, $keys, $vals_only, $keys_only ); - } elsif ( $op ) { - $found = grab_eval( $key, $op, $val, $record ); - } - - if ( $found and not $invert ) { - Maasha::Biopieces::put_record( $record, $out ); - } elsif ( not $found and $invert ) { - Maasha::Biopieces::put_record( $record, $out ); - } - } -} - - sub script_compute { # Martin A. Hansen, August 2007. @@ -6292,154 +5201,9 @@ sub script_upload_to_ucsc } -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - -sub grab_lookup -{ - # Martin A. Hansen, November 2009. - - # Uses keys from a lookup hash to search records. Optionally, a list of - # keys can be given so the lookup is limited to these, also, flags - # can be given to limit lookup to keys or vals only. Returns 1 if lookup - # succeeded, else 0. - - my ( $lookup_hash, # hashref with patterns - $record, # hashref - $keys, # list of keys - OPTIONAL - $vals_only, # only vals flag - OPTIONAL - $keys_only, # only keys flag - OPTIONAL - ) = @_; - - # Returns boolean. - - if ( $keys ) - { - map { return 1 if exists $lookup_hash->{ $record->{ $_ } } } @{ $keys }; - } - else - { - if ( not $vals_only ) { - map { return 1 if exists $lookup_hash->{ $_ } } keys %{ $record }; - } - - if ( not $keys_only ) { - map { return 1 if exists $lookup_hash->{ $record->{ $_ } } } keys %{ $record }; - } - } - - return 0; -} - - -sub grab_patterns -{ - # Martin A. Hansen, November 2009. - - # Uses patterns to match records containing the pattern as a substring. - # Returns 1 if the record is matched, else 0. - - my ( $patterns, # list of patterns - $record, # hashref - $keys, # list of keys - OPTIONAL - $vals_only, # only vals flag - OPTIONAL - $keys_only, # only keys flag - OPTIONAL - ) = @_; - - # Returns boolean. - - my ( $pattern ); - - foreach $pattern ( @{ $patterns } ) - { - if ( $keys ) - { - map { return 1 if index( $record->{ $_ }, $pattern ) >= 0 } @{ $keys }; - } - else - { - if ( not $vals_only ) { - map { return 1 if index( $_, $pattern ) >= 0 } keys %{ $record }; - } - - if ( not $keys_only ) { - map { return 1 if index( $record->{ $_ }, $pattern ) >= 0 } keys %{ $record }; - } - } - } - - return 0; -} - - -sub grab_regex -{ - # Martin A. Hansen, November 2009. - - # Uses regex to match records. - # Returns 1 if the record is matched, else 0. - - my ( $regex, # regex to match - $record, # hashref - $keys, # list of keys - OPTIONAL - $vals_only, # only vals flag - OPTIONAL - $keys_only, # only keys flag - OPTIONAL - ) = @_; - - # Returns boolean. - - if ( $keys ) - { - map { return 1 if $record->{ $_ } =~ /$regex/ } @{ $keys }; - } - else - { - if ( not $vals_only ) { - map { return 1 if $_ =~ /$regex/ } keys %{ $record }; - } - - if ( not $keys_only ) { - map { return 1 if $record->{ $_ } =~ /$regex/ } keys %{ $record }; - } - } - - return 0; -} - - -sub grab_eval -{ - # Martin A. Hansen, November 2009. - - # Test if the value of a given record key evaluates according - # to a given operator. Returns 1 if eval is OK, else 0. - - my ( $key, # record key - $op, # operator - $val, # value - $record, # hashref - ) = @_; - - # Returns boolean. - - if ( defined $record->{ $key } ) - { - return 1 if ( $op eq "<" and $record->{ $key } < $val ); - return 1 if ( $op eq ">" and $record->{ $key } > $val ); - return 1 if ( $op eq ">=" and $record->{ $key } >= $val ); - return 1 if ( $op eq "<=" and $record->{ $key } <= $val ); - return 1 if ( $op eq "=" and $record->{ $key } == $val ); - return 1 if ( $op eq "!=" and $record->{ $key } != $val ); - return 1 if ( $op eq "eq" and $record->{ $key } eq $val ); - return 1 if ( $op eq "ne" and $record->{ $key } ne $val ); - } - - return 0; -} - - # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< 1; __END__ + diff --git a/code_perl/Maasha/Biopieces.pm b/code_perl/Maasha/Biopieces.pm index 8d8f403..b1c5fb0 100644 --- a/code_perl/Maasha/Biopieces.pm +++ b/code_perl/Maasha/Biopieces.pm @@ -29,6 +29,11 @@ package Maasha::Biopieces; # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +use Getopt::Long qw( :config bundling ); +use Data::Dumper; +use Maasha::Match; +use Maasha::Common; +use Maasha::Filesys; use vars qw( @ISA @EXPORT_OK ); require Exporter; @@ -65,8 +70,8 @@ sub log_biopiece $user = Maasha::Common::get_user(); $script = Maasha::Common::get_scriptname(); - $fh_global = Maasha::Common::append_open( "$ENV{ 'BP_LOG' }/biopieces.log" ); - $fh_local = Maasha::Common::append_open( "$ENV{ 'HOME' }/.biopieces.log" ); + $fh_global = Maasha::Filesys::file_append_open( "$ENV{ 'BP_LOG' }/biopieces.log" ); + $fh_local = Maasha::Filesys::file_append_open( "$ENV{ 'HOME' }/.biopieces.log" ); print $fh_global "$time_stamp\t$user\t$script ", join( " ", @ARGV ), "\n"; print $fh_local "$time_stamp\t$user\t$script ", join( " ", @ARGV ), "\n"; @@ -85,7 +90,7 @@ sub read_stream # Opens a stream to STDIN or a file, - my ( $path, # path - OPTIONAL + my ( $file, # file - OPTIONAL ) = @_; # Returns filehandle. @@ -93,15 +98,13 @@ sub read_stream my ( $fh ); if ( not -t STDIN ) { - $fh = Maasha::Common::read_stdin(); - } elsif ( not $path ) { -# Maasha::Common::error( qq(no data stream) ); + $fh = Maasha::Filesys::stdin_read(); + } elsif ( not $file ) { + # Maasha::Common::error( qq(no data stream) ); } else { - $fh = Maasha::Common::read_open( $path ); + $fh = Maasha::Filesys::file_read_open( $file ); } -# $fh->autoflush(1) if $fh; # Disable file buffer for debugging. - return $fh; } @@ -121,15 +124,30 @@ sub write_stream my ( $fh ); if ( $path ) { - $fh = Maasha::Common::write_open( $path, $gzip ); + $fh = Maasha::Filesys::file_write_open( $path, $gzip ); } else { - $fh = Maasha::Common::write_stdout(); + $fh = Maasha::Filesys::stdout_write(); } return $fh; } +sub close_stream +{ + # Martin A. Hansen, May 2009. + + # Close stream if open. + + my ( $fh, # filehandle + ) = @_; + + # Returns nothing. + + close $fh if defined $fh; +} + + sub get_record { # Martin A. Hansen, July 2007. @@ -144,14 +162,16 @@ sub get_record my ( $block, @lines, $line, $key, $value, %record ); + return if not defined $fh; + local $/ = "\n---\n"; $block = <$fh>; - chomp $block; - return if not defined $block; + chomp $block; + @lines = split "\n", $block; foreach $line ( @lines ) @@ -195,6 +215,399 @@ sub put_record } +sub parse_options +{ + # Martin A. Hansen, May 2009 + + # + + my ( $arg_list, # data structure with argument description + ) = @_; + + # Returns hashref. + + my ( $arg, @list, $options ); + + # ---- Adding the mandatory arguments to the arg_list ---- + + push @{ $arg_list }, ( + + { long => 'help', short => '?', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'stream_in', short => 'I', type => 'file!', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'stream_out', short => 'O', type => 'file', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'verbose', short => 'v', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + ); + + check_duplicates_args( $arg_list ); + + # ---- Compiling options list ---- + + foreach $arg ( @{ $arg_list } ) + { + if ( $arg->{ 'type' } eq 'flag' ) { + push @list, "$arg->{ 'long' }|$arg->{ 'short' }"; + } else { + push @list, "$arg->{ 'long' }|$arg->{ 'short' }=s"; + } + } + + # ---- Parsing options from @ARGV ---- + + $options = {}; + + Getopt::Long::GetOptions( $options, @list ); + + # print Dumper( $options ); + + check_print_usage( $options ); + + # ---- Expanding and checking options ---- + + foreach $arg ( @{ $arg_list } ) + { + check_mandatory( $arg, $options ); + set_default( $arg, $options ); + check_uint( $arg, $options ); + check_int( $arg, $options ); + set_list( $arg, $options ); + check_dir( $arg, $options ); + check_file( $arg, $options ); + set_files( $arg, $options ); + check_files( $arg, $options ); + check_allowed( $arg, $options ); + check_disallowed( $arg, $options ); + } + + # print Dumper( $options ); + + # return wantarray ? $options : %{ $options }; # WTF! Someone changed the behaviour of wantarray??? + + return $options; +} + + +sub check_duplicates_args +{ + # Martin A. Hansen, May 2009 + + # Check if there are duplicate long or short arguments, + # and raise an error if so. + + my ( $arg_list, # List of argument hashrefs, + ) = @_; + + # Returns nothing. + + my ( $arg, %check_hash ); + + foreach $arg ( @{ $arg_list } ) + { + Maasha::Common::error( qq(Duplicate long argument: $arg->{ 'long' }) ) if exists $check_hash{ $arg->{ 'long' } }; + Maasha::Common::error( qq(Duplicate short argument: $arg->{ 'short' }) ) if exists $check_hash{ $arg->{ 'short' } }; + + $check_hash{ $arg->{ 'long' } } = 1; + $check_hash{ $arg->{ 'short' } } = 1; + } +} + + +sub check_print_usage +{ + # Martin A. Hansen, May 2009. + + # Check if we need to print usage and print usage + # and exit if that is the case. + + my ( $options, # option hash + ) = @_; + + # Returns nothing. + + my ( $script, $wiki ); + + $script = Maasha::Common::get_scriptname(); + + if ( $script ne 'print_wiki' ) + { + # if ( exists $options{ 'help' } or not ( exists $options{ 'stream_in' } or exists $options{ 'data_in' } or not -t STDIN ) ) + if ( exists $options->{ 'help' } or ( scalar keys %{ $options } == 0 or exists $options->{ 'data_in' } or not -t STDIN ) ) + { + $wiki = $ENV{ 'BP_DIR' } . "/bp_usage/$script.wiki"; + + if ( exists $options->{ 'help' } ) { + `print_wiki --data_in=$wiki --help`; + } elsif ( $script =~ /^(list_biopieces|list_genomes)$/ ) { + return; + } else { + `print_wiki --data_in=$wiki`; + } + + exit; + } + } +} + + +sub check_mandatory +{ + # Martin A. Hansen, May 2009. + + # Check if mandatory arguments are set and raises an error if not. + + my ( $arg, # hashref + $options, # options hash + ) = @_; + + # Returns nothing. + + if ( $arg->{ 'mandatory' } eq 'yes' and not defined $options->{ $arg->{ 'long' } } ) { + Maasha::Common::error( qq(Argument --$arg->{ 'long' } is mandatory) ); + } +} + + +sub set_default +{ + # Martin A. Hansen, May 2009. + + # Set default values in option hash. + + my ( $arg, # hashref + $options, # options hash + ) = @_; + + # Returns nothing. + + if ( not defined $options->{ $arg->{ 'long' } } ) { + $options->{ $arg->{ 'long' } } = $arg->{ 'default' } + } +} + + +sub check_uint +{ + # Martin A. Hansen, May 2009. + + # Check if value to argument is an unsigned integer and + # raises an error if not. + + my ( $arg, # hashref + $options, # options hash + ) = @_; + + # Returns nothing. + + if ( $arg->{ 'type' } eq 'uint' and defined $options->{ $arg->{ 'long' } } ) + { + if ( $options->{ $arg->{ 'long' } } !~ /^\d+$/ ) { + Maasha::Common::error( qq(Argument --$arg->{ 'long' } must be an unsigned integer - not $options->{ $arg->{ 'long' } }) ); + } + } +} + + +sub check_int +{ + # Martin A. Hansen, May 2009. + + # Check if value to argument is an integer and + # raises an error if not. + + my ( $arg, # hashref + $options, # options hash + ) = @_; + + # Returns nothing. + + if ( $arg->{ 'type' } eq 'int' and defined $options{ $arg->{ 'long' } } ) + { + if ( $options->{ $arg->{ 'long' } } !~ /^-?\d+$/ ) { + Maasha::Common::error( qq(Argument --$arg->{ 'long' } must be an integer - not $options->{ $arg->{ 'long' } }) ); + } + } +} + + +sub set_list +{ + # Martin A. Hansen, May 2009. + + # Splits an argument of type 'list' into a list that is put + # in the options hash. + + my ( $arg, # hashref + $options, # options hash + ) = @_; + + # Returns nothing. + + if ( $arg->{ 'type' } eq 'list' and defined $options->{ $arg->{ 'long' } } ) { + $options->{ $arg->{ 'long' } } = [ split /,/, $options->{ $arg->{ 'long' } } ]; + } +} + + +sub check_dir +{ + # Martin A. Hansen, May 2009. + + # Check if an argument of type 'dir!' truly is a directory and + # raises an error if not. + + my ( $arg, # hashref + $options, # options hash + ) = @_; + + # Returns nothing. + + if ( $arg->{ 'type' } eq 'dir!' and defined $options->{ $arg->{ 'long' } } ) + { + if ( not -d $options->{ $arg->{ 'long' } } ) { + Maasha::Common::error( qq(No such directory: "$options->{ $arg->{ 'long' } }") ); + } + } +} + + +sub check_file +{ + # Martin A. Hansen, May 2009. + + # Check if an argument of type 'file!' truly is a file and + # raises an error if not. + + my ( $arg, # hashref + $options, # options hash + ) = @_; + + # Returns nothing. + + if ( $arg->{ 'type' } eq 'file!' and defined $options->{ $arg->{ 'long' } } ) + { + if ( not -f $options->{ $arg->{ 'long' } } ) { + Maasha::Common::error( qq(No such file: "$options->{ $arg->{ 'long' } }") ); + } + } +} + + +sub set_files +{ + # Martin A. Hansen, May 2009. + + # Split the argument to 'files' into a list that is put on the options hash. + + my ( $arg, # hashref + $options, # options hash + ) = @_; + + # Returns nothing. + + if ( $arg->{ 'type' } eq 'files' and defined $options->{ $arg->{ 'long' } } ) { + $options->{ $arg->{ 'long' } } = [ split /,/, $options->{ $arg->{ 'long' } } ]; + } +} + + +sub check_files +{ + # Martin A. Hansen, May 2009. + + # Split the argument to 'files!' and check if each file do exists before adding + # the file list to the options hash. + + my ( $arg, # hashref + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $elem, @files ); + + if ( $arg->{ 'type' } eq 'files!' and defined $options->{ $arg->{ 'long' } } ) + { + foreach $elem ( split /,/, $options->{ $arg->{ 'long' } } ) + { + if ( -f $elem ) { + push @files, $elem; + } elsif ( $elem =~ /\*/ ) { + push @files, glob( $elem ); + } + } + + if ( scalar @files == 0 ) { + Maasha::Common::error( qq(Argument to --$arg->{ 'long' } must be a valid file or fileglob expression - not $options->{ $arg->{ 'long' } }) ); + } + + $options->{ $arg->{ 'long' } } = [ @files ]; + } +} + + +sub check_allowed +{ + # Martin A. Hansen, May 2009. + + # Check if all values to all arguement are allowed and raise an + # error if not. + + my ( $arg, # hashref + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $elem ); + + if ( defined $arg->{ 'allowed' } and defined $options->{ $arg->{ 'long' } } ) + { + map { $val_hash{ $_ } = 1 } split /,/, $arg->{ 'allowed' }; + + if ( $arg->{ 'type' } =~ /^(list|files|files!)$/ ) + { + foreach $elem ( @{ $options->{ $arg->{ 'long' } } } ) + { + if ( not exists $val_hash{ $elem } ) { + Maasha::Common::error( qq(Argument to --$arg->{ 'long' } $elem is not allowed) ); + } + } + } + else + { + if ( not exists $val_hash{ $options->{ $arg->{ 'long' } } } ) { + Maasha::Common::error( qq(Argument to --$arg->{ 'long' } $options->{ $arg->{ 'long' } } is not allowed) ); + } + } + } +} + + +sub check_disallowed +{ + # Martin A. Hansen, May 2009. + + # Check if any values to all arguemnts are disallowed and raise an error if so. + + my ( $arg, # hashref + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $val, %val_hash ); + + if ( defined $arg->{ 'disallowed' } and defined $options->{ $arg->{ 'long' } } ) + { + foreach $val ( split /,/, $arg->{ 'disallowed' } ) + { + if ( $options->{ $arg->{ 'long' } } eq $val ) { + Maasha::Common::error( qq(Argument to --$arg->{ 'long' } $val is disallowed) ); + } + } + } +} + + sub getopt_files { # Martin A. Hansen, November 2007. @@ -240,11 +653,13 @@ sub sig_handler # print STDERR "signal->$sig<-\n"; + my $script = Maasha::Common::get_scriptname(); + chomp $sig; sleep 1; - if ( -d $BP_TMP ) +# if ( -d $BP_TMP ) { if ( $sig =~ /MAASHA_ERROR/ ) { print STDERR "\nProgram '$script' had an error" . " - Please wait for temporary data to be removed\n"; @@ -277,7 +692,7 @@ sub clean_tmp $curr_pid = Maasha::Common::get_processid(); - @dirs = Maasha::Common::ls_dirs( $tmpdir ); + @dirs = Maasha::Filesys::ls_dirs( $tmpdir ); foreach $dir ( @dirs ) { @@ -294,12 +709,12 @@ sub clean_tmp if ( not Maasha::Common::process_running( $pid ) ) { # print STDERR "Removing stale dir: $dir\n"; - Maasha::Common::dir_remove( $dir ); + Maasha::Filesys::dir_remove( $dir ); } elsif ( $pid == $curr_pid ) { # print STDERR "Removing current dir: $dir\n"; - Maasha::Common::dir_remove( $dir ); + Maasha::Filesys::dir_remove( $dir ); } } } @@ -307,6 +722,65 @@ sub clean_tmp } +sub run_time +{ + # Martin A. Hansen, May 2009. + + # Returns a precision timestamp for calculating + # run time. + + return Maasha::Common::get_time_hires(); +} + + +sub run_time_print +{ + # Martin A. Hansen, May 2009 + + # Print the run time to STDERR for the current script if + # the verbose switch is set in the option hash. + + my ( $t0, # run time begin + $t1, # run time end + $options, # options hash + ) = @_; + + # Returns nothing + + my $script = Maasha::Common::get_scriptname(); + + print STDERR "Program: $script" . ( " " x ( 25 - length( $script ) ) ) . sprintf( "Run time: %.4f\n", ( $t1 - $t0 ) ) if $options->{ 'verbose' }; + +} + + +sub get_tmpdir +{ + # Martin A. Hansen, April 2008. + + # Create a temporary directory based on + # $ENV{ 'BP_TMP' } and sessionid. + + # this thing is a really bad solution and needs to be removed. + + # Returns a path. + + my ( $user, $sid, $pid, $path ); + + Maasha::Common::error( qq(no BP_TMP set in %ENV) ) if not -d $ENV{ 'BP_TMP' }; + + $user = Maasha::Common::get_user(); + $sid = Maasha::Common::get_sessionid(); + $pid = Maasha::Common::get_processid(); + + $path = "$ENV{ 'BP_TMP' }/" . join( "_", $user, $sid, $pid, "bp_tmp" ); + + Maasha::Filesys::dir_create( $path ); + + return $path; +} + + END { clean_tmp(); diff --git a/code_perl/Maasha/Blast.pm b/code_perl/Maasha/Blast.pm index b89ce8c..923fccd 100644 --- a/code_perl/Maasha/Blast.pm +++ b/code_perl/Maasha/Blast.pm @@ -1,6 +1,6 @@ package Maasha::Blast; -# Copyright (C) 2007 Martin A. Hansen. +# Copyright (C) 2007-2009 Martin A. Hansen. # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License diff --git a/code_perl/Maasha/Common.pm b/code_perl/Maasha/Common.pm index 16e069f..847ff5d 100644 --- a/code_perl/Maasha/Common.pm +++ b/code_perl/Maasha/Common.pm @@ -34,6 +34,7 @@ use Carp; use Data::Dumper; use Storable; use IO::File; +use Time::HiRes qw( gettimeofday ); use Maasha::Config; use Exporter; @@ -215,7 +216,7 @@ sub read_open { # Martin A. Hansen, January 2004. - # read opens a file and returns a filehandle. + # Read opens a file and returns a filehandle. my ( $path, # full path to file ) = @_; @@ -224,7 +225,7 @@ sub read_open my ( $fh, $type ); - $type = `file $path` if $path; + $type = `file $path` if -f $path; if ( $type =~ /gzip compressed/ ) { $fh = new IO::File "zcat $path|" or Maasha::Common::error( qq(Could not read-open file "$path": $!) ); @@ -236,6 +237,46 @@ sub read_open } +sub read_open_multi +{ + # Martin A. Hansen, May 2009. + + # Cats a number of files and returns a filehandle. + + my ( $files, # full path to file + ) = @_; + + # returns filehandle + + my ( $file, $fh, $type, %type_hash, $file_string ); + + foreach $file ( @{ $files } ) + { + Maasha::Common::error( qq(No such file: $file) ) if not -f $file; + + $type = `file $file`; + + if ( $type =~ /gzip compressed/ ) { + $type_hash{ 'gzip' } = 1; + } else { + $type_hash{ 'ascii' } = 1; + } + } + + Maasha::Common::error( qq(Mixture of zipped and unzipped files) ) if scalar keys %type_hash > 1; + + $file_string = join " ", @{ $files }; + + if ( $type =~ /gzip compressed/ ) { + $fh = new IO::File "zcat $file_string|" or Maasha::Common::error( qq(Could not open pipe: $!) ); + } else { + $fh = new IO::File "cat $file_string|" or Maasha::Common::error( qq(Could not open pipe: $!) ); + } + + return $fh; +} + + sub write_open { # Martin A. Hansen, January 2004. @@ -293,34 +334,6 @@ sub pipe_open } -sub read_stdin -{ - # Martin A. Hansen, July 2007. - - # Returns a filehandle to STDIN - - my ( $fh ); - - $fh = new IO::File "<&STDIN" or Maasha::Common::error( qq(Could not read from STDIN: $!) ); - - return $fh; -} - - -sub write_stdout -{ - # Martin A. Hansen, July 2007. - - # Returns a filehandle to STDOUT - - my ( $fh ); - - $fh = new IO::File ">&STDOUT" or Maasha::Common::error( qq(Could not write to STDOUT: $!) ); - - return $fh; -} - - sub file_store { # Martin A. Hansen, December 2004. @@ -353,149 +366,6 @@ sub file_retrieve } -sub dir_create -{ - # Martin A. Hansen, July 2007. - - # Creates a directory. - - my ( $path, # full path to dir - ) = @_; - - # Returns nothing. - - if ( -d $path ) { - Maasha::Common::error( qq(Directory already exists "$path": $!) ); - } else { - mkdir $path or Maasha::Common::error( qq(Could not create directory "$path": $!) ); - } -} - - -sub dir_create_if_not_exists -{ - # Martin A. Hansen, May 2008. - - # Creates a directory if it does not already exists. - - my ( $path, # full path to dir - ) = @_; - - # Returns nothing. - - if ( not -d $path ) { - mkdir $path or Maasha::Common::error( qq(Could not create directory "$path": $!) ); - } -} - - -sub dir_remove -{ - # Martin A. Hansen, April 2008. - - # Removes a directory recursively. - - my ( $path, # directory - ) = @_; - - Maasha::Common::run( "rm", "-rf $path" ) if -d $path; -} - - -sub ls_dirs -{ - # Martin A. Hansen, June 2007. - - # returns all dirs in a given directory. - - my ( $path, # full path to directory - ) = @_; - - # returns a list of filenames. - - my ( $dh, @dirs ); - - $dh = open_dir( $path ); - - @dirs = read_dir( $dh ); - @dirs = grep { -d "$path/$_" } @dirs; - - map { $_ = "$path/$_" } @dirs; - - close $dh; - - return wantarray ? @dirs : \@dirs; -} - - -sub ls_files -{ - # Martin A. Hansen, June 2007. - - # returns all files in a given directory. - - my ( $path, # full path to directory - ) = @_; - - # returns a list of filenames. - - my ( $dh, @files ); - - $dh = open_dir( $path ); - - @files = read_dir( $dh ); - @files = grep { -f "$path/$_" } @files; - - map { $_ = "$path/$_" } @files; - - close $dh; - - return wantarray ? @files : \@files; -} - - -sub open_dir -{ - # Martin A. Hansen, June 2007. - - # open a directory and returns a directory handle - - use IO::Dir; - - my ( $path, # full path to directory - ) = @_; - - # returns object - - my $dh; - - $dh = IO::Dir->new( $path ) or Maasha::Common::error( qq(Could not open dir "$path": $!) ); - - return $dh; -} - - -sub read_dir -{ - # Martin A. Hansen, June 2007. - - # read all files and directories from a directory. - - my ( $dh, # directory handle object - ) = @_; - - # returns list - - my ( $elem, @elems ); - - while ( defined( $elem = $dh->read ) ) { - push @elems, $elem; - } - - return wantarray ? @elems : \@elems; -} - - sub read_args { # Martin A. Hansen, December 2006 @@ -558,6 +428,18 @@ sub get_time } +sub get_time_hires +{ + # Martin A. Hansen, May 2008. + + # Get current time in high resolution. + + # Returns a float. + + return gettimeofday(); +} + + sub get_processid { # Martin A. Hansen, July 2008. @@ -615,7 +497,7 @@ sub get_tmpdir $path = "$ENV{ 'BP_TMP' }/" . join( "_", $user, $sid, $pid, "bp_tmp" ); - Maasha::Common::dir_create( $path ); + Maasha::Filesys::dir_create( $path ); return $path; } @@ -681,48 +563,6 @@ sub get_fields } -sub file_read -{ - # Martin A. Hansen, December 2004. - - # given a file, a seek beg position and - # length, returns the corresponding string. - - my ( $fh, # file handle to file - $beg, # read start in file - $len, # read length of block - ) = @_; - - # returns string - - my ( $string ); - - Maasha::Common::error( qq(Negative length: $len) ) if $len < 0; - - sysseek $fh, $beg, 0; - sysread $fh, $string, $len; - - return $string; -} - - -sub file_size -{ - # Martin A. Hansen, March 2007 - - # returns the file size for a given file - - my ( $path, # full path to file - ) = @_; - - # returns integer - - my $file_size = ( stat ( $path ) )[ 7 ]; - - return $file_size; -} - - sub run { # Martin A. Hansen, April 2007. diff --git a/code_perl/Maasha/Fasta.pm b/code_perl/Maasha/Fasta.pm index c4920a8..a81a546 100644 --- a/code_perl/Maasha/Fasta.pm +++ b/code_perl/Maasha/Fasta.pm @@ -1,6 +1,6 @@ package Maasha::Fasta; -# Copyright (C) 2006 Martin A. Hansen. +# Copyright (C) 2006-2009 Martin A. Hansen. # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License @@ -37,7 +37,7 @@ use vars qw ( @ISA @EXPORT ); @ISA = qw( Exporter ); use constant { - HEAD => 0, + SEQ_NAME => 0, SEQ => 1, }; @@ -205,7 +205,7 @@ sub put_entry # Returns nothing. - Maasha::Common::error( qq(FASTA entry has no header) ) if not defined $entry->[ HEAD ]; + Maasha::Common::error( qq(FASTA entry has no header) ) if not defined $entry->[ SEQ_NAME ]; Maasha::Common::error( qq(FASTA entry has no sequence) ) if not defined $entry->[ SEQ ]; if ( $wrap ) { @@ -213,9 +213,9 @@ sub put_entry } if ( defined $fh ) { - print $fh ">$entry->[ HEAD ]\n$entry->[ SEQ ]\n"; + print $fh ">$entry->[ SEQ_NAME ]\n$entry->[ SEQ ]\n"; } else { - print ">$entry->[ HEAD ]\n$entry->[ SEQ ]\n"; + print ">$entry->[ SEQ_NAME ]\n$entry->[ SEQ ]\n"; } } @@ -295,7 +295,7 @@ sub fasta_get_headers $fh = Maasha::Common::read_open( $path ); while ( $entry = get_entry( $fh ) ) { - push @list, $entry->[ HEAD ]; + push @list, $entry->[ SEQ_NAME ]; } close $fh; @@ -376,14 +376,14 @@ sub index_create while ( $entry = get_entry( $fh ) ) { - warn qq(WARNING: header->$entry->[ HEAD ] alread exists in index) if exists $hash{ $entry->[ HEAD ] }; + warn qq(WARNING: header->$entry->[ SEQ_NAME ] alread exists in index) if exists $hash{ $entry->[ SEQ_NAME ] }; - $beg += $len + 2 + length $entry->[ HEAD ]; + $beg += $len + 2 + length $entry->[ SEQ_NAME ]; $len = length $entry->[ SEQ ]; - push @index, [ $entry->[ HEAD ], $beg, $len ]; + push @index, [ $entry->[ SEQ_NAME ], $beg, $len ]; - $hash{ $entry->[ HEAD ] } = 1; + $hash{ $entry->[ SEQ_NAME ] } = 1; $beg++; } @@ -514,4 +514,31 @@ sub biopiece2fasta } +sub fasta2biopiece +{ + # Martin A. Hansen, May 2009 + + # Convert a FASTA record to a Biopiece record. + + my ( $entry, # FASTA entry + ) = @_; + + # Returns a hashref. + + my ( $record ); + + if ( defined $entry->[ SEQ_NAME ] and $entry->[ SEQ ] ) + { + $record = { + SEQ_NAME => $entry->[ SEQ_NAME ], + SEQ => $entry->[ SEQ ], + SEQ_LEN => length $entry->[ SEQ ], + }; + } + + return wantarray ? %{ $record } : $record; +} + + + # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< diff --git a/code_perl/Maasha/Filesys.pm b/code_perl/Maasha/Filesys.pm index 229c7c3..cb27c95 100644 --- a/code_perl/Maasha/Filesys.pm +++ b/code_perl/Maasha/Filesys.pm @@ -109,6 +109,59 @@ sub file_append_open } +sub stdin_read +{ + # Martin A. Hansen, July 2007. + + # Returns a filehandle to STDIN + + my ( $fh ); + + $fh = new IO::File "<&STDIN" or Maasha::Common::error( qq(Could not read from STDIN: $!) ); + + return $fh; +} + + +sub stdout_write +{ + # Martin A. Hansen, July 2007. + + # Returns a filehandle to STDOUT + + my ( $fh ); + + $fh = new IO::File ">&STDOUT" or Maasha::Common::error( qq(Could not write to STDOUT: $!) ); + + return $fh; +} + + +sub file_read +{ + # Martin A. Hansen, December 2004. + + # given a file, a seek beg position and + # length, returns the corresponding string. + + my ( $fh, # file handle to file + $beg, # read start in file + $len, # read length of block + ) = @_; + + # returns string + + my ( $string ); + + Maasha::Common::error( qq(Negative length: $len) ) if $len < 0; + + sysseek $fh, $beg, 0; + sysread $fh, $string, $len; + + return $string; +} + + sub file_copy { # Martin A. Hansen, November 2008. @@ -181,6 +234,152 @@ sub file_size } +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DIRECTORIES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +sub dir_create +{ + # Martin A. Hansen, July 2007. + + # Creates a directory. + + my ( $path, # full path to dir + ) = @_; + + # Returns nothing. + + if ( -d $path ) { + Maasha::Common::error( qq(Directory already exists "$path": $!) ); + } else { + mkdir $path or Maasha::Common::error( qq(Could not create directory "$path": $!) ); + } +} + + +sub dir_create_if_not_exists +{ + # Martin A. Hansen, May 2008. + + # Creates a directory if it does not already exists. + + my ( $path, # full path to dir + ) = @_; + + # Returns nothing. + + if ( not -d $path ) { + mkdir $path or Maasha::Common::error( qq(Could not create directory "$path": $!) ); + } +} + + +sub dir_remove +{ + # Martin A. Hansen, April 2008. + + # Removes a directory recursively. + + my ( $path, # directory + ) = @_; + + Maasha::Common::run( "rm", "-rf $path" ) if -d $path; +} + + +sub ls_dirs +{ + # Martin A. Hansen, June 2007. + + # returns all dirs in a given directory. + + my ( $path, # full path to directory + ) = @_; + + # returns a list of filenames. + + my ( $dh, @dirs ); + + $dh = open_dir( $path ); + + @dirs = read_dir( $dh ); + @dirs = grep { -d "$path/$_" } @dirs; + + map { $_ = "$path/$_" } @dirs; + + close $dh; + + return wantarray ? @dirs : \@dirs; +} + + +sub ls_files +{ + # Martin A. Hansen, June 2007. + + # returns all files in a given directory. + + my ( $path, # full path to directory + ) = @_; + + # returns a list of filenames. + + my ( $dh, @files ); + + $dh = open_dir( $path ); + + @files = read_dir( $dh ); + @files = grep { -f "$path/$_" } @files; + + map { $_ = "$path/$_" } @files; + + close $dh; + + return wantarray ? @files : \@files; +} + + +sub open_dir +{ + # Martin A. Hansen, June 2007. + + # open a directory and returns a directory handle + + use IO::Dir; + + my ( $path, # full path to directory + ) = @_; + + # returns object + + my $dh; + + $dh = IO::Dir->new( $path ) or Maasha::Common::error( qq(Could not open dir "$path": $!) ); + + return $dh; +} + + +sub read_dir +{ + # Martin A. Hansen, June 2007. + + # read all files and directories from a directory. + + my ( $dh, # directory handle object + ) = @_; + + # returns list + + my ( $elem, @elems ); + + while ( defined( $elem = $dh->read ) ) { + push @elems, $elem; + } + + return wantarray ? @elems : \@elems; +} + + # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< diff --git a/code_perl/Maasha/Gwiki.pm b/code_perl/Maasha/Gwiki.pm index cd65868..267f960 100644 --- a/code_perl/Maasha/Gwiki.pm +++ b/code_perl/Maasha/Gwiki.pm @@ -32,6 +32,7 @@ use strict; use Data::Dumper; use Term::ANSIColor; use Maasha::Common; +use Maasha::Filesys; use vars qw ( @ISA @EXPORT ); @ISA = qw( Exporter ); @@ -135,7 +136,7 @@ sub gwiki_read my ( $fh, @lines, $i, $c, $section, $paragraph, @block, @output ); - $fh = Maasha::Common::read_open( $file ); + $fh = Maasha::Filesys::file_read_open( $file ); @lines = <$fh>; @@ -243,7 +244,7 @@ sub gwiki_read $c = $i; - while ( $lines[ $c ] !~ /^\s*$/ ) + while ( defined $lines[ $c ] and $lines[ $c ] !~ /^\s*$/ ) { $paragraph .= " $lines[ $c ]"; diff --git a/code_perl/Maasha/Match.pm b/code_perl/Maasha/Match.pm index c9b5bd7..7d6e0c0 100644 --- a/code_perl/Maasha/Match.pm +++ b/code_perl/Maasha/Match.pm @@ -32,6 +32,7 @@ use strict; use Data::Dumper; use Storable qw( dclone ); use Maasha::Common; +use Maasha::Filesys; use Maasha::Fasta; use Maasha::Seq; use Maasha::Biopieces; @@ -92,7 +93,7 @@ sub match_mummer Maasha::Common::run( "mummer", "$arg $file_in1 $file_in2 > $file_out 2>/dev/null" ); - $fh = Maasha::Common::read_open( $file_out ); + $fh = Maasha::Filesys::file_read_open( $file_out ); while ( $line = <$fh> ) { @@ -132,185 +133,6 @@ sub match_mummer } -sub match_vmatch -{ - # Martin A. Hansen, April 2008. - - # Vmatches a list of records against a list of index files and the full - # path to the result file is returned. - - my ( $tmp_dir, # directory in where to save temp files - $records, # list of records - $index_files, # list of index files - $options, # argument hash - ) = @_; - - # Returns a string. - - my ( $query_file, $result_file, @result_files, $fh_in, $fh_out, $line, @fields, $i, $record, $vmatch_args, @index_names, @seq_names, $count_list, $entry ); - - $query_file = "$tmp_dir/query.seq"; - $result_file = "$tmp_dir/vmatch.out"; - - $fh_out = Maasha::Common::write_open( $query_file ); - - foreach $record ( @{ $records } ) - { - if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) - { - next if length $entry->[ SEQ ] < 12; # assuming that the index is created for 12 as minimum length - - push @seq_names, $entry->[ SEQ_NAME ]; - - Maasha::Fasta::put_entry( $entry, $fh_out, 100 ); - } - } - - close $fh_out; - - if ( $options->{ 'genome' } ) { - $vmatch_args = "-complete -d -p -q $query_file"; - } else { - $vmatch_args = "-complete -d -p -showdesc 100 -q $query_file"; - } - - $vmatch_args .= " -h " . $options->{ "hamming_dist" } if $options->{ "hamming_dist" }; - $vmatch_args .= " -e " . $options->{ "edit_dist" } if $options->{ "edit_dist" }; - - for ( $i = 0; $i < @{ $index_files }; $i++ ) - { - Maasha::Common::run( "vmatch", "$vmatch_args $index_files->[ $i ] > $result_file.$i" ); - - push @result_files, "$result_file.$i"; - } - - unlink $query_file; - - $count_list = vmatch_count_hits( \@result_files ) if ( $options->{ "count" } ); - - $fh_out = Maasha::Common::write_open( $result_file ); - - for ( $i = 0; $i < @{ $index_files }; $i++ ) - { - $index_files->[ $i ] =~ s/.+\/(.+)\.fna$/$1/ if $options->{ 'genome' }; - - $fh_in = Maasha::Common::read_open( "$result_file.$i" ); - - while ( $line = <$fh_in> ) - { - chomp $line; - - next if $line =~ /^#/; - - @fields = split " ", $line; - - next if $options->{ "max_hits" } and $count_list->[ $fields[ 5 ] ] > $options->{ 'max_hits' }; - - $fields[ 1 ] = $index_files->[ $i ]; # S_ID - $fields[ 9 ] = int $fields[ 9 ]; - $fields[ 9 ] = $count_list->[ $fields[ 5 ] ] if $options->{ "count" }; # SCORE - $fields[ 5 ] = $seq_names[ $fields[ 5 ] ]; # Q_ID - - print $fh_out join( "\t", @fields ), "\n"; - } - - close $fh_in; - - unlink "$result_file.$i"; - } - - close $fh_out; - - return $result_file; -} - - -sub vmatch_count_hits -{ - # Martin A. Hansen, April 2008. - - # Given a list of Vmatch result file, count duplications based - # on q_id. The counts are returned in a list where the list index - # corresponds to the q_id index in the query file. - - my ( $files, # vmatch result files - ) = @_; - - # Returns a list. - - my ( $file, $fh_in, $line, @fields, @count_list ); - - foreach $file ( @{ $files } ) - { - $fh_in = Maasha::Common::read_open( $file ); - - while ( $line = <$fh_in> ) - { - chomp $line; - - next if $line =~ /^#/; - - @fields = split " ", $line; - - $count_list[ $fields[ 5 ] ]++; - } - - close $fh_in; - } - - return wantarray ? @count_list : \@count_list; -} - - -sub vmatch_get_entry -{ - # Martin A. Hansen, January 2008. - - # Parses vmatch output records. - - my ( $fh, # file handle to vmatch result file. - ) = @_; - - # Returns a hash. - - my ( $line, @fields, %record ); - - while ( $line = <$fh> ) - { - chomp $line; - - next if $line =~ /^#/; - - @fields = split "\t", $line; - - $record{ "REC_TYPE" } = "VMATCH"; - - $record{ "S_LEN" } = $fields[ 0 ]; - $record{ "S_ID" } = $fields[ 1 ]; - $record{ "S_BEG" } = $fields[ 2 ]; - - if ( $fields[ 3 ] eq "D" ) { - $record{ "STRAND" } = "+"; - } else { - $record{ "STRAND" } = "-"; - } - - $record{ "Q_LEN" } = $fields[ 4 ]; - $record{ "Q_ID" } = $fields[ 5 ]; - $record{ "Q_BEG" } = $fields[ 6 ]; - $record{ "MATCH_DIST" } = $fields[ 7 ]; - $record{ "E_VAL" } = $fields[ 8 ]; - $record{ "SCORE" } = $fields[ 9 ]; - $record{ "IDENT" } = $fields[ 10 ]; - - $record{ "Q_END" } = $record{ "Q_BEG" } + $record{ "Q_LEN" } - 1; - $record{ "S_END" } = $record{ "S_BEG" } + $record{ "S_LEN" } - 1; - - return wantarray ? %record : \%record; - } -} - - sub vmatch_index { # Martin A. Hansen, July 2008. @@ -327,7 +149,7 @@ sub vmatch_index my ( $fh, $entry, $tmp_file ); - Maasha::Common::dir_create_if_not_exists( $dst_dir ); + Maasha::Filesys::dir_create_if_not_exists( $dst_dir ); # if ( Maasha::Common::file_size( $file ) < 200_000_000 ) # { @@ -335,15 +157,15 @@ sub vmatch_index # } # else # { - $fh = Maasha::Common::read_open( "$src_dir/$file" ); + $fh = Maasha::Filesys::file_read_open( "$src_dir/$file" ); while ( $entry = Maasha::Fasta::get_entry( $fh ) ) { $tmp_file = $entry->[ SEQ_NAME ] . ".fna"; - Maasha::Fasta::put_entries( [ $entry ], "$tmp_dir/$tmp_file" ) + Maasha::Fasta::put_entries( [ $entry ], "$tmp_dir/$tmp_file" ); - &Maasha::Common::run( "mkvtree", "-db $tmp_dir/$tmp_file -dna -pl -allout -indexname $dst_dir/$tmp_file > /dev/null 3>&1" ); + Maasha::Common::run( "mkvtree", "-db $tmp_dir/$tmp_file -dna -pl -allout -indexname $dst_dir/$tmp_file > /dev/null 3>&1" ); unlink "$tmp_dir/$tmp_file"; } diff --git a/code_python/Cjung/Args.pyc b/code_python/Cjung/Args.pyc index 4e8e789..8d4a9b2 100644 Binary files a/code_python/Cjung/Args.pyc and b/code_python/Cjung/Args.pyc differ