From fc6aa2a75a62c9da33195bbdf058c425235fe00c Mon Sep 17 00:00:00 2001 From: martinahansen Date: Thu, 4 Jun 2009 19:14:37 +0000 Subject: [PATCH] almost there git-svn-id: http://biopieces.googlecode.com/svn/trunk@475 74ccb610-7750-0410-82ae-013aeee3265d --- bp_bin/blast_seq | 6 +- bp_bin/read_stockholm | 120 +++++++++++++++++++++++- bp_bin/soap_seq | 155 ++++++++++++++++++++++++++++++- code_perl/Maasha/BioRun.pm | 181 ------------------------------------- 4 files changed, 272 insertions(+), 190 deletions(-) diff --git a/bp_bin/blast_seq b/bp_bin/blast_seq index bcb9df7..1823216 100755 --- a/bp_bin/blast_seq +++ b/bp_bin/blast_seq @@ -124,7 +124,7 @@ else ); } -# unlink $tmp_in; +unlink $tmp_in; $fh_in = Maasha::Filesys::file_read_open( $tmp_out ); @@ -137,9 +137,9 @@ while ( $entry = get_tab_entry( $fh_in ) ) close $fh_out; -# unlink $tmp_out; +unlink $tmp_out; -# Maasha::Filesys::dir_remove( $tmp_dir ); +Maasha::Filesys::dir_remove( $tmp_dir ); # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SUBROUTINES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< diff --git a/bp_bin/read_stockholm b/bp_bin/read_stockholm index fdf5bd2..530ddd4 100755 --- a/bp_bin/read_stockholm +++ b/bp_bin/read_stockholm @@ -1,6 +1,120 @@ -#!/usr/bin/env perl +#!/usr/bin/env perl -w + +# Copyright (C) 2007-2009 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+ +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# Read Stockholm entries from one or more files. + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + -use warnings; use strict; +use Maasha::Biopieces; +use Maasha::Filesys; +use Maasha::Stockholm; + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +my ( $run_time_beg, $run_time_end, $options, $in, $out, $record, $num, $entry, $data_in, $record_anno, $key, $seq, $record_align ); + +$options = Maasha::Biopieces::parse_options( + [ + { long => 'data_in', short => 'i', type => 'files!', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'num', short => 'n', type => 'uint', mandatory => 'no', default => undef, allowed => undef, disallowed => '0' }, + ] +); + +$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } ); +$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } ); + +while ( $record = Maasha::Biopieces::get_record( $in ) ) { + Maasha::Biopieces::put_record( $record, $out ); +} + +if ( $options->{ 'data_in' } ) +{ + $data_in = Maasha::Filesys::files_read_open( $options->{ 'data_in' } ); + + $num = 1; + + while ( $entry = Maasha::Stockholm::get_stockholm_entry( $data_in ) ) + { + $record = Maasha::Stockholm::parse_stockholm_entry( $entry ); + + undef $record_anno; + + foreach $key ( keys %{ $record->{ "GF" } } ) { + $record_anno->{ $key } = $record->{ "GF" }->{ $key }; + } + + $record_anno->{ "ALIGN" } = $num; + + Maasha::Biopieces::put_record( $record_anno, $out ); + + foreach $seq ( @{ $record->{ "ALIGN" } } ) + { + undef $record_align; + + 
$record_align = { + SEQ_NAME => $seq->[ 0 ], + SEQ => $seq->[ 1 ], + }; + + Maasha::Biopieces::put_record( $record_align, $out ); + } + + last if $options->{ "num" } and $num == $options->{ "num" }; # stop reading once --num entries are emitted + + $num++; + } + + close $data_in; +} + + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +BEGIN +{ + $run_time_beg = Maasha::Biopieces::run_time(); + + Maasha::Biopieces::log_biopiece(); +} + +END +{ + Maasha::Biopieces::close_stream( $in ); + Maasha::Biopieces::close_stream( $out ); + + $run_time_end = Maasha::Biopieces::run_time(); + + Maasha::Biopieces::run_time_print( $run_time_beg, $run_time_end, $options ); +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + -use Maasha::BioRun; +__END__ diff --git a/bp_bin/soap_seq b/bp_bin/soap_seq index fdf5bd2..181e4fd 100755 --- a/bp_bin/soap_seq +++ b/bp_bin/soap_seq @@ -1,6 +1,155 @@ -#!/usr/bin/env perl +#!/usr/bin/env perl -w + +# Copyright (C) 2007-2009 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# Use soap to match short nucleotide sequences in the stream against a specified genome. 
+ +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + -use warnings; use strict; +use Maasha::Biopieces; +use Maasha::Common; +use Maasha::Filesys; +use Maasha::Fasta; + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +my ( $run_time_beg, $run_time_end, $options, $in, $out, $tmp_dir, $tmp_in, $tmp_out, $fh_out, $record, $entry, $count, $args, $line, @fields ); + +$options = Maasha::Biopieces::parse_options( + [ + { long => 'in_file', short => 'i', type => 'file', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'genome', short => 'g', type => 'genome', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'seed_size', short => 's', type => 'uint', mandatory => 'no', default => 10, allowed => undef, disallowed => undef }, + { long => 'mismatches', short => 'm', type => 'uint', mandatory => 'no', default => 2, allowed => undef, disallowed => undef }, + { long => 'gap_size', short => 'G', type => 'uint', mandatory => 'no', default => 0, allowed => undef, disallowed => undef }, + { long => 'cpus', short => 'c', type => 'uint', mandatory => 'no', default => 1, allowed => undef, disallowed => 0 }, + ] +); + +Maasha::Common::error( qq(both --in_file and --genome specified) ) if $options->{ "genome" } and $options->{ "in_file" }; +Maasha::Common::error( qq(no --in_file or --genome specified) ) if not $options->{ "genome" } and not $options->{ "in_file" }; + +$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } ); +$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } ); + +$options->{ "in_file" } = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/fasta/$options->{ 'genome' }.fna" if $options->{ 'genome' }; + +$tmp_dir = Maasha::Biopieces::get_tmpdir(); +$tmp_in = "$tmp_dir/soap_query.seq"; +$tmp_out = "$tmp_dir/soap.result"; + +$fh_out = Maasha::Common::write_open( $tmp_in ); + 
+$count = 0; + +while ( $record = Maasha::Biopieces::get_record( $in ) ) +{ + if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) + { + Maasha::Fasta::put_entry( $entry, $fh_out ); + + $count++; + } + + Maasha::Biopieces::put_record( $record, $out ); +} + +close $fh_out; + +if ( $count > 0 ) +{ + $args = join( " ", + "-s $options->{ 'seed_size' }", + "-r 2", + "-a $tmp_in", + "-v $options->{ 'mismatches' }", + "-g $options->{ 'gap_size' }", + "-p $options->{ 'cpus' }", + "-d $options->{ 'in_file' }", + "-o $tmp_out", + ); + + $args .= " > /dev/null 2>&1" if not $options->{ 'verbose' }; + + Maasha::Common::run( "soap", $args, 1 ); + + unlink $tmp_in; + + $fh_out = Maasha::Filesys::file_read_open( $tmp_out ); + + undef $record; + + while ( $line = <$fh_out> ) + { + chomp $line; + + @fields = split /\t/, $line; + + $record->{ "REC_TYPE" } = "SOAP"; + $record->{ "Q_ID" } = $fields[ 0 ]; + $record->{ "SCORE" } = $fields[ 3 ]; + $record->{ "STRAND" } = $fields[ 6 ]; + $record->{ "S_ID" } = $fields[ 7 ]; + $record->{ "S_BEG" } = $fields[ 8 ] - 1; # soap is 1-based + $record->{ "S_END" } = $fields[ 8 ] + $fields[ 5 ] - 2; + + Maasha::Biopieces::put_record( $record, $out ); + } + + close $fh_out; +} + +unlink $tmp_out; + +Maasha::Filesys::dir_remove( $tmp_dir ); + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +BEGIN +{ + $run_time_beg = Maasha::Biopieces::run_time(); + + Maasha::Biopieces::log_biopiece(); +} + +END +{ + Maasha::Biopieces::close_stream( $in ); + Maasha::Biopieces::close_stream( $out ); + + $run_time_end = Maasha::Biopieces::run_time(); + + Maasha::Biopieces::run_time_print( $run_time_beg, $run_time_end, $options ); +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +__END__ -use Maasha::BioRun; diff --git a/code_perl/Maasha/BioRun.pm b/code_perl/Maasha/BioRun.pm index 16b6df9..86036f8 100644 --- a/code_perl/Maasha/BioRun.pm +++ 
b/code_perl/Maasha/BioRun.pm @@ -40,7 +40,6 @@ use Maasha::Common; use Maasha::Filesys; use Maasha::Fasta; use Maasha::EMBL; -use Maasha::Stockholm; use Maasha::Seq; use Maasha::Calc; use Maasha::UCSC; @@ -111,12 +110,9 @@ sub run_script $out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } ); if ( $script eq "print_usage" ) { script_print_usage( $in, $out, $options ) } - elsif ( $script eq "read_stockholm" ) { script_read_stockholm( $in, $out, $options ) } - elsif ( $script eq "uppercase_seq" ) { script_uppercase_seq( $in, $out, $options ) } elsif ( $script eq "complexity_seq" ) { script_complexity_seq( $in, $out, $options ) } elsif ( $script eq "get_genome_align" ) { script_get_genome_align( $in, $out, $options ) } elsif ( $script eq "get_genome_phastcons" ) { script_get_genome_phastcons( $in, $out, $options ) } - elsif ( $script eq "soap_seq" ) { script_soap_seq( $in, $out, $options ) } elsif ( $script eq "remove_mysql_tables" ) { script_remove_mysql_tables( $in, $out, $options ) } elsif ( $script eq "remove_ucsc_tracks" ) { script_remove_ucsc_tracks( $in, $out, $options ) } elsif ( $script eq "upload_to_ucsc" ) { script_upload_to_ucsc( $in, $out, $options ) } @@ -149,13 +145,6 @@ sub get_options data_in|i=s ); } - elsif ( $script eq "read_stockholm" ) - { - @options = qw( - data_in|i=s - num|n=s - ); - } elsif ( $script eq "get_genome_align" ) { @options = qw( @@ -178,17 +167,6 @@ sub get_options flank|f=s ); } - elsif ( $script eq "soap_seq" ) - { - @options = qw( - in_file|i=s - genome|g=s - seed_size|s=s - mismatches|m=s - gap_size|G=s - cpus|c=s - ); - } elsif ( $script eq "remove_mysql_tables" ) { @options = qw( @@ -325,8 +303,6 @@ sub get_options } Maasha::Common::error( qq(no --database specified) ) if $script eq "remove_ucsc_tracks" and not $options{ "database" }; - Maasha::Common::error( qq(no --in_file or --genome specified) ) if $script eq "soap_seq" and not $options{ "genome" } and not $options{ "in_file" }; - Maasha::Common::error( 
qq(both --in_file and --genome specified) ) if $script eq "soap_seq" and $options{ "genome" } and $options{ "in_file" }; Maasha::Common::error( qq(no --genome specified) ) if $script =~ /get_genome_align|get_genome_phastcons/ and not $options{ "genome" }; if ( $script eq "upload_to_ucsc" ) @@ -380,71 +356,6 @@ sub script_print_usage } -sub script_read_stockholm -{ - # Martin A. Hansen, August 2007. - - # Read Stockholm format. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $data_in, $file, $num, $entry, $record, $record_anno, $record_align, $key, $seq ); - - while ( $record = Maasha::Biopieces::get_record( $in ) ) { - Maasha::Biopieces::put_record( $record, $out ); - } - - $num = 1; - - foreach $file ( @{ $options->{ "files" } } ) - { - $data_in = Maasha::Common::read_open( $file ); - - while ( $entry = Maasha::Stockholm::get_stockholm_entry( $data_in ) ) - { - $record = Maasha::Stockholm::parse_stockholm_entry( $entry ); - - undef $record_anno; - - foreach $key ( keys %{ $record->{ "GF" } } ) { - $record_anno->{ $key } = $record->{ "GF" }->{ $key }; - } - - $record_anno->{ "ALIGN" } = $num; - - Maasha::Biopieces::put_record( $record_anno, $out ); - - foreach $seq ( @{ $record->{ "ALIGN" } } ) - { - undef $record_align; - - $record_align = { - SEQ_NAME => $seq->[ 0 ], - SEQ => $seq->[ 1 ], - }; - - Maasha::Biopieces::put_record( $record_align, $out ); - } - - goto NUM if $options->{ "num" } and $num == $options->{ "num" }; - - $num++; - } - - close $data_in; - } - - NUM: - - close $data_in if $data_in; -} - - sub script_complexity_seq { # Martin A. Hansen, May 2008. @@ -620,98 +531,6 @@ sub script_get_genome_phastcons } -sub script_soap_seq -{ - # Martin A. Hansen, July 2008. - - # soap sequences in stream against a given file or genome. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. 
- - my ( $genome, $tmp_in, $tmp_out, $fh_in, $fh_out, $record, $line, @fields, $entry, $count, $args ); - - $options->{ "seed_size" } ||= 10; - $options->{ "mismatches" } ||= 2; - $options->{ "gap_size" } ||= 0; - $options->{ "cpus" } ||= 1; - - if ( $options->{ "genome" } ) { - $options->{ "in_file" } = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/fasta/$options->{ 'genome' }.fna"; - } - - $tmp_in = "$BP_TMP/soap_query.seq"; - $tmp_out = "$BP_TMP/soap.result"; - - $fh_out = Maasha::Common::write_open( $tmp_in ); - - $count = 0; - - while ( $record = Maasha::Biopieces::get_record( $in ) ) - { - if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) - { - Maasha::Fasta::put_entry( $entry, $fh_out ); - - $count++; - } - - Maasha::Biopieces::put_record( $record, $out ); - } - - close $fh_out; - - if ( $count > 0 ) - { - $args = join( " ", - "-s $options->{ 'seed_size' }", - "-r 2", - "-a $tmp_in", - "-v $options->{ 'mismatches' }", - "-g $options->{ 'gap_size' }", - "-p $options->{ 'cpus' }", - "-d $options->{ 'in_file' }", - "-o $tmp_out", - ); - - $args .= " > /dev/null 2>&1" if not $options->{ 'verbose' }; - - Maasha::Common::run( "soap", $args, 1 ); - - unlink $tmp_in; - - $fh_out = Maasha::Common::read_open( $tmp_out ); - - undef $record; - - while ( $line = <$fh_out> ) - { - chomp $line; - - @fields = split /\t/, $line; - - $record->{ "REC_TYPE" } = "SOAP"; - $record->{ "Q_ID" } = $fields[ 0 ]; - $record->{ "SCORE" } = $fields[ 3 ]; - $record->{ "STRAND" } = $fields[ 6 ]; - $record->{ "S_ID" } = $fields[ 7 ]; - $record->{ "S_BEG" } = $fields[ 8 ] - 1; # soap is 1-based - $record->{ "S_END" } = $fields[ 8 ] + $fields[ 5 ] - 2; - - Maasha::Biopieces::put_record( $record, $out ); - } - - close $fh_out; - } - - unlink $tmp_out; -} - - sub script_remove_mysql_tables { # Martin A. Hansen, November 2008. -- 2.39.5