);
}
-# unlink $tmp_in;
+unlink $tmp_in;
$fh_in = Maasha::Filesys::file_read_open( $tmp_out );
close $fh_out;
-# unlink $tmp_out;
+unlink $tmp_out;
-# Maasha::Filesys::dir_remove( $tmp_dir );
+Maasha::Filesys::dir_remove( $tmp_dir );
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SUBROUTINES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-#!/usr/bin/env perl
+#!/usr/bin/env perl -w
+
+# Copyright (C) 2007-2009 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# Read Stockholm entries from one or more files.
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
-use warnings;
use strict;
+use Maasha::Biopieces;
+use Maasha::Filesys;
+use Maasha::Stockholm;
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+my ( $run_time_beg, $run_time_end, $options, $in, $out, $record, $num, $entry, $data_in, $record_anno, $key, $seq, $record_align ); # file-scoped lexicals; some are also used by the BEGIN and END blocks
+# Command line options, both optional: --data_in/-i (type 'files!') and --num/-n (type 'uint', the value 0 is disallowed).
+$options = Maasha::Biopieces::parse_options(
+ [
+ { long => 'data_in', short => 'i', type => 'files!', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+ { long => 'num', short => 'n', type => 'uint', mandatory => 'no', default => undef, allowed => undef, disallowed => '0' },
+ ]
+);
+# Open the record streams named by the stream_in and stream_out entries of the options hash.
+$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } );
+$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } );
+# Copy every record from the input stream to the output stream before any Stockholm data is emitted.
+while ( $record = Maasha::Biopieces::get_record( $in ) ) {
+ Maasha::Biopieces::put_record( $record, $out );
+}
+
+# Read Stockholm entries from the --data_in file(s), if any were given, and emit
+# them as records on the output stream.
+if ( $options->{ 'data_in' } )
+{
+    # NOTE(review): a single handle is used for everything named by --data_in;
+    # presumably files_read_open concatenates the files - confirm in Maasha::Filesys.
+    $data_in = Maasha::Filesys::files_read_open( $options->{ 'data_in' } );
+
+    $num = 1;    # 1-based number of the entry being emitted
+
+    while ( $entry = Maasha::Stockholm::get_stockholm_entry( $data_in ) )
+    {
+        $record = Maasha::Stockholm::parse_stockholm_entry( $entry );
+
+        # Annotation record: every GF key/value of the entry plus the entry
+        # number stored under the ALIGN key.
+        undef $record_anno;
+
+        foreach $key ( keys %{ $record->{ "GF" } } ) {
+            $record_anno->{ $key } = $record->{ "GF" }->{ $key };
+        }
+
+        $record_anno->{ "ALIGN" } = $num;
+
+        Maasha::Biopieces::put_record( $record_anno, $out );
+
+        # One record per aligned sequence; each element of ALIGN is a
+        # [ name, sequence ] pair.
+        foreach $seq ( @{ $record->{ "ALIGN" } } )
+        {
+            undef $record_align;
+
+            $record_align = {
+                SEQ_NAME => $seq->[ 0 ],
+                SEQ      => $seq->[ 1 ],
+            };
+
+            Maasha::Biopieces::put_record( $record_align, $out );
+        }
+
+        # Stop after --num entries. This script has no NUM label to jump to,
+        # so leave the loop with 'last'; the handle is closed right below.
+        last if $options->{ "num" } and $num == $options->{ "num" };
+
+        $num++;
+    }
+
+    close $data_in;
+}
+
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+BEGIN # runs during compilation, i.e. before the script body above is executed
+{
+ $run_time_beg = Maasha::Biopieces::run_time(); # kept until END, where it is passed to run_time_print
+ # NOTE(review): log_biopiece() presumably records this invocation - confirm in Maasha::Biopieces.
+ Maasha::Biopieces::log_biopiece();
+}
+
+END # runs as the interpreter exits
+{
+ Maasha::Biopieces::close_stream( $in ); # NOTE(review): $in and $out are still undef if the script exits before the streams are opened - confirm close_stream tolerates that
+ Maasha::Biopieces::close_stream( $out );
+ # The end value is taken only after both streams have been closed.
+ $run_time_end = Maasha::Biopieces::run_time();
+
+ Maasha::Biopieces::run_time_print( $run_time_beg, $run_time_end, $options );
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
-use Maasha::BioRun;
+__END__
-#!/usr/bin/env perl
+#!/usr/bin/env perl -w
+
+# Copyright (C) 2007-2009 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# Use soap to match short nucleotide sequences in the stream against a specified file or genome.
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
-use warnings;
use strict;
+use Maasha::Biopieces;
+use Maasha::Common;
+use Maasha::Filesys;
+use Maasha::Fasta;
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+my ( $run_time_beg, $run_time_end, $options, $in, $out, $tmp_dir, $tmp_in, $tmp_out, $fh_in, $fh_out, $record, $entry, $count, $args, $line, @fields );
+
+# Command line options: the sequences to search are given either as a file
+# (--in_file) or as a genome name (--genome); the remaining options are passed
+# on to soap.
+$options = Maasha::Biopieces::parse_options(
+    [
+        { long => 'in_file', short => 'i', type => 'file', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+        { long => 'genome', short => 'g', type => 'genome', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+        { long => 'seed_size', short => 's', type => 'uint', mandatory => 'no', default => 10, allowed => undef, disallowed => undef },
+        { long => 'mismatches', short => 'm', type => 'uint', mandatory => 'no', default => 2, allowed => undef, disallowed => undef },
+        { long => 'gap_size', short => 'G', type => 'uint', mandatory => 'no', default => 0, allowed => undef, disallowed => undef },
+        { long => 'cpus', short => 'c', type => 'uint', mandatory => 'no', default => 1, allowed => undef, disallowed => 0 },
+    ]
+);
+
+# Exactly one of --genome and --in_file must be given.
+Maasha::Common::error( qq(both --in_file and --genome specified) ) if $options->{ "genome" } and $options->{ "in_file" };
+Maasha::Common::error( qq(no --in_file or --genome specified) ) if not $options->{ "genome" } and not $options->{ "in_file" };
+
+$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } );
+$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } );
+
+# A genome name is shorthand for its .fna file below $BP_DATA/genomes.
+$options->{ "in_file" } = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/fasta/$options->{ 'genome' }.fna" if $options->{ 'genome' };
+
+$tmp_dir = Maasha::Biopieces::get_tmpdir();
+$tmp_in = "$tmp_dir/soap_query.seq";
+$tmp_out = "$tmp_dir/soap.result";
+
+$fh_out = Maasha::Common::write_open( $tmp_in );
+
+$count = 0;    # number of query sequences written to $tmp_in
+
+# Pass every record through to the output stream; records that convert to a
+# FASTA entry are additionally written to the query file for soap.
+while ( $record = Maasha::Biopieces::get_record( $in ) )
+{
+    if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) )
+    {
+        Maasha::Fasta::put_entry( $entry, $fh_out );
+
+        $count++;
+    }
+
+    Maasha::Biopieces::put_record( $record, $out );
+}
+
+close $fh_out;
+
+# Only run soap when there is at least one query sequence.
+if ( $count > 0 )
+{
+    # NOTE(review): the arguments are joined into one string that also carries
+    # a shell redirection, so paths containing whitespace or shell
+    # metacharacters are not protected - confirm how Maasha::Common::run
+    # executes the command.
+    $args = join( " ",
+        "-s $options->{ 'seed_size' }",
+        "-r 2",
+        "-a $tmp_in",
+        "-v $options->{ 'mismatches' }",
+        "-g $options->{ 'gap_size' }",
+        "-p $options->{ 'cpus' }",
+        "-d $options->{ 'in_file' }",
+        "-o $tmp_out",
+    );
+
+    $args .= " > /dev/null 2>&1" if not $options->{ 'verbose' };
+
+    Maasha::Common::run( "soap", $args, 1 );
+
+    $fh_in = Maasha::Filesys::file_read_open( $tmp_out );
+
+    # The same hash is refilled for every result line; all seven keys are
+    # overwritten each time, so nothing carries over between lines.
+    undef $record;
+
+    while ( $line = <$fh_in> )
+    {
+        chomp $line;
+
+        @fields = split /\t/, $line;
+
+        $record->{ "REC_TYPE" } = "SOAP";
+        $record->{ "Q_ID" }     = $fields[ 0 ];
+        $record->{ "SCORE" }    = $fields[ 3 ];
+        $record->{ "STRAND" }   = $fields[ 6 ];
+        $record->{ "S_ID" }     = $fields[ 7 ];
+        $record->{ "S_BEG" }    = $fields[ 8 ] - 1; # soap is 1-based
+        $record->{ "S_END" }    = $fields[ 8 ] + $fields[ 5 ] - 2; # NOTE(review): presumably field 5 is the match length, giving a 0-based inclusive end - confirm against soap's output format
+
+        Maasha::Biopieces::put_record( $record, $out );
+    }
+
+    close $fh_in;
+}
+
+# Remove both temporary files whether or not soap ran: the query file is created
+# even when the stream held no sequences, so it must be deleted here as well to
+# leave nothing behind in the temporary directory. unlink skips missing files.
+unlink $tmp_in, $tmp_out;
+
+Maasha::Filesys::dir_remove( $tmp_dir );
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+BEGIN # runs during compilation, i.e. before the script body above is executed
+{
+ $run_time_beg = Maasha::Biopieces::run_time(); # kept until END, where it is passed to run_time_print
+ # NOTE(review): log_biopiece() presumably records this invocation - confirm in Maasha::Biopieces.
+ Maasha::Biopieces::log_biopiece();
+}
+
+END # runs as the interpreter exits
+{
+ Maasha::Biopieces::close_stream( $in ); # NOTE(review): $in and $out are still undef if the script exits before the streams are opened - confirm close_stream tolerates that
+ Maasha::Biopieces::close_stream( $out );
+ # The end value is taken only after both streams have been closed.
+ $run_time_end = Maasha::Biopieces::run_time();
+
+ Maasha::Biopieces::run_time_print( $run_time_beg, $run_time_end, $options );
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+__END__
-use Maasha::BioRun;
use Maasha::Filesys;
use Maasha::Fasta;
use Maasha::EMBL;
-use Maasha::Stockholm;
use Maasha::Seq;
use Maasha::Calc;
use Maasha::UCSC;
$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } );
if ( $script eq "print_usage" ) { script_print_usage( $in, $out, $options ) }
- elsif ( $script eq "read_stockholm" ) { script_read_stockholm( $in, $out, $options ) }
- elsif ( $script eq "uppercase_seq" ) { script_uppercase_seq( $in, $out, $options ) }
elsif ( $script eq "complexity_seq" ) { script_complexity_seq( $in, $out, $options ) }
elsif ( $script eq "get_genome_align" ) { script_get_genome_align( $in, $out, $options ) }
elsif ( $script eq "get_genome_phastcons" ) { script_get_genome_phastcons( $in, $out, $options ) }
- elsif ( $script eq "soap_seq" ) { script_soap_seq( $in, $out, $options ) }
elsif ( $script eq "remove_mysql_tables" ) { script_remove_mysql_tables( $in, $out, $options ) }
elsif ( $script eq "remove_ucsc_tracks" ) { script_remove_ucsc_tracks( $in, $out, $options ) }
elsif ( $script eq "upload_to_ucsc" ) { script_upload_to_ucsc( $in, $out, $options ) }
data_in|i=s
);
}
- elsif ( $script eq "read_stockholm" )
- {
- @options = qw(
- data_in|i=s
- num|n=s
- );
- }
elsif ( $script eq "get_genome_align" )
{
@options = qw(
flank|f=s
);
}
- elsif ( $script eq "soap_seq" )
- {
- @options = qw(
- in_file|i=s
- genome|g=s
- seed_size|s=s
- mismatches|m=s
- gap_size|G=s
- cpus|c=s
- );
- }
elsif ( $script eq "remove_mysql_tables" )
{
@options = qw(
}
Maasha::Common::error( qq(no --database specified) ) if $script eq "remove_ucsc_tracks" and not $options{ "database" };
- Maasha::Common::error( qq(no --in_file or --genome specified) ) if $script eq "soap_seq" and not $options{ "genome" } and not $options{ "in_file" };
- Maasha::Common::error( qq(both --in_file and --genome specified) ) if $script eq "soap_seq" and $options{ "genome" } and $options{ "in_file" };
Maasha::Common::error( qq(no --genome specified) ) if $script =~ /get_genome_align|get_genome_phastcons/ and not $options{ "genome" };
if ( $script eq "upload_to_ucsc" )
}
-sub script_read_stockholm
-{
- # Martin A. Hansen, August 2007.
-
- # Read Stockholm format.
-
- my ( $in, # handle to in stream
- $out, # handle to out stream
- $options, # options hash
- ) = @_;
-
- # Returns nothing.
-
- my ( $data_in, $file, $num, $entry, $record, $record_anno, $record_align, $key, $seq );
-
- while ( $record = Maasha::Biopieces::get_record( $in ) ) {
- Maasha::Biopieces::put_record( $record, $out );
- }
-
- $num = 1;
-
- foreach $file ( @{ $options->{ "files" } } )
- {
- $data_in = Maasha::Common::read_open( $file );
-
- while ( $entry = Maasha::Stockholm::get_stockholm_entry( $data_in ) )
- {
- $record = Maasha::Stockholm::parse_stockholm_entry( $entry );
-
- undef $record_anno;
-
- foreach $key ( keys %{ $record->{ "GF" } } ) {
- $record_anno->{ $key } = $record->{ "GF" }->{ $key };
- }
-
- $record_anno->{ "ALIGN" } = $num;
-
- Maasha::Biopieces::put_record( $record_anno, $out );
-
- foreach $seq ( @{ $record->{ "ALIGN" } } )
- {
- undef $record_align;
-
- $record_align = {
- SEQ_NAME => $seq->[ 0 ],
- SEQ => $seq->[ 1 ],
- };
-
- Maasha::Biopieces::put_record( $record_align, $out );
- }
-
- goto NUM if $options->{ "num" } and $num == $options->{ "num" };
-
- $num++;
- }
-
- close $data_in;
- }
-
- NUM:
-
- close $data_in if $data_in;
-}
-
-
sub script_complexity_seq
{
# Martin A. Hansen, May 2008.
}
-sub script_soap_seq
-{
- # Martin A. Hansen, July 2008.
-
- # soap sequences in stream against a given file or genome.
-
- my ( $in, # handle to in stream
- $out, # handle to out stream
- $options, # options hash
- ) = @_;
-
- # Returns nothing.
-
- my ( $genome, $tmp_in, $tmp_out, $fh_in, $fh_out, $record, $line, @fields, $entry, $count, $args );
-
- $options->{ "seed_size" } ||= 10;
- $options->{ "mismatches" } ||= 2;
- $options->{ "gap_size" } ||= 0;
- $options->{ "cpus" } ||= 1;
-
- if ( $options->{ "genome" } ) {
- $options->{ "in_file" } = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/fasta/$options->{ 'genome' }.fna";
- }
-
- $tmp_in = "$BP_TMP/soap_query.seq";
- $tmp_out = "$BP_TMP/soap.result";
-
- $fh_out = Maasha::Common::write_open( $tmp_in );
-
- $count = 0;
-
- while ( $record = Maasha::Biopieces::get_record( $in ) )
- {
- if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) )
- {
- Maasha::Fasta::put_entry( $entry, $fh_out );
-
- $count++;
- }
-
- Maasha::Biopieces::put_record( $record, $out );
- }
-
- close $fh_out;
-
- if ( $count > 0 )
- {
- $args = join( " ",
- "-s $options->{ 'seed_size' }",
- "-r 2",
- "-a $tmp_in",
- "-v $options->{ 'mismatches' }",
- "-g $options->{ 'gap_size' }",
- "-p $options->{ 'cpus' }",
- "-d $options->{ 'in_file' }",
- "-o $tmp_out",
- );
-
- $args .= " > /dev/null 2>&1" if not $options->{ 'verbose' };
-
- Maasha::Common::run( "soap", $args, 1 );
-
- unlink $tmp_in;
-
- $fh_out = Maasha::Common::read_open( $tmp_out );
-
- undef $record;
-
- while ( $line = <$fh_out> )
- {
- chomp $line;
-
- @fields = split /\t/, $line;
-
- $record->{ "REC_TYPE" } = "SOAP";
- $record->{ "Q_ID" } = $fields[ 0 ];
- $record->{ "SCORE" } = $fields[ 3 ];
- $record->{ "STRAND" } = $fields[ 6 ];
- $record->{ "S_ID" } = $fields[ 7 ];
- $record->{ "S_BEG" } = $fields[ 8 ] - 1; # soap is 1-based
- $record->{ "S_END" } = $fields[ 8 ] + $fields[ 5 ] - 2;
-
- Maasha::Biopieces::put_record( $record, $out );
- }
-
- close $fh_out;
- }
-
- unlink $tmp_out;
-}
-
-
sub script_remove_mysql_tables
{
# Martin A. Hansen, November 2008.