-#!/usr/bin/env perl -w
+#!/usr/bin/env ruby
-# Copyright (C) 2007-2009 Martin A. Hansen.
+# Copyright (C) 2007-2011 Martin A. Hansen.
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# http://www.gnu.org/copyleft/gpl.html
-
-# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-
-# Scan sequences in the stream or a specified genome for patterns using patscan.
-
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+# This program is part of the Biopieces framework (www.biopieces.org).
-use strict;
-use Maasha::Biopieces;
-use Maasha::Fasta;
-use Maasha::Filesys;
-use Maasha::Patscan;
-
-use constant {
- SEQ_NAME => 0,
- SEQ => 1,
-};
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+# Run Patscan on sequences in the stream.
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-my ( $run_time_beg, $run_time_end, $options, $in, $out, $record, @args, $arg, $type, $tmp_dir,
- $seq_file, $pat_file, $out_file, $fh_in, $fh_out, $patterns, $pattern, $entry, $result, %head_hash, $i );
-
-$options = Maasha::Biopieces::parse_options(
- [
- { long => 'patterns', short => 'p', type => 'string', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
- { long => 'patterns_in', short => 'P', type => 'file!', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
- { long => 'comp', short => 'c', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
- { long => 'max_hits', short => 'h', type => 'uint', mandatory => 'no', default => undef, allowed => undef, disallowed => 0 },
- { long => 'max_misses', short => 'm', type => 'uint', mandatory => 'no', default => undef, allowed => undef, disallowed => 0 },
- { long => 'genome', short => 'g', type => 'genome', mandatory => 'no', default => undef, allowed => undef, disallowed => 0 },
- ]
-);
-
-$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } );
-$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } );
-
-$tmp_dir = Maasha::Biopieces::get_tmpdir();
-
-$pat_file = "$tmp_dir/patscan.pat";
-$out_file = "$tmp_dir/patscan.out";
-
-$patterns = parse_patterns( $options );
-$arg = parse_args( $options );
-
-if ( defined $options->{ 'genome' } )
-{
- $seq_file = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/fasta/$options->{ 'genome' }.fna";
-}
-else
-{
- $seq_file = "$tmp_dir/patscan.seq";
-
- $fh_out = Maasha::Filesys::file_write_open( $seq_file );
-
- $i = 0;
-
- while ( $record = Maasha::Biopieces::get_record( $in ) )
- {
- if ( $entry = Maasha::Biopieces::biopiece2fasta( $record ) )
- {
- $type = Maasha::Seq::seq_guess_type( $record->{ "SEQ" } ) if not $type;
-
- $entry->[ SEQ_NAME ] = $i;
-
- Maasha::Fasta::put_entry( $entry, $fh_out );
-
- $head_hash{ $i } = $record->{ "SEQ_NAME" };
-
- $i++;
- }
- }
-
- close $fh_out;
-
- $arg .= " -p" if $type eq "protein";
-}
-
-foreach $pattern ( @{ $patterns } )
-{
- pat_write( $pat_file, $pattern );
-
- `scan_for_matches $arg $pat_file < $seq_file > $out_file`;
- # Maasha::Common::run( "scan_for_matches", "$arg $pat_file < $seq_file > $out_file" );
-
- $fh_in = Maasha::Filesys::file_read_open( $out_file );
-
- while ( $entry = Maasha::Fasta::get_entry( $fh_in ) )
- {
- $result = Maasha::Patscan::parse_scan_result( $entry, $pattern );
-
- if ( $options->{ 'genome' } )
- {
- $result->{ "CHR" } = $result->{ "S_ID" };
- $result->{ "CHR_BEG" } = $result->{ "S_BEG" };
- $result->{ "CHR_END" } = $result->{ "S_END" };
-
- delete $result->{ "S_ID" };
- delete $result->{ "S_BEG" };
- delete $result->{ "S_END" };
- }
+require 'maasha/biopieces'
+require 'maasha/fasta'
+require 'maasha/seq'
+
+# Class with methods to execute Patscan (a.k.a. scan_for_matches).
+class Patscan
+ # Method to initialize a Patscan object.
+ def initialize(pat_file, in_file, out_file, pattern)
+ @pat_file = pat_file
+ @in_file = in_file
+ @out_file = out_file
+ @pattern = pattern
+
+ pattern_save
+ end
+
+ # Method to run Patscan
+ def run(comp = nil, type = nil, max_hit = nil, max_mis = nil, verbose = nil)
+ args = []
+ args << "scan_for_matches"
+ args << "-c" if comp
+ args << "-p" if type == 'protein'
+ args << "-n #{max_mis}" if max_mis
+ args << "-m #{max_hit}" if max_hit
+ args << @pat_file
+ args << "< #{@in_file}"
+ args << "> #{@out_file}"
+ command = args.join(" ")
+ $stderr.puts command if verbose
+ system(command)
+ raise "Command failed: #{command}" unless $?.success?
+ end
+
+ # Method to parse Patscan results and return
+ # these in a hash with ID as key and a list
+ # of hits as value.
+ def results_parse
+ results = Hash.new { |h, k| h[k] = [] }
+
+ Fasta.open(@out_file, mode='r') do |ios|
+ ios.each do |entry|
+ if entry.seq_name =~ /([^:]+):\[(\d+),(\d+)\]/
+ id = $1.to_i
+ start = $2.to_i - 1
+ stop = $3.to_i - 1
+
+ if stop > start
+ strand = '+'
+ else
+ start, stop = stop, start
+ strand = '-'
+ end
+
+ results[id.to_i] << Match.new(start, stop, strand, entry.seq)
else
- {
- $result->{ "S_ID" } = $head_hash{ $result->{ "S_ID" } };
- }
-
- Maasha::Biopieces::put_record( $result, $out );
- }
-
- close $fh_in;
-}
-
-unlink $pat_file;
-unlink $out_file;
-unlink $seq_file if not $options->{ 'genome' };
-
-Maasha::Filesys::dir_remove( $tmp_dir );
-
-
-# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SUBROUTINES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-
-
-sub parse_patterns
-{
- # Martin A. Hansen, May 2009.
-
- # Parse pattern arguments based on information in the
- # options hash and returns a list of patterns.
- # Raises an error if no info.
-
- my ( $options, # options hash
- ) = @_;
-
- # Returns a list.
-
- my ( $patterns );
-
- if ( $options->{ "patterns" } ) {
- $patterns = Maasha::Patscan::parse_patterns( $options->{ "patterns" } );
- } elsif ( -f $options->{ "patterns_in" } ) {
- $patterns = Maasha::Patscan::read_patterns( $options->{ "patterns_in" } );
- } else {
- Maasha::Common::error( qq(no patterns specified.) );
- }
-
- return wantarray ? @{ $patterns } : $patterns;
-}
-
-
-sub parse_args
-{
- # Martin A. Hansen, May 2009.
-
- # Generate an argument string for executing scan_for_matches based
- # on information in the option hash.
-
- my ( $options, # options hash
- ) = @_;
-
- # Returns a string.
-
- my ( @args );
-
- push @args, "-c" if $options->{ "comp" };
- push @args, "-m $options->{ 'max_hits' }" if $options->{ 'max_hits' };
- push @args, "-n $options->{ 'max_misses' }" if $options->{ 'max_hits' };
-
- return join " ", @args;
-}
-
-
-sub pat_write
-{
- # Martin A. Hansen, May 2009.
-
- # Write a scan_for_matches pattern to file.
-
- my ( $file, # target file to write pattern to.
- $pattern, # pattern to write.
- ) = @_;
-
- # Returns nothing.
-
- my ( $fh_out );
-
- $fh_out = Maasha::Common::write_open( $pat_file );
-
- print $fh_out "$pattern\n";
-
- close $fh_out;
-}
-
-
-# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-
-
-BEGIN
-{
- $run_time_beg = Maasha::Biopieces::run_time();
-
- Maasha::Biopieces::log_biopiece();
-}
-
-
-END
-{
- Maasha::Biopieces::close_stream( $in );
- Maasha::Biopieces::close_stream( $out );
-
- $run_time_end = Maasha::Biopieces::run_time();
-
- Maasha::Biopieces::run_time_print( $run_time_beg, $run_time_end, $options );
-}
+ raise "Failed to parse seq_name: #{entry.seq_name} in patscan result"
+ end
+ end
+ end
+
+ results
+ end
+
+ private
+
+ # Method to save a patscan pattern to a file.
+ def pattern_save
+ File.open(@pat_file, "w") do |ios|
+ ios.puts @pattern
+ end
+ end
+
+ # Subclass to define Patscan hits.
+ class Match
+ attr_reader :start, :stop, :strand, :match
+
+ def initialize(start, stop, strand, match)
+ @start = start
+ @stop = stop
+ @strand = strand
+ @match = match
+ end
+
+ def length
+ @stop - @start + 1
+ end
+ end
+end
+
+casts = []
+casts << {:long=>'pattern', :short=>'p', :type=>'string', :mandatory=>true, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'inline', :short=>'i', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'comp', :short=>'c', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'max_hits', :short=>'h', :type=>'uint', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>'0'}
+casts << {:long=>'max_misses', :short=>'m', :type=>'uint', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>'0'}
+
+options = Biopieces.options_parse(ARGV, casts)
+
+tmp_dir = Biopieces.mktmpdir
+tmp_file = File.join(tmp_dir, "tmp.stream")
+pat_file = File.join(tmp_dir, "pat.txt")
+in_file = File.join(tmp_dir, "in.fna")
+out_file = File.join(tmp_dir, "out.fna")
+out_file = File.join(tmp_dir, "out.fna")
+
+seq_name_count = 0
+seq_name_hash = {}
+seq_type = nil
+
+Biopieces.open(options[:stream_in], tmp_file) do |input, output|
+ Fasta.open(in_file, mode="w") do |fasta_io|
+ input.each_record do |record|
+ if record.has_key? :SEQ_NAME
+ seq_name_hash[seq_name_count] = record[:SEQ_NAME]
+ record[:SEQ_NAME] = seq_name_count
+ seq_name_count += 1
+
+ seq = Seq.new_bp(record)
+
+ if seq_type.nil?
+ seq_type = seq.type_guess
+ end
+
+ fasta_io.puts seq.to_fasta
+ end
+
+ output.puts record
+ end
+ end
+end
+
+patscan = Patscan.new(pat_file, in_file, out_file, options[:pattern])
+patscan.run(options[:comp], seq_type, options[:max_hits], options[:max_misses], options[:verbose])
+results = patscan.results_parse
+
+Biopieces.open(tmp_file, options[:stream_out]) do |input, output|
+ input.each_record do |record|
+ if record.has_key? :SEQ_NAME
+ key = record[:SEQ_NAME].to_i
+ record[:SEQ_NAME] = seq_name_hash[key]
+
+ if options[:inline]
+ if results.has_key? key
+ results[key].each do |result|
+ record[:PATTERN] = options[:pattern]
+ record[:MATCH] = result.match
+ record[:S_BEG] = result.start
+ record[:S_END] = result.stop
+ record[:STRAND] = result.strand
+ record[:MATCH_LEN] = result.length
+
+ output.puts record
+ end
+ else
+ output.puts record
+ end
+ else
+ output.puts record
+
+ new_record = {}
+
+ results[key].each do |result|
+ new_record[:REC_TYPE] = "PATSCAN"
+ new_record[:S_ID] = record[:SEQ_NAME]
+ new_record[:Q_ID] = options[:pattern]
+ new_record[:MATCH] = result.match
+ new_record[:S_BEG] = result.start
+ new_record[:S_END] = result.stop
+ new_record[:STRAND] = result.strand
+ new_record[:SCORE] = 100
+ new_record[:MATCH_LEN] = result.length
+
+ output.puts new_record
+ end
+ end
+ else
+ output.puts record
+ end
+ end
+end
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
__END__
-