]> git.donarmstrong.com Git - biopieces.git/commitdiff
updated patscan_seq
authormartinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Tue, 20 Sep 2011 15:06:09 +0000 (15:06 +0000)
committermartinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Tue, 20 Sep 2011 15:06:09 +0000 (15:06 +0000)
git-svn-id: http://biopieces.googlecode.com/svn/trunk@1529 74ccb610-7750-0410-82ae-013aeee3265d

bp_bin/patscan_seq
bp_test/in/patscan_seq.in.1 [new file with mode: 0644]
bp_test/in/patscan_seq.in.2 [new file with mode: 0644]
bp_test/out/patscan_seq.out.1 [new file with mode: 0644]
bp_test/out/patscan_seq.out.2 [new file with mode: 0644]
bp_test/out/patscan_seq.out.3 [new file with mode: 0644]
bp_test/out/patscan_seq.out.4 [new file with mode: 0644]
bp_test/out/patscan_seq.out.5 [new file with mode: 0644]
bp_test/test/test_patscan_seq [new file with mode: 0755]

index ca4fc983f47629bb78e960ea21f15f335bb68f5f..ef217c559298293d0fdec522e77b9d46c1af535e 100755 (executable)
@@ -1,6 +1,6 @@
-#!/usr/bin/env perl
+#!/usr/bin/env ruby
 
-# Copyright (C) 2007-2009 Martin A. Hansen.
+# Copyright (C) 2007-2011 Martin A. Hansen.
 
 # This program is free software; you can redistribute it and/or
 # modify it under the terms of the GNU General Public License
 
 # http://www.gnu.org/copyleft/gpl.html
 
-
-# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-
-# Scan sequences in the stream or a specified genome for patterns using patscan.
-
 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 
+# This program is part of the Biopieces framework (www.biopieces.org).
 
-use warnings;
-use strict;
-use Maasha::Biopieces;
-use Maasha::Common;
-use Maasha::Fasta;
-use Maasha::Filesys;
-use Maasha::Patscan;
-
-use constant {
-    SEQ_NAME => 0,
-    SEQ      => 1,
-};
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 
+# Run Patscan on sequences in the stream.
 
 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 
 
-my ( $options, $in, $out, $record, @args, $arg, $type, $tmp_dir, $seq_file, $pat_file, $out_file,
-     $fh_in, $fh_out, $patterns, $pattern, $entry, $result, %head_hash, $i );
-
-$options = Maasha::Biopieces::parse_options(
-    [
-        { long => 'patterns',    short => 'p', type => 'string',  mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
-        { long => 'patterns_in', short => 'P', type => 'file!',   mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
-        { long => 'comp',        short => 'c', type => 'flag',    mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
-        { long => 'max_hits',    short => 'h', type => 'uint',    mandatory => 'no', default => undef, allowed => undef, disallowed => 0 },
-        { long => 'max_misses',  short => 'm', type => 'uint',    mandatory => 'no', default => undef, allowed => undef, disallowed => 0 },
-        { long => 'genome',      short => 'g', type => 'genome',  mandatory => 'no', default => undef, allowed => undef, disallowed => 0 },
-    ]   
-);
-
-$in  = Maasha::Biopieces::read_stream( $options->{ "stream_in" } );
-$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } );
-
-$tmp_dir = Maasha::Biopieces::get_tmpdir();
-
-$pat_file = "$tmp_dir/patscan.pat";
-$out_file = "$tmp_dir/patscan.out";
-
-$patterns = parse_patterns( $options );
-$arg      = parse_args( $options );
-
-if ( defined $options->{ 'genome' } )
-{
-    $seq_file = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/fasta/$options->{ 'genome' }.fna";
-}
-else
-{
-    $seq_file = "$tmp_dir/patscan.seq";
-
-    $fh_out = Maasha::Filesys::file_write_open( $seq_file );
-
-    $i = 0;
-
-    while ( $record = Maasha::Biopieces::get_record( $in ) ) 
-    {
-        if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) )
-        {
-            $type = Maasha::Seq::seq_guess_type( $record->{ "SEQ" } ) if not $type;
-        
-            $entry->[ SEQ_NAME ] = $i;
-
-            Maasha::Fasta::put_entry( $entry, $fh_out );
-        
-            $head_hash{ $i } = $record->{ "SEQ_NAME" };
-
-            $i++;
-        }
-    }
-
-    close $fh_out;
-
-    $arg .= " -p" if $type eq "PROTEIN";
-}
-
-foreach $pattern ( @{ $patterns } )
-{
-    pat_write( $pat_file, $pattern );
-
-    `scan_for_matches $arg $pat_file < $seq_file > $out_file`;
-#     Maasha::Common::run( "scan_for_matches", "$arg $pat_file < $seq_file > $out_file" );
-
-    $fh_in = Maasha::Filesys::file_read_open( $out_file );
-
-    while ( $entry = Maasha::Fasta::get_entry( $fh_in ) )
-    {
-        $result = Maasha::Patscan::parse_scan_result( $entry, $pattern );
-
-        if ( $options->{ 'genome' } )
-        {
-            $result->{ "CHR" }     = $result->{ "S_ID" };
-            $result->{ "CHR_BEG" } = $result->{ "S_BEG" }; 
-            $result->{ "CHR_END" } = $result->{ "S_END" }; 
-
-            delete $result->{ "S_ID" };
-            delete $result->{ "S_BEG" };
-            delete $result->{ "S_END" };
-        }
+require 'maasha/biopieces'
+require 'maasha/fasta'
+require 'maasha/seq'
+
+# Class with methods to execute Patscan (a.k.a. scan_for_matches).
+class Patscan
+  # Method to initialize a Patscan object.
+  def initialize(pat_file, in_file, out_file, pattern)
+    @pat_file = pat_file
+    @in_file  = in_file
+    @out_file = out_file
+    @pattern  = pattern
+
+    pattern_save
+  end
+
+  # Method to run Patscan
+  def run(comp = nil, type = nil, max_mis = nil, max_hit = nil, verbose = nil)
+    args = []
+    args << "scan_for_matches"
+    args << "-c"            if comp
+    args << "-p"            if type == 'protein'
+    args << "-n #{max_mis}" if max_mis
+    args << "-m #{max_hit}" if max_hit
+    args << @pat_file
+    args << "< #{@in_file}"
+    args << "> #{@out_file}"
+               command = args.join(" ")
+    $stderr.puts command if verbose
+    system(command)
+    raise "Command failed: #{command}" unless $?.success?
+  end
+
+  # Method to parse Patscan results and return
+  # these in a hash with ID as key and a list
+  # of hits as value.
+  def results_parse
+    results = Hash.new { |h, k| h[k] = [] }
+
+    Fasta.open(@out_file, mode='r') do |ios|
+      ios.each do |entry|
+        if entry.seq_name =~ /([^:]+):\[(\d+),(\d+)\]/
+          id    = $1.to_i
+          start = $2.to_i - 1
+          stop  = $3.to_i - 1
+
+          if stop > start
+            strand = '+'
+          else
+            start, stop = stop, start
+            strand = '-'
+          end
+
+          results[id.to_i] << Match.new(start, stop, strand, entry.seq)
         else
-        {
-            $result->{ "S_ID" } = $head_hash{ $result->{ "S_ID" } };
-        }
-
-        Maasha::Biopieces::put_record( $result, $out );
-    }
-
-    close $fh_in;
-}
-
-unlink $pat_file;
-unlink $out_file;
-unlink $seq_file if not $options->{ 'genome' };
-
-Maasha::Biopieces::close_stream( $in );
-Maasha::Biopieces::close_stream( $out );
-
-
-# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SUBROUTINES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-
-
-sub parse_patterns
-{
-    # Martin A. Hansen, May 2009.
-
-    # Parse pattern arguments based on information in the 
-    # options hash and returns a list of patterns.
-    # Raises an error if no info.
-
-    my ( $options,   # options hash
-       ) = @_;
-
-    # Returns a list.
-
-    my ( $patterns );
-
-    if ( $options->{ "patterns" } ) {
-        $patterns = Maasha::Patscan::parse_patterns( $options->{ "patterns" } );
-    } elsif ( -f $options->{ "patterns_in" } ) {
-        $patterns = Maasha::Patscan::read_patterns( $options->{ "patterns_in" } );
-    } else {
-        Maasha::Common::error( qq(no patterns specified.) );
-    }
-
-    return wantarray ? @{ $patterns } : $patterns;
-}
-
-
-sub parse_args
-{
-    # Martin A. Hansen, May 2009.
-
-    # Generate an argument string for executing scan_for_matches based
-    # on information in the option hash.
-
-    my ( $options,   # options hash
-       ) = @_;
-
-    # Returns a string.
-
-    my ( @args );
-
-    push @args, "-c"                            if $options->{ "comp" };
-    push @args, "-m $options->{ 'max_hits' }"   if $options->{ 'max_hits' };
-    push @args, "-n $options->{ 'max_misses' }" if $options->{ 'max_hits' };
-
-    return join " ", @args;
-}
-
-
-sub pat_write
-{
-    # Martin A. Hansen, May 2009.
-
-    # Write a scan_for_matches pattern to file.
-
-    my ( $file,      # target file to write pattern to.
-         $pattern,   # pattern to write.
-       ) = @_;
-
-    # Returns nothing.
-
-    my ( $fh_out );
-
-    $fh_out = Maasha::Common::write_open( $pat_file );
-
-    print $fh_out "$pattern\n";
-
-    close $fh_out;
-}
-
-
-# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-
-
-BEGIN
-{
-    Maasha::Biopieces::status_set();
-}
-
-
-END
-{
-    Maasha::Biopieces::status_log();
-}
+          raise "Failed to parse seq_name: #{entry.seq_name} in patscan result"
+        end
+      end
+    end
+
+    results
+  end
+
+  private
+
+  # Method to save a patscan pattern to a file.
+  def pattern_save
+    File.open(@pat_file, "w") do |ios|
+      ios.puts @pattern
+    end
+  end
+
+  # Subclass to define Patscan hits.
+  class Match
+    attr_reader :start, :stop, :strand, :match
+
+    def initialize(start, stop, strand, match)
+      @start   = start
+      @stop    = stop
+      @strand  = strand
+      @match   = match
+    end
+
+    def length
+      @stop - @start + 1
+    end
+  end
+end
+
+casts = []
+casts << {:long=>'pattern',    :short=>'p', :type=>'string', :mandatory=>true,  :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'inline',     :short=>'i', :type=>'flag',   :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'comp',       :short=>'c', :type=>'flag',   :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'max_hits',   :short=>'h', :type=>'uint',   :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>'0'}
+casts << {:long=>'max_misses', :short=>'m', :type=>'uint',   :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>'0'}
+
+options = Biopieces.options_parse(ARGV, casts)
+
+tmp_dir  = Biopieces.mktmpdir
+tmp_file = File.join(tmp_dir, "tmp.stream")
+pat_file = File.join(tmp_dir, "pat.txt")
+in_file  = File.join(tmp_dir, "in.fna")
+out_file = File.join(tmp_dir, "out.fna")
+out_file = File.join(tmp_dir, "out.fna")
+
+seq_name_count = 0
+seq_name_hash  = {}
+seq_type       = nil
+
+Biopieces.open(options[:stream_in], tmp_file) do |input, output|
+  Fasta.open(in_file, mode="w") do |fasta_io|
+    input.each_record do |record|
+      if record.has_key? :SEQ_NAME
+        seq_name_hash[seq_name_count] = record[:SEQ_NAME]
+        record[:SEQ_NAME] = seq_name_count
+        seq_name_count += 1
+
+        if seq_type.nil?
+          seq      = Seq.new("", record[:SEQ])
+          seq_type = seq.type_guess
+        end
+      end
+
+      output.puts record
+      fasta_io.puts record
+    end
+  end
+end
+
+patscan = Patscan.new(pat_file, in_file, out_file, options[:pattern])
+patscan.run(options[:comp], seq_type, options[:max_hits], options[:max_misses], options[:verbose])
+results = patscan.results_parse
+
+Biopieces.open(tmp_file, options[:stream_out]) do |input, output|
+  input.each_record do |record|
+    if record.has_key? :SEQ_NAME
+      key = record[:SEQ_NAME].to_i
+      record[:SEQ_NAME] = seq_name_hash[key]
+
+      if options[:inline]
+        if results.has_key? key
+          results[key].each do |result|
+            record[:PATTERN]   = options[:pattern]
+            record[:MATCH]     = result.match
+            record[:S_BEG]     = result.start
+            record[:S_END]     = result.stop
+            record[:STRAND]    = result.strand
+            record[:MATCH_LEN] = result.length
+
+            output.puts record
+          end
+        end
+      else
+        output.puts record
+
+        new_record = {}
+
+        results[key].each do |result|
+          new_record[:REC_TYPE]  = "PATSCAN"
+          new_record[:S_ID]      = record[:SEQ_NAME]
+          new_record[:Q_ID]      = options[:pattern]
+          new_record[:MATCH]     = result.match
+          new_record[:S_BEG]     = result.start
+          new_record[:S_END]     = result.stop
+          new_record[:STRAND]    = result.strand
+          new_record[:SCORE]     = 100
+          new_record[:MATCH_LEN] = result.length
+
+          output.puts new_record
+        end
+      end
+    else
+      output.puts record
+    end
+  end
+end
 
 
 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
diff --git a/bp_test/in/patscan_seq.in.1 b/bp_test/in/patscan_seq.in.1
new file mode 100644 (file)
index 0000000..af4c80c
--- /dev/null
@@ -0,0 +1,10 @@
+KEY: VALUE
+---
+SEQ_NAME: test
+SEQ: GGACTACNNGGGTATCTAATAGTC
+SEQ_LEN: 20
+---
+SEQ_NAME: test2
+SEQ: GGACTACNNGGGTATCTAATGCATACG
+SEQ_LEN: 27
+---
diff --git a/bp_test/in/patscan_seq.in.2 b/bp_test/in/patscan_seq.in.2
new file mode 100644 (file)
index 0000000..2eb7c83
--- /dev/null
@@ -0,0 +1,10 @@
+KEY: VALUE
+---
+SEQ_NAME: test1
+SEQ: LLRVDNIIRARPRTANRQHM
+SEQ_LEN: 20
+---
+SEQ_NAME: test2
+SEQ: NIIRARPRTAN
+SEQ_LEN: 11
+---
diff --git a/bp_test/out/patscan_seq.out.1 b/bp_test/out/patscan_seq.out.1
new file mode 100644 (file)
index 0000000..b0f7bc6
--- /dev/null
@@ -0,0 +1,30 @@
+KEY: VALUE
+---
+SEQ_NAME: test
+SEQ: GGACTACNNGGGTATCTAATAGTC
+SEQ_LEN: 20
+---
+REC_TYPE: PATSCAN
+S_ID: test
+Q_ID: GACT
+MATCH: GACT
+S_BEG: 1
+S_END: 4
+STRAND: +
+SCORE: 100
+MATCH_LEN: 4
+---
+SEQ_NAME: test2
+SEQ: GGACTACNNGGGTATCTAATGCATACG
+SEQ_LEN: 27
+---
+REC_TYPE: PATSCAN
+S_ID: test2
+Q_ID: GACT
+MATCH: GACT
+S_BEG: 1
+S_END: 4
+STRAND: +
+SCORE: 100
+MATCH_LEN: 4
+---
diff --git a/bp_test/out/patscan_seq.out.2 b/bp_test/out/patscan_seq.out.2
new file mode 100644 (file)
index 0000000..dd5e8ed
--- /dev/null
@@ -0,0 +1,40 @@
+KEY: VALUE
+---
+SEQ_NAME: test
+SEQ: GGACTACNNGGGTATCTAATAGTC
+SEQ_LEN: 20
+---
+REC_TYPE: PATSCAN
+S_ID: test
+Q_ID: GACT
+MATCH: GACT
+S_BEG: 1
+S_END: 4
+STRAND: +
+SCORE: 100
+MATCH_LEN: 4
+---
+REC_TYPE: PATSCAN
+S_ID: test
+Q_ID: GACT
+MATCH: GACT
+S_BEG: 20
+S_END: 23
+STRAND: -
+SCORE: 100
+MATCH_LEN: 4
+---
+SEQ_NAME: test2
+SEQ: GGACTACNNGGGTATCTAATGCATACG
+SEQ_LEN: 27
+---
+REC_TYPE: PATSCAN
+S_ID: test2
+Q_ID: GACT
+MATCH: GACT
+S_BEG: 1
+S_END: 4
+STRAND: +
+SCORE: 100
+MATCH_LEN: 4
+---
diff --git a/bp_test/out/patscan_seq.out.3 b/bp_test/out/patscan_seq.out.3
new file mode 100644 (file)
index 0000000..41077a3
--- /dev/null
@@ -0,0 +1,22 @@
+KEY: VALUE
+---
+SEQ_NAME: test
+SEQ: GGACTACNNGGGTATCTAATAGTC
+SEQ_LEN: 20
+PATTERN: GACT
+MATCH: GACT
+S_BEG: 1
+S_END: 4
+STRAND: +
+MATCH_LEN: 4
+---
+SEQ_NAME: test2
+SEQ: GGACTACNNGGGTATCTAATGCATACG
+SEQ_LEN: 27
+PATTERN: GACT
+MATCH: GACT
+S_BEG: 1
+S_END: 4
+STRAND: +
+MATCH_LEN: 4
+---
diff --git a/bp_test/out/patscan_seq.out.4 b/bp_test/out/patscan_seq.out.4
new file mode 100644 (file)
index 0000000..42fb5f5
--- /dev/null
@@ -0,0 +1,30 @@
+KEY: VALUE
+---
+SEQ_NAME: test1
+SEQ: LLRVDNIIRARPRTANRQHM
+SEQ_LEN: 20
+---
+REC_TYPE: PATSCAN
+S_ID: test1
+Q_ID: RARP
+MATCH: RARP
+S_BEG: 8
+S_END: 11
+STRAND: +
+SCORE: 100
+MATCH_LEN: 4
+---
+SEQ_NAME: test2
+SEQ: NIIRARPRTAN
+SEQ_LEN: 11
+---
+REC_TYPE: PATSCAN
+S_ID: test2
+Q_ID: RARP
+MATCH: RARP
+S_BEG: 3
+S_END: 6
+STRAND: +
+SCORE: 100
+MATCH_LEN: 4
+---
diff --git a/bp_test/out/patscan_seq.out.5 b/bp_test/out/patscan_seq.out.5
new file mode 100644 (file)
index 0000000..bf37a96
--- /dev/null
@@ -0,0 +1,22 @@
+KEY: VALUE
+---
+SEQ_NAME: test1
+SEQ: LLRVDNIIRARPRTANRQHM
+SEQ_LEN: 20
+PATTERN: RARP
+MATCH: RARP
+S_BEG: 8
+S_END: 11
+STRAND: +
+MATCH_LEN: 4
+---
+SEQ_NAME: test2
+SEQ: NIIRARPRTAN
+SEQ_LEN: 11
+PATTERN: RARP
+MATCH: RARP
+S_BEG: 3
+S_END: 6
+STRAND: +
+MATCH_LEN: 4
+---
diff --git a/bp_test/test/test_patscan_seq b/bp_test/test/test_patscan_seq
new file mode 100755 (executable)
index 0000000..abb1ac7
--- /dev/null
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+source "$BP_DIR/bp_test/lib/test.sh"
+
+run "$bp -I $in.1 -p GACT -O $tmp"
+assert_no_diff $tmp $out.1
+clean
+
+run "$bp -I $in.1 -p GACT -c -O $tmp"
+assert_no_diff $tmp $out.2
+clean
+
+run "$bp -I $in.1 -p GACT -i -O $tmp"
+assert_no_diff $tmp $out.3
+clean
+
+run "$bp -I $in.2 -p RARP -O $tmp"
+assert_no_diff $tmp $out.4
+clean
+
+run "$bp -I $in.2 -p RARP -i -O $tmp"
+assert_no_diff $tmp $out.5
+clean