]> git.donarmstrong.com Git - biopieces.git/commitdiff
fixed critical bug in biopieces.rb and reworked find_adaptor
authormartinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Mon, 23 May 2011 15:07:31 +0000 (15:07 +0000)
committermartinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Mon, 23 May 2011 15:07:31 +0000 (15:07 +0000)
git-svn-id: http://biopieces.googlecode.com/svn/trunk@1415 74ccb610-7750-0410-82ae-013aeee3265d

bp_bin/find_adaptor
code_ruby/lib/maasha/biopieces.rb

index 6319874c91eeff3cb1fdf1a9a13cf53e721a218d..53e453a147fd62414bfa1e4a9af96e8f27bc2326 100755 (executable)
 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 
 
+require 'pp'
 require 'maasha/biopieces'
-require 'maasha/seq'
-
-def disambiguate(adaptor)
-  adaptor_disamb = adaptor.dup
-  adaptor_disamb.gsub!('U', 'T')
-  adaptor_disamb.gsub!('R', '[AG]')
-  adaptor_disamb.gsub!('Y', '[CT]')
-  adaptor_disamb.gsub!('S', '[GC]')
-  adaptor_disamb.gsub!('W', '[AT]')
-  adaptor_disamb.gsub!('M', '[AC]')
-  adaptor_disamb.gsub!('K', '[GT]')
-  adaptor_disamb.gsub!('V', '[ACG]')
-  adaptor_disamb.gsub!('H', '[ACT]')
-  adaptor_disamb.gsub!('D', '[AGT]')
-  adaptor_disamb.gsub!('B', '[CGT]')
-  adaptor_disamb.gsub!('N', '.')
-  adaptor_disamb
-end
+require 'maasha/fasta'
 
-class Seq
-  # Method that finds an adaptor or part thereof in the sequence of a Seq object.
-  # Returns a Match object if the adaptor was found otherwise nil. The ed_percent
-  # indicates the maximum edit distance allowed in all possible overlaps.
-  def adaptor_find(adaptor, adaptor_disamb, pos = 0, ed_percent = 0, dist = 0)
-    raise SeqError, "Edit distance percent out of range #{ed_percent}" unless (0 .. 100).include? ed_percent
+# Error class for PatScan errors.
+class PatScanError < StandardError; end;
 
-    if pos < 0
-      pos = self.length + pos # pos offset from the right end
-    end
+class PatScan
+  def initialize(options, file_fasta, file_pattern, file_patscan)
+    @options      = options
+    @file_fasta   = file_fasta
+    @file_pattern = file_pattern
+    @file_patscan = file_patscan
+
+    pat = Pattern.new(@options)
+    pat.write(@file_pattern)
+  end
+
+  def run
+    commands = []
+    commands << "nice -n 19"
+    commands << "scan_for_matches"
+    commands << @file_pattern
+    commands << "< #{@file_fasta}"
+    commands << "> #{@file_patscan}"
+    command = commands.join(" ")
+    system(command)
+    raise PatScanError, "Command failed: #{command}" unless $?.success?
+  end
 
-    if pos < self.length - dist + 1
-      if match = adaptor_find_simple(adaptor_disamb, pos)
-        return match
-      elsif match = adaptor_find_complex(adaptor, pos, ed_percent)
-        return match
-      elsif match = adaptor_partial_find_complex(adaptor, pos, ed_percent, dist)
-        return match
+  def parse_results
+    matches = {}
+
+    Fasta.open(@file_patscan, mode='r') do |ios|
+      ios.each_entry do |entry|
+        if entry.seq_name =~ /^(\d+):\[(\d+),(\d+)\]$/
+          name  = $1.to_i
+          start = $2.to_i - 1
+          stop  = $3.to_i - 1
+          matches[name] = [start, stop - start + 1]
+        else
+          raise "Failed to parse sequence name: #{entry.seq_name}"
+        end
       end
     end
+
+    matches
   end
+end
 
-  private
+# Error class for Pattern errors.
+class PatternError < StandardError; end;
 
-  # Method to find an adaptor in a sequence taking into account ambiguity
-  # codes, but not considering mismatches, insertions, and deletions.
-  def adaptor_find_simple(adaptor, pos)
-    self.seq.upcase.match(adaptor, pos) do |m|
-      return Match.new($`.length, m, m.to_s.length, 0, 0, 0, m.to_s.length)
-    end
+class Pattern
+  def initialize(options)
+    @options  = options
+    @patterns = []
+    @patterns << pattern_internal
+    @patterns += patterns_end if @options[:trim_end]
   end
 
-  # Method to find an adaptor in a sequence taking into account ambiguity
-  # codes, mismatches, insertions, and deletions.
-  def adaptor_find_complex(adaptor, pos, ed_percent)
-    ed_max = (adaptor.length * ed_percent * 0.01).round
+  def to_i
+    new_patterns = []
 
-    match = self.match(adaptor, pos, ed_max)
+    while @patterns.size > 1
+      new_patterns = @patterns[0 ... -2] 
+      new_patterns << "( #{@patterns[-2 .. -1].join(' | ')} )"
 
-    match
-  end
+      @patterns = new_patterns
+    end
 
-  # Method to find part of an adaptor at the right end of a sequence taking
-  # into account ambiguity codes, mismatches, insertions, and deletions.
-  def adaptor_partial_find_complex(adaptor, pos, ed_percent, dist)
-    if pos > self.length - adaptor.length
-      adaptor = adaptor[0 ... self.length - pos]
-    else
-      adaptor = adaptor[0 ... adaptor.length]
+    @patterns.first
+  end
 
-      pos = self.length - adaptor.length
+  def write(file)
+    File.open(file, mode='w') do |ios|
+      ios.puts self.to_i
     end
+  end
 
-    # puts self.seq
+  private
 
-    while adaptor.length > 0 and adaptor.length >= dist
-      # puts (" " * pos) + adaptor
+  def pattern_internal
+    pattern = @options[:adaptor]
+    mis = mis_count(pattern)
+    ins = ins_count(pattern)
+    del = del_count(pattern)
 
-      ed_max = (adaptor.length * ed_percent * 0.01).round
+    "#{pattern}[#{mis},#{ins},#{del}]"
+  end
 
-      if ed_max == 0
-        self.seq.upcase.match(adaptor, pos) do |m|
-          return Match.new($`.length, m, m.to_s.length, 0, 0, 0, m.to_s.length)
-        end
-      else
-        self.scan(adaptor, pos, ed_max).each do |match|
-          return match
-        end
-      end
+  def patterns_end
+    patterns = []
+    adaptor  = @options[:adaptor]
 
-      adaptor = adaptor[0 ... -1]
+    raise PatternError, "trim_end_min > adaptor length: #{@options[:trim_end_min]} > #{adaptor.length - 1}" if @options[:trim_end_min] > adaptor.length - 1
 
-      pos += 1
+    (adaptor.length - 1).downto(@options[:trim_end_min]) do |i|
+      pattern = adaptor[0 ... i]
+      mis = mis_count(pattern)
+      ins = ins_count(pattern)
+      del = del_count(pattern)
+      patterns << "#{pattern}[#{mis},#{ins},#{del}] $"
     end
+
+    patterns
+  end
+
+  def mis_count(pattern)
+    (pattern.length * @options[:mismatches] * 0.01).round
+  end
+
+  def ins_count(pattern)
+    (pattern.length * @options[:insertions] * 0.01).round
+  end
+
+  def del_count(pattern)
+    (pattern.length * @options[:deletions] * 0.01).round
   end
 end
 
 casts = []
-casts << {:long=>'adaptor',       :short=>'r', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'edit_distance', :short=>'e', :type=>'uint',   :mandatory=>false, :default=>20,  :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'pos',           :short=>'p', :type=>'int',    :mandatory=>false, :default=>1,   :allowed=>nil, :disallowed=>"0"}
-casts << {:long=>'dist',          :short=>'d', :type=>'uint',   :mandatory=>false, :default=>0,   :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'cache',         :short=>'c', :type=>'flag',   :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'adaptor',      :short=>'a', :type=>'string', :mandatory=>true,  :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'trim_end',     :short=>'t', :type=>'flag',   :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'trim_end_min', :short=>'l', :type=>'uint',   :mandatory=>false, :default=>10,  :allowed=>nil, :disallowed=>'0'}
+casts << {:long=>'mismatches',   :short=>'m', :type=>'uint',   :mandatory=>false, :default=>10,  :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'insertions',   :short=>'i', :type=>'uint',   :mandatory=>false, :default=>5,   :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'deletions',    :short=>'d', :type=>'uint',   :mandatory=>false, :default=>5,   :allowed=>nil, :disallowed=>nil}
 
 options = Biopieces.options_parse(ARGV, casts)
 
-adaptor        = options[:adaptor].to_s.upcase
-adaptor_disamb = disambiguate(adaptor)
+tmpdir       = Biopieces.mktmpdir
+file_fasta   = File.join(tmpdir, "data.fna")
+file_records = File.join(tmpdir, "data.stream")
+file_pattern = File.join(tmpdir, "pattern.txt")
+file_patscan = File.join(tmpdir, "patscan.fna")
 
-pos  = options[:pos]
-pos -= 1 if pos > 0  # pos was 1-based
+count = 0
 
-cache = {}
+Biopieces.open(options[:stream_in], file_records) do |input, output|
+  Fasta.open(file_fasta, mode='w') do |out_fa|
+    input.each do |record|
+      output.puts record
 
-Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output|
-  input.each_record do |record|
-    if record.has_key? :SEQ
-      entry = Seq.new(record[:SEQ_NAME], record[:SEQ], "dna", record[:SCORES])
+      if record.has_key? :SEQ
+        record[:SEQ_NAME] = count
+        out_fa.puts record
 
-      if cache[entry.seq.upcase.to_sym] and options[:cache]
-        match = cache[entry.seq.upcase]
-      else
-        match = entry.adaptor_find(adaptor, adaptor_disamb, pos, options[:edit_distance], options[:dist])
-
-        cache[entry.seq.upcase.to_sym] = match if match and options[:cache]
+        count += 1;
       end
+    end
+  end
+end
+
+patscan = PatScan.new(options, file_fasta, file_pattern, file_patscan)
+patscan.run
+matches = patscan.parse_results
 
-      if match
-        record[:ADAPTOR_POS]   = match.pos
-        record[:ADAPTOR_LEN]   = match.length
-        record[:ADAPTOR_MATCH] = match.match
+count = 0
+
+Biopieces.open(file_records, options[:stream_out]) do |input, output|
+  input.each_record do |record|
+    if record.has_key? :SEQ
+      if matches.has_key? count
+        record[:ADAPTOR_POS] = matches[count].first
+        record[:ADAPTOR_LEN] = matches[count].last
       end
+
+      count += 1;
     end
 
     output.puts record
index c1dc49b0f43e2cf40768feae4e0a2ed1ab7e6417..5fd6edf4a505154bb929976a86c58246ac5d3f85 100644 (file)
@@ -149,22 +149,22 @@ class Biopieces
   # Class method for opening data stream for reading Biopiece records.
   # Records are read from STDIN (default) or file (possibly gzipped).
   def self.open_input(input)
-    if STDIN.tty?
-      if input.nil?
+    if input.nil?
+      if STDIN.tty?
         input = self.new(StringIO.new)
       else
-        ios = File.open(input, mode='r')
-
-        begin
-            ios = Zlib::GzipReader.new(ios)
-        rescue
-            ios.rewind
-        end
+        input = self.new(STDIN, stdio = true)
+      end
+    elsif File.exists? input
+      ios = File.open(input, mode='r')
 
-        input = self.new(ios)
+      begin
+          ios = Zlib::GzipReader.new(ios)
+      rescue
+          ios.rewind
       end
-    else
-      input = self.new(STDIN, stdio = true)
+
+      input = self.new(ios)
     end
 
     input