]> git.donarmstrong.com Git - biopieces.git/blobdiff - code_ruby/lib/maasha/genbank.rb
changed layout of ruby source
[biopieces.git] / code_ruby / lib / maasha / genbank.rb
diff --git a/code_ruby/lib/maasha/genbank.rb b/code_ruby/lib/maasha/genbank.rb
new file mode 100644 (file)
index 0000000..a6ec292
--- /dev/null
@@ -0,0 +1,361 @@
+# Copyright (C) 2007-2010 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# This software is part of the Biopieces framework (www.biopieces.org).
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+require 'maasha/seq'
+require 'maasha/filesys'
+
+# Error class for all exceptions to do with Genbank.
+class GenbankError < StandardError; end
+
+class Genbank < Filesys
+  def initialize(io)
+    @io      = io
+    @entry   = []
+  end
+
+  # Iterator method for parsing Genbank entries.
+  def each(hash_keys = nil, hash_feats = nil, hash_quals = nil)
+    while @entry = get_entry do
+      keys = get_keys(hash_keys)
+      seq  = get_seq
+
+      features = GenbankFeatures.new(@entry, hash_feats, hash_quals)
+
+      features.each do |record|
+        keys.each_pair { |key,val| record[key] = val }
+
+                               loc = Locator.new(record[:LOCATOR], seq)
+                               record[:SEQ]     = loc.subseq.seq
+        record[:SEQ_LEN] = loc.subseq.length
+                               record[:STRAND]  = loc.strand
+                               record[:S_BEG]   = loc.s_beg
+                               record[:S_END]   = loc.s_end
+
+        yield record
+      end
+    end
+  end
+
+  private
+
+  # Method to get the next Genbank entry form an ios and return this.
+  def get_entry
+    block = @io.gets("//" + $/)
+    return nil if block.nil?
+
+    block.chomp!("//" + $/ )
+
+    entry = block.split $/
+
+    return nil if entry.empty?
+
+    entry
+  end
+
+  # Method to get the DNA sequence from a Genbank entry and return
+  # this as a Seq object.
+  def get_seq
+    i = @entry.size
+    j = i - 1
+
+    while @entry[j] and @entry[j] !~ /^[A-Z]/
+      j -= 1
+    end
+
+    seq = @entry[j + 1 .. i].join.delete(" 0123456789")
+
+    Seq.new(nil, seq, "dna") if seq
+  end
+
+  # Method to get the base keys from Genbank entry and return these
+  # in a hash.
+  def get_keys(hash_keys)
+    keys = {}
+    i    = 0
+    j    = 0
+
+    while @entry[i] and @entry[i] !~ /^FEATURES/
+      if @entry[i] =~ /^\s{0,3}([A-Z]{2})/
+        if want_key?(hash_keys, $1)
+          j = i + 1
+
+          key, val = @entry[i].lstrip.split(/\s+/, 2)
+
+          while @entry[j] and @entry[j] !~ /^\s{0,3}[A-Z]/
+            val << @entry[j].lstrip
+            j += 1
+          end
+
+          if keys[key.to_sym]
+            keys[key.to_sym] << ";" + val
+          else
+            keys[key.to_sym] = val
+          end
+        end
+      end
+
+      j > i ? i = j : i += 1
+    end
+
+    keys
+  end
+
+  def want_key?(hash_keys, key)
+    if hash_keys
+      if hash_keys[key.to_sym]
+        return true
+      else
+        return false
+      end
+    else
+      return true
+    end
+  end
+end
+
+class GenbankFeatures
+  @@i = 0
+  @@j = 0
+
+  def initialize(entry, hash_feats, hash_quals)
+    @entry      = entry
+    @hash_feats = hash_feats
+    @hash_quals = hash_quals
+  end
+
+  def each
+    while @entry[@@i] and @entry[@@i] !~ /^ORIGIN/
+      if @entry[@@i] =~ /^\s{5}([A-Za-z_-]+)/
+        if want_feat? $1
+          record = {}
+
+          feat, loc = @entry[@@i].lstrip.split(/\s+/, 2)
+
+          @@j = @@i + 1
+
+          while @entry[@@j] and @entry[@@j] !~ /^(\s{21}\/|\s{5}[A-Za-z_-]|[A-Z])/
+            loc << @entry[@@j].lstrip
+            @@j += 1
+          end
+
+          get_quals.each_pair { |k,v|
+            record[k.upcase.to_sym] = v
+          }
+
+          record[:FEATURE] = feat
+          record[:LOCATOR] = loc
+
+          yield record
+        end
+      end
+
+      @@j > @@i ? @@i = @@j : @@i += 1
+    end
+  end
+
+  private
+
+  def get_quals
+    quals = {}
+    k     = 0
+
+    while @entry[@@j] and @entry[@@j] !~ /^\s{5}[A-Za-z_-]|^[A-Z]/
+      if @entry[@@j] =~ /^\s{21}\/([^=]+)="([^"]+)/
+        qual = $1
+        val  = $2
+
+        if want_qual? qual
+          k = @@j + 1
+
+          while @entry[k] and @entry[k] !~ /^(\s{21}\/|\s{5}[A-Za-z_-]|[A-Z])/
+            val << @entry[k].lstrip.chomp('"')
+            k += 1
+          end
+
+          if quals[qual]
+            quals[qual] << ";" + val
+          else
+            quals[qual] = val
+          end
+        end
+      end
+
+      k > @@j ? @@j = k : @@j += 1
+    end
+
+    quals
+  end
+
+  def want_feat?(feat)
+    if @hash_feats
+      if @hash_feats[feat.upcase.to_sym]
+        return true
+      else
+        return false
+      end
+    else
+      return true
+    end
+  end
+
+  def want_qual?(qual)
+    if @hash_quals
+      if @hash_quals[qual.upcase.to_sym]
+        return true
+      else
+        return false
+      end
+    else
+      return true
+    end
+  end
+end
+
+
+# Error class for all exceptions to do with Genbank/EMBL/DDBJ feature table locators.
+class LocatorError < StandardError; end
+
+class Locator
+  attr_accessor :locator, :seq, :subseq
+
+  def initialize(locator, seq)
+    @locator = locator
+    @seq     = seq
+    @subseq  = Seq.new(nil, "", "dna")
+    parse_locator(locator)
+  end
+
+  def strand
+    if @locator.match("complement")
+      return "-"
+    else
+      return "+"
+    end
+  end
+
+  def s_beg
+    if @locator =~ /(\d+)/
+      return $1.to_i - 1
+    end
+  end
+
+  def s_end
+    if @locator.reverse =~ /(\d+)/
+      return $1.reverse.to_i - 1
+    end
+  end
+
+  private
+
+  # Method that uses recursion to parse a locator string from a feature
+  # table and fetches the appropriate subsequence. the operators
+  # join(), complement(), and order() are handled.
+  # the locator string is broken into a comma separated lists, and
+  # modified if the parens donnot balance. otherwise the comma separated
+  # list of ranges are stripped from operators, and the subsequence are
+  # fetched and handled according to the operators.
+  # SNP locators are also dealt with (single positions).
+  def parse_locator(locator, join = nil, comp = nil, order = nil)
+    intervals = locator.split(",")
+
+    unless balance_parens?(intervals.first)   # locator includes a join/comp/order of several ranges
+      case locator
+      when /^join\((.*)\)$/
+        locator = $1
+        join     = true
+      when /^complement\((.*)\)$/
+        locator = $1
+        comp     = true
+      when /^order\((.*)\)$/
+        locator = $1
+        order    = true
+      end
+
+      parse_locator(locator, join, comp, order)
+    else
+      intervals.each do |interval|
+        case interval
+        when /^join\((.*)\)$/
+          locator = $1
+          join    = true
+          parse_locator(locator, join, comp, order)
+        when /^complement\((.*)\)$/
+          locator = $1
+          comp    = true
+          parse_locator(locator, join, comp, order)
+        when /^order\((.*)\)$/
+          locator = $1
+          order   = true
+          parse_locator(locator, join, comp, order)
+        when /^[<>]?(\d+)[^\d]+(\d+)$/
+          int_beg = $1.to_i - 1
+          int_end = $2.to_i - 1
+
+          newseq = Seq.new(nil, @seq.seq[int_beg...int_end], "dna")
+
+                                       unless newseq.seq.nil?
+                                               newseq.revcomp if comp
+
+                                               @subseq.seq << (order ? " " + newseq.seq : newseq.seq)
+                                       end
+        when /^(\d+)$/
+          pos = $1.to_i - 1
+
+          newseq = Seq.new(nil, @seq.seq[pos], "dna")
+
+                                       unless newseq.seq.nil?
+               newseq.revcomp if comp 
+
+               @subseq.seq << (order ? " " + newseq.seq : newseq.seq)
+                                       end
+        else
+          $stderr.puts "WARNING: Could not match locator -> #{locator}";
+          @subseq.seq << ""
+        end
+      end
+    end
+
+    return @subseq
+  end
+
+  def balance_parens?(locator)
+    parens = 0
+
+    locator.each_char do |char|
+      case char
+      when '(' then parens += 1
+      when ')' then parens -= 1
+      end
+    end
+
+    if parens == 0
+      return true
+    else
+      return false
+    end
+  end
+end
+
+
+__END__