--- /dev/null
+# Copyright (C) 2007-2010 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# This software is part of the Biopieces framework (www.biopieces.org).
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+require 'maasha/seq'
+require 'maasha/filesys'
+
+# Error class for all exceptions to do with Genbank.
+class GenbankError < StandardError; end
+
+class Genbank < Filesys
+ def initialize(io)
+ @io = io
+ @entry = []
+ end
+
+ # Iterator method for parsing Genbank entries.
+ def each(hash_keys = nil, hash_feats = nil, hash_quals = nil)
+ while @entry = get_entry do
+ keys = get_keys(hash_keys)
+ seq = get_seq
+
+ features = GenbankFeatures.new(@entry, hash_feats, hash_quals)
+
+ features.each do |record|
+ keys.each_pair { |key,val| record[key] = val }
+
+ loc = Locator.new(record[:LOCATOR], seq)
+ record[:SEQ] = loc.subseq.seq
+ record[:SEQ_LEN] = loc.subseq.length
+ record[:STRAND] = loc.strand
+ record[:S_BEG] = loc.s_beg
+ record[:S_END] = loc.s_end
+
+ yield record
+ end
+ end
+ end
+
+ private
+
+ # Method to get the next Genbank entry form an ios and return this.
+ def get_entry
+ block = @io.gets("//" + $/)
+ return nil if block.nil?
+
+ block.chomp!("//" + $/ )
+
+ entry = block.split $/
+
+ return nil if entry.empty?
+
+ entry
+ end
+
+ # Method to get the DNA sequence from a Genbank entry and return
+ # this as a Seq object.
+ def get_seq
+ i = @entry.size
+ j = i - 1
+
+ while @entry[j] and @entry[j] !~ /^[A-Z]/
+ j -= 1
+ end
+
+ seq = @entry[j + 1 .. i].join.delete(" 0123456789")
+
+ Seq.new(nil, seq, "dna") if seq
+ end
+
+ # Method to get the base keys from Genbank entry and return these
+ # in a hash.
+ def get_keys(hash_keys)
+ keys = {}
+ i = 0
+ j = 0
+
+ while @entry[i] and @entry[i] !~ /^FEATURES/
+ if @entry[i] =~ /^\s{0,3}([A-Z]{2})/
+ if want_key?(hash_keys, $1)
+ j = i + 1
+
+ key, val = @entry[i].lstrip.split(/\s+/, 2)
+
+ while @entry[j] and @entry[j] !~ /^\s{0,3}[A-Z]/
+ val << @entry[j].lstrip
+ j += 1
+ end
+
+ if keys[key.to_sym]
+ keys[key.to_sym] << ";" + val
+ else
+ keys[key.to_sym] = val
+ end
+ end
+ end
+
+ j > i ? i = j : i += 1
+ end
+
+ keys
+ end
+
+ def want_key?(hash_keys, key)
+ if hash_keys
+ if hash_keys[key.to_sym]
+ return true
+ else
+ return false
+ end
+ else
+ return true
+ end
+ end
+end
+
+class GenbankFeatures
+ @@i = 0
+ @@j = 0
+
+ def initialize(entry, hash_feats, hash_quals)
+ @entry = entry
+ @hash_feats = hash_feats
+ @hash_quals = hash_quals
+ end
+
+ def each
+ while @entry[@@i] and @entry[@@i] !~ /^ORIGIN/
+ if @entry[@@i] =~ /^\s{5}([A-Za-z_-]+)/
+ if want_feat? $1
+ record = {}
+
+ feat, loc = @entry[@@i].lstrip.split(/\s+/, 2)
+
+ @@j = @@i + 1
+
+ while @entry[@@j] and @entry[@@j] !~ /^(\s{21}\/|\s{5}[A-Za-z_-]|[A-Z])/
+ loc << @entry[@@j].lstrip
+ @@j += 1
+ end
+
+ get_quals.each_pair { |k,v|
+ record[k.upcase.to_sym] = v
+ }
+
+ record[:FEATURE] = feat
+ record[:LOCATOR] = loc
+
+ yield record
+ end
+ end
+
+ @@j > @@i ? @@i = @@j : @@i += 1
+ end
+ end
+
+ private
+
+ def get_quals
+ quals = {}
+ k = 0
+
+ while @entry[@@j] and @entry[@@j] !~ /^\s{5}[A-Za-z_-]|^[A-Z]/
+ if @entry[@@j] =~ /^\s{21}\/([^=]+)="([^"]+)/
+ qual = $1
+ val = $2
+
+ if want_qual? qual
+ k = @@j + 1
+
+ while @entry[k] and @entry[k] !~ /^(\s{21}\/|\s{5}[A-Za-z_-]|[A-Z])/
+ val << @entry[k].lstrip.chomp('"')
+ k += 1
+ end
+
+ if quals[qual]
+ quals[qual] << ";" + val
+ else
+ quals[qual] = val
+ end
+ end
+ end
+
+ k > @@j ? @@j = k : @@j += 1
+ end
+
+ quals
+ end
+
+ def want_feat?(feat)
+ if @hash_feats
+ if @hash_feats[feat.upcase.to_sym]
+ return true
+ else
+ return false
+ end
+ else
+ return true
+ end
+ end
+
+ def want_qual?(qual)
+ if @hash_quals
+ if @hash_quals[qual.upcase.to_sym]
+ return true
+ else
+ return false
+ end
+ else
+ return true
+ end
+ end
+end
+
+
+# Error class for all exceptions to do with Genbank/EMBL/DDBJ feature table locators.
+class LocatorError < StandardError; end
+
+class Locator
+ attr_accessor :locator, :seq, :subseq
+
+ def initialize(locator, seq)
+ @locator = locator
+ @seq = seq
+ @subseq = Seq.new(nil, "", "dna")
+ parse_locator(locator)
+ end
+
+ def strand
+ if @locator.match("complement")
+ return "-"
+ else
+ return "+"
+ end
+ end
+
+ def s_beg
+ if @locator =~ /(\d+)/
+ return $1.to_i - 1
+ end
+ end
+
+ def s_end
+ if @locator.reverse =~ /(\d+)/
+ return $1.reverse.to_i - 1
+ end
+ end
+
+ private
+
+ # Method that uses recursion to parse a locator string from a feature
+ # table and fetches the appropriate subsequence. the operators
+ # join(), complement(), and order() are handled.
+ # the locator string is broken into a comma separated lists, and
+ # modified if the parens donnot balance. otherwise the comma separated
+ # list of ranges are stripped from operators, and the subsequence are
+ # fetched and handled according to the operators.
+ # SNP locators are also dealt with (single positions).
+ def parse_locator(locator, join = nil, comp = nil, order = nil)
+ intervals = locator.split(",")
+
+ unless balance_parens?(intervals.first) # locator includes a join/comp/order of several ranges
+ case locator
+ when /^join\((.*)\)$/
+ locator = $1
+ join = true
+ when /^complement\((.*)\)$/
+ locator = $1
+ comp = true
+ when /^order\((.*)\)$/
+ locator = $1
+ order = true
+ end
+
+ parse_locator(locator, join, comp, order)
+ else
+ intervals.each do |interval|
+ case interval
+ when /^join\((.*)\)$/
+ locator = $1
+ join = true
+ parse_locator(locator, join, comp, order)
+ when /^complement\((.*)\)$/
+ locator = $1
+ comp = true
+ parse_locator(locator, join, comp, order)
+ when /^order\((.*)\)$/
+ locator = $1
+ order = true
+ parse_locator(locator, join, comp, order)
+ when /^[<>]?(\d+)[^\d]+(\d+)$/
+ int_beg = $1.to_i - 1
+ int_end = $2.to_i - 1
+
+ newseq = Seq.new(nil, @seq.seq[int_beg...int_end], "dna")
+
+ unless newseq.seq.nil?
+ newseq.revcomp if comp
+
+ @subseq.seq << (order ? " " + newseq.seq : newseq.seq)
+ end
+ when /^(\d+)$/
+ pos = $1.to_i - 1
+
+ newseq = Seq.new(nil, @seq.seq[pos], "dna")
+
+ unless newseq.seq.nil?
+ newseq.revcomp if comp
+
+ @subseq.seq << (order ? " " + newseq.seq : newseq.seq)
+ end
+ else
+ $stderr.puts "WARNING: Could not match locator -> #{locator}";
+ @subseq.seq << ""
+ end
+ end
+ end
+
+ return @subseq
+ end
+
+ def balance_parens?(locator)
+ parens = 0
+
+ locator.each_char do |char|
+ case char
+ when '(' then parens += 1
+ when ')' then parens -= 1
+ end
+ end
+
+ if parens == 0
+ return true
+ else
+ return false
+ end
+ end
+end
+
+
+__END__