1 # Copyright (C) 2007-2010 Martin A. Hansen.
3 # This program is free software; you can redistribute it and/or
4 # modify it under the terms of the GNU General Public License
5 # as published by the Free Software Foundation; either version 2
6 # of the License, or (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
13 # You should have received a copy of the GNU General Public License
14 # along with this program; if not, write to the Free Software
15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 # http://www.gnu.org/copyleft/gpl.html
19 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
21 # This software is part of the Biopieces framework (www.biopieces.org).
23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
26 require 'maasha/filesys'
28 # Error class for all exceptions to do with Genbank.
29 class GenbankError < StandardError; end
31 class Genbank < Filesys
37 # Iterator method for parsing Genbank entries.
38 def each(hash_keys = nil, hash_feats = nil, hash_quals = nil)
39 while @entry = get_entry do
40 keys = get_keys(hash_keys)
43 features = GenbankFeatures.new(@entry, hash_feats, hash_quals)
45 features.each do |record|
46 keys.each_pair { |key,val| record[key] = val }
48 loc = Locator.new(record[:LOCATOR], seq)
49 record[:SEQ] = loc.subseq.seq
50 record[:SEQ_LEN] = loc.subseq.length
51 record[:STRAND] = loc.strand
52 record[:S_BEG] = loc.s_beg
53 record[:S_END] = loc.s_end
62 # Method to get the next Genbank entry form an ios and return this.
64 block = @io.gets("//" + $/)
65 return nil if block.nil?
67 block.chomp!("//" + $/ )
69 entry = block.split $/
71 return nil if entry.empty?
76 # Method to get the DNA sequence from a Genbank entry and return
77 # this as a Seq object.
82 while @entry[j] and @entry[j] !~ /^[A-Z]/
86 seq = @entry[j + 1 .. i].join.delete(" 0123456789")
88 Seq.new(nil, seq, "dna") if seq
91 # Method to get the base keys from Genbank entry and return these
93 def get_keys(hash_keys)
98 while @entry[i] and @entry[i] !~ /^FEATURES/
99 if @entry[i] =~ /^\s{0,3}([A-Z]{2})/
100 if want_key?(hash_keys, $1)
103 key, val = @entry[i].lstrip.split(/\s+/, 2)
105 while @entry[j] and @entry[j] !~ /^\s{0,3}[A-Z]/
106 val << @entry[j].lstrip
111 keys[key.to_sym] << ";" + val
113 keys[key.to_sym] = val
118 j > i ? i = j : i += 1
124 def want_key?(hash_keys, key)
126 if hash_keys[key.to_sym]
137 class GenbankFeatures
141 def initialize(entry, hash_feats, hash_quals)
143 @hash_feats = hash_feats
144 @hash_quals = hash_quals
148 while @entry[@@i] and @entry[@@i] !~ /^ORIGIN/
149 if @entry[@@i] =~ /^\s{5}([A-Za-z_-]+)/
153 feat, loc = @entry[@@i].lstrip.split(/\s+/, 2)
157 while @entry[@@j] and @entry[@@j] !~ /^(\s{21}\/|\s{5}[A-Za-z_-]|[A-Z])/
158 loc << @entry[@@j].lstrip
162 get_quals.each_pair { |k,v|
163 record[k.upcase.to_sym] = v
166 record[:FEATURE] = feat
167 record[:LOCATOR] = loc
173 @@j > @@i ? @@i = @@j : @@i += 1
183 while @entry[@@j] and @entry[@@j] !~ /^\s{5}[A-Za-z_-]|^[A-Z]/
184 if @entry[@@j] =~ /^\s{21}\/([^=]+)="([^"]+)/
191 while @entry[k] and @entry[k] !~ /^(\s{21}\/|\s{5}[A-Za-z_-]|[A-Z])/
192 val << @entry[k].lstrip.chomp('"')
197 quals[qual] << ";" + val
204 k > @@j ? @@j = k : @@j += 1
212 if @hash_feats[feat.upcase.to_sym]
224 if @hash_quals[qual.upcase.to_sym]
236 # Error class for all exceptions to do with Genbank/EMBL/DDBJ feature table locators.
237 class LocatorError < StandardError; end
240 attr_accessor :locator, :seq, :subseq
242 def initialize(locator, seq)
245 @subseq = Seq.new(nil, "", "dna")
246 parse_locator(locator)
250 if @locator.match("complement")
258 if @locator =~ /(\d+)/
264 if @locator.reverse =~ /(\d+)/
265 return $1.reverse.to_i - 1
271 # Method that uses recursion to parse a locator string from a feature
272 # table and fetches the appropriate subsequence. the operators
273 # join(), complement(), and order() are handled.
274 # the locator string is broken into a comma separated lists, and
275 # modified if the parens donnot balance. otherwise the comma separated
276 # list of ranges are stripped from operators, and the subsequence are
277 # fetched and handled according to the operators.
278 # SNP locators are also dealt with (single positions).
279 def parse_locator(locator, join = nil, comp = nil, order = nil)
280 intervals = locator.split(",")
282 unless balance_parens?(intervals.first) # locator includes a join/comp/order of several ranges
284 when /^join\((.*)\)$/
287 when /^complement\((.*)\)$/
290 when /^order\((.*)\)$/
295 parse_locator(locator, join, comp, order)
297 intervals.each do |interval|
299 when /^join\((.*)\)$/
302 parse_locator(locator, join, comp, order)
303 when /^complement\((.*)\)$/
306 parse_locator(locator, join, comp, order)
307 when /^order\((.*)\)$/
310 parse_locator(locator, join, comp, order)
311 when /^[<>]?(\d+)[^\d]+(\d+)$/
312 int_beg = $1.to_i - 1
313 int_end = $2.to_i - 1
315 newseq = Seq.new(nil, @seq.seq[int_beg...int_end], "dna")
317 unless newseq.seq.nil?
318 newseq.revcomp if comp
320 @subseq.seq << (order ? " " + newseq.seq : newseq.seq)
325 newseq = Seq.new(nil, @seq.seq[pos], "dna")
327 unless newseq.seq.nil?
328 newseq.revcomp if comp
330 @subseq.seq << (order ? " " + newseq.seq : newseq.seq)
333 $stderr.puts "WARNING: Could not match locator -> #{locator}";
342 def balance_parens?(locator)
345 locator.each_char do |char|
347 when '(' then parens += 1
348 when ')' then parens -= 1