1 # Copyright (C) 2007-2010 Martin A. Hansen.
3 # This program is free software; you can redistribute it and/or
4 # modify it under the terms of the GNU General Public License
5 # as published by the Free Software Foundation; either version 2
6 # of the License, or (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
13 # You should have received a copy of the GNU General Public License
14 # along with this program; if not, write to the Free Software
15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 # http://www.gnu.org/copyleft/gpl.html
19 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
21 # This software is part of the Biopieces framework (www.biopieces.org).
23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
26 require 'maasha/filesys'
29 # Error class for all exceptions to do with Genbank.
30 class GenbankError < StandardError; end
32 class Genbank < Filesys
38 # Iterator method for parsing Genbank entries.
39 def each(hash_keys = nil, hash_feats = nil, hash_quals = nil)
40 while @entry = get_entry do
41 keys = get_keys(hash_keys)
44 features = GenbankFeatures.new(@entry, hash_feats, hash_quals)
46 features.each do |record|
47 keys.each_pair { |key,val| record[key] = val }
49 loc = Locator.new(record[:LOCATOR], seq)
50 record[:SEQ] = loc.subseq.seq
51 record[:SEQ_LEN] = loc.subseq.length
52 record[:STRAND] = loc.strand
53 record[:S_BEG] = loc.s_beg
54 record[:S_END] = loc.s_end
63 # Method to get the next Genbank entry form an ios and return this.
65 block = @io.gets("//" + $/)
66 return nil if block.nil?
68 block.chomp!("//" + $/ )
70 entry = block.split $/
72 return nil if entry.empty?
77 # Method to get the DNA sequence from a Genbank entry and return
78 # this as a Seq object.
83 while @entry[j] and @entry[j] !~ /^[A-Z]/
87 seq = @entry[j + 1 .. i].join.delete(" 0123456789")
89 Seq.new(nil, seq, "dna") if seq
92 # Method to get the base keys from Genbank entry and return these
94 def get_keys(hash_keys)
99 while @entry[i] and @entry[i] !~ /^FEATURES/
100 if @entry[i] =~ /^\s{0,3}([A-Z]{2})/
101 if want_key?(hash_keys, $1)
104 key, val = @entry[i].lstrip.split(/\s+/, 2)
106 while @entry[j] and @entry[j] !~ /^\s{0,3}[A-Z]/
107 val << @entry[j].lstrip
112 keys[key.to_sym] << ";" + val
114 keys[key.to_sym] = val
119 j > i ? i = j : i += 1
125 def want_key?(hash_keys, key)
127 if hash_keys[key.to_sym]
138 class GenbankFeatures
139 def initialize(entry, hash_feats, hash_quals)
141 @hash_feats = hash_feats
142 @hash_quals = hash_quals
148 while @entry[@i] and @entry[@i] !~ /^ORIGIN/
149 if @entry[@i] =~ /^\s{5}([A-Za-z_-]+)/
153 feat, loc = @entry[@i].lstrip.split(/\s+/, 2)
157 while @entry[@j] and @entry[@j] !~ /^(\s{21}\/|\s{5}[A-Za-z_-]|[A-Z])/
158 loc << @entry[@j].lstrip
162 get_quals.each_pair { |k,v|
163 record[k.upcase.to_sym] = v
166 record[:FEATURE] = feat
167 record[:LOCATOR] = loc
173 @j > @i ? @i = @j : @i += 1
183 while @entry[@j] and @entry[@j] !~ /^\s{5}[A-Za-z_-]|^[A-Z]/
184 if @entry[@j] =~ /^\s{21}\/([^=]+)="([^"]+)/
191 while @entry[k] and @entry[k] !~ /^(\s{21}\/|\s{5}[A-Za-z_-]|[A-Z])/
192 val << @entry[k].lstrip.chomp('"')
197 quals[qual] << ";" + val
204 k > @j ? @j = k : @j += 1
212 if @hash_feats[feat.upcase.to_sym]
224 if @hash_quals[qual.upcase.to_sym]
236 # Error class for all exceptions to do with Genbank/EMBL/DDBJ feature table locators.
237 class LocatorError < StandardError; end
240 attr_accessor :locator, :seq, :subseq
242 def initialize(locator, seq)
245 @subseq = Seq.new(nil, "", "dna")
246 parse_locator(locator)
250 if @locator.match("complement")
258 if @locator =~ /(\d+)/
264 if @locator.reverse =~ /(\d+)/
265 return $1.reverse.to_i - 1
271 # Method that uses recursion to parse a locator string from a feature
272 # table and fetches the appropriate subsequence. the operators
273 # join(), complement(), and order() are handled.
274 # the locator string is broken into a comma separated lists, and
275 # modified if the parens donnot balance. otherwise the comma separated
276 # list of ranges are stripped from operators, and the subsequence are
277 # fetched and handled according to the operators.
278 # SNP locators are also dealt with (single positions).
279 def parse_locator(locator, join = nil, comp = nil, order = nil)
280 intervals = locator.split(",")
282 unless balance_parens?(intervals.first) # locator includes a join/comp/order of several ranges
284 when /^join\((.*)\)$/
287 when /^complement\((.*)\)$/
290 when /^order\((.*)\)$/
295 parse_locator(locator, join, comp, order)
297 intervals.each do |interval|
299 when /^join\((.*)\)$/
302 parse_locator(locator, join, comp, order)
303 when /^complement\((.*)\)$/
306 parse_locator(locator, join, comp, order)
307 when /^order\((.*)\)$/
310 parse_locator(locator, join, comp, order)
311 when /^[<>]?(\d+)[^\d]+(\d+)$/
312 int_beg = $1.to_i - 1
313 int_end = $2.to_i - 1
315 newseq = Seq.new(nil, @seq.seq[int_beg...int_end], "dna")
317 unless newseq.seq.nil?
318 newseq.revcomp if comp
320 @subseq.seq << (order ? " " + newseq.seq : newseq.seq)
325 newseq = Seq.new(nil, @seq.seq[pos], "dna")
327 unless newseq.seq.nil?
328 newseq.revcomp if comp
330 @subseq.seq << (order ? " " + newseq.seq : newseq.seq)
333 $stderr.puts "WARNING: Could not match locator -> #{locator}";
342 def balance_parens?(locator)
345 locator.each_char do |char|
347 when '(' then parens += 1
348 when ')' then parens -= 1