1 # This program is free software; you can redistribute it and/or
2 # modify it under the terms of the GNU General Public License
3 # as published by the Free Software Foundation; either version 2
4 # of the License, or (at your option) any later version.
6 # This program is distributed in the hope that it will be useful,
7 # but WITHOUT ANY WARRANTY; without even the implied warranty of
8 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
9 # GNU General Public License for more details.
11 # You should have received a copy of the GNU General Public License
12 # along with this program; if not, write to the Free Software
13 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
15 # http://www.gnu.org/copyleft/gpl.html
17 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
19 # This software is part of the Biopieces framework (www.biopieces.org).
21 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
26 # Error class for all exceptions to do with Genbank.
27 class GenbankError < StandardError; end
32 # Class method allowing open to be used on (zipped) files.
35 ios = self.zopen(*args)
53 # Method to close ios.
58 # Iterator method for parsing Genbank entries.
59 def each(hash_keys, hash_feats, hash_quals)
60 while @entry = get_entry do
61 keys = get_keys(hash_keys)
64 features = GenbankFeatures.new(@entry, hash_feats, hash_quals)
66 features.each do |record|
67 keys.each_pair { |key,val| record[key] = val }
68 loc = Locator.new(record[:LOCATOR], seq)
69 record[:SEQ] = loc.subseq.seq
70 record[:STRAND] = loc.strand
71 record[:S_BEG] = loc.s_beg
72 record[:S_END] = loc.s_end
81 # Helper method to return an ios to a file that may be zipped in which case
82 # the ios is unzipped on the fly. See File.open.
84 ios = File.open(*args)
87 ios = Zlib::GzipReader.new(ios)
95 # Method to get the next Genbank entry form an ios and return this.
97 block = @io.gets("//" + $/)
98 return nil if block.nil?
100 block.chomp!("//" + $/ )
102 entry = block.split $/
104 return nil if entry.empty?
109 # Method to get the DNA sequence from a Genbank entry and return
110 # this as a Seq object.
115 while @entry[i] !~ /^ORIGIN/
119 seq.seq = @entry[i + 1 .. @entry.size].join.delete( " 0123456789")
124 # Method to get the base keys from Genbank entry and return these
126 def get_keys(hash_keys)
131 while @entry[i] and @entry[i] !~ /^FEATURES/
132 if @entry[i] =~ /^\s{0,3}([A-Z]{2})/
133 if want_key?(hash_keys, $1)
136 key, val = @entry[i].lstrip.split(/\s+/, 2)
138 while @entry[j] and @entry[j] !~ /^\s{0,3}[A-Z]/
139 val << @entry[j].lstrip
144 keys[key.to_sym] << ";" + val
146 keys[key.to_sym] = val
151 j > i ? i = j : i += 1
157 def want_key?(hash_keys, key)
159 if hash_keys[key.to_sym]
170 class GenbankFeatures
174 def initialize(entry, hash_feats, hash_quals)
176 @hash_feats = hash_feats
177 @hash_quals = hash_quals
181 while @entry[@@i] and @entry[@@i] !~ /^ORIGIN/
182 if @entry[@@i] =~ /^\s{5}([A-Za-z_-]+)/
186 feat, loc = @entry[@@i].lstrip.split(/\s+/, 2)
190 while @entry[@@j] and @entry[@@j] !~ /^(\s{21}\/|\s{5}[A-Za-z_-]|[A-Z])/
191 loc << @entry[@@j].lstrip
195 get_quals.each_pair { |k,v|
196 record[k.upcase.to_sym] = v
199 record[:FEATURE] = feat
200 record[:LOCATOR] = loc
206 @@j > @@i ? @@i = @@j : @@i += 1
216 while @entry[@@j] and @entry[@@j] !~ /^\s{5}[A-Za-z_-]|^[A-Z]/
217 if @entry[@@j] =~ /^\s{21}\/([^=]+)="([^"]+)/
224 while @entry[k] and @entry[k] !~ /^(\s{21}\/|\s{5}[A-Za-z_-]|[A-Z])/
225 val << @entry[k].lstrip.chomp('"')
230 quals[qual] << ";" + val
237 k > @@j ? @@j = k : @@j += 1
245 if @hash_feats[feat.upcase.to_sym]
257 if @hash_quals[qual.upcase.to_sym]
269 # Error class for all exceptions to do with Genbank/EMBL/DDBJ feature table locators.
270 class LocatorError < StandardError; end
273 attr_accessor :locator, :seq, :subseq
275 def initialize(locator, seq)
279 @subseq = Seq.new(nil, "", "dna")
287 if @loc_orig.match("complement")
295 if @loc_orig =~ /(\d+)/
301 if @loc_orig.reverse =~ /(\d+)/
302 return $1.reverse.to_i - 1
308 # Method that uses recursion to parse a locator string from a feature
309 # table and fetches the appropriate subsequence. the operators
310 # join(), complement(), and order() are handled.
311 # the locator string is broken into a comma separated lists, and
312 # modified if the params donnot balance. otherwise the comma separated
313 # list of ranges are stripped from operators, and the subsequence are
314 # fetched and handled according to the operators.
315 # SNP locators are also dealt with (single positions).
316 def parse_locator(join = nil, comp = nil, order = nil)
317 intervals = @locator.split(",")
319 unless balance_params?(intervals.first) # locator includes a join/comp/order of several ranges
321 when /^join\((.*)\)$/
324 when /^complement\((.*)\)$/
327 when /^order\((.*)\)$/
332 parse_locator(join, comp, order)
334 intervals.each do |interval|
336 when /^join\((.*)\)$/
339 parse_locator(join, comp, order)
340 when /^complement\((.*)\)$/
343 parse_locator(join, comp, order)
344 when /^order\((.*)\)$/
347 parse_locator(join, comp, order)
348 when /^[<>]?(\d+)[^\d]+(\d+)$/
349 int_beg = $1.to_i - 1
350 int_end = $2.to_i - 1
352 newseq = Seq.new(nil, @seq.seq[int_beg...int_end], "dna")
353 newseq.revcomp if comp
355 @subseq.seq << (order ? " " + newseq.seq : newseq.seq)
359 newseq = Seq.new(nil, @seq.seq[pos], "dna")
360 newseq.revcomp if comp
362 @subseq.seq << (order ? " " + newseq.seq : newseq.seq)
364 $stderr.puts "WARNING: Could not match locator -> #{locator}";
373 def balance_params?(locator)
376 locator.each_char do |char|
378 when '(' then parens += 1
379 when ')' then parens -= 1