X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=code_ruby%2Flib%2Fmaasha%2Fgenbank.rb;h=9635efd69f92971b6cac9ab54c3331885c77ee28;hb=a30677c14f1738f6a76e8c12f2e732cdef9958d6;hp=a6ec292621a91f4259d25689a4606aeccac9b05a;hpb=494dc53ebd515b1e3e9b91bbebf43059899ca4ce;p=biopieces.git diff --git a/code_ruby/lib/maasha/genbank.rb b/code_ruby/lib/maasha/genbank.rb index a6ec292..9635efd 100644 --- a/code_ruby/lib/maasha/genbank.rb +++ b/code_ruby/lib/maasha/genbank.rb @@ -1,4 +1,4 @@ -# Copyright (C) 2007-2010 Martin A. Hansen. +# Copyright (C) 2007-2011 Martin A. Hansen. # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License @@ -22,8 +22,10 @@ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +require 'maasha/locator' require 'maasha/seq' require 'maasha/filesys' +require 'pp' # Error class for all exceptions to do with Genbank. class GenbankError < StandardError; end @@ -66,7 +68,7 @@ class Genbank < Filesys block.chomp!("//" + $/ ) - entry = block.split $/ + entry = block.tr("\r", "\n").split $/ return nil if entry.empty? @@ -79,13 +81,13 @@ class Genbank < Filesys i = @entry.size j = i - 1 - while @entry[j] and @entry[j] !~ /^[A-Z]/ + while @entry[j] and @entry[j] =~ /^\s+\d/ j -= 1 end seq = @entry[j + 1 .. i].join.delete(" 0123456789") - Seq.new(nil, seq, "dna") if seq + Seq.new(seq: seq, type: :dna) if seq end # Method to get the base keys from Genbank entry and return these @@ -135,28 +137,27 @@ class Genbank < Filesys end class GenbankFeatures - @@i = 0 - @@j = 0 - def initialize(entry, hash_feats, hash_quals) @entry = entry @hash_feats = hash_feats @hash_quals = hash_quals + @i = 0 + @j = 0 end def each - while @entry[@@i] and @entry[@@i] !~ /^ORIGIN/ - if @entry[@@i] =~ /^\s{5}([A-Za-z_-]+)/ + while @entry[@i] and @entry[@i] !~ /^ORIGIN/ + if @entry[@i] =~ /^\s{5}([53'A-Za-z_-]+)/ if want_feat? $1 record = {} - feat, loc = @entry[@@i].lstrip.split(/\s+/, 2) + feat, loc = @entry[@i].lstrip.split(/\s+/, 2) - @@j = @@i + 1 + @j = @i + 1 - while @entry[@@j] and @entry[@@j] !~ /^(\s{21}\/|\s{5}[A-Za-z_-]|[A-Z])/ - loc << @entry[@@j].lstrip - @@j += 1 + while @entry[@j] and @entry[@j] !~ /^(\s{21}\/|\s{5}[53'A-Za-z_-]|[A-Z])/ + loc << @entry[@j].lstrip + @j += 1 end get_quals.each_pair { |k,v| @@ -170,7 +171,7 @@ class GenbankFeatures end end - @@j > @@i ? @@i = @@j : @@i += 1 + @j > @i ? @i = @j : @i += 1 end end @@ -180,15 +181,15 @@ class GenbankFeatures quals = {} k = 0 - while @entry[@@j] and @entry[@@j] !~ /^\s{5}[A-Za-z_-]|^[A-Z]/ - if @entry[@@j] =~ /^\s{21}\/([^=]+)="([^"]+)/ + while @entry[@j] and @entry[@j] !~ /^\s{5}[53'A-Za-z_-]|^[A-Z]/ + if @entry[@j] =~ /^\s{21}\/([^=]+)="([^"]+)/ qual = $1 val = $2 if want_qual? qual - k = @@j + 1 + k = @j + 1 - while @entry[k] and @entry[k] !~ /^(\s{21}\/|\s{5}[A-Za-z_-]|[A-Z])/ + while @entry[k] and @entry[k] !~ /^(\s{21}\/|\s{5}[53'A-Za-z_-]|[A-Z])/ val << @entry[k].lstrip.chomp('"') k += 1 end @@ -201,7 +202,7 @@ class GenbankFeatures end end - k > @@j ? @@j = k : @@j += 1 + k > @j ? @j = k : @j += 1 end quals @@ -232,130 +233,4 @@ class GenbankFeatures end end - -# Error class for all exceptions to do with Genbank/EMBL/DDBJ feature table locators. -class LocatorError < StandardError; end - -class Locator - attr_accessor :locator, :seq, :subseq - - def initialize(locator, seq) - @locator = locator - @seq = seq - @subseq = Seq.new(nil, "", "dna") - parse_locator(locator) - end - - def strand - if @locator.match("complement") - return "-" - else - return "+" - end - end - - def s_beg - if @locator =~ /(\d+)/ - return $1.to_i - 1 - end - end - - def s_end - if @locator.reverse =~ /(\d+)/ - return $1.reverse.to_i - 1 - end - end - - private - - # Method that uses recursion to parse a locator string from a feature - # table and fetches the appropriate subsequence. the operators - # join(), complement(), and order() are handled. - # the locator string is broken into a comma separated lists, and - # modified if the parens donnot balance. otherwise the comma separated - # list of ranges are stripped from operators, and the subsequence are - # fetched and handled according to the operators. - # SNP locators are also dealt with (single positions). - def parse_locator(locator, join = nil, comp = nil, order = nil) - intervals = locator.split(",") - - unless balance_parens?(intervals.first) # locator includes a join/comp/order of several ranges - case locator - when /^join\((.*)\)$/ - locator = $1 - join = true - when /^complement\((.*)\)$/ - locator = $1 - comp = true - when /^order\((.*)\)$/ - locator = $1 - order = true - end - - parse_locator(locator, join, comp, order) - else - intervals.each do |interval| - case interval - when /^join\((.*)\)$/ - locator = $1 - join = true - parse_locator(locator, join, comp, order) - when /^complement\((.*)\)$/ - locator = $1 - comp = true - parse_locator(locator, join, comp, order) - when /^order\((.*)\)$/ - locator = $1 - order = true - parse_locator(locator, join, comp, order) - when /^[<>]?(\d+)[^\d]+(\d+)$/ - int_beg = $1.to_i - 1 - int_end = $2.to_i - 1 - - newseq = Seq.new(nil, @seq.seq[int_beg...int_end], "dna") - - unless newseq.seq.nil? - newseq.revcomp if comp - - @subseq.seq << (order ? " " + newseq.seq : newseq.seq) - end - when /^(\d+)$/ - pos = $1.to_i - 1 - - newseq = Seq.new(nil, @seq.seq[pos], "dna") - - unless newseq.seq.nil? - newseq.revcomp if comp - - @subseq.seq << (order ? " " + newseq.seq : newseq.seq) - end - else - $stderr.puts "WARNING: Could not match locator -> #{locator}"; - @subseq.seq << "" - end - end - end - - return @subseq - end - - def balance_parens?(locator) - parens = 0 - - locator.each_char do |char| - case char - when '(' then parens += 1 - when ')' then parens -= 1 - end - end - - if parens == 0 - return true - else - return false - end - end -end - - __END__