From: martinahansen Date: Tue, 10 Jul 2012 08:52:14 +0000 (+0000) Subject: fixed bug for missing sequence in genbank parser X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=ea0a8a58dfd5a31a41425010398634deba57ada7;p=biopieces.git fixed bug for missing sequence in genbank parser git-svn-id: http://biopieces.googlecode.com/svn/trunk@1858 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/code_ruby/lib/maasha/align.rb b/code_ruby/lib/maasha/align.rb index 4fe9238..139c4a2 100755 --- a/code_ruby/lib/maasha/align.rb +++ b/code_ruby/lib/maasha/align.rb @@ -25,8 +25,8 @@ require 'pp' require 'open3' require 'narray' +require 'maasha/align/pair' require 'maasha/fasta' -require 'maasha/mum' class AlignError < StandardError; end; @@ -92,6 +92,8 @@ class Align attr_accessor :options attr_reader :entries + include PairAlign + # Class method to align sequences in a list of Seq objects and # return these as a new list of Seq objects. def self.muscle(entries, verbose = false) @@ -134,22 +136,7 @@ class Align # the left and right search spaced around this match. When all search spaces # are exhausted the saved matches are used to insert gaps in the sequences. def self.pair(q_entry, s_entry) - q_space_beg = 0 - q_space_end = q_entry.length - s_space_beg = 0 - s_space_end = s_entry.length - - q_entry.seq.downcase! - s_entry.seq.downcase! - - ap = AlignPair.new(q_entry, s_entry) - - ap.align(q_space_beg, q_space_end, s_space_beg, s_space_end, 15, []) - - ap.upcase_match - ap.insert_gaps - - #ap.dump_bp and exit + AlignPair.align(q_entry, s_entry) self.new([q_entry, s_entry]) end @@ -224,136 +211,6 @@ class Align private - # Class for creating a pairwise alignment of two specified sequences. The - # alignment is created by casting a search space the size of the sequences - # and save the longest perfect match between the sequences and recurse into - # the left and right search spaced around this match. When all search spaces - # are exhausted the saved matches are used to insert gaps in the sequences. - class AlignPair - attr_reader :matches - - # Method to initialize an AlignPair object given two Seq objects. - def initialize(q_entry, s_entry) - @q_entry = q_entry - @s_entry = s_entry - @matches = [] - end - - # Method that locates the longest perfect match in a specified search space - # by comparing two sequence indeces. The first index is created by hashing - # for the first sequence the position of all non-overlapping unique oligos - # with the oligo as key an the position as value. The second index is - # created for the second sequence by hashing the position of all - # overlapping unique oligos in a similar way. The longest match is located - # by comparing these indeces and the longest match is expanded maximally - # within the specified search space. Finally, the longest match is used - # to define a left and a right search space which are recursed into - # until no more matches are found. - def align(q_space_beg, q_space_end, s_space_beg, s_space_end, kmer_size, matches) -# puts "search space: #{q_space_beg}-#{q_space_end} x #{s_space_beg}-#{s_space_end}" - new_matches = [] - - matches.each do |match| - new_matches << match if match.q_beg >= q_space_beg and - match.q_end <= q_space_end and - match.s_beg >= s_space_beg and - match.s_end <= s_space_end - end - - while new_matches.empty? and kmer_size > 0 - if @q_entry.length >= kmer_size and @s_entry.length >= kmer_size - new_matches = MUMs.find(@q_entry, @s_entry, {"q_space_beg" => q_space_beg, - "q_space_end" => q_space_end, - "s_space_beg" => s_space_beg, - "s_space_end" => s_space_end, - "kmer_size" => kmer_size} ) - end - - unless new_matches.empty? - best_match = new_matches.sort_by { |match| match.score }.pop - - @matches << best_match - - q_space_beg_left = (q_space_beg == 0) ? 0 : q_space_beg + 1 - s_space_beg_left = (s_space_beg == 0) ? 0 : s_space_beg + 1 - q_space_end_left = best_match.q_beg - 1 - s_space_end_left = best_match.s_beg - 1 - - q_space_beg_right = best_match.q_end + 1 - s_space_beg_right = best_match.s_end + 1 - q_space_end_right = (q_space_end == @q_entry.length) ? @q_entry.length : q_space_end - 1 - s_space_end_right = (s_space_end == @s_entry.length) ? @s_entry.length : s_space_end - 1 - - if q_space_end_left - q_space_beg_left > 0 and s_space_end_left - s_space_beg_left > 0 - align(q_space_beg_left, q_space_end_left, s_space_beg_left, s_space_end_left, kmer_size, new_matches) - end - - if q_space_end_right - q_space_beg_right > 0 and s_space_end_right - s_space_beg_right > 0 - align(q_space_beg_right, q_space_end_right, s_space_beg_right, s_space_end_right, kmer_size, new_matches) - end - end - - kmer_size /= 2 - end - end - - # Method for debugging purposes that upcase matching sequence while non-matches - # sequence is kept in lower case. - def upcase_match - @matches.each do |match| - @q_entry.seq[match.q_beg .. match.q_end] = @q_entry.seq[match.q_beg .. match.q_end].upcase - @s_entry.seq[match.s_beg .. match.s_end] = @s_entry.seq[match.s_beg .. match.s_end].upcase - end - end - - # Method that insert gaps in sequences based on a list of matches and thus - # creating an alignment. - # TODO check boundaries! - def insert_gaps - @matches.sort_by! { |m| m.q_beg } - - q_gaps = 0 - s_gaps = 0 - - @matches.each do |match| - diff = (q_gaps + match.q_beg) - (s_gaps + match.s_beg) - - #pp "q_gaps #{q_gaps} s_gaps #{s_gaps} diff #{diff}" - - if diff < 0 - @q_entry.seq.insert(match.q_beg + q_gaps, "-" * diff.abs) - q_gaps += diff.abs - elsif diff > 0 - @s_entry.seq.insert(match.s_beg + s_gaps, "-" * diff.abs) - s_gaps += diff.abs - end - end - - diff = @q_entry.length - @s_entry.length - - if diff < 0 - @q_entry.seq << ("-" * diff.abs) - else - @s_entry.seq << ("-" * diff.abs) - end - end - - # Method that dumps all matches as Biopiece records for use with - # plot_matches. - def dump_bp - @matches.sort_by! { |m| m.q_beg } - - @matches.each { |m| - puts "Q_BEG: #{m.q_beg}" - puts "Q_END: #{m.q_end}" - puts "S_BEG: #{m.s_beg}" - puts "S_END: #{m.s_end}" - puts "STRAND: +" - puts "---" - } - end - end - class Consensus # Method to initialize a Consensus object given a list of aligned Seq object. def initialize(entries, options) diff --git a/code_ruby/lib/maasha/genbank.rb b/code_ruby/lib/maasha/genbank.rb index d444b4b..fe08263 100644 --- a/code_ruby/lib/maasha/genbank.rb +++ b/code_ruby/lib/maasha/genbank.rb @@ -81,7 +81,7 @@ class Genbank < Filesys i = @entry.size j = i - 1 - while @entry[j] and @entry[j] !~ /^[A-Z]/ + while @entry[j] and @entry[j] =~ /^\s+\d/ j -= 1 end diff --git a/code_ruby/lib/maasha/mum.rb b/code_ruby/lib/maasha/mum.rb index 09f18ad..400d512 100755 --- a/code_ruby/lib/maasha/mum.rb +++ b/code_ruby/lib/maasha/mum.rb @@ -12,17 +12,16 @@ BYTES_IN_OLIGO = BYTES_IN_BIN + BYTES_IN_POS class MUMs include Enumerable - def self.find(q_seq, s_seq, opt_hash) - m = self.new(q_seq, s_seq, opt_hash) + def self.find(q_seq, s_seq, space, kmer_size) + m = self.new(q_seq, s_seq, space, kmer_size) m.to_a end - def initialize(q_seq, s_seq, opt_hash) - kmer_size = opt_hash["kmer_size"] || 16 - q_space_beg = opt_hash["q_space_beg"] || 0 - q_space_end = opt_hash["q_space_end"] || q_seq.length - 1 - s_space_beg = opt_hash["s_space_beg"] || 0 - s_space_end = opt_hash["s_space_end"] || s_seq.length - 1 + def initialize(q_seq, s_seq, space, kmer_size) + q_space_beg = space.q_min + q_space_end = space.q_max + s_space_beg = space.s_min + s_space_end = space.s_max @mums = []