From: martinahansen Date: Wed, 18 Apr 2012 07:09:49 +0000 (+0000) Subject: added q_id and s_id options to findsim.rb X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=8210c00fce669b066fae61f7286593d294baa37b;p=biopieces.git added q_id and s_id options to findsim.rb git-svn-id: http://biopieces.googlecode.com/svn/trunk@1795 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/code_ruby/lib/maasha/findsim.rb b/code_ruby/lib/maasha/findsim.rb index 5a14065..cd07b8e 100644 --- a/code_ruby/lib/maasha/findsim.rb +++ b/code_ruby/lib/maasha/findsim.rb @@ -56,6 +56,8 @@ class FindSim @q_total_ary = nil @result = nil @result_count = 0 + @q_ids = [] + @s_ids = [] end # Method to load sequences from a query file in FASTA format @@ -69,6 +71,8 @@ class FindSim Fasta.open(file, 'r') do |ios| ios.each do |entry| + @q_ids << entry.seq_name if @opt_hash[:query_ids] + oligos = str_to_oligo_rb_ary_c(entry.seq, @opt_hash[:kmer], 1).uniq.sort q_total << oligos.size @@ -99,6 +103,8 @@ class FindSim Fasta.open(file, 'r') do |ios| ios.each_with_index do |entry, s_index| + @s_ids << entry.seq_name if @opt_hash[:subject_ids] + zero_ary_c(oligo_ary, (4 ** @opt_hash[:kmer]) * BYTES_IN_INT) zero_ary_c(shared_ary, @q_size * BYTES_IN_INT) @@ -119,7 +125,7 @@ class FindSim end # Method that for each query index yields all hits, sorted according to - # decending score, as a list of Score objects. + # decending score, as Hit objects. def each sort_hits_c(@result, @result_count) @@ -128,19 +134,23 @@ class FindSim hit_index = 0 (0 ... @q_size).each do |q_index| - scores = [] zero_ary_c(hit_ary, HIT_ARY_MAX * BYTES_IN_HIT) hit_ary_size = get_hits_c(@result, @result_count, hit_index, hit_ary, q_index) - max = (hit_ary_size > @opt_hash[:report_scores]) ? @opt_hash[:report_scores] : hit_ary_size + if @opt_hash[:report_scores] + max = (hit_ary_size > @opt_hash[:report_scores]) ? @opt_hash[:report_scores] : hit_ary_size + else + max = hit_ary_size + end (0 ... max).each do |i| q_index, s_index, score = hit_ary[BYTES_IN_HIT * i ... BYTES_IN_HIT * i + BYTES_IN_HIT].unpack("IIF") - scores << Score.new(score, s_index) - end + q_id = @opt_hash[:query_ids] ? @q_ids[q_index] : q_index + s_id = @opt_hash[:subject_ids] ? @s_ids[s_index] : s_index - yield scores + yield Hit.new(q_id, s_id, score) + end hit_index += hit_ary_size end @@ -366,6 +376,7 @@ class FindSim # Method that given a string (char array) encodes all kmers overlapping # with a given step size as integers that are pushed onto a Ruby array # which is returned. + # TODO should have an option for skipping oligos with ambiguity codes. builder.c %{ VALUE str_to_oligo_rb_ary_c( VALUE _str, // DNA or RNA string. @@ -554,19 +565,20 @@ class FindSim # >>>>>>>>>>>>>>> Embedded classes <<<<<<<<<<<<<<< # Class for holding score information. - class Score - attr_reader :score, :s_index - - # Method to initialize Score object with - # a subject sequence index and a score. - def initialize(score, s_index) - @s_index = s_index - @score = score + class Hit + attr_reader :q_id, :s_id, :score + + # Method to initialize Hit object with + # query and subject id a score. + def initialize(q_id, s_id, score) + @q_id = q_id + @s_id = s_id + @score = score end # Method for outputting score objects. def to_s - "#{@s_index}:#{@score.round(2)}" + "#{@q_id}:#{@s_id}:#{@score.round(2)}" end end end