]> git.donarmstrong.com Git - biopieces.git/blob - code_ruby/lib/maasha/seq/assemble.rb
016322727cf15eb824f9e3f0518cca9b2ecb0241
[biopieces.git] / code_ruby / lib / maasha / seq / assemble.rb
1 # Copyright (C) 2007-2013 Martin A. Hansen.
2
3 # This program is free software; you can redistribute it and/or
4 # modify it under the terms of the GNU General Public License
5 # as published by the Free Software Foundation; either version 2
6 # of the License, or (at your option) any later version.
7
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 # GNU General Public License for more details.
12
13 # You should have received a copy of the GNU General Public License
14 # along with this program; if not, write to the Free Software
15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
16
17 # http://www.gnu.org/copyleft/gpl.html
18
19 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
20
21 # This software is part of the Biopieces framework (www.biopieces.org).
22
23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
24
25 # Class containing methods to assemble two overlapping sequences into a single.
26 class Assemble
27   # Class method to assemble two Seq objects.
28   def self.pair(entry1, entry2, options = {})
29     assemble = self.new(entry1, entry2, options)
30     assemble.match
31   end
32
33   # Method to initialize an Assembly object.
34   def initialize(entry1, entry2, options)
35     @entry1  = entry1
36     @entry2  = entry2
37     @options = options
38     @options[:mismatches_max] ||= 0
39     @options[:overlap_min]    ||= 1
40     @options[:overlap_max]    ||= entry1.length
41     @options[:overlap_max]      = [@options[:overlap_max], entry1.length, entry2.length].min
42   end
43
44   # Method to locate overlapping matche between two sequences.
45   def match
46     overlap = @options[:overlap_max]
47
48     na_seq1 = NArray.to_na(@entry1.seq.downcase, "byte")
49     na_seq2 = NArray.to_na(@entry2.seq.downcase, "byte")
50
51     while overlap >= @options[:overlap_min]
52       hamming_dist = (na_seq1[-1 * overlap .. -1] ^ na_seq2[0 ... overlap]).count_true
53
54       if hamming_dist <= (overlap * @options[:mismatches_max] * 0.01).round
55         entry_left  = @entry1[0 ... @entry1.length - overlap]
56         entry_right = @entry2[overlap .. -1]
57
58         if @entry1.qual and @entry2.qual
59           entry_overlap1 = @entry1[-1 * overlap .. -1]
60           entry_overlap2 = @entry2[0 ... overlap]
61
62           entry_overlap = merge_overlap(entry_overlap1, entry_overlap2)
63         else
64           entry_overlap = @entry1[-1 * overlap .. -1]
65         end
66
67         entry_left.seq.downcase!
68         entry_overlap.seq.upcase!
69         entry_right.seq.downcase!
70         entry_merged          = entry_left + entry_overlap + entry_right
71         entry_merged.seq_name = @entry1.seq_name + ":overlap=#{overlap}:hamming=#{hamming_dist}"
72
73         return entry_merged
74       end
75
76       overlap -= 1
77     end
78   end
79
80   # Method to merge sequence and quality scores in an overlap.
81   # The residue with the highest score at mismatch positions is selected.
82   # The quality scores of the overlap are the mean of the two sequences.
83   def merge_overlap(entry_overlap1, entry_overlap2)
84     na_seq = NArray.byte(entry_overlap1.length, 2)
85     na_seq[true, 0] = NArray.to_na(entry_overlap1.seq.downcase, "byte")
86     na_seq[true, 1] = NArray.to_na(entry_overlap2.seq.downcase, "byte")
87
88     na_qual = NArray.byte(entry_overlap1.length, 2)
89     na_qual[true, 0] = NArray.to_na(entry_overlap1.qual, "byte")
90     na_qual[true, 1] = NArray.to_na(entry_overlap2.qual, "byte")
91
92     mask_xor = na_seq[true, 0] ^ na_seq[true, 1] > 0
93     mask_seq = ((na_qual * mask_xor).eq( (na_qual * mask_xor).max(1)))
94
95     merged      = Seq.new()
96     merged.seq  = (na_seq * mask_seq).max(1).to_s
97     merged.qual = na_qual.mean(1).round.to_type("byte").to_s
98
99     merged
100   end
101 end
102
103
104 __END__
105