]> git.donarmstrong.com Git - biopieces.git/blob - code_ruby/lib/maasha/seq/assemble.rb
fixed bug in pair assembler
[biopieces.git] / code_ruby / lib / maasha / seq / assemble.rb
1 # Copyright (C) 2007-2013 Martin A. Hansen.
2
3 # This program is free software; you can redistribute it and/or
4 # modify it under the terms of the GNU General Public License
5 # as published by the Free Software Foundation; either version 2
6 # of the License, or (at your option) any later version.
7
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 # GNU General Public License for more details.
12
13 # You should have received a copy of the GNU General Public License
14 # along with this program; if not, write to the Free Software
15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
16
17 # http://www.gnu.org/copyleft/gpl.html
18
19 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
20
21 # This software is part of the Biopieces framework (www.biopieces.org).
22
23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
24
25 require 'inline'
26 require 'maasha/seq/ambiguity'
27
28 # Class containing methods to assemble two overlapping sequences into a single.
29 class Assemble
30   extend Ambiguity
31
32   # Class method to assemble two Seq objects.
33   def self.pair(entry1, entry2, options = {})
34     assemble = self.new(entry1, entry2, options)
35     assemble.match
36   end
37
38   # Method to initialize an Assembly object.
39   def initialize(entry1, entry2, options)
40     @entry1  = entry1
41     @entry2  = entry2
42     @options = options
43     @options[:mismatches_max] ||= 0
44     @options[:overlap_min]    ||= 1
45   end
46
47   # Method to locate overlapping matches between two sequences.
48   def match
49     if @options[:overlap_max]
50       overlap = [@options[:overlap_max], @entry1.length, @entry2.length].min
51     else
52       overlap = [@entry1.length, @entry2.length].min
53     end
54
55     while overlap >= @options[:overlap_min]
56       mismatches_max = (overlap * @options[:mismatches_max] * 0.01).round
57       
58       offset1 = @entry2.length - overlap
59       offset2 = 0
60
61       if match_C(@entry1.seq, @entry2.seq, offset1, offset2, overlap, mismatches_max)
62         entry_left  = @entry1[0 ... @entry1.length - overlap]
63         entry_right = @entry2[overlap .. -1]
64
65         if @entry1.qual and @entry2.qual
66           entry_overlap1 = @entry1[-1 * overlap .. -1]
67           entry_overlap2 = @entry2[0 ... overlap]
68
69           entry_overlap = merge_overlap(entry_overlap1, entry_overlap2)
70         else
71           entry_overlap = @entry1[-1 * overlap .. -1]
72         end
73
74         entry_left.seq.downcase!
75         entry_overlap.seq.upcase!
76         entry_right.seq.downcase!
77         entry_merged          = entry_left + entry_overlap + entry_right
78         entry_merged.seq_name = @entry1.seq_name + ":overlap=#{overlap}"
79
80         return entry_merged
81       end
82
83       overlap -= 1
84     end
85   end
86
87   # Method to merge sequence and quality scores in an overlap.
88   # The residue with the highest score at mismatch positions is selected.
89   # The quality scores of the overlap are the mean of the two sequences.
90   def merge_overlap(entry_overlap1, entry_overlap2)
91     na_seq = NArray.byte(entry_overlap1.length, 2)
92     na_seq[true, 0] = NArray.to_na(entry_overlap1.seq.downcase, "byte")
93     na_seq[true, 1] = NArray.to_na(entry_overlap2.seq.downcase, "byte")
94
95     na_qual = NArray.byte(entry_overlap1.length, 2)
96     na_qual[true, 0] = NArray.to_na(entry_overlap1.qual, "byte")
97     na_qual[true, 1] = NArray.to_na(entry_overlap2.qual, "byte")
98
99     mask_xor = na_seq[true, 0] ^ na_seq[true, 1] > 0
100     mask_seq = ((na_qual * mask_xor).eq( (na_qual * mask_xor).max(1)))
101
102     merged      = Seq.new()
103     merged.seq  = (na_seq * mask_seq).max(1).to_s
104     merged.qual = na_qual.mean(1).round.to_type("byte").to_s
105
106     merged
107   end
108
109   inline do |builder|
110     add_ambiguity_macro(builder)
111
112     # C method for determining if two strings of equal length match
113     # given a maximum allowed mismatches and allowing for IUPAC
114     # ambiguity codes. Returns true if match, else false.
115     builder.c %{
116       VALUE match_C(
117         VALUE _string1,       // String 1
118         VALUE _string2,       // String 2
119         VALUE _offset1,       // Offset 1
120         VALUE _offset2,       // Offset 2
121         VALUE _length,        // String length
122         VALUE _max_mismatch   // Maximum mismatches
123       )
124       {
125         char         *string1      = StringValuePtr(_string1);
126         char         *string2      = StringValuePtr(_string2);
127         unsigned int  offset1      = FIX2UINT(_offset1);
128         unsigned int  offset2      = FIX2UINT(_offset2);
129         unsigned int  length       = FIX2UINT(_length);
130         unsigned int  max_mismatch = FIX2UINT(_max_mismatch);
131
132         unsigned int max_match = length - max_mismatch;
133         unsigned int match     = 0;
134         unsigned int mismatch  = 0;
135         unsigned int i         = 0;
136
137         for (i = 0; i < length; i++)
138         {
139           if (MATCH(string1[i + offset1], string2[i + offset2]))
140           {
141             match++;
142
143             if (match >= max_match) {
144               return Qtrue;
145             }
146           }
147           else
148           {
149             mismatch++;
150
151             if (mismatch > max_mismatch) {
152               return Qfalse;
153             }
154           }
155         }
156
157         return Qfalse;
158       }
159     }
160   end
161 end
162
163
164 __END__
165