1 # Copyright (C) 2007-2013 Martin A. Hansen.
3 # This program is free software; you can redistribute it and/or
4 # modify it under the terms of the GNU General Public License
5 # as published by the Free Software Foundation; either version 2
6 # of the License, or (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
13 # You should have received a copy of the GNU General Public License
14 # along with this program; if not, write to the Free Software
15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 # http://www.gnu.org/copyleft/gpl.html
19 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
21 # This software is part of the Biopieces framework (www.biopieces.org).
23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
26 require 'maasha/seq/ambiguity'
28 # Class containing methods to assemble two overlapping sequences into a single.
32 # Class method to assemble two Seq objects.
33 def self.pair(entry1, entry2, options = {})
34 assemble = self.new(entry1, entry2, options)
38 # Method to initialize an Assembly object.
39 def initialize(entry1, entry2, options)
46 @options[:mismatches_max] ||= 0
47 @options[:overlap_min] ||= 1
50 # Method to locate overlapping matches between two sequences.
52 if @options[:overlap_max]
53 @overlap = [@options[:overlap_max], @entry1.length, @entry2.length].min
55 @overlap = [@entry1.length, @entry2.length].min
58 diff = @entry1.length - @entry2.length
61 @offset1 = @entry1.length - @overlap - diff
63 while @overlap >= @options[:overlap_min]
64 mismatches_max = (@overlap * @options[:mismatches_max] * 0.01).round
66 # puts "diff: #{diff} offset1: #{@offset1} offset2: #{@offset2} overlap: #{@overlap}"
68 if mismatches = match_C(@entry1.seq, @entry2.seq, @offset1, @offset2, @overlap, mismatches_max) and mismatches >= 0
69 entry_merged = entry_left + entry_overlap + entry_right
70 entry_merged.seq_name = @entry1.seq_name + ":overlap=#{@overlap}:hamming=#{mismatches}"
85 # Method to extract and downcase the left part of an assembled pair.
87 entry = @entry1[0 ... @offset1]
92 # Method to extract and downcase the right part of an assembled pair.
94 if @entry1.length > @offset1 + @overlap
95 entry = @entry1[@offset1 + @overlap .. -1]
97 entry = @entry2[@offset2 + @overlap .. -1]
104 # Method to extract and upcase the overlapping part of an assembled pair.
106 if @entry1.qual and @entry2.qual
107 entry_overlap1 = @entry1[@offset1 ... @offset1 + @overlap]
108 entry_overlap2 = @entry2[@offset2 ... @offset2 + @overlap]
110 entry = merge_overlap(entry_overlap1, entry_overlap2)
112 entry = @entry1[@offset1 ... @offset1 + @overlap]
119 # Method to merge sequence and quality scores in an overlap.
120 # The residue with the highest score at mismatch positions is selected.
121 # The quality scores of the overlap are the mean of the two sequences.
122 def merge_overlap(entry_overlap1, entry_overlap2)
123 na_seq = NArray.byte(entry_overlap1.length, 2)
124 na_seq[true, 0] = NArray.to_na(entry_overlap1.seq.downcase, "byte")
125 na_seq[true, 1] = NArray.to_na(entry_overlap2.seq.downcase, "byte")
127 na_qual = NArray.byte(entry_overlap1.length, 2)
128 na_qual[true, 0] = NArray.to_na(entry_overlap1.qual, "byte")
129 na_qual[true, 1] = NArray.to_na(entry_overlap2.qual, "byte")
131 mask_xor = na_seq[true, 0] ^ na_seq[true, 1] > 0
132 mask_seq = ((na_qual * mask_xor).eq( (na_qual * mask_xor).max(1)))
135 merged.seq = (na_seq * mask_seq).max(1).to_s
136 merged.qual = na_qual.mean(1).round.to_type("byte").to_s
142 add_ambiguity_macro(builder)
144 # C method for determining if two strings of equal length match
145 # given a maximum allowed mismatches and allowing for IUPAC
146 # ambiguity codes. Returns number of mismatches is true if match, else false.
149 VALUE _string1, // String 1
150 VALUE _string2, // String 2
151 VALUE _offset1, // Offset 1
152 VALUE _offset2, // Offset 2
153 VALUE _length, // String length
154 VALUE _max_mismatch // Maximum mismatches
157 char *string1 = StringValuePtr(_string1);
158 char *string2 = StringValuePtr(_string2);
159 unsigned int offset1 = FIX2UINT(_offset1);
160 unsigned int offset2 = FIX2UINT(_offset2);
161 unsigned int length = FIX2UINT(_length);
162 unsigned int max_mismatch = FIX2UINT(_max_mismatch);
164 unsigned int max_match = length - max_mismatch;
165 unsigned int match = 0;
166 unsigned int mismatch = 0;
169 for (i = 0; i < length; i++)
171 if (MATCH(string1[i + offset1], string2[i + offset2]))
175 if (match >= max_match) {
176 return UINT2NUM(mismatch);
183 if (mismatch > max_mismatch) {