if merged
new_record = merged.to_bp
- if merged.seq_name =~ /overlap=(\d+):hamming=(\d+)$/
- new_record[:OVERLAP_LEN] = $1
- new_record[:HAMMING_DIST] = $2
+ if merged.seq_name =~ /overlap=(\d+)$/
+ new_record[:OVERLAP_LEN] = $1
end
output.puts new_record
-SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:14862:1868 1:N:0:14:overlap=49:hamming=1
+SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:14862:1868 1:N:0:14:overlap=49
SEQ: tggggaatattggacaatgggggcaaccctgatccagcaataccgcgtgtgtgaagaaggcctgagggttgtaaagcactttcaattgtgaagaaaagttaacggttaataaccgttagccttgacgttaactttagaagaagcaccggctaactccgtgccagcagccgcggtaatacggagggtgcaagcgttaatcgGAATTACTGGGCGTAAAGCGTGCGTAGGCGGTTTATTAAGTCAGATGTGaaagccccgggcttaacctgggaactgcatttgaaactggtcaactagagtatggtagaggaaagtggaatttctggtgtagcggtgaaatgcgtagatatcagaaggaacatcaatggcgaaggcagctttctggaccaatactgacgctgaggtacgaaagcgtgggtagcaaacagg
SEQ_LEN: 429
SCORES: !??????BDDDDDDDDGGGGGGGHHIIIEHIHHFGGHFHHGHFHHDHEHEHHFAFFGFFHFHHFFHHHEFEEHHHHHHHHHHHHFFFHFHHHHHHHHHHHHGBDEGGGGGGGGGGGGGGGGEGEGGGGGCEGGGGECCECEEECGGG!ADGCGGGEGGEGGGGGEGCE8!2!DC!EEEGGC?!DGCCCEC:C?CCEGGGG??288<8B47>43,(195??=36)745<6.;:=?D?@6AB?@D8?@C=?AA;4'8D8?:::A?1*)=,,==EC==,,ACFCAAEBC=AEFCEBEDBDEEDED=EDD=?BFBFDFB!DFF@FFFHHHHHHHHHGGGDDHHHHFHDHIHHHHIIIIIHHFGCFFHHE!EEDFG?HIHGFGGFFFFF?CDCE?9FEHDHHGE;F;IHFFHFFFEEFDDDDDB!-!BB?????
OVERLAP_LEN: 49
-HAMMING_DIST: 1
---
-SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:13906:2139 1:N:0:14:overlap=96:hamming=2
+SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:13906:2139 1:N:0:14:overlap=96
SEQ: tagggaatcttgcacaatggaggaaactctgatgcagcgatgccgcgtgagtgaagaaggcctttgggttgtaaagctctttcgtcggggaagaaaatgactgtacccgaataagaaggtccggctaacttcgtgccagcagccgcggtaatacgAAGGGACCTAGCGTAGTTCGGAATTACTGGGCTTAAAGAGTTCGTAGGTGGTTAAAAAAGTTGGTGGTGAAATCCCAGAGCTTAACTCTGGAACTGccatcaaaactttttagctagagtatgatagaggaaagcagaatttctagtgtagaggtgaaattcgtagatattagaaagaataccgattgcgaaggcagctttctggatcattactgacgctgaggaacgaaagcatgggtagcgaaga
SEQ_LEN: 402
SCORES: !???9?BBBDBDDBDDFFFFFFHHHIFHFHHIHHFHHHHEDDEGHHHCEHE?EFHFGHFHFHIHIIIHCECEHHHIFHIIIHGHFHHHHHHEEFFFFFEEFFFEFFFEEEEEFFFFFFFCEFBEEDEFFFFFFEFEFFEEFFFFDDDDA?EEEFFB@C?8B=:7785;660@?@FEB?7B;?2BBA?CED@@@B?=5@DEFD??;8@E0@BEC788>@?95*4-:=7BEB8B7BB2@B?8&+98>2CDBB>A=AEECCEEEEDB=FFEFEEEDEFFFBFFFFF?.HFDBDD?FFHFHFFFHFFFHHGGGGCBFAHHIHFHHHGCGGIIIHHHGE5!HFBDE@E@DGHHHFFFHIHHFFGFDC0?E?FHEHEHIIHHFHIHFHFFFFFFDDBDBADD?BB??!
OVERLAP_LEN: 96
-HAMMING_DIST: 2
---
-SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:14865:2158 1:N:0:14:overlap=96:hamming=0
+SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:14865:2158 1:N:0:14:overlap=96
SEQ: tagggaatcttgcacaatggaggaaactctgatgcagcgatgccgcgtgagtgaagaaggcctttgggttgtaaagctctttcgtcggggaagaaaatgactgtacccgaataagaaggtccggctaacttcgtgccagcagccgcggtaatacGAAGGGACCTAGCGTAGTTCGGAATTACTGGGCTTAAAGAGTTCGTAGGTGGTTAAAAAAGTTGGTGGTGAAATCCCAGAGCTTAACTCTGGAACTgccatcaaaactttttagctagagtatgatagaggaaagcagaatttctagtgtagaggtgaaattcgtagatattagaaagaataccgattgcgaaggcagctttctggatcattactgacactgaggaacgaaagcatgggtagcgaag
SEQ_LEN: 401
SCORES: ?????BBBBBDDBDDBFFFFFFECHFHFHFHGHHFGD?!CFHHHHHH!DEEFFFFDFFHF@FHHHIFHEEHHHIIHHHIHIHHHDEHHHHFEEFFF?FEEEEFEEFFEE:!CEEFEFFFBEEEEEDEFE::AE:?AECEFEF?A!;D88;CEEEC@@668CCBC??C;8?+02>CEA>CB@;;?8@B@CCDE@D@?5@B>ABB:2::1>CDB=/@>BB@<19=4?6:7@A6@=?36875:8?A=DBDC@ACC;EEEEEEDEEDD;=DFB?FFDFFFB?C=FFFF?HHFDD@.DCFHHFHHHGFHHFHHGECBFFFEF?CHFGEA??HHHIHHHFC!E@DDEDDFHHHDHGGFBE?C=FF?CC?=F?CHFA9C0GBHFFHF;HFFF?FFBBBBB?BB?B??!
OVERLAP_LEN: 96
-HAMMING_DIST: 0
---
-SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:17246:2253 1:N:0:14:overlap=96:hamming=1
+SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:17246:2253 1:N:0:14:overlap=96
SEQ: agggaatcttgcacaatgggggaaaccctgatgcagcgatgccgcgtgagtgaagaaggcccttgggttgtaaagctctttcgtcggggaagaaaatgactgtacccgaataagaaggtccggctaacttcgtgccagcagccgcggtaatacGAAGGGACCTAGCGTAGTTCGGAATTACTGGGCTTAAAGAGTTCGTAGGTGGTTAAAAAAGTTGATGGTGAAATCCCAAGGCTCAACCTTGGAACTgccatcaaaactttttagctagagtatgatagaggaaagtggaatttctagtgtagaggtgaaattcgtagatattagaaagaacatcaaaagcgaaggcaactttctggatcattactgacactgaggaacgaaagcatgggtagcgaagagg
SEQ_LEN: 403
SCORES: !???B-!-?!BBBBBFFF;!CEEECC;EFFH=EGBE=CEFHD@CDCE!+!5C=FBAEADHFEHHHHHHECCG=CD=CCF=DC=DDACEE5:DEEEECBECEC3?3;?B:@CEEECEEC?CC????E?CAAAE?ACEEEEEEEE????A;)::ABCEB8CA669967)2955CA@=FEB=??<2:>?@6@7B7>449*<<9<957:9D;AD88;60?CEB2@8:;:B?>@749-663=<2-6<>46>A=64.EFEEEEDDEFBDDFFFFFFHHHHFFCC.?HFFCFHFDFDDFFGBBHHHHFGDHHGGBHIHHIIHHHDGHHHFECHHIIHFHGEA+HHFC@EHIIHHGFEGHHFGDHGDDC0?GGFGC90A0DBFFIIHFHFFFFFBBBBBDBB??BB?????
OVERLAP_LEN: 96
-HAMMING_DIST: 1
---
-SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:13072:2276 1:N:0:14:overlap=9:hamming=0
+SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:13072:2276 1:N:0:14:overlap=9
SEQ: gggaattttgcgcaatgggcgaaagcctgacgcagcaacgccgcgggatcgaagaagctctgcggagtgtaaagatctgtcataagggaagaataacgagtattctaacaaaatattcgtctgacggtaccttataagaaagccacggctaacttcgtgccagcagccgcggtaatacgaggggggcaagcgttatccggaatcattgggcgtaaaggggtcgtatgaggACTGATCAGttggaattaaaagaccgcggcttaaccgggggaacggtttcaatactgtcagtcttgagtgaggtagaggtaagcggaattcctggtgtagcggtgaaatgcgtagatatcaggaggaacatcaaaggcgaaggcagcttactgggcctatactgacgctgaggaacgaaagcgtggggagcaaac
SEQ_LEN: 425
SCORES: ?????B?BB!-5@BC9CFFB!+C!C!E?@B=!,!5!FE,!77!CE)!!:CD=@DEFHDDBCFFEE8:=@,BD=,3;BEEEE?;BCCECCCE?B;?B3:AA88?:**:CAAA88:::::??8*)AC).8?)*:???:0?CEAAE8**4;8;AA:??:8:??8?EE:AC?;42.)4?*::8).'''..'.8C8A)8AAE:A;?'0:?CAEE?8''4A8?*8'0)''8*0*/:2?.?32;816666..//((6/'(.'''6(6;64-2'8(6///-'96;/(;8@;*78*;@@@@9**0*88;@0*2*9@3*D199;99DDD99D@EDECCDDDDDEEDD9EDDDDEEEDDEC!DE@AC5A=CCAE!5C9+E=@DB@DFFC=C+EEEA9.A999EC7@7-C!A/ECAAEEEEEEC@@@@@-5-!!!!!
OVERLAP_LEN: 9
-HAMMING_DIST: 0
---
-SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:13906:2139 1:N:0:14:overlap=96:hamming=2
+SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:13906:2139 1:N:0:14:overlap=96
SEQ: tagggaatcttgcacaatggaggaaactctgatgcagcgatgccgcgtgagtgaagaaggcctttgggttgtaaagctctttcgtcggggaagaaaatgactgtacccgaataagaaggtccggctaacttcgtgccagcagccgcggtaatacgAAGGGACCTAGCGTAGTTCGGAATTACTGGGCTTAAAGAGTTCGTAGGTGGTTAAAAAAGTTGGTGGTGAAATCCCAGAGCTTAACTCTGGAACTGccatcaaaactttttagctagagtatgatagaggaaagcagaatttctagtgtagaggtgaaattcgtagatattagaaagaataccgattgcgaaggcagctttctggatcattactgacgctgaggaacgaaagcatgggtagcgaaga
SEQ_LEN: 402
SCORES: !???9?BBBDBDDBDDFFFFFFHHHIFHFHHIHHFHHHHEDDEGHHHCEHE?EFHFGHFHFHIHIIIHCECEHHHIFHIIIHGHFHHHHHHEEFFFFFEEFFFEFFFEEEEEFFFFFFFCEFBEEDEFFFFFFEFEFFEEFFFFDDDDA?EEEFFB@C?8B=:7785;660@?@FEB?7B;?2BBA?CED@@@B?=5@DEFD??;8@E0@BEC788>@?95*4-:=7BEB8B7BB2@B?8&+98>2CDBB>A=AEECCEEEEDB=FFEFEEEDEFFFBFFFFF?.HFDBDD?FFHFHFFFHFFFHHGGGGCBFAHHIHFHHHGCGGIIIHHHGE5!HFBDE@E@DGHHHFFFHIHHFFGFDC0?E?FHEHEHIIHHFHIHFHFFFFFFDDBDBADD?BB??!
OVERLAP_LEN: 96
-HAMMING_DIST: 2
---
-SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:14865:2158 1:N:0:14:overlap=96:hamming=0
+SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:14865:2158 1:N:0:14:overlap=96
SEQ: tagggaatcttgcacaatggaggaaactctgatgcagcgatgccgcgtgagtgaagaaggcctttgggttgtaaagctctttcgtcggggaagaaaatgactgtacccgaataagaaggtccggctaacttcgtgccagcagccgcggtaatacGAAGGGACCTAGCGTAGTTCGGAATTACTGGGCTTAAAGAGTTCGTAGGTGGTTAAAAAAGTTGGTGGTGAAATCCCAGAGCTTAACTCTGGAACTgccatcaaaactttttagctagagtatgatagaggaaagcagaatttctagtgtagaggtgaaattcgtagatattagaaagaataccgattgcgaaggcagctttctggatcattactgacactgaggaacgaaagcatgggtagcgaag
SEQ_LEN: 401
SCORES: ?????BBBBBDDBDDBFFFFFFECHFHFHFHGHHFGD?!CFHHHHHH!DEEFFFFDFFHF@FHHHIFHEEHHHIIHHHIHIHHHDEHHHHFEEFFF?FEEEEFEEFFEE:!CEEFEFFFBEEEEEDEFE::AE:?AECEFEF?A!;D88;CEEEC@@668CCBC??C;8?+02>CEA>CB@;;?8@B@CCDE@D@?5@B>ABB:2::1>CDB=/@>BB@<19=4?6:7@A6@=?36875:8?A=DBDC@ACC;EEEEEEDEEDD;=DFB?FFDFFFB?C=FFFF?HHFDD@.DCFHHFHHHGFHHFHHGECBFFFEF?CHFGEA??HHHIHHHFC!E@DDEDDFHHHDHGGFBE?C=FF?CC?=F?CHFA9C0GBHFFHF;HFFF?FFBBBBB?BB?B??!
OVERLAP_LEN: 96
-HAMMING_DIST: 0
---
-SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:17246:2253 1:N:0:14:overlap=96:hamming=1
+SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:17246:2253 1:N:0:14:overlap=96
SEQ: agggaatcttgcacaatgggggaaaccctgatgcagcgatgccgcgtgagtgaagaaggcccttgggttgtaaagctctttcgtcggggaagaaaatgactgtacccgaataagaaggtccggctaacttcgtgccagcagccgcggtaatacGAAGGGACCTAGCGTAGTTCGGAATTACTGGGCTTAAAGAGTTCGTAGGTGGTTAAAAAAGTTGATGGTGAAATCCCAAGGCTCAACCTTGGAACTgccatcaaaactttttagctagagtatgatagaggaaagtggaatttctagtgtagaggtgaaattcgtagatattagaaagaacatcaaaagcgaaggcaactttctggatcattactgacactgaggaacgaaagcatgggtagcgaagagg
SEQ_LEN: 403
SCORES: !???B-!-?!BBBBBFFF;!CEEECC;EFFH=EGBE=CEFHD@CDCE!+!5C=FBAEADHFEHHHHHHECCG=CD=CCF=DC=DDACEE5:DEEEECBECEC3?3;?B:@CEEECEEC?CC????E?CAAAE?ACEEEEEEEE????A;)::ABCEB8CA669967)2955CA@=FEB=??<2:>?@6@7B7>449*<<9<957:9D;AD88;60?CEB2@8:;:B?>@749-663=<2-6<>46>A=64.EFEEEEDDEFBDDFFFFFFHHHHFFCC.?HFFCFHFDFDDFFGBBHHHHFGDHHGGBHIHHIIHHHDGHHHFECHHIIHFHGEA+HHFC@EHIIHHGFEGHHFGDHGDDC0?GGFGC90A0DBFFIIHFHFFFFFBBBBBDBB??BB?????
OVERLAP_LEN: 96
-HAMMING_DIST: 1
---
test_ruby_gem "gnuplot"
test_ruby_gem "narray"
test_ruby_gem "RubyInline"
-test_ruby_gem "bzip2-ruby"
test_aux_program "blastall"
test_aux_program "blat"
test_aux_program "bwa"
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+require 'inline'
+require 'maasha/seq/ambiguity'
+
# Class containing methods to assemble two overlapping sequences into a single.
class Assemble
+ extend Ambiguity
+
# Class method to assemble two Seq objects.
def self.pair(entry1, entry2, options = {})
assemble = self.new(entry1, entry2, options)
@options[:overlap_max] = [@options[:overlap_max], entry1.length, entry2.length].min
end
- # Method to locate overlapping matche between two sequences.
+ # Method to locate overlapping matches between two sequences.
def match
overlap = @options[:overlap_max]
- na_seq1 = NArray.to_na(@entry1.seq.downcase, "byte")
- na_seq2 = NArray.to_na(@entry2.seq.downcase, "byte")
-
while overlap >= @options[:overlap_min]
- hamming_dist = (na_seq1[-1 * overlap .. -1] ^ na_seq2[0 ... overlap]).count_true
+ mismatches_max = (overlap * @options[:mismatches_max] * 0.01).round
- if hamming_dist <= (overlap * @options[:mismatches_max] * 0.01).round
+ if match_C(@entry1.seq, @entry2.seq, @entry1.length - overlap, 0, overlap, mismatches_max)
entry_left = @entry1[0 ... @entry1.length - overlap]
entry_right = @entry2[overlap .. -1]
entry_overlap.seq.upcase!
entry_right.seq.downcase!
entry_merged = entry_left + entry_overlap + entry_right
- entry_merged.seq_name = @entry1.seq_name + ":overlap=#{overlap}:hamming=#{hamming_dist}"
+ entry_merged.seq_name = @entry1.seq_name + ":overlap=#{overlap}"
return entry_merged
end
merged
end
+
+ inline do |builder|
+ add_ambiguity_macro(builder)
+
+ # C method for determining if two strings of equal length match
+ # given a maximum allowed mismatches and allowing for IUPAC
+ # ambiguity codes. Returns true if match, else false.
+ builder.c %{
+ VALUE match_C(
+ VALUE _string1, // String 1
+ VALUE _string2, // String 2
+ VALUE _offset1, // Offset 1
+ VALUE _offset2, // Offset 2
+ VALUE _length, // String length
+ VALUE _max_mismatch // Maximum mismatches
+ )
+ {
+ char *string1 = StringValuePtr(_string1);
+ char *string2 = StringValuePtr(_string2);
+ unsigned int offset1 = FIX2UINT(_offset1);
+ unsigned int offset2 = FIX2UINT(_offset2);
+ unsigned int length = FIX2UINT(_length);
+ unsigned int max_mismatch = FIX2UINT(_max_mismatch);
+
+ unsigned int max_match = length - max_mismatch;
+ unsigned int match = 0;
+ unsigned int mismatch = 0;
+ unsigned int i = 0;
+
+ for (i = 0; i < length; i++)
+ {
+ if (MATCH(string1[i + offset1], string2[i + offset2]))
+ {
+ match++;
+
+ if (match >= max_match) {
+ return Qtrue;
+ }
+ }
+ else
+ {
+ mismatch++;
+
+ if (mismatch > max_mismatch) {
+ return Qfalse;
+ }
+ }
+ }
+
+ return Qfalse;
+ }
+ }
+ end
end