]> git.donarmstrong.com Git - biopieces.git/commitdiff
refactoring of assemble_pairs
authormartinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Tue, 8 Oct 2013 09:39:58 +0000 (09:39 +0000)
committermartinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Tue, 8 Oct 2013 09:39:58 +0000 (09:39 +0000)
git-svn-id: http://biopieces.googlecode.com/svn/trunk@2229 74ccb610-7750-0410-82ae-013aeee3265d

bp_bin/assemble_pairs
bp_test/out/assemble_pairs.out.1
bp_test/out/assemble_pairs.out.2
bp_test/test_all
code_ruby/lib/maasha/seq/assemble.rb

index 4bb1152d5ebea74aec4786eab5db1c2d5ef11109..a101f3e16d0b0f09aa2f9dd9380c44bb1ef46231 100755 (executable)
@@ -92,9 +92,8 @@ Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output|
           if merged
             new_record = merged.to_bp
 
-            if merged.seq_name =~ /overlap=(\d+):hamming=(\d+)$/
-              new_record[:OVERLAP_LEN]  = $1
-              new_record[:HAMMING_DIST] = $2
+            if merged.seq_name =~ /overlap=(\d+)$/
+              new_record[:OVERLAP_LEN] = $1
             end
 
             output.puts new_record
index f804cb79ea1d7699d5f1c6ac8f2296a8621ec445..f9cf1750ad17d75a7f7b3441ab59a77bdadd1f13 100644 (file)
@@ -1,35 +1,30 @@
-SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:14862:1868 1:N:0:14:overlap=49:hamming=1
+SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:14862:1868 1:N:0:14:overlap=49
 SEQ: tggggaatattggacaatgggggcaaccctgatccagcaataccgcgtgtgtgaagaaggcctgagggttgtaaagcactttcaattgtgaagaaaagttaacggttaataaccgttagccttgacgttaactttagaagaagcaccggctaactccgtgccagcagccgcggtaatacggagggtgcaagcgttaatcgGAATTACTGGGCGTAAAGCGTGCGTAGGCGGTTTATTAAGTCAGATGTGaaagccccgggcttaacctgggaactgcatttgaaactggtcaactagagtatggtagaggaaagtggaatttctggtgtagcggtgaaatgcgtagatatcagaaggaacatcaatggcgaaggcagctttctggaccaatactgacgctgaggtacgaaagcgtgggtagcaaacagg
 SEQ_LEN: 429
 SCORES: !??????BDDDDDDDDGGGGGGGHHIIIEHIHHFGGHFHHGHFHHDHEHEHHFAFFGFFHFHHFFHHHEFEEHHHHHHHHHHHHFFFHFHHHHHHHHHHHHGBDEGGGGGGGGGGGGGGGGEGEGGGGGCEGGGGECCECEEECGGG!ADGCGGGEGGEGGGGGEGCE8!2!DC!EEEGGC?!DGCCCEC:C?CCEGGGG??288<8B47>43,(195??=36)745<6.;:=?D?@6AB?@D8?@C=?AA;4'8D8?:::A?1*)=,,==EC==,,ACFCAAEBC=AEFCEBEDBDEEDED=EDD=?BFBFDFB!DFF@FFFHHHHHHHHHGGGDDHHHHFHDHIHHHHIIIIIHHFGCFFHHE!EEDFG?HIHGFGGFFFFF?CDCE?9FEHDHHGE;F;IHFFHFFFEEFDDDDDB!-!BB?????
 OVERLAP_LEN: 49
-HAMMING_DIST: 1
 ---
-SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:13906:2139 1:N:0:14:overlap=96:hamming=2
+SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:13906:2139 1:N:0:14:overlap=96
 SEQ: tagggaatcttgcacaatggaggaaactctgatgcagcgatgccgcgtgagtgaagaaggcctttgggttgtaaagctctttcgtcggggaagaaaatgactgtacccgaataagaaggtccggctaacttcgtgccagcagccgcggtaatacgAAGGGACCTAGCGTAGTTCGGAATTACTGGGCTTAAAGAGTTCGTAGGTGGTTAAAAAAGTTGGTGGTGAAATCCCAGAGCTTAACTCTGGAACTGccatcaaaactttttagctagagtatgatagaggaaagcagaatttctagtgtagaggtgaaattcgtagatattagaaagaataccgattgcgaaggcagctttctggatcattactgacgctgaggaacgaaagcatgggtagcgaaga
 SEQ_LEN: 402
 SCORES: !???9?BBBDBDDBDDFFFFFFHHHIFHFHHIHHFHHHHEDDEGHHHCEHE?EFHFGHFHFHIHIIIHCECEHHHIFHIIIHGHFHHHHHHEEFFFFFEEFFFEFFFEEEEEFFFFFFFCEFBEEDEFFFFFFEFEFFEEFFFFDDDDA?EEEFFB@C?8B=:7785;660@?@FEB?7B;?2BBA?CED@@@B?=5@DEFD??;8@E0@BEC788>@?95*4-:=7BEB8B7BB2@B?8&+98>2CDBB>A=AEECCEEEEDB=FFEFEEEDEFFFBFFFFF?.HFDBDD?FFHFHFFFHFFFHHGGGGCBFAHHIHFHHHGCGGIIIHHHGE5!HFBDE@E@DGHHHFFFHIHHFFGFDC0?E?FHEHEHIIHHFHIHFHFFFFFFDDBDBADD?BB??!
 OVERLAP_LEN: 96
-HAMMING_DIST: 2
 ---
-SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:14865:2158 1:N:0:14:overlap=96:hamming=0
+SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:14865:2158 1:N:0:14:overlap=96
 SEQ: tagggaatcttgcacaatggaggaaactctgatgcagcgatgccgcgtgagtgaagaaggcctttgggttgtaaagctctttcgtcggggaagaaaatgactgtacccgaataagaaggtccggctaacttcgtgccagcagccgcggtaatacGAAGGGACCTAGCGTAGTTCGGAATTACTGGGCTTAAAGAGTTCGTAGGTGGTTAAAAAAGTTGGTGGTGAAATCCCAGAGCTTAACTCTGGAACTgccatcaaaactttttagctagagtatgatagaggaaagcagaatttctagtgtagaggtgaaattcgtagatattagaaagaataccgattgcgaaggcagctttctggatcattactgacactgaggaacgaaagcatgggtagcgaag
 SEQ_LEN: 401
 SCORES: ?????BBBBBDDBDDBFFFFFFECHFHFHFHGHHFGD?!CFHHHHHH!DEEFFFFDFFHF@FHHHIFHEEHHHIIHHHIHIHHHDEHHHHFEEFFF?FEEEEFEEFFEE:!CEEFEFFFBEEEEEDEFE::AE:?AECEFEF?A!;D88;CEEEC@@668CCBC??C;8?+02>CEA>CB@;;?8@B@CCDE@D@?5@B>ABB:2::1>CDB=/@>BB@<19=4?6:7@A6@=?36875:8?A=DBDC@ACC;EEEEEEDEEDD;=DFB?FFDFFFB?C=FFFF?HHFDD@.DCFHHFHHHGFHHFHHGECBFFFEF?CHFGEA??HHHIHHHFC!E@DDEDDFHHHDHGGFBE?C=FF?CC?=F?CHFA9C0GBHFFHF;HFFF?FFBBBBB?BB?B??!
 OVERLAP_LEN: 96
-HAMMING_DIST: 0
 ---
-SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:17246:2253 1:N:0:14:overlap=96:hamming=1
+SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:17246:2253 1:N:0:14:overlap=96
 SEQ: agggaatcttgcacaatgggggaaaccctgatgcagcgatgccgcgtgagtgaagaaggcccttgggttgtaaagctctttcgtcggggaagaaaatgactgtacccgaataagaaggtccggctaacttcgtgccagcagccgcggtaatacGAAGGGACCTAGCGTAGTTCGGAATTACTGGGCTTAAAGAGTTCGTAGGTGGTTAAAAAAGTTGATGGTGAAATCCCAAGGCTCAACCTTGGAACTgccatcaaaactttttagctagagtatgatagaggaaagtggaatttctagtgtagaggtgaaattcgtagatattagaaagaacatcaaaagcgaaggcaactttctggatcattactgacactgaggaacgaaagcatgggtagcgaagagg
 SEQ_LEN: 403
 SCORES: !???B-!-?!BBBBBFFF;!CEEECC;EFFH=EGBE=CEFHD@CDCE!+!5C=FBAEADHFEHHHHHHECCG=CD=CCF=DC=DDACEE5:DEEEECBECEC3?3;?B:@CEEECEEC?CC????E?CAAAE?ACEEEEEEEE????A;)::ABCEB8CA669967)2955CA@=FEB=??<2:>?@6@7B7>449*<<9<957:9D;AD88;60?CEB2@8:;:B?>@749-663=<2-6<>46>A=64.EFEEEEDDEFBDDFFFFFFHHHHFFCC.?HFFCFHFDFDDFFGBBHHHHFGDHHGGBHIHHIIHHHDGHHHFECHHIIHFHGEA+HHFC@EHIIHHGFEGHHFGDHGDDC0?GGFGC90A0DBFFIIHFHFFFFFBBBBBDBB??BB?????
 OVERLAP_LEN: 96
-HAMMING_DIST: 1
 ---
-SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:13072:2276 1:N:0:14:overlap=9:hamming=0
+SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:13072:2276 1:N:0:14:overlap=9
 SEQ: gggaattttgcgcaatgggcgaaagcctgacgcagcaacgccgcgggatcgaagaagctctgcggagtgtaaagatctgtcataagggaagaataacgagtattctaacaaaatattcgtctgacggtaccttataagaaagccacggctaacttcgtgccagcagccgcggtaatacgaggggggcaagcgttatccggaatcattgggcgtaaaggggtcgtatgaggACTGATCAGttggaattaaaagaccgcggcttaaccgggggaacggtttcaatactgtcagtcttgagtgaggtagaggtaagcggaattcctggtgtagcggtgaaatgcgtagatatcaggaggaacatcaaaggcgaaggcagcttactgggcctatactgacgctgaggaacgaaagcgtggggagcaaac
 SEQ_LEN: 425
 SCORES: ?????B?BB!-5@BC9CFFB!+C!C!E?@B=!,!5!FE,!77!CE)!!:CD=@DEFHDDBCFFEE8:=@,BD=,3;BEEEE?;BCCECCCE?B;?B3:AA88?:**:CAAA88:::::??8*)AC).8?)*:???:0?CEAAE8**4;8;AA:??:8:??8?EE:AC?;42.)4?*::8).'''..'.8C8A)8AAE:A;?'0:?CAEE?8''4A8?*8'0)''8*0*/:2?.?32;816666..//((6/'(.'''6(6;64-2'8(6///-'96;/(;8@;*78*;@@@@9**0*88;@0*2*9@3*D199;99DDD99D@EDECCDDDDDEEDD9EDDDDEEEDDEC!DE@AC5A=CCAE!5C9+E=@DB@DFFC=C+EEEA9.A999EC7@7-C!A/ECAAEEEEEEC@@@@@-5-!!!!!
 OVERLAP_LEN: 9
-HAMMING_DIST: 0
 ---
index 1bf0ec54c1b533dd035944f89d9d5299212bb12b..a11b38c16e8f93ada2e21f23ffa76e14eb0354a0 100644 (file)
@@ -1,21 +1,18 @@
-SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:13906:2139 1:N:0:14:overlap=96:hamming=2
+SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:13906:2139 1:N:0:14:overlap=96
 SEQ: tagggaatcttgcacaatggaggaaactctgatgcagcgatgccgcgtgagtgaagaaggcctttgggttgtaaagctctttcgtcggggaagaaaatgactgtacccgaataagaaggtccggctaacttcgtgccagcagccgcggtaatacgAAGGGACCTAGCGTAGTTCGGAATTACTGGGCTTAAAGAGTTCGTAGGTGGTTAAAAAAGTTGGTGGTGAAATCCCAGAGCTTAACTCTGGAACTGccatcaaaactttttagctagagtatgatagaggaaagcagaatttctagtgtagaggtgaaattcgtagatattagaaagaataccgattgcgaaggcagctttctggatcattactgacgctgaggaacgaaagcatgggtagcgaaga
 SEQ_LEN: 402
 SCORES: !???9?BBBDBDDBDDFFFFFFHHHIFHFHHIHHFHHHHEDDEGHHHCEHE?EFHFGHFHFHIHIIIHCECEHHHIFHIIIHGHFHHHHHHEEFFFFFEEFFFEFFFEEEEEFFFFFFFCEFBEEDEFFFFFFEFEFFEEFFFFDDDDA?EEEFFB@C?8B=:7785;660@?@FEB?7B;?2BBA?CED@@@B?=5@DEFD??;8@E0@BEC788>@?95*4-:=7BEB8B7BB2@B?8&+98>2CDBB>A=AEECCEEEEDB=FFEFEEEDEFFFBFFFFF?.HFDBDD?FFHFHFFFHFFFHHGGGGCBFAHHIHFHHHGCGGIIIHHHGE5!HFBDE@E@DGHHHFFFHIHHFFGFDC0?E?FHEHEHIIHHFHIHFHFFFFFFDDBDBADD?BB??!
 OVERLAP_LEN: 96
-HAMMING_DIST: 2
 ---
-SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:14865:2158 1:N:0:14:overlap=96:hamming=0
+SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:14865:2158 1:N:0:14:overlap=96
 SEQ: tagggaatcttgcacaatggaggaaactctgatgcagcgatgccgcgtgagtgaagaaggcctttgggttgtaaagctctttcgtcggggaagaaaatgactgtacccgaataagaaggtccggctaacttcgtgccagcagccgcggtaatacGAAGGGACCTAGCGTAGTTCGGAATTACTGGGCTTAAAGAGTTCGTAGGTGGTTAAAAAAGTTGGTGGTGAAATCCCAGAGCTTAACTCTGGAACTgccatcaaaactttttagctagagtatgatagaggaaagcagaatttctagtgtagaggtgaaattcgtagatattagaaagaataccgattgcgaaggcagctttctggatcattactgacactgaggaacgaaagcatgggtagcgaag
 SEQ_LEN: 401
 SCORES: ?????BBBBBDDBDDBFFFFFFECHFHFHFHGHHFGD?!CFHHHHHH!DEEFFFFDFFHF@FHHHIFHEEHHHIIHHHIHIHHHDEHHHHFEEFFF?FEEEEFEEFFEE:!CEEFEFFFBEEEEEDEFE::AE:?AECEFEF?A!;D88;CEEEC@@668CCBC??C;8?+02>CEA>CB@;;?8@B@CCDE@D@?5@B>ABB:2::1>CDB=/@>BB@<19=4?6:7@A6@=?36875:8?A=DBDC@ACC;EEEEEEDEEDD;=DFB?FFDFFFB?C=FFFF?HHFDD@.DCFHHFHHHGFHHFHHGECBFFFEF?CHFGEA??HHHIHHHFC!E@DDEDDFHHHDHGGFBE?C=FF?CC?=F?CHFA9C0GBHFFHF;HFFF?FFBBBBB?BB?B??!
 OVERLAP_LEN: 96
-HAMMING_DIST: 0
 ---
-SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:17246:2253 1:N:0:14:overlap=96:hamming=1
+SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:17246:2253 1:N:0:14:overlap=96
 SEQ: agggaatcttgcacaatgggggaaaccctgatgcagcgatgccgcgtgagtgaagaaggcccttgggttgtaaagctctttcgtcggggaagaaaatgactgtacccgaataagaaggtccggctaacttcgtgccagcagccgcggtaatacGAAGGGACCTAGCGTAGTTCGGAATTACTGGGCTTAAAGAGTTCGTAGGTGGTTAAAAAAGTTGATGGTGAAATCCCAAGGCTCAACCTTGGAACTgccatcaaaactttttagctagagtatgatagaggaaagtggaatttctagtgtagaggtgaaattcgtagatattagaaagaacatcaaaagcgaaggcaactttctggatcattactgacactgaggaacgaaagcatgggtagcgaagagg
 SEQ_LEN: 403
 SCORES: !???B-!-?!BBBBBFFF;!CEEECC;EFFH=EGBE=CEFHD@CDCE!+!5C=FBAEADHFEHHHHHHECCG=CD=CCF=DC=DDACEE5:DEEEECBECEC3?3;?B:@CEEECEEC?CC????E?CAAAE?ACEEEEEEEE????A;)::ABCEB8CA669967)2955CA@=FEB=??<2:>?@6@7B7>449*<<9<957:9D;AD88;60?CEB2@8:;:B?>@749-663=<2-6<>46>A=64.EFEEEEDDEFBDDFFFFFFHHHHFFCC.?HFFCFHFDFDDFFGBBHHHHFGDHHGGBHIHHIIHHHDGHHHFECHHIIHFHGEA+HHFC@EHIIHHGFEGHHFGDHGDDC0?GGFGC90A0DBFFIIHFHFFFFFBBBBBDBB??BB?????
 OVERLAP_LEN: 96
-HAMMING_DIST: 1
 ---
index 3c238b5570aeb6984339affe90ec5cb17b406ce7..4a1aa1369d5d1d61bfbc557d6cfb4261fbf97eb7 100755 (executable)
@@ -16,7 +16,6 @@ test_ruby
 test_ruby_gem "gnuplot"
 test_ruby_gem "narray"
 test_ruby_gem "RubyInline"
-test_ruby_gem "bzip2-ruby"
 test_aux_program "blastall"
 test_aux_program "blat"
 test_aux_program "bwa"
index 016322727cf15eb824f9e3f0518cca9b2ecb0241..de14b64c1fe080fdcf65c34dd61c6e3dafc9e51f 100644 (file)
 
 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 
+require 'inline'
+require 'maasha/seq/ambiguity'
+
 # Class containing methods to assemble two overlapping sequences into a single.
 class Assemble
+  extend Ambiguity
+
   # Class method to assemble two Seq objects.
   def self.pair(entry1, entry2, options = {})
     assemble = self.new(entry1, entry2, options)
@@ -41,17 +46,14 @@ class Assemble
     @options[:overlap_max]      = [@options[:overlap_max], entry1.length, entry2.length].min
   end
 
-  # Method to locate overlapping matche between two sequences.
+  # Method to locate overlapping matches between two sequences.
   def match
     overlap = @options[:overlap_max]
 
-    na_seq1 = NArray.to_na(@entry1.seq.downcase, "byte")
-    na_seq2 = NArray.to_na(@entry2.seq.downcase, "byte")
-
     while overlap >= @options[:overlap_min]
-      hamming_dist = (na_seq1[-1 * overlap .. -1] ^ na_seq2[0 ... overlap]).count_true
+      mismatches_max = (overlap * @options[:mismatches_max] * 0.01).round
 
-      if hamming_dist <= (overlap * @options[:mismatches_max] * 0.01).round
+      if match_C(@entry1.seq, @entry2.seq, @entry1.length - overlap, 0, overlap, mismatches_max)
         entry_left  = @entry1[0 ... @entry1.length - overlap]
         entry_right = @entry2[overlap .. -1]
 
@@ -68,7 +70,7 @@ class Assemble
         entry_overlap.seq.upcase!
         entry_right.seq.downcase!
         entry_merged          = entry_left + entry_overlap + entry_right
-        entry_merged.seq_name = @entry1.seq_name + ":overlap=#{overlap}:hamming=#{hamming_dist}"
+        entry_merged.seq_name = @entry1.seq_name + ":overlap=#{overlap}"
 
         return entry_merged
       end
@@ -98,6 +100,59 @@ class Assemble
 
     merged
   end
+
+  inline do |builder|
+    add_ambiguity_macro(builder)
+
+    # C method for determining if two strings of equal length match
+    # given a maximum allowed mismatches and allowing for IUPAC
+    # ambiguity codes. Returns true if match, else false.
+    builder.c %{
+      VALUE match_C(
+        VALUE _string1,       // String 1
+        VALUE _string2,       // String 2
+        VALUE _offset1,       // Offset 1
+        VALUE _offset2,       // Offset 2
+        VALUE _length,        // String length
+        VALUE _max_mismatch   // Maximum mismatches
+      )
+      {
+        char         *string1      = StringValuePtr(_string1);
+        char         *string2      = StringValuePtr(_string2);
+        unsigned int  offset1      = FIX2UINT(_offset1);
+        unsigned int  offset2      = FIX2UINT(_offset2);
+        unsigned int  length       = FIX2UINT(_length);
+        unsigned int  max_mismatch = FIX2UINT(_max_mismatch);
+
+        unsigned int max_match = length - max_mismatch;
+        unsigned int match     = 0;
+        unsigned int mismatch  = 0;
+        unsigned int i         = 0;
+
+        for (i = 0; i < length; i++)
+        {
+          if (MATCH(string1[i + offset1], string2[i + offset2]))
+          {
+            match++;
+
+            if (match >= max_match) {
+              return Qtrue;
+            }
+          }
+          else
+          {
+            mismatch++;
+
+            if (mismatch > max_mismatch) {
+              return Qfalse;
+            }
+          }
+        }
+
+        return Qfalse;
+      }
+    }
+  end
 end