git.donarmstrong.com Git - biopieces.git/commitdiff
fixed bug in assemble_pairs
author martinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Mon, 2 Dec 2013 14:44:35 +0000 (14:44 +0000)
committer martinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Mon, 2 Dec 2013 14:44:35 +0000 (14:44 +0000)
git-svn-id: http://biopieces.googlecode.com/svn/trunk@2264 74ccb610-7750-0410-82ae-013aeee3265d

bp_bin/assemble_pairs
bp_test/out/assemble_pairs.out.1
bp_test/out/assemble_pairs.out.2
code_ruby/lib/maasha/seq/assemble.rb
code_ruby/test/maasha/seq/test_assemble.rb

index a101f3e16d0b0f09aa2f9dd9380c44bb1ef46231..df1fd0320a94d218e187067e4af03f5ba9436849 100755 (executable)
@@ -54,9 +54,9 @@ def names_match(entry1, entry2)
 end
 
 casts = []
-casts << {:long=>'mismatches',  :short=>'m', :type=>'uint', :mandatory=>false, :default=>5,   :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'overlap_min', :short=>'o', :type=>'uint', :mandatory=>false, :default=>1,   :allowed=>nil, :disallowed=>"0"}
-casts << {:long=>'overlap_max', :short=>'p', :type=>'uint', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>"0"}
+casts << {long: 'mismatches',  short: 'm', type: 'uint', mandatory: false, default: 5,   allowed: nil, disallowed: nil}
+casts << {long: 'overlap_min', short: 'o', type: 'uint', mandatory: false, default: 1,   allowed: nil, disallowed: "0"}
+casts << {long: 'overlap_max', short: 'p', type: 'uint', mandatory: false, default: nil, allowed: nil, disallowed: "0"}
 
 options = Biopieces.options_parse(ARGV, casts)
 
@@ -84,16 +84,17 @@ Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output|
           merged = Assemble.pair(
             entry1,
             entry2,
-            mismatches_max:options[:mismatches],
-            overlap_min:options[:overlap_min],
-            overlap_max:options[:overlap_max]
+            mismatches_max: options[:mismatches],
+            overlap_min: options[:overlap_min],
+            overlap_max: options[:overlap_max]
           )
 
           if merged
             new_record = merged.to_bp
 
-            if merged.seq_name =~ /overlap=(\d+)$/
-              new_record[:OVERLAP_LEN] = $1
+            if merged.seq_name =~ /overlap=(\d+):hamming=(\d+)$/
+              new_record[:OVERLAP_LEN]  = $1
+              new_record[:HAMMING_DIST] = $2
             end
 
             output.puts new_record
index f9cf1750ad17d75a7f7b3441ab59a77bdadd1f13..f804cb79ea1d7699d5f1c6ac8f2296a8621ec445 100644 (file)
@@ -1,30 +1,35 @@
-SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:14862:1868 1:N:0:14:overlap=49
+SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:14862:1868 1:N:0:14:overlap=49:hamming=1
 SEQ: tggggaatattggacaatgggggcaaccctgatccagcaataccgcgtgtgtgaagaaggcctgagggttgtaaagcactttcaattgtgaagaaaagttaacggttaataaccgttagccttgacgttaactttagaagaagcaccggctaactccgtgccagcagccgcggtaatacggagggtgcaagcgttaatcgGAATTACTGGGCGTAAAGCGTGCGTAGGCGGTTTATTAAGTCAGATGTGaaagccccgggcttaacctgggaactgcatttgaaactggtcaactagagtatggtagaggaaagtggaatttctggtgtagcggtgaaatgcgtagatatcagaaggaacatcaatggcgaaggcagctttctggaccaatactgacgctgaggtacgaaagcgtgggtagcaaacagg
 SEQ_LEN: 429
 SCORES: !??????BDDDDDDDDGGGGGGGHHIIIEHIHHFGGHFHHGHFHHDHEHEHHFAFFGFFHFHHFFHHHEFEEHHHHHHHHHHHHFFFHFHHHHHHHHHHHHGBDEGGGGGGGGGGGGGGGGEGEGGGGGCEGGGGECCECEEECGGG!ADGCGGGEGGEGGGGGEGCE8!2!DC!EEEGGC?!DGCCCEC:C?CCEGGGG??288<8B47>43,(195??=36)745<6.;:=?D?@6AB?@D8?@C=?AA;4'8D8?:::A?1*)=,,==EC==,,ACFCAAEBC=AEFCEBEDBDEEDED=EDD=?BFBFDFB!DFF@FFFHHHHHHHHHGGGDDHHHHFHDHIHHHHIIIIIHHFGCFFHHE!EEDFG?HIHGFGGFFFFF?CDCE?9FEHDHHGE;F;IHFFHFFFEEFDDDDDB!-!BB?????
 OVERLAP_LEN: 49
+HAMMING_DIST: 1
 ---
-SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:13906:2139 1:N:0:14:overlap=96
+SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:13906:2139 1:N:0:14:overlap=96:hamming=2
 SEQ: tagggaatcttgcacaatggaggaaactctgatgcagcgatgccgcgtgagtgaagaaggcctttgggttgtaaagctctttcgtcggggaagaaaatgactgtacccgaataagaaggtccggctaacttcgtgccagcagccgcggtaatacgAAGGGACCTAGCGTAGTTCGGAATTACTGGGCTTAAAGAGTTCGTAGGTGGTTAAAAAAGTTGGTGGTGAAATCCCAGAGCTTAACTCTGGAACTGccatcaaaactttttagctagagtatgatagaggaaagcagaatttctagtgtagaggtgaaattcgtagatattagaaagaataccgattgcgaaggcagctttctggatcattactgacgctgaggaacgaaagcatgggtagcgaaga
 SEQ_LEN: 402
 SCORES: !???9?BBBDBDDBDDFFFFFFHHHIFHFHHIHHFHHHHEDDEGHHHCEHE?EFHFGHFHFHIHIIIHCECEHHHIFHIIIHGHFHHHHHHEEFFFFFEEFFFEFFFEEEEEFFFFFFFCEFBEEDEFFFFFFEFEFFEEFFFFDDDDA?EEEFFB@C?8B=:7785;660@?@FEB?7B;?2BBA?CED@@@B?=5@DEFD??;8@E0@BEC788>@?95*4-:=7BEB8B7BB2@B?8&+98>2CDBB>A=AEECCEEEEDB=FFEFEEEDEFFFBFFFFF?.HFDBDD?FFHFHFFFHFFFHHGGGGCBFAHHIHFHHHGCGGIIIHHHGE5!HFBDE@E@DGHHHFFFHIHHFFGFDC0?E?FHEHEHIIHHFHIHFHFFFFFFDDBDBADD?BB??!
 OVERLAP_LEN: 96
+HAMMING_DIST: 2
 ---
-SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:14865:2158 1:N:0:14:overlap=96
+SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:14865:2158 1:N:0:14:overlap=96:hamming=0
 SEQ: tagggaatcttgcacaatggaggaaactctgatgcagcgatgccgcgtgagtgaagaaggcctttgggttgtaaagctctttcgtcggggaagaaaatgactgtacccgaataagaaggtccggctaacttcgtgccagcagccgcggtaatacGAAGGGACCTAGCGTAGTTCGGAATTACTGGGCTTAAAGAGTTCGTAGGTGGTTAAAAAAGTTGGTGGTGAAATCCCAGAGCTTAACTCTGGAACTgccatcaaaactttttagctagagtatgatagaggaaagcagaatttctagtgtagaggtgaaattcgtagatattagaaagaataccgattgcgaaggcagctttctggatcattactgacactgaggaacgaaagcatgggtagcgaag
 SEQ_LEN: 401
 SCORES: ?????BBBBBDDBDDBFFFFFFECHFHFHFHGHHFGD?!CFHHHHHH!DEEFFFFDFFHF@FHHHIFHEEHHHIIHHHIHIHHHDEHHHHFEEFFF?FEEEEFEEFFEE:!CEEFEFFFBEEEEEDEFE::AE:?AECEFEF?A!;D88;CEEEC@@668CCBC??C;8?+02>CEA>CB@;;?8@B@CCDE@D@?5@B>ABB:2::1>CDB=/@>BB@<19=4?6:7@A6@=?36875:8?A=DBDC@ACC;EEEEEEDEEDD;=DFB?FFDFFFB?C=FFFF?HHFDD@.DCFHHFHHHGFHHFHHGECBFFFEF?CHFGEA??HHHIHHHFC!E@DDEDDFHHHDHGGFBE?C=FF?CC?=F?CHFA9C0GBHFFHF;HFFF?FFBBBBB?BB?B??!
 OVERLAP_LEN: 96
+HAMMING_DIST: 0
 ---
-SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:17246:2253 1:N:0:14:overlap=96
+SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:17246:2253 1:N:0:14:overlap=96:hamming=1
 SEQ: agggaatcttgcacaatgggggaaaccctgatgcagcgatgccgcgtgagtgaagaaggcccttgggttgtaaagctctttcgtcggggaagaaaatgactgtacccgaataagaaggtccggctaacttcgtgccagcagccgcggtaatacGAAGGGACCTAGCGTAGTTCGGAATTACTGGGCTTAAAGAGTTCGTAGGTGGTTAAAAAAGTTGATGGTGAAATCCCAAGGCTCAACCTTGGAACTgccatcaaaactttttagctagagtatgatagaggaaagtggaatttctagtgtagaggtgaaattcgtagatattagaaagaacatcaaaagcgaaggcaactttctggatcattactgacactgaggaacgaaagcatgggtagcgaagagg
 SEQ_LEN: 403
 SCORES: !???B-!-?!BBBBBFFF;!CEEECC;EFFH=EGBE=CEFHD@CDCE!+!5C=FBAEADHFEHHHHHHECCG=CD=CCF=DC=DDACEE5:DEEEECBECEC3?3;?B:@CEEECEEC?CC????E?CAAAE?ACEEEEEEEE????A;)::ABCEB8CA669967)2955CA@=FEB=??<2:>?@6@7B7>449*<<9<957:9D;AD88;60?CEB2@8:;:B?>@749-663=<2-6<>46>A=64.EFEEEEDDEFBDDFFFFFFHHHHFFCC.?HFFCFHFDFDDFFGBBHHHHFGDHHGGBHIHHIIHHHDGHHHFECHHIIHFHGEA+HHFC@EHIIHHGFEGHHFGDHGDDC0?GGFGC90A0DBFFIIHFHFFFFFBBBBBDBB??BB?????
 OVERLAP_LEN: 96
+HAMMING_DIST: 1
 ---
-SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:13072:2276 1:N:0:14:overlap=9
+SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:13072:2276 1:N:0:14:overlap=9:hamming=0
 SEQ: gggaattttgcgcaatgggcgaaagcctgacgcagcaacgccgcgggatcgaagaagctctgcggagtgtaaagatctgtcataagggaagaataacgagtattctaacaaaatattcgtctgacggtaccttataagaaagccacggctaacttcgtgccagcagccgcggtaatacgaggggggcaagcgttatccggaatcattgggcgtaaaggggtcgtatgaggACTGATCAGttggaattaaaagaccgcggcttaaccgggggaacggtttcaatactgtcagtcttgagtgaggtagaggtaagcggaattcctggtgtagcggtgaaatgcgtagatatcaggaggaacatcaaaggcgaaggcagcttactgggcctatactgacgctgaggaacgaaagcgtggggagcaaac
 SEQ_LEN: 425
 SCORES: ?????B?BB!-5@BC9CFFB!+C!C!E?@B=!,!5!FE,!77!CE)!!:CD=@DEFHDDBCFFEE8:=@,BD=,3;BEEEE?;BCCECCCE?B;?B3:AA88?:**:CAAA88:::::??8*)AC).8?)*:???:0?CEAAE8**4;8;AA:??:8:??8?EE:AC?;42.)4?*::8).'''..'.8C8A)8AAE:A;?'0:?CAEE?8''4A8?*8'0)''8*0*/:2?.?32;816666..//((6/'(.'''6(6;64-2'8(6///-'96;/(;8@;*78*;@@@@9**0*88;@0*2*9@3*D199;99DDD99D@EDECCDDDDDEEDD9EDDDDEEEDDEC!DE@AC5A=CCAE!5C9+E=@DB@DFFC=C+EEEA9.A999EC7@7-C!A/ECAAEEEEEEC@@@@@-5-!!!!!
 OVERLAP_LEN: 9
+HAMMING_DIST: 0
 ---
index a11b38c16e8f93ada2e21f23ffa76e14eb0354a0..1bf0ec54c1b533dd035944f89d9d5299212bb12b 100644 (file)
@@ -1,18 +1,21 @@
-SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:13906:2139 1:N:0:14:overlap=96
+SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:13906:2139 1:N:0:14:overlap=96:hamming=2
 SEQ: tagggaatcttgcacaatggaggaaactctgatgcagcgatgccgcgtgagtgaagaaggcctttgggttgtaaagctctttcgtcggggaagaaaatgactgtacccgaataagaaggtccggctaacttcgtgccagcagccgcggtaatacgAAGGGACCTAGCGTAGTTCGGAATTACTGGGCTTAAAGAGTTCGTAGGTGGTTAAAAAAGTTGGTGGTGAAATCCCAGAGCTTAACTCTGGAACTGccatcaaaactttttagctagagtatgatagaggaaagcagaatttctagtgtagaggtgaaattcgtagatattagaaagaataccgattgcgaaggcagctttctggatcattactgacgctgaggaacgaaagcatgggtagcgaaga
 SEQ_LEN: 402
 SCORES: !???9?BBBDBDDBDDFFFFFFHHHIFHFHHIHHFHHHHEDDEGHHHCEHE?EFHFGHFHFHIHIIIHCECEHHHIFHIIIHGHFHHHHHHEEFFFFFEEFFFEFFFEEEEEFFFFFFFCEFBEEDEFFFFFFEFEFFEEFFFFDDDDA?EEEFFB@C?8B=:7785;660@?@FEB?7B;?2BBA?CED@@@B?=5@DEFD??;8@E0@BEC788>@?95*4-:=7BEB8B7BB2@B?8&+98>2CDBB>A=AEECCEEEEDB=FFEFEEEDEFFFBFFFFF?.HFDBDD?FFHFHFFFHFFFHHGGGGCBFAHHIHFHHHGCGGIIIHHHGE5!HFBDE@E@DGHHHFFFHIHHFFGFDC0?E?FHEHEHIIHHFHIHFHFFFFFFDDBDBADD?BB??!
 OVERLAP_LEN: 96
+HAMMING_DIST: 2
 ---
-SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:14865:2158 1:N:0:14:overlap=96
+SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:14865:2158 1:N:0:14:overlap=96:hamming=0
 SEQ: tagggaatcttgcacaatggaggaaactctgatgcagcgatgccgcgtgagtgaagaaggcctttgggttgtaaagctctttcgtcggggaagaaaatgactgtacccgaataagaaggtccggctaacttcgtgccagcagccgcggtaatacGAAGGGACCTAGCGTAGTTCGGAATTACTGGGCTTAAAGAGTTCGTAGGTGGTTAAAAAAGTTGGTGGTGAAATCCCAGAGCTTAACTCTGGAACTgccatcaaaactttttagctagagtatgatagaggaaagcagaatttctagtgtagaggtgaaattcgtagatattagaaagaataccgattgcgaaggcagctttctggatcattactgacactgaggaacgaaagcatgggtagcgaag
 SEQ_LEN: 401
 SCORES: ?????BBBBBDDBDDBFFFFFFECHFHFHFHGHHFGD?!CFHHHHHH!DEEFFFFDFFHF@FHHHIFHEEHHHIIHHHIHIHHHDEHHHHFEEFFF?FEEEEFEEFFEE:!CEEFEFFFBEEEEEDEFE::AE:?AECEFEF?A!;D88;CEEEC@@668CCBC??C;8?+02>CEA>CB@;;?8@B@CCDE@D@?5@B>ABB:2::1>CDB=/@>BB@<19=4?6:7@A6@=?36875:8?A=DBDC@ACC;EEEEEEDEEDD;=DFB?FFDFFFB?C=FFFF?HHFDD@.DCFHHFHHHGFHHFHHGECBFFFEF?CHFGEA??HHHIHHHFC!E@DDEDDFHHHDHGGFBE?C=FF?CC?=F?CHFA9C0GBHFFHF;HFFF?FFBBBBB?BB?B??!
 OVERLAP_LEN: 96
+HAMMING_DIST: 0
 ---
-SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:17246:2253 1:N:0:14:overlap=96
+SEQ_NAME: M01168:16:000000000-A1R9L:1:1101:17246:2253 1:N:0:14:overlap=96:hamming=1
 SEQ: agggaatcttgcacaatgggggaaaccctgatgcagcgatgccgcgtgagtgaagaaggcccttgggttgtaaagctctttcgtcggggaagaaaatgactgtacccgaataagaaggtccggctaacttcgtgccagcagccgcggtaatacGAAGGGACCTAGCGTAGTTCGGAATTACTGGGCTTAAAGAGTTCGTAGGTGGTTAAAAAAGTTGATGGTGAAATCCCAAGGCTCAACCTTGGAACTgccatcaaaactttttagctagagtatgatagaggaaagtggaatttctagtgtagaggtgaaattcgtagatattagaaagaacatcaaaagcgaaggcaactttctggatcattactgacactgaggaacgaaagcatgggtagcgaagagg
 SEQ_LEN: 403
 SCORES: !???B-!-?!BBBBBFFF;!CEEECC;EFFH=EGBE=CEFHD@CDCE!+!5C=FBAEADHFEHHHHHHECCG=CD=CCF=DC=DDACEE5:DEEEECBECEC3?3;?B:@CEEECEEC?CC????E?CAAAE?ACEEEEEEEE????A;)::ABCEB8CA669967)2955CA@=FEB=??<2:>?@6@7B7>449*<<9<957:9D;AD88;60?CEB2@8:;:B?>@749-663=<2-6<>46>A=64.EFEEEEDDEFBDDFFFFFFHHHHFFCC.?HFFCFHFDFDDFFGBBHHHHFGDHHGGBHIHHIIHHHDGHHHFECHHIIHFHGEA+HHFC@EHIIHHGFEGHHFGDHGDDC0?GGFGC90A0DBFFIIHFHFFFFFBBBBBDBB??BB?????
 OVERLAP_LEN: 96
+HAMMING_DIST: 1
 ---
index c6f528b8ce9800061cd5aabdfada95b1911a5c11..bab869c98b580a52cb1612b9d21b4add8760de10 100644 (file)
@@ -39,6 +39,9 @@ class Assemble
   def initialize(entry1, entry2, options)
     @entry1  = entry1
     @entry2  = entry2
+    @overlap = 0
+    @offset1 = 0
+    @offset2 = 0
     @options = options
     @options[:mismatches_max] ||= 0
     @options[:overlap_min]    ||= 1
@@ -47,41 +50,70 @@ class Assemble
   # Method to locate overlapping matches between two sequences.
   def match
     if @options[:overlap_max]
-      overlap = [@options[:overlap_max], @entry1.length, @entry2.length].min
+      @overlap = [@options[:overlap_max], @entry1.length, @entry2.length].min
     else
-      overlap = [@entry1.length, @entry2.length].min
+      @overlap = [@entry1.length, @entry2.length].min
     end
 
-    while overlap >= @options[:overlap_min]
-      mismatches_max = (overlap * @options[:mismatches_max] * 0.01).round
-      
-      offset1 = @entry2.length - overlap
-      offset2 = 0
+    diff = @entry1.length - @entry2.length
+    diff = 0 if diff < 0
 
-      if match_C(@entry1.seq, @entry2.seq, offset1, offset2, overlap, mismatches_max)
-        entry_left  = @entry1[0 ... @entry1.length - overlap]
-        entry_right = @entry2[overlap .. -1]
+    @offset1 = @entry1.length - @overlap - diff
 
-        if @entry1.qual and @entry2.qual
-          entry_overlap1 = @entry1[-1 * overlap .. -1]
-          entry_overlap2 = @entry2[0 ... overlap]
+    while @overlap >= @options[:overlap_min]
+      mismatches_max = (@overlap * @options[:mismatches_max] * 0.01).round
+     
+      # puts "diff: #{diff}   offset1: #{@offset1}  offset2: #{@offset2}   overlap: #{@overlap}"
 
-          entry_overlap = merge_overlap(entry_overlap1, entry_overlap2)
-        else
-          entry_overlap = @entry1[-1 * overlap .. -1]
-        end
-
-        entry_left.seq.downcase!
-        entry_overlap.seq.upcase!
-        entry_right.seq.downcase!
+      if mismatches = match_C(@entry1.seq, @entry2.seq, @offset1, @offset2, @overlap, mismatches_max) and mismatches >= 0
         entry_merged          = entry_left + entry_overlap + entry_right
-        entry_merged.seq_name = @entry1.seq_name + ":overlap=#{overlap}"
+        entry_merged.seq_name = @entry1.seq_name + ":overlap=#{@overlap}:hamming=#{mismatches}"
 
         return entry_merged
       end
 
-      overlap -= 1
+      if diff > 0
+        diff -= 1
+      else
+        @overlap -= 1
+      end
+
+      @offset1 += 1
+    end
+  end
+
+  # Method to extract and downcase the left part of an assembled pair.
+  def entry_left
+    entry = @entry1[0 ... @offset1]
+    entry.seq.downcase!
+    entry
+  end
+
+  # Method to extract and downcase the right part of an assembled pair.
+  def entry_right
+    if @entry1.length > @offset1 + @overlap
+      entry = @entry1[@offset1 + @overlap .. -1]
+    else
+      entry = @entry2[@offset2 + @overlap .. -1]
+    end
+
+    entry.seq.downcase!
+    entry
+  end
+
+  # Method to extract and upcase the overlapping part of an assembled pair.
+  def entry_overlap
+    if @entry1.qual and @entry2.qual
+      entry_overlap1 = @entry1[@offset1 ... @offset1 + @overlap]
+      entry_overlap2 = @entry2[@offset2 ... @offset2 + @overlap]
+
+      entry = merge_overlap(entry_overlap1, entry_overlap2)
+    else
+      entry = @entry1[@offset1 ... @offset1 + @overlap]
     end
+
+    entry.seq.upcase!
+    entry
   end
 
   # Method to merge sequence and quality scores in an overlap.
@@ -111,7 +143,7 @@ class Assemble
 
     # C method for determining if two strings of equal length match
     # given a maximum allowed mismatches and allowing for IUPAC
-    # ambiguity codes. Returns true if match, else false.
+    # ambiguity codes. Returns the number of mismatches if match, else -1.
     builder.c %{
       VALUE match_C(
         VALUE _string1,       // String 1
@@ -141,7 +173,7 @@ class Assemble
             match++;
 
             if (match >= max_match) {
-              return Qtrue;
+              return UINT2NUM(mismatch);
             }
           }
           else
@@ -149,12 +181,12 @@ class Assemble
             mismatch++;
 
             if (mismatch > max_mismatch) {
-              return Qfalse;
+              return INT2NUM(-1);
             }
           }
         }
 
-        return Qfalse;
+        return INT2NUM(-1);
       }
     }
   end
index bd1baf0649390f0a2cbabc382af0d9cce9b551a8..93174085f4509767ccb50c047510a73f7f981108 100755 (executable)
@@ -18,6 +18,38 @@ class TestAssemble < Test::Unit::TestCase
     assert_equal("ATCG",    Assemble.pair(Seq.new(seq_name: "t1", seq: "atcg"), Seq.new(seq_name: "t2", seq: "atcg")).seq)
   end
 
+  test "Assemble.pair with first sequence longer than second and terminal overlap returns correctly" do
+    e1 = Seq.new(seq_name: "t1", seq: "AGGCGT")
+    e2 = Seq.new(seq_name: "t2", seq: "GT")
+    a = Assemble.pair(e1, e2)
+    assert_equal("t1:overlap=2:hamming=0", a.seq_name)
+    assert_equal("aggcGT", a.seq)
+  end
+
+  test "Assemble.pair with first sequence longer than second and internal overlap returns correctly" do
+    e1 = Seq.new(seq_name: "t1", seq: "AGGCGT")
+    e2 = Seq.new(seq_name: "t2", seq: "GC")
+    a = Assemble.pair(e1, e2)
+    assert_equal("t1:overlap=2:hamming=0", a.seq_name)
+    assert_equal("agGCgt", a.seq)
+  end
+
+  test "Assemble.pair with first sequence longer than second and initial overlap returns correctly" do
+    e1 = Seq.new(seq_name: "t1", seq: "GTCAGA")
+    e2 = Seq.new(seq_name: "t2", seq: "GT")
+    a = Assemble.pair(e1, e2)
+    assert_equal("t1:overlap=2:hamming=0", a.seq_name)
+    assert_equal("GTcaga", a.seq)
+  end
+
+  test "Assemble.pair with first sequence shorter than second and initial overlap returns correctly" do
+    e1 = Seq.new(seq_name: "t1", seq: "AG")
+    e2 = Seq.new(seq_name: "t2", seq: "AGTCAG")
+    a = Assemble.pair(e1, e2)
+    assert_equal("t1:overlap=2:hamming=0", a.seq_name)
+    assert_equal("AGtcag", a.seq)
+  end
+
   test "Assemble.pair with overlap and overlap_min returns correctly" do
     assert_nil(Assemble.pair(Seq.new(seq: "atcg"), Seq.new(seq: "gatc"), :overlap_min => 2))
     assert_equal("atCGat", Assemble.pair(Seq.new(seq_name: "t1", seq: "atcg"), Seq.new(seq_name: "t2", seq: "cgat"), :overlap_min => 2).seq)
@@ -25,7 +57,7 @@ class TestAssemble < Test::Unit::TestCase
 
   test "Assemble.pair with overlap and overlap_max returns correctly" do
     assert_equal("aTCGa", Assemble.pair(Seq.new(seq_name: "t1", seq: "atcg"), Seq.new(seq_name: "t2", seq: "tcga"), :overlap_max => 3).seq)
-    assert_nil(Assemble.pair(Seq.new(seq: "atcg"), Seq.new(seq: "atcg"), :overlap_max => 3))
+    assert_nil(Assemble.pair(Seq.new(seq_name: "t1", seq: "atcg"), Seq.new(seq_name: "t2", seq: "atcg"), :overlap_max => 3))
   end
 
   test "Assemble.pair with overlap returns correct quality" do