--- /dev/null
+#!/usr/bin/env ruby
+
+# Copyright (C) 2007-2012 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# This program is part of the Biopieces framework (www.biopieces.org).
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# Order records with pair end sequence data.
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+require 'pp'
+require 'maasha/biopieces'
+
+def files_merge(file_in1, file_in2)
+ file_out = "#{file_in1}.merge"
+
+ in1, out1 = Biopieces.open(file_in1, file_out)
+ in2, out2 = Biopieces.open(file_in2)
+
+ record1 = in1.get_entry
+ record2 = in2.get_entry
+
+ while 1
+ if record1 and record2
+ if record1[:SEQ_NAME] < record2[:SEQ_NAME]
+ out1.puts record1
+ record1 = in1.get_entry
+ else
+ out1.puts record2
+ record2 = in2.get_entry
+ end
+ elsif record1
+ out1.puts record1
+ record1 = in1.get_entry
+ elsif record2
+ out1.puts record2
+ record2 = in2.get_entry
+ else
+ break
+ end
+ end
+
+ File.rename(file_out, file_in1)
+ File.delete(file_in2)
+end
+
+casts = []
+casts << {:long=>'block_size', :short=>'b', :type=>'uint', :mandatory=>false, :default=>10_000_000, :allowed=>nil, :disallowed=>'0'}
+
+options = Biopieces.options_parse(ARGV, casts)
+tmpdir = Biopieces.mktmpdir
+
+Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output|
+ block = []
+ file_count = 0
+ block_size = 0
+
+ input.each do |record|
+ block << record
+
+ block_size += record.to_s.size
+
+ if block_size >= options[:block_size]
+ block.sort_by! { |r| r[:SEQ_NAME] }
+
+ tmpfile = File.join(tmpdir, "#{file_count}.stream")
+
+ Biopieces.open(nil, tmpfile) do |tmpin, tmpout|
+ block.each { |r| tmpout.puts r }
+ end
+
+ block = []
+ block_size = 0
+ file_count += 1
+ end
+ end
+
+ block.sort_by! { |r| r[:SEQ_NAME] }
+
+ tmpfile = File.join(tmpdir, "#{file_count}.stream")
+
+ Biopieces.open(nil, tmpfile) do |tmpin, tmpout|
+ block.each { |r| tmpout.puts r }
+ end
+end
+
+while files = Dir.glob(File.join(tmpdir, "*.stream")) and files.size > 1
+ 0.step(files.size - 2, 2) do |i|
+ files_merge(files[i], files[i + 1])
+ end
+end
+
+Biopieces.open(File.join(tmpdir, "0.stream"), options[:stream_out]) do |input, output|
+ record1 = input.get_entry
+ record2 = input.get_entry
+
+ while record1 and record2
+ case record1[:SEQ_NAME]
+ when /^(.+)\/(\d)$/ # Illumina 1.5
+ name1 = $1
+ pair1 = $2.to_i
+ when /^(.+) (\d):/ # Illumina 1.8
+ name1 = $1
+ pair1 = $2.to_i
+ else
+ $stderr.puts "WARNING: Unmatched sequence name: #{record1[:SEQ_NAME]}"
+ end
+
+ case record2[:SEQ_NAME]
+ when /^(.+)\/(\d)$/ # Illumina 1.5
+ name2 = $1
+ pair2 = $2.to_i
+ when /^(.+) (\d):/ # Illumina 1.8
+ name2 = $1
+ pair2 = $2.to_i
+ else
+ $stderr.puts "WARNING: Unmatched sequence name: #{record1[:SEQ_NAME]}"
+ end
+
+ if name1 == name2 and (pair1 - pair2).abs == 1
+ record1[:ORDER] = "paired"
+ record2[:ORDER] = "paired"
+ output.puts record1
+ output.puts record2
+ record1 = input.get_entry
+ record2 = input.get_entry
+ else
+ record1[:ORDER] = "orphan #{pair1}"
+ output.puts record1
+ record1 = record2
+ record2 = input.get_entry
+ end
+ end
+end
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+__END__