From 6a0164e1147c4ee1c107f277e9d81a0b17f2554b Mon Sep 17 00:00:00 2001 From: martinahansen Date: Thu, 1 Nov 2012 14:17:29 +0000 Subject: [PATCH] adding order_pairs3 git-svn-id: http://biopieces.googlecode.com/svn/trunk@1980 74ccb610-7750-0410-82ae-013aeee3265d --- bp_bin/order_pairs2 | 4 +- bp_bin/order_pairs3 | 160 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 162 insertions(+), 2 deletions(-) create mode 100755 bp_bin/order_pairs3 diff --git a/bp_bin/order_pairs2 b/bp_bin/order_pairs2 index 255cbd6..6eb247e 100755 --- a/bp_bin/order_pairs2 +++ b/bp_bin/order_pairs2 @@ -39,7 +39,7 @@ options = Biopieces.options_parse(ARGV) tmpdir = Biopieces.mktmpdir #tmpdir = "Sletmig" -tmpfile = File.join(tmpdir, "data") +tmpfile = File.join(tmpdir, "data.gdbm") db = GDBM.new(tmpfile) @@ -83,7 +83,7 @@ Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| record2 = Marshal.load(record2) unless record2.nil? if record2.nil? - record1[:ORDER] = "orphan #{pair}" + record1[:ORDER] = "orphan#{pair}" output.puts record1 else record1[:ORDER] = "paired" diff --git a/bp_bin/order_pairs3 b/bp_bin/order_pairs3 new file mode 100755 index 0000000..0c51891 --- /dev/null +++ b/bp_bin/order_pairs3 @@ -0,0 +1,160 @@ +#!/usr/bin/env ruby + +# Copyright (C) 2007-2012 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# This program is part of the Biopieces framework (www.biopieces.org). + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# Order records with pair end sequence data. + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +require 'pp' +require 'maasha/biopieces' + +def files_merge(file_in1, file_in2) + file_out = "#{file_in1}.merge" + + in1, out1 = Biopieces.open(file_in1, file_out) + in2, out2 = Biopieces.open(file_in2) + + record1 = in1.get_entry + record2 = in2.get_entry + + while 1 + if record1 and record2 + if record1[:SEQ_NAME] < record2[:SEQ_NAME] + out1.puts record1 + record1 = in1.get_entry + else + out1.puts record2 + record2 = in2.get_entry + end + elsif record1 + out1.puts record1 + record1 = in1.get_entry + elsif record2 + out1.puts record2 + record2 = in2.get_entry + else + break + end + end + + File.rename(file_out, file_in1) + File.delete(file_in2) +end + +casts = [] +casts << {:long=>'block_size', :short=>'b', :type=>'uint', :mandatory=>false, :default=>10_000_000, :allowed=>nil, :disallowed=>'0'} + +options = Biopieces.options_parse(ARGV, casts) +tmpdir = Biopieces.mktmpdir + +Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| + block = [] + file_count = 0 + block_size = 0 + + input.each do |record| + block << record + + block_size += record.to_s.size + + if block_size >= options[:block_size] + block.sort_by! { |r| r[:SEQ_NAME] } + + tmpfile = File.join(tmpdir, "#{file_count}.stream") + + Biopieces.open(nil, tmpfile) do |tmpin, tmpout| + block.each { |r| tmpout.puts r } + end + + block = [] + block_size = 0 + file_count += 1 + end + end + + block.sort_by! { |r| r[:SEQ_NAME] } + + tmpfile = File.join(tmpdir, "#{file_count}.stream") + + Biopieces.open(nil, tmpfile) do |tmpin, tmpout| + block.each { |r| tmpout.puts r } + end +end + +while files = Dir.glob(File.join(tmpdir, "*.stream")) and files.size > 1 + 0.step(files.size - 2, 2) do |i| + files_merge(files[i], files[i + 1]) + end +end + +Biopieces.open(File.join(tmpdir, "0.stream"), options[:stream_out]) do |input, output| + record1 = input.get_entry + record2 = input.get_entry + + while record1 and record2 + case record1[:SEQ_NAME] + when /^(.+)\/(\d)$/ # Illumina 1.5 + name1 = $1 + pair1 = $2.to_i + when /^(.+) (\d):/ # Illumina 1.8 + name1 = $1 + pair1 = $2.to_i + else + $stderr.puts "WARNING: Unmatched sequence name: #{record1[:SEQ_NAME]}" + end + + case record2[:SEQ_NAME] + when /^(.+)\/(\d)$/ # Illumina 1.5 + name2 = $1 + pair2 = $2.to_i + when /^(.+) (\d):/ # Illumina 1.8 + name2 = $1 + pair2 = $2.to_i + else + $stderr.puts "WARNING: Unmatched sequence name: #{record1[:SEQ_NAME]}" + end + + if name1 == name2 and (pair1 - pair2).abs == 1 + record1[:ORDER] = "paired" + record2[:ORDER] = "paired" + output.puts record1 + output.puts record2 + record1 = input.get_entry + record2 = input.get_entry + else + record1[:ORDER] = "orphan #{pair1}" + output.puts record1 + record1 = record2 + record2 = input.get_entry + end + end +end + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +__END__ -- 2.39.5