]> git.donarmstrong.com Git - biopieces.git/commitdiff
adding order_pairs3
authormartinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Thu, 1 Nov 2012 14:17:29 +0000 (14:17 +0000)
committermartinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Thu, 1 Nov 2012 14:17:29 +0000 (14:17 +0000)
git-svn-id: http://biopieces.googlecode.com/svn/trunk@1980 74ccb610-7750-0410-82ae-013aeee3265d

bp_bin/order_pairs2
bp_bin/order_pairs3 [new file with mode: 0755]

index 255cbd6c91724164463966801247d20093878f39..6eb247e4aa8e35f5e856261fcc7cf35de0af2b7a 100755 (executable)
@@ -39,7 +39,7 @@ options = Biopieces.options_parse(ARGV)
 
 tmpdir  = Biopieces.mktmpdir
 #tmpdir  = "Sletmig"
-tmpfile = File.join(tmpdir, "data")
+tmpfile = File.join(tmpdir, "data.gdbm")
 
 db = GDBM.new(tmpfile)
 
@@ -83,7 +83,7 @@ Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output|
     record2 = Marshal.load(record2) unless record2.nil?
 
     if record2.nil?
-      record1[:ORDER] = "orphan #{pair}"
+      record1[:ORDER] = "orphan#{pair}"
       output.puts record1
     else
       record1[:ORDER] = "paired"
diff --git a/bp_bin/order_pairs3 b/bp_bin/order_pairs3
new file mode 100755 (executable)
index 0000000..0c51891
--- /dev/null
@@ -0,0 +1,160 @@
+#!/usr/bin/env ruby
+
+# Copyright (C) 2007-2012 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# This program is part of the Biopieces framework (www.biopieces.org).
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# Order records with pair end sequence data.
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+require 'pp'
+require 'maasha/biopieces'
+
+def files_merge(file_in1, file_in2)
+  file_out = "#{file_in1}.merge"
+
+  in1, out1 = Biopieces.open(file_in1, file_out)
+  in2, out2 = Biopieces.open(file_in2)
+
+  record1 = in1.get_entry
+  record2 = in2.get_entry
+
+  while 1
+    if record1 and record2
+      if record1[:SEQ_NAME] < record2[:SEQ_NAME]
+        out1.puts record1
+        record1 = in1.get_entry
+      else
+        out1.puts record2
+        record2 = in2.get_entry
+      end
+    elsif record1
+      out1.puts record1
+      record1 = in1.get_entry
+    elsif record2
+      out1.puts record2
+      record2 = in2.get_entry
+    else
+      break
+    end
+  end
+
+  File.rename(file_out, file_in1)
+  File.delete(file_in2)
+end
+
+casts = []
+casts << {:long=>'block_size', :short=>'b', :type=>'uint', :mandatory=>false, :default=>10_000_000, :allowed=>nil, :disallowed=>'0'}
+
+options = Biopieces.options_parse(ARGV, casts)
+tmpdir  = Biopieces.mktmpdir
+
+Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output|
+  block = []
+  file_count = 0
+  block_size = 0
+
+  input.each do |record|
+    block << record
+
+    block_size += record.to_s.size
+
+    if block_size >= options[:block_size]
+      block.sort_by! { |r| r[:SEQ_NAME] }
+
+      tmpfile = File.join(tmpdir, "#{file_count}.stream")
+
+      Biopieces.open(nil, tmpfile) do |tmpin, tmpout|
+        block.each { |r| tmpout.puts r }
+      end
+
+      block       = []
+      block_size  = 0
+      file_count += 1
+    end
+  end
+
+  block.sort_by! { |r| r[:SEQ_NAME] }
+
+  tmpfile = File.join(tmpdir, "#{file_count}.stream")
+
+  Biopieces.open(nil, tmpfile) do |tmpin, tmpout|
+    block.each { |r| tmpout.puts r }
+  end
+end
+
+while files = Dir.glob(File.join(tmpdir, "*.stream")) and files.size > 1
+  0.step(files.size - 2, 2) do |i|
+    files_merge(files[i], files[i + 1])
+  end
+end
+
+Biopieces.open(File.join(tmpdir, "0.stream"), options[:stream_out]) do |input, output|
+  record1 = input.get_entry
+  record2 = input.get_entry
+
+  while record1 and record2
+    case record1[:SEQ_NAME]
+    when /^(.+)\/(\d)$/   # Illumina 1.5
+      name1 = $1
+      pair1 = $2.to_i
+    when /^(.+) (\d):/    # Illumina 1.8
+      name1 = $1
+      pair1 = $2.to_i
+    else
+      $stderr.puts "WARNING: Unmatched sequence name: #{record1[:SEQ_NAME]}"
+    end
+
+    case record2[:SEQ_NAME]
+    when /^(.+)\/(\d)$/   # Illumina 1.5
+      name2 = $1
+      pair2 = $2.to_i
+    when /^(.+) (\d):/    # Illumina 1.8
+      name2 = $1
+      pair2 = $2.to_i
+    else
+      $stderr.puts "WARNING: Unmatched sequence name: #{record1[:SEQ_NAME]}"
+    end
+
+    if name1 == name2 and (pair1 - pair2).abs == 1
+      record1[:ORDER] = "paired"
+      record2[:ORDER] = "paired"
+      output.puts record1
+      output.puts record2
+      record1 = input.get_entry
+      record2 = input.get_entry
+    else
+      record1[:ORDER] = "orphan #{pair1}"
+      output.puts record1
+      record1 = record2
+      record2 = input.get_entry
+    end
+  end
+end
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+__END__