From 88de212818159ab70b40b5aba9a85f6cbff6e395 Mon Sep 17 00:00:00 2001 From: martinahansen Date: Wed, 13 Mar 2013 11:33:46 +0000 Subject: [PATCH] assemble_pairs revamp git-svn-id: http://biopieces.googlecode.com/svn/trunk@2145 74ccb610-7750-0410-82ae-013aeee3265d --- bp_bin/assemble_pairs | 129 +++++++++++++++++++++-------------------- bp_bin/assemble_pairs2 | 119 ------------------------------------- 2 files changed, 66 insertions(+), 182 deletions(-) delete mode 100755 bp_bin/assemble_pairs2 diff --git a/bp_bin/assemble_pairs b/bp_bin/assemble_pairs index c9e0eb0..66c2a7b 100755 --- a/bp_bin/assemble_pairs +++ b/bp_bin/assemble_pairs @@ -29,82 +29,85 @@ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< require 'maasha/biopieces' -require 'maasha/fastq' +require 'maasha/seq' +require 'maasha/seq/assemble' require 'pp' -casts = [] -casts << {:long=>'overlap', :short=>'o', :type=>'uint', :mandatory=>false, :default=>1, :allowed=>nil, :disallowed=>"0"} +def names_match(entry1, entry2) + if entry1.seq_name =~ /^([^ ]+) \d:/ + name1 = $1 + elsif entry1.seq_name =~ /^(.+)\/\d$/ + name1 = $1 + else + raise "Could not match sequence name: #{entry1.seq_name}" + end -options = Biopieces.options_parse(ARGV, casts) + if entry2.seq_name =~ /^([^ ]+) \d:/ + name2 = $1 + elsif entry2.seq_name =~ /^(.+)\/\d$/ + name2 = $1 + else + raise "Could not match sequence name: #{entry2.seq_name}" + end -tmpdir = Biopieces.mktmpdir -file_in1 = File.join(tmpdir, "in1.fq") -file_in2 = File.join(tmpdir, "in2.fq") -file_out = File.join(tmpdir, "out.fq") + name1 == name2 +end -io_in1 = Fastq.open(file_in1, 'w') -io_in2 = Fastq.open(file_in2, 'w') +casts = [] +casts << {:long=>'mismatches', :short=>'m', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'overlap_min', :short=>'o', :type=>'uint', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>"0"} +casts << {:long=>'overlap_max', :short=>'p', :type=>'uint', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>"0"} + +options = Biopieces.options_parse(ARGV, casts) -name1 = "" -name2 = "" -entry1 = "" -entry2 = "" +entry1 = nil +entry2 = nil Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| input.each_record do |record| if record[:SEQ_NAME] and record[:SEQ] and record[:SCORES] - entry = Seq.new_bp(record) - - case entry.seq_name - when /^(.+)\/(\d)$/ # Illumina 1.5 - name = $1 - pair = $2.to_i - type = 15 - when /^(.+) (\d):/ # Illumina 1.8 - name = $1 - pair = $2.to_i - type = 18 - else - $stderr.puts "WARNING: Unmatched sequence name: #{entry.seq_name}" - end - - if pair == 1 - name1 = name.dup - entry1 = entry.dup - else - name2 = name.dup - entry2 = entry.dup + if entry1.nil? + entry1 = Seq.new_bp(record) + elsif entry2.nil? + entry2 = Seq.new_bp(record) end - if name1 != nil and name1 == name2 - io_in1.puts entry1.to_fastq - io_in2.puts entry2.to_fastq + if entry1 and + entry2 and + entry1.length >= options[:overlap_min] and + entry2.length >= options[:overlap_min] + + if names_match(entry1, entry2) + entry2.type = :dna + entry2.reverse!.complement! + + merged = Assemble.pair( + entry1, + entry2, + mismatches_max:options[:mismatches], + overlap_min:options[:overlap_min], + overlap_max:options[:overlap_max] + ) + + if merged + new_record = merged.to_bp + + if merged.seq_name =~ /overlap=(\d+):hamming=(\d+)$/ + new_record[:OVERLAP_LEN] = $1 + new_record[:HAMMING_DIST] = $2 + end + + output.puts new_record + end + else + raise "name mismatch: #{entry1.seq_name} != #{entry2.seq_name}" + end + + entry1 = nil + entry2 = nil end - end - end - - io_in1.close - io_in2.close - - cmd = "pandaseq" - cmd << " -F" - cmd << " -B" - cmd << " -N" - cmd << " -o #{options[:overlap]}" - cmd << " -f #{file_in1}" - cmd << " -r #{file_in2}" - cmd << " > #{file_out}" - cmd << " 2> /dev/null" unless options[:verbose] - - $stderr.puts cmd if options[:verbose] - - system(cmd) - - raise RunTimeError "Error: command failed: #{cmd}" unless $?.success? - - Fastq.open(file_out) do |ios| - ios.each do |entry| - output.puts entry.to_bp + else + output.puts record end end end diff --git a/bp_bin/assemble_pairs2 b/bp_bin/assemble_pairs2 deleted file mode 100755 index 66c2a7b..0000000 --- a/bp_bin/assemble_pairs2 +++ /dev/null @@ -1,119 +0,0 @@ -#!/usr/bin/env ruby - -# Copyright (C) 2007-2013 Martin A. Hansen. - -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 2 -# of the License, or (at your option) any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. - -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - -# http://www.gnu.org/copyleft/gpl.html - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - -# This program is part of the Biopieces framework (www.biopieces.org). - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - -# Assemble ordered overlapping pair-end sequences in the stream. - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - -require 'maasha/biopieces' -require 'maasha/seq' -require 'maasha/seq/assemble' -require 'pp' - -def names_match(entry1, entry2) - if entry1.seq_name =~ /^([^ ]+) \d:/ - name1 = $1 - elsif entry1.seq_name =~ /^(.+)\/\d$/ - name1 = $1 - else - raise "Could not match sequence name: #{entry1.seq_name}" - end - - if entry2.seq_name =~ /^([^ ]+) \d:/ - name2 = $1 - elsif entry2.seq_name =~ /^(.+)\/\d$/ - name2 = $1 - else - raise "Could not match sequence name: #{entry2.seq_name}" - end - - name1 == name2 -end - -casts = [] -casts << {:long=>'mismatches', :short=>'m', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'overlap_min', :short=>'o', :type=>'uint', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>"0"} -casts << {:long=>'overlap_max', :short=>'p', :type=>'uint', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>"0"} - -options = Biopieces.options_parse(ARGV, casts) - -entry1 = nil -entry2 = nil - -Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| - input.each_record do |record| - if record[:SEQ_NAME] and record[:SEQ] and record[:SCORES] - if entry1.nil? - entry1 = Seq.new_bp(record) - elsif entry2.nil? - entry2 = Seq.new_bp(record) - end - - if entry1 and - entry2 and - entry1.length >= options[:overlap_min] and - entry2.length >= options[:overlap_min] - - if names_match(entry1, entry2) - entry2.type = :dna - entry2.reverse!.complement! - - merged = Assemble.pair( - entry1, - entry2, - mismatches_max:options[:mismatches], - overlap_min:options[:overlap_min], - overlap_max:options[:overlap_max] - ) - - if merged - new_record = merged.to_bp - - if merged.seq_name =~ /overlap=(\d+):hamming=(\d+)$/ - new_record[:OVERLAP_LEN] = $1 - new_record[:HAMMING_DIST] = $2 - end - - output.puts new_record - end - else - raise "name mismatch: #{entry1.seq_name} != #{entry2.seq_name}" - end - - entry1 = nil - entry2 = nil - end - else - output.puts record - end - end -end - - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - -__END__ -- 2.39.2