X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=bp_bin%2Fassemble_pairs;h=df1fd0320a94d218e187067e4af03f5ba9436849;hb=5526cf56fc20602b9be197db8b1a9a44502791ab;hp=440810b23f5f7fb7b400f09c71af3c6126ed3295;hpb=773dcafe5d5199784cbbb44eff03d3fe10a13cb0;p=biopieces.git diff --git a/bp_bin/assemble_pairs b/bp_bin/assemble_pairs index 440810b..df1fd03 100755 --- a/bp_bin/assemble_pairs +++ b/bp_bin/assemble_pairs @@ -29,83 +29,85 @@ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< require 'maasha/biopieces' -require 'maasha/fastq' +require 'maasha/seq' +require 'maasha/seq/assemble' require 'pp' -casts = [] -casts << {:long=>'overlap', :short=>'o', :type=>'uint', :mandatory=>false, :default=>1, :allowed=>nil, :disallowed=>"0"} +def names_match(entry1, entry2) + if entry1.seq_name =~ /^([^ ]+) \d:/ + name1 = $1 + elsif entry1.seq_name =~ /^(.+)\/\d$/ + name1 = $1 + else + raise "Could not match sequence name: #{entry1.seq_name}" + end -options = Biopieces.options_parse(ARGV, casts) + if entry2.seq_name =~ /^([^ ]+) \d:/ + name2 = $1 + elsif entry2.seq_name =~ /^(.+)\/\d$/ + name2 = $1 + else + raise "Could not match sequence name: #{entry2.seq_name}" + end -tmpdir = Biopieces.mktmpdir -file_in1 = File.join(tmpdir, "in1.fq") -file_in2 = File.join(tmpdir, "in2.fq") -file_out = File.join(tmpdir, "out.fq") + name1 == name2 +end -io_in1 = Fastq.open(file_in1, 'w') -io_in2 = Fastq.open(file_in2, 'w') +casts = [] +casts << {long: 'mismatches', short: 'm', type: 'uint', mandatory: false, default: 5, allowed: nil, disallowed: nil} +casts << {long: 'overlap_min', short: 'o', type: 'uint', mandatory: false, default: 1, allowed: nil, disallowed: "0"} +casts << {long: 'overlap_max', short: 'p', type: 'uint', mandatory: false, default: nil, allowed: nil, disallowed: "0"} + +options = Biopieces.options_parse(ARGV, casts) -name1 = "" -name2 = "" -entry1 = "" -entry2 = "" +entry1 = nil +entry2 = nil Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| input.each_record do |record| if record[:SEQ_NAME] and record[:SEQ] and record[:SCORES] - entry = Seq.new_bp(record) - - case entry.seq_name - when /^(.+)\/(\d)$/ # Illumina 1.5 - name = $1 - pair = $2.to_i - type = 15 - when /^(.+) (\d):/ # Illumina 1.8 - name = $1 - pair = $2.to_i - type = 18 - else - $stderr.puts "WARNING: Unmatched sequence name: #{entry.seq_name}" - end - - if pair == 1 - name1 = name.dup - entry1 = entry.dup - else - name2 = name.dup - entry2 = entry.dup + if entry1.nil? + entry1 = Seq.new_bp(record) + elsif entry2.nil? + entry2 = Seq.new_bp(record) end - if name1 != nil and name1 == name2 - io_in1.puts entry1.to_fastq - io_in2.puts entry2.to_fastq + if entry1 and + entry2 and + entry1.length >= options[:overlap_min] and + entry2.length >= options[:overlap_min] + + if names_match(entry1, entry2) + entry2.type = :dna + entry2.reverse!.complement! + + merged = Assemble.pair( + entry1, + entry2, + mismatches_max: options[:mismatches], + overlap_min: options[:overlap_min], + overlap_max: options[:overlap_max] + ) + + if merged + new_record = merged.to_bp + + if merged.seq_name =~ /overlap=(\d+):hamming=(\d+)$/ + new_record[:OVERLAP_LEN] = $1 + new_record[:HAMMING_DIST] = $2 + end + + output.puts new_record + end + else + raise "name mismatch: #{entry1.seq_name} != #{entry2.seq_name}" + end + + entry1 = nil + entry2 = nil end - end - end - - io_in1.close - io_in2.close - - cmd = "pandaseq" - cmd << " -6" - cmd << " -F" - cmd << " -B" - cmd << " -N" - cmd << " -o #{options[:overlap]}" - cmd << " -f #{file_in1}" - cmd << " -r #{file_in2}" - cmd << " > #{file_out}" - cmd << " 2> /dev/null" unless options[:verbose] - - $stderr.puts cmd if options[:verbose] - - system(cmd) - - raise RunTimeError "Error: command failed: #{cmd}" unless $?.success? - - Fastq.open(file_out) do |ios| - ios.each do |entry| - output.puts entry.convert_scores!('illumina18', 'illumina15').to_bp + else + output.puts record end end end