From: martinahansen Date: Thu, 23 Feb 2012 08:29:07 +0000 (+0000) Subject: added duplicate_record biopiece X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=789478ebeceefec32d6c222f8bfff19f1078076d;p=biopieces.git added duplicate_record biopiece git-svn-id: http://biopieces.googlecode.com/svn/trunk@1750 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/bp_bin/denoise_seq b/bp_bin/denoise_seq index ff557b4..5e67ab8 100755 --- a/bp_bin/denoise_seq +++ b/bp_bin/denoise_seq @@ -37,8 +37,11 @@ require 'maasha/align' require 'maasha/usearch' casts = [] -casts << {:long=>'identity', :short=>'i', :type=>'float', :mandatory=>true, :default=>0.97, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'cluster_min', :short=>'c', :type=>'uint', :mandatory=>true, :default=>2, :allowed=>nil, :disallowed=>"0"} +casts << {:long=>'identity', :short=>'i', :type=>'float', :mandatory=>true, :default=>0.97, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'cluster_min', :short=>'c', :type=>'uint', :mandatory=>true, :default=>2, :allowed=>nil, :disallowed=>"0"} +casts << {:long=>'frequency_min', :short=>'f', :type=>'uint', :mandatory=>true, :default=>2, :allowed=>nil, :disallowed=>"0"} +casts << {:long=>'quality_min', :short=>'q', :type=>'uint', :mandatory=>true, :default=>20, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'consensus_min', :short=>'C', :type=>'float', :mandatory=>true, :default=>0.2, :allowed=>nil, :disallowed=>nil} options = Biopieces.options_parse(ARGV, casts) tmpdir = Biopieces.mktmpdir @@ -95,6 +98,8 @@ Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| uc.each_alignment do |align| if align.members >= options[:cluster_min] + align.options = options + alignment_to_fastq(align.entries, index) cons = align.consensus diff --git a/bp_bin/duplicate_record b/bp_bin/duplicate_record new file mode 100755 index 0000000..af3d112 --- /dev/null +++ b/bp_bin/duplicate_record @@ -0,0 +1,54 @@ +#!/usr/bin/env ruby + +# Copyright (C) 2007-2012 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# This program is part of the Biopieces framework (www.biopieces.org). + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# Duplicate records in the stream using the value to a specified key. + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +require 'maasha/biopieces' + +casts = [] +casts << {:long=>'key', :short=>'k', :type=>'string', :mandatory=>true, :default=>nil, :allowed=>nil, :disallowed=>nil} + +options = Biopieces.options_parse(ARGV, casts) +key = options[:key].to_sym + +Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| + input.each_record do |record| + if record.has_key? key + (0 ... record[key].to_i).each { output.puts record } + else + output.puts record + end + end +end + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +__END__ diff --git a/bp_test/in/duplicate_record.in b/bp_test/in/duplicate_record.in new file mode 100644 index 0000000..df96f16 --- /dev/null +++ b/bp_test/in/duplicate_record.in @@ -0,0 +1,17 @@ +ID: test1 +COUNT: 0 +--- +ID: test2 +COUNT: 1 +--- +ID: test3 +COUNT: 2 +--- +ID: test4 +COUNT: -1 +--- +ID: test5 +COUNT: fisk +--- +ID: test6 +--- diff --git a/bp_test/out/duplicate_record.out.1 b/bp_test/out/duplicate_record.out.1 new file mode 100644 index 0000000..f1c6a0a --- /dev/null +++ b/bp_test/out/duplicate_record.out.1 @@ -0,0 +1,11 @@ +ID: test2 +COUNT: 1 +--- +ID: test3 +COUNT: 2 +--- +ID: test3 +COUNT: 2 +--- +ID: test6 +--- diff --git a/bp_test/test/test_duplicate_record b/bp_test/test/test_duplicate_record new file mode 100755 index 0000000..07fce62 --- /dev/null +++ b/bp_test/test/test_duplicate_record @@ -0,0 +1,7 @@ +#!/bin/bash + +source "$BP_DIR/bp_test/lib/test.sh" + +run "$bp -I $in -k COUNT -O $tmp" +assert_no_diff $tmp $out.1 +clean diff --git a/code_ruby/lib/maasha/align.rb b/code_ruby/lib/maasha/align.rb index f7f59f6..b758cd4 100755 --- a/code_ruby/lib/maasha/align.rb +++ b/code_ruby/lib/maasha/align.rb @@ -29,7 +29,7 @@ require 'maasha/fasta' class AlignError < StandardError; end; -CONSENSUS = 0.20 +CONS_MIN = 0.20 FREQ_MIN = 2 QUAL_MIN = 20 ALPH_DNA = %w{A T C G} @@ -91,7 +91,8 @@ end # Class for aligning sequences. class Align - attr_reader :entries + attr_accessor :options + attr_reader :entries # Class method to align sequences in a list of Seq objects and # return these as a new list of Seq objects. @@ -130,8 +131,9 @@ class Align end # Method to initialize an Align object with a list of aligned Seq objects. - def initialize(entries) + def initialize(entries, options = {}) @entries = entries + @options = options end # Method that returns the length of the alignment. @@ -147,19 +149,21 @@ class Align # Method to create a consensus sequence from an Align object and # return a new Seq object with the consensus. def consensus - cons = Consensus.new(@entries) + cons = Consensus.new(@entries, @options) cons.to_seq end # Method to pretty print an alignment from an Align object. def to_s - cons = self.consensus - cons.mask_seq_soft!(QUAL_MIN) unless cons.qual.nil? + qual_min = @options[:quality_min] || QUAL_MIN + + cons = self.consensus + cons.mask_seq_soft!(qual_min) unless cons.qual.nil? max_name = @entries.group_by { |entry| entry.seq_name.length }.max.first @entries.each do |entry| - entry.mask_seq_soft!(QUAL_MIN) unless entry.qual.nil? + entry.mask_seq_soft!(qual_min) unless entry.qual.nil? puts entry.seq_name + (" " * (max_name + 3 - entry.seq_name.length )) + entry.seq end @@ -173,10 +177,16 @@ class Align class Consensus # Method to initialize a Consensus object given a list of aligned Seq object. - def initialize(entries) + def initialize(entries, options) @entries = entries - @cols = entries.first.length - @rows = entries.size + @options = options + + @cons_min = options[:consensus_min] || CONS_MIN + @freq_min = options[:frequency_min] || FREQ_MIN + @qual_min = options[:quality_min] || QUAL_MIN + + @cols = entries.first.length + @rows = entries.size @has_qual = entries.first.qual.nil? ? false : true @@ -217,7 +227,7 @@ class Align end def mask_high_qual - @na_qual > QUAL_MIN + @na_qual > @qual_min end def mask_conserved_columns @@ -231,10 +241,10 @@ class Align mask_C = @na_seq & BIT_C > 0 mask_G = @na_seq & BIT_G > 0 - mask_A = (mask_A * mask_A.sum(1)).to_f * factor > CONSENSUS - mask_T = (mask_T * mask_T.sum(1)).to_f * factor > CONSENSUS - mask_C = (mask_C * mask_C.sum(1)).to_f * factor > CONSENSUS - mask_G = (mask_G * mask_G.sum(1)).to_f * factor > CONSENSUS + mask_A = (mask_A * mask_A.sum(1)).to_f * factor > @cons_min + mask_T = (mask_T * mask_T.sum(1)).to_f * factor > @cons_min + mask_C = (mask_C * mask_C.sum(1)).to_f * factor > @cons_min + mask_G = (mask_G * mask_G.sum(1)).to_f * factor > @cons_min mask_A | mask_T | mask_C | mask_G end