From: martinahansen
Date: Wed, 7 Dec 2011 15:23:27 +0000 (+0000)
Subject: added new biopeice remove_indel_columns
X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=ec113879cc16441646207c6caddafa917d2d0737;p=biopieces.git

added new biopeice remove_indel_columns

git-svn-id: http://biopieces.googlecode.com/svn/trunk@1696 74ccb610-7750-0410-82ae-013aeee3265d
---

diff --git a/bp_bin/remove_indel_columns b/bp_bin/remove_indel_columns
new file mode 100755
index 0000000..5de6c0b
--- /dev/null
+++ b/bp_bin/remove_indel_columns
@@ -0,0 +1,88 @@
+#!/usr/bin/env ruby
+
+# Copyright (C) 2007-2011 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# This program is part of the Biopieces framework (www.biopieces.org).
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# Remove columns with indels only from aligned sequences in the stream.
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+require 'maasha/biopieces'
+require 'narray'
+require 'pp'
+
+options      = Biopieces.options_parse(ARGV)
+tmpdir       = Biopieces.mktmpdir
+file_records = File.join(tmpdir, "data.stream")
+na_mask      = false
+count        = 0
+
+# Pass 1: copy every record to a temporary file while counting, per column,
+# how many sequences hold an indel character (- . _ ~) at that position.
+# Assumes every sequence has the same length (aligned input) - TODO confirm.
+Biopieces.open(options[:stream_in], file_records) do |input, output|
+  input.each do |record|
+    if record.has_key? :SEQ
+      na_mask = NArray.int(record[:SEQ].length) unless na_mask
+      na_seq  = NArray.to_na(record[:SEQ], "byte")
+      na_mask += na_seq.eq('-'.ord)
+      na_mask += na_seq.eq('.'.ord)
+      na_mask += na_seq.eq('_'.ord)
+      na_mask += na_seq.eq('~'.ord)
+
+      count += 1
+    end
+
+    output.puts record
+  end
+end
+
+# A column is kept (mask value 1) unless every sequence had an indel in it.
+# If the stream held no sequences na_mask is still false, so skip filtering
+# instead of calling NArray methods on false.
+if na_mask
+  na_mask = na_mask.ne count
+  sum     = na_mask.sum
+else
+  sum = 0
+end
+
+# Pass 2: re-read the temporary file and drop the indel-only columns. When no
+# column is kept (sum == 0) the sequences are passed through unchanged.
+Biopieces.open(file_records, options[:stream_out]) do |input, output|
+  input.each do |record|
+    if sum > 0 && record.has_key?(:SEQ)
+      na_seq           = NArray.to_na(record[:SEQ], "byte")
+      record[:SEQ]     = na_seq[na_mask].to_s
+      record[:SEQ_LEN] = record[:SEQ].length
+    end
+
+    output.puts record
+  end
+end
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+__END__
diff --git a/bp_test/in/remove_indel_columns.in b/bp_test/in/remove_indel_columns.in
new file mode 100644
index 0000000..1befbfc
--- /dev/null
+++ b/bp_test/in/remove_indel_columns.in
@@ -0,0 +1,20 @@
+SEQ_NAME: test1
+SEQ: _cgta.cgta-ctacg~actcgtacg-
+SEQ_LEN: 27
+---
+SEQ_NAME: test2
+SEQ: _cgta.cgta-ctacg~actcgtacg-
+SEQ_LEN: 27
+---
+SEQ_NAME: test3
+SEQ: _cgta.cgta-ctacg~actcgtacg-
+SEQ_LEN: 27
+---
+SEQ_NAME: test4
+SEQ: _cgta.cgta-ctacg~actcgtacg-
+SEQ_LEN: 27
+---
+SEQ_NAME: test5
+SEQ: _cgta.cgtagctacg~actcgtacg-
+SEQ_LEN: 27
+---
diff --git a/bp_test/out/remove_indel_columns.out.1 b/bp_test/out/remove_indel_columns.out.1
new file mode 100644
index 0000000..db3a6ed
--- /dev/null
+++ b/bp_test/out/remove_indel_columns.out.1
@@ -0,0 +1,20 @@
+SEQ_NAME: test1
+SEQ: cgtacgta-ctacgactcgtacg
+SEQ_LEN: 23
+---
+SEQ_NAME: test2
+SEQ: cgtacgta-ctacgactcgtacg
+SEQ_LEN: 23
+---
+SEQ_NAME: test3
+SEQ: cgtacgta-ctacgactcgtacg
+SEQ_LEN: 23
+---
+SEQ_NAME: test4
+SEQ: cgtacgta-ctacgactcgtacg
+SEQ_LEN: 23
+---
+SEQ_NAME: test5
+SEQ: cgtacgtagctacgactcgtacg
+SEQ_LEN: 23
+---
diff --git a/bp_test/test/test_remove_indel_columns b/bp_test/test/test_remove_indel_columns
new file mode 100755
index 0000000..61c5709
--- /dev/null
+++ b/bp_test/test/test_remove_indel_columns
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+source "$BP_DIR/bp_test/lib/test.sh"
+
+run "$bp -I $in -O $tmp"
+assert_no_diff $tmp $out.1
+clean