]> git.donarmstrong.com Git - biopieces.git/commitdiff
added new biopiece remove_indel_columns
author martinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Wed, 7 Dec 2011 15:23:27 +0000 (15:23 +0000)
committer martinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Wed, 7 Dec 2011 15:23:27 +0000 (15:23 +0000)
git-svn-id: http://biopieces.googlecode.com/svn/trunk@1696 74ccb610-7750-0410-82ae-013aeee3265d

bp_bin/remove_indel_columns [new file with mode: 0755]
bp_test/in/remove_indel_columns.in [new file with mode: 0644]
bp_test/out/remove_indel_columns.out.1 [new file with mode: 0644]
bp_test/test/test_remove_indel_columns [new file with mode: 0755]

diff --git a/bp_bin/remove_indel_columns b/bp_bin/remove_indel_columns
new file mode 100755 (executable)
index 0000000..5de6c0b
--- /dev/null
@@ -0,0 +1,77 @@
+#!/usr/bin/env ruby
+
+# Copyright (C) 2007-2011 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# This program is part of the Biopieces framework (www.biopieces.org).
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# Remove columns with indels only from aligned sequences in the stream.
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+require 'maasha/biopieces'
+require 'narray'
+require 'pp'
+
+options      = Biopieces.options_parse(ARGV)
+tmpdir       = Biopieces.mktmpdir
+file_records = File.join(tmpdir, "data.stream")
+na_mask      = false
+count        = 0
+
+Biopieces.open(options[:stream_in], file_records) do |input, output|
+  input.each do |record|
+    if record.has_key? :SEQ
+      na_mask = NArray.int(record[:SEQ].length) unless na_mask
+      na_seq  = NArray.to_na(record[:SEQ], "byte")
+      na_mask += na_seq.eq('-'.ord) 
+      na_mask += na_seq.eq('.'.ord) 
+      na_mask += na_seq.eq('_'.ord) 
+      na_mask += na_seq.eq('~'.ord) 
+
+      count += 1
+    end
+
+    output.puts record
+  end
+end
+
+na_mask = na_mask.ne count
+
+sum = na_mask.sum
+
+Biopieces.open(file_records, options[:stream_out]) do |input, output|
+  input.each do |record|
+    if sum > 0 and record.has_key? :SEQ
+      na_seq           = NArray.to_na(record[:SEQ], "byte")
+      record[:SEQ]     = na_seq[na_mask].to_s
+      record[:SEQ_LEN] = record[:SEQ].length
+    end
+
+    output.puts record
+  end
+end
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+__END__
diff --git a/bp_test/in/remove_indel_columns.in b/bp_test/in/remove_indel_columns.in
new file mode 100644 (file)
index 0000000..1befbfc
--- /dev/null
@@ -0,0 +1,20 @@
+SEQ_NAME: test1
+SEQ: _cgta.cgta-ctacg~actcgtacg-
+SEQ_LEN: 27
+---
+SEQ_NAME: test2
+SEQ: _cgta.cgta-ctacg~actcgtacg-
+SEQ_LEN: 27
+---
+SEQ_NAME: test3
+SEQ: _cgta.cgta-ctacg~actcgtacg-
+SEQ_LEN: 27
+---
+SEQ_NAME: test4
+SEQ: _cgta.cgta-ctacg~actcgtacg-
+SEQ_LEN: 27
+---
+SEQ_NAME: test5
+SEQ: _cgta.cgtagctacg~actcgtacg-
+SEQ_LEN: 27
+---
diff --git a/bp_test/out/remove_indel_columns.out.1 b/bp_test/out/remove_indel_columns.out.1
new file mode 100644 (file)
index 0000000..db3a6ed
--- /dev/null
@@ -0,0 +1,20 @@
+SEQ_NAME: test1
+SEQ: cgtacgta-ctacgactcgtacg
+SEQ_LEN: 23
+---
+SEQ_NAME: test2
+SEQ: cgtacgta-ctacgactcgtacg
+SEQ_LEN: 23
+---
+SEQ_NAME: test3
+SEQ: cgtacgta-ctacgactcgtacg
+SEQ_LEN: 23
+---
+SEQ_NAME: test4
+SEQ: cgtacgta-ctacgactcgtacg
+SEQ_LEN: 23
+---
+SEQ_NAME: test5
+SEQ: cgtacgtagctacgactcgtacg
+SEQ_LEN: 23
+---
diff --git a/bp_test/test/test_remove_indel_columns b/bp_test/test/test_remove_indel_columns
new file mode 100755 (executable)
index 0000000..61c5709
--- /dev/null
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+source "$BP_DIR/bp_test/lib/test.sh"
+
+run "$bp -I $in -O $tmp"
+assert_no_diff $tmp $out.1
+clean