]> git.donarmstrong.com Git - biopieces.git/commitdiff
ported trim_seq to ruby and added tests
authormartinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Thu, 20 Oct 2011 08:21:27 +0000 (08:21 +0000)
committermartinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Thu, 20 Oct 2011 08:21:27 +0000 (08:21 +0000)
git-svn-id: http://biopieces.googlecode.com/svn/trunk@1591 74ccb610-7750-0410-82ae-013aeee3265d

bp_bin/trim_seq
bp_test/in/trim_seq.in [new file with mode: 0644]
bp_test/out/trim_seq.out.1 [new file with mode: 0644]
bp_test/out/trim_seq.out.2 [new file with mode: 0644]
bp_test/out/trim_seq.out.3 [new file with mode: 0644]
bp_test/out/trim_seq.out.4 [new file with mode: 0644]
bp_test/out/trim_seq.out.5 [new file with mode: 0644]
bp_test/test/test_trim_seq [new file with mode: 0755]

index 2abaa864a3b3b0aeefbb3a62e44e7e4a9cdf50d7..58967e34b666e4522ba5808be6c2cd521ea8c5a0 100755 (executable)
@@ -1,6 +1,6 @@
-#!/usr/bin/env perl
+#!/usr/bin/env ruby
 
-# Copyright (C) 2007-2010 Martin A. Hansen.
+# Copyright (C) 2007-2011 Martin A. Hansen.
 
 # This program is free software; you can redistribute it and/or
 # modify it under the terms of the GNU General Public License
 
 # http://www.gnu.org/copyleft/gpl.html
 
-
-# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-
-# Trim sequence ends for residues with a low quality score.
-
-# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-
-
-use warnings;
-use strict;
-use Maasha::Biopieces;
-use Maasha::Fastq;
-
-
 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 
+# This program is part of the Biopieces framework (www.biopieces.org).
 
-my ( $options, $in, $out, $record, $left, $right, $pos_l, $pos_r );
-
-$options = Maasha::Biopieces::parse_options(
-    [
-        { long => 'min',  short => 'm', type => 'uint',   mandatory => 'no', default => 15,     allowed => undef,             disallowed => 0 },
-        { long => 'trim', short => 't', type => 'string', mandatory => 'no', default => 'both', allowed => 'left,right,both', disallowed => undef },
-    ]   
-);
-
-$right = 1 if $options->{ 'trim' } eq 'right';
-$left  = 1 if $options->{ 'trim' } eq 'left';
-$right = 1 if $options->{ 'trim' } eq 'both';
-$left  = 1 if $options->{ 'trim' } eq 'both';
-
-$in  = Maasha::Biopieces::read_stream( $options->{ "stream_in" } );
-$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } );
-
-while ( $record = Maasha::Biopieces::get_record( $in ) )
-{
-    if ( exists $record->{ 'SEQ' } and exists $record->{ 'SCORES' } )
-    {
-        if ( $right )
-        {
-            $pos_r = Maasha::Fastq::trim_right( $record->{ 'SCORES' }, $options->{ 'min' } );
-        
-            $record->{ 'SEQ' }        = substr $record->{ 'SEQ' }, 0, $pos_r + 1;
-            $record->{ 'SCORES' }     = substr $record->{ 'SCORES' }, 0, $pos_r + 1; 
-            $record->{ 'TRIM_RIGHT' } = $pos_r;
-        }
-
-        if ( $left )
-        {
-            $pos_l = Maasha::Fastq::trim_left( $record->{ 'SCORES' }, $options->{ 'min' } );
-        
-            $record->{ 'SEQ' }      = substr $record->{ 'SEQ' }, $pos_l;
-            $record->{ 'SCORES' }   = substr $record->{ 'SCORES' }, $pos_l;
-            $record->{ 'TRIM_LEFT' } = $pos_l;
-        }
-
-        $record->{ 'SEQ_LEN' } = length $record->{ 'SEQ' };
-    }
-
-    Maasha::Biopieces::put_record( $record, $out );
-}
-
-Maasha::Biopieces::close_stream( $in );
-Maasha::Biopieces::close_stream( $out );
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 
+# Trim sequence ends for residues with a low quality score.
 
 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 
-
-BEGIN
-{
-    Maasha::Biopieces::status_set();
-}
-
-
-END
-{
-    Maasha::Biopieces::status_log();
-}
+require 'maasha/biopieces'
+require 'pp'
+
+casts = []
+casts << {:long=>'min',  :short=>'m', :type=>'uint',   :mandatory=>true, :default=>15,     :allowed=>nil, :disallowed=>'0'}
+casts << {:long=>'trim', :short=>'t', :type=>'string', :mandatory=>true, :default=>'both', :allowed=>'left,right,both', :disallowed=>nil}
+
+options = Biopieces.options_parse(ARGV, casts)
+
+ILLUMINA_BASE = 64
+
+regex_left  = Regexp.new("^[#{(ILLUMINA_BASE).chr}-#{(ILLUMINA_BASE + options[:min]).chr}]+")
+regex_right = Regexp.new("[#{(ILLUMINA_BASE).chr}-#{(ILLUMINA_BASE + options[:min]).chr}]+$")
+
+Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output|
+  input.each_record do |record|
+    if record.has_key? :SEQ and record.has_key? :SCORES
+      case options[:trim]
+      when /both/
+        record[:SCORES].match(regex_right) do |m|
+          record[:SEQ]       = record[:SEQ][0 ... record[:SEQ].length - m.to_s.length]
+          record[:SCORES]    = $`
+          record[:SEQ_LEN]   = record[:SEQ].length
+          record[:TRIM_LEFT] = m.to_s.length
+        end
+        record[:SCORES].match(regex_left) do |m|
+          record[:SEQ]        = record[:SEQ][m.to_s.length ... record[:SEQ].length]
+          record[:SCORES]     = $'
+          record[:SEQ_LEN]    = record[:SEQ].length
+          record[:TRIM_RIGHT] = $'.length - 1
+        end
+      when /left/
+        record[:SCORES].match(regex_left) do |m|
+          record[:SEQ]       = record[:SEQ][m.to_s.length ... record[:SEQ].length]
+          record[:SCORES]    = $'
+          record[:SEQ_LEN]   = record[:SEQ].length
+          record[:TRIM_LEFT] = m.to_s.length
+        end
+      when /right/
+        record[:SCORES].match(regex_right) do |m|
+          record[:SEQ]        = record[:SEQ][0 ... record[:SEQ].length - m.to_s.length]
+          record[:SCORES]     = $`
+          record[:SEQ_LEN]    = record[:SEQ].length
+          record[:TRIM_RIGHT] = $`.length - 1
+        end
+      end
+    end
+
+    output.puts record
+  end
+end
 
 
 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
diff --git a/bp_test/in/trim_seq.in b/bp_test/in/trim_seq.in
new file mode 100644 (file)
index 0000000..2f2c282
--- /dev/null
@@ -0,0 +1,5 @@
+SEQ_NAME: test
+SEQ: TGGGCGGGCCGGGGCGGCGGTTGCAGCGGCGGCTACGGCTTTTCGGCATCGGCGGCGACGTTGGCGGCGGGGCCGGGCGGGT
+SEQ_LEN: 82
+SCORES: @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDCBA@
+---
diff --git a/bp_test/out/trim_seq.out.1 b/bp_test/out/trim_seq.out.1
new file mode 100644 (file)
index 0000000..c891278
--- /dev/null
@@ -0,0 +1,7 @@
+SEQ_NAME: test
+SEQ: GCGGTTGCAGCGGCGGCTACGGCTTTTCGGCATCGGCGGCGACGTTGGCG
+SEQ_LEN: 50
+SCORES: PQRSTUVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVUTSRQP
+TRIM_LEFT: 16
+TRIM_RIGHT: 49
+---
diff --git a/bp_test/out/trim_seq.out.2 b/bp_test/out/trim_seq.out.2
new file mode 100644 (file)
index 0000000..746746f
--- /dev/null
@@ -0,0 +1,7 @@
+SEQ_NAME: test
+SEQ: TGCAGCGGCGGCTACGGCTTTTCGGCATCGGCGGCGACGT
+SEQ_LEN: 40
+SCORES: UVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVU
+TRIM_LEFT: 21
+TRIM_RIGHT: 39
+---
diff --git a/bp_test/out/trim_seq.out.3 b/bp_test/out/trim_seq.out.3
new file mode 100644 (file)
index 0000000..c891278
--- /dev/null
@@ -0,0 +1,7 @@
+SEQ_NAME: test
+SEQ: GCGGTTGCAGCGGCGGCTACGGCTTTTCGGCATCGGCGGCGACGTTGGCG
+SEQ_LEN: 50
+SCORES: PQRSTUVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVUTSRQP
+TRIM_LEFT: 16
+TRIM_RIGHT: 49
+---
diff --git a/bp_test/out/trim_seq.out.4 b/bp_test/out/trim_seq.out.4
new file mode 100644 (file)
index 0000000..902e805
--- /dev/null
@@ -0,0 +1,6 @@
+SEQ_NAME: test
+SEQ: GCGGTTGCAGCGGCGGCTACGGCTTTTCGGCATCGGCGGCGACGTTGGCGGCGGGGCCGGGCGGGT
+SEQ_LEN: 66
+SCORES: PQRSTUVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDCBA@
+TRIM_LEFT: 16
+---
diff --git a/bp_test/out/trim_seq.out.5 b/bp_test/out/trim_seq.out.5
new file mode 100644 (file)
index 0000000..0e5aa85
--- /dev/null
@@ -0,0 +1,6 @@
+SEQ_NAME: test
+SEQ: TGGGCGGGCCGGGGCGGCGGTTGCAGCGGCGGCTACGGCTTTTCGGCATCGGCGGCGACGTTGGCG
+SEQ_LEN: 66
+SCORES: @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVUTSRQP
+TRIM_RIGHT: 65
+---
diff --git a/bp_test/test/test_trim_seq b/bp_test/test/test_trim_seq
new file mode 100755 (executable)
index 0000000..232b987
--- /dev/null
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+source "$BP_DIR/bp_test/lib/test.sh"
+
+run "$bp -I $in -O $tmp"
+assert_no_diff $tmp $out.1
+clean
+
+run "$bp -I $in -m 20 -O $tmp"
+assert_no_diff $tmp $out.2
+clean
+
+run "$bp -I $in -t both -O $tmp"
+assert_no_diff $tmp $out.3
+clean
+
+run "$bp -I $in -t left -O $tmp"
+assert_no_diff $tmp $out.4
+clean
+
+run "$bp -I $in -t right -O $tmp"
+assert_no_diff $tmp $out.5
+clean