From: martinahansen Date: Thu, 20 Oct 2011 08:21:27 +0000 (+0000) Subject: ported trim_seq to ruby and added tests X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=fd0247fd7e58675684dcba60322249dd27337f45;p=biopieces.git ported trim_seq to ruby and added tests git-svn-id: http://biopieces.googlecode.com/svn/trunk@1591 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/bp_bin/trim_seq b/bp_bin/trim_seq index 2abaa86..58967e3 100755 --- a/bp_bin/trim_seq +++ b/bp_bin/trim_seq @@ -1,6 +1,6 @@ -#!/usr/bin/env perl +#!/usr/bin/env ruby -# Copyright (C) 2007-2010 Martin A. Hansen. +# Copyright (C) 2007-2011 Martin A. Hansen. # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License @@ -18,85 +18,67 @@ # http://www.gnu.org/copyleft/gpl.html - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - -# Trim sequence ends for residues with a low quality score. - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - -use warnings; -use strict; -use Maasha::Biopieces; -use Maasha::Fastq; - - # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +# This program is part of the Biopieces framework (www.biopieces.org). -my ( $options, $in, $out, $record, $left, $right, $pos_l, $pos_r ); - -$options = Maasha::Biopieces::parse_options( - [ - { long => 'min', short => 'm', type => 'uint', mandatory => 'no', default => 15, allowed => undef, disallowed => 0 }, - { long => 'trim', short => 't', type => 'string', mandatory => 'no', default => 'both', allowed => 'left,right,both', disallowed => undef }, - ] -); - -$right = 1 if $options->{ 'trim' } eq 'right'; -$left = 1 if $options->{ 'trim' } eq 'left'; -$right = 1 if $options->{ 'trim' } eq 'both'; -$left = 1 if $options->{ 'trim' } eq 'both'; - -$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } ); -$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } ); - -while ( $record = Maasha::Biopieces::get_record( $in ) ) -{ - if ( exists $record->{ 'SEQ' } and exists $record->{ 'SCORES' } ) - { - if ( $right ) - { - $pos_r = Maasha::Fastq::trim_right( $record->{ 'SCORES' }, $options->{ 'min' } ); - - $record->{ 'SEQ' } = substr $record->{ 'SEQ' }, 0, $pos_r + 1; - $record->{ 'SCORES' } = substr $record->{ 'SCORES' }, 0, $pos_r + 1; - $record->{ 'TRIM_RIGHT' } = $pos_r; - } - - if ( $left ) - { - $pos_l = Maasha::Fastq::trim_left( $record->{ 'SCORES' }, $options->{ 'min' } ); - - $record->{ 'SEQ' } = substr $record->{ 'SEQ' }, $pos_l; - $record->{ 'SCORES' } = substr $record->{ 'SCORES' }, $pos_l; - $record->{ 'TRIM_LEFT' } = $pos_l; - } - - $record->{ 'SEQ_LEN' } = length $record->{ 'SEQ' }; - } - - Maasha::Biopieces::put_record( $record, $out ); -} - -Maasha::Biopieces::close_stream( $in ); -Maasha::Biopieces::close_stream( $out ); +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +# Trim sequence ends for residues with a low quality score. # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - -BEGIN -{ - Maasha::Biopieces::status_set(); -} - - -END -{ - Maasha::Biopieces::status_log(); -} +require 'maasha/biopieces' +require 'pp' + +casts = [] +casts << {:long=>'min', :short=>'m', :type=>'uint', :mandatory=>true, :default=>15, :allowed=>nil, :disallowed=>'0'} +casts << {:long=>'trim', :short=>'t', :type=>'string', :mandatory=>true, :default=>'both', :allowed=>'left,right,both', :disallowed=>nil} + +options = Biopieces.options_parse(ARGV, casts) + +ILLUMINA_BASE = 64 + +regex_left = Regexp.new("^[#{(ILLUMINA_BASE).chr}-#{(ILLUMINA_BASE + options[:min]).chr}]+") +regex_right = Regexp.new("[#{(ILLUMINA_BASE).chr}-#{(ILLUMINA_BASE + options[:min]).chr}]+$") + +Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| + input.each_record do |record| + if record.has_key? :SEQ and record.has_key? :SCORES + case options[:trim] + when /both/ + record[:SCORES].match(regex_right) do |m| + record[:SEQ] = record[:SEQ][0 ... record[:SEQ].length - m.to_s.length] + record[:SCORES] = $` + record[:SEQ_LEN] = record[:SEQ].length + record[:TRIM_LEFT] = m.to_s.length + end + record[:SCORES].match(regex_left) do |m| + record[:SEQ] = record[:SEQ][m.to_s.length ... record[:SEQ].length] + record[:SCORES] = $' + record[:SEQ_LEN] = record[:SEQ].length + record[:TRIM_RIGHT] = $'.length - 1 + end + when /left/ + record[:SCORES].match(regex_left) do |m| + record[:SEQ] = record[:SEQ][m.to_s.length ... record[:SEQ].length] + record[:SCORES] = $' + record[:SEQ_LEN] = record[:SEQ].length + record[:TRIM_LEFT] = m.to_s.length + end + when /right/ + record[:SCORES].match(regex_right) do |m| + record[:SEQ] = record[:SEQ][0 ... record[:SEQ].length - m.to_s.length] + record[:SCORES] = $` + record[:SEQ_LEN] = record[:SEQ].length + record[:TRIM_RIGHT] = $`.length - 1 + end + end + end + + output.puts record + end +end # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< diff --git a/bp_test/in/trim_seq.in b/bp_test/in/trim_seq.in new file mode 100644 index 0000000..2f2c282 --- /dev/null +++ b/bp_test/in/trim_seq.in @@ -0,0 +1,5 @@ +SEQ_NAME: test +SEQ: TGGGCGGGCCGGGGCGGCGGTTGCAGCGGCGGCTACGGCTTTTCGGCATCGGCGGCGACGTTGGCGGCGGGGCCGGGCGGGT +SEQ_LEN: 82 +SCORES: @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDCBA@ +--- diff --git a/bp_test/out/trim_seq.out.1 b/bp_test/out/trim_seq.out.1 new file mode 100644 index 0000000..c891278 --- /dev/null +++ b/bp_test/out/trim_seq.out.1 @@ -0,0 +1,7 @@ +SEQ_NAME: test +SEQ: GCGGTTGCAGCGGCGGCTACGGCTTTTCGGCATCGGCGGCGACGTTGGCG +SEQ_LEN: 50 +SCORES: PQRSTUVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVUTSRQP +TRIM_LEFT: 16 +TRIM_RIGHT: 49 +--- diff --git a/bp_test/out/trim_seq.out.2 b/bp_test/out/trim_seq.out.2 new file mode 100644 index 0000000..746746f --- /dev/null +++ b/bp_test/out/trim_seq.out.2 @@ -0,0 +1,7 @@ +SEQ_NAME: test +SEQ: TGCAGCGGCGGCTACGGCTTTTCGGCATCGGCGGCGACGT +SEQ_LEN: 40 +SCORES: UVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVU +TRIM_LEFT: 21 +TRIM_RIGHT: 39 +--- diff --git a/bp_test/out/trim_seq.out.3 b/bp_test/out/trim_seq.out.3 new file mode 100644 index 0000000..c891278 --- /dev/null +++ b/bp_test/out/trim_seq.out.3 @@ -0,0 +1,7 @@ +SEQ_NAME: test +SEQ: GCGGTTGCAGCGGCGGCTACGGCTTTTCGGCATCGGCGGCGACGTTGGCG +SEQ_LEN: 50 +SCORES: PQRSTUVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVUTSRQP +TRIM_LEFT: 16 +TRIM_RIGHT: 49 +--- diff --git a/bp_test/out/trim_seq.out.4 b/bp_test/out/trim_seq.out.4 new file mode 100644 index 0000000..902e805 --- /dev/null +++ b/bp_test/out/trim_seq.out.4 @@ -0,0 +1,6 @@ +SEQ_NAME: test +SEQ: GCGGTTGCAGCGGCGGCTACGGCTTTTCGGCATCGGCGGCGACGTTGGCGGCGGGGCCGGGCGGGT +SEQ_LEN: 66 +SCORES: PQRSTUVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDCBA@ +TRIM_LEFT: 16 +--- diff --git a/bp_test/out/trim_seq.out.5 b/bp_test/out/trim_seq.out.5 new file mode 100644 index 0000000..0e5aa85 --- /dev/null +++ b/bp_test/out/trim_seq.out.5 @@ -0,0 +1,6 @@ +SEQ_NAME: test +SEQ: TGGGCGGGCCGGGGCGGCGGTTGCAGCGGCGGCTACGGCTTTTCGGCATCGGCGGCGACGTTGGCG +SEQ_LEN: 66 +SCORES: @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVUTSRQP +TRIM_RIGHT: 65 +--- diff --git a/bp_test/test/test_trim_seq b/bp_test/test/test_trim_seq new file mode 100755 index 0000000..232b987 --- /dev/null +++ b/bp_test/test/test_trim_seq @@ -0,0 +1,23 @@ +#!/bin/bash + +source "$BP_DIR/bp_test/lib/test.sh" + +run "$bp -I $in -O $tmp" +assert_no_diff $tmp $out.1 +clean + +run "$bp -I $in -m 20 -O $tmp" +assert_no_diff $tmp $out.2 +clean + +run "$bp -I $in -t both -O $tmp" +assert_no_diff $tmp $out.3 +clean + +run "$bp -I $in -t left -O $tmp" +assert_no_diff $tmp $out.4 +clean + +run "$bp -I $in -t right -O $tmp" +assert_no_diff $tmp $out.5 +clean