From: martinahansen Date: Thu, 24 May 2012 09:00:48 +0000 (+0000) Subject: upgraded trim_seq X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=1db2127fa0928ad1531fc7f60d2dc1d497ee55b1;p=biopieces.git upgraded trim_seq git-svn-id: http://biopieces.googlecode.com/svn/trunk@1823 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/bp_bin/trim_seq b/bp_bin/trim_seq index 58967e3..d6c1a09 100755 --- a/bp_bin/trim_seq +++ b/bp_bin/trim_seq @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -# Copyright (C) 2007-2011 Martin A. Hansen. +# Copyright (C) 2007-2012 Martin A. Hansen. # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License @@ -24,56 +24,33 @@ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -# Trim sequence ends for residues with a low quality score. +# Trim sequence ends removing residues with a low quality score. # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< require 'maasha/biopieces' +require 'maasha/seq' require 'pp' casts = [] -casts << {:long=>'min', :short=>'m', :type=>'uint', :mandatory=>true, :default=>15, :allowed=>nil, :disallowed=>'0'} -casts << {:long=>'trim', :short=>'t', :type=>'string', :mandatory=>true, :default=>'both', :allowed=>'left,right,both', :disallowed=>nil} +casts << {:long=>'min_qual', :short=>'m', :type=>'uint', :mandatory=>true, :default=>20, :allowed=>nil, :disallowed=>'0'} +casts << {:long=>'min_len', :short=>'l', :type=>'uint', :mandatory=>true, :default=>3, :allowed=>nil, :disallowed=>'0'} +casts << {:long=>'trim', :short=>'t', :type=>'string', :mandatory=>true, :default=>'both', :allowed=>'left,right,both', :disallowed=>nil} options = Biopieces.options_parse(ARGV, casts) -ILLUMINA_BASE = 64 - -regex_left = Regexp.new("^[#{(ILLUMINA_BASE).chr}-#{(ILLUMINA_BASE + options[:min]).chr}]+") -regex_right = Regexp.new("[#{(ILLUMINA_BASE).chr}-#{(ILLUMINA_BASE + options[:min]).chr}]+$") - Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| input.each_record do |record| if record.has_key? :SEQ and record.has_key? :SCORES + entry = Seq.new_bp(record) + case options[:trim] - when /both/ - record[:SCORES].match(regex_right) do |m| - record[:SEQ] = record[:SEQ][0 ... record[:SEQ].length - m.to_s.length] - record[:SCORES] = $` - record[:SEQ_LEN] = record[:SEQ].length - record[:TRIM_LEFT] = m.to_s.length - end - record[:SCORES].match(regex_left) do |m| - record[:SEQ] = record[:SEQ][m.to_s.length ... record[:SEQ].length] - record[:SCORES] = $' - record[:SEQ_LEN] = record[:SEQ].length - record[:TRIM_RIGHT] = $'.length - 1 - end - when /left/ - record[:SCORES].match(regex_left) do |m| - record[:SEQ] = record[:SEQ][m.to_s.length ... record[:SEQ].length] - record[:SCORES] = $' - record[:SEQ_LEN] = record[:SEQ].length - record[:TRIM_LEFT] = m.to_s.length - end - when /right/ - record[:SCORES].match(regex_right) do |m| - record[:SEQ] = record[:SEQ][0 ... record[:SEQ].length - m.to_s.length] - record[:SCORES] = $` - record[:SEQ_LEN] = record[:SEQ].length - record[:TRIM_RIGHT] = $`.length - 1 - end + when /both/ then entry.quality_trim!(options[:min_qual], options[:min_len]) + when /left/ then entry.quality_trim_left!(options[:min_qual], options[:min_len]) + when /right/ then entry.quality_trim_right!(options[:min_qual], options[:min_len]) end + + record.merge! entry.to_bp end output.puts record diff --git a/bp_test/in/trim_seq.in b/bp_test/in/trim_seq.in index 2f2c282..6fb0b4e 100644 --- a/bp_test/in/trim_seq.in +++ b/bp_test/in/trim_seq.in @@ -1,5 +1,5 @@ SEQ_NAME: test -SEQ: TGGGCGGGCCGGGGCGGCGGTTGCAGCGGCGGCTACGGCTTTTCGGCATCGGCGGCGACGTTGGCGGCGGGGCCGGGCGGGT +SEQ: gatcgatcgtacgagcagcatctgacgtatcgatcgttgattagttgctagctatgcagtctacgacgagcatgctagctag SEQ_LEN: 82 -SCORES: @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDCBA@ +SCORES: @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDChhh --- diff --git a/bp_test/out/trim_seq.out.1 b/bp_test/out/trim_seq.out.1 index c891278..2c45ca6 100644 --- a/bp_test/out/trim_seq.out.1 +++ b/bp_test/out/trim_seq.out.1 @@ -1,7 +1,5 @@ SEQ_NAME: test -SEQ: GCGGTTGCAGCGGCGGCTACGGCTTTTCGGCATCGGCGGCGACGTTGGCG -SEQ_LEN: 50 -SCORES: PQRSTUVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVUTSRQP -TRIM_LEFT: 16 -TRIM_RIGHT: 49 +SEQ: ctgacgtatcgatcgttgattagttgctagctatgcagtctacgacgagcatgctagctag +SEQ_LEN: 61 +SCORES: UVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDChhh --- diff --git a/bp_test/out/trim_seq.out.2 b/bp_test/out/trim_seq.out.2 index 746746f..77b064f 100644 --- a/bp_test/out/trim_seq.out.2 +++ b/bp_test/out/trim_seq.out.2 @@ -1,7 +1,5 @@ SEQ_NAME: test -SEQ: TGCAGCGGCGGCTACGGCTTTTCGGCATCGGCGGCGACGT -SEQ_LEN: 40 -SCORES: UVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVU -TRIM_LEFT: 21 -TRIM_RIGHT: 39 +SEQ: gtatcgatcgttgattagttgctagctatgcagtctacgacgagcatgctagctag +SEQ_LEN: 56 +SCORES: Z[\]^_`abcdefghhgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDChhh --- diff --git a/bp_test/out/trim_seq.out.3 b/bp_test/out/trim_seq.out.3 index c891278..2c45ca6 100644 --- a/bp_test/out/trim_seq.out.3 +++ b/bp_test/out/trim_seq.out.3 @@ -1,7 +1,5 @@ SEQ_NAME: test -SEQ: GCGGTTGCAGCGGCGGCTACGGCTTTTCGGCATCGGCGGCGACGTTGGCG -SEQ_LEN: 50 -SCORES: PQRSTUVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVUTSRQP -TRIM_LEFT: 16 -TRIM_RIGHT: 49 +SEQ: ctgacgtatcgatcgttgattagttgctagctatgcagtctacgacgagcatgctagctag +SEQ_LEN: 61 +SCORES: UVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDChhh --- diff --git a/bp_test/out/trim_seq.out.4 b/bp_test/out/trim_seq.out.4 index 902e805..2c45ca6 100644 --- a/bp_test/out/trim_seq.out.4 +++ b/bp_test/out/trim_seq.out.4 @@ -1,6 +1,5 @@ SEQ_NAME: test -SEQ: GCGGTTGCAGCGGCGGCTACGGCTTTTCGGCATCGGCGGCGACGTTGGCGGCGGGGCCGGGCGGGT -SEQ_LEN: 66 -SCORES: PQRSTUVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDCBA@ -TRIM_LEFT: 16 +SEQ: ctgacgtatcgatcgttgattagttgctagctatgcagtctacgacgagcatgctagctag +SEQ_LEN: 61 +SCORES: UVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDChhh --- diff --git a/bp_test/out/trim_seq.out.5 b/bp_test/out/trim_seq.out.5 index 0e5aa85..6fb0b4e 100644 --- a/bp_test/out/trim_seq.out.5 +++ b/bp_test/out/trim_seq.out.5 @@ -1,6 +1,5 @@ SEQ_NAME: test -SEQ: TGGGCGGGCCGGGGCGGCGGTTGCAGCGGCGGCTACGGCTTTTCGGCATCGGCGGCGACGTTGGCG -SEQ_LEN: 66 -SCORES: @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVUTSRQP -TRIM_RIGHT: 65 +SEQ: gatcgatcgtacgagcagcatctgacgtatcgatcgttgattagttgctagctatgcagtctacgacgagcatgctagctag +SEQ_LEN: 82 +SCORES: @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDChhh --- diff --git a/bp_test/out/trim_seq.out.6 b/bp_test/out/trim_seq.out.6 new file mode 100644 index 0000000..53de4b0 --- /dev/null +++ b/bp_test/out/trim_seq.out.6 @@ -0,0 +1,5 @@ +SEQ_NAME: test +SEQ: ctgacgtatcgatcgttgattagttgctagctatgcagtc +SEQ_LEN: 40 +SCORES: UVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVU +--- diff --git a/bp_test/test/test_trim_seq b/bp_test/test/test_trim_seq index 232b987..b538796 100755 --- a/bp_test/test/test_trim_seq +++ b/bp_test/test/test_trim_seq @@ -6,7 +6,7 @@ run "$bp -I $in -O $tmp" assert_no_diff $tmp $out.1 clean -run "$bp -I $in -m 20 -O $tmp" +run "$bp -I $in -m 25 -O $tmp" assert_no_diff $tmp $out.2 clean @@ -21,3 +21,7 @@ clean run "$bp -I $in -t right -O $tmp" assert_no_diff $tmp $out.5 clean + +run "$bp -I $in -l 4 -O $tmp" +assert_no_diff $tmp $out.6 +clean