git.donarmstrong.com Git - biopieces.git/commitdiff
upgraded trim_seq
author martinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Thu, 24 May 2012 09:00:48 +0000 (09:00 +0000)
committer martinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Thu, 24 May 2012 09:00:48 +0000 (09:00 +0000)
git-svn-id: http://biopieces.googlecode.com/svn/trunk@1823 74ccb610-7750-0410-82ae-013aeee3265d

bp_bin/trim_seq
bp_test/in/trim_seq.in
bp_test/out/trim_seq.out.1
bp_test/out/trim_seq.out.2
bp_test/out/trim_seq.out.3
bp_test/out/trim_seq.out.4
bp_test/out/trim_seq.out.5
bp_test/out/trim_seq.out.6 [new file with mode: 0644]
bp_test/test/test_trim_seq

index 58967e34b666e4522ba5808be6c2cd521ea8c5a0..d6c1a0933dfe8a1dce77403c81414184d2f382bf 100755 (executable)
@@ -1,6 +1,6 @@
 #!/usr/bin/env ruby
 
-# Copyright (C) 2007-2011 Martin A. Hansen.
+# Copyright (C) 2007-2012 Martin A. Hansen.
 
 # This program is free software; you can redistribute it and/or
 # modify it under the terms of the GNU General Public License
 
 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 
-# Trim sequence ends for residues with a low quality score.
+# Trim sequence ends removing residues with a low quality score.
 
 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 
 require 'maasha/biopieces'
+require 'maasha/seq'
 require 'pp'
 
 casts = []
-casts << {:long=>'min',  :short=>'m', :type=>'uint',   :mandatory=>true, :default=>15,     :allowed=>nil, :disallowed=>'0'}
-casts << {:long=>'trim', :short=>'t', :type=>'string', :mandatory=>true, :default=>'both', :allowed=>'left,right,both', :disallowed=>nil}
+casts << {:long=>'min_qual', :short=>'m', :type=>'uint',   :mandatory=>true, :default=>20,     :allowed=>nil,               :disallowed=>'0'}
+casts << {:long=>'min_len',  :short=>'l', :type=>'uint',   :mandatory=>true, :default=>3,      :allowed=>nil,               :disallowed=>'0'}
+casts << {:long=>'trim',     :short=>'t', :type=>'string', :mandatory=>true, :default=>'both', :allowed=>'left,right,both', :disallowed=>nil}
 
 options = Biopieces.options_parse(ARGV, casts)
 
-ILLUMINA_BASE = 64
-
-regex_left  = Regexp.new("^[#{(ILLUMINA_BASE).chr}-#{(ILLUMINA_BASE + options[:min]).chr}]+")
-regex_right = Regexp.new("[#{(ILLUMINA_BASE).chr}-#{(ILLUMINA_BASE + options[:min]).chr}]+$")
-
 Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output|
   input.each_record do |record|
     if record.has_key? :SEQ and record.has_key? :SCORES
+      entry = Seq.new_bp(record)
+
       case options[:trim]
-      when /both/
-        record[:SCORES].match(regex_right) do |m|
-          record[:SEQ]       = record[:SEQ][0 ... record[:SEQ].length - m.to_s.length]
-          record[:SCORES]    = $`
-          record[:SEQ_LEN]   = record[:SEQ].length
-          record[:TRIM_LEFT] = m.to_s.length
-        end
-        record[:SCORES].match(regex_left) do |m|
-          record[:SEQ]        = record[:SEQ][m.to_s.length ... record[:SEQ].length]
-          record[:SCORES]     = $'
-          record[:SEQ_LEN]    = record[:SEQ].length
-          record[:TRIM_RIGHT] = $'.length - 1
-        end
-      when /left/
-        record[:SCORES].match(regex_left) do |m|
-          record[:SEQ]       = record[:SEQ][m.to_s.length ... record[:SEQ].length]
-          record[:SCORES]    = $'
-          record[:SEQ_LEN]   = record[:SEQ].length
-          record[:TRIM_LEFT] = m.to_s.length
-        end
-      when /right/
-        record[:SCORES].match(regex_right) do |m|
-          record[:SEQ]        = record[:SEQ][0 ... record[:SEQ].length - m.to_s.length]
-          record[:SCORES]     = $`
-          record[:SEQ_LEN]    = record[:SEQ].length
-          record[:TRIM_RIGHT] = $`.length - 1
-        end
+      when /both/  then entry.quality_trim!(options[:min_qual], options[:min_len])
+      when /left/  then entry.quality_trim_left!(options[:min_qual], options[:min_len])
+      when /right/ then entry.quality_trim_right!(options[:min_qual], options[:min_len])
       end
+
+      record.merge! entry.to_bp
     end
 
     output.puts record
index 2f2c28220241ee0f7d311ddb19fe808a2d911a73..6fb0b4e57b8c200cdf39e1f0226ab6ead8d474c8 100644 (file)
@@ -1,5 +1,5 @@
 SEQ_NAME: test
-SEQ: TGGGCGGGCCGGGGCGGCGGTTGCAGCGGCGGCTACGGCTTTTCGGCATCGGCGGCGACGTTGGCGGCGGGGCCGGGCGGGT
+SEQ: gatcgatcgtacgagcagcatctgacgtatcgatcgttgattagttgctagctatgcagtctacgacgagcatgctagctag
 SEQ_LEN: 82
-SCORES: @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDCBA@
+SCORES: @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDChhh
 ---
index c891278030312c2d100ce8384c05dd0e41f5029a..2c45ca62462d7502dfecb6c36a3c09bd637dff52 100644 (file)
@@ -1,7 +1,5 @@
 SEQ_NAME: test
-SEQ: GCGGTTGCAGCGGCGGCTACGGCTTTTCGGCATCGGCGGCGACGTTGGCG
-SEQ_LEN: 50
-SCORES: PQRSTUVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVUTSRQP
-TRIM_LEFT: 16
-TRIM_RIGHT: 49
+SEQ: ctgacgtatcgatcgttgattagttgctagctatgcagtctacgacgagcatgctagctag
+SEQ_LEN: 61
+SCORES: UVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDChhh
 ---
index 746746f178aff514d37ad8bc5ec23964d6e7b13f..77b064ffb01cc1c8c3c97516eb442edc7da0b753 100644 (file)
@@ -1,7 +1,5 @@
 SEQ_NAME: test
-SEQ: TGCAGCGGCGGCTACGGCTTTTCGGCATCGGCGGCGACGT
-SEQ_LEN: 40
-SCORES: UVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVU
-TRIM_LEFT: 21
-TRIM_RIGHT: 39
+SEQ: gtatcgatcgttgattagttgctagctatgcagtctacgacgagcatgctagctag
+SEQ_LEN: 56
+SCORES: Z[\]^_`abcdefghhgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDChhh
 ---
index c891278030312c2d100ce8384c05dd0e41f5029a..2c45ca62462d7502dfecb6c36a3c09bd637dff52 100644 (file)
@@ -1,7 +1,5 @@
 SEQ_NAME: test
-SEQ: GCGGTTGCAGCGGCGGCTACGGCTTTTCGGCATCGGCGGCGACGTTGGCG
-SEQ_LEN: 50
-SCORES: PQRSTUVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVUTSRQP
-TRIM_LEFT: 16
-TRIM_RIGHT: 49
+SEQ: ctgacgtatcgatcgttgattagttgctagctatgcagtctacgacgagcatgctagctag
+SEQ_LEN: 61
+SCORES: UVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDChhh
 ---
index 902e80515fdc05f3ff0f1f99eb33e5d9aea90fa8..2c45ca62462d7502dfecb6c36a3c09bd637dff52 100644 (file)
@@ -1,6 +1,5 @@
 SEQ_NAME: test
-SEQ: GCGGTTGCAGCGGCGGCTACGGCTTTTCGGCATCGGCGGCGACGTTGGCGGCGGGGCCGGGCGGGT
-SEQ_LEN: 66
-SCORES: PQRSTUVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDCBA@
-TRIM_LEFT: 16
+SEQ: ctgacgtatcgatcgttgattagttgctagctatgcagtctacgacgagcatgctagctag
+SEQ_LEN: 61
+SCORES: UVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDChhh
 ---
index 0e5aa85bff2c7a3ab9f8a70049e8c30c16315b05..6fb0b4e57b8c200cdf39e1f0226ab6ead8d474c8 100644 (file)
@@ -1,6 +1,5 @@
 SEQ_NAME: test
-SEQ: TGGGCGGGCCGGGGCGGCGGTTGCAGCGGCGGCTACGGCTTTTCGGCATCGGCGGCGACGTTGGCG
-SEQ_LEN: 66
-SCORES: @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVUTSRQP
-TRIM_RIGHT: 65
+SEQ: gatcgatcgtacgagcagcatctgacgtatcgatcgttgattagttgctagctatgcagtctacgacgagcatgctagctag
+SEQ_LEN: 82
+SCORES: @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDChhh
 ---
diff --git a/bp_test/out/trim_seq.out.6 b/bp_test/out/trim_seq.out.6
new file mode 100644 (file)
index 0000000..53de4b0
--- /dev/null
@@ -0,0 +1,5 @@
+SEQ_NAME: test
+SEQ: ctgacgtatcgatcgttgattagttgctagctatgcagtc
+SEQ_LEN: 40
+SCORES: UVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVU
+---
index 232b987dcd32cabdb034aa21696b7b2967ba7ecf..b5387966849e8a792cb2e693715243872edb96c3 100755 (executable)
@@ -6,7 +6,7 @@ run "$bp -I $in -O $tmp"
 assert_no_diff $tmp $out.1
 clean
 
-run "$bp -I $in -m 20 -O $tmp"
+run "$bp -I $in -m 25 -O $tmp"
 assert_no_diff $tmp $out.2
 clean
 
@@ -21,3 +21,7 @@ clean
 run "$bp -I $in -t right -O $tmp"
 assert_no_diff $tmp $out.5
 clean
+
+run "$bp -I $in -l 4 -O $tmp"
+assert_no_diff $tmp $out.6
+clean