From 5db4d7e8230ab3fc30b0dba60a59455e0b5f783d Mon Sep 17 00:00:00 2001 From: martinahansen Date: Mon, 5 Dec 2011 14:36:19 +0000 Subject: [PATCH] upgraded mask_seq git-svn-id: http://biopieces.googlecode.com/svn/trunk@1692 74ccb610-7750-0410-82ae-013aeee3265d --- bp_bin/mask_seq | 41 ++++++++++++------------------------- bp_test/in/mask_seq.in | 6 ++++-- bp_test/out/mask_seq.out.1 | 6 ++++-- bp_test/out/mask_seq.out.2 | 5 +++++ bp_test/out/mask_seq.out.3 | 5 +++++ bp_test/test/test_mask_seq | 10 ++++++++- code_ruby/lib/maasha/seq.rb | 30 +++++++++++++++++++++++++++ 7 files changed, 70 insertions(+), 33 deletions(-) create mode 100644 bp_test/out/mask_seq.out.2 create mode 100644 bp_test/out/mask_seq.out.3 diff --git a/bp_bin/mask_seq b/bp_bin/mask_seq index 011be4d..3cd162a 100755 --- a/bp_bin/mask_seq +++ b/bp_bin/mask_seq @@ -24,45 +24,30 @@ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -# Soft mask sequences in the stream based on quality scores. +# Mask sequences in the stream based on quality scores. # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - require 'maasha/biopieces' - -ILLUMINA_BASE = 64 - -# Expading class Hash with possibly evil monkey patch. -class Hash - # Soft masks sequence residues where the corresponding quality score - # is below a given cutoff. - def mask_seq!(cutoff) - if self.has_key? :SEQ and self.has_key? :SCORES - seq = self[:SEQ].upcase - scores = self[:SCORES] - i = 0 - - scores.each_char do |score| - seq[i] = seq[i].downcase if score.ord - ILLUMINA_BASE < cutoff - i += 1 - end - - self[:SEQ] = seq - end - - self - end -end +require 'maasha/seq' casts = [] -casts << {:long=>'cutoff', :short=>'c', :type=>'int', :mandatory=>false, :default=>20, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'cutoff', :short=>'c', :type=>'int', :mandatory=>false, :default=>20, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'hardmask', :short=>'h', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} options = Biopieces.options_parse(ARGV, casts) Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| input.each_record do |record| - output.puts record.mask_seq!(options[:cutoff]) + if record.has_key? :SEQ + entry = Seq.new_bp(record) + + options[:hardmask] ? entry.mask_seq_hard!(options[:cutoff]) : entry.mask_seq_soft!(options[:cutoff]) + + record[:SEQ] = entry.seq + end + + output.puts record end end diff --git a/bp_test/in/mask_seq.in b/bp_test/in/mask_seq.in index aed3c6a..1fdab88 100644 --- a/bp_test/in/mask_seq.in +++ b/bp_test/in/mask_seq.in @@ -1,3 +1,5 @@ -SEQ: AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA -SCORES: !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ +SEQ_NAME: HWI-EAS157_20FFGAAXX:2:1:888:434 +SEQ: TTGGTCGCTCGCTCCGCGACCTCAGATCAGACGTGGGCGAT +SEQ_LEN: 41 +SCORES: @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefgh --- diff --git a/bp_test/out/mask_seq.out.1 b/bp_test/out/mask_seq.out.1 index f8c5a3b..b508a2a 100644 --- a/bp_test/out/mask_seq.out.1 +++ b/bp_test/out/mask_seq.out.1 @@ -1,3 +1,5 @@ -SEQ: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA -SCORES: !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ +SEQ_NAME: HWI-EAS157_20FFGAAXX:2:1:888:434 +SEQ: ttggtcgctcgctccgcgacCTCAGATCAGACGTGGGCGAT +SEQ_LEN: 41 +SCORES: @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefgh --- diff --git a/bp_test/out/mask_seq.out.2 b/bp_test/out/mask_seq.out.2 new file mode 100644 index 0000000..1fdab88 --- /dev/null +++ b/bp_test/out/mask_seq.out.2 @@ -0,0 +1,5 @@ +SEQ_NAME: HWI-EAS157_20FFGAAXX:2:1:888:434 +SEQ: TTGGTCGCTCGCTCCGCGACCTCAGATCAGACGTGGGCGAT +SEQ_LEN: 41 +SCORES: @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefgh +--- diff --git a/bp_test/out/mask_seq.out.3 b/bp_test/out/mask_seq.out.3 new file mode 100644 index 0000000..7d38b47 --- /dev/null +++ b/bp_test/out/mask_seq.out.3 @@ -0,0 +1,5 @@ +SEQ_NAME: HWI-EAS157_20FFGAAXX:2:1:888:434 +SEQ: NNNNNNNNNNNNNNNNNNNNCTCAGATCAGACGTGGGCGAT +SEQ_LEN: 41 +SCORES: @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefgh +--- diff --git a/bp_test/test/test_mask_seq b/bp_test/test/test_mask_seq index 8f02fd6..bd6191c 100755 --- a/bp_test/test/test_mask_seq +++ b/bp_test/test/test_mask_seq @@ -2,6 +2,14 @@ source "$BP_DIR/bp_test/lib/test.sh" -run "$bp -I $in -c 0 -O $tmp" +run "$bp -I $in -O $tmp" assert_no_diff $tmp $out.1 clean + +run "$bp -I $in -c 0 -O $tmp" +assert_no_diff $tmp $out.2 +clean + +run "$bp -I $in -h -O $tmp" +assert_no_diff $tmp $out.3 +clean diff --git a/code_ruby/lib/maasha/seq.rb b/code_ruby/lib/maasha/seq.rb index 598d0a2..562bab9 100644 --- a/code_ruby/lib/maasha/seq.rb +++ b/code_ruby/lib/maasha/seq.rb @@ -409,6 +409,36 @@ class Seq ((self.seq.scan(/[a-z]/).size.to_f / (self.len - self.indels).to_f) * 100).round(2) end + # Hard masks sequence residues where the corresponding quality score + # is below a given cutoff. + def mask_seq_hard!(cutoff) + seq = self.seq.upcase + scores = self.qual + i = 0 + + scores.each_char do |score| + seq[i] = 'N' if score.ord - SCORE_ILLUMINA < cutoff + i += 1 + end + + self.seq = seq + end + + # Soft masks sequence residues where the corresponding quality score + # is below a given cutoff. + def mask_seq_soft!(cutoff) + seq = self.seq.upcase + scores = self.qual + i = 0 + + scores.each_char do |score| + seq[i] = seq[i].downcase if score.ord - SCORE_ILLUMINA < cutoff + i += 1 + end + + self.seq = seq + end + # Method to convert the quality scores from a specified base # to another base. def convert_phred2illumina! -- 2.39.5