# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-# Soft mask sequences in the stream based on quality scores.
+# Mask sequences in the stream based on quality scores.
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-
require 'maasha/biopieces'
-
-ILLUMINA_BASE = 64
-
-# Expading class Hash with possibly evil monkey patch.
-class Hash
- # Soft masks sequence residues where the corresponding quality score
- # is below a given cutoff.
- def mask_seq!(cutoff)
- if self.has_key? :SEQ and self.has_key? :SCORES
- seq = self[:SEQ].upcase
- scores = self[:SCORES]
- i = 0
-
- scores.each_char do |score|
- seq[i] = seq[i].downcase if score.ord - ILLUMINA_BASE < cutoff
- i += 1
- end
-
- self[:SEQ] = seq
- end
-
- self
- end
-end
+require 'maasha/seq'
casts = []
-casts << {:long=>'cutoff', :short=>'c', :type=>'int', :mandatory=>false, :default=>20, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'cutoff', :short=>'c', :type=>'int', :mandatory=>false, :default=>20, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'hardmask', :short=>'h', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
options = Biopieces.options_parse(ARGV, casts)
Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output|
input.each_record do |record|
- output.puts record.mask_seq!(options[:cutoff])
+ if record.has_key? :SEQ
+ entry = Seq.new_bp(record)
+
+ options[:hardmask] ? entry.mask_seq_hard!(options[:cutoff]) : entry.mask_seq_soft!(options[:cutoff])
+
+ record[:SEQ] = entry.seq
+ end
+
+ output.puts record
end
end
-SEQ: AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
-SCORES: !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~
+SEQ_NAME: HWI-EAS157_20FFGAAXX:2:1:888:434
+SEQ: TTGGTCGCTCGCTCCGCGACCTCAGATCAGACGTGGGCGAT
+SEQ_LEN: 41
+SCORES: @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefgh
---
-SEQ: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
-SCORES: !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~
+SEQ_NAME: HWI-EAS157_20FFGAAXX:2:1:888:434
+SEQ: ttggtcgctcgctccgcgacCTCAGATCAGACGTGGGCGAT
+SEQ_LEN: 41
+SCORES: @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefgh
---
--- /dev/null
+SEQ_NAME: HWI-EAS157_20FFGAAXX:2:1:888:434
+SEQ: TTGGTCGCTCGCTCCGCGACCTCAGATCAGACGTGGGCGAT
+SEQ_LEN: 41
+SCORES: @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefgh
+---
--- /dev/null
+SEQ_NAME: HWI-EAS157_20FFGAAXX:2:1:888:434
+SEQ: NNNNNNNNNNNNNNNNNNNNCTCAGATCAGACGTGGGCGAT
+SEQ_LEN: 41
+SCORES: @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefgh
+---
source "$BP_DIR/bp_test/lib/test.sh"
-run "$bp -I $in -c 0 -O $tmp"
+run "$bp -I $in -O $tmp"
assert_no_diff $tmp $out.1
clean
+
+run "$bp -I $in -c 0 -O $tmp"
+assert_no_diff $tmp $out.2
+clean
+
+run "$bp -I $in -h -O $tmp"
+assert_no_diff $tmp $out.3
+clean
((self.seq.scan(/[a-z]/).size.to_f / (self.len - self.indels).to_f) * 100).round(2)
end
+ # Hard masks sequence residues where the corresponding quality score
+ # is below a given cutoff.
+ def mask_seq_hard!(cutoff)
+ seq = self.seq.upcase
+ scores = self.qual
+ i = 0
+
+ scores.each_char do |score|
+ seq[i] = 'N' if score.ord - SCORE_ILLUMINA < cutoff
+ i += 1
+ end
+
+ self.seq = seq
+ end
+
+ # Soft masks sequence residues where the corresponding quality score
+ # is below a given cutoff.
+ def mask_seq_soft!(cutoff)
+ seq = self.seq.upcase
+ scores = self.qual
+ i = 0
+
+ scores.each_char do |score|
+ seq[i] = seq[i].downcase if score.ord - SCORE_ILLUMINA < cutoff
+ i += 1
+ end
+
+ self.seq = seq
+ end
+
# Method to convert the quality scores from a specified base
# to another base.
def convert_phred2illumina!