]> git.donarmstrong.com Git - biopieces.git/commitdiff
upgraded mask_seq
authormartinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Mon, 5 Dec 2011 14:36:19 +0000 (14:36 +0000)
committermartinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Mon, 5 Dec 2011 14:36:19 +0000 (14:36 +0000)
git-svn-id: http://biopieces.googlecode.com/svn/trunk@1692 74ccb610-7750-0410-82ae-013aeee3265d

bp_bin/mask_seq
bp_test/in/mask_seq.in
bp_test/out/mask_seq.out.1
bp_test/out/mask_seq.out.2 [new file with mode: 0644]
bp_test/out/mask_seq.out.3 [new file with mode: 0644]
bp_test/test/test_mask_seq
code_ruby/lib/maasha/seq.rb

index 011be4da735e79d6c2765613400b11cfc462f513..3cd162a264aae007105932017a4a59636e19ddfb 100755 (executable)
 
 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 
-# Soft mask sequences in the stream based on quality scores.
+# Mask sequences in the stream based on quality scores.
 
 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 
-
 require 'maasha/biopieces'
-
-ILLUMINA_BASE = 64
-
-# Expading class Hash with possibly evil monkey patch.
-class Hash
-  # Soft masks sequence residues where the corresponding quality score
-  # is below a given cutoff.
-  def mask_seq!(cutoff)
-    if self.has_key? :SEQ and self.has_key? :SCORES
-      seq    = self[:SEQ].upcase
-      scores = self[:SCORES]
-      i      = 0
-
-      scores.each_char do |score|
-        seq[i] = seq[i].downcase if score.ord - ILLUMINA_BASE < cutoff
-        i += 1
-      end
-
-      self[:SEQ] = seq
-    end
-
-    self
-  end
-end
+require 'maasha/seq'
 
 casts = []
-casts << {:long=>'cutoff', :short=>'c', :type=>'int', :mandatory=>false, :default=>20, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'cutoff',   :short=>'c', :type=>'int',  :mandatory=>false, :default=>20,  :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'hardmask', :short=>'h', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
 
 options = Biopieces.options_parse(ARGV, casts)
 
 Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output|
   input.each_record do |record|
-    output.puts record.mask_seq!(options[:cutoff])
+    if record.has_key? :SEQ
+      entry = Seq.new_bp(record)
+
+      options[:hardmask] ? entry.mask_seq_hard!(options[:cutoff]) : entry.mask_seq_soft!(options[:cutoff])
+
+      record[:SEQ] = entry.seq
+    end
+
+    output.puts record
   end
 end
 
index aed3c6aa1137217f0b1676187d5a5472598b4727..1fdab88bfce4a25c4e8e19aaeb01dd1079d2f1e2 100644 (file)
@@ -1,3 +1,5 @@
-SEQ: AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
-SCORES: !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~
+SEQ_NAME: HWI-EAS157_20FFGAAXX:2:1:888:434
+SEQ: TTGGTCGCTCGCTCCGCGACCTCAGATCAGACGTGGGCGAT
+SEQ_LEN: 41
+SCORES: @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefgh
 ---
index f8c5a3b27a7b2aab7bb15e00108f9bd092da7081..b508a2a0e2d06c6c2e38d58086f948cbc30758c5 100644 (file)
@@ -1,3 +1,5 @@
-SEQ: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
-SCORES: !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~
+SEQ_NAME: HWI-EAS157_20FFGAAXX:2:1:888:434
+SEQ: ttggtcgctcgctccgcgacCTCAGATCAGACGTGGGCGAT
+SEQ_LEN: 41
+SCORES: @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefgh
 ---
diff --git a/bp_test/out/mask_seq.out.2 b/bp_test/out/mask_seq.out.2
new file mode 100644 (file)
index 0000000..1fdab88
--- /dev/null
@@ -0,0 +1,5 @@
+SEQ_NAME: HWI-EAS157_20FFGAAXX:2:1:888:434
+SEQ: TTGGTCGCTCGCTCCGCGACCTCAGATCAGACGTGGGCGAT
+SEQ_LEN: 41
+SCORES: @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefgh
+---
diff --git a/bp_test/out/mask_seq.out.3 b/bp_test/out/mask_seq.out.3
new file mode 100644 (file)
index 0000000..7d38b47
--- /dev/null
@@ -0,0 +1,5 @@
+SEQ_NAME: HWI-EAS157_20FFGAAXX:2:1:888:434
+SEQ: NNNNNNNNNNNNNNNNNNNNCTCAGATCAGACGTGGGCGAT
+SEQ_LEN: 41
+SCORES: @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefgh
+---
index 8f02fd6f17be8b93289db053d9d1722b638726b9..bd6191cb02add8a24de388e8fa2864b91b07fe61 100755 (executable)
@@ -2,6 +2,14 @@
 
 source "$BP_DIR/bp_test/lib/test.sh"
 
-run "$bp -I $in -c 0 -O $tmp"
+run "$bp -I $in -O $tmp"
 assert_no_diff $tmp $out.1
 clean
+
+run "$bp -I $in -c 0 -O $tmp"
+assert_no_diff $tmp $out.2
+clean
+
+run "$bp -I $in -h -O $tmp"
+assert_no_diff $tmp $out.3
+clean
index 598d0a2cd5056cd0786735009ba4eb59571d04fd..562bab91848cab683cef488e35122cf813a03ad0 100644 (file)
@@ -409,6 +409,36 @@ class Seq
     ((self.seq.scan(/[a-z]/).size.to_f / (self.len - self.indels).to_f) * 100).round(2)
   end
 
+  # Hard masks sequence residues where the corresponding quality score
+  # is below a given cutoff.
+  def mask_seq_hard!(cutoff)
+    seq    = self.seq.upcase
+    scores = self.qual
+    i      = 0
+
+    scores.each_char do |score|
+      seq[i] = 'N' if score.ord - SCORE_ILLUMINA < cutoff
+      i += 1 
+    end
+
+    self.seq = seq
+  end
+
+  # Soft masks sequence residues where the corresponding quality score
+  # is below a given cutoff.
+  def mask_seq_soft!(cutoff)
+    seq    = self.seq.upcase
+    scores = self.qual
+    i      = 0
+
+    scores.each_char do |score|
+      seq[i] = seq[i].downcase if score.ord - SCORE_ILLUMINA < cutoff
+      i += 1 
+    end
+
+    self.seq = seq
+  end
+
   # Method to convert the quality scores from a specified base
   # to another base.
   def convert_phred2illumina!