X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=code_ruby%2Flib%2Fmaasha%2Fseq.rb;h=4a0af47ed0f5fde9d5ffe87ca19a3caec087f6d6;hb=b982cc677363d7458963b610ec53bf2c4f7476a9;hp=6c0f35b358e67135b35c4032b7fc9e9a78e6e0fb;hpb=98f798c64a0b795f642bceaa06fb9b951a519ead;p=biopieces.git diff --git a/code_ruby/lib/maasha/seq.rb b/code_ruby/lib/maasha/seq.rb index 6c0f35b..4a0af47 100644 --- a/code_ruby/lib/maasha/seq.rb +++ b/code_ruby/lib/maasha/seq.rb @@ -67,15 +67,16 @@ TRANS_TAB11 = { "GTG" => "V", "GCG" => "A", "GAG" => "E", "GGG" => "G" } -# Quality scores bases -SCORE_BASE = 64 -SCORE_MIN = 0 -SCORE_MAX = 40 # Error class for all exceptions to do with Seq. class SeqError < StandardError; end class Seq + # Quality scores bases + SCORE_BASE = 64 + SCORE_MIN = 0 + SCORE_MAX = 40 + include Digest include Trim @@ -149,6 +150,7 @@ class Seq # by inspecting the first 100 residues. def type_guess! self.type = self.type_guess + self end # Returns the length of a sequence. @@ -333,17 +335,13 @@ class Seq key end - # Method to reverse complement sequence. - def reverse_complement - self.reverse - self.complement - self + # Method to reverse the sequence. + def reverse + Seq.new(self.seq_name, self.seq.reverse, self.type, self.qual ? self.qual.reverse : self.qual) end - alias :revcomp :reverse_complement - # Method to reverse the sequence. - def reverse + def reverse! self.seq.reverse! self.qual.reverse! if self.qual self @@ -353,6 +351,26 @@ class Seq def complement raise SeqError, "Cannot complement 0 length sequence" if self.length == 0 + entry = Seq.new + entry.seq_name = self.seq_name + entry.type = self.type + entry.qual = self.qual + + if self.is_dna? + entry.seq = self.seq.tr('AGCUTRYWSMKHDVBNagcutrywsmkhdvbn', 'TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn') + elsif self.is_rna? + entry.seq = self.seq.tr('AGCUTRYWSMKHDVBNagcutrywsmkhdvbn', 'UCGAAYRWSKMDHBVNucgaayrwskmdhbvn') + else + raise SeqError, "Cannot complement sequence type: #{self.type}" + end + + entry + end + + # Method that complements sequence including ambiguity codes. + def complement! + raise SeqError, "Cannot complement 0 length sequence" if self.length == 0 + if self.is_dna? self.seq.tr!('AGCUTRYWSMKHDVBNagcutrywsmkhdvbn', 'TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn') elsif self.is_rna? @@ -360,6 +378,8 @@ class Seq else raise SeqError, "Cannot complement sequence type: #{self.type}" end + + self end # Method to determine the Hamming Distance between @@ -389,7 +409,12 @@ class Seq seq_new end - # Method to shuffle a sequence readomly inline. + # Method to return a new Seq object with shuffled sequence. + def shuffle + Seq.new(self.seq_name, self.seq.split('').shuffle!.join, self.type, self.qual) + end + + # Method to shuffle a sequence randomly inline. def shuffle! self.seq = self.seq.split('').shuffle!.join self @@ -482,21 +507,6 @@ class Seq ((self.seq.scan(/[a-z]/).size.to_f / (self.len - self.indels).to_f) * 100).round(2) end - # Hard masks sequence residues where the corresponding quality score - # is below a given cutoff. - def mask_seq_hard_old(cutoff) - seq = self.seq.upcase - scores = self.qual - i = 0 - - scores.each_char do |score| - seq[i] = 'N' if score.ord - SCORE_BASE < cutoff - i += 1 - end - - self.seq = seq - end - # Hard masks sequence residues where the corresponding quality score # is below a given cutoff. def mask_seq_hard!(cutoff) @@ -538,13 +548,30 @@ class Seq # Method that determines if a quality score string can be # absolutely identified as base 33. def qual_base33? - self.qual.match(/[!-:]/) + self.qual.match(/[!-:]/) ? true : false end # Method that determines if a quality score string can be # absolutely identified as base 64. def qual_base64? - self.qual.match(/[K-h]/) + self.qual.match(/[K-h]/) ? true : false + end + + # Method to determine if a quality score is valid. + def qual_valid?(encoding) + raise SeqError, "Missing qual" if self.qual.nil? + + case encoding.downcase + when "sanger" then return true if self.qual.match(/^[!-~]*$/) + when "454" then return true if self.qual.match(/^[@-~]*$/) + when "solexa" then return true if self.qual.match(/^[;-~]*$/) + when "illumina13" then return true if self.qual.match(/^[@-~]*$/) + when "illumina15" then return true if self.qual.match(/^[@-~]*$/) + when "illumina18" then return true if self.qual.match(/^[!-~]*$/) + else raise SeqError, "unknown quality score encoding: #{encoding}" + end + + false end # Method to convert quality scores inbetween formats. @@ -552,7 +579,7 @@ class Seq # 454 base 64, range 0-40 # Solexa base 64, range -5-40 # Illumina13 base 64, range 0-40 - # Illumina15 base 64, range 3-40 + # Illumina15 base 64, range 0-40 # Illumina18 base 33, range 0-41 def convert_scores!(from, to) unless from == to @@ -575,7 +602,7 @@ class Seq when "illumina13" then na_qual += 64 when "illumina15" then na_qual += 64 when "illumina18" then na_qual += 33 - else raise SeqError, "unknown quality score encoding: #{from}" + else raise SeqError, "unknown quality score encoding: #{to}" end self.qual = na_qual.to_s