4 PROTEIN = %w[f l s y c w p h q r i m t n k v a d e g]
10 # Error class for all exceptions to do with Seq.
11 class SeqError < StandardError; end
14 attr_accessor :seq_name, :seq, :type, :qual
16 # Initialize a sequence object with the following arguments:
17 # - seq_name: Name of the sequence.
18 # - seq: The sequence.
19 # - type: The sequence type - DNA, RNA, or protein
20 # - qual: A Solexa type quality scores string.
21 def initialize(seq_name = nil, seq = nil, type = nil, qual = nil)
28 # Returns the length of a sequence.
30 self.seq.nil? ? 0 : self.seq.length
35 # Method that returns true is a given sequence type is DNA.
40 # Method that returns true is a given sequence type is RNA.
45 # Method that returns true is a given sequence type is protein.
47 self.type == 'protein'
50 # Method to transcribe DNA to RNA.
52 raise SeqError, "Cannot transcribe 0 length sequence" if self.length == 0
53 raise SeqError, "Cannot transcribe sequence type: #{self.type}" unless self.is_dna?
55 self.seq.tr!('Tt','Uu')
58 # Method to reverse-transcribe RNA to DNA.
60 raise SeqError, "Cannot reverse-transcribe 0 length sequence" if self.length == 0
61 raise SeqError, "Cannot reverse-transcribe sequence type: #{self.type}" unless self.is_rna?
63 self.seq.tr!('Uu','Tt')
66 # Method that given a Seq entry returns a Biopieces record (a hash).
68 raise SeqError, "Missing seq_name" if self.seq_name.nil?
69 raise SeqError, "Missing seq" if self.seq.nil?
71 record[:SEQ_NAME] = self.seq_name
72 record[:SEQ] = self.seq
73 record[:SEQ_LEN] = self.length
74 record[:SCORES] = self.qual if self.qual
78 # Method that given a Seq entry returns a FASTA entry (a string).
79 def to_fasta(wrap = nil)
80 raise SeqError, "Missing seq_name" if self.seq_name.nil?
81 raise SeqError, "Missing seq" if self.seq.nil?
83 seq_name = self.seq_name
87 seq.gsub! /(.{#{wrap}})/ do |match|
94 ">#{seq_name}\n#{seq}\n"
97 # Method to reverse complement sequence.
98 def reverse_complement
103 alias revcomp reverse_complement
105 # Method to reverse the sequence.
110 # Method that complements sequence including ambiguity codes.
112 raise SeqError, "Cannot complement 0 length sequence" if self.length == 0
115 self.seq.tr!( 'AGCUTRYWSMKHDVBNagcutrywsmkhdvbn', 'TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn' )
117 self.seq.tr!( 'AGCUTRYWSMKHDVBNagcutrywsmkhdvbn', 'UCGAAYRWSKMDHBVNucgaayrwskmdhbvn' )
119 raise SeqError, "Cannot complement sequence type: #{self.type}"
123 # Method that generates a random sequence of a given length.
124 def generate(length,type)
125 raise SeqError, "Cannot generate negative sequence length: #{length}" if length <= 0
135 raise SeqError, "Unknown sequence type: #{type}"
138 seq_new = Array.new(length) { alph[rand(alph.size)] }.join("")
140 self.type = type.downcase
144 # Method to convert the quality scores from a specified base
147 self.qual.gsub! /./ do |score|
148 score_phred = score.ord - SCORE_PHRED
149 raise SeqError, "Bad Phred score: #{score} (#{score_phred})" unless (0 .. 40).include? score_phred
150 score_solexa = score_phred + SCORE_SOLEXA
151 score = score_solexa.chr
156 # Error class for all exceptions to do with Digest.
157 class DigestError < StandardError; end
162 # Initialize a digest object with the following arguments:
163 # - seq: A sequence object.
164 # - pattern: A restriction enzyme recognition pattern.
165 # - cut_pos: Offset from match position where the enzyme cuts.
166 def initialize(seq, pattern, cut_pos)
168 @pattern = disambiguate(pattern)
173 # Method to get the next digestion product from a sequence.
175 @seq.seq.scan @pattern do
176 pos = $`.length + @cut_pos - 1
178 if pos >= 0 and pos < @seq.seq.length - 2
179 seq = Seq.new("#{@seq.seq_name}[#{@offset + 1}-#{pos + 1}]", @seq.seq[@offset .. pos], @seq.type)
189 elsif @offset > @seq.seq.length
193 seq = Seq.new("#{@seq.seq_name}[#{@offset + 1}-#{@seq.seq.length}]", @seq.seq[@offset .. @seq.seq.length], @seq.type)
197 self # conventionally
202 # Method that returns a regexp object with a restriction
203 # enzyme pattern with ambiguity codes substituted to the
204 # appropriate regexp.
205 def disambiguate(pattern)
227 pattern.upcase.each_char do |char|
228 if ambiguity.has_key? char
229 new_pattern << ambiguity[char]
231 raise DigestError, "Could not disambiguate residue: #{char}"
235 Regexp.new(new_pattern)
246 # Class containing generic sequence methods and nucleic acid and amino acid subclasses.
248 # Guess the sequence type by analyzing the first 100 residues allowing for ambiguity codes.
250 raise ArgumentError, "No sequence." if self.empty?
252 seq_beg = self[0, 100].upcase
254 if seq_beg.count( "FLPQIE" ) > 0
256 elsif seq_beg.count("U") > 0
257 Seq::NA::RNA.new(self)
259 Seq::NA::DNA.new(self)
263 # Method to wrap a sequence to a given width using a given delimiter.
264 def wrap(width = 80, delimit = $/)
265 raise ArgumentError, "Cannot wrap sequence to negative width: #{width}." if width <= 0
267 self.delete!(" \t\n\r")
268 self.gsub(/.{#{width}}(?!$)/, "\\0#{delimit}")
271 # Method to wrap and replace a sequence to a given width using a given delimiter.
272 def wrap!(width = 80, delimit = $/)
273 self.replace(self.wrap(width, delimit))
276 # Method that generates a random sequence of a given length.
278 raise ArgumentError, "Cannot generate negative sequence length: #{length}." if length <= 0
281 Array.new(length) { alph[rand(alph.size)] }.join("")
284 # Method that replaces sequence with a randomly generated sequence of a given length.
285 def generate!(length)
286 self.replace(self.generate(length))
289 # Class containing methods specific for amino acid (AA) sequences.
291 # Method that returns an array of amino acid residues.
293 %w{ F L S Y C W P H Q R I M T N K V A D E G }
296 # Calculate the molecular weight of an amino acid seuqunce.
297 # The caluculation is only approximate since there is no correction
298 # for amino bond formation and the MW used are somewhat imprecise:
299 # http://www.expasy.ch/tools/pscale/Molecularweight.html
301 raise ArgumentError, "invalid residues found: #{self.delete("#{residues.join( "" )}")}" if self.upcase =~ /[^#{residues.join( "" )}]/
305 "R" => 174.000, # Arg
306 "N" => 132.000, # Asn
307 "D" => 133.000, # Asp
308 "C" => 121.000, # Cys
309 "Q" => 146.000, # Gln
310 "E" => 147.000, # Glu
312 "H" => 155.000, # His
313 "I" => 131.000, # Ile
314 "L" => 131.000, # Leu
315 "K" => 146.000, # Lys
316 "M" => 149.000, # Met
317 "F" => 165.000, # Phe
318 "P" => 115.000, # Pro
319 "S" => 105.000, # Ser
320 "T" => 119.000, # Thr
321 "W" => 204.000, # Trp
322 "Y" => 181.000, # Tyr
323 "V" => 117.000, # Val
328 self.upcase.each_char { |c| mw += mol_weight_aa[ c ] }