4 PROTEIN = %w[f l s y c w p h q r i m t n k v a d e g]
6 # Error class for all exceptions to do with Seq.
7 class SeqError < StandardError; end
10 attr_accessor :seq_name, :seq, :type, :qual
12 # Initialize a sequence object with the following arguments:
13 # - seq_name: Name of the sequence.
14 # - seq: The sequence.
15 # - type: The sequence type - DNA, RNA, or protein
16 # - qual: A Solexa type quality scores string.
17 def initialize(seq_name = nil, seq = nil, type = nil, qual = nil)
24 # Returns the length of a sequence.
26 self.seq.nil? ? 0 : self.seq.length
31 # Method that returns true is a given sequence type is DNA.
36 # Method that returns true is a given sequence type is RNA.
41 # Method that returns true is a given sequence type is protein.
43 self.type == 'protein'
46 # Method to transcribe DNA to RNA.
48 raise SeqError, "Cannot transcribe 0 length sequence" if self.length == 0
49 raise SeqError, "Cannot transcribe sequence type: #{self.type}" unless self.is_dna?
51 self.seq.tr!('Tt','Uu')
54 # Method to reverse-transcribe RNA to DNA.
56 raise SeqError, "Cannot reverse-transcribe 0 length sequence" if self.length == 0
57 raise SeqError, "Cannot reverse-transcribe sequence type: #{self.type}" unless self.is_rna?
59 self.seq.tr!('Uu','Tt')
62 # Method that given a Seq entry returns a Biopieces record (a hash).
64 raise SeqError, "Missing seq_name" if self.seq_name.nil?
65 raise SeqError, "Missing seq" if self.seq.nil?
67 record[:SEQ_NAME] = self.seq_name
68 record[:SEQ] = self.seq
69 record[:SEQ_LEN] = self.length
73 # Method that given a Seq entry returns a FASTA entry (a string).
74 def to_fasta(wrap = nil)
75 raise SeqError, "Missing seq_name" if self.seq_name.nil?
76 raise SeqError, "Missing seq" if self.seq.nil?
78 seq_name = self.seq_name
82 seq.gsub! /(.{#{wrap}})/ do |match|
89 ">#{seq_name}\n#{seq}\n"
92 # Method to reverse complement sequence.
93 def reverse_complement
98 alias revcomp reverse_complement
100 # Method to reverse the sequence.
105 # Method that complements sequence including ambiguity codes.
107 raise SeqError, "Cannot complement 0 length sequence" if self.length == 0
110 self.seq.tr!( 'AGCUTRYWSMKHDVBNagcutrywsmkhdvbn', 'TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn' )
112 self.seq.tr!( 'AGCUTRYWSMKHDVBNagcutrywsmkhdvbn', 'UCGAAYRWSKMDHBVNucgaayrwskmdhbvn' )
114 raise SeqError, "Cannot complement sequence type: #{self.type}"
118 # Method that generates a random sequence of a given length.
119 def generate(length,type)
120 raise SeqError, "Cannot generate negative sequence length: #{length}" if length <= 0
130 raise SeqError, "Unknown sequence type: #{type}"
133 seq_new = Array.new(length) { alph[rand(alph.size)] }.join("")
135 self.type = type.downcase
140 # Error class for all exceptions to do with Digest.
141 class DigestError < StandardError; end
146 # Initialize a digest object with the following arguments:
147 # - seq: A sequence object.
148 # - pattern: A restriction enzyme recognition pattern.
149 # - cut_pos: Offset from match position where the enzyme cuts.
150 def initialize(seq, pattern, cut_pos)
152 @pattern = disambiguate(pattern)
157 # Method to get the next digestion product from a sequence.
159 @seq.seq.scan @pattern do
160 pos = $`.length + @cut_pos - 1
162 if pos >= 0 and pos < @seq.seq.length - 2
163 seq = Seq.new("#{@seq.seq_name}[#{@offset + 1}-#{pos + 1}]", @seq.seq[@offset .. pos], @seq.type)
173 elsif @offset > @seq.seq.length
177 seq = Seq.new("#{@seq.seq_name}[#{@offset + 1}-#{@seq.seq.length}]", @seq.seq[@offset .. @seq.seq.length], @seq.type)
181 self # conventionally
186 # Method that returns a regexp object with a restriction
187 # enzyme pattern with ambiguity codes substituted to the
188 # appropriate regexp.
189 def disambiguate(pattern)
211 pattern.upcase.each_char do |char|
212 if ambiguity.has_key? char
213 new_pattern << ambiguity[char]
215 raise DigestError, "Could not disambiguate residue: #{char}"
219 Regexp.new(new_pattern)
230 # Class containing generic sequence methods and nucleic acid and amino acid subclasses.
232 # Guess the sequence type by analyzing the first 100 residues allowing for ambiguity codes.
234 raise ArgumentError, "No sequence." if self.empty?
236 seq_beg = self[0, 100].upcase
238 if seq_beg.count( "FLPQIE" ) > 0
240 elsif seq_beg.count("U") > 0
241 Seq::NA::RNA.new(self)
243 Seq::NA::DNA.new(self)
247 # Method to wrap a sequence to a given width using a given delimiter.
248 def wrap(width = 80, delimit = $/)
249 raise ArgumentError, "Cannot wrap sequence to negative width: #{width}." if width <= 0
251 self.delete!(" \t\n\r")
252 self.gsub(/.{#{width}}(?!$)/, "\\0#{delimit}")
255 # Method to wrap and replace a sequence to a given width using a given delimiter.
256 def wrap!(width = 80, delimit = $/)
257 self.replace(self.wrap(width, delimit))
260 # Method that generates a random sequence of a given length.
262 raise ArgumentError, "Cannot generate negative sequence length: #{length}." if length <= 0
265 Array.new(length) { alph[rand(alph.size)] }.join("")
268 # Method that replaces sequence with a randomly generated sequence of a given length.
269 def generate!(length)
270 self.replace(self.generate(length))
273 # Class containing methods specific for amino acid (AA) sequences.
275 # Method that returns an array of amino acid residues.
277 %w{ F L S Y C W P H Q R I M T N K V A D E G }
280 # Calculate the molecular weight of an amino acid seuqunce.
281 # The caluculation is only approximate since there is no correction
282 # for amino bond formation and the MW used are somewhat imprecise:
283 # http://www.expasy.ch/tools/pscale/Molecularweight.html
285 raise ArgumentError, "invalid residues found: #{self.delete("#{residues.join( "" )}")}" if self.upcase =~ /[^#{residues.join( "" )}]/
289 "R" => 174.000, # Arg
290 "N" => 132.000, # Asn
291 "D" => 133.000, # Asp
292 "C" => 121.000, # Cys
293 "Q" => 146.000, # Gln
294 "E" => 147.000, # Glu
296 "H" => 155.000, # His
297 "I" => 131.000, # Ile
298 "L" => 131.000, # Leu
299 "K" => 146.000, # Lys
300 "M" => 149.000, # Met
301 "F" => 165.000, # Phe
302 "P" => 115.000, # Pro
303 "S" => 105.000, # Ser
304 "T" => 119.000, # Thr
305 "W" => 204.000, # Trp
306 "Y" => 181.000, # Tyr
307 "V" => 117.000, # Val
312 self.upcase.each_char { |c| mw += mol_weight_aa[ c ] }