4 PROTEIN = %w[f l s y c w p h q r i m t n k v a d e g]
6 # Error class for all exceptions to do with Seq.
7 class SeqError < StandardError; end
10 attr_accessor :seq_name, :seq, :type, :qual
12 # Initialize a sequence object with the following arguments:
13 # - seq_name: Name of the sequence.
14 # - seq: The sequence.
15 # - type: The sequence type - DNA, RNA, or protein
16 # - qual: A Solexa type quality scores string.
17 def initialize(seq_name = nil, seq = nil, type = nil, qual = nil)
24 # Returns the length of a sequence.
26 self.seq.nil? ? 0 : self.seq.length
31 # Method that returns true is a given sequence type is DNA.
36 # Method that returns true is a given sequence type is RNA.
41 # Method that returns true is a given sequence type is protein.
43 self.type == 'protein'
46 # Method to transcribe DNA to RNA.
48 raise SeqError, "Cannot transcribe 0 length sequence" if self.length == 0
49 raise SeqError, "Cannot transcribe sequence type: #{self.type}" unless self.is_dna?
51 self.seq.tr!('Tt','Uu')
54 # Method to reverse-transcribe RNA to DNA.
56 raise SeqError, "Cannot reverse-transcribe 0 length sequence" if self.length == 0
57 raise SeqError, "Cannot reverse-transcribe sequence type: #{self.type}" unless self.is_rna?
59 self.seq.tr!('Uu','Tt')
62 # Method that given a Seq entry returns a Biopieces record (a hash).
64 raise SeqError, "Missing seq_name" if self.seq_name.nil?
65 raise SeqError, "Missing seq" if self.seq.nil?
67 record[:SEQ_NAME] = self.seq_name
68 record[:SEQ] = self.seq
69 record[:SEQ_LEN] = self.length
73 # Method that given a Seq entry returns a FASTA entry (a string).
74 def to_fasta(wrap = nil)
75 raise SeqError, "Missing seq_name" if self.seq_name.nil?
76 raise SeqError, "Missing seq" if self.seq.nil?
78 seq_name = self.seq_name
82 seq.gsub! /(.{#{wrap}})/ do |match|
89 ">#{seq_name}\n#{seq}\n"
92 # Method to reverse complement sequence.
93 def reverse_complement
98 alias revcomp reverse_complement
100 # Method to reverse the sequence.
105 # Method that complements sequence including ambiguity codes.
107 raise SeqError, "Cannot complement 0 length sequence" if self.length == 0
110 self.seq.tr!( 'AGCUTRYWSMKHDVBNagcutrywsmkhdvbn', 'TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn' )
112 self.seq.tr!( 'AGCUTRYWSMKHDVBNagcutrywsmkhdvbn', 'UCGAAYRWSKMDHBVNucgaayrwskmdhbvn' )
114 raise SeqError, "Cannot complement sequence type: #{self.type}"
118 # Method that generates a random sequence of a given length.
119 def generate(length,type)
120 raise SeqError, "Cannot generate negative sequence length: #{length}" if length <= 0
130 raise SeqError, "Unknown sequence type: #{type}"
133 seq_new = Array.new(length) { alph[rand(alph.size)] }.join("")
135 self.type = type.downcase
140 # Error class for all exceptions to do with Digest.
141 class DigestError < StandardError; end
146 # Initialize a digest object with the following arguments:
147 # - seq: A sequence object.
148 # - pattern: A restriction enzyme recognition pattern.
149 # - cut_pos: Offset from match position where the enzyme cuts.
150 def initialize(seq, pattern, cut_pos)
152 @pattern = disambiguate(pattern)
157 # Method to get the next digestion product from a sequence.
159 @seq.seq.scan @pattern do
160 pos = $`.length + @cut_pos - 1
162 if pos >= 0 and pos < @seq.seq.length - 2
163 seq = Seq.new("#{@seq.seq_name}[#{@offset + 1}-#{pos + 1}]", @seq.seq[@offset .. pos], @seq.type)
171 # Handle remaining sequence after last cut.
172 if @offset > 0 and @offset < @seq.seq.length
173 seq = Seq.new("#{@seq.seq_name}[#{@offset + 1}-#{@seq.seq.length}]", @seq.seq[@offset .. @seq.seq.length], @seq.type)
178 self # conventionally
183 # Method that returns a regexp object with a restriction
184 # enzyme pattern with ambiguity codes substituted to the
185 # appropriate regexp.
186 def disambiguate(pattern)
208 pattern.upcase.each_char do |char|
209 if ambiguity.has_key? char
210 new_pattern << ambiguity[char]
212 raise DigestError, "Could not disambiguate residue: #{char}"
216 Regexp.new(new_pattern)
227 # Class containing generic sequence methods and nucleic acid and amino acid subclasses.
229 # Guess the sequence type by analyzing the first 100 residues allowing for ambiguity codes.
231 raise ArgumentError, "No sequence." if self.empty?
233 seq_beg = self[0, 100].upcase
235 if seq_beg.count( "FLPQIE" ) > 0
237 elsif seq_beg.count("U") > 0
238 Seq::NA::RNA.new(self)
240 Seq::NA::DNA.new(self)
244 # Method to wrap a sequence to a given width using a given delimiter.
245 def wrap(width = 80, delimit = $/)
246 raise ArgumentError, "Cannot wrap sequence to negative width: #{width}." if width <= 0
248 self.delete!(" \t\n\r")
249 self.gsub(/.{#{width}}(?!$)/, "\\0#{delimit}")
252 # Method to wrap and replace a sequence to a given width using a given delimiter.
253 def wrap!(width = 80, delimit = $/)
254 self.replace(self.wrap(width, delimit))
257 # Method that generates a random sequence of a given length.
259 raise ArgumentError, "Cannot generate negative sequence length: #{length}." if length <= 0
262 Array.new(length) { alph[rand(alph.size)] }.join("")
265 # Method that replaces sequence with a randomly generated sequence of a given length.
266 def generate!(length)
267 self.replace(self.generate(length))
270 # Class containing methods specific for amino acid (AA) sequences.
272 # Method that returns an array of amino acid residues.
274 %w{ F L S Y C W P H Q R I M T N K V A D E G }
277 # Calculate the molecular weight of an amino acid seuqunce.
278 # The caluculation is only approximate since there is no correction
279 # for amino bond formation and the MW used are somewhat imprecise:
280 # http://www.expasy.ch/tools/pscale/Molecularweight.html
282 raise ArgumentError, "invalid residues found: #{self.delete("#{residues.join( "" )}")}" if self.upcase =~ /[^#{residues.join( "" )}]/
286 "R" => 174.000, # Arg
287 "N" => 132.000, # Asn
288 "D" => 133.000, # Asp
289 "C" => 121.000, # Cys
290 "Q" => 146.000, # Gln
291 "E" => 147.000, # Glu
293 "H" => 155.000, # His
294 "I" => 131.000, # Ile
295 "L" => 131.000, # Leu
296 "K" => 146.000, # Lys
297 "M" => 149.000, # Met
298 "F" => 165.000, # Phe
299 "P" => 115.000, # Pro
300 "S" => 105.000, # Ser
301 "T" => 119.000, # Thr
302 "W" => 204.000, # Trp
303 "Y" => 181.000, # Tyr
304 "V" => 117.000, # Val
309 self.upcase.each_char { |c| mw += mol_weight_aa[ c ] }