+DNA = %w[a t c g]
+RNA = %w[a u c g]
+PROTEIN = %w[f l s y c w p h q r i m t n k v a d e g]
+
+# Error class for all exceptions to do with Seq.
+class SeqError < StandardError; end
+
+class Seq
+ attr_accessor :seq_name, :seq, :type, :qual
+
+ def initialize(seq_name = nil, seq = nil, type = nil, qual = nil)
+ @seq_name = seq_name
+ @seq = seq
+ @type = type
+ @qual = qual
+ end
+
+ def length
+ self.seq.nil? ? 0 : self.seq.length
+ end
+
+ alias len length
+
+ # Method that returns true is a given sequence type is DNA.
+ def is_dna?
+ self.type == 'dna'
+ end
+
+ def is_rna?
+ self.type == 'rna'
+ end
+
+ def is_protein?
+ self.type == 'protein'
+ end
+
+ # Method to transcribe DNA to RNA.
+ def to_rna
+ raise SeqError, "Cannot transcribe 0 length sequence" if self.length == 0
+ raise SeqError, "Cannot transcribe sequence type: #{self.type}" unless self.is_dna?
+ self.type = 'rna'
+ self.seq.tr!('Tt','Uu')
+ end
+
+ # Method to reverse-transcribe RNA to DNA.
+ def to_dna
+ raise SeqError, "Cannot reverse-transcribe 0 length sequence" if self.length == 0
+ raise SeqError, "Cannot reverse-transcribe sequence type: #{self.type}" unless self.is_rna?
+ self.type = 'dna'
+ self.seq.tr!('Uu','Tt')
+ end
+
+ # Method that given a Seq entry returns a Biopieces record (a hash).
+ def to_bp
+ raise SeqError, "Missing seq_name" if self.seq_name.nil?
+ raise SeqError, "Missing seq" if self.seq.nil?
+ record = {}
+ record['SEQ_NAME'] = self.seq_name
+ record['SEQ'] = self.seq
+ record['SEQ_LEN'] = self.length
+ record
+ end
+
+ # Method that given a Seq entry returns a FASTA entry (a string).
+ def to_fasta(wrap = nil)
+ raise SeqError, "Missing seq_name" if self.seq_name.nil?
+ raise SeqError, "Missing seq" if self.seq.nil?
+
+ seq_name = self.seq_name
+ seq = self.seq
+
+ unless wrap.nil?
+ seq.gsub! /(.{#{wrap}})/ do |match|
+ match << "\n"
+ end
+
+ seq.chomp!
+ end
+
+ ">#{seq_name}\n#{seq}\n"
+ end
+
+ # Method that complements sequence including ambiguity codes.
+ def complement
+ raise SeqError, "Cannot complement 0 length sequence" if self.length == 0
+
+ if self.is_dna?
+ self.seq.tr!( 'AGCUTRYWSMKHDVBNagcutrywsmkhdvbn', 'TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn' )
+ elsif self.is_rna?
+ self.seq.tr!( 'AGCUTRYWSMKHDVBNagcutrywsmkhdvbn', 'UCGAAYRWSKMDHBVNucgaayrwskmdhbvn' )
+ else
+ raise SeqError, "Cannot complement sequence type: #{self.type}"
+ end
+ end
+
+ # Method that generates a random sequence of a given length.
+ def generate(length,type)
+ raise SeqError, "Cannot generate negative sequence length: #{length}" if length <= 0
+
+ case type.downcase
+ when "dna"
+ alph = DNA
+ when "rna"
+ alph = RNA
+ when "protein"
+ alph = PROTEIN
+ else
+ raise SeqError, "Unknown sequence type: #{type}"
+ end
+
+ seq_new = Array.new(length) { alph[rand(alph.size)] }.join("")
+ self.seq = seq_new
+ self.type = type.downcase
+ seq_new
+ end
+end
+
+__END__
+
+
+
+
+
# Class containing generic sequence methods and nucleic acid and amino acid subclasses.
class Seq < String
- attr_accessor :seq, :seq_type
-
- # Method to initialize a new sequence.
- def initialize( seq = "", seq_type = nil )
- @seq = seq
- @seq_type = seq_type
- end
-
- # Guess the sequence type by analyzing the first 100 residues allowing for ambiguity codes.
- def seq_type_guess
- seq_beg = @seq[ 0, 100 ].upcase
-
- if seq_beg.count( "FLPQIE" ) > 0
- "AA"
- elsif seq_beg.count( "U" ) > 0
- "RNA"
- else
- "DNA"
- end
- end
-
- # Guess and replace the sequence type by analyzing the first 100 residues allowing for ambiguity codes.
- def seq_type_guess!
- @seq_type = seq_type_guess
- end
-
- # Method that return an array of the residue alphabet for a given sequence type.
- def seq_alph( seq_type )
- case seq_type.upcase
- when 'DNA'
- %w{ A T C G }
- when 'RNA'
- %w{ A U C G }
- when 'AA'
- %w{ F L S Y C W P H Q R I M T N K V A D E G }
- else
- raise "ERROR: Sequence type '#{ seq_type }' not recognized."
- end
- end
-
- # Method to wrap a sequence to a given width using a given delimiter.
- def wrap( width = 80, delimit = "\n" )
- raise "ERROR: Wrap width must be an integer." unless width.is_a? Fixnum
- raise "ERROR: Cannot wrap sequence to negative width: #{ width }." if width <= 0
- @seq.tr!( " \t\n\r", '' )
- @seq.gsub( /.{#{ width }}/, "\\0#{ delimit }" ).sub( /#{ delimit }$/, "" )
- end
-
- # Method to wrap and replace a sequence to a given width using a given delimiter.
- def wrap!( width = 80, delimit = "\n" )
- @seq = wrap( width, delimit )
- end
-
- # Method that generates a random sequence of a given length.
- def generate( length )
- raise "ERROR: Length must be an integer." unless length.is_a? Fixnum
- raise "ERROR: Cannot generate negative sequence length: #{ length }." if length <= 0
-
- alph = seq_alph( @seq_type )
- seq = Array.new( length ) { alph[ rand( alph.size ) ] }.join
- end
-
- # Method that replaces sequence with a randomly generated sequence of a given length.
- def generate!( length )
- @seq = generate( length )
- end
-
- # Class containing methods specific for amino acid (AA) sequences.
- class AA < Seq
- # Method to initialize a new amino acid sequence.
- def initialize( seq = "" )
- @seq = seq
- @seq_type = "AA"
- end
-
- # Calculate the molecular weight of an amino acid seuqunce.
- # The caluculation is only approximate since there is no correction
- # for amino bond formation and the MW used are somewhat imprecise:
- # http://www.expasy.ch/tools/pscale/Molecularweight.html
- def mol_weight
- mol_weight_aa = {
- "A" => 89.000, # Ala
- "R" => 174.000, # Arg
- "N" => 132.000, # Asn
- "D" => 133.000, # Asp
- "C" => 121.000, # Cys
- "Q" => 146.000, # Gln
- "E" => 147.000, # Glu
- "G" => 75.000, # Gly
- "H" => 155.000, # His
- "I" => 131.000, # Ile
- "L" => 131.000, # Leu
- "K" => 146.000, # Lys
- "M" => 149.000, # Met
- "F" => 165.000, # Phe
- "P" => 115.000, # Pro
- "S" => 105.000, # Ser
- "T" => 119.000, # Thr
- "W" => 204.000, # Trp
- "Y" => 181.000, # Tyr
- "V" => 117.000, # Val
- }
-
- mw = 0.0
-
- @seq.upcase.each_char do |c|
- raise "ERROR: Unknown amino acid: #{ c }" unless mol_weight_aa.include?( c )
- mw += mol_weight_aa[ c ]
- end
-
- mw
- end
- end
-
- # Class containing methods specific for nucleic acid (NA) sequences.
- class NA < Seq
- # Class containing methods specific for DNA sequences.
- class DNA < NA
- # Method that complements DNA sequence including ambiguity codes.
- def complement
- @seq.tr!( 'AGCUTRYWSMKHDVBNagcutrywsmkhdvbn', 'TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn' )
- end
- end
-
- # Class containing methods specific for RNA sequences.
- class RNA < NA
- # Method that complements RNA sequence including ambiguity codes.
- def complement
- @seq.tr!( 'AGCUTRYWSMKHDVBNagcutrywsmkhdvbn', 'UCGAAYRWSKMDHBVNucgaayrwskmdhbvn' )
- end
- end
- end
-end
+ # Guess the sequence type by analyzing the first 100 residues allowing for ambiguity codes.
+ def guess_type
+ raise ArgumentError, "No sequence." if self.empty?
+
+ seq_beg = self[0, 100].upcase
+
+ if seq_beg.count( "FLPQIE" ) > 0
+ Seq::AA.new(self)
+ elsif seq_beg.count("U") > 0
+ Seq::NA::RNA.new(self)
+ else
+ Seq::NA::DNA.new(self)
+ end
+ end
+
+ # Method to wrap a sequence to a given width using a given delimiter.
+ def wrap(width = 80, delimit = $/)
+ raise ArgumentError, "Cannot wrap sequence to negative width: #{width}." if width <= 0
+
+ self.delete!(" \t\n\r")
+ self.gsub(/.{#{width}}(?!$)/, "\\0#{delimit}")
+ end
+
+ # Method to wrap and replace a sequence to a given width using a given delimiter.
+ def wrap!(width = 80, delimit = $/)
+ self.replace(self.wrap(width, delimit))
+ end
+
+ # Method that generates a random sequence of a given length.
+ def generate(length)
+ raise ArgumentError, "Cannot generate negative sequence length: #{length}." if length <= 0
+
+ alph = self.residues
+ Array.new(length) { alph[rand(alph.size)] }.join("")
+ end
+
+ # Method that replaces sequence with a randomly generated sequence of a given length.
+ def generate!(length)
+ self.replace(self.generate(length))
+ end
+
+ # Class containing methods specific for amino acid (AA) sequences.
+ class AA < Seq
+ # Method that returns an array of amino acid residues.
+ def residues
+ %w{ F L S Y C W P H Q R I M T N K V A D E G }
+ end
+
+ # Calculate the molecular weight of an amino acid seuqunce.
+ # The caluculation is only approximate since there is no correction
+ # for amino bond formation and the MW used are somewhat imprecise:
+ # http://www.expasy.ch/tools/pscale/Molecularweight.html
+ def mol_weight
+ raise ArgumentError, "invalid residues found: #{self.delete("#{residues.join( "" )}")}" if self.upcase =~ /[^#{residues.join( "" )}]/
+
+ mol_weight_aa = {
+ "A" => 89.000, # Ala
+ "R" => 174.000, # Arg
+ "N" => 132.000, # Asn
+ "D" => 133.000, # Asp
+ "C" => 121.000, # Cys
+ "Q" => 146.000, # Gln
+ "E" => 147.000, # Glu
+ "G" => 75.000, # Gly
+ "H" => 155.000, # His
+ "I" => 131.000, # Ile
+ "L" => 131.000, # Leu
+ "K" => 146.000, # Lys
+ "M" => 149.000, # Met
+ "F" => 165.000, # Phe
+ "P" => 115.000, # Pro
+ "S" => 105.000, # Ser
+ "T" => 119.000, # Thr
+ "W" => 204.000, # Trp
+ "Y" => 181.000, # Tyr
+ "V" => 117.000, # Val
+ }
+
+ mw = 0.0
+
+ self.upcase.each_char { |c| mw += mol_weight_aa[ c ] }
+
+ mw
+ end
+ end