code_ruby/Maasha/lib/seq.rb

   1 # Copyright (C) 2007-2011 Martin A. Hansen.
   2
   3 # This program is free software; you can redistribute it and/or
   4 # modify it under the terms of the GNU General Public License
   5 # as published by the Free Software Foundation; either version 2
   6 # of the License, or (at your option) any later version.
   7
   8 # This program is distributed in the hope that it will be useful,
   9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  11 # GNU General Public License for more details.
  12
  13 # You should have received a copy of the GNU General Public License
  14 # along with this program; if not, write to the Free Software
  15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  16
  17 # http://www.gnu.org/copyleft/gpl.html
  18
  19 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  20
  21 # This software is part of the Biopieces framework (www.biopieces.org).
  22
  23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  24
  25 # Residue alphabets
  26 DNA     = %w[a t c g]
  27 RNA     = %w[a u c g]
  28 PROTEIN = %w[f l s y c w p h q r i m t n k v a d e g]
  29 INDELS  = %w[. - _ ~]
  30
  31 # Quality scores bases
  32 SCORE_PHRED    = 33
  33 SCORE_ILLUMINA = 64
  34
  35 # Error class for all exceptions to do with Seq.
  36 class SeqError < StandardError; end
  37
  38 class Seq
  39   attr_accessor :seq_name, :seq, :type, :qual
  40
  41   # Method that generates all possible oligos of a specifed length and type.
  42   def self.generate_oligos(length, type)
  43     raise SeqError, "Cannot generate negative oligo length: #{length}" if length <= 0
  44
  45     case type.downcase
  46     when /dna/     then alph = DNA
  47     when /rna/     then alph = RNA
  48     when /protein/ then alph = PROTEIN
  49     else
  50       raise SeqError, "Unknown sequence type: #{type}"
  51     end
  52
  53     oligos = [""]
  54
  55     (1 .. length).each do
  56       list = []
  57
  58       oligos.each do |oligo|
  59         alph.each do |char|
  60           list << oligo + char
  61         end
  62       end
  63
  64       oligos = list
  65     end
  66
  67     oligos
  68   end
  69
  70   # Initialize a sequence object with the following arguments:
  71   # - seq_name: Name of the sequence.
  72   # - seq: The sequence.
  73   # - type: The sequence type - DNA, RNA, or protein
  74   # - qual: An Illumina type quality scores string.
  75   def initialize(seq_name = nil, seq = nil, type = nil, qual = nil)
  76     @seq_name = seq_name
  77     @seq      = seq
  78     @type     = type
  79     @qual     = qual
  80   end
  81
  82   # Returns the length of a sequence.
  83   def length
  84     self.seq.nil? ? 0 : self.seq.length
  85   end
  86
  87   alias len length
  88
  89   # Return the number indels in a sequence.
  90   def indels
  91     regex = Regexp.new(/[#{Regexp.escape(INDELS.join(""))}]/)
  92     self.seq.scan(regex).size
  93   end
  94
  95   # Method that returns true is a given sequence type is DNA.
  96   def is_dna?
  97     self.type == 'dna'
  98   end
  99
 100   # Method that returns true is a given sequence type is RNA.
 101   def is_rna?
 102     self.type == 'rna'
 103   end
 104
 105   # Method that returns true is a given sequence type is protein.
 106   def is_protein?
 107     self.type == 'protein'
 108   end
 109
 110   # Method to transcribe DNA to RNA.
 111   def to_rna
 112     raise SeqError, "Cannot transcribe 0 length sequence" if self.length == 0
 113     raise SeqError, "Cannot transcribe sequence type: #{self.type}" unless self.is_dna?
 114     self.type = 'rna'
 115     self.seq.tr!('Tt','Uu')
 116   end
 117
 118   # Method to reverse-transcribe RNA to DNA.
 119   def to_dna
 120     raise SeqError, "Cannot reverse-transcribe 0 length sequence" if self.length == 0
 121     raise SeqError, "Cannot reverse-transcribe sequence type: #{self.type}" unless self.is_rna?
 122     self.type = 'dna'
 123     self.seq.tr!('Uu','Tt')
 124   end
 125
 126   # Method that given a Seq entry returns a Biopieces record (a hash).
 127   def to_bp
 128     raise SeqError, "Missing seq_name" if self.seq_name.nil?
 129     raise SeqError, "Missing seq"      if self.seq.nil?
 130     record             = {}
 131     record[:SEQ_NAME] = self.seq_name
 132     record[:SEQ]      = self.seq
 133     record[:SEQ_LEN]  = self.length
 134     record[:SCORES]   = self.qual if self.qual
 135     record
 136   end
 137
 138   # Method that given a Seq entry returns a FASTA entry (a string).
 139   def to_fasta(wrap = nil)
 140     raise SeqError, "Missing seq_name" if self.seq_name.nil?
 141     raise SeqError, "Missing seq"      if self.seq.nil?
 142
 143     seq_name = self.seq_name
 144     seq      = self.seq
 145
 146     unless wrap.nil?
 147       seq.gsub! /(.{#{wrap}})/ do |match|
 148         match << "\n"
 149       end
 150
 151       seq.chomp!
 152     end
 153
 154     ">#{seq_name}\n#{seq}\n"
 155   end
 156
 157   # Method that generates a unique key for a
 158   # DNA sequence and return this key as a Fixnum.
 159   def to_key
 160     key = 0
 161
 162     self.seq.upcase.each_char do |char|
 163       key <<= 2
 164
 165       case char
 166       when 'A' then key |= 0
 167       when 'C' then key |= 1
 168       when 'G' then key |= 2
 169       when 'T' then key |= 3
 170       else raise SeqError, "Bad residue: #{char}"
 171       end
 172     end
 173
 174     key
 175   end
 176
 177   # Method to reverse complement sequence.
 178   def reverse_complement
 179     self.reverse
 180     self.complement
 181   end
 182
 183   alias revcomp reverse_complement
 184
 185   # Method to reverse the sequence.
 186   def reverse
 187     self.seq.reverse!
 188   end
 189
 190   # Method that complements sequence including ambiguity codes.
 191   def complement
 192     raise SeqError, "Cannot complement 0 length sequence" if self.length == 0
 193
 194     if self.is_dna?
 195       self.seq.tr!( 'AGCUTRYWSMKHDVBNagcutrywsmkhdvbn', 'TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn' )
 196     elsif self.is_rna?
 197       self.seq.tr!( 'AGCUTRYWSMKHDVBNagcutrywsmkhdvbn', 'UCGAAYRWSKMDHBVNucgaayrwskmdhbvn' )
 198     else
 199       raise SeqError, "Cannot complement sequence type: #{self.type}"
 200     end
 201   end
 202
 203   # Method that generates a random sequence of a given length and type.
 204   def generate(length,type)
 205     raise SeqError, "Cannot generate sequence length < 1: #{length}" if length <= 0
 206
 207     case type.downcase
 208     when "dna"
 209       alph = DNA
 210     when "rna"
 211       alph = RNA
 212     when "protein"
 213       alph = PROTEIN
 214     else
 215       raise SeqError, "Unknown sequence type: #{type}"
 216     end
 217
 218     seq_new   = Array.new(length) { alph[rand(alph.size)] }.join("")
 219     self.seq  = seq_new
 220     self.type = type.downcase
 221     seq_new
 222   end
 223
 224   # Method that returns a subsequence of from a given start position
 225   # and of a given length.
 226   def subseq(start, length)
 227     raise SeqError, "sunsequence start: #{start} < 0"                                                if start  < 0
 228     raise SeqError, "subsequence length: #{length} < 1"                                              if length <= 0
 229     raise SeqError, "subsequence start + length > Seq.length: #{start} + #{length} > #{self.length}" if start + length > self.length
 230
 231     stop = start + length - 1
 232
 233     seq  = self.seq[start .. stop]
 234     qual = self.qual[start .. stop] unless self.qual.nil?
 235
 236     Seq.new(self.seq_name, seq, self.type, qual)
 237   end
 238
 239   # Method that returns a subsequence of a given length
 240   # beginning at a random position.
 241   def subseq_rand(length)
 242     if self.seq.length - length + 1 == 0
 243       start = 0
 244     else
 245       start = rand(self.seq.length - length + 1)
 246     end
 247
 248     self.subseq(start, length)
 249   end
 250
 251   # Method that returns the residue compositions of a sequence in
 252   # a hash where the key is the residue and the value is the residue
 253   # count.
 254   def composition
 255     comp = Hash.new(0);
 256
 257     self.seq.upcase.each_char do |char|
 258       comp[char] += 1
 259     end
 260
 261     comp
 262   end
 263
 264   # Method that returns the length of the longest homopolymeric stretch
 265   # found in a sequence.
 266   def homopol_max(min = 1)
 267     return 0 if self.seq.nil? or self.seq.empty?
 268
 269     found = false
 270
 271     self.seq.upcase.scan(/A{#{min},}|T{#{min},}|G{#{min},}|C{#{min},}|N{#{min},}/) do |match|
 272       found = true
 273       min   = match.size > min ? match.size : min
 274     end
 275
 276     return 0 unless found
 277
 278     min
 279   end
 280
 281   # Method that returns the percentage of hard masked residues
 282   # or N's in a sequence.
 283   def hard_mask
 284     ((self.seq.upcase.scan("N").size.to_f / (self.len - self.indels).to_f) * 100).round(2)
 285   end
 286
 287   # Method that returns the percentage of soft masked residues
 288   # or lower cased residues in a sequence.
 289   def soft_mask
 290     ((self.seq.scan(/[a-z]/).size.to_f / (self.len - self.indels).to_f) * 100).round(2)
 291   end
 292
 293   # Method to convert the quality scores from a specified base
 294   # to another base.
 295   def convert_phred2illumina!
 296     self.qual.gsub! /./ do |score|
 297       score_phred  = score.ord - SCORE_PHRED
 298       raise SeqError, "Bad Phred score: #{score} (#{score_phred})" unless (0 .. 40).include? score_phred
 299       score_illumina = score_phred + SCORE_ILLUMINA
 300       score          = score_illumina.chr
 301     end
 302   end
 303
 304   # Method to convert the quality scores from Solexa odd/ratio to
 305   # Illumina format.
 306   def convert_solexa2illumina!
 307     self.qual.gsub! /./ do |score|
 308       score = solexa_char2illumina_char(score)
 309     end
 310   end
 311
 312   private
 313
 314   # Method to convert a Solexa score (odd ratio) to
 315   # a phred (probability) integer score.
 316   def solexa2phred(score)
 317     (10.0 * Math.log(10.0 ** (score / 10.0) + 1.0, 10)).to_i
 318   end
 319
 320   # Method to convert a Solexa score encoded using base
 321   # 64 ASCII to a Phred score encoded using base 64 ASCII.
 322   def solexa_char2illumina_char(char)
 323     score_solexa = char.ord - 64
 324     score_phred  = solexa2phred(score_solexa)
 325     (score_phred + 64).chr
 326   end
 327 end
 328
 329 # Error class for all exceptions to do with Digest.
 330 class DigestError < StandardError; end
 331
 332 class Digest
 333   include Enumerable
 334
 335   # Initialize a digest object with the following arguments:
 336   # - seq: A sequence object.
 337   # - pattern: A restriction enzyme recognition pattern.
 338   # - cut_pos: Offset from match position where the enzyme cuts.
 339   def initialize(seq, pattern, cut_pos)
 340     @seq     = seq
 341     @pattern = disambiguate(pattern)
 342     @cut_pos = cut_pos
 343     @offset  = 0
 344   end
 345
 346   # Method to get the next digestion product from a sequence.
 347   def each
 348     @seq.seq.upcase.scan @pattern do
 349       pos = $`.length + @cut_pos - 1
 350
 351       if pos >= 0 and pos < @seq.seq.length - 2
 352         seq = Seq.new("#{@seq.seq_name}[#{@offset}-#{pos}]", @seq.seq[@offset .. pos], @seq.type)
 353
 354         yield seq
 355       end
 356
 357       @offset = pos + 1
 358     end
 359
 360     if @offset < 0
 361       @offset = 0
 362     elsif @offset > @seq.seq.length
 363       @offset = 0
 364     end
 365
 366     seq = Seq.new("#{@seq.seq_name}[#{@offset}-#{@seq.seq.length - 1}]", @seq.seq[@offset .. @seq.seq.length], @seq.type)
 367
 368     yield seq
 369
 370     self # conventionally
 371   end
 372
 373   private
 374
 375   # Method that returns a regexp object with a restriction
 376   # enzyme pattern with ambiguity codes substituted to the
 377   # appropriate regexp.
 378   def disambiguate(pattern)
 379     ambiguity = {
 380       'A' => "A",
 381       'T' => "T",
 382       'U' => "T",
 383       'C' => "C",
 384       'G' => "G",
 385       'M' => "[AC]",
 386       'R' => "[AG]",
 387       'W' => "[AT]",
 388       'S' => "[CG]",
 389       'Y' => "[CT]",
 390       'K' => "[GT]",
 391       'V' => "[ACG]",
 392       'H' => "[ACT]",
 393       'D' => "[AGT]",
 394       'B' => "[CGT]",
 395       'N' => "[GATC]"
 396     }
 397
 398     new_pattern = ""
 399
 400     pattern.upcase.each_char do |char|
 401       if ambiguity.has_key? char
 402         new_pattern << ambiguity[char]
 403       else
 404         raise DigestError, "Could not disambiguate residue: #{char}"
 405       end
 406     end
 407
 408     Regexp.new(new_pattern)
 409   end
 410 end
 411
 412 __END__
 413
 414
 415
 416
 417
 418 # Class containing generic sequence methods and nucleic acid and amino acid subclasses.
 419 class Seq < String
 420   # Guess the sequence type by analyzing the first 100 residues allowing for ambiguity codes.
 421   def guess_type
 422     raise ArgumentError, "No sequence." if self.empty?
 423
 424     seq_beg = self[0, 100].upcase
 425
 426     if seq_beg.count( "FLPQIE" ) > 0
 427       Seq::AA.new(self)
 428     elsif seq_beg.count("U") > 0
 429       Seq::NA::RNA.new(self)
 430     else
 431       Seq::NA::DNA.new(self)
 432     end
 433   end
 434
 435   # Method to wrap a sequence to a given width using a given delimiter.
 436   def wrap(width = 80, delimit = $/)
 437     raise ArgumentError, "Cannot wrap sequence to negative width: #{width}." if width <= 0
 438
 439     self.delete!(" \t\n\r")
 440     self.gsub(/.{#{width}}(?!$)/, "\\0#{delimit}")
 441   end
 442
 443   # Method to wrap and replace a sequence to a given width using a given delimiter.
 444   def wrap!(width = 80, delimit = $/)
 445     self.replace(self.wrap(width, delimit))
 446   end
 447
 448   # Method that replaces sequence with a randomly generated sequence of a given length.
 449   def generate!(length)
 450     self.replace(self.generate(length))
 451   end
 452
 453   # Class containing methods specific for amino acid (AA) sequences.
 454   class AA < Seq
 455     # Method that returns an array of amino acid residues.
 456     def residues
 457       %w{ F L S Y C W P H Q R I M T N K V A D E G }
 458     end
 459
 460     # Calculate the molecular weight of an amino acid seuqunce.
 461     # The caluculation is only approximate since there is no correction
 462     # for amino bond formation and the MW used are somewhat imprecise:
 463     # http://www.expasy.ch/tools/pscale/Molecularweight.html
 464     def mol_weight
 465       raise ArgumentError, "invalid residues found: #{self.delete("#{residues.join( "" )}")}" if self.upcase =~ /[^#{residues.join( "" )}]/
 466
 467       mol_weight_aa = {
 468         "A" => 89.000,    # Ala
 469         "R" => 174.000,   # Arg
 470         "N" => 132.000,   # Asn
 471         "D" => 133.000,   # Asp
 472         "C" => 121.000,   # Cys
 473         "Q" => 146.000,   # Gln
 474         "E" => 147.000,   # Glu
 475         "G" => 75.000,    # Gly
 476         "H" => 155.000,   # His
 477         "I" => 131.000,   # Ile
 478         "L" => 131.000,   # Leu
 479         "K" => 146.000,   # Lys
 480         "M" => 149.000,   # Met
 481         "F" => 165.000,   # Phe
 482         "P" => 115.000,   # Pro
 483         "S" => 105.000,   # Ser
 484         "T" => 119.000,   # Thr
 485         "W" => 204.000,   # Trp
 486         "Y" => 181.000,   # Tyr
 487         "V" => 117.000,   # Val
 488       }
 489
 490       mw = 0.0
 491
 492       self.upcase.each_char { |c| mw += mol_weight_aa[ c ] }
 493
 494       mw
 495     end
 496   end