code_ruby/lib/maasha/align.rb

   1 # Copyright (C) 2007-2011 Martin A. Hansen.
   2
   3 # This program is free software; you can redistribute it and/or
   4 # modify it under the terms of the GNU General Public License
   5 # as published by the Free Software Foundation; either version 2
   6 # of the License, or (at your option) any later version.
   7
   8 # This program is distributed in the hope that it will be useful,
   9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  11 # GNU General Public License for more details.
  12
  13 # You should have received a copy of the GNU General Public License
  14 # along with this program; if not, write to the Free Software
  15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  16
  17 # http://www.gnu.org/copyleft/gpl.html
  18
  19 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  20
  21 # This software is part of the Biopieces framework (www.biopieces.org).
  22
  23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  24
  25 require 'pp'
  26 require 'open3'
  27 require 'narray'
  28 require 'maasha/fasta'
  29
  30 class AlignError < StandardError; end;
  31
  32 ALPH_DNA  = %w{A T C G}
  33 ALPH_AMBI = %w{A T C G M R W S Y K V H D B N}
  34
  35 BIT_INDEL = 0
  36 # BIT_A = 1 << 0
  37 # BIT_T = 1 << 1
  38 # BIT_C = 1 << 2
  39 # BIT_G = 1 << 3
  40
  41 BIT_M = BIT_A | BIT_C
  42 BIT_R = BIT_A | BIT_G
  43 BIT_W = BIT_A | BIT_T
  44 BIT_S = BIT_C | BIT_G
  45 BIT_Y = BIT_C | BIT_T
  46 BIT_K = BIT_G | BIT_T
  47 BIT_V = BIT_A | BIT_C | BIT_G
  48 BIT_H = BIT_A | BIT_C | BIT_T
  49 BIT_D = BIT_A | BIT_G | BIT_T
  50 BIT_B = BIT_C | BIT_G | BIT_T
  51 BIT_N = BIT_G | BIT_A | BIT_T | BIT_C
  52
  53 BITMAP = [
  54   BIT_A,
  55   BIT_T,
  56   BIT_C,
  57   BIT_G,
  58   BIT_M,
  59   BIT_R,
  60   BIT_W,
  61   BIT_S,
  62   BIT_Y,
  63   BIT_K,
  64   BIT_V,
  65   BIT_H,
  66   BIT_D,
  67   BIT_B,
  68   BIT_N
  69 ]
  70
  71 TR_NUC = "-"                   + ALPH_AMBI.join("").downcase
  72 TR_HEX = [BIT_INDEL].pack("C") + BITMAP.pack("C*")
  73
  74 ROW_A = 0
  75 ROW_T = 1
  76 ROW_C = 2
  77 ROW_G = 3
  78
  79 class Fasta
  80   def initialize(io)
  81     @io = io
  82   end
  83
  84   def close
  85     @io.close
  86   end
  87 end
  88
  89 # Class for aligning sequences.
  90 class Align
  91   attr_accessor :options
  92   attr_reader   :entries
  93
  94   # Class method to align sequences in a list of Seq objects and
  95   # return these as a new list of Seq objects.
  96   def self.muscle(entries, verbose = false)
  97     result   = []
  98     index    = {}
  99
 100     Open3.popen3("muscle", "-quiet") do |stdin, stdout, stderr|
 101       entries.each do |entry|
 102         raise AlignError, "Duplicate sequence name: #{entry.seq_name}" if index.has_key? entry.seq_name
 103
 104         index[entry.seq_name] = entry.dup
 105
 106         stdin.puts entry.to_fasta
 107       end
 108
 109       stdin.close
 110
 111       stderr.each_line { |line| $stderr.puts line } if verbose
 112
 113       aligned_entries = Fasta.new(stdout)
 114
 115       aligned_entries.each do |fa_entry|
 116         fq_entry = index[fa_entry.seq_name]
 117
 118         fa_entry.seq.scan(/-+/) do |m|
 119           fq_entry.seq  = fq_entry.seq[0 ... $`.length]  + ('-' * m.length) + fq_entry.seq[$`.length .. -1]
 120           fq_entry.qual = fq_entry.qual[0 ... $`.length] + ('@' * m.length) + fq_entry.qual[$`.length .. -1] unless fq_entry.qual.nil?
 121         end
 122
 123         result << fq_entry
 124       end
 125     end
 126
 127     self.new(result)
 128   end
 129
 130   # Class method to create a pairwise alignment of two given Seq objects. The
 131   # alignment is created by casting a search space the size of the sequences
 132   # and save the longest perfect match between the sequences and recurse into
 133   # the left and right search spaced around this match. When all search spaces
 134   # are exhausted the saved matches are used to insert gaps in the sequences.
 135   def self.pair(q_entry, s_entry)
 136     q_space_beg = 0
 137     q_space_end = q_entry.length
 138     s_space_beg = 0
 139     s_space_end = s_entry.length
 140
 141     q_entry.seq.downcase!
 142     s_entry.seq.downcase!
 143
 144     ap = AlignPair.new(q_entry.seq, s_entry.seq)
 145
 146     ap.align(q_space_beg, q_space_end, s_space_beg, s_space_end, 256)
 147
 148     ap.upcase_match
 149     ap.insert_gaps
 150
 151     #ap.dump_bp and exit
 152
 153     self.new([q_entry, s_entry])
 154   end
 155
 156   # Method to initialize an Align object with a list of aligned Seq objects.
 157   def initialize(entries, options = {})
 158     @entries = entries
 159     @options = options
 160   end
 161
 162   # Method that returns the length of the alignment.
 163   def length
 164     @entries.first.length
 165   end
 166
 167   # Method that returns the number of members or sequences in the alignment.
 168   def members
 169     @entries.size
 170   end
 171
 172   # Method that returns the identity of an alignment with two members.
 173   def identity
 174     if self.members != 2
 175       raise AlignError "Bad number of members for similarity calculation: #{self.members}"
 176     end
 177
 178     na1 = NArray.to_na(@entries[0].seq.upcase, "byte")
 179     na2 = NArray.to_na(@entries[1].seq.upcase, "byte")
 180
 181     shared   = (na1 - na2).count_false
 182     total    = (@entries[0].length < @entries[1].length) ? @entries[0].length : @entries[1].length
 183     identity = shared.to_f / total
 184
 185     identity
 186   end
 187
 188   # Method to create a consensus sequence from an Align object and
 189   # return a new Seq object with the consensus.
 190   def consensus
 191     cons = Consensus.new(@entries, @options)
 192     cons.consensus
 193   end
 194
 195   # Method to pretty print an alignment from an Align object.
 196   def to_s
 197     cons = Consensus.new(@entries, @options)
 198     cons.mask_entries!
 199
 200     max_name = @entries.group_by { |entry| entry.seq_name.length }.max.first
 201
 202     output = ""
 203
 204     @entries.each do |entry|
 205       output << entry.seq_name + (" " * (max_name + 3 - entry.seq_name.length )) + entry.seq + $/
 206     end
 207
 208     cons_entry = cons.consensus
 209
 210     output << " " * (max_name + 3) + cons_entry.seq
 211     output << $/ + " " * (max_name + 3) + cons_entry.qual.tr("[@-h]", "           ..........ooooooooooOOOOOOOOOO") unless cons_entry.qual.nil?
 212     output
 213   end
 214
 215   # Method for iterating each of the aligned sequences.
 216   def each
 217     if block_given?
 218       @entries.each { |entry| yield entry }
 219     else
 220       return @entries
 221     end
 222   end
 223
 224   private
 225
 226   # Class for creating a pairwise alignment of two specified sequences. The
 227   # alignment is created by casting a search space the size of the sequences
 228   # and save the longest perfect match between the sequences and recurse into
 229   # the left and right search spaced around this match. When all search spaces
 230   # are exhausted the saved matches are used to insert gaps in the sequences.
 231   class AlignPair
 232     attr_reader :matches
 233
 234     # Method to initialize an AlignPair object given two sequence strings.
 235     def initialize(q_seq, s_seq)
 236       @q_seq   = q_seq
 237       @s_seq   = s_seq
 238       @matches = []
 239     end
 240
 241     # Method that locates the longest perfect match in a specified search space
 242     # by comparing two sequence indeces. The first index is created by hashing
 243     # for the first sequence the position of all non-overlapping unique oligos
 244     # with the oligo as key an the position as value. The second index is
 245     # created for the second sequence by hashing the position of all
 246     # overlapping unique oligos in a similar way. The longest match is located
 247     # by comparing these indeces and the longest match is expanded maximally
 248     # within the specified search space. Finally, the longest match is used
 249     # to define a left and a right search space which are recursed into
 250     # until no more matches are found.
 251     def align(q_space_beg, q_space_end, s_space_beg, s_space_end, length)
 252 #      puts "search space: #{q_space_beg}-#{q_space_end} x #{s_space_beg}-#{s_space_end}"
 253       new_matches = []
 254
 255       while new_matches.empty? and length > 0
 256         if @q_seq.length >= length and @s_seq.length >= length
 257           q_index = index_seq(q_space_beg, q_space_end, @q_seq, length, length)
 258           s_index = index_seq(s_space_beg, s_space_end, @s_seq, length, 1)
 259           new_matches = find_matches(q_index, s_index, length)
 260         end
 261
 262         unless new_matches.empty?
 263           longest_match = new_matches.sort_by { |match| match.length }.last
 264           longest_match.expand(@q_seq, @s_seq, q_space_beg, q_space_end, s_space_beg, s_space_end)
 265 #          pp longest_match
 266
 267           @matches << longest_match
 268
 269           q_space_beg_left  = (q_space_beg == 0) ? 0 : q_space_beg + 1
 270           s_space_beg_left  = (s_space_beg == 0) ? 0 : s_space_beg + 1
 271           q_space_end_left  = longest_match.q_beg - 1
 272           s_space_end_left  = longest_match.s_beg - 1
 273
 274           q_space_beg_right = longest_match.q_end + 1
 275           s_space_beg_right = longest_match.s_end + 1
 276           q_space_end_right = (q_space_end == @q_seq.length) ? @q_seq.length : q_space_end - 1
 277           s_space_end_right = (s_space_end == @s_seq.length) ? @s_seq.length : s_space_end - 1
 278
 279           if q_space_end_left - q_space_beg_left > 0 and s_space_end_left - s_space_beg_left > 0
 280             align(q_space_beg_left,  q_space_end_left,  s_space_beg_left,  s_space_end_left,  length)
 281           end
 282
 283           if q_space_end_right - q_space_beg_right > 0 and s_space_end_right - s_space_beg_right > 0
 284             align(q_space_beg_right, q_space_end_right, s_space_beg_right, s_space_end_right, length)
 285           end
 286         end
 287
 288         length /= 2
 289       end
 290     end
 291
 292     # Method for debugging purposes that upcase matching sequence while non-matches
 293     # sequence is kept in lower case.
 294     def upcase_match
 295       @matches.each do |match|
 296         @q_seq[match.q_beg .. match.q_end] = @q_seq[match.q_beg .. match.q_end].upcase
 297         @s_seq[match.s_beg .. match.s_end] = @s_seq[match.s_beg .. match.s_end].upcase
 298       end
 299     end
 300
 301     # Method that insert gaps in sequences based on a list of matches and thus
 302     # creating an alignment.
 303     # TODO check boundaries!
 304     def insert_gaps
 305       @matches.sort_by! { |m| m.q_beg }
 306
 307       q_gaps = 0
 308       s_gaps = 0
 309
 310       @matches.each do |match|
 311         diff = (q_gaps + match.q_beg) - (s_gaps + match.s_beg)
 312
 313         #pp "q_gaps #{q_gaps}   s_gaps #{s_gaps}   diff #{diff}"
 314
 315         if diff < 0
 316           @q_seq.insert(match.q_beg + q_gaps, "-" * diff.abs)
 317           q_gaps += diff.abs
 318         elsif diff > 0
 319           @s_seq.insert(match.s_beg + s_gaps, "-" * diff.abs)
 320           s_gaps += diff.abs
 321         end
 322       end
 323
 324       diff = @q_seq.length - @s_seq.length
 325
 326       if diff < 0
 327         @q_seq << ("-" * diff.abs)
 328       else
 329         @s_seq << ("-" * diff.abs)
 330       end
 331     end
 332
 333     # Method that dumps all matches as Biopiece records for use with
 334     # plot_matches.
 335     def dump_bp
 336       @matches.sort_by! { |m| m.q_beg }
 337
 338       @matches.each { |m|
 339         puts "Q_BEG: #{m.q_beg}"
 340         puts "Q_END: #{m.q_end}"
 341         puts "S_BEG: #{m.s_beg}"
 342         puts "S_END: #{m.s_end}"
 343         puts "STRAND: +"
 344         puts "---"
 345       }
 346     end
 347
 348     private
 349
 350     # Method for indexing a sequence given a search space, oligo size
 351     # and step size. All oligo positions are saved in a lookup hash
 352     # where the oligo is the key and the position is the value. Non-
 353     # unique oligos are removed and the index returned.
 354     def index_seq(space_beg, space_end, seq, oligo_size, step_size)
 355       index       = Hash.new
 356       index_count = Hash.new(0)
 357
 358       (space_beg ... space_end).step(step_size).each do |pos|
 359         oligo = seq[pos ... pos + oligo_size]
 360
 361         if oligo.length == oligo_size
 362           index[oligo]        = pos
 363           index_count[oligo] += 1
 364         end
 365       end
 366
 367       index_count.each do |oligo, count|
 368         index.delete(oligo) if count > 1
 369       end
 370
 371       index
 372     end
 373
 374     # Method for locating matches between two indexes where one is created
 375     # using non-overlapping unique oligos and their positions and one is
 376     # using overlapping unique oligos and positions. Thus iterating over the
 377     # non-overlapping oligos and checking last match, this can be extended.
 378     def find_matches(q_index, s_index, length)
 379       matches = []
 380
 381       q_index.each do |oligo, q_pos|
 382         if s_index[oligo]
 383           s_pos = s_index[oligo]
 384
 385           last_match = matches.last
 386           new_match  = Match.new(q_pos, s_pos, length)
 387
 388           if last_match and
 389              last_match.q_beg + last_match.length == new_match.q_beg and
 390              last_match.s_beg + last_match.length == new_match.s_beg
 391
 392              last_match.length += length
 393           else
 394             matches << new_match
 395           end
 396         end
 397       end
 398
 399       matches
 400     end
 401
 402     # Class for Match objects.
 403     class Match
 404       attr_accessor :length
 405       attr_reader :q_beg, :s_beg
 406
 407       def initialize(q_beg, s_beg, length)
 408         @q_beg  = q_beg
 409         @s_beg  = s_beg
 410         @length = length
 411       end
 412
 413       def q_end
 414         @q_beg + @length - 1
 415       end
 416
 417       def s_end
 418         @s_beg + @length - 1
 419       end
 420
 421       def to_s
 422         "q_beg=#{q_beg} q_end=#{q_end} s_beg=#{s_beg} s_end=#{s_end} length=#{length}"
 423       end
 424
 425       # Method to expand a match as far as possible to the left and right
 426       # within a given search space.
 427       def expand(q_seq, s_seq, q_space_beg, q_space_end, s_space_beg, s_space_end)
 428         expand_left(q_seq, s_seq, q_space_beg, s_space_beg)
 429         expand_right(q_seq, s_seq, q_space_end, s_space_end)
 430       end
 431
 432       private
 433
 434       # Method to expand a match as far as possible to the left within a given
 435       # search space.
 436       def expand_left(q_seq, s_seq, q_space_beg, s_space_beg)
 437         while @q_beg > q_space_beg and @s_beg > s_space_beg and q_seq[@q_beg - 1] == s_seq[@s_beg - 1]
 438           @q_beg  -= 1
 439           @s_beg  -= 1
 440           @length += 1
 441         end
 442       end
 443
 444       # Method to expand a match as far as possible to the right within a given
 445       # search space.
 446       def expand_right(q_seq, s_seq, q_space_end, s_space_end)
 447         while q_end + 1 <= q_space_end and s_end + 1 <= s_space_end and q_seq[q_end + 1] == s_seq[s_end + 1]
 448           @length += 1
 449         end
 450       end
 451     end
 452   end
 453
 454   class Consensus
 455     # Method to initialize a Consensus object given a list of aligned Seq object.
 456     def initialize(entries, options)
 457       @entries = entries
 458       @options = options
 459
 460       @cols = entries.first.seq.length
 461       @rows = entries.size
 462
 463       @has_qual = entries.first.qual.nil? ? false : true
 464
 465       @na_seq  = NArray.byte(@cols, @rows)
 466       @na_qual = NArray.byte(@cols, @rows) if @has_qual
 467
 468       na_add_entries
 469       consensus_calc
 470     end
 471
 472     # Method that lowercase residues that have been removed during
 473     # the determination of the consensus sequence.
 474     def mask_entries!
 475       na_seq = NArray.byte(@cols, @rows)
 476
 477       @entries.each_with_index do |entry, i|
 478         na_seq[true, i]  = NArray.to_na(entry.seq.upcase, "byte")
 479       end
 480
 481       na_seq += ((na_seq.ne('-'.ord) - @na_seq.ne(0)) * ' '.ord)
 482
 483       @entries.each_with_index do |entry, i|
 484         entry.seq = na_seq[true, i].to_s
 485       end
 486     end
 487
 488     # Method that returns a Sequence object with a consensus sequence
 489     # for the entries in an Align object.
 490     def consensus
 491       new_seq      = Seq.new
 492       new_seq.seq  = consensus_seq
 493       new_seq.qual = consensus_qual if @has_qual
 494       new_seq.type = "dna"
 495
 496       new_seq
 497     end
 498
 499     private
 500
 501     # Method to add a Seq entry object to the two NArrays; @na_seq and @na_qual
 502     def na_add_entries
 503       @entries.each_with_index do |entry, i|
 504         @na_seq[true, i]  = NArray.to_na(entry.seq.downcase.tr(TR_NUC, TR_HEX), "byte")
 505         @na_qual[true, i] = NArray.to_na(entry.qual, "byte") - SCORE_BASE if @has_qual
 506       end
 507     end
 508
 509     # Method to calculate a consensus sequence from a list of sequenced stored in two
 510     # NArrays.
 511     def consensus_calc
 512       if @has_qual
 513         if @options.has_key? :quality_min
 514           mask = mask_quality_min
 515
 516           @na_seq  *= mask
 517           @na_qual *= mask
 518         end
 519
 520         if @options.has_key? :quality_mean
 521           mask = mask_quality_mean
 522
 523           @na_seq  *= mask
 524           @na_qual *= mask
 525         end
 526       end
 527
 528       if @options.has_key? :sequence_min
 529         mask = mask_sequence_min
 530
 531         @na_seq  *= mask
 532         @na_qual *= mask if @has_qual
 533       end
 534
 535       if @options.has_key? :gap_max
 536         mask = mask_gap_max
 537
 538         @na_seq  *= mask
 539         @na_qual *= mask if @has_qual
 540       end
 541
 542       if @options.has_key? :residue_min
 543         mask = mask_residue_min
 544
 545         @na_seq  *= mask
 546         @na_qual *= mask if @has_qual
 547       end
 548     end
 549
 550     # Mask that indicates which columns have more than sequence_min sequences.
 551     # Columns with less than sequence_min are 0'ed, otherwise set to 1.
 552     def mask_sequence_min
 553       mask  = NArray.byte(@cols, @rows) + 1
 554       mask *= ((@na_seq > 0).to_type("int").sum(1) >= @options[:sequence_min])
 555       mask
 556     end
 557
 558     # Mask that indicates which residue frequencies that are above the residue_min.
 559     # The residue frequencies are calculated for each column and residue type as the
 560     # number of each residue type over the sum of all non-gap residues in that column.
 561     # Positions with residues above the residue_min are indicated with 1.
 562     def mask_residue_min
 563       cons_min = @options[:residue_min]
 564       factor   = 1 / @na_seq.ne(0).to_type("float").sum(1)
 565
 566       mask_A = (@na_seq & BIT_A > 0).to_type("int")
 567       mask_T = (@na_seq & BIT_T > 0).to_type("int")
 568       mask_C = (@na_seq & BIT_C > 0).to_type("int")
 569       mask_G = (@na_seq & BIT_G > 0).to_type("int")
 570
 571       mask_A = (mask_A * mask_A.sum(1)) * factor >= cons_min
 572       mask_T = (mask_T * mask_T.sum(1)) * factor >= cons_min
 573       mask_C = (mask_C * mask_C.sum(1)) * factor >= cons_min
 574       mask_G = (mask_G * mask_G.sum(1)) * factor >= cons_min
 575
 576       mask_A | mask_T | mask_C | mask_G
 577     end
 578
 579     # Mask that indicates which columns contain fewer gaps than max_gap.
 580     # Columns with more gaps are 0'ed, others are set to 1.
 581     def mask_gap_max
 582       mask  = NArray.byte(@cols, @rows) + 1
 583       mask *= @na_seq.ne(0).to_type("float").sum(1) / @rows > @options[:gap_max]
 584
 585       mask
 586     end
 587
 588     # Mask that indicates which residues in an alignment are above quality_min.
 589     # Positions with subquality are 0'ed - all others are set to 1.
 590     def mask_quality_min
 591       @na_qual > @options[:quality_min]
 592     end
 593
 594     # Mask that indicates which columns have a quality mean above quality_mean which
 595     # is the mean of all non-gap quality residues in that column. Columns with less then
 596     # quality_mean are 0'ed, otherwise set to 1.
 597     def mask_quality_mean
 598       mask     = NArray.byte(@cols, @rows) + 1
 599       residues = @na_seq.ne(0).to_type("int").sum(1)
 600       quality  = @na_qual.to_type("float").sum(1)
 601
 602       mask * (quality / residues).round > @options[:quality_mean]
 603     end
 604
 605     # Method to calculate a consensus sequence from a Consensus object.
 606     def consensus_seq
 607       cons  = NArray.byte(@cols)
 608       cons |= (@na_seq & BIT_A).max(1)
 609       cons |= (@na_seq & BIT_T).max(1)
 610       cons |= (@na_seq & BIT_C).max(1)
 611       cons |= (@na_seq & BIT_G).max(1)
 612
 613       cons.to_s.tr!(TR_HEX, TR_NUC).upcase
 614     end
 615
 616     # Method to calculate a consensus quality from a Consensus object.
 617     def consensus_qual
 618       (@na_qual.mean(1).round.to_type("byte") + SCORE_BASE).to_s
 619     end
 620   end
 621 end
 622
 623 __END__