require 'maasha/seq/trim'
require 'narray'
-autoload :BackTrack, 'maasha/seq/backtrack.rb'
-autoload :Dynamic, 'maasha/seq/dynamic.rb'
+autoload :BackTrack, 'maasha/seq/backtrack'
+autoload :Dynamic, 'maasha/seq/dynamic'
+autoload :Homopolymer, 'maasha/seq/homopolymer'
+autoload :Hamming, 'maasha/seq/hamming'
+autoload :Levenshtein, 'maasha/seq/levenshtein'
+autoload :Ambiguity, 'maasha/seq/ambiguity'
# Residue alphabets
# DNA: the four lowercase residues a, t, c and g, one string per residue.
DNA = 'atcg'.chars
"GTG" => "V", "GCG" => "A", "GAG" => "E", "GGG" => "G"
}
-
# Exception type raised for every error condition to do with Seq.
# It is a plain StandardError subclass and adds no behaviour of its own.
class SeqError < StandardError
end
type = record[:SEQ_TYPE].to_sym if record[:SEQ_TYPE]
qual = record[:SCORES]
- self.new(seq_name, seq, type, qual)
+ self.new(seq_name: seq_name, seq: seq, type: type, qual: qual)
end
# Class method that generates all possible oligos of a specified length and type.
oligos
end
- # Initialize a sequence object with the following arguments:
- # - seq_name: Name of the sequence.
- # - seq: The sequence.
- # - type: The sequence type - DNA, RNA, or protein
- # - qual: An Illumina type quality scores string.
- def initialize(seq_name = nil, seq = nil, type = nil, qual = nil)
- @seq_name = seq_name
- @seq = seq
- @type = type
- @qual = qual
+ # Initialize a sequence object from an options hash; a key that is not given leaves the attribute nil:
+ # - :seq_name Name of the sequence.
+ # - :seq The sequence.
+ # - :type The sequence type - DNA, RNA, or protein
+ # - :qual An Illumina type quality scores string; SeqError is raised if :seq and :qual are both given with different lengths.
+ def initialize(options = {})
+ @seq_name = options[:seq_name]
+ @seq = options[:seq]
+ @type = options[:type]
+ @qual = options[:qual]
+
+ if @seq and @qual and @seq.length != @qual.length
+ raise SeqError, "Sequence length and score length mismatch: #{@seq.length} != #{@qual.length}"
+ end
end
# Method that guesses and returns the sequence type
raise SeqError, "Missing seq_name" if self.seq_name.nil? or self.seq_name == ''
raise SeqError, "Missing seq" if self.seq.nil? or self.seq.empty?
- seq_name = self.seq_name.to_s
- seq = self.seq.to_s
+ seq_name = self.seq_name
+ seq = self.seq.dup
unless wrap.nil?
seq.gsub!(/(.{#{wrap}})/) do |match|
# Method to reverse the sequence: returns a new Seq whose seq is reversed
# and whose qual, when present, is reversed as well.
# seq_name and type are passed through unchanged.
def reverse
- Seq.new(self.seq_name, self.seq.reverse, self.type, self.qual ? self.qual.reverse : self.qual)
+ entry = Seq.new(
+ seq_name: self.seq_name,
+ seq: self.seq.reverse,
+ type: self.type,
+ qual: (self.qual ? self.qual.reverse : self.qual)
+ )
+
+ entry
end
# Method to complement the sequence.
def complement
raise SeqError, "Cannot complement 0 length sequence" if self.length == 0
- entry = Seq.new
- entry.seq_name = self.seq_name
- entry.type = self.type
- entry.qual = self.qual
+ entry = Seq.new(
+ seq_name: self.seq_name,
+ type: self.type,
+ qual: self.qual
+ )
if self.is_dna?
entry.seq = self.seq.tr('AGCUTRYWSMKHDVBNagcutrywsmkhdvbn', 'TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn')
# Method to determine the Hamming Distance between
# two Sequence objects.
# With a truthy options[:ambiguity] the sequences are handed as-is to
# Hamming.distance; otherwise both sequences are upcased first (case
# insensitive) and compared with the hamming_distance method of the
# upcased sequence.
# NOTE(review): the ambiguity branch does no upcasing, so its case
# handling depends on Hamming.distance - confirm.
- def hamming_distance(seq)
- self.seq.upcase.hamming_distance(seq.seq.upcase)
+ def hamming_distance(entry, options = nil)
+ if options and options[:ambiguity]
+ Hamming.distance(self.seq, entry.seq)
+ else
+ self.seq.upcase.hamming_distance(entry.seq.upcase)
+ end
+ end
+
+ # Method to determine the Edit Distance between two Sequence objects by passing both sequences to Levenshtein.distance.
+ # NOTE(review): no upcasing is done here, so any case insensitivity depends on Levenshtein.distance - confirm.
+ def edit_distance(entry)
+ Levenshtein.distance(self.seq, entry.seq)
end
# Method that generates a random sequence of a given length and type.
# Method to return a new Seq object with shuffled sequence.
# Only seq is shuffled; seq_name, type and qual are passed on as they are.
# NOTE(review): qual is not permuted together with seq, so the scores no
# longer line up with the shuffled residues - confirm this is intended.
def shuffle
- Seq.new(self.seq_name, self.seq.split('').shuffle!.join, self.type, self.qual)
+ Seq.new(
+ seq_name: self.seq_name,
+ seq: self.seq.split('').shuffle!.join,
+ type: self.type,
+ qual: self.qual
+ )
end
# Method to shuffle a sequence randomly inline.
self
end
+ # Method to add two Seq objects: returns a new Seq (seq_name is not set) holding self.seq + entry.seq; type is set only if both types are equal, qual only if both entries have qual.
+ def +(entry)
+ new_entry = Seq.new()
+ new_entry.seq = self.seq + entry.seq
+ new_entry.type = self.type if self.type == entry.type
+ new_entry.qual = self.qual + entry.qual if self.qual and entry.qual
+ new_entry
+ end
+
# Method to concatenate sequence entries.
def <<(entry)
raise SeqError, "sequences of different types" unless self.type == entry.type
self
end
+ # Index method for Seq objects: returns a new Seq with seq set to self.seq[*args] and, unless self.qual is nil, qual set to self.qual[*args]; seq_name and type are copied.
+ def [](*args)
+ entry = Seq.new
+ entry.seq_name = self.seq_name
+ entry.seq = self.seq[*args]
+ entry.type = self.type
+ entry.qual = self.qual[*args] unless self.qual.nil?
+
+ entry
+ end
+
+ # Index assignment method for Seq objects: overwrites self.seq[*args] (and self.qual[*args] unless self.qual is nil) and returns self. NOTE(review): the replacement is entry.seq[*args] / entry.qual[*args], i.e. the same index is applied to entry, and entry.qual is not nil-checked - confirm both are intended.
+ def []=(*args, entry)
+ self.seq[*args] = entry.seq[*args]
+ self.qual[*args] = entry.qual[*args] unless self.qual.nil?
+
+ self
+ end
+
# Method that returns a subsequence from a given start position
# and of a given length.
def subseq(start, length = self.length - start)
qual = self.qual[start .. stop] unless self.qual.nil?
end
- Seq.new(self.seq_name, seq, self.type, qual) # TODO changed self.seq_name.dup to self.seq_name -> consequence?
+ seq_name = self.seq_name.nil? ? nil : self.seq_name.dup
+
+ Seq.new(seq_name: seq_name, seq: seq, type: self.type, qual: qual)
end
# Method that replaces a sequence with a subsequence from a given start position
comp
end
- # Method that returns the length of the longest homopolymeric stretch
- # found in a sequence.
- def homopol_max(min = 1)
- return 0 if self.seq.nil? or self.seq.empty?
-
- found = false
-
- self.seq.upcase.scan(/A{#{min},}|T{#{min},}|G{#{min},}|C{#{min},}|N{#{min},}/) do |match|
- found = true
- min = match.size > min ? match.size : min
- end
-
- return 0 unless found
-
- min
- end
-
# Method that returns the percentage of hard masked residues
# or N's in a sequence.
def hard_mask
regex_start = Regexp.new(start_codons.join('|'), true)
regex_stop = Regexp.new(stop_codons.join('|'), true)
- while pos_beg and pos_beg < self.length - size_min
- if pos_beg = self.seq.index(regex_start, pos_beg)
- if pos_end = self.seq.index(regex_stop, pos_beg)
- length = (pos_end - pos_beg) + 3
+ while pos_beg = self.seq.index(regex_start, pos_beg)
+ pos_end = pos_beg + 3
+
+ while pos_end = self.seq.index(regex_stop, pos_end)
+ length = (pos_end - pos_beg) + 3
- if (length % 3) == 0
- if size_min <= length and length <= size_max
- subseq = self.subseq(pos_beg, length)
+ if (length % 3) == 0
+ if size_min <= length and length <= size_max
+ subseq = self.subseq(pos_beg, length)
- orfs << [subseq, pos_beg, pos_end + 3]
- end
+ orfs << [subseq, pos_beg, pos_end + 3]
end
+
+ break
end
- pos_beg += 1
+ pos_end += 1
end
+
+ pos_beg += 1
end
if pick_longest