]> git.donarmstrong.com Git - biopieces.git/blobdiff - code_ruby/lib/maasha/seq.rb
fixed another bug in find orfs
[biopieces.git] / code_ruby / lib / maasha / seq.rb
index 3c1c156b9e804edf3cc636a233ca8a085c37f92a..92d33982e90bdfbab281e9a4249e06b1b4d96fe6 100644 (file)
 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 
 require 'maasha/bits'
-require 'maasha/backtrack'
+require 'maasha/seq/backtrack'
 require 'maasha/seq/digest'
+#require 'maasha/seq/patscan'
 require 'maasha/seq/patternmatcher'
 require 'maasha/seq/trim'
 require 'narray'
-#require 'maasha/patscan'
 
 # Residue alphabets
 DNA     = %w[a t c g]
@@ -324,7 +324,6 @@ class Seq
     raise SeqError, "subsequence length: #{length} < 0"                                              if length < 0
     raise SeqError, "subsequence start + length > Seq.length: #{start} + #{length} > #{self.length}" if start + length > self.length
 
-
     if length == 0
       seq  = ""
       qual = "" unless self.qual.nil?
@@ -335,8 +334,7 @@ class Seq
       qual = self.qual[start .. stop] unless self.qual.nil?
     end
 
-
-    Seq.new(self.seq_name, seq, self.type, qual)
+    Seq.new(self.seq_name.dup, seq, self.type, qual)
   end
 
   # Method that replaces a sequence with a subsequence from a given start position
@@ -499,6 +497,56 @@ class Seq
 
     self
   end
+
+  # Method to calculate and return the mean quality score.
+  def scores_mean
+    raise SeqError, "Missing qual in entry" if self.qual.nil?
+
+    na_qual = NArray.to_na(self.qual, "byte")
+    na_qual -= SCORE_BASE
+    na_qual.mean
+  end
+
+  # Method to find open reading frames (ORFs).
+  def each_orf(size_min, size_max, start_codons, stop_codons, pick_longest = false)
+    orfs    = []
+    pos_beg = 0
+
+    regex_start = Regexp.new(start_codons.join('|'), true)
+    regex_stop  = Regexp.new(stop_codons.join('|'), true)
+
+    while pos_beg and pos_beg < self.length - size_min
+      if pos_beg = self.seq.index(regex_start, pos_beg)
+        if pos_end = self.seq.index(regex_stop, pos_beg)
+          length = (pos_end - pos_beg) + 3
+
+          if (length % 3) == 0
+            if size_min <= length and length <= size_max
+              subseq = self.subseq(pos_beg, length)
+
+              orfs << [subseq, pos_beg, pos_end + 3]
+            end
+          end
+        end
+
+        pos_beg += 1
+      end
+    end
+
+    if pick_longest
+      orf_hash = {}
+
+      orfs.each { |orf| orf_hash[orf.last] = orf unless orf_hash[orf.last] }
+
+      orfs = orf_hash.values
+    end
+
+    if block_given?
+      orfs.each { |orf| yield orf }
+    else
+      return orfs
+    end
+  end
 end
 
 __END__