From 1e52229347317599a25924657e0766231587b401 Mon Sep 17 00:00:00 2001 From: martinahansen Date: Sat, 19 Mar 2011 20:38:52 +0000 Subject: [PATCH] worked on match code for ruby Seq class git-svn-id: http://biopieces.googlecode.com/svn/trunk@1301 74ccb610-7750-0410-82ae-013aeee3265d --- code_ruby/Maasha/lib/doc/created.rid | 17 ++- code_ruby/Maasha/lib/doc/index.html | 210 +++++++++++++++++++++++++-- code_ruby/Maasha/lib/seq.rb | 150 ++++++++++--------- code_ruby/Maasha/test/test_seq.rb | 20 +-- 4 files changed, 310 insertions(+), 87 deletions(-) diff --git a/code_ruby/Maasha/lib/doc/created.rid b/code_ruby/Maasha/lib/doc/created.rid index 7d21b41..36dfd11 100644 --- a/code_ruby/Maasha/lib/doc/created.rid +++ b/code_ruby/Maasha/lib/doc/created.rid @@ -1,4 +1,13 @@ -Mon, 27 Sep 2010 12:56:47 +0200 -biopieces.rb Mon, 13 Sep 2010 09:27:58 +0200 -fasta.rb Mon, 06 Sep 2010 16:36:25 +0200 -seq.rb Mon, 20 Sep 2010 10:38:53 +0200 +Sat, 19 Mar 2011 16:29:12 +0100 +./base36.rb Fri, 11 Feb 2011 19:42:32 +0100 +./biopieces.rb Sat, 19 Mar 2011 09:14:44 +0100 +./bitarray.rb Thu, 17 Mar 2011 15:02:03 +0100 +./bits.rb Tue, 08 Mar 2011 11:37:57 +0100 +./boulder.rb Sat, 19 Mar 2011 09:17:13 +0100 +./digest.rb Sat, 19 Mar 2011 15:41:06 +0100 +./fasta.rb Sat, 19 Mar 2011 09:11:06 +0100 +./fastq.rb Sat, 19 Mar 2011 09:09:52 +0100 +./filesys.rb Fri, 18 Mar 2011 22:13:22 +0100 +./genbank.rb Sat, 19 Mar 2011 09:19:25 +0100 +./seq.rb Sat, 19 Mar 2011 16:29:08 +0100 +./sff.rb Fri, 11 Feb 2011 19:55:38 +0100 diff --git a/code_ruby/Maasha/lib/doc/index.html b/code_ruby/Maasha/lib/doc/index.html index 7622b70..4979e8a 100644 --- a/code_ruby/Maasha/lib/doc/index.html +++ b/code_ruby/Maasha/lib/doc/index.html @@ -31,8 +31,20 @@

Classes/Modules

Methods

diff --git a/code_ruby/Maasha/lib/seq.rb b/code_ruby/Maasha/lib/seq.rb index bf07ab3..60f91bd 100644 --- a/code_ruby/Maasha/lib/seq.rb +++ b/code_ruby/Maasha/lib/seq.rb @@ -24,6 +24,7 @@ require 'amatch' require 'digest' +require 'narray' # Residue alphabets DNA = %w[a t c g] @@ -35,6 +36,29 @@ INDELS = %w[. - _ ~] SCORE_PHRED = 33 SCORE_ILLUMINA = 64 +# Nucleotide equivalents +EQUAL = { + :AA => true, :BU => true, :TH => true, :UY => true, + :TT => true, :CB => true, :UH => true, :SC => true, + :CC => true, :GB => true, :VA => true, :SG => true, + :GG => true, :TB => true, :VC => true, :CS => true, + :UU => true, :UB => true, :VG => true, :GS => true, + :NA => true, :DA => true, :AV => true, :WA => true, + :NT => true, :DG => true, :CV => true, :WT => true, + :NC => true, :DT => true, :GV => true, :WU => true, + :NG => true, :DU => true, :KG => true, :AW => true, + :NU => true, :AD => true, :KT => true, :TW => true, + :AN => true, :GD => true, :KU => true, :UW => true, + :TN => true, :TD => true, :GK => true, :RA => true, + :CN => true, :UD => true, :TK => true, :RG => true, + :GN => true, :HA => true, :UK => true, :AR => true, + :UN => true, :HC => true, :YC => true, :GR => true, + :NN => true, :HT => true, :YT => true, :MA => true, + :BC => true, :HU => true, :YU => true, :MC => true, + :BG => true, :AH => true, :CY => true, :AM => true, + :BT => true, :CH => true, :TY => true, :CM => true, +} + # Error class for all exceptions to do with Seq. class SeqError < StandardError; end @@ -124,6 +148,7 @@ class Seq def to_dna raise SeqError, "Cannot reverse-transcribe 0 length sequence" if self.length == 0 raise SeqError, "Cannot reverse-transcribe sequence type: #{self.type}" unless self.is_rna? + self.type = 'dna' self.seq.tr!('Uu','Tt') end @@ -132,6 +157,7 @@ class Seq def to_bp raise SeqError, "Missing seq_name" if self.seq_name.nil? raise SeqError, "Missing seq" if self.seq.nil? + record = {} record[:SEQ_NAME] = self.seq_name record[:SEQ] = self.seq @@ -402,6 +428,65 @@ class Seq end end + # ------------------------------------------------------------------------------ + # seq.match(pattern[, pos ] [, hd=max] [, ed=max]) -> matchdata or nil + # ------------------------------------------------------------------------------ + # Method to locate a pattern in a sequence and return the position of the match + # or nil if no match was found. Hamming or Edit distance may be specified. + def match(pattern, pos = 0) + while pos < self.length - pattern.length + 1 + str1 = self.seq[pos ... pos + pattern.length] + str2 = pattern + + puts "pos: #{pos} str1: #{str1} str2: #{str2}" + + rows = str1.length + 1 + cols = str2.length + 1 + + matches = 0 + mismatches = 0 + insertions = 0 + deletions = 0 + + matrix = NArray.int(rows, cols) + + for i in 0 ... rows do matrix[i, 0] = i end + for j in 0 ... cols do matrix[0, j] = j end + + for j in 1 ... cols do + for i in 1 ... rows do + puts "pos: #{pos} i: #{i} j: #{j} str1: #{str1} str2: #{str2} str1[i-1]: #{str1[i-1]} str2[j-1]: #{str2[j-1]}" + + if EQUAL[(str1[i - 1].upcase + str2[j - 1].upcase).to_sym] + matrix[i, j] = matrix[i - 1, j - 1] + matches += 1 + else + del = matrix[i - 1, j] + 1 + ins = matrix[i, j - 1] + 1 + mis = matrix[i - 1, j - 1] + 1 + + if del < ins and del < mis + deletions += 1 + matrix[i, j] = del + elsif ins < del and ins < mis + insertions += 1 + matrix[i, j] = ins + else + mismatches += 1 + matrix[i, j] = mis + end + end + end + end + pp matrix + puts "match: #{matches} mis: #{mismatches} del: #{deletions} ins: #{insertions}" + + return pos if matrix[rows - 1, cols - 1] == 0 + + pos += 1 + end + end + private # Method to convert a Solexa score (odd ratio) to @@ -418,68 +503,3 @@ class Seq (score_phred + 64).chr end end - -__END__ - - -# Class containing generic sequence methods and nucleic acid and amino acid subclasses. -class Seq < String - # Guess the sequence type by analyzing the first 100 residues allowing for ambiguity codes. - def guess_type - raise ArgumentError, "No sequence." if self.empty? - - seq_beg = self[0, 100].upcase - - if seq_beg.count( "FLPQIE" ) > 0 - Seq::AA.new(self) - elsif seq_beg.count("U") > 0 - Seq::NA::RNA.new(self) - else - Seq::NA::DNA.new(self) - end - end - - # Class containing methods specific for amino acid (AA) sequences. - class AA < Seq - # Method that returns an array of amino acid residues. - def residues - %w{ F L S Y C W P H Q R I M T N K V A D E G } - end - - # Calculate the molecular weight of an amino acid seuqunce. - # The caluculation is only approximate since there is no correction - # for amino bond formation and the MW used are somewhat imprecise: - # http://www.expasy.ch/tools/pscale/Molecularweight.html - def mol_weight - raise ArgumentError, "invalid residues found: #{self.delete("#{residues.join( "" )}")}" if self.upcase =~ /[^#{residues.join( "" )}]/ - - mol_weight_aa = { - "A" => 89.000, # Ala - "R" => 174.000, # Arg - "N" => 132.000, # Asn - "D" => 133.000, # Asp - "C" => 121.000, # Cys - "Q" => 146.000, # Gln - "E" => 147.000, # Glu - "G" => 75.000, # Gly - "H" => 155.000, # His - "I" => 131.000, # Ile - "L" => 131.000, # Leu - "K" => 146.000, # Lys - "M" => 149.000, # Met - "F" => 165.000, # Phe - "P" => 115.000, # Pro - "S" => 105.000, # Ser - "T" => 119.000, # Thr - "W" => 204.000, # Trp - "Y" => 181.000, # Tyr - "V" => 117.000, # Val - } - - mw = 0.0 - - self.upcase.each_char { |c| mw += mol_weight_aa[ c ] } - - mw - end - end diff --git a/code_ruby/Maasha/test/test_seq.rb b/code_ruby/Maasha/test/test_seq.rb index 25280c5..d8317d3 100755 --- a/code_ruby/Maasha/test/test_seq.rb +++ b/code_ruby/Maasha/test/test_seq.rb @@ -441,19 +441,21 @@ class TestSeq < Test::Unit::TestCase @entry.adaptor_clip_left("cgax", 25) assert_equal( "efghi", @entry.qual) end - - def test_Digest_new_raises_on_bad_pattern_residue - assert_raise(DigestError) { Digest.new(@entry, "X", 4) } + + def test_Seq_match_with_no_match_returns_nil + @entry.seq = "atcg" + assert_equal(nil, @entry.match("ttt")) end - def test_Digest_new_dont_raise_on_ok_pattern_residue - assert_nothing_raised { Digest.new(@entry, "AGCUTRYWSMKHDVBNagcutrywsmkhdvbn", 4) } + def test_Seq_match_returns_correctly + @entry.seq = "atcg" + assert_equal(0, @entry.match("aTc")) + assert_equal(1, @entry.match("Ncg")) end - def test_Digest_each - @entry.seq = "aaaaTTTTbbbbTTTT" - digest = Digest.new(@entry, "TTNT", 1) - assert_equal("aaaaT", digest.first.seq) + def test_Seq_match_with_pos_returns_correctly + @entry.seq = "atcatc" + assert_equal(3, @entry.match("aTc", 2)) end end -- 2.39.5