]> git.donarmstrong.com Git - biopieces.git/commitdiff
updated find_mids
authormartinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Wed, 30 Mar 2011 09:35:54 +0000 (09:35 +0000)
committermartinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Wed, 30 Mar 2011 09:35:54 +0000 (09:35 +0000)
git-svn-id: http://biopieces.googlecode.com/svn/trunk@1314 74ccb610-7750-0410-82ae-013aeee3265d

bp_bin/find_mids
bp_bin/remove_illumina_adaptor
code_ruby/Maasha/lib/patternmatcher.rb

index 4e6104975f1da424d655b85e664861dd1b2a2bef..73bbfb60c3b696eb1f4ac5558c6c737676984ec4 100755 (executable)
@@ -32,6 +32,8 @@
 require 'biopieces'
 require 'pp'
 
+MID_LEN = 10
+
 mids = %w{ ACGAGTGCGT ACGCTCGACA AGACGCACTC AGCACTGTAG ATCAGACACG
            ATATCGCGAG CGTGTCTCTA CTCGCGTGTC TAGTATCAGC TCTCTATGCG
            TGATACGTCT TACTGAGCTA CATAGTAGTG CGAGAGATAC ATACGACGTA
@@ -76,14 +78,17 @@ mids.each_with_index do |mid, i|
 end
 
 casts = []
+casts << {:long=>'pos', :short=>'p', :type=>'uint', :mandatory=>false, :default=>0, :allowed=>nil, :disallowed=>nil}
 
 bp = Biopieces.new
 
 options = bp.parse(ARGV, casts)
 
+pos = options[:pos]
+
 bp.each_record do |record|
   if record.has_key? :SEQ
-    tag = record[:SEQ][4 ... 14].upcase
+    tag = record[:SEQ][pos ... pos + MID_LEN].upcase
 
     if mid_hash.has_key? tag
       count_hash[tag] += 1
index 7f74c8a90578b7047c7b096a1a7b748abe5a139d..0b0b1b9e2cf29f79dd2461f6dc2a1dc255688aac 100755 (executable)
 
 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 
-# Remove Illumina adaptors or parts thereof from sequences in the stream.
+# Remove adaptors or parts thereof from sequences in the stream.
 
 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 
 
 require 'biopieces'
 require 'seq'
+require 'profile'
 
 casts = []
-casts << {:long=>'min',                :short=>'m', :type=>'uint',   :mandatory=>false, :default=>0,   :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'right_adaptor',      :short=>'r', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'left_adaptor',       :short=>'l', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'right_hamming_dist', :short=>'R', :type=>'uint',   :mandatory=>false, :default=>25,  :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'left_hamming_dist',  :short=>'L', :type=>'uint',   :mandatory=>false, :default=>25,  :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'adaptor',    :short=>'r', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'mismatches', :short=>'m', :type=>'uint',   :mandatory=>false, :default=>20,  :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'insertions', :short=>'i', :type=>'uint',   :mandatory=>false, :default=>10,  :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'deletions',  :short=>'d', :type=>'uint',   :mandatory=>false, :default=>10,  :allowed=>nil, :disallowed=>nil}
 
 bp = Biopieces.new
 
@@ -47,25 +47,9 @@ bp.each_record do |record|
   if record.has_key? :SEQ
     entry = Seq.new(record[:SEQ_NAME], record[:SEQ], record[:TYPE], record[:SCORES])
 
-    if options[:right_adaptor]
-      pos_right = entry.adaptor_locate_right(options[:right_adaptor], options[:right_hamming_dist])
-      entry.subseq!(0, pos_right) if pos_right >= 0 and entry.length - pos_right >= options[:min]
-      record[:CLIP_ADAPTOR_RIGHT] = pos_right
-    else
-      record[:CLIP_ADAPTOR_RIGHT] = -1
+    if match = entry.adaptor_find(options[:adaptor], options[:mismatches], options[:insertions], options[:deletions])
+      record[:POS] = match.pos
     end
-
-    if options[:left_adaptor]
-      pos_left = entry.adaptor_locate_left(options[:left_adaptor], options[:left_hamming_dist])
-      entry.subseq!(pos_left + 1) if pos_left >= options[:min]
-      record[:CLIP_ADAPTOR_LEFT] = pos_left
-    else
-      record[:CLIP_ADAPTOR_LEFT] = -1
-    end
-    
-    record[:SEQ]     = entry.seq
-    record[:SEQ_LEN] = entry.len
-    record[:SCORES]  = entry.qual
   end
 
   bp.puts record
index b4d92b840c1e769867e2ac43cb2b8b5082f20eec..17c9f4d26cf1a20d4b541f35cd2a5494d5b60f8e 100644 (file)
@@ -61,7 +61,6 @@ module PatternMatcher
   # located a Match object will be returned. If all paths are exhausted and
   # no match is located the position is incremented. If no match is located
   # whatsoever, then nil is returned.
-  # TODO: converging paths should be skipped for speed-up.
   def match(pattern, pos = 0, max_mismatches = 0, max_insertions = 0, max_deletions = 0)
     @pattern        = pattern
     @max_mismatches = max_mismatches
@@ -76,20 +75,33 @@ module PatternMatcher
         new_paths = []
 
         paths.each do |path|
-          next if path.exhausted?(@seq, @pattern)
-          return path.to_match if match_found?(path)
+          new_paths << path
 
-          if path.match?(@seq, @pattern)
-            new_paths << path.match
-          elsif path.mismatches < max_mismatches
-            new_paths << path.mismatch
+          (path.insertions ... @max_insertions).each do |i|
+            new_path = path.dup
+            new_path.insertions    += i + 1
+            new_path.pattern_index += i + 1
+            new_paths << new_path
           end
 
-          new_paths << path.insertion if path.insertions < max_insertions
-          new_paths << path.deletion  if path.deletions  < max_deletions
+          (path.deletions ... @max_deletions).each do |i|
+            new_path = path.dup
+            new_path.length    += i + 1
+            new_path.deletions += i + 1
+            new_path.seq_index += i + 1
+            new_paths << new_path
+          end
         end
 
-        paths = new_paths
+        paths = []
+
+        new_paths.each do |p|
+          next if p.exhausted?(@seq, @pattern)
+          next if p.mismatches > @max_mismatches
+          return p.to_match if match_found?(p)
+          p.extend(@seq, @pattern)
+          paths << p
+        end
       end
 
       pos += 1
@@ -172,41 +184,17 @@ module PatternMatcher
       Match.new(@pos, @matches, @mismatches, @insertions, @deletions, @length)
     end
 
-    # Method that returns a new Match object for a matching path
-    def match
-      path_match                = self.dup
-      path_match.length        += 1
-      path_match.matches       += 1
-      path_match.seq_index     += 1
-      path_match.pattern_index += 1
-      path_match
-    end
-
-    # Method that returns a new Match object for a matching path
-    def mismatch
-      path_mismatch                = self.dup
-      path_mismatch.length        += 1
-      path_mismatch.mismatches    += 1
-      path_mismatch.seq_index     += 1
-      path_mismatch.pattern_index += 1
-      path_mismatch
-    end
-
-    # Method that returns a new Match object for a insertion path
-    def insertion
-      path_insertion                = self.dup
-      path_insertion.insertions    += 1
-      path_insertion.pattern_index += 1
-      path_insertion
-    end
+    # Method that extends a path with either a match or a mismatch.
+    def extend(seq, pattern)
+      if self.match?(seq, pattern)
+        self.matches       += 1
+      else
+        self.mismatches    += 1
+      end
 
-    # Method that returns a new Match object for a deletion path
-    def deletion
-      path_deletion            = self.dup
-      path_deletion.length    += 1
-      path_deletion.deletions += 1
-      path_deletion.seq_index += 1
-      path_deletion
+      self.length        += 1
+      self.seq_index     += 1
+      self.pattern_index += 1
     end
   end