From db8e4454c9c278ac274b9a997d95324218045d8d Mon Sep 17 00:00:00 2001 From: martinahansen Date: Thu, 17 Nov 2011 09:34:12 +0000 Subject: [PATCH] added requirement of RubyInline git-svn-id: http://biopieces.googlecode.com/svn/trunk@1656 74ccb610-7750-0410-82ae-013aeee3265d --- bp_bin/uclust_seq | 25 +++--- ...er-0.42.sh => biopieces_installer-0.43.sh} | 3 +- bp_test/test_all | 3 +- code_ruby/lib/maasha/backtrack.rb | 15 +++- code_ruby/lib/maasha/seq.rb | 4 +- code_ruby/test/maasha/test_backtrack.rb | 77 +++++++++++++++++-- 6 files changed, 102 insertions(+), 25 deletions(-) rename bp_scripts/{biopieces_installer-0.42.sh => biopieces_installer-0.43.sh} (99%) diff --git a/bp_bin/uclust_seq b/bp_bin/uclust_seq index bd63df8..b20a29c 100755 --- a/bp_bin/uclust_seq +++ b/bp_bin/uclust_seq @@ -58,6 +58,13 @@ class Uclust File.rename "#{@infile}.sort", @infile end + # Method to execute clustering de novo. + def cluster + @command << "usearch --cluster #{@infile} --uc #{@outfile} --id #{@options[:identity]}" + + execute + end + # Method to execute database search. def usearch # usearch --query query.fasta --db db.fasta --uc results.uc --id 0.90 [--evalue E] @@ -67,13 +74,6 @@ class Uclust execute end - # Method to execute clustering de novo. - def cluster - @command << "usearch --cluster #{@infile} --uc #{@outfile} --id #{@options[:identity]}" - - execute - end - # Method to execute clustering to database plus de novo if not matched. def usearch_uclust # usearch --cluster seqs_sorted.fasta --db db.fasta --uc results.uc --id 0.90 @@ -129,7 +129,7 @@ class Uclust end end -ok_methods = "usearch,uclust,usearch_uclust" +ok_methods = "uclust,usearch,usearch_uclust" casts = [] casts << {:long=>'no_sort', :short=>'n', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} @@ -141,6 +141,10 @@ casts << {:long=>'e_val', :short=>'e', :type=>'float', :mandatory=>false, :d options = Biopieces.options_parse(ARGV, casts) +# At high identities, around 96% and above, compressed indexes are often more sensitive, faster +# and use less RAM. Compressed indexes are disabled by default, so I generally recommend that +# you specify the --slots and --w options when clustering at high identities. + tmpdir = Biopieces.mktmpdir infile = File.join(tmpdir, "in.fna") outfile = File.join(tmpdir, "out.uc") @@ -157,13 +161,12 @@ Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| end uclust = Uclust.new(infile, outfile, options) - uclust.sort unless options[:no_sort] + uclust.sort unless options[:no_sort] or options[:method] == "usearch" case options[:method].to_s - when "usearch" then uclust.usearch when "uclust" then uclust.cluster + when "usearch" then uclust.usearch when "usearch_uclust" then uclust.usearch_uclust - else raise "Unknown method: #{options[:method]}" end uclust.each do |record| diff --git a/bp_scripts/biopieces_installer-0.42.sh b/bp_scripts/biopieces_installer-0.43.sh similarity index 99% rename from bp_scripts/biopieces_installer-0.42.sh rename to bp_scripts/biopieces_installer-0.43.sh index 50c5f94..b4599b8 100755 --- a/bp_scripts/biopieces_installer-0.42.sh +++ b/bp_scripts/biopieces_installer-0.43.sh @@ -208,6 +208,7 @@ function prompt_test_prerequisites test_ruby test_ruby_gem "gnuplot" test_ruby_gem "narray" + test_ruby_gem "RubyInline" test_aux_program "blastall" test_aux_program "blat" test_aux_program "bwa" @@ -221,7 +222,7 @@ function prompt_test_prerequisites test_aux_program "prodigal" test_aux_program "Ray" test_aux_program "scan_for_matches" - test_aux_program "uclust" + test_aux_program "usearch" test_aux_program "velveth" test_aux_program "velvetg" test_aux_program "vmatch" diff --git a/bp_test/test_all b/bp_test/test_all index afed9f1..4a1aa13 100755 --- a/bp_test/test_all +++ b/bp_test/test_all @@ -15,6 +15,7 @@ test_perl_module "Time::HiRes" test_ruby test_ruby_gem "gnuplot" test_ruby_gem "narray" +test_ruby_gem "RubyInline" test_aux_program "blastall" test_aux_program "blat" test_aux_program "bwa" @@ -28,7 +29,7 @@ test_aux_program "mysql" test_aux_program "prodigal" test_aux_program "Ray" test_aux_program "scan_for_matches" -test_aux_program "uclust" +test_aux_program "usearch" test_aux_program "velveth" test_aux_program "velvetg" test_aux_program "vmatch" diff --git a/code_ruby/lib/maasha/backtrack.rb b/code_ruby/lib/maasha/backtrack.rb index 02eb047..f5e581b 100644 --- a/code_ruby/lib/maasha/backtrack.rb +++ b/code_ruby/lib/maasha/backtrack.rb @@ -56,7 +56,7 @@ module BackTrack # matches are returned in an Array of Match objects. def patscan(pattern, offset = 0, max_mismatches = 0, max_insertions = 0, max_deletions = 0) raise BackTrackError, "Bad pattern: #{pattern}" unless pattern.downcase =~ OK_PATTERN - raise BackTrackError, "offset: #{offset} out of range (0 ... #{self.length - 1})" unless (0 ... self.length).include? offset + raise BackTrackError, "offset: #{offset} out of range (0 ... #{self.length - 1})" unless (0 ... self.length).include? offset raise BackTrackError, "max_mismatches: #{max_mismatches} out of range (0 .. #{MAX_MIS})" unless (0 .. MAX_MIS).include? max_mismatches raise BackTrackError, "max_insertions: #{max_insertions} out of range (0 .. #{MAX_INS})" unless (0 .. MAX_INS).include? max_insertions raise BackTrackError, "max_deletions: #{max_deletions} out of range (0 .. #{MAX_DEL})" unless (0 .. MAX_DEL).include? max_deletions @@ -83,10 +83,14 @@ module BackTrack inline do |builder| builder.add_static "id_seq", 'rb_intern("@seq")', "ID" + # Macro for matching nucleotides including ambiguity codes. builder.prefix %{ #define MATCH(A,B) ((bitmap[(unsigned short) A] & bitmap[(unsigned short) B]) != 0) } + # Bitmap for matching nucleotides including ambiguity codes. + # For each value bits are set from the left: bit pos 1 for A, + # bit pos 2 for T, bit pos 3 for C, and bit pos 4 for G. builder.prefix %{ unsigned short bitmap[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -108,7 +112,9 @@ module BackTrack }; } - # ss is the start of the string, used only for reporting the match endpoints. + # Backtrack algorithm for matching a pattern (p) starting in a sequence (s) allowing for mm + # mismatches, ins insertions and del deletions. ss is the start of the sequence, used only for + # reporting the match endpoints. builder.prefix %{ unsigned int backtrack(char* ss, char* s, char* p, int mm, int ins, int del) { @@ -129,8 +135,8 @@ module BackTrack } } - # Find all occurrences of p starting at pos in s, with at most - # mm mismatches, ins insertions and del deletions. + # Find pattern (p) in a sequence (@seq) starting at pos, with at most mm mismatches, ins + # insertions and del deletions. builder.c %{ static VALUE track(char* p, unsigned int pos, int mm, int ins, int del) { @@ -165,6 +171,7 @@ module BackTrack } end + # Class containing match information. class Match attr_reader :pos, :length, :match diff --git a/code_ruby/lib/maasha/seq.rb b/code_ruby/lib/maasha/seq.rb index db93dc2..598d0a2 100644 --- a/code_ruby/lib/maasha/seq.rb +++ b/code_ruby/lib/maasha/seq.rb @@ -24,7 +24,7 @@ require 'maasha/patternmatcher' require 'maasha/bits' -#require 'maasha/backtrack' +require 'maasha/backtrack' require 'maasha/digest' #require 'maasha/patscan' @@ -46,7 +46,7 @@ class SeqError < StandardError; end class Seq #include Patscan include PatternMatcher -# include BackTrack + include BackTrack include Digest attr_accessor :seq_name, :seq, :type, :qual diff --git a/code_ruby/test/maasha/test_backtrack.rb b/code_ruby/test/maasha/test_backtrack.rb index 2c271aa..b2bc7f7 100644 --- a/code_ruby/test/maasha/test_backtrack.rb +++ b/code_ruby/test/maasha/test_backtrack.rb @@ -27,12 +27,12 @@ $:.unshift File.join(File.dirname(__FILE__),'..','lib') require 'test/unit' require 'maasha/seq' -require 'stringio' -require 'pp' class BackTrackTest < Test::Unit::TestCase def setup - @seq = Seq.new("test", "tacgatgctagcatgcacg") + # 0 1 + # 01234567890123456789 + @seq = Seq.new("test", "tacgatgctagcatgcacgg") end def test_BackTrack_patscan_with_bad_pattern_raises @@ -42,37 +42,102 @@ class BackTrackTest < Test::Unit::TestCase end def test_BackTrack_patscan_with_OK_pattern_dont_raise - ["N"].each { |pattern| + ["N", "atcg"].each { |pattern| assert_nothing_raised { @seq.patscan(pattern) } } end def test_BackTrack_patscan_with_bad_pos_raises + [-1, 20].each { |pos| + assert_raise(BackTrackError) { @seq.patscan("N", pos) } + } end def test_BackTrack_patscan_with_OK_pos_dont_raise + [0, 19].each { |pos| + assert_nothing_raised { @seq.patscan("N", pos) } + } end def test_BackTrack_patscan_with_bad_mis_raises + [-1, 6].each { |mis| + assert_raise(BackTrackError) { @seq.patscan("N", 0, mis) } + } end def test_BackTrack_patscan_with_OK_mis_dont_raise + [0, 5].each { |mis| + assert_nothing_raised { @seq.patscan("N", 0, mis) } + } end def test_BackTrack_patscan_with_bad_ins_raises + [-1, 6].each { |ins| + assert_raise(BackTrackError) { @seq.patscan("N", 0, 0, ins) } + } end def test_BackTrack_patscan_with_OK_ins_dont_raise + [0, 5].each { |ins| + assert_nothing_raised { @seq.patscan("N", 0, 0, ins) } + } end def test_BackTrack_patscan_with_bad_del_raises + [-1, 6].each { |del| + assert_raise(BackTrackError) { @seq.patscan("N", 0, 0, 0, del) } + } end def test_BackTrack_patscan_with_OK_del_dont_raise + [0, 5].each { |del| + assert_nothing_raised { @seq.patscan("N", 0, 0, 0, del) } + } + end + + def test_BackTrack_patscan_perfect_left_is_ok + assert_equal("0:7:tacgatg", @seq.patscan("TACGATG").first.to_s) + end + + def test_BackTrack_patscan_perfect_right_is_ok + assert_equal("13:7:tgcacgg", @seq.patscan("TGCACGG").first.to_s) + end + + def test_BackTrack_patscan_ambiguity_is_ok + assert_equal("13:7:tgcacgg", @seq.patscan("TGCACNN").first.to_s) + end + + def test_BackTrack_patscan_pos_is_ok + assert_equal("10:1:g", @seq.patscan("N", 10).first.to_s) + assert_equal("19:1:g", @seq.patscan("N", 10).last.to_s) + end + + def test_BackTrack_patscan_mis_left_is_ok + assert_equal("0:7:tacgatg", @seq.patscan("Aacgatg", 0, 1).first.to_s) + end + + def test_BackTrack_patscan_mis_right_is_ok + assert_equal("13:7:tgcacgg", @seq.patscan("tgcacgA", 0, 1).first.to_s) + end + + def test_BackTrack_patscan_ins_left_is_ok + assert_equal("0:7:tacgatg", @seq.patscan("Atacgatg", 0, 0, 1).first.to_s) + end + + def test_BackTrack_patscan_ins_right_is_ok + assert_equal("13:7:tgcacgg", @seq.patscan("tgcacggA", 0, 0, 1).first.to_s) + end + + def test_BackTrack_patscan_del_left_is_ok + assert_equal("0:7:tacgatg", @seq.patscan("acgatg", 0, 0, 0, 1).first.to_s) + end + + def test_BackTrack_patscan_del_right_is_ok + assert_equal("12:8:atgcacgg", @seq.patscan("tgcacgg", 0, 0, 0, 1).first.to_s) end - def test_BackTrack_patscan -# assert_equal("0:4:tacg", @seq.patscan("TACG").first.to_s) + def test_BackTrack_patscan_ambiguity_mis_ins_del_all_ok + assert_equal("0:20:tacgatgctagcatgcacgg", @seq.patscan("tacatgcNagGatgcCacgg", 0, 1, 1, 1).first.to_s) end end -- 2.39.5