From 7f343b6f3e0240183f8002aafd485ade5d149417 Mon Sep 17 00:00:00 2001 From: martinahansen Date: Fri, 25 Nov 2011 12:31:49 +0000 Subject: [PATCH] moved usearch code to external file git-svn-id: http://biopieces.googlecode.com/svn/trunk@1688 74ccb610-7750-0410-82ae-013aeee3265d --- bp_bin/uclust_seq | 92 ++----------------- bp_bin/usearch_seq | 68 +------------- code_ruby/lib/maasha/usearch.rb | 153 ++++++++++++++++++++++++++++++++ 3 files changed, 166 insertions(+), 147 deletions(-) create mode 100644 code_ruby/lib/maasha/usearch.rb diff --git a/bp_bin/uclust_seq b/bp_bin/uclust_seq index 0eaee60..695d3cf 100755 --- a/bp_bin/uclust_seq +++ b/bp_bin/uclust_seq @@ -30,81 +30,7 @@ require 'maasha/biopieces' require 'maasha/fasta' - -SORT_LIMIT = 2_000_000_000 # use mergesort for files biggern than 2Gb. - -class Uclust - include Enumerable - - def initialize(infile, outfile, options) - @infile = infile - @outfile = outfile - @options = options - @command = [] - end - - # Method that calls Usearch sorting for sorting a FASTA file - # according to decending sequence length. - def sort - # usearch -sort seqs.fasta -output seqs.sorted.fasta - if File.size(@infile) < SORT_LIMIT - @command << "usearch --sort #{@infile} --output #{@infile}.sort" - else - @command << "usearch --mergesort #{@infile} --output #{@infile}.sort" - end - - execute - - File.rename "#{@infile}.sort", @infile - end - - # Method to execute clustering de novo. - def cluster - @command << "usearch --cluster #{@infile} --uc #{@outfile} --id #{@options[:identity]}" - - execute - end - - # Method to parse a Uclust .uc file and for each line of data - # yield a Biopiece record. - def each - record = {} - - File.open(@outfile, mode="r") do |ios| - ios.each_line do |line| - if line !~ /^#/ - fields = line.chomp.split("\t") - - next if fields[0] == 'C' - - record[:TYPE] = fields[0] - record[:CLUSTER] = fields[1].to_i - record[:IDENT] = fields[3].to_f - record[:Q_ID] = fields[8] - - yield record - end - end - end - - self # conventionally - end - - private - - # Method to execute a command using a system() call. - # The command is composed of bits from the @command variable. - def execute - @command.unshift "nice -n 19" - @command << "--rev" if @options[:comp] - @command << "> /dev/null 2>&1" unless @options[:verbose] - command = @command.join(" ") - system(command) - raise "Command failed: #{command}" unless $?.success? - - @command = [] - end -end +require 'maasha/usearch' casts = [] casts << {:long=>'no_sort', :short=>'n', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} @@ -130,13 +56,13 @@ Biopieces.open(options[:stream_in], file_records) do |input, output| end end -uc = Uclust.new(file_fasta, file_uclust, options) -uc.sort unless options[:no_sort] -uc.cluster +us = Usearch.new(file_fasta, file_uclust, options) +us.sort_length unless options[:no_sort] +us.cluster hash = {} -uc.each do |record| +us.each_cluster do |record| hash[record[:Q_ID].to_sym] = record.dup end @@ -144,10 +70,10 @@ Biopieces.open(file_records, options[:stream_out]) do |input, output| input.each_record do |record| if record.has_key? :SEQ_NAME and record.has_key? :SEQ if hash.has_key? record[:SEQ_NAME].to_sym - uc = hash[record[:SEQ_NAME].to_sym] - record[:CLUSTER] = uc[:CLUSTER].to_i - record[:IDENT] = uc[:IDENT].to_i - record[:IDENT] = '*' if uc[:TYPE] == 'S' + us = hash[record[:SEQ_NAME].to_sym] + record[:CLUSTER] = us[:CLUSTER].to_i + record[:IDENT] = us[:IDENT].to_i + record[:IDENT] = '*' if us[:TYPE] == 'S' end end diff --git a/bp_bin/usearch_seq b/bp_bin/usearch_seq index 9bd3d46..2075285 100755 --- a/bp_bin/usearch_seq +++ b/bp_bin/usearch_seq @@ -30,67 +30,7 @@ require 'maasha/biopieces' require 'maasha/fasta' - -class Usearch - include Enumerable - - def initialize(infile, outfile, options) - @infile = infile - @outfile = outfile - @options = options - @command = [] - end - - # Method to execute database search. - def usearch - @command << "usearch --query #{@infile} --db #{@options[:database]} --userout #{@outfile}" - @command << "--userfields target+tloz+thiz+query+bits+strand" - @command << "--id #{@options[:identity]}" if @options.has_key? :identity - @command << "--evalue #{@options[:e_val]}" if @options.has_key? :e_val - - execute - end - - # Method to parse a Useach .uc file and for each line of data - # yield a Biopiece record. - def each - record = {} - - File.open(@outfile, mode="r") do |ios| - ios.gets # skip comment line - ios.each_line do |line| - fields = line.chomp.split("\t") - - record[:REC_TYPE] = "USEARCH" - record[:S_ID] = fields[0] - record[:S_BEG] = fields[1].to_i - record[:S_END] = fields[2].to_i - record[:Q_ID] = fields[3] - record[:SCORE] = fields[4].to_f - record[:STRAND] = fields[5] - - yield record - end - end - - self # conventionally - end - - private - - # Method to execute a command using a system() call. - # The command is composed of bits from the @command variable. - def execute - @command.unshift "nice -n 19" - @command << "--rev" - @command << "> /dev/null 2>&1" unless @options[:verbose] - command = @command.join(" ") - system(command) - raise "Command failed: #{command}" unless $?.success? - - @command = [] - end -end +require 'maasha/usearch' casts = [] casts << {:long=>'database', :short=>'d', :type=>'file!', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} @@ -116,11 +56,11 @@ Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| end end - uc = Usearch.new(infile, outfile, options) + us = Usearch.new(infile, outfile, options) - uc.usearch + us.usearch - uc.each do |record| + us.each_hit do |record| output.puts record end end diff --git a/code_ruby/lib/maasha/usearch.rb b/code_ruby/lib/maasha/usearch.rb new file mode 100644 index 0000000..b0c8c2f --- /dev/null +++ b/code_ruby/lib/maasha/usearch.rb @@ -0,0 +1,153 @@ +# Copyright (C) 2011 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# This software is part of the Biopieces framework (www.biopieces.org). + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +require 'maasha/fasta' + +# Error class for all exceptions to do with Usearch. +class UsearchError < StandardError; end + +SORT_LIMIT = 2_000_000_000 # use mergesort for files biggern than 2Gb. + +class Usearch + include Enumerable + + def initialize(infile, outfile, options) + @infile = infile + @outfile = outfile + @options = options + @command = [] + end + + # Method that calls Usearch sorting for sorting a FASTA file + # according to decending sequence length. + def sort_length + # usearch -sort seqs.fasta -output seqs.sorted.fasta + if File.size(@infile) < SORT_LIMIT + @command << "usearch --sort #{@infile} --output #{@infile}.sort" + else + @command << "usearch --mergesort #{@infile} --output #{@infile}.sort" + end + + execute + + File.rename "#{@infile}.sort", @infile + end + + # Method that calls Usearch sorting for sorting a FASTA file + # according to cluster size. + def sort_size + # usearch -sortsize seqs.fasta -output seqs.sorted.fasta + @command << "usearch --sortsize #{@infile} --output #{@infile}.sort" + + execute + + File.rename "#{@infile}.sort", @infile + end + + # Method to execute clustering de novo. + def cluster + @command << "usearch --cluster #{@infile} --uc #{@outfile} --id #{@options[:identity]}" + + execute + end + + # Method to execute database search. + def usearch + @command << "usearch --query #{@infile} --db #{@options[:database]} --userout #{@outfile}" + @command << "--userfields target+tloz+thiz+query+bits+strand" + @command << "--id #{@options[:identity]}" if @options.has_key? :identity + @command << "--evalue #{@options[:e_val]}" if @options.has_key? :e_val + @command << "--rev" + + execute + end + + # Method to parse a Uclust .uc file and for each line of data + # yield a Biopiece record. + def each_cluster + record = {} + + File.open(@outfile, mode="r") do |ios| + ios.each_line do |line| + if line !~ /^#/ + fields = line.chomp.split("\t") + + next if fields[0] == 'C' + + record[:TYPE] = fields[0] + record[:CLUSTER] = fields[1].to_i + record[:IDENT] = fields[3].to_f + record[:Q_ID] = fields[8] + + yield record + end + end + end + + self # conventionally + end + + # Method to parse a Useach user defined tabular file and for each line of data + # yield a Biopiece record. + def each_hit + record = {} + + File.open(@outfile, mode="r") do |ios| + ios.gets # skip comment line + ios.each_line do |line| + fields = line.chomp.split("\t") + + record[:REC_TYPE] = "USEARCH" + record[:S_ID] = fields[0] + record[:S_BEG] = fields[1].to_i + record[:S_END] = fields[2].to_i + record[:Q_ID] = fields[3] + record[:SCORE] = fields[4].to_f + record[:STRAND] = fields[5] + + yield record + end + end + + self # conventionally + end + + private + + # Method to execute a command using a system() call. + # The command is composed of bits from the @command variable. + def execute + @command.unshift "nice -n 19" + @command << "--rev" if @options[:comp] + @command << "> /dev/null 2>&1" unless @options[:verbose] + command = @command.join(" ") + system(command) + raise "Command failed: #{command}" unless $?.success? + + @command = [] + end +end + +__END__ + -- 2.39.2