X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=code_ruby%2Flib%2Fmaasha%2Fusearch.rb;fp=code_ruby%2Flib%2Fmaasha%2Fusearch.rb;h=b0c8c2f8ce69f63234b87e9da7d8f1b71ba3acad;hb=7f343b6f3e0240183f8002aafd485ade5d149417;hp=0000000000000000000000000000000000000000;hpb=67652c7efb076860e571e12c75fc1d07b203585e;p=biopieces.git diff --git a/code_ruby/lib/maasha/usearch.rb b/code_ruby/lib/maasha/usearch.rb new file mode 100644 index 0000000..b0c8c2f --- /dev/null +++ b/code_ruby/lib/maasha/usearch.rb @@ -0,0 +1,153 @@ +# Copyright (C) 2011 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# This software is part of the Biopieces framework (www.biopieces.org). + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +require 'maasha/fasta' + +# Error class for all exceptions to do with Usearch. +class UsearchError < StandardError; end + +SORT_LIMIT = 2_000_000_000 # use mergesort for files biggern than 2Gb. + +class Usearch + include Enumerable + + def initialize(infile, outfile, options) + @infile = infile + @outfile = outfile + @options = options + @command = [] + end + + # Method that calls Usearch sorting for sorting a FASTA file + # according to decending sequence length. + def sort_length + # usearch -sort seqs.fasta -output seqs.sorted.fasta + if File.size(@infile) < SORT_LIMIT + @command << "usearch --sort #{@infile} --output #{@infile}.sort" + else + @command << "usearch --mergesort #{@infile} --output #{@infile}.sort" + end + + execute + + File.rename "#{@infile}.sort", @infile + end + + # Method that calls Usearch sorting for sorting a FASTA file + # according to cluster size. + def sort_size + # usearch -sortsize seqs.fasta -output seqs.sorted.fasta + @command << "usearch --sortsize #{@infile} --output #{@infile}.sort" + + execute + + File.rename "#{@infile}.sort", @infile + end + + # Method to execute clustering de novo. + def cluster + @command << "usearch --cluster #{@infile} --uc #{@outfile} --id #{@options[:identity]}" + + execute + end + + # Method to execute database search. + def usearch + @command << "usearch --query #{@infile} --db #{@options[:database]} --userout #{@outfile}" + @command << "--userfields target+tloz+thiz+query+bits+strand" + @command << "--id #{@options[:identity]}" if @options.has_key? :identity + @command << "--evalue #{@options[:e_val]}" if @options.has_key? :e_val + @command << "--rev" + + execute + end + + # Method to parse a Uclust .uc file and for each line of data + # yield a Biopiece record. + def each_cluster + record = {} + + File.open(@outfile, mode="r") do |ios| + ios.each_line do |line| + if line !~ /^#/ + fields = line.chomp.split("\t") + + next if fields[0] == 'C' + + record[:TYPE] = fields[0] + record[:CLUSTER] = fields[1].to_i + record[:IDENT] = fields[3].to_f + record[:Q_ID] = fields[8] + + yield record + end + end + end + + self # conventionally + end + + # Method to parse a Useach user defined tabular file and for each line of data + # yield a Biopiece record. + def each_hit + record = {} + + File.open(@outfile, mode="r") do |ios| + ios.gets # skip comment line + ios.each_line do |line| + fields = line.chomp.split("\t") + + record[:REC_TYPE] = "USEARCH" + record[:S_ID] = fields[0] + record[:S_BEG] = fields[1].to_i + record[:S_END] = fields[2].to_i + record[:Q_ID] = fields[3] + record[:SCORE] = fields[4].to_f + record[:STRAND] = fields[5] + + yield record + end + end + + self # conventionally + end + + private + + # Method to execute a command using a system() call. + # The command is composed of bits from the @command variable. + def execute + @command.unshift "nice -n 19" + @command << "--rev" if @options[:comp] + @command << "> /dev/null 2>&1" unless @options[:verbose] + command = @command.join(" ") + system(command) + raise "Command failed: #{command}" unless $?.success? + + @command = [] + end +end + +__END__ +