X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=bp_bin%2Fuclust_seq;h=5a6d0ef901a1f9cb298b324106c568c721a50b1d;hb=7e229f1728f00bc0603eeaf412f46485fb3c5e02;hp=bd63df8e3935b64e99cbca600a335b6e2ccb9082;hpb=c5233962d06d760ad5b5d32b30338fb30117b41b;p=biopieces.git diff --git a/bp_bin/uclust_seq b/bp_bin/uclust_seq index bd63df8..5a6d0ef 100755 --- a/bp_bin/uclust_seq +++ b/bp_bin/uclust_seq @@ -30,143 +30,55 @@ require 'maasha/biopieces' require 'maasha/fasta' +require 'maasha/usearch' -SORT_LIMIT = 2_000_000_000 # use mergesort for files biggern than 2Gb. - -class Uclust - include Enumerable - - def initialize(infile, outfile, options) - @infile = infile - @outfile = outfile - @options = options - @command = [] - end - - # Method that calls Usearch sorting for sorting a FASTA file - # according to decending sequence length. - def sort - # usearch -sort seqs.fasta -output seqs.sorted.fasta - if File.size(@infile) < SORT_LIMIT - @command << "usearch --sort #{@infile} --output #{@infile}.sort" - else - @command << "usearch --mergesort #{@infile} --output #{@infile}.sort" - end - - execute - - File.rename "#{@infile}.sort", @infile - end - - # Method to execute database search. - def usearch - # usearch --query query.fasta --db db.fasta --uc results.uc --id 0.90 [--evalue E] - @command << "usearch --query #{@infile} --db #{@options[:database]} --uc #{@outfile} --id #{@options[:identity]}" - @command << "--evalue #{@options[:e_val]}" if @options.has_key? :e_val - - execute - end - - # Method to execute clustering de novo. - def cluster - @command << "usearch --cluster #{@infile} --uc #{@outfile} --id #{@options[:identity]}" +casts = [] +casts << {long: 'no_sort', short: 'n', type: 'flag', mandatory: false, default: nil, allowed: nil, disallowed: nil} +casts << {long: 'comp', short: 'c', type: 'flag', mandatory: false, default: nil, allowed: nil, disallowed: nil} +casts << {long: 'identity', short: 'i', type: 'float', mandatory: true, default: 0.9, allowed: nil, disallowed: nil} +casts << {long: 'cpus', short: 'C', type: 'uint', mandatory: false, default: 1, allowed: nil, disallowed: "0"} - execute - end +options = Biopieces.options_parse(ARGV, casts) - # Method to execute clustering to database plus de novo if not matched. - def usearch_uclust - # usearch --cluster seqs_sorted.fasta --db db.fasta --uc results.uc --id 0.90 - @command << "usearch --cluster #{@infile} --db #{@options[:database]} --uc #{@outfile} --id #{@options[:identity]}" +tmpdir = Biopieces.mktmpdir +file_records = File.join(tmpdir, "data.stream") +file_fasta = File.join(tmpdir, "in.fna") +file_uclust = File.join(tmpdir, "out.uc") - execute - end +Biopieces.open(options[:stream_in], file_records) do |input, output| + Fasta.open(file_fasta, 'w') do |fasta_io| + input.each_record do |record| + output.puts record - # Method to parse a Uclust .uc file and for each line of data - # yield a Biopiece record. - def each - record = {} - - File.open(@outfile, mode="r") do |ios| - ios.each_line do |line| - if line !~ /^#/ - fields = line.chomp.split("\t") - - record[:REC_TYPE] = "UCLUST" - record[:TYPE] = fields[0] - record[:CLUSTER] = fields[1].to_i - record[:SEQ_LEN] = fields[2].to_i - record[:IDENT] = fields[3].to_f - record[:STRAND] = fields[4] - record[:Q_BEG] = fields[5].to_i - record[:S_BEG] = fields[6].to_i - record[:S_END] = fields[6].to_i + fields[2].to_i - record[:CIGAR] = fields[7] - record[:Q_ID] = fields[8] - record[:S_ID] = fields[9] - - yield record - end + if record[:SEQ_NAME] and record[:SEQ] + fasta_io.puts Seq.new_bp(record).to_fasta end end - - self # conventionally end - - private - - # Method to execute a command using a system() call. - # The command is composed of bits from the @command variable. - def execute - @command.unshift "nice -n 19" - @command << "--rev" if @options[:comp] - @command << "> /dev/null 2>&1" unless @options[:verbose] - command = @command.join(" ") - system(command) - raise "Command failed: #{command}" unless $?.success? - - @command = [] - end end -ok_methods = "usearch,uclust,usearch_uclust" - -casts = [] -casts << {:long=>'no_sort', :short=>'n', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'method', :short=>'m', :type=>'string', :mandatory=>true, :default=>"uclust", :allowed=>ok_methods, :disallowed=>nil} -casts << {:long=>'database', :short=>'d', :type=>'file!', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'comp', :short=>'c', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'identity', :short=>'i', :type=>'float', :mandatory=>true, :default=>0.9, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'e_val', :short=>'e', :type=>'float', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +us = Usearch.new(file_fasta, file_uclust, options) -options = Biopieces.options_parse(ARGV, casts) +us.sortbylength unless options[:no_sort] +us.cluster_smallmem -tmpdir = Biopieces.mktmpdir -infile = File.join(tmpdir, "in.fna") -outfile = File.join(tmpdir, "out.uc") +hash = {} -Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| - Fasta.open(infile, mode="w") do |fasta_io| - input.each_record do |record| - output.puts record +us.each_cluster do |cluster| + hash[cluster[:Q_ID].to_sym] = cluster.dup +end - if record.has_key? :SEQ_NAME and record.has_key? :SEQ - fasta_io.puts Seq.new_bp(record).to_fasta +Biopieces.open(file_records, options[:stream_out]) do |input, output| + input.each_record do |record| + if record[:SEQ_NAME] and record[:SEQ] + if hash[record[:SEQ_NAME].to_sym] + us = hash[record[:SEQ_NAME].to_sym] + record[:CLUSTER] = us[:CLUSTER].to_i + record[:IDENT] = us[:IDENT].to_i + record[:IDENT] = '*' if us[:TYPE] == 'S' end end - end - - uclust = Uclust.new(infile, outfile, options) - uclust.sort unless options[:no_sort] - - case options[:method].to_s - when "usearch" then uclust.usearch - when "uclust" then uclust.cluster - when "usearch_uclust" then uclust.usearch_uclust - else raise "Unknown method: #{options[:method]}" - end - uclust.each do |record| output.puts record end end