X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=bp_bin%2Fuclust_seq;h=5a6d0ef901a1f9cb298b324106c568c721a50b1d;hb=af282a65d141826c15944437b07a0353dd14e79c;hp=4523ab05b09bf69a6919b9ac9009bed60bbd0bc1;hpb=c4f14c511655d92281b6d70363de57b77a9b6045;p=biopieces.git diff --git a/bp_bin/uclust_seq b/bp_bin/uclust_seq index 4523ab0..5a6d0ef 100755 --- a/bp_bin/uclust_seq +++ b/bp_bin/uclust_seq @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -# Copyright (C) 2007-2010 Martin A. Hansen. +# Copyright (C) 2007-2011 Martin A. Hansen. # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License @@ -28,146 +28,57 @@ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - require 'maasha/biopieces' require 'maasha/fasta' +require 'maasha/usearch' -class Uclust - include Enumerable - - def initialize(infile, outfile, options) - @infile = infile - @outfile = outfile - @options = options - @command = [] - end - - # Method that calls Uclusts sorting for sorting a FASTA file - # according to decending sequence length. - def sort - @command << "uclust --sort #{@infile} --output #{@infile}.sort" - - execute - - File.rename "#{@infile}.sort", @infile - end - - def ublast - # uclust --ublast query_seqs.fasta --db database.fasta --blast6out filename --evalue E - @options[:e_val] = 10 unless @options[:e_val] - @command << "uclust --ublast #{@infile} --db #{@options[:database]} --blast6out #{@outfile} --evalue #{@options[:e_val]}" - - execute - end - - # Method to execute database search. - def usearch - # uclust --query query.fasta --db db.fasta --uc results.uc --id 0.90 [--evalue E] - @command << "uclust --query #{@infile} --db #{@options[:database]} --uc #{@outfile} --id #{@options[:identity]}" - @command << "--evalue #{@options[:e_val]}" if @options.has_key? :e_val - - execute - end - - # Method to execute clustering de novo. - def uclust - # uclust --input seqs_sorted.fasta --uc results.uc --id 0.90 - @command << "uclust --input #{@infile} --uc #{@outfile} --id #{@options[:identity]}" +casts = [] +casts << {long: 'no_sort', short: 'n', type: 'flag', mandatory: false, default: nil, allowed: nil, disallowed: nil} +casts << {long: 'comp', short: 'c', type: 'flag', mandatory: false, default: nil, allowed: nil, disallowed: nil} +casts << {long: 'identity', short: 'i', type: 'float', mandatory: true, default: 0.9, allowed: nil, disallowed: nil} +casts << {long: 'cpus', short: 'C', type: 'uint', mandatory: false, default: 1, allowed: nil, disallowed: "0"} - execute - end +options = Biopieces.options_parse(ARGV, casts) - # Method to execute clustering to database plus de novo if not matched. - def usearch_uclust - # uclust --input seqs_sorted.fasta --lib db.fasta --uc results.uc --id 0.90 - @command << "uclust --input #{@infile} --lib #{@options[:database]} --uc #{@outfile} --id #{@options[:identity]}" - @command << "--lib #{@options[:database]}" if @options.has_key? :database +tmpdir = Biopieces.mktmpdir +file_records = File.join(tmpdir, "data.stream") +file_fasta = File.join(tmpdir, "in.fna") +file_uclust = File.join(tmpdir, "out.uc") - execute - end +Biopieces.open(options[:stream_in], file_records) do |input, output| + Fasta.open(file_fasta, 'w') do |fasta_io| + input.each_record do |record| + output.puts record - # Method to parse a Uclust .uc file and for each line of data - # yield a Biopiece record. - def each - record = {} - - File.open(@outfile, mode="r") do |ios| - ios.each_line do |line| - if line !~ /^#/ - fields = line.chomp.split("\t") - - record[:REC_TYPE] = "UCLUST" - record[:TYPE] = fields[0] - record[:CLUSTER] = fields[1] - record[:SEQ_LEN] = fields[2] - record[:IDENT] = fields[3] - record[:STRAND] = fields[4] - record[:Q_BEG] = fields[5] - record[:S_BEG] = fields[6] - record[:CIGAR] = fields[7] - record[:Q_ID] = fields[8] - record[:S_ID] = fields[9] - - yield record - end + if record[:SEQ_NAME] and record[:SEQ] + fasta_io.puts Seq.new_bp(record).to_fasta end end - - self # conventionally end - - private - - # Method to execute a command using a system() call. - # The command is composed of bits from the @command variable. - def execute - @command.unshift "nice -n 19" - @command << "--rev" if @options[:comp] - @command << "> /dev/null 2>&1" unless @options[:verbose] - command = @command.join(" ") - system(command) - raise "Command failed: #{command}" unless $?.success? - - @command = [] - end end -ok_methods = "ublast,usearch,uclust,usearch_uclust" +us = Usearch.new(file_fasta, file_uclust, options) -casts = [] -casts << {:long=>'no_sort', :short=>'n', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'method', :short=>'m', :type=>'string', :mandatory=>true, :default=>"uclust", :allowed=>ok_methods, :disallowed=>nil} -casts << {:long=>'database', :short=>'d', :type=>'file!', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'comp', :short=>'c', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'identity', :short=>'i', :type=>'float', :mandatory=>true, :default=>0.9, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'e_val', :short=>'e', :type=>'float', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +us.sortbylength unless options[:no_sort] +us.cluster_smallmem -options = Biopieces.options_parse(ARGV, casts) +hash = {} -tmpdir = Biopieces.mktmpdir -infile = File.join(tmpdir, "in.fna") -outfile = File.join(tmpdir, "out.uc") +us.each_cluster do |cluster| + hash[cluster[:Q_ID].to_sym] = cluster.dup +end -Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| - Fasta.open(infile, mode="w") do |fasta_io| - input.each_record do |record| - output.puts record - fasta_io.puts record +Biopieces.open(file_records, options[:stream_out]) do |input, output| + input.each_record do |record| + if record[:SEQ_NAME] and record[:SEQ] + if hash[record[:SEQ_NAME].to_sym] + us = hash[record[:SEQ_NAME].to_sym] + record[:CLUSTER] = us[:CLUSTER].to_i + record[:IDENT] = us[:IDENT].to_i + record[:IDENT] = '*' if us[:TYPE] == 'S' + end end - end - - uclust = Uclust.new(infile, outfile, options) - uclust.sort unless options[:no_sort] - - case options[:method].to_s - when "ublast" then uclust.ublast - when "usearch" then uclust.usearch - when "uclust" then uclust.uclust - when "usearch_uclust" then uclust.usearch_uclust - else raise "Unknown method: #{options[:method]}" - end - uclust.each do |record| output.puts record end end