X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=bp_bin%2Fuclust_seq;h=1f5202a42f75201866f6affee4e219de06295651;hb=5aa6482643719c640f828c37a4ab23cde28be591;hp=76c56683b66a82d37ff39625412840f8049fc7b1;hpb=494dc53ebd515b1e3e9b91bbebf43059899ca4ce;p=biopieces.git diff --git a/bp_bin/uclust_seq b/bp_bin/uclust_seq index 76c5668..1f5202a 100755 --- a/bp_bin/uclust_seq +++ b/bp_bin/uclust_seq @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -# Copyright (C) 2007-2010 Martin A. Hansen. +# Copyright (C) 2007-2011 Martin A. Hansen. # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License @@ -28,148 +28,58 @@ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - require 'maasha/biopieces' require 'maasha/fasta' +require 'maasha/usearch' -class Uclust - include Enumerable - - def initialize(infile, outfile, options) - @infile = infile - @outfile = outfile - @options = options - @command = [] - end - - # Method that calls Uclusts sorting for sorting a FASTA file - # according to decending sequence length. - def sort - @command << "uclust --sort #{@infile} --output #{@infile}.sort" - - execute - - File.rename "#{@infile}.sort", @infile - end - - def ublast - # uclust --ublast query_seqs.fasta --db database.fasta --blast6out filename --evalue E - @options[:e_val] = 10 unless @options[:e_val] - @command << "uclust --ublast #{@infile} --db #{@options[:database]} --blast6out #{@outfile} --evalue #{@options[:e_val]}" - - execute - end - - # Method to execute database search. - def usearch - # uclust --query query.fasta --db db.fasta --uc results.uc --id 0.90 [--evalue E] - @command << "uclust --query #{@infile} --db #{@options[:database]} --uc #{@outfile} --id #{@options[:identity]}" - @command << "--evalue #{@options[:e_val]}" if @options.has_key? :e_val - - execute - end - - # Method to execute clustering de novo. - def uclust - # uclust --input seqs_sorted.fasta --uc results.uc --id 0.90 - @command << "uclust --input #{@infile} --uc #{@outfile} --id #{@options[:identity]}" +casts = [] +casts << {long: 'no_sort', short: 'n', type: 'flag', mandatory: false, default: nil, allowed: nil, disallowed: nil} +casts << {long: 'comp', short: 'c', type: 'flag', mandatory: false, default: nil, allowed: nil, disallowed: nil} +casts << {long: 'identity', short: 'i', type: 'float', mandatory: true, default: 0.9, allowed: nil, disallowed: nil} - execute - end +options = Biopieces.options_parse(ARGV, casts) - # Method to execute clustering to database plus de novo if not matched. - def usearch_uclust - # uclust --input seqs_sorted.fasta --lib db.fasta --uc results.uc --id 0.90 - @command << "uclust --input #{@infile} --lib #{@options[:database]} --uc #{@outfile} --id #{@options[:identity]}" - @command << "--lib #{@options[:database]}" if @options.has_key? :database +tmpdir = Biopieces.mktmpdir +file_records = File.join(tmpdir, "data.stream") +file_fasta = File.join(tmpdir, "in.fna") +file_uclust = File.join(tmpdir, "out.uc") - execute - end +Biopieces.open(options[:stream_in], file_records) do |input, output| + Fasta.open(file_fasta, 'w') do |fasta_io| + input.each_record do |record| + output.puts record - # Method to parse a Uclust .uc file and for each line of data - # yield a Biopiece record. - def each - record = {} - - File.open(@outfile, mode="r") do |ios| - ios.each_line do |line| - if line !~ /^#/ - fields = line.chomp.split("\t") - - record[:REC_TYPE] = "UCLUST" - record[:TYPE] = fields[0] - record[:CLUSTER] = fields[1] - record[:SEQ_LEN] = fields[2] - record[:IDENT] = fields[3] - record[:STRAND] = fields[4] - record[:Q_BEG] = fields[5] - record[:S_BEG] = fields[6] - record[:CIGAR] = fields[7] - record[:Q_ID] = fields[8] - record[:S_ID] = fields[9] - - yield record - end + if record[:SEQ_NAME] and record[:SEQ] + fasta_io.puts Seq.new_bp(record).to_fasta end end - - self # conventionally end - - private - - # Method to execute a command using a system() call. - # The command is composed of bits from the @command variable. - def execute - @command.unshift "nice -n 19" - @command << "--rev" if @options[:comp] - @command << "> /dev/null 2>&1" unless @options[:verbose] - command = @command.join(" ") - system(command) - raise "Command failed: #{command}" unless $?.success? - - @command = [] - end end -ok_methods = "ublast,usearch,uclust,usearch_uclust" - -casts = [] -casts << {:long=>'no_sort', :short=>'n', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'method', :short=>'m', :type=>'string', :mandatory=>true, :default=>"uclust", :allowed=>ok_methods, :disallowed=>nil} -casts << {:long=>'database', :short=>'d', :type=>'file!', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'comp', :short=>'c', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'identity', :short=>'i', :type=>'float', :mandatory=>true, :default=>0.9, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'e_val', :short=>'e', :type=>'float', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +us = Usearch.new(file_fasta, file_uclust, options) -bp = Biopieces.new +us.sortbylength unless options[:no_sort] +us.cluster_smallmem -options = bp.parse(ARGV, casts) +hash = {} -tmpdir = bp.mktmpdir -infile = "#{tmpdir}/in.fna" -outfile = "#{tmpdir}/out.uc" - -Fasta.open(infile, mode="w") do |fasta_io| - bp.each_record do |record| - bp.puts record - fasta_io.puts record - end +us.each_cluster do |cluster| + hash[cluster[:Q_ID].to_sym] = cluster.dup end -uclust = Uclust.new(infile, outfile, options) -uclust.sort unless options[:no_sort] - -case options[:method].to_s -when "ublast" then uclust.ublast -when "usearch" then uclust.usearch -when "uclust" then uclust.uclust -when "usearch_uclust" then uclust.usearch_uclust -else raise "Unknown method: #{options[:method]}" -end +Biopieces.open(file_records, options[:stream_out]) do |input, output| + input.each_record do |record| + if record[:SEQ_NAME] and record[:SEQ] + if hash[record[:SEQ_NAME].to_sym] + us = hash[record[:SEQ_NAME].to_sym] + record[:CLUSTER] = us[:CLUSTER].to_i + record[:IDENT] = us[:IDENT].to_i + record[:IDENT] = '*' if us[:TYPE] == 'S' + end + end -uclust.each do |record| - bp.puts record + output.puts record + end end