From 46d0f24129a8b289f7297ecdee74eeebf7d26aa9 Mon Sep 17 00:00:00 2001 From: martinahansen Date: Tue, 21 Sep 2010 09:22:37 +0000 Subject: [PATCH] added uclust_seq Biopiece - Work in progress! git-svn-id: http://biopieces.googlecode.com/svn/trunk@1100 74ccb610-7750-0410-82ae-013aeee3265d --- bp_bin/uclust_seq | 166 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 166 insertions(+) create mode 100755 bp_bin/uclust_seq diff --git a/bp_bin/uclust_seq b/bp_bin/uclust_seq new file mode 100755 index 0000000..4738212 --- /dev/null +++ b/bp_bin/uclust_seq @@ -0,0 +1,166 @@ +#!/usr/bin/env ruby + +# Copyright (C) 2007-2010 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# This program is part of the Biopieces framework (www.biopieces.org). + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +require 'biopieces' +require 'fasta' + +class Uclust + include Enumerable + + def initialize(infile, outfile) + @infile = infile + @outfile = outfile + end + + # Method that calls Uclusts sorting for sorting a FASTA file + # according to decending sequence length. + def sort(options) + command = "nice -n 19" + command << " uclust --sort #{@infile} --output #{@infile}.sort" + command << " > /dev/null 2>&1" unless options[:verbose] + system(command) + raise "Failed sorting file: #{@infile}" unless $?.success? + + File.rename "#{@infile}.sort", @infile + end + + def ublast(options) + # uclust --ublast query_seqs.fasta --db database.fasta --blast6out filename --evalue E + options[:e_val] = 10 if options[:e_val].is_nil? + command = "nice -n 19" + command << " uclust --ublast #{@infile} --db #{options[:database]} --uc #{@outfile} --evalue #{options[:e_val]}" + command << " > /dev/null 2>&1" unless options[:verbose] + system(command) + raise "Command failed: #{command}" unless $?.success? + end + + def usearch(options) + # uclust --query query.fasta --db db.fasta --uc results.uc --id 0.90 [--evalue E] + command = "nice -n 19" + command << " uclust --query #{@infile} --db #{options[:database]} --uc #{@outfile} --id #{options[:identity]}" + command << " --evalue #{options[:e_val]}" if options.has_key? :e_val + command << " > /dev/null 2>&1" unless options[:verbose] + system(command) + raise "Command failed: #{command}" unless $?.success? + end + + def uclust(options) + # uclust --input seqs_sorted.fasta --uc results.uc --id 0.90 + command = "nice -n 19" + command << " uclust --input #{@infile} --uc #{@outfile} --id #{options[:identity]}" + command << " > /dev/null 2>&1" unless options[:verbose] + system(command) + raise "Command failed: #{command}" unless $?.success? + end + + def usearch_uclust(options) + # uclust --input seqs_sorted.fasta --lib db.fasta --uc results.uc --id 0.90 + command = "nice -n 19" + command << " uclust --input #{@infile} --lib #{options[:database]} --uc #{@outfile} --id #{options[:identity]}" + command << " --lib #{options[:database]}" if options.has_key? :database + command << " > /dev/null 2>&1" unless options[:verbose] + system(command) + raise "Command failed: #{command}" unless $?.success? + end + + def each + record = {} + + File.open(@outfile, mode="r") do |ios| + ios.each_line do |line| + if line !~ /^#/ + fields = line.split("\t") + + record[:REC_TYPE] = "UCLUST" + record[:TYPE] = fields[0] + record[:CLUSTER] = fields[1] + record[:SEQ_LEN] = fields[2] + record[:IDENT] = fields[3] + record[:STRAND] = fields[4] + record[:Q_BEG] = fields[5] + record[:S_BEG] = fields[6] + record[:CIGAR] = fields[7] + record[:Q_ID] = fields[8] + record[:S_ID] = fields[9] + + yield record + end + end + end + + self # conventionally + end +end + +ok_methods = "ublast,usearch,uclust,usearch_uclust" + +casts = [] +casts << {:long=>'no_sort', :short=>'n', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'method', :short=>'m', :type=>'string', :mandatory=>true, :default=>"uclust", :allowed=>ok_methods, :disallowed=>nil} +casts << {:long=>'database', :short=>'d', :type=>'file!', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'identity', :short=>'i', :type=>'float', :mandatory=>true, :default=>0.9, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'e_val', :short=>'e', :type=>'float', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} + +bp = Biopieces.new + +options = bp.parse(ARGV, casts) + +tmpdir = bp.mktmpdir +infile = "#{tmpdir}/in.fna" +outfile = "#{tmpdir}/out.uc" + +Fasta.open(infile, mode="w") do |fasta_io| + bp.each_record do |record| + bp.puts record + fasta_io.puts record + end +end + +uclust = Uclust.new(infile, outfile) +uclust.sort(options) unless options[:no_sort] + +case options[:method].to_s +when "ublast" then uclust.ublast(options) +when "usearch" then uclust.usearch(options) +when "uclust" then uclust.uclust(options) +when "usearch_uclust" then uclust.usearch_uclust(options) +else raise "Unknown method: #{options[:method]}" +end + +uclust.each do |record| + bp.puts record +end + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +__END__ -- 2.39.2