From 0fe46f5571c01ba067b852441e7b57f931bed7f1 Mon Sep 17 00:00:00 2001 From: martinahansen Date: Mon, 21 Nov 2011 10:29:30 +0000 Subject: [PATCH] upgraded usearch_seq git-svn-id: http://biopieces.googlecode.com/svn/trunk@1671 74ccb610-7750-0410-82ae-013aeee3265d --- bp_bin/usearch_seq | 100 ++++++++++++--------------------------------- 1 file changed, 26 insertions(+), 74 deletions(-) diff --git a/bp_bin/usearch_seq b/bp_bin/usearch_seq index b20a29c..5cac949 100755 --- a/bp_bin/usearch_seq +++ b/bp_bin/usearch_seq @@ -24,16 +24,14 @@ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -# Run Uclust on sequences in the stream. +# Usearch sequences in the stream against a specified database. # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< require 'maasha/biopieces' require 'maasha/fasta' -SORT_LIMIT = 2_000_000_000 # use mergesort for files biggern than 2Gb. - -class Uclust +class Usearch include Enumerable def initialize(infile, outfile, options) @@ -43,70 +41,35 @@ class Uclust @command = [] end - # Method that calls Usearch sorting for sorting a FASTA file - # according to decending sequence length. - def sort - # usearch -sort seqs.fasta -output seqs.sorted.fasta - if File.size(@infile) < SORT_LIMIT - @command << "usearch --sort #{@infile} --output #{@infile}.sort" - else - @command << "usearch --mergesort #{@infile} --output #{@infile}.sort" - end - - execute - - File.rename "#{@infile}.sort", @infile - end - - # Method to execute clustering de novo. - def cluster - @command << "usearch --cluster #{@infile} --uc #{@outfile} --id #{@options[:identity]}" - - execute - end - # Method to execute database search. def usearch - # usearch --query query.fasta --db db.fasta --uc results.uc --id 0.90 [--evalue E] - @command << "usearch --query #{@infile} --db #{@options[:database]} --uc #{@outfile} --id #{@options[:identity]}" + @command << "usearch --query #{@infile} --db #{@options[:database]} --userout #{@outfile}" + @command << "--userfields target+tloz+thiz+query+bits+strand" + @command << "--id #{@options[:identity]}" if @options.has_key? :identity @command << "--evalue #{@options[:e_val]}" if @options.has_key? :e_val execute end - # Method to execute clustering to database plus de novo if not matched. - def usearch_uclust - # usearch --cluster seqs_sorted.fasta --db db.fasta --uc results.uc --id 0.90 - @command << "usearch --cluster #{@infile} --db #{@options[:database]} --uc #{@outfile} --id #{@options[:identity]}" - - execute - end - - # Method to parse a Uclust .uc file and for each line of data + # Method to parse a Useach .uc file and for each line of data # yield a Biopiece record. def each record = {} File.open(@outfile, mode="r") do |ios| + ios.gets # skip comment line ios.each_line do |line| - if line !~ /^#/ - fields = line.chomp.split("\t") - - record[:REC_TYPE] = "UCLUST" - record[:TYPE] = fields[0] - record[:CLUSTER] = fields[1].to_i - record[:SEQ_LEN] = fields[2].to_i - record[:IDENT] = fields[3].to_f - record[:STRAND] = fields[4] - record[:Q_BEG] = fields[5].to_i - record[:S_BEG] = fields[6].to_i - record[:S_END] = fields[6].to_i + fields[2].to_i - record[:CIGAR] = fields[7] - record[:Q_ID] = fields[8] - record[:S_ID] = fields[9] - - yield record - end + fields = line.chomp.split("\t") + + record[:REC_TYPE] = "USEARCH" + record[:S_ID] = fields[0] + record[:S_BEG] = fields[1].to_i + record[:S_END] = fields[2].to_i + record[:Q_ID] = fields[3] + record[:SCORE] = fields[4].to_f + record[:STRAND] = fields[5] + + yield record end end @@ -129,21 +92,15 @@ class Uclust end end -ok_methods = "uclust,usearch,usearch_uclust" - casts = [] -casts << {:long=>'no_sort', :short=>'n', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'method', :short=>'m', :type=>'string', :mandatory=>true, :default=>"uclust", :allowed=>ok_methods, :disallowed=>nil} -casts << {:long=>'database', :short=>'d', :type=>'file!', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'comp', :short=>'c', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'identity', :short=>'i', :type=>'float', :mandatory=>true, :default=>0.9, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'e_val', :short=>'e', :type=>'float', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'database', :short=>'d', :type=>'file!', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'comp', :short=>'c', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'identity', :short=>'i', :type=>'float', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'e_val', :short=>'e', :type=>'float', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} options = Biopieces.options_parse(ARGV, casts) -# At high identities, around 96% and above, compressed indexes are often more sensitive, faster -# and use less RAM. Compressed indexes are disabled by default, so I generally recommend that -# you specify the --slots and --w options when clustering at high identities. +raise ArgumentError, "--identity or --e_val must be specified" unless options[:identity] or options[:e_val] tmpdir = Biopieces.mktmpdir infile = File.join(tmpdir, "in.fna") @@ -160,16 +117,11 @@ Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| end end - uclust = Uclust.new(infile, outfile, options) - uclust.sort unless options[:no_sort] or options[:method] == "usearch" + uc = Usearch.new(infile, outfile, options) - case options[:method].to_s - when "uclust" then uclust.cluster - when "usearch" then uclust.usearch - when "usearch_uclust" then uclust.usearch_uclust - end + uc.usearch - uclust.each do |record| + uc.each do |record| output.puts record end end -- 2.39.2