From 2949257922cc90a28af23386a72dbad8157e4fa8 Mon Sep 17 00:00:00 2001 From: martinahansen Date: Fri, 6 Sep 2013 10:17:57 +0000 Subject: [PATCH] upgraded uclust_seq to support usearch7 git-svn-id: http://biopieces.googlecode.com/svn/trunk@2189 74ccb610-7750-0410-82ae-013aeee3265d --- bp_bin/uclust_seq | 17 +++++++------- bp_test/out/uclust_seq.out.1 | 18 +++++++-------- bp_test/out/uclust_seq.out.2 | 18 +++++++-------- bp_test/out/uclust_seq.out.3 | 22 +++++++++--------- code_ruby/lib/maasha/usearch.rb | 40 +++++++++++++++++++-------------- 5 files changed, 61 insertions(+), 54 deletions(-) diff --git a/bp_bin/uclust_seq b/bp_bin/uclust_seq index 936d642..f155bdf 100755 --- a/bp_bin/uclust_seq +++ b/bp_bin/uclust_seq @@ -33,9 +33,9 @@ require 'maasha/fasta' require 'maasha/usearch' casts = [] -casts << {:long=>'no_sort', :short=>'n', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'comp', :short=>'c', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'identity', :short=>'i', :type=>'float', :mandatory=>true, :default=>0.9, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'no_sort', :short=>'n', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'comp', :short=>'c', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'identity', :short=>'i', :type=>'float', :mandatory=>true, :default=>0.9, :allowed=>nil, :disallowed=>nil} options = Biopieces.options_parse(ARGV, casts) @@ -45,7 +45,7 @@ file_fasta = File.join(tmpdir, "in.fna") file_uclust = File.join(tmpdir, "out.uc") Biopieces.open(options[:stream_in], file_records) do |input, output| - Fasta.open(file_fasta, mode="w") do |fasta_io| + Fasta.open(file_fasta, 'w') do |fasta_io| input.each_record do |record| output.puts record @@ -57,13 +57,14 @@ Biopieces.open(options[:stream_in], file_records) do |input, output| end us = Usearch.new(file_fasta, file_uclust, options) -us.sort_length unless options[:no_sort] -us.cluster + +us.sortbylength unless options[:no_sort] +us.cluster_smallmem hash = {} -us.each_cluster do |record| - hash[record[:Q_ID].to_sym] = record.dup +us.each_cluster do |cluster| + hash[cluster[:Q_ID].to_sym] = cluster.dup end Biopieces.open(file_records, options[:stream_out]) do |input, output| diff --git a/bp_test/out/uclust_seq.out.1 b/bp_test/out/uclust_seq.out.1 index 6a64040..43691ea 100644 --- a/bp_test/out/uclust_seq.out.1 +++ b/bp_test/out/uclust_seq.out.1 @@ -1,42 +1,42 @@ SEQ_NAME: test1 SEQ: tgtacgtagctagctagctagctagctagctagctagctagctgactatcgtgatcgtg SEQ_LEN: 59 -CLUSTER: 0 +CLUSTER: 2 IDENT: * --- SEQ_NAME: test1_100 SEQ: tgtacgtagctagctagctagctagctagctagctagctagctgactatcgtgatcgtg SEQ_LEN: 59 -CLUSTER: 0 +CLUSTER: 2 IDENT: 100 --- SEQ_NAME: test1_2 SEQ: tgtacgtagctagctagctagctagcGagctagctagcAagctgactatcgtgatcgtg SEQ_LEN: 59 -CLUSTER: 0 +CLUSTER: 2 IDENT: 96 --- SEQ_NAME: test2 SEQ: ggttgtgtgtgtgtatcgatgtagtctacatcgtctatctgtactgacttactgactac SEQ_LEN: 59 -CLUSTER: 1 -IDENT: * +CLUSTER: 0 +IDENT: 100 --- SEQ_NAME: test2_100 SEQ: ggttgtgtgtgtgtatcgatgtagtctacatcgtctatctgtactgacttactgactac SEQ_LEN: 59 -CLUSTER: 1 -IDENT: 100 +CLUSTER: 0 +IDENT: * --- SEQ_NAME: test2_1 SEQ: ggtAgtgtgtgAgtatcgatgtagtctacatcgtctatctgtactgacttactgactac SEQ_LEN: 59 -CLUSTER: 1 +CLUSTER: 0 IDENT: 96 --- SEQ_NAME: test1_rc SEQ: cacgatcacgatagtcagctagctagctagctagctagctagctagctagctacgtaca SEQ_LEN: 59 -CLUSTER: 2 +CLUSTER: 1 IDENT: * --- diff --git a/bp_test/out/uclust_seq.out.2 b/bp_test/out/uclust_seq.out.2 index 92997a0..5c1b6e4 100644 --- a/bp_test/out/uclust_seq.out.2 +++ b/bp_test/out/uclust_seq.out.2 @@ -1,42 +1,42 @@ SEQ_NAME: test1 SEQ: tgtacgtagctagctagctagctagctagctagctagctagctgactatcgtgatcgtg SEQ_LEN: 59 -CLUSTER: 0 +CLUSTER: 3 IDENT: * --- SEQ_NAME: test1_100 SEQ: tgtacgtagctagctagctagctagctagctagctagctagctgactatcgtgatcgtg SEQ_LEN: 59 -CLUSTER: 0 +CLUSTER: 3 IDENT: 100 --- SEQ_NAME: test1_2 SEQ: tgtacgtagctagctagctagctagcGagctagctagcAagctgactatcgtgatcgtg SEQ_LEN: 59 -CLUSTER: 1 +CLUSTER: 4 IDENT: * --- SEQ_NAME: test2 SEQ: ggttgtgtgtgtgtatcgatgtagtctacatcgtctatctgtactgacttactgactac SEQ_LEN: 59 -CLUSTER: 2 -IDENT: * +CLUSTER: 0 +IDENT: 100 --- SEQ_NAME: test2_100 SEQ: ggttgtgtgtgtgtatcgatgtagtctacatcgtctatctgtactgacttactgactac SEQ_LEN: 59 -CLUSTER: 2 -IDENT: 100 +CLUSTER: 0 +IDENT: * --- SEQ_NAME: test2_1 SEQ: ggtAgtgtgtgAgtatcgatgtagtctacatcgtctatctgtactgacttactgactac SEQ_LEN: 59 -CLUSTER: 3 +CLUSTER: 1 IDENT: * --- SEQ_NAME: test1_rc SEQ: cacgatcacgatagtcagctagctagctagctagctagctagctagctagctacgtaca SEQ_LEN: 59 -CLUSTER: 4 +CLUSTER: 2 IDENT: * --- diff --git a/bp_test/out/uclust_seq.out.3 b/bp_test/out/uclust_seq.out.3 index d4b8e23..8052dfc 100644 --- a/bp_test/out/uclust_seq.out.3 +++ b/bp_test/out/uclust_seq.out.3 @@ -1,42 +1,42 @@ SEQ_NAME: test1 SEQ: tgtacgtagctagctagctagctagctagctagctagctagctgactatcgtgatcgtg SEQ_LEN: 59 -CLUSTER: 0 -IDENT: * +CLUSTER: 2 +IDENT: 100 --- SEQ_NAME: test1_100 SEQ: tgtacgtagctagctagctagctagctagctagctagctagctgactatcgtgatcgtg SEQ_LEN: 59 -CLUSTER: 0 +CLUSTER: 2 IDENT: 100 --- SEQ_NAME: test1_2 SEQ: tgtacgtagctagctagctagctagcGagctagctagcAagctgactatcgtgatcgtg SEQ_LEN: 59 -CLUSTER: 1 +CLUSTER: 3 IDENT: * --- SEQ_NAME: test2 SEQ: ggttgtgtgtgtgtatcgatgtagtctacatcgtctatctgtactgacttactgactac SEQ_LEN: 59 -CLUSTER: 2 -IDENT: * +CLUSTER: 0 +IDENT: 100 --- SEQ_NAME: test2_100 SEQ: ggttgtgtgtgtgtatcgatgtagtctacatcgtctatctgtactgacttactgactac SEQ_LEN: 59 -CLUSTER: 2 -IDENT: 100 +CLUSTER: 0 +IDENT: * --- SEQ_NAME: test2_1 SEQ: ggtAgtgtgtgAgtatcgatgtagtctacatcgtctatctgtactgacttactgactac SEQ_LEN: 59 -CLUSTER: 3 +CLUSTER: 1 IDENT: * --- SEQ_NAME: test1_rc SEQ: cacgatcacgatagtcagctagctagctagctagctagctagctagctagctacgtaca SEQ_LEN: 59 -CLUSTER: 0 -IDENT: 100 +CLUSTER: 2 +IDENT: * --- diff --git a/code_ruby/lib/maasha/usearch.rb b/code_ruby/lib/maasha/usearch.rb index 8375850..0bc7235 100644 --- a/code_ruby/lib/maasha/usearch.rb +++ b/code_ruby/lib/maasha/usearch.rb @@ -27,8 +27,6 @@ require 'maasha/fasta' # Error class for all exceptions to do with Usearch. class UsearchError < StandardError; end -SORT_LIMIT = 2_000_000_000 # use mergesort for files biggern than 2Gb. - class Usearch include Enumerable @@ -43,13 +41,9 @@ class Usearch # Method that calls Usearch sorting for sorting a FASTA file # according to decending sequence length. - def sort_length - # usearch -sort seqs.fasta -output seqs.sorted.fasta - if File.size(@infile) < SORT_LIMIT - @command << "usearch --sort #{@infile} --output #{@infile}.sort" - else - @command << "usearch --mergesort #{@infile} --output #{@infile}.sort" - end + def sortbylength + # usearch -sortbylength seqs.fasta -output seqs_sorted.fasta -minseqlength 64 + @command << "usearch -sortbylength #{@infile} -output #{@infile}.sort" execute @@ -58,20 +52,31 @@ class Usearch # Method that calls Usearch sorting for sorting a FASTA file # according to cluster size. - def sort_size - @command << "usearch --sortsize #{@infile} --output #{@infile}.sort" + def sortbysize + # usearch -sortbysize seqs.fasta -output seqs_sorted.fasta -minsize 4 + @command << "usearch -sortbysize #{@infile} -output #{@infile}.sort" execute File.rename "#{@infile}.sort", @infile end - # Method to execute clustering de novo. - def cluster - @command << "usearch --cluster #{@infile} --uc #{@outfile} --id #{@options[:identity]}" - @command << "--rev" if @options[:comp] + # Method to execute cluster_fast. + def cluster_fast + # usearch -cluster_fast query.fasta -id 0.9 -centroids nr.fasta -uc clusters.uc + @command << "usearch -cluster_fast #{@infile} -id #{@options[:identity]} -uc #{@outfile}" - execute + execute + end + + # Method to execute cluster_smallmem. + # NB sequences must be sorted with sortbylength or sortbysize. + def cluster_smallmem + # usearch -cluster_smallmem query.fasta -id 0.9 -centroids nr.fasta -uc clusters.uc + @command << "usearch -cluster_smallmem #{@infile} -id #{@options[:identity]} -uc #{@outfile}" + @command << "-strand both" if @options[:comp] + + execute end # Method to execute database search. @@ -188,6 +193,8 @@ class Usearch yield Align.new(entries) unless entries.empty? end + + self # conventionally end private @@ -209,7 +216,6 @@ class Usearch # Method to execute a command using a system() call. # The command is composed of bits from the @command variable. def execute - @command.unshift "nice -n 19" @command << "--quiet" unless @options[:verbose] command = @command.join(" ") $stderr.puts "Running command: #{command}" if @options[:verbose] -- 2.39.2