From f0d75c8a152f9dd551d75feb18982b35dcb7e759 Mon Sep 17 00:00:00 2001 From: martinahansen Date: Fri, 18 Nov 2011 10:57:25 +0000 Subject: [PATCH] rewriting uclust_seq git-svn-id: http://biopieces.googlecode.com/svn/trunk@1665 74ccb610-7750-0410-82ae-013aeee3265d --- bp_bin/cluster_seq | 168 --------------------- bp_bin/remove_primers | 27 ++-- bp_bin/uclust_seq | 75 ++++----- bp_bin/usearch_seq | 344 ++++++++++++++++++------------------------ 4 files changed, 194 insertions(+), 420 deletions(-) delete mode 100755 bp_bin/cluster_seq diff --git a/bp_bin/cluster_seq b/bp_bin/cluster_seq deleted file mode 100755 index 53d8f2c..0000000 --- a/bp_bin/cluster_seq +++ /dev/null @@ -1,168 +0,0 @@ -#!/usr/bin/env perl - -# Copyright (C) 2007-2009 Martin A. Hansen. - -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 2 -# of the License, or (at your option) any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. - -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - -# http://www.gnu.org/copyleft/gpl.html - - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - -# Cluster sequences in the stream. - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - -use warnings; -use strict; -use Data::Dumper; -use Maasha::Common; -use Maasha::Biopieces; -use Maasha::Fasta; -use Maasha::Seq; -use Maasha::Filesys; - - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - -my ( $options, $in, $out, $tmp_dir, $tmp_file, $tmp_fh, $record, $entry, @args, $arg_str, @fields, $line ); - -$options = Maasha::Biopieces::parse_options( - [ - { long => 'identity', short => 'i', type => 'float', mandatory => 'no', default => "0.9", allowed => undef, disallowed => undef }, - { long => 'library', short => 'l', type => 'file!', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, - { long => 'no_sort', short => 'n', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, - ] -); - -$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } ); -$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } ); - -$tmp_dir = Maasha::Biopieces::get_tmpdir(); -$tmp_file = "$tmp_dir/uclust.fasta"; -$tmp_fh = Maasha::Filesys::file_write_open( $tmp_file ); - -while ( $record = Maasha::Biopieces::get_record( $in ) ) -{ - if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) { - Maasha::Fasta::put_entry( $entry, $tmp_fh ); - } - - Maasha::Biopieces::put_record( $record, $out ); -} - -close $tmp_fh; - -uclust_sort( $tmp_file, $tmp_dir, $options->{ 'verbose' } ) if not $options->{ 'no_sort' }; - -push @args, "--input $tmp_file"; -push @args, "--id $options->{ 'identity' }"; -push @args, "--tmpdir $tmp_dir"; -push @args, "--uc $tmp_file.out"; -push @args, "--lib $options->{ 'library' }" if $options->{ 'library' }; -push @args, "--libonly" if $options->{ 'library' }; -push @args, "--quiet" if not $options->{ 'verbose' }; -push @args, "> /dev/null 2>&1" if not $options->{ 'verbose' }; - -$arg_str = join " ", @args; - -Maasha::Common::run( "uclust", $arg_str ); - -$tmp_fh = Maasha::Filesys::file_read_open( "$tmp_file.out" ); - -while ( $line = <$tmp_fh> ) -{ - next if $line =~ /^#/; - - chomp $line; - - @fields = split "\t", $line; - - $record = { - REC_TYPE => 'UCLUST', - TYPE => $fields[ 0 ], - CLUSTER => $fields[ 1 ], - SIZE => $fields[ 2 ], - IDENT => $fields[ 3 ], - STRAND => $fields[ 4 ], - Q_BEG => $fields[ 5 ], - S_BEG => $fields[ 6 ], - CIGAR => $fields[ 7 ], - SEQ_NAME => $fields[ 8 ], - }; - - Maasha::Biopieces::put_record( $record, $out ); -} - -close $tmp_fh; - -Maasha::Biopieces::close_stream( $in ); -Maasha::Biopieces::close_stream( $out ); - - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SUBROUTINES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - -sub uclust_sort -{ - # Martin A. Hansen, January 2010. - - # Sorts a FASTA file according to sequence length - # - longest first - using uclust. - - my ( $file, # file to sort - $tmp_dir, # temporary directory for sorting - $verbose, # verbose flag - OPTIONAL - ) = @_; - - # Returns nothing. - - my ( @args, $arg_str ); - - push @args, "--mergesort $file"; - push @args, "--output $file.sort"; - push @args, "--tmpdir $tmp_dir"; - push @args, "--quiet" if not $verbose; - push @args, "> /dev/null 2>&1" if not $verbose; - - $arg_str = join " ", @args; - - Maasha::Common::run( "uclust", $arg_str ); - - rename "$file.sort", $file; -} - - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - -BEGIN -{ - Maasha::Biopieces::status_set(); -} - - -END -{ - Maasha::Biopieces::status_log(); -} - - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - -__END__ diff --git a/bp_bin/remove_primers b/bp_bin/remove_primers index 9c8457c..720d943 100755 --- a/bp_bin/remove_primers +++ b/bp_bin/remove_primers @@ -45,30 +45,35 @@ options = Biopieces.options_parse(ARGV, casts) Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| input.each do |record| if record.has_key? :SEQ + forward = false + reverse = false seq = Seq.new_bp(record) - pos = 0 - len = 0 - if forward = seq.patscan(options[:forward].to_s, pos, options[:mismatches], options[:insertions], options[:deletions]) - record[:FORWARD_POS] = forward.last.pos - record[:FORWARD_LEN] = forward.last.length - pos = forward.last.pos + forward.last.length + seq.patscan(options[:forward].to_s, 0, options[:mismatches], options[:insertions], options[:deletions]) do |match| + record[:FORWARD_POS] = match.pos + record[:FORWARD_LEN] = match.length + pos = match.pos + match.length len = seq.length - pos seq.subseq!(pos, len) if len > 0 + forward = true + break end - if pos < seq.length && reverse = seq.patscan(options[:reverse].to_s, pos, options[:mismatches], options[:insertions], options[:deletions]) - record[:REVERSE_POS] = reverse.first.pos - record[:REVERSE_LEN] = reverse.first.length + seq.patscan(options[:reverse].to_s, 0, options[:mismatches], options[:insertions], options[:deletions]) do |match| + record[:REVERSE_POS] = match.pos + record[:REVERSE_LEN] = match.length pos = 0 - len = reverse.first.pos + len = match.pos if len == 0 seq.seq = "" seq.qual = "" if seq.qual else seq.subseq!(pos, len) - end + end + + reverse = true + break end if forward or reverse diff --git a/bp_bin/uclust_seq b/bp_bin/uclust_seq index b20a29c..0eaee60 100755 --- a/bp_bin/uclust_seq +++ b/bp_bin/uclust_seq @@ -65,23 +65,6 @@ class Uclust execute end - # Method to execute database search. - def usearch - # usearch --query query.fasta --db db.fasta --uc results.uc --id 0.90 [--evalue E] - @command << "usearch --query #{@infile} --db #{@options[:database]} --uc #{@outfile} --id #{@options[:identity]}" - @command << "--evalue #{@options[:e_val]}" if @options.has_key? :e_val - - execute - end - - # Method to execute clustering to database plus de novo if not matched. - def usearch_uclust - # usearch --cluster seqs_sorted.fasta --db db.fasta --uc results.uc --id 0.90 - @command << "usearch --cluster #{@infile} --db #{@options[:database]} --uc #{@outfile} --id #{@options[:identity]}" - - execute - end - # Method to parse a Uclust .uc file and for each line of data # yield a Biopiece record. def each @@ -92,18 +75,12 @@ class Uclust if line !~ /^#/ fields = line.chomp.split("\t") - record[:REC_TYPE] = "UCLUST" + next if fields[0] == 'C' + record[:TYPE] = fields[0] record[:CLUSTER] = fields[1].to_i - record[:SEQ_LEN] = fields[2].to_i record[:IDENT] = fields[3].to_f - record[:STRAND] = fields[4] - record[:Q_BEG] = fields[5].to_i - record[:S_BEG] = fields[6].to_i - record[:S_END] = fields[6].to_i + fields[2].to_i - record[:CIGAR] = fields[7] record[:Q_ID] = fields[8] - record[:S_ID] = fields[9] yield record end @@ -129,28 +106,20 @@ class Uclust end end -ok_methods = "uclust,usearch,usearch_uclust" - casts = [] casts << {:long=>'no_sort', :short=>'n', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'method', :short=>'m', :type=>'string', :mandatory=>true, :default=>"uclust", :allowed=>ok_methods, :disallowed=>nil} -casts << {:long=>'database', :short=>'d', :type=>'file!', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} casts << {:long=>'comp', :short=>'c', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} casts << {:long=>'identity', :short=>'i', :type=>'float', :mandatory=>true, :default=>0.9, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'e_val', :short=>'e', :type=>'float', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} options = Biopieces.options_parse(ARGV, casts) -# At high identities, around 96% and above, compressed indexes are often more sensitive, faster -# and use less RAM. Compressed indexes are disabled by default, so I generally recommend that -# you specify the --slots and --w options when clustering at high identities. - -tmpdir = Biopieces.mktmpdir -infile = File.join(tmpdir, "in.fna") -outfile = File.join(tmpdir, "out.uc") +tmpdir = Biopieces.mktmpdir +file_records = File.join(tmpdir, "data.stream") +file_fasta = File.join(tmpdir, "in.fna") +file_uclust = File.join(tmpdir, "out.uc") -Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| - Fasta.open(infile, mode="w") do |fasta_io| +Biopieces.open(options[:stream_in], file_records) do |input, output| + Fasta.open(file_fasta, mode="w") do |fasta_io| input.each_record do |record| output.puts record @@ -159,17 +128,29 @@ Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| end end end +end - uclust = Uclust.new(infile, outfile, options) - uclust.sort unless options[:no_sort] or options[:method] == "usearch" +uc = Uclust.new(file_fasta, file_uclust, options) +uc.sort unless options[:no_sort] +uc.cluster - case options[:method].to_s - when "uclust" then uclust.cluster - when "usearch" then uclust.usearch - when "usearch_uclust" then uclust.usearch_uclust - end +hash = {} + +uc.each do |record| + hash[record[:Q_ID].to_sym] = record.dup +end + +Biopieces.open(file_records, options[:stream_out]) do |input, output| + input.each_record do |record| + if record.has_key? :SEQ_NAME and record.has_key? :SEQ + if hash.has_key? record[:SEQ_NAME].to_sym + uc = hash[record[:SEQ_NAME].to_sym] + record[:CLUSTER] = uc[:CLUSTER].to_i + record[:IDENT] = uc[:IDENT].to_i + record[:IDENT] = '*' if uc[:TYPE] == 'S' + end + end - uclust.each do |record| output.puts record end end diff --git a/bp_bin/usearch_seq b/bp_bin/usearch_seq index e563893..b20a29c 100755 --- a/bp_bin/usearch_seq +++ b/bp_bin/usearch_seq @@ -1,6 +1,6 @@ -#!/usr/bin/env perl +#!/usr/bin/env ruby -# Copyright (C) 2007-2010 Martin A. Hansen. +# Copyright (C) 2007-2011 Martin A. Hansen. # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License @@ -18,205 +18,161 @@ # http://www.gnu.org/copyleft/gpl.html - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - -# USEARCH sequences in the stream against a specified database or genome. - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - -use warnings; -use strict; -use Data::Dumper; -use Maasha::Biopieces; -use Maasha::Common; -use Maasha::Seq; -use Maasha::Fasta; - - # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +# This program is part of the Biopieces framework (www.biopieces.org). -my ( $TYPE_HASH, $options, $in, $out, $tmp_dir, $tmp_in, $tmp_out, $q_type, $record, $entry, $fh_in, $fh_out ); - -$TYPE_HASH = { - 'L' => 'LibSeed', - 'S' => 'NewSeed', - 'H' => 'Hit', - 'R' => 'Reject', - 'D' => 'LibCluster', - 'C' => 'NewCluster', - 'N' => 'NoHit' -}; - -$options = Maasha::Biopieces::parse_options( - [ - { long => 'database', short => 'd', type => 'file', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, - { long => 'genome', short => 'g', type => 'genome', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, - { long => 'id', short => 'i', type => 'float', mandatory => 'no', default => '0.90', allowed => undef, disallowed => undef }, - ] -); - -Maasha::Common::error( qq(both --database and --genome specified) ) if $options->{ "genome" } and $options->{ "database" }; -Maasha::Common::error( qq(no --database or --genome specified) ) if not $options->{ "genome" } and not $options->{ "database" }; - -$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } ); -$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } ); - -$options->{ "database" } = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/fasta/$options->{ 'genome' }.fna" if $options->{ 'genome' }; - -$tmp_dir = Maasha::Biopieces::get_tmpdir(); -$tmp_in = "$tmp_dir/usearch_query.seq"; -$tmp_out = "$tmp_dir/usearch.uc"; - -$fh_out = Maasha::Filesys::file_write_open( $tmp_in ); - -while ( $record = Maasha::Biopieces::get_record( $in ) ) -{ - if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) - { - $q_type = Maasha::Seq::seq_guess_type( $record->{ 'SEQ' } ) if not $q_type; - - Maasha::Fasta::put_entry( $entry, $fh_out ); - } - - Maasha::Biopieces::put_record( $record, $out ); -} - -close $fh_out; - - -if ( $options->{ 'verbose' } ) -{ - Maasha::Common::run( - "uclust", - join( " ", - "--input $tmp_in", - "--lib $options->{ 'database' }", - "--libonly", - "--uc $tmp_out", - "--id $options->{ 'id' }", - "--rev", - ), - 1 - ); -} -else -{ - Maasha::Common::run( - "uclust", - join( " ", - "--input $tmp_in", - "--lib $options->{ 'database' }", - "--libonly", - "--uc $tmp_out", - "--id $options->{ 'id' }", - "--rev", - "> /dev/null 2>&1" - ), - 1 - ); -} - -unlink $tmp_in; - -$fh_in = Maasha::Filesys::file_read_open( $tmp_out ); - -while ( $entry = get_tab_entry( $fh_in ) ) -{ - $record = uclust_tab2biopiece( $entry ); - - Maasha::Biopieces::put_record( $record, $out ) if $record->{ 'TYPE' } eq 'Hit'; -} - -close $fh_out; - -unlink $tmp_out; - -Maasha::Biopieces::close_stream( $in ); -Maasha::Biopieces::close_stream( $out ); - - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SUBROUTINES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - -sub get_tab_entry -{ - # Martin A. Hansen, May 2009. - - # Get the next tabular entry from a filehandle to a USEARCH file. - - my ( $fh, # filehandle - ) = @_; - - # Returns a list - - my ( $line, @fields ); - - while ( $line = <$fh> ) - { - next if $line =~ /^#/; - - @fields = split /\s+/, $line; - - return wantarray ? @fields : \@fields; - } - - return undef; -} - - -sub uclust_tab2biopiece -{ - # Martin A. Hansen, May 2009. - - # Get the next USEARCH tabular entry and convert it to - # a biopiece record that is returned. - - my ( $entry, # USEARCH tabular entry - ) = @_; - - # Returns a hashref. - - my ( %record ); - - $record{ "REC_TYPE" } = "USEARCH"; - $record{ "TYPE" } = $TYPE_HASH->{ $entry->[ 0 ] }; - $record{ "CLUSTER" } = $entry->[ 1 ]; - $record{ "ALIGN_LEN" } = $entry->[ 2 ]; - $record{ "ID" } = $entry->[ 3 ]; - $record{ "STRAND" } = $entry->[ 4 ]; - $record{ "Q_BEG" } = $entry->[ 5 ]; - $record{ "S_BEG" } = $entry->[ 6 ]; - $record{ "CIGAR" } = $entry->[ 7 ]; - $record{ "Q_ID" } = $entry->[ 8 ]; - $record{ "S_ID" } = $entry->[ 9 ]; - - if ( $record{ 'TYPE' } eq 'Hit' ) - { - $record{ "Q_END" } = $record{ "Q_BEG" } + $record{ "ALIGN_LEN" } - 1; - $record{ "S_END" } = $record{ "S_BEG" } + $record{ "ALIGN_LEN" } - 1; - } - - return wantarray ? %record : \%record; -} - +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +# Run Uclust on sequences in the stream. # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - -BEGIN -{ - Maasha::Biopieces::status_set(); -} - - -END -{ - Maasha::Biopieces::status_log(); -} +require 'maasha/biopieces' +require 'maasha/fasta' + +SORT_LIMIT = 2_000_000_000 # use mergesort for files biggern than 2Gb. + +class Uclust + include Enumerable + + def initialize(infile, outfile, options) + @infile = infile + @outfile = outfile + @options = options + @command = [] + end + + # Method that calls Usearch sorting for sorting a FASTA file + # according to decending sequence length. + def sort + # usearch -sort seqs.fasta -output seqs.sorted.fasta + if File.size(@infile) < SORT_LIMIT + @command << "usearch --sort #{@infile} --output #{@infile}.sort" + else + @command << "usearch --mergesort #{@infile} --output #{@infile}.sort" + end + + execute + + File.rename "#{@infile}.sort", @infile + end + + # Method to execute clustering de novo. + def cluster + @command << "usearch --cluster #{@infile} --uc #{@outfile} --id #{@options[:identity]}" + + execute + end + + # Method to execute database search. + def usearch + # usearch --query query.fasta --db db.fasta --uc results.uc --id 0.90 [--evalue E] + @command << "usearch --query #{@infile} --db #{@options[:database]} --uc #{@outfile} --id #{@options[:identity]}" + @command << "--evalue #{@options[:e_val]}" if @options.has_key? :e_val + + execute + end + + # Method to execute clustering to database plus de novo if not matched. + def usearch_uclust + # usearch --cluster seqs_sorted.fasta --db db.fasta --uc results.uc --id 0.90 + @command << "usearch --cluster #{@infile} --db #{@options[:database]} --uc #{@outfile} --id #{@options[:identity]}" + + execute + end + + # Method to parse a Uclust .uc file and for each line of data + # yield a Biopiece record. + def each + record = {} + + File.open(@outfile, mode="r") do |ios| + ios.each_line do |line| + if line !~ /^#/ + fields = line.chomp.split("\t") + + record[:REC_TYPE] = "UCLUST" + record[:TYPE] = fields[0] + record[:CLUSTER] = fields[1].to_i + record[:SEQ_LEN] = fields[2].to_i + record[:IDENT] = fields[3].to_f + record[:STRAND] = fields[4] + record[:Q_BEG] = fields[5].to_i + record[:S_BEG] = fields[6].to_i + record[:S_END] = fields[6].to_i + fields[2].to_i + record[:CIGAR] = fields[7] + record[:Q_ID] = fields[8] + record[:S_ID] = fields[9] + + yield record + end + end + end + + self # conventionally + end + + private + + # Method to execute a command using a system() call. + # The command is composed of bits from the @command variable. + def execute + @command.unshift "nice -n 19" + @command << "--rev" if @options[:comp] + @command << "> /dev/null 2>&1" unless @options[:verbose] + command = @command.join(" ") + system(command) + raise "Command failed: #{command}" unless $?.success? + + @command = [] + end +end + +ok_methods = "uclust,usearch,usearch_uclust" + +casts = [] +casts << {:long=>'no_sort', :short=>'n', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'method', :short=>'m', :type=>'string', :mandatory=>true, :default=>"uclust", :allowed=>ok_methods, :disallowed=>nil} +casts << {:long=>'database', :short=>'d', :type=>'file!', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'comp', :short=>'c', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'identity', :short=>'i', :type=>'float', :mandatory=>true, :default=>0.9, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'e_val', :short=>'e', :type=>'float', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} + +options = Biopieces.options_parse(ARGV, casts) + +# At high identities, around 96% and above, compressed indexes are often more sensitive, faster +# and use less RAM. Compressed indexes are disabled by default, so I generally recommend that +# you specify the --slots and --w options when clustering at high identities. + +tmpdir = Biopieces.mktmpdir +infile = File.join(tmpdir, "in.fna") +outfile = File.join(tmpdir, "out.uc") + +Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| + Fasta.open(infile, mode="w") do |fasta_io| + input.each_record do |record| + output.puts record + + if record.has_key? :SEQ_NAME and record.has_key? :SEQ + fasta_io.puts Seq.new_bp(record).to_fasta + end + end + end + + uclust = Uclust.new(infile, outfile, options) + uclust.sort unless options[:no_sort] or options[:method] == "usearch" + + case options[:method].to_s + when "uclust" then uclust.cluster + when "usearch" then uclust.usearch + when "usearch_uclust" then uclust.usearch_uclust + end + + uclust.each do |record| + output.puts record + end +end # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -- 2.39.2