execute
end
- # Method to execute database search.
- def usearch
- # usearch --query query.fasta --db db.fasta --uc results.uc --id 0.90 [--evalue E]
- @command << "usearch --query #{@infile} --db #{@options[:database]} --uc #{@outfile} --id #{@options[:identity]}"
- @command << "--evalue #{@options[:e_val]}" if @options.has_key? :e_val
-
- execute
- end
-
- # Method to execute clustering to database plus de novo if not matched.
- def usearch_uclust
- # usearch --cluster seqs_sorted.fasta --db db.fasta --uc results.uc --id 0.90
- @command << "usearch --cluster #{@infile} --db #{@options[:database]} --uc #{@outfile} --id #{@options[:identity]}"
-
- execute
- end
-
# Method to parse a Uclust .uc file and for each line of data
# yield a Biopiece record.
def each
if line !~ /^#/
fields = line.chomp.split("\t")
- record[:REC_TYPE] = "UCLUST"
+ next if fields[0] == 'C'
+
record[:TYPE] = fields[0]
record[:CLUSTER] = fields[1].to_i
- record[:SEQ_LEN] = fields[2].to_i
record[:IDENT] = fields[3].to_f
- record[:STRAND] = fields[4]
- record[:Q_BEG] = fields[5].to_i
- record[:S_BEG] = fields[6].to_i
- record[:S_END] = fields[6].to_i + fields[2].to_i
- record[:CIGAR] = fields[7]
record[:Q_ID] = fields[8]
- record[:S_ID] = fields[9]
yield record
end
end
end
-ok_methods = "uclust,usearch,usearch_uclust"
-
casts = []
casts << {:long=>'no_sort', :short=>'n', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'method', :short=>'m', :type=>'string', :mandatory=>true, :default=>"uclust", :allowed=>ok_methods, :disallowed=>nil}
-casts << {:long=>'database', :short=>'d', :type=>'file!', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
casts << {:long=>'comp', :short=>'c', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
casts << {:long=>'identity', :short=>'i', :type=>'float', :mandatory=>true, :default=>0.9, :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'e_val', :short=>'e', :type=>'float', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
options = Biopieces.options_parse(ARGV, casts)
-# At high identities, around 96% and above, compressed indexes are often more sensitive, faster
-# and use less RAM. Compressed indexes are disabled by default, so I generally recommend that
-# you specify the --slots and --w options when clustering at high identities.
-
-tmpdir = Biopieces.mktmpdir
-infile = File.join(tmpdir, "in.fna")
-outfile = File.join(tmpdir, "out.uc")
+tmpdir = Biopieces.mktmpdir
+file_records = File.join(tmpdir, "data.stream")
+file_fasta = File.join(tmpdir, "in.fna")
+file_uclust = File.join(tmpdir, "out.uc")
-Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output|
- Fasta.open(infile, mode="w") do |fasta_io|
+Biopieces.open(options[:stream_in], file_records) do |input, output|
+ Fasta.open(file_fasta, mode="w") do |fasta_io|
input.each_record do |record|
output.puts record
end
end
end
+end
- uclust = Uclust.new(infile, outfile, options)
- uclust.sort unless options[:no_sort] or options[:method] == "usearch"
+uc = Uclust.new(file_fasta, file_uclust, options)
+uc.sort unless options[:no_sort]
+uc.cluster
- case options[:method].to_s
- when "uclust" then uclust.cluster
- when "usearch" then uclust.usearch
- when "usearch_uclust" then uclust.usearch_uclust
- end
+# Index a copy of every parsed .uc record by its query ID (as a symbol).
+hash = {}
+uc.each { |hit| hash[hit[:Q_ID].to_sym] = hit.dup }
+
+Biopieces.open(file_records, options[:stream_out]) do |input, output|
+ input.each_record do |record|
+ if record.has_key? :SEQ_NAME and record.has_key? :SEQ
+ if hash.has_key? record[:SEQ_NAME].to_sym
+ uc = hash[record[:SEQ_NAME].to_sym]
+ record[:CLUSTER] = uc[:CLUSTER].to_i
+ record[:IDENT] = uc[:IDENT].to_i
+ record[:IDENT] = '*' if uc[:TYPE] == 'S'
+ end
+ end
- uclust.each do |record|
output.puts record
end
end