X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=code_ruby%2Flib%2Fmaasha%2Fusearch.rb;h=63757911deebea8e62de71ea688e29311d5427b8;hb=a80169a9121e8537f169cd85010d2ceae3a8d4fd;hp=837585010f41fda3d18ea91292bdef452ba3076d;hpb=c4b49c5ce1ed3b46ae37e6ebd73c21d67d6d4810;p=biopieces.git diff --git a/code_ruby/lib/maasha/usearch.rb b/code_ruby/lib/maasha/usearch.rb index 8375850..6375791 100644 --- a/code_ruby/lib/maasha/usearch.rb +++ b/code_ruby/lib/maasha/usearch.rb @@ -27,8 +27,6 @@ require 'maasha/fasta' # Error class for all exceptions to do with Usearch. class UsearchError < StandardError; end -SORT_LIMIT = 2_000_000_000 # use mergesort for files biggern than 2Gb. - class Usearch include Enumerable @@ -43,13 +41,11 @@ class Usearch # Method that calls Usearch sorting for sorting a FASTA file # according to decending sequence length. - def sort_length - # usearch -sort seqs.fasta -output seqs.sorted.fasta - if File.size(@infile) < SORT_LIMIT - @command << "usearch --sort #{@infile} --output #{@infile}.sort" - else - @command << "usearch --mergesort #{@infile} --output #{@infile}.sort" - end + def sortbylength + # usearch -sortbylength seqs.fasta -output seqs_sorted.fasta -minseqlength 64 + @command << "usearch" + @command << "-sortbylength #{@infile}" + @command << "-output #{@infile}.sort" execute @@ -58,31 +54,76 @@ class Usearch # Method that calls Usearch sorting for sorting a FASTA file # according to cluster size. - def sort_size - @command << "usearch --sortsize #{@infile} --output #{@infile}.sort" + def sortbysize + # usearch -sortbysize seqs.fasta -output seqs_sorted.fasta -minsize 4 + @command << "usearch" + @command << "-sortbysize #{@infile}" + @command << "-output #{@infile}.sort" execute File.rename "#{@infile}.sort", @infile end - # Method to execute clustering de novo. - def cluster - @command << "usearch --cluster #{@infile} --uc #{@outfile} --id #{@options[:identity]}" - @command << "--rev" if @options[:comp] + # Method to execute cluster_fast. + def cluster_fast + # usearch -cluster_fast query.fasta -id 0.9 -centroids nr.fasta -uc clusters.uc + @command << "usearch" + @command << "-cluster_fast #{@infile}" + @command << "-id #{@options[:identity]}" + @command << "-uc #{@outfile}" + @command << "-threads #{@options[:cpus]}" - execute + execute end - # Method to execute database search. - def usearch - @command << "usearch --query #{@infile} --db #{@options[:database]} --userout #{@outfile}" - @command << "--userfields target+tloz+thiz+query+bits+strand" - @command << "--id #{@options[:identity]}" if @options[:identity] - @command << "--evalue #{@options[:e_val]}" if @options[:e_val] - @command << "--rev" + # Method to execute cluster_smallmem. + # NB sequences must be sorted with sortbylength or sortbysize. + def cluster_smallmem + # usearch -cluster_smallmem query.fasta -id 0.9 -centroids nr.fasta -uc clusters.uc + @command << "usearch" + @command << "-cluster_smallmem #{@infile}" + @command << "-id #{@options[:identity]}" + @command << "-uc #{@outfile}" + @command << "-strand both" if @options[:comp] - execute + execute + end + + # Method to execute database local search. + def usearch_local + # usearch -usearch_local query.fasta -db proteins.udb -id 0.8 -alnout results.aln + # usearch -usearch_local query.fasta -db ESTs.fasta -id 0.9 -evalue 1e-6 -blast6out results.m8 -strand plus -maxaccepts 8 -maxrejects 256 + + @command << "usearch" + @command << "-usearch_local #{@infile}" + @command << "-db #{@options[:database]}" + @command << "-blast6out #{@outfile}" + @command << "-strand both" + @command << "-id #{@options[:identity]}" if @options[:identity] + @command << "-evalue #{@options[:e_val]}" if @options[:e_val] + @command << "-maxaccepts #{@options[:maxaccepts]}" + @command << "-threads #{@options[:cpus]}" + + execute + end + + # Method to execute database global search. + def usearch_global + # usearch -usearch_global query.fasta -db proteins.udb -id 0.8 -alnout results.aln + # usearch -usearch_global query.fasta -db ESTs.fasta -id 0.9 -blast6out results.m8 -strand plus -maxaccepts 8 -maxrejects 256 + + @command << "usearch" + @command << "-usearch_global #{@infile}" + @command << "-db #{@options[:database]}" + @command << "-blast6out #{@outfile}" + @command << "-strand both" + @command << "-id #{@options[:identity]}" + @command << "-evalue #{@options[:e_val]}" if @options[:e_val] + @command << "-maxaccepts #{@options[:maxaccepts]}" + @command << "-threads #{@options[:cpus]}" + + execute end # Method to execute uchime chimera detection. @@ -142,17 +183,24 @@ class Usearch record = {} File.open(@outfile, "r") do |ios| - ios.gets # skip comment line ios.each_line do |line| fields = line.chomp.split("\t") - - record[:REC_TYPE] = "USEARCH" - record[:S_ID] = fields[0] - record[:S_BEG] = fields[1].to_i - record[:S_END] = fields[2].to_i - record[:Q_ID] = fields[3] - record[:SCORE] = fields[4].to_f - record[:STRAND] = fields[5] + record[:REC_TYPE] = "USEARCH" + record[:Q_ID] = fields[0] + record[:S_ID] = fields[1] + record[:IDENT] = fields[2].to_f + record[:ALIGN_LEN] = fields[3].to_i + record[:MISMATCHES] = fields[4].to_i + record[:GAPS] = fields[5].to_i + record[:Q_BEG] = fields[6].to_i - 1 + record[:Q_END] = fields[7].to_i - 1 + record[:S_BEG] = fields[8].to_i - 1 + record[:S_END] = fields[9].to_i - 1 + record[:E_VAL] = fields[10] == '*' ? '*' : fields[10].to_f + record[:SCORE] = fields[11] == '*' ? '*' : fields[11].to_f + record[:STRAND] = record[:S_BEG].to_i < record[:S_END].to_i ? '+' : '-' + + record[:S_BEG], record[:S_END] = record[:S_END], record[:S_BEG] if record[:STRAND] == '-' yield record end @@ -188,6 +236,8 @@ class Usearch yield Align.new(entries) unless entries.empty? end + + self # conventionally end private @@ -209,7 +259,6 @@ class Usearch # Method to execute a command using a system() call. # The command is composed of bits from the @command variable. def execute - @command.unshift "nice -n 19" @command << "--quiet" unless @options[:verbose] command = @command.join(" ") $stderr.puts "Running command: #{command}" if @options[:verbose]