]> git.donarmstrong.com Git - biopieces.git/blobdiff - code_ruby/lib/maasha/usearch.rb
fixing usearch upgrade
[biopieces.git] / code_ruby / lib / maasha / usearch.rb
index b0c8c2f8ce69f63234b87e9da7d8f1b71ba3acad..63757911deebea8e62de71ea688e29311d5427b8 100644 (file)
@@ -27,8 +27,6 @@ require 'maasha/fasta'
 # Error class for all exceptions to do with Usearch.
 class UsearchError < StandardError; end
 
-SORT_LIMIT = 2_000_000_000   # use mergesort for files biggern than 2Gb.
-
 class Usearch
   include Enumerable
 
@@ -37,17 +35,17 @@ class Usearch
     @outfile = outfile
     @options = options
     @command = []
+
+    raise UsearchError, %{Empty input file -> "#{@infile}"} if File.size(@infile) == 0
   end
 
   # Method that calls Usearch sorting for sorting a FASTA file
   # according to decending sequence length.
-  def sort_length
-    # usearch -sort seqs.fasta -output seqs.sorted.fasta
-    if File.size(@infile) < SORT_LIMIT
-      @command << "usearch --sort #{@infile} --output #{@infile}.sort"
-    else
-      @command << "usearch --mergesort #{@infile} --output #{@infile}.sort"
-    end
+  def sortbylength
+    # usearch -sortbylength seqs.fasta -output seqs_sorted.fasta -minseqlength 64
+    @command << "usearch"
+    @command << "-sortbylength #{@infile}"
+    @command << "-output #{@infile}.sort"
 
                execute
 
@@ -56,31 +54,102 @@ class Usearch
 
   # Method that calls Usearch sorting for sorting a FASTA file
   # according to cluster size.
-  def sort_size
-    # usearch -sortsize seqs.fasta -output seqs.sorted.fasta
-    @command << "usearch --sortsize #{@infile} --output #{@infile}.sort"
+  def sortbysize
+    # usearch -sortbysize seqs.fasta -output seqs_sorted.fasta -minsize 4
+    @command << "usearch"
+    @command << "-sortbysize #{@infile}"
+    @command << "-output #{@infile}.sort"
 
                execute
 
     File.rename "#{@infile}.sort", @infile
   end
 
-       # Method to execute clustering de novo.
-  def cluster
-    @command << "usearch --cluster #{@infile} --uc #{@outfile} --id #{@options[:identity]}"
+  # Method to execute cluster_fast.
+  def cluster_fast
+    # usearch -cluster_fast query.fasta -id 0.9 -centroids nr.fasta -uc clusters.uc
+    @command << "usearch"
+    @command << "-cluster_fast #{@infile}"
+    @command << "-id #{@options[:identity]}"
+    @command << "-uc #{@outfile}"
+    @command << "-threads #{@options[:cpus]}"
 
-               execute
+    execute
+  end
+
+  # Method to execute cluster_smallmem.
+  # NB sequences must be sorted with sortbylength or sortbysize.
+  def cluster_smallmem
+    # usearch -cluster_smallmem query.fasta -id 0.9 -centroids nr.fasta -uc clusters.uc
+    @command << "usearch"
+    @command << "-cluster_smallmem #{@infile}"
+    @command << "-id #{@options[:identity]}"
+    @command << "-uc #{@outfile}"
+    @command << "-strand both" if @options[:comp]
+
+    execute
   end
 
-  # Method to execute database search.
-  def usearch
-    @command << "usearch --query #{@infile} --db #{@options[:database]} --userout #{@outfile}"
-    @command << "--userfields target+tloz+thiz+query+bits+strand"
-    @command << "--id #{@options[:identity]}"  if @options.has_key? :identity
-    @command << "--evalue #{@options[:e_val]}" if @options.has_key? :e_val
-    @command << "--rev"
+  # Method to execute database local search.
+  def usearch_local
+    # usearch -usearch_local query.fasta -db proteins.udb -id 0.8 -alnout results.aln
+    # usearch -usearch_local query.fasta -db ESTs.fasta -id 0.9 -evalue 1e-6 -blast6out results.m8 -strand plus -maxaccepts 8 -maxrejects 256
+    
+    @command << "usearch"
+    @command << "-usearch_local #{@infile}"
+    @command << "-db #{@options[:database]}"
+    @command << "-blast6out #{@outfile}"
+    @command << "-strand both"
+    @command << "-id #{@options[:identity]}"  if @options[:identity]
+    @command << "-evalue #{@options[:e_val]}" if @options[:e_val]
+    @command << "-maxaccepts #{@options[:maxaccepts]}"
+    @command << "-threads #{@options[:cpus]}"
+
+    execute
+  end
 
-               execute
+  # Method to execute database global search.
+  def usearch_global
+    # usearch -usearch_global query.fasta -db proteins.udb -id 0.8 -alnout results.aln
+    # usearch -usearch_global query.fasta -db ESTs.fasta -id 0.9 -blast6out results.m8 -strand plus -maxaccepts 8 -maxrejects 256
+
+    @command << "usearch"
+    @command << "-usearch_global #{@infile}"
+    @command << "-db #{@options[:database]}"
+    @command << "-blast6out #{@outfile}"
+    @command << "-strand both"
+    @command << "-id #{@options[:identity]}"
+    @command << "-evalue #{@options[:e_val]}" if @options[:e_val]
+    @command << "-maxaccepts #{@options[:maxaccepts]}"
+    @command << "-threads #{@options[:cpus]}"
+
+    execute
+  end
+
+  # Method to execute uchime chimera detection.
+  def uchime
+    @command << "usearch --uchime #{@infile} --db #{@options[:database]} --uchimeout #{@outfile}"
+
+    execute
+  end
+
+  # Method to execute ustar alignment.
+  def ustar
+    command = %Q{grep "^[SH]" #{@outfile} > #{@outfile}.sub}
+    system(command)
+    raise "Command failed: #{command}" unless $?.success?
+
+    File.rename "#{@outfile}.sub", @outfile
+
+    @command << "usearch --uc2fastax #{@outfile} --input #{@infile} --output #{@infile}.sub"
+
+    execute
+
+    @command << "usearch --staralign #{@infile}.sub --output #{@outfile}"
+
+    execute
+
+    File.delete "#{@infile}.sub"
   end
 
        # Method to parse a Uclust .uc file and for each line of data
@@ -88,7 +157,7 @@ class Usearch
   def each_cluster
     record = {}
 
-    File.open(@outfile, mode="r") do |ios|
+    File.open(@outfile, "r") do |ios|
       ios.each_line do |line|
         if line !~ /^#/
           fields = line.chomp.split("\t")
@@ -113,18 +182,25 @@ class Usearch
   def each_hit
     record = {}
 
-    File.open(@outfile, mode="r") do |ios|
-      ios.gets   # skip comment line
+    File.open(@outfile, "r") do |ios|
       ios.each_line do |line|
         fields = line.chomp.split("\t")
-
-        record[:REC_TYPE] = "USEARCH"
-        record[:S_ID]     = fields[0]
-        record[:S_BEG]    = fields[1].to_i
-        record[:S_END]    = fields[2].to_i
-        record[:Q_ID]     = fields[3]
-        record[:SCORE]    = fields[4].to_f
-        record[:STRAND]   = fields[5]
+        record[:REC_TYPE]   = "USEARCH"
+        record[:Q_ID]       = fields[0]
+        record[:S_ID]       = fields[1]
+        record[:IDENT]      = fields[2].to_f
+        record[:ALIGN_LEN]  = fields[3].to_i
+        record[:MISMATCHES] = fields[4].to_i
+        record[:GAPS]       = fields[5].to_i
+        record[:Q_BEG]      = fields[6].to_i - 1
+        record[:Q_END]      = fields[7].to_i - 1
+        record[:S_BEG]      = fields[8].to_i - 1
+        record[:S_END]      = fields[9].to_i - 1
+        record[:E_VAL]      = fields[10] == '*' ? '*' : fields[10].to_f
+        record[:SCORE]      = fields[11] == '*' ? '*' : fields[11].to_f
+        record[:STRAND]     = record[:S_BEG].to_i < record[:S_END].to_i ? '+' : '-'
+
+        record[:S_BEG], record[:S_END] = record[:S_END], record[:S_BEG] if record[:STRAND] == '-'
 
         yield record
       end
@@ -133,15 +209,59 @@ class Usearch
     self # conventionally
   end
 
+  # Method to parse a FASTA file with Ustar alignments and for each alignment
+  # yield an Align object.
+  def each_alignment
+    old_cluster = 0
+    entries     = []
+
+    Fasta.open(@outfile, "r") do |ios|
+      ios.each do |entry|
+        entry.seq.tr!('.', '-')   # Replace . with - in Ustar alignments.
+        cluster, identity, name = entry.seq_name.split('|')
+        cluster = cluster.to_i
+
+        if cluster == old_cluster
+          entries << entry
+        else
+          fix_alignment(entries)
+
+          yield Align.new(entries)
+
+          old_cluster = cluster
+          entries     = []
+          entries << entry
+        end
+      end
+
+      yield Align.new(entries) unless entries.empty?
+    end
+
+    self # conventionally
+  end
+
   private
 
+  # Method that fixed Ustar bug resulting in alignments with uneven 
+  # sequence length.
+  def fix_alignment(entries)
+    if entries.size > 1
+      min, max = entries.minmax { |a, b| a.length <=> b.length } 
+
+      if min.length != max.length
+        entries.each do |entry|
+          entry.seq << '-' * (max.length - entry.length)
+        end
+      end
+    end
+  end
+
        # Method to execute a command using a system() call.
        # The command is composed of bits from the @command variable.
        def execute
-               @command.unshift "nice -n 19"
-    @command << "--rev" if @options[:comp]
-               @command << "> /dev/null 2>&1" unless @options[:verbose]
+               @command << "--quiet" unless @options[:verbose]
                command = @command.join(" ")
+    $stderr.puts "Running command: #{command}" if @options[:verbose]
     system(command)
     raise "Command failed: #{command}" unless $?.success?