bp_bin/uclust_seq

   1 #!/usr/bin/env ruby
   2
   3 # Copyright (C) 2007-2011 Martin A. Hansen.
   4
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; either version 2
   8 # of the License, or (at your option) any later version.
   9
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18
  19 # http://www.gnu.org/copyleft/gpl.html
  20
  21 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  22
  23 # This program is part of the Biopieces framework (www.biopieces.org).
  24
  25 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  26
  27 # Run Uclust on sequences in the stream.
  28
  29 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  30
  31 require 'maasha/biopieces'
  32 require 'maasha/fasta'
  33
  34 SORT_LIMIT = 2_000_000_000   # Size threshold in bytes (2 GB): files at or above it are sorted with usearch --mergesort instead of --sort.
  35
  36 class Uclust
  37   include Enumerable
  38
  39   def initialize(infile, outfile, options)
  40     @infile  = infile
  41     @outfile = outfile
  42     @options = options
  43     @command = []
  44   end
  45
  46   # Method that calls Usearch sorting for sorting a FASTA file
  47   # according to decending sequence length.
  48   def sort
  49     # usearch -sort seqs.fasta -output seqs.sorted.fasta
  50     if File.size(@infile) < SORT_LIMIT
  51       @command << "usearch --sort #{@infile} --output #{@infile}.sort"
  52     else
  53       @command << "usearch --mergesort #{@infile} --output #{@infile}.sort"
  54     end
  55
  56                 execute
  57
  58     File.rename "#{@infile}.sort", @infile
  59   end
  60
  61         # Method to execute clustering de novo.
  62   def cluster
  63     @command << "usearch --cluster #{@infile} --uc #{@outfile} --id #{@options[:identity]}"
  64
  65                 execute
  66   end
  67
  68   # Method to execute database search.
  69   def usearch
  70     # usearch --query query.fasta --db db.fasta --uc results.uc --id 0.90 [--evalue E]
  71     @command << "usearch --query #{@infile} --db #{@options[:database]} --uc #{@outfile} --id #{@options[:identity]}"
  72     @command << "--evalue #{@options[:e_val]}" if @options.has_key? :e_val
  73
  74                 execute
  75   end
  76
  77         # Method to execute clustering to database plus de novo if not matched.
  78   def usearch_uclust
  79     # usearch --cluster seqs_sorted.fasta --db db.fasta --uc results.uc --id 0.90
  80     @command << "usearch --cluster #{@infile} --db #{@options[:database]} --uc #{@outfile} --id #{@options[:identity]}"
  81
  82                 execute
  83   end
  84
  85         # Method to parse a Uclust .uc file and for each line of data
  86         # yield a Biopiece record.
  87   def each
  88     record = {}
  89
  90     File.open(@outfile, mode="r") do |ios|
  91       ios.each_line do |line|
  92         if line !~ /^#/
  93           fields = line.chomp.split("\t")
  94
  95           record[:REC_TYPE] = "UCLUST"
  96           record[:TYPE]     = fields[0]
  97           record[:CLUSTER]  = fields[1].to_i
  98           record[:SEQ_LEN]  = fields[2].to_i
  99           record[:IDENT]    = fields[3].to_f
 100           record[:STRAND]   = fields[4]
 101           record[:Q_BEG]    = fields[5].to_i
 102           record[:S_BEG]    = fields[6].to_i
 103           record[:S_END]    = fields[6].to_i + fields[2].to_i
 104           record[:CIGAR]    = fields[7]
 105           record[:Q_ID]     = fields[8]
 106           record[:S_ID]     = fields[9]
 107
 108           yield record
 109         end
 110       end
 111     end
 112
 113     self # conventionally
 114   end
 115
 116   private
 117
 118         # Method to execute a command using a system() call.
 119         # The command is composed of bits from the @command variable.
 120         def execute
 121                 @command.unshift "nice -n 19"
 122     @command << "--rev" if @options[:comp]
 123                 @command << "> /dev/null 2>&1" unless @options[:verbose]
 124                 command = @command.join(" ")
 125     system(command)
 126     raise "Command failed: #{command}" unless $?.success?
 127
 128                 @command = []
 129         end
 130 end
 131
 132 ok_methods = "uclust,usearch,usearch_uclust"
 133
 134 casts = []
 135 casts << {:long=>'no_sort',  :short=>'n', :type=>'flag',   :mandatory=>false, :default=>nil,      :allowed=>nil,        :disallowed=>nil}
 136 casts << {:long=>'method',   :short=>'m', :type=>'string', :mandatory=>true,  :default=>"uclust", :allowed=>ok_methods, :disallowed=>nil}
 137 casts << {:long=>'database', :short=>'d', :type=>'file!',  :mandatory=>false, :default=>nil,      :allowed=>nil,        :disallowed=>nil}
 138 casts << {:long=>'comp',     :short=>'c', :type=>'flag',   :mandatory=>false, :default=>nil,      :allowed=>nil,        :disallowed=>nil}
 139 casts << {:long=>'identity', :short=>'i', :type=>'float',  :mandatory=>true,  :default=>0.9,      :allowed=>nil,        :disallowed=>nil}
 140 casts << {:long=>'e_val',    :short=>'e', :type=>'float',  :mandatory=>false, :default=>nil,      :allowed=>nil,        :disallowed=>nil}
 141
 142 options = Biopieces.options_parse(ARGV, casts)
 143
 144 # At high identities, around 96% and above, compressed indexes are often more sensitive, faster
 145 # and use less RAM. Compressed indexes are disabled by default, so I generally recommend that
 146 # you specify the --slots and --w options when clustering at high identities.
 147
 148 tmpdir  = Biopieces.mktmpdir
 149 infile  = File.join(tmpdir, "in.fna")
 150 outfile = File.join(tmpdir, "out.uc")
 151
 152 Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output|
 153   Fasta.open(infile, mode="w") do |fasta_io|
 154     input.each_record do |record|
 155       output.puts record
 156
 157       if record.has_key? :SEQ_NAME and record.has_key? :SEQ
 158         fasta_io.puts Seq.new_bp(record).to_fasta
 159       end
 160     end
 161   end
 162
 163   uclust = Uclust.new(infile, outfile, options)
 164   uclust.sort unless options[:no_sort] or options[:method] == "usearch"
 165
 166   case options[:method].to_s
 167   when "uclust"         then uclust.cluster
 168   when "usearch"        then uclust.usearch
 169   when "usearch_uclust" then uclust.usearch_uclust
 170   end
 171
 172   uclust.each do |record|
 173     output.puts record
 174   end
 175 end
 176
 177
 178 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 179
 180
 181 __END__