]> git.donarmstrong.com Git - biopieces.git/commitdiff
fixed typo in max_vals
authormartinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Thu, 6 Feb 2014 10:33:20 +0000 (10:33 +0000)
committermartinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Thu, 6 Feb 2014 10:33:20 +0000 (10:33 +0000)
git-svn-id: http://biopieces.googlecode.com/svn/trunk@2290 74ccb610-7750-0410-82ae-013aeee3265d

bp_bin/denoise_seq
bp_bin/max_vals

index baa62ad7eb6102e724f7ad7b02d842ceb9d2d2d3..b0c535a20cb178fed7158ffe5a50ed0b0e4e6775 100755 (executable)
@@ -36,14 +36,159 @@ require 'maasha/fasta'
 require 'maasha/align'
 require 'maasha/usearch'
 
+class Seq
+  ALPH_DNA  = %w{A T C G}
+  ALPH_AMBI = %w{A T C G M R W S Y K V H D B N}
+
+  BIT_INDEL = 0
+  BIT_A = 1 << 0
+  BIT_T = 1 << 1
+  BIT_C = 1 << 2
+  BIT_G = 1 << 3
+
+  BIT_M = BIT_A | BIT_C
+  BIT_R = BIT_A | BIT_G
+  BIT_W = BIT_A | BIT_T
+  BIT_S = BIT_C | BIT_G
+  BIT_Y = BIT_C | BIT_T
+  BIT_K = BIT_G | BIT_T
+  BIT_V = BIT_A | BIT_C | BIT_G
+  BIT_H = BIT_A | BIT_C | BIT_T
+  BIT_D = BIT_A | BIT_G | BIT_T
+  BIT_B = BIT_C | BIT_G | BIT_T
+  BIT_N = BIT_G | BIT_A | BIT_T | BIT_C
+
+  BITMAP = [
+    BIT_A,
+    BIT_T,
+    BIT_C,
+    BIT_G,
+    BIT_M,
+    BIT_R,
+    BIT_W,
+    BIT_S,
+    BIT_Y,
+    BIT_K,
+    BIT_V,
+    BIT_H,
+    BIT_D,
+    BIT_B,
+    BIT_N
+  ]
+
+  TR_NUC = "-" + ALPH_AMBI.join("").downcase
+  TR_HEX = [BIT_INDEL].pack("C") + BITMAP.pack("C*")
+
+  def to_na
+    entry      = Seq.new
+    entry.seq  = NArray.to_na(self.seq.downcase.tr(TR_NUC, TR_HEX), "byte")
+    entry.qual = NArray.to_na(self.qual, "byte") - Seq::SCORE_BASE if self.qual
+    entry
+  end
+end
+
+class Align
+  def to_na
+    cols = self.length
+    rows = self.members
+
+    na_seq  = NArray.byte(cols, rows)
+    na_qual = NArray.byte(cols, rows)
+
+    self.entries.each_with_index do |entry, i|
+      na_entry = entry.to_na
+      na_seq[true, i]  = na_entry.seq
+      na_qual[true, i] = na_entry.qual
+    end
+
+    return na_seq, na_qual
+  end
+end
+
+class Denoise
+  attr_reader :align
+
+  ROW_A = 0
+  ROW_T = 1
+  ROW_C = 2
+  ROW_G = 3
+
+  def initialize(align, options)
+    @align   = align
+    @options = options
+    @cols    = align.length
+    @rows    = align.members
+    @na_seq, @na_qual = align.to_na
+    @na_rescored = nil
+  end
+
+  def denoise_sequences
+    freq = NArray.int(@cols, 4)
+
+    freq[true, ROW_A] = (@na_seq & Seq::BIT_A > 0).to_type("int").sum(1)
+    freq[true, ROW_T] = (@na_seq & Seq::BIT_T > 0).to_type("int").sum(1)
+    freq[true, ROW_C] = (@na_seq & Seq::BIT_C > 0).to_type("int").sum(1)
+    freq[true, ROW_G] = (@na_seq & Seq::BIT_G > 0).to_type("int").sum(1)
+
+    mask_freq = freq.eq freq.max(1)
+
+    mask_freq[true, ROW_A] *= Seq::BIT_A
+    mask_freq[true, ROW_T] *= Seq::BIT_T
+    mask_freq[true, ROW_C] *= Seq::BIT_C
+    mask_freq[true, ROW_G] *= Seq::BIT_G
+
+    mask_replace = mask_freq.max(1)
+
+    mask_bad = @na_qual <= @options[:quality_min]
+
+    @na_rescored = mask_bad.dup
+
+    new_values = mask_replace * mask_bad
+    old_values = mask_bad.eq(0) * @na_seq
+
+    old_scores = @na_qual * mask_bad.eq(0)
+
+    sum   = old_scores.to_type("float").sum(1)
+    count = (old_scores > 0).to_type("int").sum(1)
+    mean  = (sum / count).to_type("byte")
+
+    new_scores = mask_bad * mean
+
+    @na_seq  = new_values + old_values
+    @na_qual = new_scores + old_scores
+
+    self
+  end
+
+  # Method that lowercase residues that have been removed during
+  # the determination of the consensus sequence.
+  def mask_sequences
+    @align.each_with_index do |entry, i|
+      entry.qual = (@na_qual[true, i] + Seq::SCORE_BASE).to_s
+
+      j = 0
+
+      while entry.seq[j] do
+        if @na_rescored[j, i] == 0
+          entry.seq[j] = entry.seq[j].upcase
+        else
+          entry.seq[j] = entry.seq[j].downcase
+        end
+
+        j += 1
+      end
+    end
+  end
+end
+
 casts = []
 casts << {:long=>'cluster_ident', :short=>'i', :type=>'float', :mandatory=>true,  :default=>0.97, :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'cluster_min',   :short=>'c', :type=>'uint',  :mandatory=>true,  :default=>1,    :allowed=>nil, :disallowed=>"0"}
 casts << {:long=>'sequence_min',  :short=>'s', :type=>'uint',  :mandatory=>true,  :default=>1,    :allowed=>nil, :disallowed=>"0"}
 casts << {:long=>'residue_min',   :short=>'r', :type=>'float', :mandatory=>true,  :default=>0.3,  :allowed=>nil, :disallowed=>nil}
 casts << {:long=>'gap_max',       :short=>'g', :type=>'float', :mandatory=>true,  :default=>0.4,  :allowed=>nil, :disallowed=>nil}
 casts << {:long=>'quality_min',   :short=>'q', :type=>'uint',  :mandatory=>true,  :default=>10,   :allowed=>nil, :disallowed=>nil}
 casts << {:long=>'quality_mean',  :short=>'Q', :type=>'uint',  :mandatory=>true,  :default=>15,   :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'cpus',          :short=>'C', :type=>'uint',  :mandatory=>true,  :default=>1,    :allowed=>nil, :disallowed=>"0"}
 
 options          = Biopieces.options_parse(ARGV, casts)
 tmpdir           = Biopieces.mktmpdir
@@ -52,15 +197,18 @@ fasta_file       = File.join(tmpdir, "test.fna")
 fasta_file_align = File.join(tmpdir, "test.aln.fna")
 
 options[:identity] = options[:cluster_ident]
+options[:msa]      = true
 
 def alignment_to_fastq(entries, index)
   entries.each do |entry|
-    cluster, ident, name = entry.seq_name.split('|')
+    seq_name = entry.seq_name.sub(/^\*/, '')
+    elem     = index.get(seq_name)   # disk based lookup
 
-    entry.qual = index.get(name).qual # disk based lookup
+    entry.seq_name = elem.seq_name
+    entry.qual     = elem.qual
 
     entry.seq.scan(/-+/) do |m|
-      entry.qual = entry.qual[0 ... $`.length] + ('@' * m.length) + entry.qual[$`.length .. -1]
+      entry.qual = entry.qual[0 ... $`.length] + ('!' * m.length) + entry.qual[$`.length .. -1]
     end
   end
 end
@@ -74,12 +222,13 @@ Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output|
       input.each_record do |record|
         if record[:SEQ] and record[:SCORES]
           entry = Seq.new_bp(record)
+          orig_name = entry.seq_name.dup
           entry.seq_name = seq_count.to_s
 
           fasta_io.puts entry.to_fasta
           fastq_io.puts entry.to_fastq
 
-          index.add(entry)
+          index.add(entry, orig_name)
 
           seq_count += 1
         else
@@ -92,38 +241,27 @@ Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output|
   fastq_io  = File.open(fastq_file, "r")
   index.ios = fastq_io
 
-  tot_clusters = 0
-  tot_entries  = 0
-
   uc = Usearch.new(fasta_file, fasta_file_align, options)
-  uc.sort_length
-  uc.cluster
-  uc.ustar
+  uc.sortbylength
+  uc.cluster_smallmem
 
   uc.each_alignment do |align|
-    if align.members >= options[:cluster_min]
-      align.options = options
-
-      alignment_to_fastq(align.entries, index)
-
-      cons          = align.consensus
-      cons.seq_name = "#{tot_clusters}_#{align.members}"
-      cons.indels_remove
-
-      new_record = cons.to_bp
-      new_record[:CLUSTER]       = tot_clusters
-      new_record[:CLUSTER_COUNT] = align.members
-
-      puts align if options[:verbose]
-
-      output.puts new_record
-
-      tot_clusters += 1
+    align.options = options
+    alignment_to_fastq(align.entries, index)
+
+    dn = Denoise.new(align, options)
+    dn.denoise_sequences
+    dn.mask_sequences
+
+    if options[:verbose]
+      puts dn.align
+    else
+      dn.align.each do |entry|
+        output.puts entry.to_bp
+      end
     end
   end
 end
 
 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 
-
-__END__
index 52ede7c4fd16f18c5e817d37f0c9e17602dc398e..f45b3cac2817941b55d79c9292e50cdde7d41016 100755 (executable)
@@ -85,7 +85,7 @@ foreach $key ( @{ $options->{ "keys" } } ) {
 
 if ( $options->{ "keys" } and $new_record )
 {
-    $new_record->{ 'REC_TYPE' } = "MIN";
+    $new_record->{ 'REC_TYPE' } = "MAX";
 
     Maasha::Biopieces::put_record( $new_record, $fh );
 }