From: martinahansen Date: Thu, 6 Feb 2014 10:33:20 +0000 (+0000) Subject: fixed typo in max_vals X-Git-Url: https://git.donarmstrong.com/?p=biopieces.git;a=commitdiff_plain;h=da9fc300ec207fce5d21e883781dab839b378fc7 fixed typo in max_vals git-svn-id: http://biopieces.googlecode.com/svn/trunk@2290 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/bp_bin/denoise_seq b/bp_bin/denoise_seq index baa62ad..b0c535a 100755 --- a/bp_bin/denoise_seq +++ b/bp_bin/denoise_seq @@ -36,14 +36,159 @@ require 'maasha/fasta' require 'maasha/align' require 'maasha/usearch' +class Seq + ALPH_DNA = %w{A T C G} + ALPH_AMBI = %w{A T C G M R W S Y K V H D B N} + + BIT_INDEL = 0 + BIT_A = 1 << 0 + BIT_T = 1 << 1 + BIT_C = 1 << 2 + BIT_G = 1 << 3 + + BIT_M = BIT_A | BIT_C + BIT_R = BIT_A | BIT_G + BIT_W = BIT_A | BIT_T + BIT_S = BIT_C | BIT_G + BIT_Y = BIT_C | BIT_T + BIT_K = BIT_G | BIT_T + BIT_V = BIT_A | BIT_C | BIT_G + BIT_H = BIT_A | BIT_C | BIT_T + BIT_D = BIT_A | BIT_G | BIT_T + BIT_B = BIT_C | BIT_G | BIT_T + BIT_N = BIT_G | BIT_A | BIT_T | BIT_C + + BITMAP = [ + BIT_A, + BIT_T, + BIT_C, + BIT_G, + BIT_M, + BIT_R, + BIT_W, + BIT_S, + BIT_Y, + BIT_K, + BIT_V, + BIT_H, + BIT_D, + BIT_B, + BIT_N + ] + + TR_NUC = "-" + ALPH_AMBI.join("").downcase + TR_HEX = [BIT_INDEL].pack("C") + BITMAP.pack("C*") + + def to_na + entry = Seq.new + entry.seq = NArray.to_na(self.seq.downcase.tr(TR_NUC, TR_HEX), "byte") + entry.qual = NArray.to_na(self.qual, "byte") - Seq::SCORE_BASE if self.qual + entry + end +end + +class Align + def to_na + cols = self.length + rows = self.members + + na_seq = NArray.byte(cols, rows) + na_qual = NArray.byte(cols, rows) + + self.entries.each_with_index do |entry, i| + na_entry = entry.to_na + na_seq[true, i] = na_entry.seq + na_qual[true, i] = na_entry.qual + end + + return na_seq, na_qual + end +end + +class Denoise + attr_reader :align + + ROW_A = 0 + ROW_T = 1 + ROW_C = 2 + ROW_G = 3 + + def initialize(align, options) + @align = align + @options = options + @cols = align.length + @rows = align.members + @na_seq, @na_qual = align.to_na + @na_rescored = nil + end + + def denoise_sequences + freq = NArray.int(@cols, 4) + + freq[true, ROW_A] = (@na_seq & Seq::BIT_A > 0).to_type("int").sum(1) + freq[true, ROW_T] = (@na_seq & Seq::BIT_T > 0).to_type("int").sum(1) + freq[true, ROW_C] = (@na_seq & Seq::BIT_C > 0).to_type("int").sum(1) + freq[true, ROW_G] = (@na_seq & Seq::BIT_G > 0).to_type("int").sum(1) + + mask_freq = freq.eq freq.max(1) + + mask_freq[true, ROW_A] *= Seq::BIT_A + mask_freq[true, ROW_T] *= Seq::BIT_T + mask_freq[true, ROW_C] *= Seq::BIT_C + mask_freq[true, ROW_G] *= Seq::BIT_G + + mask_replace = mask_freq.max(1) + + mask_bad = @na_qual <= @options[:quality_min] + + @na_rescored = mask_bad.dup + + new_values = mask_replace * mask_bad + old_values = mask_bad.eq(0) * @na_seq + + old_scores = @na_qual * mask_bad.eq(0) + + sum = old_scores.to_type("float").sum(1) + count = (old_scores > 0).to_type("int").sum(1) + mean = (sum / count).to_type("byte") + + new_scores = mask_bad * mean + + @na_seq = new_values + old_values + @na_qual = new_scores + old_scores + + self + end + + # Method that lowercase residues that have been removed during + # the determination of the consensus sequence. + def mask_sequences + @align.each_with_index do |entry, i| + entry.qual = (@na_qual[true, i] + Seq::SCORE_BASE).to_s + + j = 0 + + while entry.seq[j] do + if @na_rescored[j, i] == 0 + entry.seq[j] = entry.seq[j].upcase + else + entry.seq[j] = entry.seq[j].downcase + end + + j += 1 + end + end + end +end + casts = [] casts << {:long=>'cluster_ident', :short=>'i', :type=>'float', :mandatory=>true, :default=>0.97, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'cluster_min', :short=>'c', :type=>'uint', :mandatory=>true, :default=>1, :allowed=>nil, :disallowed=>"0"} casts << {:long=>'sequence_min', :short=>'s', :type=>'uint', :mandatory=>true, :default=>1, :allowed=>nil, :disallowed=>"0"} casts << {:long=>'residue_min', :short=>'r', :type=>'float', :mandatory=>true, :default=>0.3, :allowed=>nil, :disallowed=>nil} casts << {:long=>'gap_max', :short=>'g', :type=>'float', :mandatory=>true, :default=>0.4, :allowed=>nil, :disallowed=>nil} casts << {:long=>'quality_min', :short=>'q', :type=>'uint', :mandatory=>true, :default=>10, :allowed=>nil, :disallowed=>nil} casts << {:long=>'quality_mean', :short=>'Q', :type=>'uint', :mandatory=>true, :default=>15, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'cpus', :short=>'C', :type=>'uint', :mandatory=>true, :default=>1, :allowed=>nil, :disallowed=>"0"} options = Biopieces.options_parse(ARGV, casts) tmpdir = Biopieces.mktmpdir @@ -52,15 +197,18 @@ fasta_file = File.join(tmpdir, "test.fna") fasta_file_align = File.join(tmpdir, "test.aln.fna") options[:identity] = options[:cluster_ident] +options[:msa] = true def alignment_to_fastq(entries, index) entries.each do |entry| - cluster, ident, name = entry.seq_name.split('|') + seq_name = entry.seq_name.sub(/^\*/, '') + elem = index.get(seq_name) # disk based lookup - entry.qual = index.get(name).qual # disk based lookup + entry.seq_name = elem.seq_name + entry.qual = elem.qual entry.seq.scan(/-+/) do |m| - entry.qual = entry.qual[0 ... $`.length] + ('@' * m.length) + entry.qual[$`.length .. -1] + entry.qual = entry.qual[0 ... $`.length] + ('!' * m.length) + entry.qual[$`.length .. -1] end end end @@ -74,12 +222,13 @@ Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| input.each_record do |record| if record[:SEQ] and record[:SCORES] entry = Seq.new_bp(record) + orig_name = entry.seq_name.dup entry.seq_name = seq_count.to_s fasta_io.puts entry.to_fasta fastq_io.puts entry.to_fastq - index.add(entry) + index.add(entry, orig_name) seq_count += 1 else @@ -92,38 +241,27 @@ Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| fastq_io = File.open(fastq_file, "r") index.ios = fastq_io - tot_clusters = 0 - tot_entries = 0 - uc = Usearch.new(fasta_file, fasta_file_align, options) - uc.sort_length - uc.cluster - uc.ustar + uc.sortbylength + uc.cluster_smallmem uc.each_alignment do |align| - if align.members >= options[:cluster_min] - align.options = options - - alignment_to_fastq(align.entries, index) - - cons = align.consensus - cons.seq_name = "#{tot_clusters}_#{align.members}" - cons.indels_remove - - new_record = cons.to_bp - new_record[:CLUSTER] = tot_clusters - new_record[:CLUSTER_COUNT] = align.members - - puts align if options[:verbose] - - output.puts new_record - - tot_clusters += 1 + align.options = options + alignment_to_fastq(align.entries, index) + + dn = Denoise.new(align, options) + dn.denoise_sequences + dn.mask_sequences + + if options[:verbose] + puts dn.align + else + dn.align.each do |entry| + output.puts entry.to_bp + end end end end # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - -__END__ diff --git a/bp_bin/max_vals b/bp_bin/max_vals index 52ede7c..f45b3ca 100755 --- a/bp_bin/max_vals +++ b/bp_bin/max_vals @@ -85,7 +85,7 @@ foreach $key ( @{ $options->{ "keys" } } ) { if ( $options->{ "keys" } and $new_record ) { - $new_record->{ 'REC_TYPE' } = "MIN"; + $new_record->{ 'REC_TYPE' } = "MAX"; Maasha::Biopieces::put_record( $new_record, $fh ); }