From: martinahansen Date: Fri, 3 Feb 2012 08:05:58 +0000 (+0000) Subject: added denoise_seq X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=844087144441278cb50b8d88246417648c02e239;p=biopieces.git added denoise_seq git-svn-id: http://biopieces.googlecode.com/svn/trunk@1743 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/bp_bin/denoise_seq b/bp_bin/denoise_seq new file mode 100755 index 0000000..ff557b4 --- /dev/null +++ b/bp_bin/denoise_seq @@ -0,0 +1,120 @@ +#!/usr/bin/env ruby + +# Copyright (C) 2007-2012 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# This program is part of the Biopieces framework (www.biopieces.org). + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# Denoises sequences with quality scores in the stream. + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +require 'pp' +require 'maasha/biopieces' +require 'maasha/seq' +require 'maasha/fastq' +require 'maasha/fasta' +require 'maasha/align' +require 'maasha/usearch' + +casts = [] +casts << {:long=>'identity', :short=>'i', :type=>'float', :mandatory=>true, :default=>0.97, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'cluster_min', :short=>'c', :type=>'uint', :mandatory=>true, :default=>2, :allowed=>nil, :disallowed=>"0"} + +options = Biopieces.options_parse(ARGV, casts) +tmpdir = Biopieces.mktmpdir +fastq_file = File.join(tmpdir, "test.fq") +fasta_file = File.join(tmpdir, "test.fna") +fasta_file_align = File.join(tmpdir, "test.aln.fna") + +def alignment_to_fastq(entries, index) + entries.each do |entry| + cluster, ident, name = entry.seq_name.split('|') + + entry.qual = index.get(name).qual # disk based lookup + + entry.seq.scan(/-+/) do |m| + entry.qual = entry.qual[0 ... $`.length] + ('@' * m.length) + entry.qual[$`.length .. -1] + end + end +end + +index = FastqIndex.new +seq_count = 0 + +Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| + Fasta.open(fasta_file, "w") do |fasta_io| + Fastq.open(fastq_file, "w") do |fastq_io| + input.each_record do |record| + if record.has_key? :SEQ and record.has_key? :SCORES + entry = Seq.new_bp(record) + entry.seq_name = seq_count.to_s + + fasta_io.puts entry.to_fasta + fastq_io.puts entry.to_fastq + + index.add(entry) + + seq_count += 1 + else + output.puts record + end + end + end + end + + fastq_io = File.open(fastq_file, "r") + index.ios = fastq_io + + tot_clusters = 0 + tot_entries = 0 + + uc = Usearch.new(fasta_file, fasta_file_align, options) + uc.sort_length + uc.cluster + uc.ustar + + uc.each_alignment do |align| + if align.members >= options[:cluster_min] + alignment_to_fastq(align.entries, index) + + cons = align.consensus + cons.seq_name = "#{tot_clusters}_#{align.members}" + cons.indels_remove + + new_record = cons.to_bp + new_record[:CLUSTER] = tot_clusters + new_record[:CLUSTER_COUNT] = align.members + + puts align if options[:verbose] + + output.puts new_record + + tot_clusters += 1 + end + end +end + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +__END__