From: martinahansen
Date: Tue, 10 Apr 2012 19:04:36 +0000 (+0000)
Subject: upgraded read_fastq to handle illumina1.8 better
X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=c50a8848af62038cbde81ae901e8e18cf35f67c7;p=biopieces.git

upgraded read_fastq to handle illumina1.8 better

git-svn-id: http://biopieces.googlecode.com/svn/trunk@1787 74ccb610-7750-0410-82ae-013aeee3265d
---

diff --git a/bp_bin/read_fastq b/bp_bin/read_fastq
index 8fec43b..4a94957 100755
--- a/bp_bin/read_fastq
+++ b/bp_bin/read_fastq
@@ -31,17 +31,18 @@
 require 'maasha/biopieces'
 require 'maasha/fastq'
 
-casts = []
-casts << {:long=>'data_in', :short=>'i', :type=>'files!', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'num', :short=>'n', :type=>'uint', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>'0'}
-casts << {:long=>'solexa', :short=>'s', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+allowed_enc = 'auto,sanger,solexa,illumina13,illumina15,illumina18'
 
-PHRED_SCORES = Regexp.new('[!"#$%&\'()*+,-./0123456789:]')
+casts = []
+casts << {:long=>'data_in', :short=>'i', :type=>'files!', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'num', :short=>'n', :type=>'uint', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>'0'}
+casts << {:long=>'encoding', :short=>'e', :type=>'string', :mandatory=>false, :default=>'auto', :allowed=>allowed_enc, :disallowed=>nil}
 
 options = Biopieces.options_parse(ARGV, casts)
 
-num  = 0
-last = false
+num      = 0
+last     = false
+encoding = options[:encoding]
 
 Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output|
   unless options[:data_in].first == '-'
@@ -54,8 +55,17 @@ Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output|
   options[:data_in].each do |file|
     Fastq.open(file, mode='r') do |fastq|
       fastq.each do |entry|
-        entry.convert_phred2illumina! if entry.qual.match PHRED_SCORES
-        entry.convert_solexa2illumina! if options[:solexa]
+        if encoding == 'auto'
+          if entry.qual.match(/[!-:]/) # sanger or illumina18
+            encoding = 'illumina18'
+          elsif entry.qual.match(/[K-h]/) # solexa or illumina13 or illumina15
+            encoding = 'illumina13'
+          else
+            raise SeqError, "Could not auto-detect quality score encoding"
+          end
+        end
+
+        entry.convert_scores!(encoding, 'illumina13')
 
         output.puts entry.to_bp
         num += 1