From: martinahansen Date: Wed, 25 Sep 2013 14:23:05 +0000 (+0000) Subject: adding bzip2 support in ruby X-Git-Url: https://git.donarmstrong.com/?p=biopieces.git;a=commitdiff_plain;h=2f0fd91b461033529a4a72e161bd133252a22eb6 adding bzip2 support in ruby git-svn-id: http://biopieces.googlecode.com/svn/trunk@2208 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/bp_bin/analyze_assembly b/bp_bin/analyze_assembly index a9a6d12..614a9e5 100755 --- a/bp_bin/analyze_assembly +++ b/bp_bin/analyze_assembly @@ -50,7 +50,7 @@ tmpdir = Biopieces.mktmpdir infile = File.join(tmpdir, "in.fna") outfile = File.join(tmpdir, "out.prodigal") -Fasta.open(infile, mode="w") do |fasta_output| +Fasta.open(infile, "w") do |fasta_output| Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| input.each_record do |record| if record[:SEQ] diff --git a/bp_bin/assemble_seq_idba b/bp_bin/assemble_seq_idba index e600c03..87b81a2 100755 --- a/bp_bin/assemble_seq_idba +++ b/bp_bin/assemble_seq_idba @@ -83,7 +83,7 @@ Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| file_contig = File.join(options[:directory], "IDBA-UD", "contig.fa") - Fasta.open(file_contig, mode="r") do |fasta_io| + Fasta.open(file_contig, "r") do |fasta_io| fasta_io.each do |entry| output.puts entry.to_bp end diff --git a/bp_bin/assemble_seq_ray b/bp_bin/assemble_seq_ray index fbef82a..87d25f3 100755 --- a/bp_bin/assemble_seq_ray +++ b/bp_bin/assemble_seq_ray @@ -100,7 +100,7 @@ class Ray count = 0 n50 = 0 - Fasta.open(file, mode="r") do |fasta_io| + Fasta.open(file, "r") do |fasta_io| fasta_io.each do |entry| total += entry.length lengths << entry.length @@ -139,7 +139,7 @@ Dir.mkdir(options[:directory]) unless Dir.exists?(options[:directory]) file_fasta = File.join(options[:directory], "sequence_in.fasta") Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| - Fasta.open(file_fasta, mode="w") do |fasta_io| + Fasta.open(file_fasta, "w") do |fasta_io| input.each_record do |record| if record[:SEQ_NAME] and record[:SEQ] seq = Seq.new_bp(record) @@ -154,7 +154,7 @@ Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| ray.run file_contigs = ray.pick_best_assembly - Fasta.open(file_contigs, mode="r") do |fasta_io| + Fasta.open(file_contigs, "r") do |fasta_io| fasta_io.each do |entry| output.puts entry.to_bp end diff --git a/bp_bin/assemble_seq_velvet b/bp_bin/assemble_seq_velvet index abe6bd5..e2753ce 100755 --- a/bp_bin/assemble_seq_velvet +++ b/bp_bin/assemble_seq_velvet @@ -101,7 +101,7 @@ class Velvet count = 0 n50 = 0 - Fasta.open(file, mode="r") do |fasta_io| + Fasta.open(file, "r") do |fasta_io| fasta_io.each do |entry| total += entry.length lengths << entry.length @@ -144,7 +144,7 @@ Dir.mkdir(options[:directory]) unless Dir.exists?(options[:directory]) file_fasta = File.join(options[:directory], "sequence_in.fna") Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| - Fasta.open(file_fasta, mode="w") do |fasta_io| + Fasta.open(file_fasta, "w") do |fasta_io| input.each_record do |record| if record[:SEQ_NAME] and record[:SEQ] seq = Seq.new_bp(record) @@ -161,7 +161,7 @@ Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| kmer = file_contigs.match(/_\d+/) - Fasta.open(file_contigs, mode="r") do |fasta_io| + Fasta.open(file_contigs, "r") do |fasta_io| fasta_io.each do |entry| entry.seq_name << "_kmer#{kmer}" output.puts entry.to_bp diff --git a/bp_bin/create_bowtie2_index b/bp_bin/create_bowtie2_index index bce4f88..73545eb 100755 --- a/bp_bin/create_bowtie2_index +++ b/bp_bin/create_bowtie2_index @@ -46,7 +46,7 @@ tmpdir = Biopieces.mktmpdir tmpfile = File.join(tmpdir, "tmp.fna") Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| - Fasta.open(tmpfile, mode="w") do |fasta_io| + Fasta.open(tmpfile, "w") do |fasta_io| input.each_record do |record| output.puts record unless options[:no_stream] diff --git a/bp_bin/find_genes b/bp_bin/find_genes index 15e7f57..88dacf2 100755 --- a/bp_bin/find_genes +++ b/bp_bin/find_genes @@ -44,7 +44,7 @@ infile = File.join(tmpdir, "in.fna") outfile = File.join(tmpdir, "out.prodigal") Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| - Fasta.open(infile, mode="w") do |fasta_io| + Fasta.open(infile, "w") do |fasta_io| input.each_record do |record| output.puts record diff --git a/bp_bin/patscan_seq b/bp_bin/patscan_seq index a209bc6..28f028d 100755 --- a/bp_bin/patscan_seq +++ b/bp_bin/patscan_seq @@ -161,7 +161,7 @@ seq_name_hash = {} seq_type = nil Biopieces.open(options[:stream_in], tmp_file) do |input, output| - Fasta.open(in_file, mode="w") do |fasta_io| + Fasta.open(in_file, "w") do |fasta_io| input.each_record do |record| if record[:SEQ_NAME] seq_name_hash[seq_name_count] = record[:SEQ_NAME] diff --git a/bp_bin/pcr_seq b/bp_bin/pcr_seq index 09e6269..6afbb34 100755 --- a/bp_bin/pcr_seq +++ b/bp_bin/pcr_seq @@ -146,7 +146,7 @@ class Pattern # Save a pattern to file def save_pattern(file) - File.open(file, mode="w") do |ios| + File.open(file, "w") do |ios| ios.puts self end end @@ -173,7 +173,7 @@ end raise ArgumentError, "no adaptor specified" unless options[:forward] or options[:reverse] Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| - Fasta.open(infile, mode="w") do |ios| + Fasta.open(infile, "w") do |ios| input.each_record do |record| output.puts record @@ -187,7 +187,7 @@ Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| outfiles = Pcr.new(tmpdir, infile, options).run outfiles.each do |outfile| - Fasta.open(outfile, mode="r") do |ios| + Fasta.open(outfile, "r") do |ios| ios.each do |entry| record = entry.to_bp record[:REC_TYPE] = "PCR" diff --git a/bp_bin/read_embl b/bp_bin/read_embl index 0a3558c..ca2839b 100755 --- a/bp_bin/read_embl +++ b/bp_bin/read_embl @@ -54,7 +54,7 @@ Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| if options[:data_in] options[:data_in].each do |file| - EMBL.open(file, mode='r') do |embl_io| + EMBL.open(file, 'r') do |embl_io| embl_io.each(hash_keys, hash_feats, hash_quals) do |entry| output.puts entry diff --git a/bp_bin/read_fasta b/bp_bin/read_fasta index 8bc5874..e649440 100755 --- a/bp_bin/read_fasta +++ b/bp_bin/read_fasta @@ -49,7 +49,7 @@ Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| if options[:data_in] options[:data_in].each do |file| - Fasta.open(file, mode='r') do |fasta| + Fasta.open(file, 'r') do |fasta| fasta.each do |entry| output.puts entry.to_bp num += 1 diff --git a/bp_bin/read_fastq b/bp_bin/read_fastq index 0c5b9b2..e4fd75f 100755 --- a/bp_bin/read_fastq +++ b/bp_bin/read_fastq @@ -65,10 +65,7 @@ Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| io1 = Fastq.open(file1, 'r') io2 = Fastq.open(file2, 'r') - while not io1.eof? and not io2.eof? - entry1 = io1.get_entry - entry2 = io2.get_entry - + while entry1 = io1.get_entry and entry2 = io2.get_entry if encoding == :auto if entry1.qual_base33? or entry2.qual_base33? encoding = :base_33 diff --git a/bp_bin/read_genbank b/bp_bin/read_genbank index 7cabf2b..006cb50 100755 --- a/bp_bin/read_genbank +++ b/bp_bin/read_genbank @@ -55,7 +55,7 @@ Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| if options[:data_in] options[:data_in].each do |file| - Genbank.open(file, mode='r') do |gb| + Genbank.open(file, 'r') do |gb| gb.each(hash_keys, hash_feats, hash_quals) do |entry| output.puts entry diff --git a/bp_bin/read_sff b/bp_bin/read_sff index cd9d58a..1410af6 100755 --- a/bp_bin/read_sff +++ b/bp_bin/read_sff @@ -49,7 +49,7 @@ Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| if options[:data_in] options[:data_in].each do |file| - SFF.open(file, mode='r') do |sff| + SFF.open(file, 'r') do |sff| sff.each do |entry| entry.mask if options[:mask] entry.clip if options[:clip] diff --git a/bp_bin/uchime_seq b/bp_bin/uchime_seq index 0504918..8843274 100755 --- a/bp_bin/uchime_seq +++ b/bp_bin/uchime_seq @@ -43,7 +43,7 @@ tmpfile = File.join(tmpdir, "tmp.stream") outfile = File.join(tmpdir, "out.uc") Biopieces.open(options[:stream_in], tmpfile) do |input, output| - Fasta.open(infile, mode="w") do |fasta_io| + Fasta.open(infile, "w") do |fasta_io| input.each_record do |record| output.puts record diff --git a/bp_bin/usearch_seq b/bp_bin/usearch_seq index 946659f..38f6556 100755 --- a/bp_bin/usearch_seq +++ b/bp_bin/usearch_seq @@ -54,7 +54,7 @@ infile = File.join(tmpdir, "in.fna") outfile = File.join(tmpdir, "out.uc") Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| - Fasta.open(infile, mode="w") do |fasta_io| + Fasta.open(infile, "w") do |fasta_io| input.each_record do |record| output.puts record diff --git a/bp_bin/write_fastq b/bp_bin/write_fastq index 30bdd89..d15c96f 100755 --- a/bp_bin/write_fastq +++ b/bp_bin/write_fastq @@ -34,38 +34,33 @@ require 'maasha/fastq' allowed_enc = 'base_33,base_64' casts = [] -casts << {:long=>'no_stream', :short=>'x', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'data_out', :short=>'o', :type=>'file', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'encoding', :short=>'e', :type=>'string', :mandatory=>false, :default=>'base_33', :allowed=>allowed_enc, :disallowed=>nil} -casts << {:long=>'compress', :short=>'Z', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {long: 'no_stream', short: 'x', type: 'flag', mandatory: false, default: nil, allowed: nil, disallowed: nil} +casts << {long: 'data_out', short: 'o', type: 'file', mandatory: false, default: nil, allowed: nil, disallowed: nil} +casts << {long: 'encoding', short: 'e', type: 'string', mandatory: false, default: 'base_33', allowed: allowed_enc, disallowed: nil} +casts << {long: 'compress', short: 'Z', type: 'string', mandatory: false, default: nil, allowed: "gzip,bzip,bzip2", disallowed: nil} options = Biopieces.options_parse(ARGV, casts) encoding = options[:encoding].to_sym +compress = options[:compress] ? options[:compress].to_sym : nil + +raise "--data_out is mandatory for compressed output" if compress and not options[:data_out] Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| - if options[:data_out] - if options[:compress] - io_out = Zlib::GzipWriter.open(options[:data_out]) - else - io_out = Fastq.open(options[:data_out], 'w') - end - else - io_out = $stdout - end + fastq_out = options[:data_out] ? Fastq.open(options[:data_out], 'w', compress: compress) : STDOUT input.each do |record| if record[:SEQ_NAME] and record[:SEQ] and record[:SCORES] entry = Seq.new_bp(record) entry.qual_convert!(:base_33, encoding) - io_out.puts entry.to_fastq + fastq_out.puts entry.to_fastq end output.puts record unless options[:no_stream] end - io_out.close + fastq_out.close end diff --git a/code_ruby/lib/maasha/biopieces.rb b/code_ruby/lib/maasha/biopieces.rb index e51d538..039bb69 100644 --- a/code_ruby/lib/maasha/biopieces.rb +++ b/code_ruby/lib/maasha/biopieces.rb @@ -84,7 +84,7 @@ class Biopieces io_in = self.open_input(input) io_out = self.open_output(output) - if block_given? + if block_given? # FIXME begin block outmost? begin yield io_in, io_out ensure diff --git a/code_ruby/lib/maasha/filesys.rb b/code_ruby/lib/maasha/filesys.rb index e842178..4bb7b8f 100644 --- a/code_ruby/lib/maasha/filesys.rb +++ b/code_ruby/lib/maasha/filesys.rb @@ -23,6 +23,7 @@ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< require 'zlib' +require 'bzip2' # Error class for all exceptions to do with Filesys. class FilesysError < StandardError; end @@ -40,75 +41,67 @@ class Filesys path end - # Class method allowing open to be used on (zipped) files. - # See File.open. def self.open(*args) - args = *args - file = args.first - - if file == "-" - ios = self.new(STDIN) - elsif File.pipe? file - ios = self.new(File.open(*args)) + file = args.shift + mode = args.shift + options = args.shift || {} + + if mode == 'w' + case options[:compress] + when :gzip + ios = Zlib::GzipWriter.new File.open(file, mode, options) + when :bzip, :bzip2 + ios = Bzip2::Writer.new File.open(file, mode, options) + else + ios = File.open(file, mode, options) + end else - ios = self.zopen(*args) + if file == '-' + ios = STDIN + else + case `file #{file}` + when /gzip/ + ios = Zlib::GzipReader.new File.open(file, mode, options) + when /bzip/ + ios = Bzip2::Reader.new File.open(file, mode, options) + else + ios = File.open(file, mode, options) + end + end end if block_given? begin - yield ios + yield self.new(ios) ensure ios.close end else - return ios + return self.new(ios) end end - def initialize(io) - @io = io + def initialize(ios) + @io = ios + end + + def puts(*args) + @io.puts(*args) end - # Method to close ios. def close - @io.close unless @io.is_a? Zlib::GzipReader + @io.close end def eof? @io.eof? end - # Method to check if io is closed. - def closed? - @io.closed? - end - # Iterator method for parsing entries. def each while entry = get_entry do yield entry end end - - # Method to puts directoy on Filesys objects. - def puts(*args) - @io.puts(*args) - end - - private - - # Helper method to return an ios to a file that may be zipped in which case - # the ios is unzipped on the fly. See File.open. - def self.zopen(*args) - ios = File.open(*args) - - begin - ios = Zlib::GzipReader.new(ios) - rescue - ios.rewind - end - - self.new(ios) - end end