X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=code_ruby%2Flib%2Fmaasha%2Ffastq.rb;h=8789899e50d796406657b10727252c32f5f6479f;hb=eb99c8692714ae906908545a7ff8f59f0d9575e9;hp=e2a96ea86c69bf2f97712c2c0b6fa7c73995b4cd;hpb=178225ead000ff26d6e76a654fceb10000bfc87d;p=biopieces.git diff --git a/code_ruby/lib/maasha/fastq.rb b/code_ruby/lib/maasha/fastq.rb index e2a96ea..8789899 100644 --- a/code_ruby/lib/maasha/fastq.rb +++ b/code_ruby/lib/maasha/fastq.rb @@ -28,25 +28,81 @@ require 'maasha/filesys' # Error class for all exceptions to do with FASTQ. class FastqError < StandardError; end +# Class for parsing FASTQ entries from an ios and returning them as Seq objects. class Fastq < Filesys - # Method to get the next FASTQ entry form an ios and return this + # Method to get the next FASTQ entry from an ios and return this # as a Seq object. If no entry is found or eof then nil is returned. def get_entry - begin - seq_name = @io.gets.chomp! - seq = @io.gets.chomp! - qual_name = @io.gets.chomp! - qual = @io.gets.chomp! - - entry = Seq.new - entry.seq = seq - entry.seq_name = seq_name[1 .. seq_name.length] - entry.qual = qual - entry.type = nil - - entry - rescue - nil + seq_name = @io.gets.chomp! + seq = @io.gets.chomp! + @io.gets + qual = @io.gets.chomp! + + entry = Seq.new + entry.seq = seq + entry.seq_name = seq_name[1 .. seq_name.length] + entry.qual = qual + entry.type = nil + + entry + rescue + nil + end +end + +# Class for indexing FASTQ entries. The index will be +# a hash with the FASTQ sequence name as key and a +# FastqElem as value. The latter contains info on +# byte offset and length for each entry. +class FastqIndex + HEADCHAR = 1 + NEWLINE = 1 + + attr_accessor :ios + + # Method to initialize a FastqIndex object. For reading + # entries from file an _ios_ object must be supplied. + def initialize(ios = nil) + @ios = ios + @index = {} + @offset = 0 + end + + # Method to add a Fastq entry to a FastqIndex. 
+ def add(entry) + offset_seq = @offset + HEADCHAR + entry.seq_name.length + NEWLINE + offset_qual = @offset + HEADCHAR + entry.seq_name.length + NEWLINE + entry.length + NEWLINE + HEADCHAR + NEWLINE + + @index[entry.seq_name] = FastqElem.new(offset_seq, offset_qual, entry.length) + + @offset += HEADCHAR + entry.seq_name.length + NEWLINE + entry.length + NEWLINE + HEADCHAR + NEWLINE + entry.length + NEWLINE + end + + # Method to read from file a Fastq entry from an indexed position, + # and return the entry as a Seq object. + def get(seq_name) + raise FastqError, "Sequence name: #{seq_name} not found in index." unless @index[seq_name] + + elem = @index[seq_name] + @ios.sysseek(elem.offset_seq) + seq = @ios.sysread(elem.length) + @ios.sysseek(elem.offset_qual) + qual = @ios.sysread(elem.length) + + Seq.new(seq_name, seq, nil, qual) + end + + private + + # Class for storing index information to be used + # with disk based index. + class FastqElem + attr_reader :offset_seq, :offset_qual, :length + + def initialize(offset_seq, offset_qual, length) + @offset_seq = offset_seq + @offset_qual = offset_qual + @length = length end end end