From: martinahansen Date: Tue, 8 Feb 2011 20:38:46 +0000 (+0000) Subject: added read_sff biopiece X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=45cc6eab58cfc1d5af693a10f70de2ce18985286;p=biopieces.git added read_sff biopiece git-svn-id: http://biopieces.googlecode.com/svn/trunk@1262 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/bp_bin/read_sff b/bp_bin/read_sff new file mode 100755 index 0000000..a14048e --- /dev/null +++ b/bp_bin/read_sff @@ -0,0 +1,75 @@ +#!/usr/bin/env ruby + +# Copyright (C) 2007-2010 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# This program is part of the Biopieces framework (www.biopieces.org). + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# Read SFF entries from one or more files. 
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

require 'biopieces'
require 'sff'

# Command line options understood by this Biopiece:
#   -i / --data_in  one or more SFF files to read
#   -n / --num      stop after emitting this many SFF entries (0 is disallowed)
#   -m / --mask     soft mask (lowercase) bases outside the quality clip window
#   -c / --clip     trim bases and scores to the quality clip window
casts = []
casts << {:long=>'data_in', :short=>'i', :type=>'files!', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
casts << {:long=>'num',     :short=>'n', :type=>'uint',   :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>'0'}
casts << {:long=>'mask',    :short=>'m', :type=>'flag',   :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
casts << {:long=>'clip',    :short=>'c', :type=>'flag',   :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}

bp = Biopieces.new

options = bp.parse(ARGV, casts)

# Pass records from the incoming stream through before emitting new ones.
bp.each_record do |record|
  bp.puts record
end

limit = options[:num] # nil when -n was not given, i.e. no limit
num   = 0             # SFF entries emitted so far, counted across all files
last  = false         # set once the limit is reached to stop the file loop too

(options[:data_in] || []).each do |file|
  # SFF is a binary format, so open in binary mode. The original passed
  # mode='r', which also created a stray local variable named mode.
  SFF.open(file, 'rb') do |sff|
    sff.each do |entry|
      entry.mask if options[:mask]
      entry.clip if options[:clip]
      bp.puts entry.to_bp
      num += 1

      if limit && limit == num
        last = true
        break
      end
    end
  end

  break if last
end


# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<


__END__
diff --git a/bp_test/in/read_sff.in b/bp_test/in/read_sff.in new file mode 100644 index 0000000..59f742e Binary files /dev/null and b/bp_test/in/read_sff.in differ diff --git a/bp_test/in/read_sff.in.gz b/bp_test/in/read_sff.in.gz new file mode 100644 index 0000000..31e4540 Binary files /dev/null and b/bp_test/in/read_sff.in.gz differ diff --git a/bp_test/out/read_sff.out.1 b/bp_test/out/read_sff.out.1 new file mode 100644 index 0000000..299cfe3 --- /dev/null +++ b/bp_test/out/read_sff.out.1 @@ -0,0 +1,90 @@ +SEQ_NAME: FQIBXOY01DRIMT +SEQ: TCAGTCATATTTTTTAGAAACATGTTTGTTTGGACTCATTAATTCATGATTAAAATCACCATCATTCGTTATCAATAAAAGCCCTTCTGTATCTTTATCAAGACGACCAACCGGAAAAATATTTAGATGTTGGTATTCAGGTATTAAATCAATAACGGTTTTTGAATGATGATCTTCAGTTGCTGATATATAACCTTTTGGCTTATTTAACATAATATAGACATTTTCAATGTATTCTATTAATTCTCCACGAACTGTTATCTTATCGTTTTCTGGTTC +SEQ_LEN: 279 +CLIP_QUAL_LEFT: 4 +CLIP_QUAL_RIGHT: 277 +CLIP_ADAPTOR_LEFT: 0 +CLIP_ADAPTOR_RIGHT: 0 +SCORES: aaa`[[[_[NNNNNNTUP[[__`abbcccddddeeeeeeedddcdcccc``bbbbaaaba_`````bbbbba````____\\\\``_aabbbbbbbba````_WVV\\bbbb``\\`^_bXbbb_`_``bbbbbaaabbb___aabbaaa[[UUUbbbZZZZZabbbbbcaa[[UUU[[[[aabbbac____aPPNNNNPP]PPPWWabaaaabbbbbbbaaabbbbbaaabbbXXX__XXXXXXXXXXUYYUUUY[UUUYUUUUYXXMMMMRKMMMMM +--- +SEQ_NAME: FQIBXOY01AV4UR +SEQ: TCAGCTTGAGCAAATTCTTTATCTTTAAAATTAAACATTTTGTTGAAATTACTGTATCTTTAAAACTTAGATTCAATCGCTTCTTTTATTCTCTTCTGATGACACTCCTACTTGATTCGCAATAACTCAATCCAAACGACCAACCAATGTCAGCTAATTCATCAAGTTGTACGTCTAACGGCTTACCTGGTTTCTTCTTCCAGTTCTTGAACGTTTCCTAATGTGTTAAACCAACTTCTAAAGAAATTCAACCACATACGCAATCTTGCTATCTCGTAAATTTAAGTTG +SEQ_LEN: 289 +CLIP_QUAL_LEFT: 4 +CLIP_QUAL_RIGHT: 69 +CLIP_ADAPTOR_LEFT: 0 +CLIP_ADAPTOR_RIGHT: 0 +SCORES: 
```````````UUU\\`[[[]Wa]XXMMMMKKIIIRKMMMMRIIYY[[QQNSQ[[^^\]]][[XXRNNMXYZ\ZZZ[[XWXX[[[\[^JJRW\\WVVVV\\ZUUUUUUOOOMMMMVZSSRSSPPPPYXXXWVVWMMMSKMMMMMMMMMSVVVXXXSUOOOO\\UUUSOOOUUWWURSSUSMMMMMRRMMMUUMLIIRPTRRLLLLOOSRRRWWUUUOOMOOOSTTTTQQQKKIIKKKQKKKKKKMMKKKKKKKKKRUUUUUUVVVVVUTTUUURRPOKKKMMMKHKHHR +--- +SEQ_NAME: FQIBXOY01CU7IT +SEQ: TCAGTTCGTAAAAGTGTGATAGATGATGGCAGATGTTATCTCTGTCCGTGTCTAGGCTATCCAAGACAATGGCGTTCAGAAGATATTTACCAGGAAATAAATGAGACGATACAAATAATAGAAATTTAAAATGCGCAAACCTGACCCAGTTTGCGCATTTTATGTTTTACACACGCGAGTAATGTGTTTACTTACGTGTGTTTATTTTGTTGCTGATTTTCAATTGTATATGAATGTGGTTGCACATAAATGCACTTTC +SEQ_LEN: 259 +CLIP_QUAL_LEFT: 4 +CLIP_QUAL_RIGHT: 257 +CLIP_ADAPTOR_LEFT: 0 +CLIP_ADAPTOR_RIGHT: 0 +SCORES: eeeeeeeeeeeeeeeeeeeeeehhhghffgghhhhgghhhhhhhgffhheeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeaaadeeeda]]]ddeeeeeeeeeeeeeeeeeedddedd]]]]dddeeeeeeeeeeeeeeeeedddeeedddddeeeeeeeeeeeeeeeeeeeeeedd\\\ddddeeeeeeeddddddddeeeeeeeeeeeedddddddedddddddddddaaadddddVVRY__]YPPMO +--- +SEQ_NAME: FQIBXOY01CEZSI +SEQ: TCAGAGGTTATGACGTTAAAGCTATTGATGGTCATTCGAACATAACAGAAGCAAGTTTGAAAAGTTCCAAAATATTTGTAATTCCTGAGGCTAACATTCCTTTCAAAGAATCAGAACAGGCAGCAATTGTTAAATATGTGAAACAAGGTGGCAATGTTGTCTTTATTTCAGATCATTACAATGCTGACCGAAATTTAAATCGTATTGATTCATCGGAGGCAATGAATGGTTATCGACGTGGAGCATATGAAGATATG +SEQ_LEN: 257 +CLIP_QUAL_LEFT: 4 +CLIP_QUAL_RIGHT: 253 +CLIP_ADAPTOR_LEFT: 0 +CLIP_ADAPTOR_RIGHT: 0 +SCORES: eeeeeeeeeeeeeeeeedddddfhhfffffhfhhhhhhhhhhhhhhhhfeeeeeeddcccceeeeeeeeeeeeecccc```ddeeeedddddddeeddddeedd```eeeeeeeeeeeeeeeeeeeeeeeecccddeeee\\\deeeeeeeeeeeeeeeeeeeeeeeeeccceeeeeeeeeeeeeeeeed\\\ccceeeeeeeeeeeedddeeeeeeeeedddddddeefeddddddddddaaaddddd____^YYY +--- +SEQ_NAME: FQIBXOY01C4ETH +SEQ: TCAGTGAAACAAACACGCAACAATCATTTGCTAATTGTAAGCAACTTAGACAAGTATATCCGAATGGTGTCACTGCCGATCATCCAGCATATCGACCACATTTAGATAGAGATAAAGATAAACGTGCATGTGAACCTGATAAATATTAAACAACAAGCGAATTGAATTCAAATTGTATTTAGCTTTATGCACTAATCACATAGTAAATAATGAGGGAGATTTTTTAGGCATGAGCAATCAATTCAAAAG +SEQ_LEN: 249 +CLIP_QUAL_LEFT: 4 +CLIP_QUAL_RIGHT: 248 +CLIP_ADAPTOR_LEFT: 0 
+CLIP_ADAPTOR_RIGHT: 0 +SCORES: eeeeeeeeeeeeeeeeeeeeeehgffaaahheggeddddddddegggfgeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeddddeeeeeeeeeeeeeeeeeedddeeedddddeeeeeeeeeeeeeeeeeeeeeeedd\\\adeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee`_____aaaddaaadddddd[[[[dZZZZ^ +--- +SEQ_NAME: FQIBXOY01BXD7A +SEQ: TCAGAAAAGCTTTATAATTTTATAATTGCTAAATCTTTTCAACAACCAGTTGGAAGTACGTTCACTTATGGTGAATTAAGAAAGAAATATAATGTGGTTTGTAGCACGAATGATCAACGTGAAGTAGGACGTCGTTTTGCTTACTGGATTAAGTACACGCCAGGATTACCATTTAAAATGTAGGAACAAAAAATGGCAGTTATTATATCAGAAATAGGGATAAACCCATGTCAATAATAGCACGCCTTCGAAAGGAGGTGATTGCTAATGAGTTACACNTTAGTTTG +SEQ_LEN: 287 +CLIP_QUAL_LEFT: 4 +CLIP_QUAL_RIGHT: 177 +CLIP_ADAPTOR_LEFT: 0 +CLIP_ADAPTOR_RIGHT: 0 +SCORES: ]]]XRRRR^^[[[^[^^^^^^[PTRWW```^^^WYW^^_^^]\XXVWWYXXWWXX\a\^^[[[\^^^^\]]\\XXXXVVXPPPVUWW]Z\\\^^^^^`^^WUW````^^^``__^^^^`^^[[[_\\\\\\\UPPPPPMVXXZXQUUVVVZZ^\XUUUUUUYZUOOMMMWWUUUMMMMJMMWRRMMMNNNNNNRRWURLLLRPPRTUUULLLLLRMUUURQOSSXTTMOOOWWWWWTTOQQQKKKQKKKKKKKKKKOKKKPKKKKPRROPKKKPRPOO@KKPPLKKK +--- +SEQ_NAME: FQIBXOY01BWE7M +SEQ: TCAGTATGATGACGGCTAATGATGATGTAGAGGCGCCGAGTGACTTTGAAAAAATCAGAGCTGAAGTTTCATGGTAATAGATATTATCATTTTTGAATTAATTATATTAATGTGTTTAGCAATAGCACTGGAGGTGTTGTAAATATGTGGATTGTCATTTCAATTGTTTTATCTATATTTTTATTGATCTTGTTAAGTAGCATTTCTCATAAGATGAAAACCATAGAAGCATTGGAGTATATGAATG +SEQ_LEN: 247 +CLIP_QUAL_LEFT: 4 +CLIP_QUAL_RIGHT: 242 +CLIP_ADAPTOR_LEFT: 0 +CLIP_ADAPTOR_RIGHT: 0 +SCORES: eeeeeeeeeeeeedddeeeeeehhhhhhhhhhhhhhhhhhhhhhhhff__ff__eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeedddeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeaafddddaaadddddddeddddddddd +--- +SEQ_NAME: FQIBXOY01A2RM8 +SEQ: TCAGAAACTAAAAAACTTAAAAAAGCATGCCAATCAGTACATCATAATTGCGTCTTGGGGACAGACAAATGATGAATAGAGATTGGCATGCTTTTTATTTTTGAATATAAATATTTAGTTCATGGCATTTCTAGTTACATGACGTCCATGAATTAAGAAGTAAACAAGCATAGTAATGATTGCTAAAGCGGCCATAAAGCCGAAGATTTCACTATATGAAAACATATGAGTAAATAACCCAAGGAATGATGGACCGAAGCCGAC +SEQ_LEN: 264 +CLIP_QUAL_LEFT: 4 +CLIP_QUAL_RIGHT: 
261 +CLIP_ADAPTOR_LEFT: 0 +CLIP_ADAPTOR_RIGHT: 0 +SCORES: ```^OOO[ULLLLLLS[[IIIIII[[`````bbbccddcccddddddddbbbbbbbbbbb__^_^[TTT__`bbbb`^_^bbbbbbbbb``YY^^Y`JJJJJZZZ]]XXZ``[TTTZ``]bbb`___ZZZ_]bb````^__b_^_^^`````bbbbbbbbb```b`````bbbb``]]]bb`]YSRRR`````]QQQR]]]`]]]]`b``XXX]`XXXXYYVVMVSZZZXSNNNNKKNNVXXSSSV][VPPPYYYYYYYYYYYY +--- +SEQ_NAME: FQIBXOY01API7E +SEQ: TCAGCAATAGATATAATTTATGGTTTATATCTATTTCGGCATCTTTACCTTTCACTTGTTCAACTTATGTACCATAAATACTTCTGACAAGTTACTAATTAAACATGCAACCTCTAACTCAATTTAATATTTTAACTAACTTGTAATATACAGGATTCATCACGCATAATCAACCCTGTAAAACTTGATACGCAATAAAAGTTTTAAAGCATTTTATTGCGACAACTGTCTATCTATGTTTTTTCAAACGAATTTCATCAACTAGATTCCAGATAAATTC +SEQ_LEN: 280 +CLIP_QUAL_LEFT: 4 +CLIP_QUAL_RIGHT: 278 +CLIP_ADAPTOR_LEFT: 0 +CLIP_ADAPTOR_RIGHT: 0 +SCORES: eeeeeeeeeeeeeeeeeeeeeeggfffffhffffffffffefeeeefeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeddddddeeeeeeeeeeeeeeeeedddeeeeeeeeee``\\\``eeddddeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeccccdddeeeeeedccccdeeeee```ceeee````WWceeeeeccccceedddddddddddddda\[[[^]]]adaaad_____________^^^_\\\\^___\\\\\\ +--- +SEQ_NAME: FQIBXOY01BJJRF +SEQ: TCAGTAACTATCAAATAAAATGATAACGGTTTCATCTATCTATTTTATCGGTCTAGTGGCTGATTTCAAGCTAGAAATATTGAATGACAATACAACTCTGTTAAAATGATGGACGTAGACAAATATGCGTATTGACGCTTTATTTTAAAAATTAACATGCTTATAACATGTTTATAGAAGGAGATTAACCTATGAACTATCAAGTTCTTTTATATTATAAATATATGACGATTGATGACCCTGAACAGTTTGCTCAGGATCAC +SEQ_LEN: 263 +CLIP_QUAL_LEFT: 4 +CLIP_QUAL_RIGHT: 260 +CLIP_ADAPTOR_LEFT: 0 +CLIP_ADAPTOR_RIGHT: 0 +SCORES: cccc```ccbbb___bXXXXbbcccegeeddffffffedddecdddccccbbbccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccbbbccccccccbbbbccccccccccccccccccccccccccccccccccccccVVVVVVaccbbbcccccccccccccccccccccccccccccccccccaaaaccccccccccccaWRR\YVVVVVVYYYY +--- diff --git a/bp_test/out/read_sff.out.2 b/bp_test/out/read_sff.out.2 new file mode 100644 index 0000000..7591dc9 --- /dev/null +++ b/bp_test/out/read_sff.out.2 @@ -0,0 +1,9 @@ +SEQ_NAME: FQIBXOY01DRIMT +SEQ: 
TCAGTCATATTTTTTAGAAACATGTTTGTTTGGACTCATTAATTCATGATTAAAATCACCATCATTCGTTATCAATAAAAGCCCTTCTGTATCTTTATCAAGACGACCAACCGGAAAAATATTTAGATGTTGGTATTCAGGTATTAAATCAATAACGGTTTTTGAATGATGATCTTCAGTTGCTGATATATAACCTTTTGGCTTATTTAACATAATATAGACATTTTCAATGTATTCTATTAATTCTCCACGAACTGTTATCTTATCGTTTTCTGGTTC +SEQ_LEN: 279 +CLIP_QUAL_LEFT: 4 +CLIP_QUAL_RIGHT: 277 +CLIP_ADAPTOR_LEFT: 0 +CLIP_ADAPTOR_RIGHT: 0 +SCORES: aaa`[[[_[NNNNNNTUP[[__`abbcccddddeeeeeeedddcdcccc``bbbbaaaba_`````bbbbba````____\\\\``_aabbbbbbbba````_WVV\\bbbb``\\`^_bXbbb_`_``bbbbbaaabbb___aabbaaa[[UUUbbbZZZZZabbbbbcaa[[UUU[[[[aabbbac____aPPNNNNPP]PPPWWabaaaabbbbbbbaaabbbbbaaabbbXXX__XXXXXXXXXXUYYUUUY[UUUYUUUUYXXMMMMRKMMMMM +--- diff --git a/bp_test/out/read_sff.out.3 b/bp_test/out/read_sff.out.3 new file mode 100644 index 0000000..a8e3bcf --- /dev/null +++ b/bp_test/out/read_sff.out.3 @@ -0,0 +1,9 @@ +SEQ_NAME: FQIBXOY01DRIMT +SEQ: tcagTCATATTTTTTAGAAACATGTTTGTTTGGACTCATTAATTCATGATTAAAATCACCATCATTCGTTATCAATAAAAGCCCTTCTGTATCTTTATCAAGACGACCAACCGGAAAAATATTTAGATGTTGGTATTCAGGTATTAAATCAATAACGGTTTTTGAATGATGATCTTCAGTTGCTGATATATAACCTTTTGGCTTATTTAACATAATATAGACATTTTCAATGTATTCTATTAATTCTCCACGAACTGTTATCTTATCGTTTTCTGGTTc +SEQ_LEN: 279 +CLIP_QUAL_LEFT: 4 +CLIP_QUAL_RIGHT: 277 +CLIP_ADAPTOR_LEFT: 0 +CLIP_ADAPTOR_RIGHT: 0 +SCORES: aaa`[[[_[NNNNNNTUP[[__`abbcccddddeeeeeeedddcdcccc``bbbbaaaba_`````bbbbba````____\\\\``_aabbbbbbbba````_WVV\\bbbb``\\`^_bXbbb_`_``bbbbbaaabbb___aabbaaa[[UUUbbbZZZZZabbbbbcaa[[UUU[[[[aabbbac____aPPNNNNPP]PPPWWabaaaabbbbbbbaaabbbbbaaabbbXXX__XXXXXXXXXXUYYUUUY[UUUYUUUUYXXMMMMRKMMMMM +--- diff --git a/bp_test/out/read_sff.out.4 b/bp_test/out/read_sff.out.4 new file mode 100644 index 0000000..9809971 --- /dev/null +++ b/bp_test/out/read_sff.out.4 @@ -0,0 +1,9 @@ +SEQ_NAME: FQIBXOY01DRIMT +SEQ: 
TCATATTTTTTAGAAACATGTTTGTTTGGACTCATTAATTCATGATTAAAATCACCATCATTCGTTATCAATAAAAGCCCTTCTGTATCTTTATCAAGACGACCAACCGGAAAAATATTTAGATGTTGGTATTCAGGTATTAAATCAATAACGGTTTTTGAATGATGATCTTCAGTTGCTGATATATAACCTTTTGGCTTATTTAACATAATATAGACATTTTCAATGTATTCTATTAATTCTCCACGAACTGTTATCTTATCGTTTTCTGGTT +SEQ_LEN: 274 +CLIP_QUAL_LEFT: 4 +CLIP_QUAL_RIGHT: 277 +CLIP_ADAPTOR_LEFT: 0 +CLIP_ADAPTOR_RIGHT: 0 +SCORES: [[[_[NNNNNNTUP[[__`abbcccddddeeeeeeedddcdcccc``bbbbaaaba_`````bbbbba````____\\\\``_aabbbbbbbba````_WVV\\bbbb``\\`^_bXbbb_`_``bbbbbaaabbb___aabbaaa[[UUUbbbZZZZZabbbbbcaa[[UUU[[[[aabbbac____aPPNNNNPP]PPPWWabaaaabbbbbbbaaabbbbbaaabbbXXX__XXXXXXXXXXUYYUUUY[UUUYUUUUYXXMMMMRKMMMM +--- diff --git a/bp_test/test/test_read_sff b/bp_test/test/test_read_sff new file mode 100755 index 0000000..3b05613 --- /dev/null +++ b/bp_test/test/test_read_sff @@ -0,0 +1,19 @@ +#!/bin/bash + +source "$BP_DIR/bp_test/lib/test.sh" + +run "$bp -i $in -O $tmp" +assert_no_diff $tmp $out.1 +clean + +run "$bp -i $in -n 1 -O $tmp" +assert_no_diff $tmp $out.2 +clean + +run "$bp -i $in -n 1 -m -O $tmp" +assert_no_diff $tmp $out.3 +clean + +run "$bp -i $in -n 1 -c -O $tmp" +assert_no_diff $tmp $out.4 +clean diff --git a/code_ruby/Maasha/lib/sff.rb b/code_ruby/Maasha/lib/sff.rb new file mode 100644 index 0000000..3e0fc37 --- /dev/null +++ b/code_ruby/Maasha/lib/sff.rb @@ -0,0 +1,225 @@ +# Copyright (C) 2011 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

# http://www.gnu.org/copyleft/gpl.html

# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

# This software is part of the Biopieces framework (www.biopieces.org).

# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

# Error class for all exceptions to do with SFF.
class SFFError < StandardError; end

# Class containing methods to parse SFF files:
# http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=show&f=formats&m=doc&s=format#sff
class SFF
  include Enumerable

  # Class method for opening SFF files. Arguments are passed on to File.open.
  # With a block, the SFF object is yielded, the file is closed afterwards and
  # the value of the block is returned. Without a block the SFF object is
  # returned and the caller must call close.
  # Raises SFFError if the header section is truncated or malformed.
  def self.open(*args)
    ios = File.open(*args)
    ios.binmode # SFF is binary; avoid newline translation in text mode

    parsed = false

    begin
      sff    = self.new(ios)
      parsed = true

      return sff unless block_given?

      yield sff
    ensure
      # Close after the block, or when header parsing failed (no leak).
      ios.close if block_given? || !parsed
    end
  end

  # Method to initialize a SFF object along with
  # instance variables pertaining to the SFF header
  # section. The header is parsed immediately from io.
  # Raises SFFError if the header is truncated or malformed.
  def initialize(io)
    @io                       = io
    @count                    = 0 # reads parsed so far - per object, not per class
    @magic_number             = 0
    @version                  = ""
    @index_offset             = 0
    @index_length             = 0
    @number_of_reads          = 0
    @header_length            = 0
    @key_length               = 0
    @number_of_flows_per_read = 0
    @flowgram_format_code     = 0
    @flow_chars               = ""
    @key_sequence             = ""
    @eight_byte_padding       = 0

    header_parse
  end

  # Method to close ios.
  def close
    @io.close
  end

  # Method to iterate over each SFF entry, yielding Read objects.
  # Returns an Enumerator when called without a block.
  def each
    return to_enum(:each) unless block_given?

    while (read = read_parse) do
      yield read
    end

    self # conventionally
  end

  private

  # Method that reads exactly length bytes from the ios.
  # Raises SFFError if fewer bytes are available.
  def read_exactly(length)
    data = @io.read(length)

    if data.nil? || data.bytesize < length
      raise SFFError, "Unexpected end of SFF file."
    end

    data
  end

  # Method to parse the SFF file's header section
  # and load the information into the instance variables.
  def header_parse
    template     = "NC4N2NNnnnC"
    bits_in_uint = 32

    data = read_exactly(31).unpack(template)

    @magic_number = data[0]
    @version      = data[1 .. 4].join ""

    # Validate before trusting any of the length fields read below.
    check_magic_number
    check_version

    @index_offset             = (data[5] << bits_in_uint) | data[6]
    @index_length             = data[7]
    @number_of_reads          = data[8]
    @header_length            = data[9]
    @key_length               = data[10]
    @number_of_flows_per_read = data[11]
    @flowgram_format_code     = data[12]
    @flow_chars               = read_exactly(@number_of_flows_per_read).unpack("A*").join ""
    @key_sequence             = read_exactly(@key_length).unpack("A*").join ""

    fast_forward

    check_header_length
  end

  # Method that reads the eight_byte_padding field found at the end of the
  # data section and fast forwards, i.e. move the file read pointer,
  # so that the length of the section is divisible by 8.
  def fast_forward
    eight_byte_padding = 8 - (@io.pos % 8)

    @io.read(eight_byte_padding) unless eight_byte_padding == 8
  end

  # Method to parse a read section of a SFF file.
  # Returns a Read object, or nil when all reads announced in the
  # header have been parsed.
  def read_parse
    return nil if @count >= @number_of_reads

    template = "nnNnnnn"

    read = Read.new()

    data = read_exactly(16).unpack(template)

    read.read_header_length = data[0]
    read.name_length        = data[1]
    read.number_of_bases    = data[2]
    read.clip_qual_left     = data[3]
    read.clip_qual_right    = data[4]
    read.clip_adapter_left  = data[5]
    read.clip_adaptor_right = data[6]
    read.name               = read_exactly(read.name_length).unpack("A*").join ""

    fast_forward

    read_exactly(2 * @number_of_flows_per_read) # skip through flowgram_values
    read_exactly(read.number_of_bases)          # skip through flow_index_per_base

    # NB! Parsing of flowgram_values and flow_index_per_base is currently disabled since these are not needed.
    # read.flowgram_values = @io.read(2 * @number_of_flows_per_read).unpack("n*").map { |val| val = sprintf("%.2f", val * 0.01) }
    # flow_index_per_base = @io.read(read.number_of_bases).unpack("C*")
    # (1 ... flow_index_per_base.length).each { |i| flow_index_per_base[i] += flow_index_per_base[i - 1] }
    # read.flow_index_per_base = flow_index_per_base

    read.bases          = read_exactly(read.number_of_bases).unpack("A*").join ""
    read.quality_scores = read_exactly(read.number_of_bases).unpack("C*")

    fast_forward

    @count += 1

    read
  end

  # Method to check the magic number of a SFF file.
  # Raises an error if the magic number doesn't match
  # (779314790 is ".sff" read as a big-endian 32 bit integer).
  def check_magic_number
    raise SFFError, "Badly formatted SFF file." unless @magic_number == 779314790
  end

  # Method to check the version number of a SFF file.
  # Raises an error if the version doesn't match.
  def check_version
    raise SFFError, "Wrong version #{@version}" unless @version.to_i == 1
  end

  # Method to check the header length of a SFF file.
  # Raises an error if the header length doesn't match
  # the file position after reading the header section.
  def check_header_length
    raise SFFError, "Bad header length: #{@header_length}" unless @io.pos == @header_length
  end
end

# Class containing data accessor methods for an SFF entry and methods
# for manipulating this entry.
class Read
  attr_accessor :read_header_length, :name_length, :number_of_bases,
    :clip_qual_left, :clip_qual_right, :clip_adapter_left, :clip_adaptor_right,
    :name, :flowgram_values, :flow_index_per_base, :bases, :quality_scores

  # Method that converts a Read object's data to a Biopiece record (a hash).
  # Scores are encoded as characters with an offset of 64.
  # NOTE(review): the clip values are emitted as stored value - 1, so a stored
  # 0 (meaning "no clipping") comes out as -1 - confirm this is intended.
  def to_bp
    hash = {}

    hash[:SEQ_NAME]           = self.name
    hash[:SEQ]                = self.bases
    hash[:SEQ_LEN]            = self.bases.length
    hash[:CLIP_QUAL_LEFT]     = self.clip_qual_left - 1
    hash[:CLIP_QUAL_RIGHT]    = self.clip_qual_right - 1
    hash[:CLIP_ADAPTOR_LEFT]  = self.clip_adapter_left
    hash[:CLIP_ADAPTOR_RIGHT] = self.clip_adaptor_right
    hash[:SCORES]             = self.quality_scores.map { |score| (score + 64).chr }.join ""

    hash
  end

  # Method that soft masks the sequence (i.e. lowercases sequence) according to
  # clip_qual_left and clip_qual_right information. Sequence length is unchanged.
  def mask
    keep = clip_range

    left   = self.bases[0 ... keep.begin].downcase
    middle = self.bases[keep]
    right  = self.bases[keep.end ... self.bases.length].downcase

    self.bases = left + middle + right
  end

  # Method that clips sequence (i.e. trims) according to
  # clip_qual_left and clip_qual_right information.
  # Both bases and quality_scores are trimmed.
  def clip
    keep = clip_range

    self.bases          = self.bases[keep]
    self.quality_scores = self.quality_scores[keep]
  end

  private

  # Method that returns the 0-based, end-exclusive range of bases to keep.
  # The clip values are 1-based and inclusive and a value of 0 means
  # "no clipping" on that side. The range is clamped to the sequence so
  # slicing never returns nil.
  def clip_range
    length = self.bases.length

    first = [[self.clip_qual_left, 1].max - 1, length].min
    last  = self.clip_qual_right.zero? ? length : [self.clip_qual_right, length].min
    last  = first if last < first # inverted window: nothing is kept

    first ... last
  end
end