]> git.donarmstrong.com Git - biopieces.git/commitdiff
added read_sff biopiece
author martinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Tue, 8 Feb 2011 20:38:46 +0000 (20:38 +0000)
committer martinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Tue, 8 Feb 2011 20:38:46 +0000 (20:38 +0000)
git-svn-id: http://biopieces.googlecode.com/svn/trunk@1262 74ccb610-7750-0410-82ae-013aeee3265d

bp_bin/read_sff [new file with mode: 0755]
bp_test/in/read_sff.in [new file with mode: 0644]
bp_test/in/read_sff.in.gz [new file with mode: 0644]
bp_test/out/read_sff.out.1 [new file with mode: 0644]
bp_test/out/read_sff.out.2 [new file with mode: 0644]
bp_test/out/read_sff.out.3 [new file with mode: 0644]
bp_test/out/read_sff.out.4 [new file with mode: 0644]
bp_test/test/test_read_sff [new file with mode: 0755]
code_ruby/Maasha/lib/sff.rb [new file with mode: 0644]

diff --git a/bp_bin/read_sff b/bp_bin/read_sff
new file mode 100755 (executable)
index 0000000..a14048e
--- /dev/null
@@ -0,0 +1,75 @@
+#!/usr/bin/env ruby
+
+# Copyright (C) 2007-2010 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# This program is part of the Biopieces framework (www.biopieces.org).
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# Read SFF entries from one or more files.
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+require 'biopieces'
+require 'sff'
+
+# Option definitions handed to Biopieces#parse together with ARGV.
+casts = []
+casts << {:long=>'data_in', :short=>'i', :type=>'files!', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'num',     :short=>'n', :type=>'uint',   :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>'0'}
+casts << {:long=>'mask',    :short=>'m', :type=>'flag',   :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'clip',    :short=>'c', :type=>'flag',   :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+
+bp = Biopieces.new
+
+options = bp.parse(ARGV, casts)
+
+# Emit every record yielded by bp.each_record before any SFF entries are added.
+bp.each_record do |record|
+  bp.puts record
+end
+
+num  = 0      # SFF entries emitted so far, counted across all input files
+last = false  # set once the :num limit is reached so the file loop stops too
+
+if options.has_key? :data_in
+  options[:data_in].each do |file|
+    # SFF is a binary format, so the file is opened in binary read mode.
+    SFF.open(file, 'rb') do |sff|
+      sff.each do |entry|
+        entry.mask if options[:mask]
+        entry.clip if options[:clip]
+        bp.puts entry.to_bp
+        num += 1
+
+        if options.has_key?(:num) && options[:num] == num
+          last = true
+          break
+        end
+      end
+    end
+
+    break if last
+  end
+end
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+__END__
diff --git a/bp_test/in/read_sff.in b/bp_test/in/read_sff.in
new file mode 100644 (file)
index 0000000..59f742e
Binary files /dev/null and b/bp_test/in/read_sff.in differ
diff --git a/bp_test/in/read_sff.in.gz b/bp_test/in/read_sff.in.gz
new file mode 100644 (file)
index 0000000..31e4540
Binary files /dev/null and b/bp_test/in/read_sff.in.gz differ
diff --git a/bp_test/out/read_sff.out.1 b/bp_test/out/read_sff.out.1
new file mode 100644 (file)
index 0000000..299cfe3
--- /dev/null
@@ -0,0 +1,90 @@
+SEQ_NAME: FQIBXOY01DRIMT
+SEQ: TCAGTCATATTTTTTAGAAACATGTTTGTTTGGACTCATTAATTCATGATTAAAATCACCATCATTCGTTATCAATAAAAGCCCTTCTGTATCTTTATCAAGACGACCAACCGGAAAAATATTTAGATGTTGGTATTCAGGTATTAAATCAATAACGGTTTTTGAATGATGATCTTCAGTTGCTGATATATAACCTTTTGGCTTATTTAACATAATATAGACATTTTCAATGTATTCTATTAATTCTCCACGAACTGTTATCTTATCGTTTTCTGGTTC
+SEQ_LEN: 279
+CLIP_QUAL_LEFT: 4
+CLIP_QUAL_RIGHT: 277
+CLIP_ADAPTOR_LEFT: 0
+CLIP_ADAPTOR_RIGHT: 0
+SCORES: aaa`[[[_[NNNNNNTUP[[__`abbcccddddeeeeeeedddcdcccc``bbbbaaaba_`````bbbbba````____\\\\``_aabbbbbbbba````_WVV\\bbbb``\\`^_bXbbb_`_``bbbbbaaabbb___aabbaaa[[UUUbbbZZZZZabbbbbcaa[[UUU[[[[aabbbac____aPPNNNNPP]PPPWWabaaaabbbbbbbaaabbbbbaaabbbXXX__XXXXXXXXXXUYYUUUY[UUUYUUUUYXXMMMMRKMMMMM
+---
+SEQ_NAME: FQIBXOY01AV4UR
+SEQ: TCAGCTTGAGCAAATTCTTTATCTTTAAAATTAAACATTTTGTTGAAATTACTGTATCTTTAAAACTTAGATTCAATCGCTTCTTTTATTCTCTTCTGATGACACTCCTACTTGATTCGCAATAACTCAATCCAAACGACCAACCAATGTCAGCTAATTCATCAAGTTGTACGTCTAACGGCTTACCTGGTTTCTTCTTCCAGTTCTTGAACGTTTCCTAATGTGTTAAACCAACTTCTAAAGAAATTCAACCACATACGCAATCTTGCTATCTCGTAAATTTAAGTTG
+SEQ_LEN: 289
+CLIP_QUAL_LEFT: 4
+CLIP_QUAL_RIGHT: 69
+CLIP_ADAPTOR_LEFT: 0
+CLIP_ADAPTOR_RIGHT: 0
+SCORES: ```````````UUU\\`[[[]Wa]XXMMMMKKIIIRKMMMMRIIYY[[QQNSQ[[^^\]]][[XXRNNMXYZ\ZZZ[[XWXX[[[\[^JJRW\\WVVVV\\ZUUUUUUOOOMMMMVZSSRSSPPPPYXXXWVVWMMMSKMMMMMMMMMSVVVXXXSUOOOO\\UUUSOOOUUWWURSSUSMMMMMRRMMMUUMLIIRPTRRLLLLOOSRRRWWUUUOOMOOOSTTTTQQQKKIIKKKQKKKKKKMMKKKKKKKKKRUUUUUUVVVVVUTTUUURRPOKKKMMMKHKHHR
+---
+SEQ_NAME: FQIBXOY01CU7IT
+SEQ: TCAGTTCGTAAAAGTGTGATAGATGATGGCAGATGTTATCTCTGTCCGTGTCTAGGCTATCCAAGACAATGGCGTTCAGAAGATATTTACCAGGAAATAAATGAGACGATACAAATAATAGAAATTTAAAATGCGCAAACCTGACCCAGTTTGCGCATTTTATGTTTTACACACGCGAGTAATGTGTTTACTTACGTGTGTTTATTTTGTTGCTGATTTTCAATTGTATATGAATGTGGTTGCACATAAATGCACTTTC
+SEQ_LEN: 259
+CLIP_QUAL_LEFT: 4
+CLIP_QUAL_RIGHT: 257
+CLIP_ADAPTOR_LEFT: 0
+CLIP_ADAPTOR_RIGHT: 0
+SCORES: eeeeeeeeeeeeeeeeeeeeeehhhghffgghhhhgghhhhhhhgffhheeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeaaadeeeda]]]ddeeeeeeeeeeeeeeeeeedddedd]]]]dddeeeeeeeeeeeeeeeeedddeeedddddeeeeeeeeeeeeeeeeeeeeeedd\\\ddddeeeeeeeddddddddeeeeeeeeeeeedddddddedddddddddddaaadddddVVRY__]YPPMO
+---
+SEQ_NAME: FQIBXOY01CEZSI
+SEQ: TCAGAGGTTATGACGTTAAAGCTATTGATGGTCATTCGAACATAACAGAAGCAAGTTTGAAAAGTTCCAAAATATTTGTAATTCCTGAGGCTAACATTCCTTTCAAAGAATCAGAACAGGCAGCAATTGTTAAATATGTGAAACAAGGTGGCAATGTTGTCTTTATTTCAGATCATTACAATGCTGACCGAAATTTAAATCGTATTGATTCATCGGAGGCAATGAATGGTTATCGACGTGGAGCATATGAAGATATG
+SEQ_LEN: 257
+CLIP_QUAL_LEFT: 4
+CLIP_QUAL_RIGHT: 253
+CLIP_ADAPTOR_LEFT: 0
+CLIP_ADAPTOR_RIGHT: 0
+SCORES: eeeeeeeeeeeeeeeeedddddfhhfffffhfhhhhhhhhhhhhhhhhfeeeeeeddcccceeeeeeeeeeeeecccc```ddeeeedddddddeeddddeedd```eeeeeeeeeeeeeeeeeeeeeeeecccddeeee\\\deeeeeeeeeeeeeeeeeeeeeeeeeccceeeeeeeeeeeeeeeeed\\\ccceeeeeeeeeeeedddeeeeeeeeedddddddeefeddddddddddaaaddddd____^YYY
+---
+SEQ_NAME: FQIBXOY01C4ETH
+SEQ: TCAGTGAAACAAACACGCAACAATCATTTGCTAATTGTAAGCAACTTAGACAAGTATATCCGAATGGTGTCACTGCCGATCATCCAGCATATCGACCACATTTAGATAGAGATAAAGATAAACGTGCATGTGAACCTGATAAATATTAAACAACAAGCGAATTGAATTCAAATTGTATTTAGCTTTATGCACTAATCACATAGTAAATAATGAGGGAGATTTTTTAGGCATGAGCAATCAATTCAAAAG
+SEQ_LEN: 249
+CLIP_QUAL_LEFT: 4
+CLIP_QUAL_RIGHT: 248
+CLIP_ADAPTOR_LEFT: 0
+CLIP_ADAPTOR_RIGHT: 0
+SCORES: eeeeeeeeeeeeeeeeeeeeeehgffaaahheggeddddddddegggfgeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeddddeeeeeeeeeeeeeeeeeedddeeedddddeeeeeeeeeeeeeeeeeeeeeeedd\\\adeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee`_____aaaddaaadddddd[[[[dZZZZ^
+---
+SEQ_NAME: FQIBXOY01BXD7A
+SEQ: TCAGAAAAGCTTTATAATTTTATAATTGCTAAATCTTTTCAACAACCAGTTGGAAGTACGTTCACTTATGGTGAATTAAGAAAGAAATATAATGTGGTTTGTAGCACGAATGATCAACGTGAAGTAGGACGTCGTTTTGCTTACTGGATTAAGTACACGCCAGGATTACCATTTAAAATGTAGGAACAAAAAATGGCAGTTATTATATCAGAAATAGGGATAAACCCATGTCAATAATAGCACGCCTTCGAAAGGAGGTGATTGCTAATGAGTTACACNTTAGTTTG
+SEQ_LEN: 287
+CLIP_QUAL_LEFT: 4
+CLIP_QUAL_RIGHT: 177
+CLIP_ADAPTOR_LEFT: 0
+CLIP_ADAPTOR_RIGHT: 0
+SCORES: ]]]XRRRR^^[[[^[^^^^^^[PTRWW```^^^WYW^^_^^]\XXVWWYXXWWXX\a\^^[[[\^^^^\]]\\XXXXVVXPPPVUWW]Z\\\^^^^^`^^WUW````^^^``__^^^^`^^[[[_\\\\\\\UPPPPPMVXXZXQUUVVVZZ^\XUUUUUUYZUOOMMMWWUUUMMMMJMMWRRMMMNNNNNNRRWURLLLRPPRTUUULLLLLRMUUURQOSSXTTMOOOWWWWWTTOQQQKKKQKKKKKKKKKKOKKKPKKKKPRROPKKKPRPOO@KKPPLKKK
+---
+SEQ_NAME: FQIBXOY01BWE7M
+SEQ: TCAGTATGATGACGGCTAATGATGATGTAGAGGCGCCGAGTGACTTTGAAAAAATCAGAGCTGAAGTTTCATGGTAATAGATATTATCATTTTTGAATTAATTATATTAATGTGTTTAGCAATAGCACTGGAGGTGTTGTAAATATGTGGATTGTCATTTCAATTGTTTTATCTATATTTTTATTGATCTTGTTAAGTAGCATTTCTCATAAGATGAAAACCATAGAAGCATTGGAGTATATGAATG
+SEQ_LEN: 247
+CLIP_QUAL_LEFT: 4
+CLIP_QUAL_RIGHT: 242
+CLIP_ADAPTOR_LEFT: 0
+CLIP_ADAPTOR_RIGHT: 0
+SCORES: eeeeeeeeeeeeedddeeeeeehhhhhhhhhhhhhhhhhhhhhhhhff__ff__eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeedddeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeaafddddaaadddddddeddddddddd
+---
+SEQ_NAME: FQIBXOY01A2RM8
+SEQ: TCAGAAACTAAAAAACTTAAAAAAGCATGCCAATCAGTACATCATAATTGCGTCTTGGGGACAGACAAATGATGAATAGAGATTGGCATGCTTTTTATTTTTGAATATAAATATTTAGTTCATGGCATTTCTAGTTACATGACGTCCATGAATTAAGAAGTAAACAAGCATAGTAATGATTGCTAAAGCGGCCATAAAGCCGAAGATTTCACTATATGAAAACATATGAGTAAATAACCCAAGGAATGATGGACCGAAGCCGAC
+SEQ_LEN: 264
+CLIP_QUAL_LEFT: 4
+CLIP_QUAL_RIGHT: 261
+CLIP_ADAPTOR_LEFT: 0
+CLIP_ADAPTOR_RIGHT: 0
+SCORES: ```^OOO[ULLLLLLS[[IIIIII[[`````bbbccddcccddddddddbbbbbbbbbbb__^_^[TTT__`bbbb`^_^bbbbbbbbb``YY^^Y`JJJJJZZZ]]XXZ``[TTTZ``]bbb`___ZZZ_]bb````^__b_^_^^`````bbbbbbbbb```b`````bbbb``]]]bb`]YSRRR`````]QQQR]]]`]]]]`b``XXX]`XXXXYYVVMVSZZZXSNNNNKKNNVXXSSSV][VPPPYYYYYYYYYYYY
+---
+SEQ_NAME: FQIBXOY01API7E
+SEQ: TCAGCAATAGATATAATTTATGGTTTATATCTATTTCGGCATCTTTACCTTTCACTTGTTCAACTTATGTACCATAAATACTTCTGACAAGTTACTAATTAAACATGCAACCTCTAACTCAATTTAATATTTTAACTAACTTGTAATATACAGGATTCATCACGCATAATCAACCCTGTAAAACTTGATACGCAATAAAAGTTTTAAAGCATTTTATTGCGACAACTGTCTATCTATGTTTTTTCAAACGAATTTCATCAACTAGATTCCAGATAAATTC
+SEQ_LEN: 280
+CLIP_QUAL_LEFT: 4
+CLIP_QUAL_RIGHT: 278
+CLIP_ADAPTOR_LEFT: 0
+CLIP_ADAPTOR_RIGHT: 0
+SCORES: eeeeeeeeeeeeeeeeeeeeeeggfffffhffffffffffefeeeefeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeddddddeeeeeeeeeeeeeeeeedddeeeeeeeeee``\\\``eeddddeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeccccdddeeeeeedccccdeeeee```ceeee````WWceeeeeccccceedddddddddddddda\[[[^]]]adaaad_____________^^^_\\\\^___\\\\\\
+---
+SEQ_NAME: FQIBXOY01BJJRF
+SEQ: TCAGTAACTATCAAATAAAATGATAACGGTTTCATCTATCTATTTTATCGGTCTAGTGGCTGATTTCAAGCTAGAAATATTGAATGACAATACAACTCTGTTAAAATGATGGACGTAGACAAATATGCGTATTGACGCTTTATTTTAAAAATTAACATGCTTATAACATGTTTATAGAAGGAGATTAACCTATGAACTATCAAGTTCTTTTATATTATAAATATATGACGATTGATGACCCTGAACAGTTTGCTCAGGATCAC
+SEQ_LEN: 263
+CLIP_QUAL_LEFT: 4
+CLIP_QUAL_RIGHT: 260
+CLIP_ADAPTOR_LEFT: 0
+CLIP_ADAPTOR_RIGHT: 0
+SCORES: cccc```ccbbb___bXXXXbbcccegeeddffffffedddecdddccccbbbccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccbbbccccccccbbbbccccccccccccccccccccccccccccccccccccccVVVVVVaccbbbcccccccccccccccccccccccccccccccccccaaaaccccccccccccaWRR\YVVVVVVYYYY
+---
diff --git a/bp_test/out/read_sff.out.2 b/bp_test/out/read_sff.out.2
new file mode 100644 (file)
index 0000000..7591dc9
--- /dev/null
@@ -0,0 +1,9 @@
+SEQ_NAME: FQIBXOY01DRIMT
+SEQ: TCAGTCATATTTTTTAGAAACATGTTTGTTTGGACTCATTAATTCATGATTAAAATCACCATCATTCGTTATCAATAAAAGCCCTTCTGTATCTTTATCAAGACGACCAACCGGAAAAATATTTAGATGTTGGTATTCAGGTATTAAATCAATAACGGTTTTTGAATGATGATCTTCAGTTGCTGATATATAACCTTTTGGCTTATTTAACATAATATAGACATTTTCAATGTATTCTATTAATTCTCCACGAACTGTTATCTTATCGTTTTCTGGTTC
+SEQ_LEN: 279
+CLIP_QUAL_LEFT: 4
+CLIP_QUAL_RIGHT: 277
+CLIP_ADAPTOR_LEFT: 0
+CLIP_ADAPTOR_RIGHT: 0
+SCORES: aaa`[[[_[NNNNNNTUP[[__`abbcccddddeeeeeeedddcdcccc``bbbbaaaba_`````bbbbba````____\\\\``_aabbbbbbbba````_WVV\\bbbb``\\`^_bXbbb_`_``bbbbbaaabbb___aabbaaa[[UUUbbbZZZZZabbbbbcaa[[UUU[[[[aabbbac____aPPNNNNPP]PPPWWabaaaabbbbbbbaaabbbbbaaabbbXXX__XXXXXXXXXXUYYUUUY[UUUYUUUUYXXMMMMRKMMMMM
+---
diff --git a/bp_test/out/read_sff.out.3 b/bp_test/out/read_sff.out.3
new file mode 100644 (file)
index 0000000..a8e3bcf
--- /dev/null
@@ -0,0 +1,9 @@
+SEQ_NAME: FQIBXOY01DRIMT
+SEQ: tcagTCATATTTTTTAGAAACATGTTTGTTTGGACTCATTAATTCATGATTAAAATCACCATCATTCGTTATCAATAAAAGCCCTTCTGTATCTTTATCAAGACGACCAACCGGAAAAATATTTAGATGTTGGTATTCAGGTATTAAATCAATAACGGTTTTTGAATGATGATCTTCAGTTGCTGATATATAACCTTTTGGCTTATTTAACATAATATAGACATTTTCAATGTATTCTATTAATTCTCCACGAACTGTTATCTTATCGTTTTCTGGTTc
+SEQ_LEN: 279
+CLIP_QUAL_LEFT: 4
+CLIP_QUAL_RIGHT: 277
+CLIP_ADAPTOR_LEFT: 0
+CLIP_ADAPTOR_RIGHT: 0
+SCORES: aaa`[[[_[NNNNNNTUP[[__`abbcccddddeeeeeeedddcdcccc``bbbbaaaba_`````bbbbba````____\\\\``_aabbbbbbbba````_WVV\\bbbb``\\`^_bXbbb_`_``bbbbbaaabbb___aabbaaa[[UUUbbbZZZZZabbbbbcaa[[UUU[[[[aabbbac____aPPNNNNPP]PPPWWabaaaabbbbbbbaaabbbbbaaabbbXXX__XXXXXXXXXXUYYUUUY[UUUYUUUUYXXMMMMRKMMMMM
+---
diff --git a/bp_test/out/read_sff.out.4 b/bp_test/out/read_sff.out.4
new file mode 100644 (file)
index 0000000..9809971
--- /dev/null
@@ -0,0 +1,9 @@
+SEQ_NAME: FQIBXOY01DRIMT
+SEQ: TCATATTTTTTAGAAACATGTTTGTTTGGACTCATTAATTCATGATTAAAATCACCATCATTCGTTATCAATAAAAGCCCTTCTGTATCTTTATCAAGACGACCAACCGGAAAAATATTTAGATGTTGGTATTCAGGTATTAAATCAATAACGGTTTTTGAATGATGATCTTCAGTTGCTGATATATAACCTTTTGGCTTATTTAACATAATATAGACATTTTCAATGTATTCTATTAATTCTCCACGAACTGTTATCTTATCGTTTTCTGGTT
+SEQ_LEN: 274
+CLIP_QUAL_LEFT: 4
+CLIP_QUAL_RIGHT: 277
+CLIP_ADAPTOR_LEFT: 0
+CLIP_ADAPTOR_RIGHT: 0
+SCORES: [[[_[NNNNNNTUP[[__`abbcccddddeeeeeeedddcdcccc``bbbbaaaba_`````bbbbba````____\\\\``_aabbbbbbbba````_WVV\\bbbb``\\`^_bXbbb_`_``bbbbbaaabbb___aabbaaa[[UUUbbbZZZZZabbbbbcaa[[UUU[[[[aabbbac____aPPNNNNPP]PPPWWabaaaabbbbbbbaaabbbbbaaabbbXXX__XXXXXXXXXXUYYUUUY[UUUYUUUUYXXMMMMRKMMMM
+---
diff --git a/bp_test/test/test_read_sff b/bp_test/test/test_read_sff
new file mode 100755 (executable)
index 0000000..3b05613
--- /dev/null
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+# Regression test for the read_sff biopiece: each case runs the biopiece on
+# the same input with a different option set and compares the result with a
+# stored expected output ($out.1 .. $out.4).
+# NOTE(review): run, assert_no_diff, clean and the variables $bp, $in, $out
+# and $tmp are not defined in this script; presumably they are provided by
+# the sourced test.sh -- confirm.
+
+source "$BP_DIR/bp_test/lib/test.sh"
+
+# Case 1: input file only, no further options.
+run "$bp -i $in -O $tmp"
+assert_no_diff $tmp $out.1
+clean
+
+# Case 2: -n 1 (the expected output holds a single entry).
+run "$bp -i $in -n 1 -O $tmp"
+assert_no_diff $tmp $out.2
+clean
+
+# Case 3: -n 1 with -m (mask flag).
+run "$bp -i $in -n 1 -m -O $tmp"
+assert_no_diff $tmp $out.3
+clean
+
+# Case 4: -n 1 with -c (clip flag).
+run "$bp -i $in -n 1 -c -O $tmp"
+assert_no_diff $tmp $out.4
+clean
diff --git a/code_ruby/Maasha/lib/sff.rb b/code_ruby/Maasha/lib/sff.rb
new file mode 100644 (file)
index 0000000..3e0fc37
--- /dev/null
@@ -0,0 +1,225 @@
+# Copyright (C) 2011 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# This software is part of the Biopieces framework (www.biopieces.org).
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# Exception type raised for every SFF related failure.
+class SFFError < StandardError
+end
+
+# Class containing methods to parse SFF files:
+# http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=show&f=formats&m=doc&s=format#sff
+#
+# An SFF object wraps an IO object. The header section is parsed when the
+# object is created, and the read entries are then parsed sequentially, one
+# per iteration of #each. All format problems are reported as SFFError.
+#
+# NOTE(review): @index_offset and @index_length are parsed but never used, so
+# reads are assumed to follow the header contiguously; an index block placed
+# between reads would not be skipped -- confirm this is acceptable.
+class SFF
+  include Enumerable
+
+  # Magic number identifying an SFF file (the bytes ".sff" as a 32 bit
+  # big-endian unsigned integer).
+  MAGIC_NUMBER = 779314790
+
+  # Number of bytes in the fixed-size part of the header section.
+  HEADER_FIXED_SIZE = 31
+
+  # Number of bytes in the fixed-size part of a read header.
+  READ_HEADER_FIXED_SIZE = 16
+
+  # Class method for opening SFF files. All arguments are passed on to
+  # File.open.
+  #
+  # With a block, the SFF object is yielded, the file is closed when the
+  # block finishes and the value of the block is returned. Without a block,
+  # the SFF object is returned and the caller must call #close.
+  #
+  # Raises SFFError if the header section is invalid; the file is closed
+  # before the error propagates.
+  def self.open(*args)
+    ios = File.open(*args)
+
+    begin
+      sff = self.new(ios)
+    rescue StandardError
+      ios.close   # do not leak the file handle when the header is rejected
+      raise
+    end
+
+    return sff unless block_given?
+
+    begin
+      yield sff
+    ensure
+      ios.close
+    end
+  end
+
+  # Method to initialize a SFF object along with
+  # instance variables pertaining to the SFF header
+  # section. The header section is read from io immediately.
+  #
+  # Raises SFFError if the header section is truncated or invalid.
+  def initialize(io)
+    @io                       = io
+    @magic_number             = 0
+    @version                  = ""
+    @index_offset             = 0
+    @index_length             = 0
+    @number_of_reads          = 0
+    @header_length            = 0
+    @key_length               = 0
+    @number_of_flows_per_read = 0
+    @flowgram_format_code     = 0
+    @flow_chars               = ""
+    @key_sequence             = ""
+    @eight_byte_padding       = 0
+
+    # Number of read entries parsed from this particular IO. This is kept per
+    # instance so that several SFF files can be read in the same process.
+    @reads_parsed             = 0
+
+    header_parse
+  end
+
+  # Method to close ios.
+  def close
+    @io.close
+  end
+
+  # Method to iterate over each SFF entry, yielding one Read object per
+  # entry. Returns an enumerator if no block is given, otherwise self.
+  def each
+    return enum_for(:each) unless block_given?
+
+    while (read = read_parse) do
+      yield read
+    end
+
+    self   # conventionally
+  end
+
+  private
+
+  # Method to parse the SFF file's header section
+  # and load the information into the instance variables.
+  #
+  # Raises SFFError if the header is truncated, or if the magic number,
+  # version or header length is wrong.
+  def header_parse
+    template     = "NC4N2NNnnnC"
+    bits_in_uint = 32
+
+    data = read_bytes(HEADER_FIXED_SIZE, "header").unpack(template)
+
+    @magic_number             = data[0]
+    @version                  = data[1 .. 4].join ""
+    @index_offset             = (data[5] << bits_in_uint) | data[6]   # 64 bit value read as two 32 bit halves
+    @index_length             = data[7]
+    @number_of_reads          = data[8]
+    @header_length            = data[9]
+    @key_length               = data[10]
+    @number_of_flows_per_read = data[11]
+    @flowgram_format_code     = data[12]
+
+    # Validate before the length fields above are used to read more data.
+    check_magic_number
+    check_version
+
+    @flow_chars               = read_bytes(@number_of_flows_per_read, "flow chars").unpack("A*").join ""
+    @key_sequence             = read_bytes(@key_length, "key sequence").unpack("A*").join ""
+
+    fast_forward
+
+    check_header_length
+  end
+
+  # Method that reads the eight_byte_padding field found at the end of the
+  # data section and fast forwards, i.e. move the file read pointer,
+  # so that the length of the section is divisible by 8.
+  def fast_forward
+    eight_byte_padding = 8 - (@io.pos % 8)
+
+    @io.read(eight_byte_padding) unless eight_byte_padding == 8
+  end
+
+  # Method that reads exactly length bytes from the IO and returns them.
+  # Raises SFFError naming the section if fewer bytes are available.
+  def read_bytes(length, section)
+    data = @io.read(length)
+
+    if data.nil? or data.length != length
+      raise SFFError, "Truncated SFF file: could not read #{length} bytes of #{section}"
+    end
+
+    data
+  end
+
+  # Method to parse a read section of a SFF file.
+  # Returns a Read object, or nil when all reads announced in the header
+  # have been parsed. Raises SFFError if the read section is truncated.
+  def read_parse
+    return nil if @reads_parsed >= @number_of_reads
+
+    template = "nnNnnnn"
+
+    read = Read.new
+
+    data = read_bytes(READ_HEADER_FIXED_SIZE, "read header").unpack(template)
+
+    read.read_header_length  = data[0]
+    read.name_length         = data[1]
+    read.number_of_bases     = data[2]
+    read.clip_qual_left      = data[3]
+    read.clip_qual_right     = data[4]
+    read.clip_adapter_left   = data[5]
+    read.clip_adaptor_right  = data[6]
+    read.name                = read_bytes(read.name_length, "read name").unpack("A*").join ""
+
+    fast_forward
+
+    read_bytes(2 * @number_of_flows_per_read, "flowgram values")   # skip through flowgram_values
+    read_bytes(read.number_of_bases, "flow index per base")        # skip through flow_index_per_base
+
+    # NB! Parsing of flowgram_values and flow_index_per_base is currently disabled since these are not needed.
+    # read.flowgram_values     = @io.read(2 * @number_of_flows_per_read).unpack("n*").map { |val| val = sprintf("%.2f", val * 0.01) }
+    # flow_index_per_base      = @io.read(read.number_of_bases).unpack("C*")
+    # (1 ... flow_index_per_base.length).each { |i| flow_index_per_base[i] += flow_index_per_base[i - 1] }
+    # read.flow_index_per_base = flow_index_per_base
+
+    read.bases               = read_bytes(read.number_of_bases, "bases").unpack("A*").join ""
+    read.quality_scores      = read_bytes(read.number_of_bases, "quality scores").unpack("C*")
+
+    fast_forward
+
+    @reads_parsed += 1
+
+    read
+  end
+
+  # Method to check the magic number of a SFF file.
+  # Raises an error if the magic number doesn't match.
+  def check_magic_number
+    raise SFFError, "Badly formatted SFF file." unless @magic_number == MAGIC_NUMBER
+  end
+
+  # Method to check the version number of a SFF file.
+  # Raises an error if the version doesn't match.
+  def check_version
+    raise SFFError, "Wrong version #{@version}" unless @version.to_i == 1
+  end
+
+  # Method to check the header length of a SFF file.
+  # Raises an error if the header length doesn't match
+  # the file position after reading the header section.
+  def check_header_length
+    raise SFFError, "Bad header length: #{@header_length}" unless @io.pos == @header_length
+  end
+end
+
+# Class containing data accessor methods for an SFF entry and methods
+# for manipulating this entry.
+#
+# The clip_* values are 1-based positions. In the SFF format a clip value of
+# 0 means that no clip point was computed, i.e. nothing is to be clipped at
+# that end.
+#
+# NB! The accessors clip_adapter_left and clip_adaptor_right are spelled
+# differently; both names are kept as they are since callers rely on them.
+class Read
+  attr_accessor :read_header_length, :name_length, :number_of_bases,
+    :clip_qual_left, :clip_qual_right, :clip_adapter_left, :clip_adaptor_right,
+    :name, :flowgram_values, :flow_index_per_base, :bases, :quality_scores
+
+  # Method that converts a Read object's data to a Biopiece record (a hash).
+  # CLIP_QUAL_LEFT and CLIP_QUAL_RIGHT are the stored values minus 1, and
+  # SCORES is the quality scores encoded as characters with an offset of 64.
+  def to_bp
+    hash = {}
+
+    hash[:SEQ_NAME]           = self.name
+    hash[:SEQ]                = self.bases
+    hash[:SEQ_LEN]            = self.bases.length
+    hash[:CLIP_QUAL_LEFT]     = self.clip_qual_left  - 1
+    hash[:CLIP_QUAL_RIGHT]    = self.clip_qual_right - 1
+    hash[:CLIP_ADAPTOR_LEFT]  = self.clip_adapter_left
+    hash[:CLIP_ADAPTOR_RIGHT] = self.clip_adaptor_right
+    hash[:SCORES]             = self.quality_scores.map { |score| (score + 64).chr }.join ""
+
+    hash
+  end
+
+  # Method that soft masks the sequence (i.e. lowercases sequence) according to
+  # clip_qual_left and clip_qual_right information. Bases outside the clip
+  # points are lowercased; the sequence length is unchanged.
+  # Returns the masked sequence.
+  def mask
+    first, stop = clip_range
+
+    left   = self.bases[0 ... first].downcase
+    middle = self.bases[first ... stop]
+    right  = self.bases[stop ... self.bases.length].downcase
+
+    self.bases = left + middle + right
+  end
+
+  # Method that clips sequence (i.e. trims) according to
+  # clip_qual_left and clip_qual_right information. The quality scores are
+  # trimmed to the same range. Returns the trimmed quality scores.
+  def clip
+    first, stop = clip_range
+
+    self.bases          = self.bases[first ... stop]
+    self.quality_scores = self.quality_scores[first ... stop]
+  end
+
+  private
+
+  # Method that returns the 0-based, end-exclusive range [first, stop] of the
+  # bases to keep. A clip value of 0 is treated as "no clipping" at that end,
+  # and both values are limited to the sequence length so that slicing never
+  # returns nil.
+  def clip_range
+    length = self.bases.length
+
+    stop  = self.clip_qual_right == 0 ? length : [self.clip_qual_right, length].min
+    first = [[self.clip_qual_left - 1, 0].max, stop].min
+
+    [first, stop]
+  end
+end
+
+__END__
+