--- /dev/null
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# This software is part of the Biopieces framework (www.biopieces.org).
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# SAM format version v1.4-r962 - April 17, 2011
+#
+# http://samtools.sourceforge.net/SAM1.pdf
+
+require 'pp'
+require 'maasha/filesys'
+require 'maasha/seq'
+
+# Error class for all exceptions to do with Genbank.
+class SamError < StandardError; end
+
+REGEX_HEADER = Regexp.new(/^@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/)
+REGEX_COMMENT = Regexp.new(/^@CO\t.*/)
+
+class Sam < Filesys
+ attr_accessor :io
+
+ def initialize(io = nil)
+ @io = io
+ @header_hash = {}
+ end
+
+ # Each header line should match: /^@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ or /^@CO\t.*/.
+ # Tags containing lowercase letters are reserved for end users.
+ def header
+ @io.each_line do |line|
+ line.chomp!
+
+ if line =~ /^@([A-Za-z][A-Za-z])/
+ tag = $1
+
+ case tag
+ when 'HD' then parse_header(line)
+ when 'SQ' then parse_sequence(line)
+ when 'RG' then parse_read_group(line)
+ when 'PG' then parse_program(line)
+ when 'CO' then parse_comment(line)
+ else
+ raise SamError, "Unknown header tag: #{tag}"
+ end
+ else
+ @io.rewind
+ break
+ end
+ end
+
+ return @header_hash.empty? ? nil : @header_hash
+ end
+
+ private
+
+ def parse_header(line)
+ hash = {}
+ fields = line.split("\t")
+
+ if fields[1] =~ /^VN:([0-9]+\.[0-9]+)$/
+ hash[:VN] = $1.to_f
+ else
+ raise SamError, "Bad version number: #{fields[1]}"
+ end
+
+ if fields.size > 2
+ if fields[2] =~ /^SO:(unknown|unsorted|queryname|coordinate)$/
+ hash[:SO] = $1
+ else
+ raise SamError, "Bad sort order: #{fields[2]}"
+ end
+ end
+
+ @header_hash[:HD] = hash
+ end
+
+ def parse_sequence(line)
+ @header_hash[:SQ] = Hash.new unless @header_hash[:SQ].is_a? Hash
+ hash = {}
+
+ fields = line.split("\t")
+
+ if fields[1] =~ /^SN:([!-)+-<>-~][!-~]*)$/
+ seq_name = $1.to_sym
+ else
+ raise SamError, "Bad sequence name: #{fields[1]}"
+ end
+
+ if fields[2] =~ /^LN:(\d+)$/
+ hash[:LN] = $1.to_i
+ else
+ raise SamError, "Bad sequence length: #{fields[2]}"
+ end
+
+ (3 ... fields.size).each do |i|
+ if fields[i] =~ /^(AS|M5|SP|UR):([ -~]+)$/
+ hash[$1.to_sym] = $2
+ else
+ raise SamError, "Bad sequence tag: #{fields[i]}"
+ end
+ end
+
+ @header_hash[:SQ][:SN] = Hash.new unless @header_hash[:SQ][:SN].is_a? Hash
+
+ if @header_hash[:SQ][:SN].has_key? seq_name
+ raise SamError, "Non-unique sequence name: #{seq_name}"
+ else
+ @header_hash[:SQ][:SN][seq_name] = hash
+ end
+ end
+
+ def parse_read_group(line)
+ @header_hash[:RG] = Hash.new unless @header_hash[:RG].is_a? Hash
+ hash = {}
+
+ fields = line.split("\t")
+
+ if fields[1] =~ /^ID:([ -~]+)$/
+ id = $1.to_sym
+ else
+ raise SamError, "Bad read group identifier: #{fields[1]}"
+ end
+
+ (2 ... fields.size).each do |i|
+ if fields[i] =~ /^(CN|DS|DT|FO|KS|LB|PG|PI|PL|PU|SM):([ -~]+)$/
+ hash[$1.to_sym] = $2
+ else
+ raise SamError, "Bad read group tag: #{fields[i]}"
+ end
+ end
+
+ if hash.has_key? :FO
+ unless hash[:FO] =~ /^\*|[ACMGRSVTWYHKDBN]+$/
+ raise SamError, "Bad flow order: #{hash[:FO]}"
+ end
+ end
+
+ if hash.has_key? :PL
+ unless hash[:PL] =~ /^(CAPILLARY|LS454|ILLUMINA|SOLID|HELICOS|IONTORRENT|PACBIO)$/
+ raise SamError, "Bad platform: #{hash[:PL]}"
+ end
+ end
+
+ @header_hash[:RG][:ID] = Hash.new unless @header_hash[:RG][:ID].is_a? Hash
+
+ if @header_hash[:RG][:ID].has_key? id
+ raise SamError, "Non-unique read group identifier: #{id}"
+ else
+ @header_hash[:RG][:ID][id] = hash
+ end
+ end
+
+ def parse_program(line)
+ @header_hash[:PG] = Hash.new unless @header_hash[:PG].is_a? Hash
+ hash = {}
+
+ fields = line.split("\t")
+
+ if fields[1] =~ /^ID:([ -~]+)$/
+ id = $1.to_sym
+ else
+ raise SamError, "Bad program record identifier: #{fields[1]}"
+ end
+
+ (2 ... fields.size).each do |i|
+ if fields[i] =~ /^(PN|CL|PP|VN):([ -~]+)$/
+ hash[$1.to_sym] = $2
+ else
+ raise SamError, "Bad program record tag: #{fields[i]}"
+ end
+ end
+
+ @header_hash[:PG][:ID] = Hash.new unless @header_hash[:PG][:ID].is_a? Hash
+
+ if @header_hash[:PG][:ID].has_key? id
+ raise SamError, "Non-unique program record identifier: #{id}"
+ else
+ @header_hash[:PG][:ID][id] = hash
+ end
+ end
+
+ def parse_comment(line)
+ @header_hash[:CO] = Array.new unless @header_hash[:CO].is_a? Array
+
+ if line =~ /^@CO\t(.+)/
+ @header_hash[:CO] << $1
+ else
+ raise SamError, "Bad comment line: #{line}"
+ end
+ end
+
+ def get_entry
+ end
+end
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+__END__
+