From c6954bdfd701288e274404e0f14957805b862030 Mon Sep 17 00:00:00 2001 From: martinahansen Date: Fri, 17 Jun 2011 13:47:30 +0000 Subject: [PATCH] added sam.rb to ruby trunk git-svn-id: http://biopieces.googlecode.com/svn/trunk@1479 74ccb610-7750-0410-82ae-013aeee3265d --- code_ruby/lib/maasha/sam.rb | 217 ++++++++++++++++++++++++++++++++++++ 1 file changed, 217 insertions(+) create mode 100644 code_ruby/lib/maasha/sam.rb diff --git a/code_ruby/lib/maasha/sam.rb b/code_ruby/lib/maasha/sam.rb new file mode 100644 index 0000000..b45f039 --- /dev/null +++ b/code_ruby/lib/maasha/sam.rb @@ -0,0 +1,217 @@ +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# This software is part of the Biopieces framework (www.biopieces.org). + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# SAM format version v1.4-r962 - April 17, 2011 +# +# http://samtools.sourceforge.net/SAM1.pdf + +require 'pp' +require 'maasha/filesys' +require 'maasha/seq' + +# Error class for all exceptions to do with Genbank. +class SamError < StandardError; end + +REGEX_HEADER = Regexp.new(/^@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/) +REGEX_COMMENT = Regexp.new(/^@CO\t.*/) + +class Sam < Filesys + attr_accessor :io + + def initialize(io = nil) + @io = io + @header_hash = {} + end + + # Each header line should match: /^@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ or /^@CO\t.*/. + # Tags containing lowercase letters are reserved for end users. + def header + @io.each_line do |line| + line.chomp! + + if line =~ /^@([A-Za-z][A-Za-z])/ + tag = $1 + + case tag + when 'HD' then parse_header(line) + when 'SQ' then parse_sequence(line) + when 'RG' then parse_read_group(line) + when 'PG' then parse_program(line) + when 'CO' then parse_comment(line) + else + raise SamError, "Unknown header tag: #{tag}" + end + else + @io.rewind + break + end + end + + return @header_hash.empty? ? nil : @header_hash + end + + private + + def parse_header(line) + hash = {} + fields = line.split("\t") + + if fields[1] =~ /^VN:([0-9]+\.[0-9]+)$/ + hash[:VN] = $1.to_f + else + raise SamError, "Bad version number: #{fields[1]}" + end + + if fields.size > 2 + if fields[2] =~ /^SO:(unknown|unsorted|queryname|coordinate)$/ + hash[:SO] = $1 + else + raise SamError, "Bad sort order: #{fields[2]}" + end + end + + @header_hash[:HD] = hash + end + + def parse_sequence(line) + @header_hash[:SQ] = Hash.new unless @header_hash[:SQ].is_a? Hash + hash = {} + + fields = line.split("\t") + + if fields[1] =~ /^SN:([!-)+-<>-~][!-~]*)$/ + seq_name = $1.to_sym + else + raise SamError, "Bad sequence name: #{fields[1]}" + end + + if fields[2] =~ /^LN:(\d+)$/ + hash[:LN] = $1.to_i + else + raise SamError, "Bad sequence length: #{fields[2]}" + end + + (3 ... fields.size).each do |i| + if fields[i] =~ /^(AS|M5|SP|UR):([ -~]+)$/ + hash[$1.to_sym] = $2 + else + raise SamError, "Bad sequence tag: #{fields[i]}" + end + end + + @header_hash[:SQ][:SN] = Hash.new unless @header_hash[:SQ][:SN].is_a? Hash + + if @header_hash[:SQ][:SN].has_key? seq_name + raise SamError, "Non-unique sequence name: #{seq_name}" + else + @header_hash[:SQ][:SN][seq_name] = hash + end + end + + def parse_read_group(line) + @header_hash[:RG] = Hash.new unless @header_hash[:RG].is_a? Hash + hash = {} + + fields = line.split("\t") + + if fields[1] =~ /^ID:([ -~]+)$/ + id = $1.to_sym + else + raise SamError, "Bad read group identifier: #{fields[1]}" + end + + (2 ... fields.size).each do |i| + if fields[i] =~ /^(CN|DS|DT|FO|KS|LB|PG|PI|PL|PU|SM):([ -~]+)$/ + hash[$1.to_sym] = $2 + else + raise SamError, "Bad read group tag: #{fields[i]}" + end + end + + if hash.has_key? :FO + unless hash[:FO] =~ /^\*|[ACMGRSVTWYHKDBN]+$/ + raise SamError, "Bad flow order: #{hash[:FO]}" + end + end + + if hash.has_key? :PL + unless hash[:PL] =~ /^(CAPILLARY|LS454|ILLUMINA|SOLID|HELICOS|IONTORRENT|PACBIO)$/ + raise SamError, "Bad platform: #{hash[:PL]}" + end + end + + @header_hash[:RG][:ID] = Hash.new unless @header_hash[:RG][:ID].is_a? Hash + + if @header_hash[:RG][:ID].has_key? id + raise SamError, "Non-unique read group identifier: #{id}" + else + @header_hash[:RG][:ID][id] = hash + end + end + + def parse_program(line) + @header_hash[:PG] = Hash.new unless @header_hash[:PG].is_a? Hash + hash = {} + + fields = line.split("\t") + + if fields[1] =~ /^ID:([ -~]+)$/ + id = $1.to_sym + else + raise SamError, "Bad program record identifier: #{fields[1]}" + end + + (2 ... fields.size).each do |i| + if fields[i] =~ /^(PN|CL|PP|VN):([ -~]+)$/ + hash[$1.to_sym] = $2 + else + raise SamError, "Bad program record tag: #{fields[i]}" + end + end + + @header_hash[:PG][:ID] = Hash.new unless @header_hash[:PG][:ID].is_a? Hash + + if @header_hash[:PG][:ID].has_key? id + raise SamError, "Non-unique program record identifier: #{id}" + else + @header_hash[:PG][:ID][id] = hash + end + end + + def parse_comment(line) + @header_hash[:CO] = Array.new unless @header_hash[:CO].is_a? Array + + if line =~ /^@CO\t(.+)/ + @header_hash[:CO] << $1 + else + raise SamError, "Bad comment line: #{line}" + end + end + + def get_entry + end +end + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +__END__ + -- 2.39.2