]> git.donarmstrong.com Git - biopieces.git/commitdiff
added sam.rb to ruby trunk
authormartinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Fri, 17 Jun 2011 13:47:30 +0000 (13:47 +0000)
committermartinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Fri, 17 Jun 2011 13:47:30 +0000 (13:47 +0000)
git-svn-id: http://biopieces.googlecode.com/svn/trunk@1479 74ccb610-7750-0410-82ae-013aeee3265d

code_ruby/lib/maasha/sam.rb [new file with mode: 0644]

diff --git a/code_ruby/lib/maasha/sam.rb b/code_ruby/lib/maasha/sam.rb
new file mode 100644 (file)
index 0000000..b45f039
--- /dev/null
@@ -0,0 +1,217 @@
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# This software is part of the Biopieces framework (www.biopieces.org).
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# SAM format version v1.4-r962 - April 17, 2011
+#
+# http://samtools.sourceforge.net/SAM1.pdf
+
+require 'pp'
+require 'maasha/filesys'
+require 'maasha/seq'
+
+# Error class for all exceptions to do with Genbank.
+class SamError < StandardError; end
+
+REGEX_HEADER  = Regexp.new(/^@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/)
+REGEX_COMMENT = Regexp.new(/^@CO\t.*/)
+
+class Sam < Filesys
+  attr_accessor :io
+
+  def initialize(io = nil)
+    @io          = io
+    @header_hash = {}
+  end
+
+  # Each header line should match: /^@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ or /^@CO\t.*/.
+  # Tags containing lowercase letters are reserved for end users.
+  def header
+    @io.each_line do |line|
+      line.chomp!
+
+      if line =~ /^@([A-Za-z][A-Za-z])/
+        tag = $1
+
+        case tag
+        when 'HD' then parse_header(line)
+        when 'SQ' then parse_sequence(line)
+        when 'RG' then parse_read_group(line)
+        when 'PG' then parse_program(line)
+        when 'CO' then parse_comment(line)
+        else
+          raise SamError, "Unknown header tag: #{tag}"
+        end
+      else
+        @io.rewind
+        break
+      end
+    end
+
+    return @header_hash.empty? ? nil : @header_hash
+  end
+
+  private
+
+  def parse_header(line)
+    hash   = {}
+    fields = line.split("\t")
+
+    if fields[1] =~ /^VN:([0-9]+\.[0-9]+)$/
+      hash[:VN] = $1.to_f
+    else
+      raise SamError, "Bad version number: #{fields[1]}"
+    end
+
+    if fields.size > 2
+      if fields[2] =~ /^SO:(unknown|unsorted|queryname|coordinate)$/
+        hash[:SO] = $1
+      else
+        raise SamError, "Bad sort order: #{fields[2]}"
+      end
+    end
+
+    @header_hash[:HD] = hash
+  end
+
+  def parse_sequence(line)
+    @header_hash[:SQ] = Hash.new unless @header_hash[:SQ].is_a? Hash
+    hash = {}
+
+    fields = line.split("\t")
+
+    if fields[1] =~ /^SN:([!-)+-<>-~][!-~]*)$/
+      seq_name = $1.to_sym
+    else
+      raise SamError, "Bad sequence name: #{fields[1]}"
+    end
+
+    if fields[2] =~ /^LN:(\d+)$/
+      hash[:LN] = $1.to_i
+    else
+      raise SamError, "Bad sequence length: #{fields[2]}"
+    end
+
+    (3 ... fields.size).each do |i|
+      if fields[i] =~ /^(AS|M5|SP|UR):([ -~]+)$/
+        hash[$1.to_sym] = $2
+      else
+        raise SamError, "Bad sequence tag: #{fields[i]}"
+      end
+    end
+
+    @header_hash[:SQ][:SN] = Hash.new unless @header_hash[:SQ][:SN].is_a? Hash
+
+    if @header_hash[:SQ][:SN].has_key? seq_name
+      raise SamError, "Non-unique sequence name: #{seq_name}"
+    else
+      @header_hash[:SQ][:SN][seq_name] = hash
+    end
+  end
+
+  def parse_read_group(line)
+    @header_hash[:RG] = Hash.new unless @header_hash[:RG].is_a? Hash
+    hash = {}
+
+    fields = line.split("\t")
+
+    if fields[1] =~ /^ID:([ -~]+)$/
+      id = $1.to_sym
+    else
+      raise SamError, "Bad read group identifier: #{fields[1]}"
+    end
+
+    (2 ... fields.size).each do |i|
+      if fields[i] =~ /^(CN|DS|DT|FO|KS|LB|PG|PI|PL|PU|SM):([ -~]+)$/
+        hash[$1.to_sym] = $2
+      else
+        raise SamError, "Bad read group tag: #{fields[i]}"
+      end
+    end
+
+    if hash.has_key? :FO
+      unless hash[:FO] =~ /^\*|[ACMGRSVTWYHKDBN]+$/
+        raise SamError, "Bad flow order: #{hash[:FO]}"
+      end
+    end
+
+    if hash.has_key? :PL
+      unless hash[:PL] =~ /^(CAPILLARY|LS454|ILLUMINA|SOLID|HELICOS|IONTORRENT|PACBIO)$/
+        raise SamError, "Bad platform: #{hash[:PL]}"
+      end
+    end
+
+    @header_hash[:RG][:ID] = Hash.new unless @header_hash[:RG][:ID].is_a? Hash
+
+    if @header_hash[:RG][:ID].has_key? id
+      raise SamError, "Non-unique read group identifier: #{id}"
+    else
+      @header_hash[:RG][:ID][id] = hash
+    end
+  end
+
+  def parse_program(line)
+    @header_hash[:PG] = Hash.new unless @header_hash[:PG].is_a? Hash
+    hash = {}
+
+    fields = line.split("\t")
+
+    if fields[1] =~ /^ID:([ -~]+)$/
+      id = $1.to_sym
+    else
+      raise SamError, "Bad program record identifier: #{fields[1]}"
+    end
+
+    (2 ... fields.size).each do |i|
+      if fields[i] =~ /^(PN|CL|PP|VN):([ -~]+)$/
+        hash[$1.to_sym] = $2
+      else
+        raise SamError, "Bad program record tag: #{fields[i]}"
+      end
+    end
+
+    @header_hash[:PG][:ID] = Hash.new unless @header_hash[:PG][:ID].is_a? Hash
+
+    if @header_hash[:PG][:ID].has_key? id
+      raise SamError, "Non-unique program record identifier: #{id}"
+    else
+      @header_hash[:PG][:ID][id] = hash
+    end
+  end
+
+  def parse_comment(line)
+    @header_hash[:CO] = Array.new unless @header_hash[:CO].is_a? Array
+
+    if line =~ /^@CO\t(.+)/
+      @header_hash[:CO] << $1
+    else
+      raise SamError, "Bad comment line: #{line}"
+    end
+  end
+
+  def get_entry
+  end
+end
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+__END__
+