1 # Copyright (C) 2007-2011 Martin A. Hansen.
3 # This program is free software; you can redistribute it and/or
4 # modify it under the terms of the GNU General Public License
5 # as published by the Free Software Foundation; either version 2
6 # of the License, or (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
13 # You should have received a copy of the GNU General Public License
14 # along with this program; if not, write to the Free Software
15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 # http://www.gnu.org/copyleft/gpl.html
19 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
21 # This software is part of the Biopieces framework (www.biopieces.org).
23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 # SAM format version v1.4-r962 - April 17, 2011
27 # http://samtools.sourceforge.net/SAM1.pdf
30 require 'maasha/filesys'
# Pattern for a generic header line: '@' plus a two-letter record type, followed
# by one or more tab-separated TAG:VALUE fields whose values are printable ASCII.
33 REGEX_HEADER = Regexp.new(/^@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/)
# Pattern for a comment header line: '@CO', a tab, then any text.
34 REGEX_COMMENT = Regexp.new(/^@CO\t.*/)
# Bit masks for the individual bits of the SAM FLAG field.
36 FLAG_MULTI = 0x1 # Template having multiple fragments in sequencing
37 FLAG_ALIGNED = 0x2 # Each fragment properly aligned according to the aligner
38 FLAG_UNMAPPED = 0x4 # Fragment unmapped
39 FLAG_NEXT_UNMAPPED = 0x8 # Next fragment in the template unmapped
40 FLAG_REVCOMP = 0x10 # SEQ being reverse complemented
41 FLAG_NEXT_REVCOMP = 0x20 # SEQ of the next fragment in the template being reversed
42 FLAG_FIRST = 0x40 # The first fragment in the template
43 FLAG_LAST = 0x80 # The last fragment in the template
44 FLAG_SECONDARY_ALIGNMENT = 0x100 # Secondary alignment
45 FLAG_QUALITY_FAIL = 0x200 # Not passing quality controls
46 FLAG_DUPLICATES = 0x400 # PCR or optical duplicate
48 # Error class for all exceptions to do with SAM parsing and validation.
49 class SamError < StandardError; end
51 # Class to parse and write SAM files.
53 attr_accessor :io, :header
55 # Class method to convert a SAM entry
56 # to a Biopiece record.
61 bp[:Q_ID] = sam[:QNAME]
62 bp[:STRAND] = sam[:FLAG].revcomp? ? '-' : '+'
63 bp[:S_ID] = sam[:RNAME]
64 bp[:S_BEG] = sam[:POS]
65 bp[:MAPQ] = sam[:MAPQ]
66 bp[:CIGAR] = sam[:CIGAR]
68 unless sam[:RNEXT] == '*'
69 bp[:Q_ID2] = sam[:RNEXT]
70 bp[:S_BEG2] = sam[:PNEXT]
71 bp[:TLEN] = sam[:TLEN]
74 bp[:SEQ] = sam[:SEQ].seq
76 unless sam[:SEQ].qual.nil?
77 bp[:SCORES] = sam[:SEQ].convert_phred2illumina!.qual
80 if sam.has_key? :NM and sam[:NM].to_i > 0
81 bp[:ALIGN] = self.align_descriptors(sam)
87 # Create align descriptors according to the KISS format description:
88 # http://code.google.com/p/biopieces/wiki/KissFormat
#
# Collects [position, descriptor] pairs from the entry's CIGAR string and MD
# tag and, on the last visible line, renders them as a comma-separated string
# of "position:descriptor" items sorted by position.
# NOTE(review): several lines of this method are not visible here; the comments
# below describe only the statements that are shown.
89 def self.align_descriptors(sam)
# Split the CIGAR string into [length, operation] pairs.
95 sam[:CIGAR].scan(/([0-9]+)([MIDNSHPX=])/).each do |len, op|
# NOTE(review): scan yields len as a String, and (0 ... "5") raises
# ArgumentError, so len must be converted to an Integer on a line not shown -
# confirm. With a single block parameter, each_with_index passes only the
# element to i, so it behaves like each here.
99 (0 ... len).each_with_index do |i|
100 nt = sam[:SEQ].seq[offset + i]
# "->nt" descriptor: a base present in the read with no counterpart on the
# other side of the arrow (presumably an insertion - confirm against the
# CIGAR operation test on a line not shown).
102 align << [offset + i, "->#{nt}"]
# Tokenize the MD tag into digit runs, '^'-prefixed letter runs and bare letter runs.
112 sam[:MD].scan(/\d+|\^[A-Z]+|[A-Z]+/).each do |m|
113 if m =~ /\d+/ # Matches
115 elsif m[0] == '^' # Deletions
# Emit one "nt>-" descriptor per letter following the '^'.
116 m[1 .. -1].each_char do |nt|
117 align << [offset, "#{nt}>-"]
# Base looked up in the read sequence at the offset reduced by the deletions
# count (both maintained on lines not shown).
124 nt2 = sam[:SEQ].seq[offset - deletions]
# "nt>nt2" descriptor pairing the MD letter with the read base.
126 align << [offset, "#{nt}>#{nt2}"]
# Order by position and format each pair as "position:descriptor".
133 align.sort_by { |a| a.first }.map { |k,v| "#{k}:#{v}" }.join(",")
136 # Method to initialize a Sam object.
137 def initialize(io = nil)
145 @io.each_line do |line|
146 unless line[0] == '@'
147 entry = parse_alignment(line.chomp)
149 yield entry if block_given?
156 # Method to parse the header section of a SAM file.
157 # Each header line should match:
158 # /^@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ or /^@CO\t.*/.
159 # Tags containing lowercase letters are reserved for end users.
161 @io.each_line do |line|
162 if line =~ /^@([A-Za-z][A-Za-z])/
168 when 'HD' then subparse_header(line)
169 when 'SQ' then subparse_sequence(line)
170 when 'RG' then subparse_read_group(line)
171 when 'PG' then subparse_program(line)
172 when 'CO' then subparse_comment(line)
174 raise SamError, "Unknown header tag: #{tag}"
182 return @header.empty? ? nil : @header
185 # Method to subparse header lines.
#
# Splits the line on tabs and validates the second field as a version
# (VN:<digits>.<digits>) and the third field as a sort order.
# Raises SamError when a checked field does not match.
# NOTE(review): the lines that store the captured values, and any guard that
# makes the SO field optional, are not visible here.
186 def subparse_header(line)
188 fields = line.split("\t")
# Version must be digits, a dot, digits.
190 if fields[1] =~ /^VN:([0-9]+\.[0-9]+)$/
193 raise SamError, "Bad version number: #{fields[1]}"
# Sort order must be one of the four listed keywords.
197 if fields[2] =~ /^SO:(unknown|unsorted|queryname|coordinate)$/
200 raise SamError, "Bad sort order: #{fields[2]}"
207 # Method to subparse sequence lines.
#
# Validates the SN field (second), the LN field (third) and any further
# AS/M5/SP/UR fields, then stores the collected hash in
# @header[:SQ][:SN] keyed by sequence name.
# Raises SamError on a malformed field or a repeated sequence name.
# NOTE(review): the assignments to seq_name and hash are on lines not visible here.
208 def subparse_sequence(line)
# Lazily create the container for sequence records.
209 @header[:SQ] = Hash.new unless @header[:SQ].is_a? Hash
212 fields = line.split("\t")
# Name: printable ASCII, where the first character may not be '*' or '='.
214 if fields[1] =~ /^SN:([!-)+-<>-~][!-~]*)$/
217 raise SamError, "Bad sequence name: #{fields[1]}"
# Length: decimal digits only.
220 if fields[2] =~ /^LN:(\d+)$/
223 raise SamError, "Bad sequence length: #{fields[2]}"
# Remaining fields must use one of the four accepted tags.
226 (3 ... fields.size).each do |i|
227 if fields[i] =~ /^(AS|M5|SP|UR):([ -~]+)$/
230 raise SamError, "Bad sequence tag: #{fields[i]}"
234 @header[:SQ][:SN] = Hash.new unless @header[:SQ][:SN].is_a? Hash
# Sequence names must be unique across all sequence lines.
236 if @header[:SQ][:SN].has_key? seq_name
237 raise SamError, "Non-unique sequence name: #{seq_name}"
239 @header[:SQ][:SN][seq_name] = hash
243 # Method to subparse read group lines.
#
# Validates the ID field (second) and any further read group tags, checks the
# FO and PL values, then stores the collected hash in @header[:RG][:ID]
# keyed by identifier.
# Raises SamError on a malformed field, bad flow order, bad platform or a
# repeated identifier.
# NOTE(review): the assignments to id and hash, and any guards around the FO/PL
# checks, are on lines not visible here.
244 def subparse_read_group(line)
# Lazily create the container for read group records.
245 @header[:RG] = Hash.new unless @header[:RG].is_a? Hash
248 fields = line.split("\t")
250 if fields[1] =~ /^ID:([ -~]+)$/
253 raise SamError, "Bad read group identifier: #{fields[1]}"
# Remaining fields must use one of the accepted read group tags.
256 (2 ... fields.size).each do |i|
257 if fields[i] =~ /^(CN|DS|DT|FO|KS|LB|PG|PI|PL|PU|SM):([ -~]+)$/
260 raise SamError, "Bad read group tag: #{fields[i]}"
# NOTE(review): the alternation is not grouped, so this pattern reads as
# "starts with '*'" OR "ends with one of the listed letters"; values such as
# "*junk" or "123A" are accepted. /^(\*|[ACMGRSVTWYHKDBN]+)$/ looks like the
# intended pattern - confirm.
265 unless hash[:FO] =~ /^\*|[ACMGRSVTWYHKDBN]+$/
266 raise SamError, "Bad flow order: #{hash[:FO]}"
# Platform must be exactly one of the listed names.
271 unless hash[:PL] =~ /^(CAPILLARY|LS454|ILLUMINA|SOLID|HELICOS|IONTORRENT|PACBIO)$/
272 raise SamError, "Bad platform: #{hash[:PL]}"
276 @header[:RG][:ID] = Hash.new unless @header[:RG][:ID].is_a? Hash
# Read group identifiers must be unique across all read group lines.
278 if @header[:RG][:ID].has_key? id
279 raise SamError, "Non-unique read group identifier: #{id}"
281 @header[:RG][:ID][id] = hash
285 # Method to subparse program lines.
#
# Validates the ID field (second) and any further PN/CL/PP/VN fields, then
# stores the collected hash in @header[:PG][:ID] keyed by identifier.
# Raises SamError on a malformed field or a repeated identifier.
# NOTE(review): the assignments to id and hash are on lines not visible here.
286 def subparse_program(line)
# Lazily create the container for program records.
287 @header[:PG] = Hash.new unless @header[:PG].is_a? Hash
290 fields = line.split("\t")
292 if fields[1] =~ /^ID:([ -~]+)$/
295 raise SamError, "Bad program record identifier: #{fields[1]}"
# Remaining fields must use one of the four accepted tags.
298 (2 ... fields.size).each do |i|
299 if fields[i] =~ /^(PN|CL|PP|VN):([ -~]+)$/
302 raise SamError, "Bad program record tag: #{fields[i]}"
306 @header[:PG][:ID] = Hash.new unless @header[:PG][:ID].is_a? Hash
# Program record identifiers must be unique across all program lines.
308 if @header[:PG][:ID].has_key? id
309 raise SamError, "Non-unique program record identifier: #{id}"
311 @header[:PG][:ID][id] = hash
315 # Method to subparse comment lines.
#
# Ensures @header[:CO] is an Array and checks that the line is '@CO', a tab,
# and at least one further character. Raises SamError otherwise.
# NOTE(review): the line that appends the captured text is not visible here.
316 def subparse_comment(line)
# Lazily create the list of comments.
317 @header[:CO] = Array.new unless @header[:CO].is_a? Array
# (.+) requires a non-empty comment, so a bare "@CO\t" is rejected.
319 if line =~ /^@CO\t(.+)/
322 raise SamError, "Bad comment line: #{line}"
326 # Method to subparse alignment lines.
#
# Splits a tab-separated alignment line, requires at least 11 fields, runs the
# field checks, and fills an entry hash with the mandatory fields followed by
# any optional TAG:TYPE:VALUE fields (keyed by the tag as a Symbol).
# Raises SamError on too few fields, a repeated optional tag, or a malformed
# integer-typed optional value.
# NOTE(review): many lines of this method (remaining field assignments, the
# other check_* calls, the per-type handling of optional fields and the return
# value) are not visible here.
327 def parse_alignment(line)
328 fields = line.split("\t")
330 raise SamError, "Bad number of fields: #{fields.size} < 11" if fields.size < 11
# NOTE(review): String#to_i returns 0 for non-numeric text, so a malformed
# numeric column is silently read as 0 instead of being rejected.
333 flag = fields[1].to_i
336 mapq = fields[4].to_i
339 pnext = fields[7].to_i
340 tlen = fields[8].to_i
349 check_cigar(cigar, seq)
357 entry[:QNAME] = qname
# The numeric flag is wrapped in a Flag object.
358 entry[:FLAG] = Flag.new(flag)
359 entry[:RNAME] = rname
362 entry[:CIGAR] = cigar
363 entry[:RNEXT] = rnext
364 entry[:PNEXT] = pnext
# A qual of '*' means no quality string, so the Seq is built without one.
366 entry[:SEQ] = (qual == '*') ? Seq.new(qname, seq) : Seq.new(qname, seq, qual)
369 # Optional fields - where some are really important! HATE HATE HATE SAM!!!
# Everything after the 11 mandatory columns is an optional field.
371 fields[11 .. -1].each do |field|
# NOTE(review): split(':') has no limit, so a value that itself contains ':'
# is cut at its first colon (the rest is discarded by the three-way
# assignment). split(':', 3) would keep the whole value - confirm.
372 tag, type, val = field.split(':')
374 raise SamError, "Non-unique optional tag: #{tag}" if entry.has_key? tag.to_sym
376 # A [!-~] Printable character
378 # i [-+]?[0-9]+ Signed 32-bit integer
380 raise SamError, "Bad tag in optional field: #{field}" unless val =~ /^[-+]?[0-9]+$/
384 # f [-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)? Single-precision floating number
385 # Z [ !-~]+ Printable string, including space
386 # H [0-9A-F]+ Byte array in the Hex format
387 # B [cCsSiIf](,[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?)+ Integer or numeric array
389 entry[tag.to_sym] = val
395 # Method to check qname.
#
# Raises SamError unless qname is 1 to 255 printable ASCII characters
# ('!' through '~') with '@' excluded.
396 def check_qname(qname)
397 raise SamError, "Bad qname: #{qname}" unless qname =~ /^[!-?A-~]{1,255}$/
400 # Method to check flag.
402 raise SamError, "Bad flag: #{flag}" unless (0 .. 2**16 - 1).include? flag
405 # Method to check if rname, when not '*' and
406 # @SQ header lines are present, is located in
# the header hash (@header[:SQ][:SN]).
#
# Raises SamError when rname is malformed or is missing from the header hash.
408 def check_rname(rname)
# Either a lone '*' or a name whose first character is not '*' or '='.
409 raise SamError, "Bad rname: #{rname}" unless rname =~ /^(\*|[!-()+-<>-~][!-~]*)$/
# NOTE(review): the guard tests for an empty header, not for the presence of
# :SQ. If the header holds other record types but no sequence records,
# @header[:SQ] is presumably nil and the lookup below raises NoMethodError
# rather than SamError - confirm.
411 unless @header.empty? or rname == '*'
# The lookup uses a Symbol key; presumably sequence names are stored as
# Symbols when the header is parsed - verify against subparse_sequence.
412 unless @header[:SQ][:SN].has_key? rname.to_sym
413 raise SamError, "rname not found in header hash: #{rname}"
418 # Method to check pos.
420 raise SamError, "Bad pos: #{pos}" unless (0 .. 2**29 - 1).include? pos
423 # Method to check mapq.
425 raise SamError, "Bad mapq: #{mapq}" unless (0 .. 2**8 - 1).include? mapq
428 # Method to check cigar string.
#
# Raises SamError unless cigar is a lone '*' or a run of <count><operation>
# pairs, then applies the hard clip, soft clip and sequence length checks.
# NOTE(review): lines between the format check and the sub-checks are not
# visible here; whether the sub-checks are skipped for a '*' cigar cannot be
# told from this view.
429 def check_cigar(cigar, seq)
430 raise SamError, "Bad cigar: #{cigar}" unless cigar =~ /^(\*|([0-9]+[MIDNSHPX=])+)$/
433 check_cigar_hard_clip(cigar)
434 check_cigar_soft_clip(cigar)
# The length comparison is skipped when no sequence is stored ('*').
435 check_cigar_seq_len(cigar, seq) unless seq == '*'
439 # Method to check cigar hard clipping only at ends.
#
# Raises SamError when an H operation appears anywhere other than as the
# first or last operation of the cigar string.
440 def check_cigar_hard_clip(cigar)
# Strip a leading and a trailing H operation; any H left over is internal.
441 if cigar.gsub(/^[0-9]+H|[0-9]+H$/, "").match('H')
442 raise SamError, "Bad cigar with internal H: #{cigar}"
446 # Method to check cigar soft clipping only at ends or H.
#
# Raises SamError when an S operation appears anywhere other than at either
# end of the cigar string once terminal H operations are ignored.
447 def check_cigar_soft_clip(cigar)
# Strip terminal H operations, then terminal S operations; any S left over is internal.
448 if cigar.gsub(/^[0-9]+H|[0-9]+H$/, "").gsub(/^[0-9]+S|[0-9]+S$/, "").match('S')
449 raise SamError, "Bad cigar with internal S: #{cigar}"
453 # Method to check cigar length matches sequence length.
#
# Sums the counts of the M, I, S, X and = operations and raises SamError when
# the total differs from seq.length.
# NOTE(review): the initialisation of cigar_len is on a line not visible here.
454 def check_cigar_seq_len(cigar, seq)
457 cigar.scan(/([0-9]+)([MIDNSHPX=])/).each do |len, op|
# Only these operations add to the running total; D, N, H and P are skipped.
458 cigar_len += len.to_i if op =~ /[MISX=]/
461 if cigar_len != seq.length
462 raise SamError, "cigar and sequence length mismatch: #{cigar_len} != #{seq.length}"
466 # Method to check if rnext, when not '*' or '='
467 # and @SQ header lines are present, is located
468 # in the header hash.
#
# Raises SamError when rnext is malformed or is missing from @header[:SQ][:SN].
469 def check_rnext(rnext)
# A lone '*', a lone '=', or a name whose first character is neither of those.
470 raise SamError, "Bad rnext: #{rnext}" unless rnext =~ /^(\*|=|[!-()+-<>-~][!-~]*)$/
# NOTE(review): the guard tests for an empty header, not for the presence of
# :SQ. If the header holds other record types but no sequence records,
# @header[:SQ] is presumably nil and the lookup below raises NoMethodError
# rather than SamError - confirm.
472 unless @header.empty? or rnext == '*' or rnext == '='
473 unless @header[:SQ][:SN].has_key? rnext.to_sym
474 raise SamError, "rnext not found in header hash: #{rnext}"
479 # Method to check pnext.
#
# Raises SamError unless pnext lies in the range 0 to 2**29 - 1 inclusive.
480 def check_pnext(pnext)
481 raise SamError, "Bad pnext: #{pnext}" unless (0 .. 2**29 - 1).include? pnext
484 # Method to check tlen.
486 raise SamError, "Bad tlen: #{tlen}" unless (-2**29 + 1 .. 2**29 - 1).include? tlen
489 # Method to check seq.
491 raise SamError, "Bad seq: #{seq}" unless seq =~ /^(\*|[A-Za-z=.]+)$/
494 # Method to check qual.
496 raise SamError, "Bad qual: #{qual}" unless qual =~ /^[!-~]+$/
499 # Method to deconvolute the SAM flag field.
503 # Method to initialize a Flag object.
508 # Method to test if the template has
509 # multiple fragments in sequencing.
514 # Method to test if each fragment is
515 # properly aligned according to the aligner.
520 # Method to test if the fragment was unmapped.
525 # Method to test if the next fragment was unmapped.
527 flag & FLAG_NEXT_UNMAPPED
530 # Method to test if the fragment was reverse complemented.
535 # Method to test if the next fragment was reverse complemented.
537 flag & FLAG_NEXT_REVCOMP
540 # Method to test if the fragment was first in the template.
545 # Method to test if the fragment was last in the template.
550 # Method to test for secondary alignment.
#
# Returns the result of masking flag with FLAG_SECONDARY_ALIGNMENT.
# NOTE(review): assuming flag is the Integer given to Flag.new, this returns an
# Integer (0 or 0x100), and in Ruby 0 is truthy, so used as a condition this
# predicate is true for every flag value. Comparing the masked value with 0
# (e.g. "!= 0") would yield a real boolean - confirm intent. The same
# expression shape appears on the other visible lines that mask flag with
# FLAG_NEXT_UNMAPPED, FLAG_NEXT_REVCOMP, FLAG_QUALITY_FAIL and FLAG_DUPLICATES.
551 def secondary_alignment?
552 flag & FLAG_SECONDARY_ALIGNMENT
555 # Method to test for quality fail.
557 flag & FLAG_QUALITY_FAIL
560 # Method to test for PCR or optical duplicates.
562 flag & FLAG_DUPLICATES
568 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<