1 # as published by the Free Software Foundation; either version 2
2 # of the License, or (at your option) any later version.
4 # This program is distributed in the hope that it will be useful,
5 # but WITHOUT ANY WARRANTY; without even the implied warranty of
6 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
7 # GNU General Public License for more details.
9 # You should have received a copy of the GNU General Public License
10 # along with this program; if not, write to the Free Software
11 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
13 # http://www.gnu.org/copyleft/gpl.html
15 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
17 # This software is part of the Biopieces framework (www.biopieces.org).
19 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
21 # SAM format version v1.4-r962 - April 17, 2011
23 # http://samtools.sourceforge.net/SAM1.pdf
26 require 'maasha/filesys'
29 # Error class for all exceptions to do with Genbank.
30 class SamError < StandardError; end
32 REGEX_HEADER = Regexp.new(/^@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/)
33 REGEX_COMMENT = Regexp.new(/^@CO\t.*/)
38 def initialize(io = nil)
43 # Each header line should match: /^@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ or /^@CO\t.*/.
44 # Tags containing lowercase letters are reserved for end users.
46 @io.each_line do |line|
49 if line =~ /^@([A-Za-z][A-Za-z])/
53 when 'HD' then parse_header(line)
54 when 'SQ' then parse_sequence(line)
55 when 'RG' then parse_read_group(line)
56 when 'PG' then parse_program(line)
57 when 'CO' then parse_comment(line)
59 raise SamError, "Unknown header tag: #{tag}"
67 return @header_hash.empty? ? nil : @header_hash
72 def parse_header(line)
74 fields = line.split("\t")
76 if fields[1] =~ /^VN:([0-9]+\.[0-9]+)$/
79 raise SamError, "Bad version number: #{fields[1]}"
83 if fields[2] =~ /^SO:(unknown|unsorted|queryname|coordinate)$/
86 raise SamError, "Bad sort order: #{fields[2]}"
90 @header_hash[:HD] = hash
93 def parse_sequence(line)
94 @header_hash[:SQ] = Hash.new unless @header_hash[:SQ].is_a? Hash
97 fields = line.split("\t")
99 if fields[1] =~ /^SN:([!-)+-<>-~][!-~]*)$/
102 raise SamError, "Bad sequence name: #{fields[1]}"
105 if fields[2] =~ /^LN:(\d+)$/
108 raise SamError, "Bad sequence length: #{fields[2]}"
111 (3 ... fields.size).each do |i|
112 if fields[i] =~ /^(AS|M5|SP|UR):([ -~]+)$/
115 raise SamError, "Bad sequence tag: #{fields[i]}"
119 @header_hash[:SQ][:SN] = Hash.new unless @header_hash[:SQ][:SN].is_a? Hash
121 if @header_hash[:SQ][:SN].has_key? seq_name
122 raise SamError, "Non-unique sequence name: #{seq_name}"
124 @header_hash[:SQ][:SN][seq_name] = hash
128 def parse_read_group(line)
129 @header_hash[:RG] = Hash.new unless @header_hash[:RG].is_a? Hash
132 fields = line.split("\t")
134 if fields[1] =~ /^ID:([ -~]+)$/
137 raise SamError, "Bad read group identifier: #{fields[1]}"
140 (2 ... fields.size).each do |i|
141 if fields[i] =~ /^(CN|DS|DT|FO|KS|LB|PG|PI|PL|PU|SM):([ -~]+)$/
144 raise SamError, "Bad read group tag: #{fields[i]}"
149 unless hash[:FO] =~ /^\*|[ACMGRSVTWYHKDBN]+$/
150 raise SamError, "Bad flow order: #{hash[:FO]}"
155 unless hash[:PL] =~ /^(CAPILLARY|LS454|ILLUMINA|SOLID|HELICOS|IONTORRENT|PACBIO)$/
156 raise SamError, "Bad platform: #{hash[:PL]}"
160 @header_hash[:RG][:ID] = Hash.new unless @header_hash[:RG][:ID].is_a? Hash
162 if @header_hash[:RG][:ID].has_key? id
163 raise SamError, "Non-unique read group identifier: #{id}"
165 @header_hash[:RG][:ID][id] = hash
169 def parse_program(line)
170 @header_hash[:PG] = Hash.new unless @header_hash[:PG].is_a? Hash
173 fields = line.split("\t")
175 if fields[1] =~ /^ID:([ -~]+)$/
178 raise SamError, "Bad program record identifier: #{fields[1]}"
181 (2 ... fields.size).each do |i|
182 if fields[i] =~ /^(PN|CL|PP|VN):([ -~]+)$/
185 raise SamError, "Bad program record tag: #{fields[i]}"
189 @header_hash[:PG][:ID] = Hash.new unless @header_hash[:PG][:ID].is_a? Hash
191 if @header_hash[:PG][:ID].has_key? id
192 raise SamError, "Non-unique program record identifier: #{id}"
194 @header_hash[:PG][:ID][id] = hash
198 def parse_comment(line)
199 @header_hash[:CO] = Array.new unless @header_hash[:CO].is_a? Array
201 if line =~ /^@CO\t(.+)/
202 @header_hash[:CO] << $1
204 raise SamError, "Bad comment line: #{line}"
213 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<