1 # -*- coding: utf-8 -*-
3 # Copyright (c) 2008/2013 Andrey Vlasovskikh
5 # Permission is hereby granted, free of charge, to any person obtaining
6 # a copy of this software and associated documentation files (the
7 # "Software"), to deal in the Software without restriction, including
8 # without limitation the rights to use, copy, modify, merge, publish,
9 # distribute, sublicense, and/or sell copies of the Software, and to
10 # permit persons to whom the Software is furnished to do so, subject to
11 # the following conditions:
13 # The above copyright notice and this permission notice shall be included
14 # in all copies or substantial portions of the Software.
16 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19 # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
20 # CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
21 # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22 # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# Public names of this module: the tokenizer factory, the token type and the
# error type.
24 __all__ = ['make_tokenizer', 'Token', 'LexerError']
# Exception describing a tokenization failure. Its text is built from
# `self.place`, unpacked as a (line, pos) pair, and `self.msg`.
# NOTE(review): the embedded numbering jumps from 30 to 35, so the lines
# between the constructor signature and the formatting code are not visible
# here; presumably they store `place`/`msg` on the instance and open the
# string-conversion method -- TODO confirm against the full file.
29 class LexerError(Exception):
30 def __init__(self, place, msg):
# Rendered form: cannot tokenize data: <line>,<pos>: "<msg>"
35 s = 'cannot tokenize data'
36 line, pos = self.place
37 return '%s: %d,%d: "%s"' % (s, line, pos, self.msg)
# Initializer taking a type tag and a value, plus optional `start`/`end`
# positions that default to None.
# NOTE(review): the enclosing class line and this method's body are absent
# from this listing (the embedded numbering skips 42-47); the 'Token(...)'
# text below suggests this is the Token class -- TODO confirm.
41 def __init__(self, type, value, start=None, end=None):
# Debug representation: shows only `type` and `value`, not the positions.
48 return 'Token(%r, %r)' % (self.type, self.value)
# Equality compares only `type` and `value`; `start` and `end` are not
# consulted, so tokens at different positions can compare equal.
# NOTE(review): `other` is assumed to expose `type` and `value` attributes;
# comparing against an object without them raises AttributeError rather than
# returning NotImplemented.
50 def __eq__(self, other):
51 # FIXME: Case sensitivity is assumed here
52 return self.type == other.type and self.value == other.value
# Position-prefix logic: checks whether either `start` or `end` is None.
# NOTE(review): the method header and the body of this branch are not
# visible in this listing.
55 if self.start is None or self.end is None:
# Formats four integers as "<sl>,<sp>-<el>,<ep>:"; presumably these are
# unpacked from `self.start` and `self.end` in lines not shown -- TODO confirm.
60 return '%d,%d-%d,%d:' % (sl, sp, el, ep)
# Human-readable form: position prefix, type, then the value in single quotes.
63 s = "%s %s '%s'" % (self._pos_str(), self.type, self.value)
# Same layout with the position prefix left-justified to 20 columns; the
# remaining arguments of this expression are cut off in this listing.
71 return "%s %s '%s'" % (self._pos_str().ljust(20),
# Builds a tokenizer from `specs`. Per the signature in the docstring, `specs`
# is a list of (name, (pattern, optional flags)) pairs and the result is a
# function from a string to an iterable of Token objects.
# NOTE(review): many lines of this function are absent from this listing (the
# embedded numbering has gaps), so the comments below describe only what is
# visible.
76 def make_tokenizer(specs):
77 """[(str, (str, int?))] -> (str -> Iterable(Token))"""
# Turns one spec into a (name, compiled regexp) pair; `args` is splatted into
# re.compile, so it can carry the pattern followed by optional flags.
79 def compile_spec(spec):
81 return name, re.compile(*args)
# Every spec is compiled here, before any matching is done.
83 compiled = [compile_spec(s) for s in specs]
# Tries each (type, regexp) pair against `str`, anchored at offset `i`.
# NOTE(review): the parameter `str` and the loop variable `type` shadow
# builtins inside this helper.
85 def match_specs(specs, str, i, position):
87 for type, regexp in specs:
88 m = regexp.match(str, i)
# Number of '\n' characters in the matched text; used for the end position.
91 nls = value.count('\n')
# Two alternative end-column computations. Presumably the first applies when
# the match contains no newline and the second (characters after the last
# newline) when it does; the branch selecting between them is not shown.
94 n_pos = pos + len(value)
96 n_pos = len(value) - value.rfind('\n') - 1
# The start position is reported as (line, pos + 1).
97 return Token(type, value, (line, pos + 1), (n_line, n_pos))
# Failure path: the source line at index `line - 1` becomes the error message.
# NOTE(review): splitlines() also splits on boundaries other than '\n' (such
# as '\r' or '\x0c'), while `nls` above counts only '\n', so the looked-up
# line can differ from the counted one for inputs containing those characters.
99 errline = str.splitlines()[line - 1]
100 raise LexerError((line, pos + 1), errline)
# One matching step at offset `i` with the current (line, pos); the
# surrounding driver code is not visible in this listing.
107 t = match_specs(compiled, str, i, (line, pos))
114 # This is an example of a token spec. See also [this article][1] for a
115 # discussion of searching for multiline comments using regexps (including `*?`).
117 # [1]: http://ostermiller.org/findcomment.html
# Example spec list: each entry is (type name, (pattern, optional re flags)).
# NOTE(review): the closing bracket of this list is not among the visible
# lines of this listing.
118 _example_token_specs = [
# Block comments "(* ... *)" and "{ ... }", matched non-greedily; the
# `(.|[\r\n])` alternation is what lets them span lines.
# NOTE(review): re.MULTILINE only changes the behaviour of `^` and `$`, which
# these patterns do not use, so the flag looks like a no-op here -- confirm
# before changing.
119 ('COMMENT', (r'\(\*(.|[\r\n])*?\*\)', re.MULTILINE)),
120 ('COMMENT', (r'\{(.|[\r\n])*?\}', re.MULTILINE)),
# Line comment: "//" followed by the rest of the line.
121 ('COMMENT', (r'//.*',)),
# NL is listed before SPACE although SPACE's character class also contains
# \r and \n; presumably the first matching spec wins, so runs of newlines are
# tagged NL -- verify against the matching loop.
122 ('NL', (r'[\r\n]+',)),
123 ('SPACE', (r'[ \t\r\n]+',)),
124 ('NAME', (r'[A-Za-z_][A-Za-z_0-9]*',)),
# REAL requires a '.' after the leading digits and is listed before INT.
# NOTE(review): the exponent group is followed by `*`, so repeated exponents
# such as "1.0e5e6" match; `?` may have been intended.
125 ('REAL', (r'[0-9]+\.[0-9]*([Ee][+\-]?[0-9]+)*',)),
# Decimal integers, then '$'-prefixed hexadecimal integers.
126 ('INT', (r'[0-9]+',)),
127 ('INT', (r'\$[0-9A-Fa-f]+',)),
# Two-character operators are alternated ahead of the single-character class.
128 ('OP', (r'(\.\.)|(<>)|(<=)|(>=)|(:=)|[;,=\(\):\[\]\.+\-<>\*/@\^]',)),
# Single-quoted string; a doubled quote ('') is accepted inside the literal.
129 ('STRING', (r"'([^']|(''))*'",)),
# Character codes: '#' plus decimal digits, or '#$' plus hexadecimal digits.
130 ('CHAR', (r'#[0-9]+',)),
131 ('CHAR', (r'#\$[0-9A-Fa-f]+',)),
133 #tokenize = make_tokenizer(_example_token_specs)