X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=funcparserlib%2Flexer.py;fp=funcparserlib%2Flexer.py;h=96cbd988573b6ef742b68911ea2b13e9a7ba7f26;hb=1f0bf65417613b1cb3a5bfd744750e1e823c8622;hp=0000000000000000000000000000000000000000;hpb=3057d43e4908a17145db673727b99fb85450fd85;p=kiibohd-kll.git

diff --git a/funcparserlib/lexer.py b/funcparserlib/lexer.py
new file mode 100644
index 0000000..96cbd98
--- /dev/null
+++ b/funcparserlib/lexer.py
@@ -0,0 +1,133 @@
+# -*- coding: utf-8 -*-
+
+# Copyright (c) 2008/2013 Andrey Vlasovskikh
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to
+# the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+__all__ = ['make_tokenizer', 'Token', 'LexerError']
+
+import re
+
+
+class LexerError(Exception):
+    def __init__(self, place, msg):
+        self.place = place
+        self.msg = msg
+
+    def __str__(self):
+        s = u'cannot tokenize data'
+        line, pos = self.place
+        return u'%s: %d,%d: "%s"' % (s, line, pos, self.msg)
+
+
+class Token(object):
+    def __init__(self, type, value, start=None, end=None):
+        self.type = type
+        self.value = value
+        self.start = start
+        self.end = end
+
+    def __repr__(self):
+        return u'Token(%r, %r)' % (self.type, self.value)
+
+    def __eq__(self, other):
+        # FIXME: Case sensitivity is assumed here
+        return self.type == other.type and self.value == other.value
+
+    def _pos_str(self):
+        if self.start is None or self.end is None:
+            return ''
+        else:
+            sl, sp = self.start
+            el, ep = self.end
+            return u'%d,%d-%d,%d:' % (sl, sp, el, ep)
+
+    def __str__(self):
+        s = u"%s %s '%s'" % (self._pos_str(), self.type, self.value)
+        return s.strip()
+
+    @property
+    def name(self):
+        return self.value
+
+    def pformat(self):
+        return u"%s %s '%s'" % (self._pos_str().ljust(20),
+                                self.type.ljust(14),
+                                self.value)
+
+
+def make_tokenizer(specs):
+    """[(str, (str, int?))] -> (str -> Iterable(Token))"""
+
+    def compile_spec(spec):
+        name, args = spec
+        return name, re.compile(*args)
+
+    compiled = [compile_spec(s) for s in specs]
+
+    def match_specs(specs, str, i, position):
+        line, pos = position
+        for type, regexp in specs:
+            m = regexp.match(str, i)
+            if m is not None:
+                value = m.group()
+                nls = value.count(u'\n')
+                n_line = line + nls
+                if nls == 0:
+                    n_pos = pos + len(value)
+                else:
+                    n_pos = len(value) - value.rfind(u'\n') - 1
+                return Token(type, value, (line, pos + 1), (n_line, n_pos))
+        else:
+            errline = str.splitlines()[line - 1]
+            raise LexerError((line, pos + 1), errline)
+
+    def f(str):
+        length = len(str)
+        line, pos = 1, 0
+        i = 0
+        while i < length:
+            t = match_specs(compiled, str, i, (line, pos))
+            yield t
+            line, pos = t.end
+            i += len(t.value)
+
+    return f
+
+# This is an example of a token spec. See also [this article][1] for a
+# discussion of searching for multiline comments using regexps (including `*?`).
+#
+# [1]: http://ostermiller.org/findcomment.html
+_example_token_specs = [
+    ('COMMENT', (r'\(\*(.|[\r\n])*?\*\)', re.MULTILINE)),
+    ('COMMENT', (r'\{(.|[\r\n])*?\}', re.MULTILINE)),
+    ('COMMENT', (r'//.*',)),
+    ('NL', (r'[\r\n]+',)),
+    ('SPACE', (r'[ \t\r\n]+',)),
+    ('NAME', (r'[A-Za-z_][A-Za-z_0-9]*',)),
+    ('REAL', (r'[0-9]+\.[0-9]*([Ee][+\-]?[0-9]+)*',)),
+    ('INT', (r'[0-9]+',)),
+    ('INT', (r'\$[0-9A-Fa-f]+',)),
+    ('OP', (r'(\.\.)|(<>)|(<=)|(>=)|(:=)|[;,=\(\):\[\]\.+\-<>\*/@\^]',)),
+    ('STRING', (r"'([^']|(''))*'",)),
+    ('CHAR', (r'#[0-9]+',)),
+    ('CHAR', (r'#\$[0-9A-Fa-f]+',)),
+]
+#tokenize = make_tokenizer(_example_token_specs)
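
For orientation only (not part of the patch above): a minimal usage sketch of the make_tokenizer API defined in this file. It assumes the package root from this patch is on the Python path so the module is importable as funcparserlib.lexer; the spec list and input string below are illustrative, a cut-down variant of _example_token_specs.

    # Minimal usage sketch (illustrative, not part of the patch).
    from funcparserlib.lexer import make_tokenizer

    specs = [                                    # each entry: (token type, args for re.compile)
        ('SPACE', (r'[ \t\r\n]+',)),
        ('NAME', (r'[A-Za-z_][A-Za-z_0-9]*',)),
        ('INT', (r'[0-9]+',)),
        ('OP', (r'[=+\-*/()]',)),
    ]

    tokenize = make_tokenizer(specs)

    # Tokens are yielded lazily with 1-based (line, column) start/end positions;
    # whitespace is dropped here by filtering on Token.type.
    tokens = [t for t in tokenize(u'answer = 42') if t.type != 'SPACE']
    # -> [Token('NAME', 'answer'), Token('OP', '='), Token('INT', '42')]

A character that no spec matches raises LexerError, carrying the 1-based position and the offending source line.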