funcparserlib/lexer.py

   1 # -*- coding: utf-8 -*-
   2
   3 # Copyright (c) 2008/2013 Andrey Vlasovskikh
   4 #
   5 # Permission is hereby granted, free of charge, to any person obtaining
   6 # a copy of this software and associated documentation files (the
   7 # "Software"), to deal in the Software without restriction, including
   8 # without limitation the rights to use, copy, modify, merge, publish,
   9 # distribute, sublicense, and/or sell copies of the Software, and to
  10 # permit persons to whom the Software is furnished to do so, subject to
  11 # the following conditions:
  12 #
  13 # The above copyright notice and this permission notice shall be included
  14 # in all copies or substantial portions of the Software.
  15 #
  16 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  17 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  18 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  19 # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
  20 # CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  21 # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  22 # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  23
  24 __all__ = ['make_tokenizer', 'Token', 'LexerError']
  25
  26 import re
  27
  28
  29 class LexerError(Exception):
  30     def __init__(self, place, msg):
  31         self.place = place
  32         self.msg = msg
  33
  34     def __str__(self):
  35         s = 'cannot tokenize data'
  36         line, pos = self.place
  37         return '%s: %d,%d: "%s"' % (s, line, pos, self.msg)
  38
  39
  40 class Token(object):
  41     def __init__(self, type, value, start=None, end=None):
  42         self.type = type
  43         self.value = value
  44         self.start = start
  45         self.end = end
  46
  47     def __repr__(self):
  48         return 'Token(%r, %r)' % (self.type, self.value)
  49
  50     def __eq__(self, other):
  51         # FIXME: Case sensitivity is assumed here
  52         return self.type == other.type and self.value == other.value
  53
  54     def _pos_str(self):
  55         if self.start is None or self.end is None:
  56             return ''
  57         else:
  58             sl, sp = self.start
  59             el, ep = self.end
  60             return '%d,%d-%d,%d:' % (sl, sp, el, ep)
  61
  62     def __str__(self):
  63         s = "%s %s '%s'" % (self._pos_str(), self.type, self.value)
  64         return s.strip()
  65
  66     @property
  67     def name(self):
  68         return self.value
  69
  70     def pformat(self):
  71         return "%s %s '%s'" % (self._pos_str().ljust(20),
  72                                 self.type.ljust(14),
  73                                 self.value)
  74
  75
  76 def make_tokenizer(specs):
  77     """[(str, (str, int?))] -> (str -> Iterable(Token))"""
  78
  79     def compile_spec(spec):
  80         name, args = spec
  81         return name, re.compile(*args)
  82
  83     compiled = [compile_spec(s) for s in specs]
  84
  85     def match_specs(specs, str, i, position):
  86         line, pos = position
  87         for type, regexp in specs:
  88             m = regexp.match(str, i)
  89             if m is not None:
  90                 value = m.group()
  91                 nls = value.count('\n')
  92                 n_line = line + nls
  93                 if nls == 0:
  94                     n_pos = pos + len(value)
  95                 else:
  96                     n_pos = len(value) - value.rfind('\n') - 1
  97                 return Token(type, value, (line, pos + 1), (n_line, n_pos))
  98         else:
  99             errline = str.splitlines()[line - 1]
 100             raise LexerError((line, pos + 1), errline)
 101
 102     def f(str):
 103         length = len(str)
 104         line, pos = 1, 0
 105         i = 0
 106         while i < length:
 107             t = match_specs(compiled, str, i, (line, pos))
 108             yield t
 109             line, pos = t.end
 110             i += len(t.value)
 111
 112     return f
 113
 114 # This is an example of a token spec. See also [this article][1] for a
 115 # discussion of searching for multiline comments using regexps (including `*?`).
 116 #
 117 #   [1]: http://ostermiller.org/findcomment.html
 118 _example_token_specs = [
 119     ('COMMENT', (r'\(\*(.|[\r\n])*?\*\)', re.MULTILINE)),
 120     ('COMMENT', (r'\{(.|[\r\n])*?\}', re.MULTILINE)),
 121     ('COMMENT', (r'//.*',)),
 122     ('NL', (r'[\r\n]+',)),
 123     ('SPACE', (r'[ \t\r\n]+',)),
 124     ('NAME', (r'[A-Za-z_][A-Za-z_0-9]*',)),
 125     ('REAL', (r'[0-9]+\.[0-9]*([Ee][+\-]?[0-9]+)*',)),
 126     ('INT', (r'[0-9]+',)),
 127     ('INT', (r'\$[0-9A-Fa-f]+',)),
 128     ('OP', (r'(\.\.)|(<>)|(<=)|(>=)|(:=)|[;,=\(\):\[\]\.+\-<>\*/@\^]',)),
 129     ('STRING', (r"'([^']|(''))*'",)),
 130     ('CHAR', (r'#[0-9]+',)),
 131     ('CHAR', (r'#\$[0-9A-Fa-f]+',)),
 132 ]
 133 #tokenize = make_tokenizer(_example_token_specs)