# -*- coding: utf-8 -*-
# (c) Jérôme Laheurte 2015-2026
# See LICENSE.txt
import inspect
import re
import collections
import warnings
from ptk.utils import Singleton, callback_by_name, chars
LexerPosition = collections.namedtuple('_LexerPosition', ['column', 'line'])
# In Python 3 we'd use __prepare__ and an ordered dict...
_TOKREGISTER = []
class _LexerMeta(type):
def __new__(mcs, name, bases, attrs):
global _TOKREGISTER # pylint: disable=W0603
try:
attrs['__tokens__'] = (set(), []) # Set of token names, list of (rx, callback, defaultType)
klass = super().__new__(mcs, name, bases, attrs)
for func, rx, toktypes in _TOKREGISTER:
klass.add_token_type(func.__name__, callback_by_name(func.__name__), rx, toktypes)
return klass
finally:
_TOKREGISTER = []
[docs]
def token(rx, types=None):
"""
The method decorator for tokens
"""
def _wrap(func):
if any(func.__name__ == aFunc.__name__ and func != aFunc for aFunc, _, _ in _TOKREGISTER):
raise TypeError(f'Duplicate token method name "{func.__name__}"')
_TOKREGISTER.append((func, rx, types))
return func
return _wrap
[docs]
class SkipToken(Exception):
"""
Raise this from your consumer to ignore the token.
"""
[docs]
class LexerError(Exception):
"""
Unrecognized token in input
:ivar lineno: Line in input
:ivar colno: Column in input
"""
def __init__(self, char, pos):
super().__init__(f'Unrecognized token {repr(char)}')
self.position = pos
# Getters for compatibility with <1.3.8
@property
def colno(self):
"""
Column
"""
return self.position.column
@property
def lineno(self):
"""
Line
"""
return self.position.line
[docs]
class EOF(metaclass=Singleton):
"""
End symbol
"""
__reprval__ = '$'
@property
def type(self):
"""Read-only attribute for Token duck-typing"""
return self
@property
def value(self):
"""Read-only attribute for Token duck-typing"""
return self
[docs]
class LexerBase(metaclass=_LexerMeta):
"""
This defines the interface for lexer classes. For concrete
implementations, see :py:class:`ProgressiveLexer` and
:py:class:`ReLexer`.
"""
Token = collections.namedtuple('Token', ['type', 'value', 'position'])
# Shut up pychecker. Those are actually set by the metaclass.
__tokens__ = ()
class _MutableToken: # pylint: disable=too-few-public-methods
def __init__(self, type_, value, position):
self.type = type_
self.value = value
self.position = position
def token(self):
"""Returns the unmutable equivalent"""
return EOF if EOF in [self.type, self.value] else LexerBase.Token(self.type, self.value, self.position)
def __init__(self):
super().__init__()
self._pos = None
self._consumer = None
self.restart_lexer()
def restart_lexer(self, reset_pos=True):
if reset_pos:
self._pos = LexerPosition(column=1, line=1)
self._input = []
self._consumer = None
def restartLexer(self, resetPos=True): # pylint: disable=invalid-name
warnings.warn('restartLexer is deprecated in favor of restart_lexer', DeprecationWarning)
self.restart_lexer(reset_pos=resetPos)
[docs]
def position(self):
"""
:return: The current position in stream as a 2-tuple (column, line).
"""
return self._pos
[docs]
def advance_column(self, count=1):
"""
Advances the current position by *count* columns.
"""
self._pos = self._pos._replace(column=self._pos.column + count)
def advanceColumn(self, count=1): # pylint: disable=invalid-name
warnings.warn('advanceColumn is deprecated in favor of advance_column', DeprecationWarning)
self.advance_column(count=count)
[docs]
def advance_line(self, count=1):
"""
Advances the current position by *count* lines.
"""
self._pos = self._pos._replace(column=1, line=self._pos.line + count)
def advanceLine(self, count=1): # pylint: disable=invalid-name
warnings.warn('advanceLine is deprecated in favor of advance_line', DeprecationWarning)
self.advance_line(count=count)
[docs]
@staticmethod
def ignore(char):
"""
Override this to ignore characters in input stream. The
default is to ignore spaces and tabs.
:param char: The character to test
:return: True if *char* should be ignored
"""
return char in chars(' ') + chars('\t')
[docs]
def set_consumer(self, consumer):
"""
Sets the current consumer. A consumer is an object with a
*feed* method; all characters seen on the input stream after
the consumer is set are passed directly to it. When the *feed*
method returns a 2-tuple (type, value), the corresponding
token is generated and the consumer reset to None. This may be
handy to parse tokens that are not easily recognized by a
regular expression but easily by code; for instance the
following lexer recognizes C strings without having to use
negative lookahead:
.. code-block:: python
class MyLexer(ReLexer):
@token('"')
def cstring(self, tok):
class CString:
def __init__(self):
self.state = 0
self.value = StringIO.StringIO()
def feed(self, char):
if self.state == 0:
if char == '"':
return 'cstring', self.value.getvalue()
if char == '\\\\':
self.state = 1
else:
self.value.write(char)
elif self.state == 1:
self.value.write(char)
self.state = 0
self.set_consumer(CString())
You can also raise SkipToken instead of returning a token if it
is to be ignored (comments).
"""
self._consumer = consumer
def setConsumer(self, consumer): # pylint: disable=invalid-name
warnings.warn('setConsumer is deprecated in favor of set_consumer', DeprecationWarning)
self.set_consumer(consumer)
def consumer(self):
return self._consumer
[docs]
def parse(self, string): # pragma: no cover
"""
Parses the whole *string*; returns the start symbol semantic value
"""
raise NotImplementedError
[docs]
def new_token(self, tok): # pragma: no cover
"""
This method will be invoked as soon as a token is recognized on input.
:param tok: The token. This is a named tuple with *type* and *value* attributes.
"""
raise NotImplementedError
def newToken(self, tok): # pylint: disable=invalid-name
warnings.warn('newToken is deprecated in favor of new_token', DeprecationWarning)
self.new_token(tok)
@classmethod
def add_token_type(cls, name, callback, regex, types=None):
for type_name in [name] if types is None else types:
if type_name is not EOF:
cls.__tokens__[0].add(type_name)
cls.__tokens__[1].append((regex, callback, name if types is None else None))
@classmethod
def _all_tokens(cls):
tokens = (set(), [])
for base in inspect.getmro(cls):
if issubclass(base, LexerBase):
tokens[0].update(base.__tokens__[0])
tokens[1].extend(base.__tokens__[1])
return tokens
[docs]
@classmethod
def token_types(cls):
"""
:return: the set of all token names, as strings.
"""
return cls._all_tokens()[0]
@classmethod
def tokenTypes(cls): # pylint: disable=invalid-name
warnings.warn('tokenTypes is deprecated in favor of token_types', DeprecationWarning)
return cls.token_types()
[docs]
class ReLexer(LexerBase): # pylint: disable=W0223
"""
Concrete lexer based on Python regular expressions.
"""
def __init__(self):
self._regexes = []
for rx, callback, default_type in self._all_tokens()[1]:
crx = re.compile((b'^' if isinstance(rx, bytes) else '^') + rx)
self._regexes.append((crx, callback, default_type))
super().__init__()
def _parse(self, string, pos):
# pylint: disable=too-many-nested-blocks
while pos < len(string):
char = string[pos]
try:
if self.consumer() is None:
if self.ignore(char):
pos += 1
continue
pos = self._find_match(string, pos)
else:
try:
tok = self.consumer().feed(char)
except SkipToken:
self.set_consumer(None)
else:
if tok is not None:
self.set_consumer(None)
if tok[0] is not None:
self.new_token(self.Token(*tok, self.position()))
pos += 1
finally:
if char in chars('\n'):
self.advance_line()
else:
self.advance_column()
return pos
[docs]
def parse(self, string):
try:
self._parse(string, 0)
return self.new_token(EOF)
except LexerError:
self.restart_lexer()
raise
def _find_match(self, string, pos):
match = None
matchlen = 0
pos2d = self.position()
for rx, callback, default_type in self._regexes:
mtc = rx.match(string[pos:])
if mtc:
value = mtc.group(0)
if len(value) > matchlen:
match = value, callback, default_type
matchlen = len(value)
if match:
value, callback, default_type = match
tok = self._MutableToken(default_type, value, pos2d)
callback(self, tok)
pos += matchlen
if self.consumer() is None and tok.type is not None:
self.new_token(tok.token())
self.advance_column(matchlen - 1)
return pos
raise LexerError(self._guess_token(string, pos), pos2d)
def _guess_token(self, string, pos):
start = pos
while True:
pos += 1
if pos == len(string) or self.ignore(string[pos]):
break
for rx, _, _ in self._regexes:
mtc = rx.match(string[pos:])
if mtc:
break
else:
continue
break
return string[start:pos]