# Source code for ptk.parser

# -*- coding: utf-8 -*-

# (c) Jérôme Laheurte 2015-2026
# See LICENSE.txt

import functools
import collections
import logging
import re
import warnings

from ptk.lexer import EOF, token, LexerPosition, ReLexer
from ptk.grammar import Grammar, Production, GrammarError
# production is only imported so that client code doesn't have to import it from grammar
from ptk.grammar import production # pylint: disable=W0611
from ptk.utils import Singleton, callback_by_name


class ParseError(Exception):
    """
    Syntax error when parsing.

    :ivar token: The unexpected token.
    """

    def __init__(self, grammar, tok, state, tokens):
        super().__init__(f'Unexpected token "{tok.value}" ({tok.type}) in state "{sorted(state)}"')
        self.token = tok
        self._state = state
        self._tokens = tokens
        # Terminals for which the offending state has at least one action,
        # i.e. the token types that would have been legal here instead.
        self._expecting = {
            terminal
            for terminal in grammar.token_types()
            if grammar.__actions__.get((state, terminal), None) is not None
        }

    @property
    def position(self):
        """Position of the unexpected token, as reported by the lexer."""
        return self.token.position

    def expecting(self):
        """
        Returns a set of tokens types that would have been valid in input.
        """
        return self._expecting

    def state(self):
        """
        Returns the parser state when the error was encountered.
        """
        return self._state

    def last_token(self):
        """
        Returns the last valid token seen before this error
        """
        return self._tokens[-1]

    def lastToken(self):  # pylint: disable=invalid-name
        """Deprecated camelCase alias for :py:func:`last_token`."""
        warnings.warn('lastToken is deprecated in favor of last_token', DeprecationWarning)
        return self.last_token()

    def tokens(self):
        """
        Returns all tokens seen
        """
        return self._tokens
def left_assoc(*operators):
    """
    Class decorator for left associative operators. Use this to decorate
    your :py:class:`Parser` class. All operators passed in a single call
    share the same priority, and each later declaration gets a higher
    priority than earlier ones, so

    .. code-block:: python

       @left_assoc('+', '-')
       @left_assoc('*', '/')
       class MyParser(LRParser):
           # ...

    declares '+' and '-' left associative with the same priority, and '*'
    and '/' left associative with a higher priority. See also the
    *priority* argument to :py:func:`production`.
    """
    def _decorate(parser_class):
        parser_class.__precedence__.insert(0, ('left', set(operators)))
        return parser_class
    return _decorate


def leftAssoc(*operators):  # pylint: disable=invalid-name
    """Deprecated camelCase alias for :py:func:`left_assoc`."""
    warnings.warn('leftAssoc is deprecated in favor of left_assoc', DeprecationWarning)
    return left_assoc(*operators)
def right_assoc(*operators):
    """
    Class decorator for right associative operators. Same remarks as
    :py:func:`left_assoc`.
    """
    def _decorate(parser_class):
        parser_class.__precedence__.insert(0, ('right', set(operators)))
        return parser_class
    return _decorate


def rightAssoc(*operators):  # pylint: disable=invalid-name
    """Deprecated camelCase alias for :py:func:`right_assoc`."""
    warnings.warn('rightAssoc is deprecated in favor of right_assoc', DeprecationWarning)
    return right_assoc(*operators)
def non_assoc(*operators):
    """
    Class decorator for non associative operators. Same remarks as
    :py:func:`left_assoc`.
    """
    def _decorate(parser_class):
        parser_class.__precedence__.insert(0, ('non', set(operators)))
        return parser_class
    return _decorate
def nonAssoc(*operators): # pylint: disable=invalid-name
    """Deprecated camelCase alias for :py:func:`non_assoc`."""
    warnings.warn('nonAssoc is deprecated in favor of non_assoc', DeprecationWarning)
    return non_assoc(*operators)


class _StartState(metaclass=Singleton): # pylint: disable=too-few-public-methods
    # Synthetic start symbol (rendered as a capital sigma) used to augment
    # the grammar with an S' -> S production in LRParser.prepare().
    __reprval__ = '\u03A3'


class _ResolveError(Exception):
    # Internal: raised when a shift/reduce conflict must be resolved as a
    # parse error ('non' associativity on both sides).
    pass


@functools.total_ordering
class _Item:
    """
    An LR(1) item: a production, a dot position inside its right-hand side
    and a lookahead terminal. Ordering/hash use (production, dot, terminal);
    'index' is excluded and filled in later by LRParser.prepare().
    """
    __slots__ = ('production', 'dot', 'terminal', 'index', 'should_reduce', 'expecting')

    def __init__(self, prod, dot, terminal):
        self.production = prod
        self.dot = dot
        self.terminal = terminal
        self.index = None  # State index, assigned by LRParser.prepare()
        # Dot at the end of the right-hand side means this item calls for a
        # reduction; otherwise 'expecting' is the symbol just after the dot.
        self.should_reduce = self.dot == len(self.production.right)
        self.expecting = None if self.should_reduce else self.production.right[self.dot]

    def next(self):
        """ Returns an item with the dot advanced one position """
        return _Item(self.production, self.dot + 1, self.terminal)

    def __repr__(self):
        symbols = list(self.production.right)
        symbols.insert(self.dot, '\u2022')  # bullet marks the dot position
        return '%s -> %s (%s)' % (self.production.name, ' '.join([repr(sym) for sym in symbols]), self.terminal) # pylint: disable=consider-using-f-string

    def __eq__(self, other):
        return (self.production, self.dot, self.terminal) == (other.production, other.dot, other.terminal)

    def __lt__(self, other):
        return (self.production, self.dot, self.terminal) < (other.production, other.dot, other.terminal)

    def __hash__(self):
        return hash((self.production, self.dot, self.terminal))


class _Accept(BaseException):
    # Raised by the synthetic start production's callback to signal that the
    # whole input was reduced to the start symbol; carries the final value.
    # NOTE(review): derives from BaseException, presumably so a client's
    # 'except Exception' does not swallow it — confirm before changing.
    def __init__(self, result):
        self.result = result
        super().__init__()


# One slot of the parse stack: LR state, semantic value, lexer position.
_StackItem = collections.namedtuple('_StackItem', ['state', 'value', 'position'])


class _Shift: # pylint: disable=too-few-public-methods
    """Shift action: push the token's value and enter 'new_state'."""

    def __init__(self, new_state):
        self.new_state = new_state

    def do_action(self, grammar, stack, tok): # pylint: disable=W0613
        stack.append(_StackItem(self.new_state, tok.value, tok.position))
        # True: the token has been consumed (see LRParser.new_token).
        return True


class _Reduce: # pylint: disable=too-few-public-methods
    """Reduce action: pop the right-hand-side values off the stack and push
    the production's semantic value onto the goto state."""

    def __init__(self, item):
        self.item = item
        self.nargs = len(item.production.right)

    def do_action(self, grammar, stack, tok): # pylint: disable=W0613
        pos, (callback, kwargs) = self._get_callback(stack)
        self._applied(grammar, stack, callback(grammar, **kwargs), pos)
        # False: the token was not consumed; the caller keeps processing it.
        return False

    def _applied(self, grammar, stack, prod_val, position):
        # Push the reduced value; the next state comes from the goto table
        # keyed on the state now at the top of the stack.
        stack.append(_StackItem(grammar.goto(stack[-1].state, self.item.production.name), prod_val, position))

    def _get_callback(self, stack):
        if self.nargs:
            # Pop the values for the right-hand-side symbols; the reduced
            # production's position is that of its first symbol.
            args = [stackItem.value for stackItem in stack[-self.nargs:]]
            pos = stack[-self.nargs].position
            stack[-self.nargs:] = []
        else:
            # Empty production: reuse the position at the top of the stack.
            args = []
            pos = stack[-1].position # Hum.
        return pos, self.item.production.apply(args, pos)
class LRParser(Grammar):
    """
    LR(1) parser. This class is intended to be used with a lexer class
    derived from :py:class:`LexerBase`, using inheritance; it overrides
    :py:func:`LexerBase.new_token` so you must inherit from the parser
    first, then the lexer:

    .. code-block:: python

       class MyParser(LRParser, ReLexer):
           # ...
    """

    def __init__(self): # pylint: disable=R0914,R0912
        super().__init__()
        self._restart_parser()

    def rstack(self):
        # The parse stack, top item first.
        return reversed(self.__stack)

    def new_token(self, tok):
        """
        Feed one token to the parser (called by the lexer). Returns the
        start symbol's value when the input sentence is accepted, None
        otherwise.
        """
        try:
            # Keep applying reductions (which return False) until the token
            # is shifted (shift actions return True).
            for action, stack in self._process_token(tok):
                if action.do_action(self, stack, tok):
                    break
            self.__tokens.append(tok)
            return None
        except _Accept as exc:
            # The start symbol was reduced: reset state for the next
            # sentence, then notify the client code.
            self._restart_parser()
            self.on_start_symbol(exc.result)
            return exc.result

    def _process_token(self, tok):
        # Yields the action for (current state, token type) forever; the
        # caller breaks out once the token is shifted.
        while True:
            action = self.__actions__.get((self.__stack[-1].state, tok.type), None)
            if action is None:
                raise ParseError(self, tok, self.__stack[-1].state, self.__tokens)
            yield action, self.__stack

    def on_start_symbol(self, start_symbol): # pragma: no cover
        """
        This is called when the start symbol has been reduced.

        :param start_symbol: The value associated with the start symbol.
        """
        return self.newSentence(start_symbol)

    def newSentence(self, start_symbol): # pylint: disable=invalid-name,unused-argument
        """Deprecated; override :py:func:`on_start_symbol` instead."""
        warnings.warn('newSentence has been deprecated in favor of on_start_symbol', DeprecationWarning)
        raise NotImplementedError

    @classmethod
    def _create_production_parser(cls, name, priority, attrs):
        # Hook used by the Grammar machinery to parse production strings.
        return ProductionParser(callback_by_name(name), priority, cls, attrs)

    @classmethod
    def _create_reduce_action(cls, item):
        return _Reduce(item)

    @classmethod
    def _create_shift_action(cls, state):
        return _Shift(state)

    @classmethod
    def prepare(cls):
        """
        Build the LR(1) automaton: augment the grammar with a synthetic
        start production, compute item-set states and the action/goto
        tables, resolve conflicts and log diagnostics.
        """
        for prod in cls.productions():
            if prod.name is _StartState:
                break
        else:
            # Augment the grammar with S' -> S; the callback raises _Accept
            # to signal a complete parse.
            def acceptor(_, result):
                raise _Accept(result)
            prod = Production(_StartState, acceptor)
            prod.add_symbol(cls._default_start_symbol() if cls.startSymbol is None else cls.startSymbol, name='result')
            cls.__productions__.insert(0, prod)
            cls.startSymbol = _StartState
        super().prepare()
        states, goto = cls.__compute_states(prod)
        reachable = cls.__compute_actions(states, goto)
        logger = logging.getLogger('LRParser')
        cls.__resolve_conflicts(logger)
        used_tokens = {symbol for _, symbol in cls.__actions__ if symbol is not EOF}
        if used_tokens != cls.token_types(): # pragma: no cover pylint: disable=no-member
            logger.warning('The following tokens are not used: %s', ','.join([repr(sym) for sym in sorted(cls.token_types() - used_tokens)])) # pylint: disable=no-member
        if reachable != cls.nonterminals(): # pragma: no cover
            logger.warning('The following nonterminals are not reachable: %s', ','.join([repr(sym) for sym in sorted(cls.nonterminals() - reachable)]))
        # Reductions only need goto entries for nonterminals
        cls._goto = {(state, symbol): new_state for (state, symbol), new_state in goto.items() if symbol not in cls.token_types()} # pylint: disable=no-member
        parts = []
        if cls.nSR:
            parts.append(f'{cls.nSR} shift/reduce conflicts')
        if cls.nRR:
            parts.append(f'{cls.nRR} reduce/reduce conflicts')
        if parts:
            logger.warning(', '.join(parts))
        # Cast to tuple because sets are not totally ordered
        for index, state in enumerate([tuple(cls._startState)] + sorted([tuple(state) for state in states if state != cls._startState])):
            logger.debug('State %d', index)
            for item in sorted(state):
                logger.debug(' %s', item)
                item.index = index
            cls.__lrstates__.append(sorted(state)) # pylint: disable=no-member
        logger.info('%d states.', len(states))

    @classmethod
    def __compute_states(cls, start):
        # Standard LR state construction: start from the closure of the
        # initial item and follow transitions for every grammar symbol.
        all_syms = list(cls.token_types() | cls.nonterminals()) # pylint: disable=no-member
        goto = []
        cls._startState = frozenset([_Item(start, 0, EOF)])
        states = set([cls._startState])
        stack = [cls._startState]
        while stack:
            state = stack.pop()
            state_closure = cls.__item_set_closure(state)
            for symbol in all_syms:
                # Compute goto(symbol, state)
                next_state = frozenset([item.next() for item in state_closure if item.expecting == symbol])
                if next_state:
                    goto.append(((state, symbol), next_state))
                    if next_state not in states:
                        states.add(next_state)
                        stack.append(next_state)
        return states, dict(goto)

    @classmethod
    def __compute_actions(cls, states, goto):
        # Fill __actions__: reduce on the item's lookahead terminal for
        # completed items, shift on terminals appearing after the dot.
        # Returns the set of nonterminal names that get reduced somewhere.
        cls.__actions__ = {}
        reachable = set()
        token_types = cls.token_types() # pylint: disable=no-member
        for state in states:
            for item in cls.__item_set_closure(state):
                if item.should_reduce:
                    action = cls._create_reduce_action(item)
                    reachable.add(item.production.name)
                    cls.__add_reduce_action(state, item.terminal, action)
                else:
                    symbol = item.production.right[item.dot]
                    if symbol in token_types:
                        cls.__add_shift_action(state, symbol, cls._create_shift_action(goto[(state, symbol)]))
        return reachable

    @classmethod
    def __should_prefer_shift(cls, logger, reduce_action, symbol):
        # Decide a shift/reduce conflict. Returns True to shift, False to
        # reduce; raises _ResolveError for 'non' associative operators.
        logger.info('Shift/reduce conflict for "%s" on "%s"', reduce_action.item, symbol)
        prod_precedence = reduce_action.item.production.precedence(cls)
        token_precedence = cls.terminal_precedence(symbol)
        # If both precedences are specified, use priority/associativity
        if prod_precedence is not None and token_precedence is not None:
            prod_assoc, prod_prio = prod_precedence
            token_assoc, token_prio = token_precedence
            if prod_prio > token_prio:
                logger.info('Resolving in favor of reduction because of priority')
                return False
            if prod_prio < token_prio:
                logger.info('Resolving in favor of shift because of priority')
                return True
            if prod_assoc == token_assoc:
                if prod_assoc == 'non':
                    logger.info('Resolving in favor of error because of associativity')
                    raise _ResolveError()
                if prod_assoc == 'left':
                    logger.info('Resolving in favor of reduction because of associativity')
                    return False
                logger.info('Resolving in favor of shift because of associativity')
                return True
        # At least one of those is not specified; use shift
        logger.warning('Could not disambiguate shift/reduce conflict for "%s" on "%s"; using shift', reduce_action.item, symbol)
        cls.nSR += 1
        return True

    @classmethod
    def __resolve_conflicts(cls, logger):
        # Collapse each action list to a single action; entries whose
        # resolution is 'error' ('non' associativity) are removed entirely.
        cls.nSR = 0
        cls.nRR = 0
        for (state, symbol), actions in sorted(cls.__actions__.items()):
            action = actions.pop()
            while actions:
                conflicting = actions.pop()
                try:
                    action = cls.__resolve_conflict(logger, action, conflicting, symbol)
                except _ResolveError:
                    del cls.__actions__[(state, symbol)]
                    break
            else:
                cls.__actions__[(state, symbol)] = action

    @classmethod
    def __resolve_conflict(cls, logger, action1, action2, symbol):
        # Normalize so that a shift, if any, is action1.
        if isinstance(action2, _Shift):
            action1, action2 = action2, action1
        if isinstance(action1, _Shift):
            # Shift/reduce
            return action1 if cls.__should_prefer_shift(logger, action2, symbol) else action2
        # Reduce/reduce
        logger.warning('Reduce/reduce conflict between "%s" and "%s"', action1.item, action2.item)
        cls.nRR += 1
        # Use the first one to be declared
        # NOTE(review): the log format 'Using "%s' is missing its closing
        # quote — cosmetic only, left untouched here.
        for prod in cls.productions():
            if prod == action1.item.production:
                logger.warning('Using "%s', prod)
                return action1
            if prod == action2.item.production:
                logger.warning('Using "%s', prod)
                return action2
        return None

    @classmethod
    def __add_reduce_action(cls, state, symbol, action):
        cls.__actions__.setdefault((state, symbol), []).append(action)

    @classmethod
    def __add_shift_action(cls, state, symbol, action):
        # At most one shift per (state, symbol); duplicates are dropped.
        for existing in cls.__actions__.get((state, symbol), []):
            if isinstance(existing, _Shift):
                return
        cls.__actions__.setdefault((state, symbol), []).append(action)

    @classmethod
    def goto(cls, state, symbol):
        return cls._goto[(state, symbol)]

    def _restart_parser(self):
        # Fresh stack containing only the start state; also resets the lexer.
        self.__stack = [_StackItem(self._startState, None, LexerPosition(1, 1))]
        self.__tokens = []
        self.restart_lexer() # pylint: disable=no-member

    @classmethod
    @functools.cache
    def __item_set_closure(cls, items):
        # LR(1) closure: for each item expecting a nonterminal, add items
        # for that nonterminal's productions with the appropriate lookaheads
        # (FIRST of the remainder plus the item's own lookahead). Iterate to
        # a fixed point; the snapshot list avoids mutating 'result' while
        # iterating it. Cached per (frozen) item set via functools.cache.
        result = set(items)
        while True:
            prev = set(result)
            for item in [item for item in result if not item.should_reduce]:
                symbol = item.production.right[item.dot]
                if symbol not in cls.token_types(): # pylint: disable=no-member
                    terminals = cls.first(*tuple(item.production.right[item.dot + 1:] + [item.terminal]))
                    for prod in (prod for prod in cls.productions() if prod.name == symbol):
                        for terminal in terminals:
                            result.add(_Item(prod, 0, terminal))
            if prev == result:
                break
        return result
class ProductionParser(LRParser, ReLexer): # pylint: disable=R0904
    """
    Parser for productions in other parsers
    """

    def __init__(self, callback, priority, grammar_class, attributes): # pylint: disable=R0915
        # callback: the semantic action shared by the declared production(s).
        # grammar_class: the client Grammar subclass that will receive the
        # productions parsed from the declaration string.
        self.callback = callback
        self.priority = priority
        self.grammar_class = grammar_class
        self.attributes = attributes
        super().__init__()

    @classmethod
    def prepare(cls, **kwargs): # pylint: disable=R0915
        # Obviously cannot use @production here
        # When mixing async and sync parsers in the same program this may be called twice,
        # because AsyncProductionParser inherits from ProductionParser
        if cls.productions(): # FIXME remove this, we don't do the async stuff any more
            return

        # The hand-built grammar below describes the production
        # mini-language itself (e.g. 'E -> E "+" E<right>').

        # DECL -> identifier "->" PRODS
        prod = Production('DECL', cls.DECL)
        prod.add_symbol('LEFT', 'left')
        prod.add_symbol('arrow')
        prod.add_symbol('PRODS', 'prods')
        cls.__productions__.append(prod)

        # LEFT -> identifier
        prod = Production('LEFT', cls.LEFT)
        prod.add_symbol('identifier', 'name')
        cls.__productions__.append(prod)

        # LEFT -> identifier "<" posarg ">"
        prod = Production('LEFT', cls.LEFT)
        prod.add_symbol('identifier', 'name')
        prod.add_symbol('lchev')
        prod.add_symbol('identifier', 'posarg')
        prod.add_symbol('rchev')
        cls.__productions__.append(prod)

        # PRODS -> P
        prod = Production('PRODS', cls.PRODS1)
        prod.add_symbol('P', 'prodlist')
        cls.__productions__.append(prod)

        # PRODS -> PRODS "|" P
        prod = Production('PRODS', cls.PRODS2)
        prod.add_symbol('PRODS', 'prods')
        prod.add_symbol('union')
        prod.add_symbol('P', 'prodlist')
        cls.__productions__.append(prod)

        # P -> P SYM
        prod = Production('P', cls.P1)
        prod.add_symbol('P', 'prodlist')
        prod.add_symbol('SYM', 'sym')
        cls.__productions__.append(prod)

        # P -> ɛ
        prod = Production('P', cls.P2)
        cls.__productions__.append(prod)

        # SYM -> SYMNAME PROPERTIES
        prod = Production('SYM', cls.SYM)
        prod.add_symbol('SYMNAME', 'symname')
        prod.add_symbol('PROPERTIES', 'properties')
        cls.__productions__.append(prod)

        # SYM -> SYMNAME repeat PROPERTIES
        prod = Production('SYM', cls.SYMREP)
        prod.add_symbol('SYMNAME', 'symname')
        prod.add_symbol('repeat', 'repeat')
        prod.add_symbol('PROPERTIES', 'properties')
        cls.__productions__.append(prod)

        # SYM -> SYMNAME repeat lparen identifier rparen PROPERTIES
        prod = Production('SYM', cls.SYMREP)
        prod.add_symbol('SYMNAME', 'symname')
        prod.add_symbol('repeat', 'repeat')
        prod.add_symbol('lparen')
        prod.add_symbol('identifier', 'separator')
        prod.add_symbol('rparen')
        prod.add_symbol('PROPERTIES', 'properties')
        cls.__productions__.append(prod)

        # SYM -> SYMNAME repeat lparen litteral rparen PROPERTIES
        prod = Production('SYM', cls.SYMREP_LIT)
        prod.add_symbol('SYMNAME', 'symname')
        prod.add_symbol('repeat', 'repeat')
        prod.add_symbol('lparen')
        prod.add_symbol('litteral', 'separator')
        prod.add_symbol('rparen')
        prod.add_symbol('PROPERTIES', 'properties')
        cls.__productions__.append(prod)

        # SYMNAME -> identifier
        prod = Production('SYMNAME', cls.SYMNAME1)
        prod.add_symbol('identifier', 'identifier')
        cls.__productions__.append(prod)

        # SYMNAME -> litteral
        prod = Production('SYMNAME', cls.SYMNAME2)
        prod.add_symbol('litteral', 'litteral')
        cls.__productions__.append(prod)

        # PROPERTIES -> ɛ
        prod = Production('PROPERTIES', cls.PROPERTIES1)
        cls.__productions__.append(prod)

        # PROPERTIES -> lchev identifier rchev
        prod = Production('PROPERTIES', cls.PROPERTIES2)
        prod.add_symbol('lchev')
        prod.add_symbol('identifier', 'name')
        prod.add_symbol('rchev')
        cls.__productions__.append(prod)

        super().prepare(**kwargs)

    def on_start_symbol(self, start_symbol):
        # A DECL has been fully parsed: give the still-anonymous productions
        # their left-hand-side name, then hand them to the client grammar.
        (name, posarg), prods = start_symbol
        for prod in prods:
            if prod.name is None:
                prod.name = name
                prod.posarg = posarg
        self.grammar_class.__productions__.extend(prods)

    # Lexer

    @staticmethod
    def ignore(char):
        # Whitespace is not significant in production declarations.
        return char in ' \t\n'

    @token('->')
    def arrow(self, tok):
        pass

    @token('<')
    def lchev(self, tok):
        pass

    @token('>')
    def rchev(self, tok):
        pass

    @token(r'\|')
    def union(self, tok):
        pass

    @token(r'\*|\+|\?')
    def repeat(self, tok):
        pass

    @token(r'\(')
    def lparen(self, tok):
        pass

    @token(r'\)')
    def rparen(self, tok):
        pass

    @token('[a-zA-Z_][a-zA-Z0-9_]*')
    def identifier(self, tok):
        pass

    @token(r'"|\'')
    def litteral(self, tok):
        # A quote starts a string litteral; feed subsequent characters to a
        # small state machine until the matching quote closes it.
        class StringBuilder: # pylint: disable=missing-class-docstring,too-few-public-methods
            def __init__(self, quotetype):
                self.quotetype = quotetype
                self.chars = []
                self.state = 0  # 0: normal, 1: just saw a backslash

            def feed(self, char):
                if self.state == 0:
                    if char == '\\':
                        self.state = 1
                    elif char == self.quotetype:
                        # Closing quote: emit the accumulated token value.
                        return 'litteral', ''.join(self.chars)
                    else:
                        self.chars.append(char)
                elif self.state == 1:
                    # Escaped character is taken verbatim.
                    self.chars.append(char)
                    self.state = 0
                return None
        self.set_consumer(StringBuilder(tok.value))

    # Parser

    def DECL(self, left, prods): # pylint: disable=invalid-name
        name, _ = left
        if name in self.grammar_class.token_types():
            raise GrammarError(f'"{name}" is a token name and cannot be used as non-terminal')
        return (left, prods)

    def LEFT(self, name, posarg=None): # pylint: disable=invalid-name
        return (name, posarg)

    def PRODS1(self, prodlist): # pylint: disable=invalid-name
        return prodlist

    def PRODS2(self, prods, prodlist): # pylint: disable=invalid-name
        prods.extend(prodlist)
        return prods

    def P1(self, sym, prodlist): # pylint: disable=invalid-name
        # Append one symbol — possibly repeated ('?', '*', '+') — to every
        # alternative production built so far.
        result = []
        symbol, properties, repeat, sep = sym
        for prod in prodlist:
            if prod.name is None:
                if repeat is None:
                    prod.add_symbol(symbol, name=properties.get('name', None))
                    result.append(prod)
                elif repeat == '?':
                    if sep is not None:
                        raise GrammarError('A separator makes no sense for "?"')
                    self.__add_at_most_one(result, prod, symbol, properties.get('name', None))
                elif repeat in ['*', '+']:
                    self.__add_list(result, prod, symbol, properties.get('name', None), repeat == '*', sep)
            else:
                result.append(prod)
        return result

    def __add_at_most_one(self, productions, prod, symbol, name):
        # 'sym?' expands to two alternatives: one without the symbol (whose
        # callback then receives None for it) and one with it.
        clone = prod.cloned()
        if name is not None:
            self._wrap_callback_none(name, clone)
        productions.append(clone)
        prod.add_symbol(symbol, name=name)
        productions.append(prod)

    def _wrap_callback_none(self, name, prod):
        # Wrap the callback so the omitted symbol is passed as None.
        previous = prod.callback
        def callback(*args, **kwargs):
            kwargs[name] = None
            return previous(*args, **kwargs)
        prod.callback = callback

    def __add_list(self, productions, prod, symbol, name, allow_empty, sep): # pylint: disable=too-many-arguments,too-many-positional-arguments
        # 'sym*' / 'sym+' expand to a dedicated list nonterminal with the
        # usual two productions (single item / items + item), plus an empty
        # alternative when '*' allows zero occurrences.
        class ListSymbol(metaclass=Singleton): # pylint: disable=missing-class-docstring,too-few-public-methods
            __reprval__ = 'List(%s, "%s")' % (symbol, '*' if allow_empty else '+') # pylint: disable=consider-using-f-string
        if allow_empty:
            clone = prod.cloned()
            self._wrap_callback_empty(name, clone)
            productions.append(clone)
        prod.add_symbol(ListSymbol, name=name)
        productions.append(prod)
        list_prod = Production(ListSymbol, self._wrap_callback_one())
        list_prod.add_symbol(symbol, name='item')
        productions.append(list_prod)
        list_prod = Production(ListSymbol, self._wrap_callback_next())
        list_prod.add_symbol(ListSymbol, name='items')
        if sep is not None:
            list_prod.add_symbol(sep)
        list_prod.add_symbol(symbol, name='item')
        productions.append(list_prod)

    def _wrap_callback_empty(self, name, prod):
        # Wrap the callback so the omitted list is passed as [].
        previous = prod.callback
        def callback_empty(*args, **kwargs):
            if name is not None:
                kwargs[name] = []
            return previous(*args, **kwargs)
        prod.callback = callback_empty

    def _wrap_callback_one(self):
        # Callback for 'LIST -> item': start a new list.
        def callback_one(_, item):
            return [item]
        return callback_one

    def _wrap_callback_next(self):
        # Callback for 'LIST -> LIST item': extend the existing list.
        def callback_next(_, items, item):
            items.append(item)
            return items
        return callback_next

    def P2(self): # pylint: disable=invalid-name
        # 'name' is replaced in on_start_symbol()
        return [Production(None, self.callback, priority=self.priority, attributes=self.attributes)]

    def SYMNAME1(self, identifier): # pylint: disable=invalid-name
        return identifier

    def SYMNAME2(self, litteral): # pylint: disable=invalid-name
        # A litteral symbol implicitly declares a matching token type on the
        # client grammar if it does not exist yet.
        name = litteral
        if name not in self.grammar_class.token_types():
            self.grammar_class.add_token_type(name, lambda s, tok: None, re.escape(name), None)
        return name

    def SYM(self, symname, properties): # pylint: disable=invalid-name
        return (symname, properties, None, None)

    def SYMREP(self, symname, repeat, properties, separator=None): # pylint: disable=invalid-name
        return (symname, properties, repeat, separator)

    def SYMREP_LIT(self, symname, repeat, properties, separator): # pylint: disable=invalid-name
        # Litteral separator: declare its token type on the fly as well.
        if separator not in self.grammar_class.token_types():
            self.grammar_class.add_token_type(separator, lambda s, tok: None, re.escape(separator), None)
        return self.SYMREP(symname, repeat, properties, separator)

    def PROPERTIES1(self): # pylint: disable=invalid-name
        return {}

    def PROPERTIES2(self, name): # pylint: disable=invalid-name
        return {'name': name}