"""Parse a grammar written in ECMArkup."""

from __future__ import annotations
# mypy: no-implicit-optional

import os
import collections
from typing import Dict, Iterable, Optional, Tuple

from jsparagus import parse_pgen, gen, grammar, extension, types
from jsparagus.lexer import LexicalGrammar
from jsparagus.ordered import OrderedSet, OrderedFrozenSet


ESGrammarLexer = LexicalGrammar(
    # the operators and keywords:
    "[ ] { } , ~ + ? <! = == != => ( ) @ < > ' ; "
    "but empty here lookahead no not of one or returns through Some None impl for let",

    NL="\n",

    # any number of colons together
    EQ=r':+',

    # terminals of the ES grammar, quoted with backticks
    T=r'`[^` \n]+`|```',

    # also terminals, denoting control characters
    CHR=r'<[A-Z ]+>|U\+[0-9A-F]{4}',

    # nonterminals/types that will be followed by parameters
    NTCALL=r'[A-Za-z]\w*(?=[\[<])',

    # nonterminals (also, boolean parameters and type names)
    NT=r'[A-Za-z]\w*',

    # nonterminals wrapped in vertical bars for no apparent reason
    NTALT=r'\|[A-Z]\w+\|',

    # the spec also gives names to a few productions
    PRODID=r'#[A-Za-z]\w*',

    # prose not wrapped in square brackets
    # To avoid conflict with the `>` token, this is recognized only after a space.
    PROSE=r'(?<= )>[^\n]*',

    # prose wrapped in square brackets
    WPROSE=r'\[>[^]]*\]',

    # expression denoting a matched terminal or nonterminal
    MATCH_REF=r'\$(?:0|[1-9][0-9]*)',

    # Rust-style comments, used in grammar extensions
    RUSTCOMMENT=r'//.*\n',
)
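
# Example (illustrative, not part of the original source): in a grammar line
# such as
#
#     `var` VariableDeclarationList `;`
#
# the backtick-quoted symbols match T, `VariableDeclarationList` matches NT,
# and the trailing line break matches NL.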


ESGrammarParser = gen.compile(
    parse_pgen.load_grammar(
        os.path.join(os.path.dirname(__file__), "esgrammar.pgen")))


SIGIL_FALSE = '~'
SIGIL_TRUE = '+'

# Abbreviations for single-character terminals, used in the lexical grammar.
ECMASCRIPT_CODE_POINTS = {
    # From <https://tc39.es/ecma262/#table-31>
    '<ZWNJ>': grammar.Literal('\u200c'),
    '<ZWJ>': grammar.Literal('\u200d'),
    '<ZWNBSP>': grammar.Literal('\ufeff'),

    # From <https://tc39.es/ecma262/#table-32>
    '<TAB>': grammar.Literal('\t'),
    '<VT>': grammar.Literal('\u000b'),
    '<FF>': grammar.Literal('\u000c'),
    '<SP>': grammar.Literal(' '),
    '<NBSP>': grammar.Literal('\u00a0'),
    # <ZWNBSP> already defined above
    '<USP>': grammar.UnicodeCategory('Zs'),

    # From <https://tc39.es/ecma262/#table-33>
    '<LF>': grammar.Literal('\u000a'),
    '<CR>': grammar.Literal('\u000d'),
    '<LS>': grammar.Literal('\u2028'),
    '<PS>': grammar.Literal('\u2029'),
}


class ESGrammarBuilder:
    def __init__(self, terminal_names):
        # Names of terminals that are written as nonterminals in the grammar.
        # For example, "BooleanLiteral" is a terminal name when parsing the
        # syntactic grammar.
        if terminal_names is None:
            terminal_names = frozenset()
        self.terminal_names = frozenset(terminal_names)
        self.reset()

    def reset(self):
        self.lexer = None
        # This is how full parsing and lazy parsing are implemented, using
        # different traits.
        #
        # This field contains the Rust trait used for calling the method.
        # When a CallMethod is generated, it is assumed to be a function of
        # this trait. The trait is used by the Rust backend to generate
        # multiple backends implementing different sets of traits. Having the
        # trait on the function call is useful as a way to filter function
        # calls at code-generation time.
        #
        # This field is updated by `rust_param_impl`, which is used in grammar
        # extensions, and is visited before producing any CallMethod.
        self.method_trait = "AstBuilder"

    def rust_edsl(self, impl, grammar):
        return extension.GrammarExtension(impl, grammar, self.lexer.filename)

    def rust_param_impl(self, trait, for_type, param):
        self.method_trait = trait
        return extension.ImplFor(param, trait, for_type)

    def rust_impl(self, trait, impl_type):
        return self.rust_param_impl(trait, impl_type, [])

    def rust_nt_def(self, lhs, rhs_line):
        # Right now, focus only on the syntactic grammar, and assume that all
        # rules are patching existing grammar productions by adding code.
        return extension.ExtPatch(self.nt_def(None, lhs, ':', [rhs_line]))

    def rust_rhs_line(self, symbols):
        return self.rhs_line(None, symbols, None, None)

    def rust_expr(self, expr):
        assert isinstance(expr, grammar.CallMethod)
        return expr

    def empty(self):
        return []

    def single(self, x):
        return [x]

    def append(self, x, y):
        return x + [y]

    def concat(self, x, y):
        return x + y

    def blank_line(self):
        return []

    def nt_def_to_list(self, nt_def):
        return [nt_def]

    def to_production(self, lhs, i, rhs, is_sole_production):
        """Wrap a list of grammar symbols `rhs` in a Production object."""
        body, reducer, condition = rhs
        if reducer is None:
            reducer = self.default_reducer(lhs, i, body, is_sole_production)
        return grammar.Production(body, reducer, condition=condition)

    def default_reducer(self, lhs, i, body, is_sole_production):
        assert isinstance(lhs, grammar.Nt)
        nt_name = lhs.name

        nargs = sum(1 for e in body if grammar.is_concrete_element(e))
        if is_sole_production:
            method_name = nt_name
        else:
            method_name = '{} {}'.format(nt_name, i)
        return self.expr_call(method_name, tuple(range(nargs)), None)

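    # Illustrative example (not in the original source): a nonterminal with
    # several productions gets numbered reducer names such as "Statement 0",
    # "Statement 1", ...; a sole production gets the bare name "Statement".
    # Each reducer takes one argument per concrete symbol in the body.
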
    def needs_asi(self, lhs, p):
        """True if p is a production in which ASI can happen."""
        # The purpose of the fake ForLexicalDeclaration production is to have
        # a copy of LexicalDeclaration that does not trigger ASI.
        #
        # Two productions have body == [";"] -- one for EmptyStatement and one
        # for ClassMember. Neither should trigger ASI.
        #
        # The only other semicolons that should not trigger ASI are the ones
        # in `for` statement productions, which happen to be exactly those
        # semicolons that are not at the end of a production.
        return (not (isinstance(lhs, grammar.Nt)
                     and lhs.name == 'ForLexicalDeclaration')
                and len(p.body) > 1
                and p.body[-1] == ';')

    def apply_asi(self, p, reducer_was_autogenerated):
        """Return two rules based on p, so that ASI can be applied."""
        assert isinstance(p.reducer, grammar.CallMethod)

        if reducer_was_autogenerated:
            # Don't pass the semicolon to the method.
            reducer = self.expr_call(p.reducer.method,
                                     p.reducer.args[:-1],
                                     None)
        else:
            reducer = p.reducer

        # Except for do-while loops, check at runtime that ASI occurs only at
        # the end of a line.
        if (len(p.body) == 7
                and p.body[0] == 'do'
                and p.body[2] == 'while'
                and p.body[3] == '('
                and p.body[5] == ')'
                and p.body[6] == ';'):
            code = "do_while_asi"
        else:
            code = "asi"

        return [
            # The preferred production, with the semicolon in.
            p.copy_with(body=p.body[:],
                        reducer=reducer),
            # The fallback production, performing ASI.
            p.copy_with(body=p.body[:-1] + [grammar.ErrorSymbol(code)],
                        reducer=reducer),
        ]

    def expand_lexical_rhs(self, rhs):
        body, reducer, condition = rhs
        out = []
        for e in body:
            if isinstance(e, str):
                # The terminal symbols of the lexical grammar are characters,
                # so add each character of this string as a separate element.
                out += [grammar.Literal(ch) for ch in e]
            else:
                out.append(e)
        return [out, reducer, condition]

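    # For example (illustrative, not from the original source), a lexical rule
    # body ["!=="] expands to [Literal('!'), Literal('='), Literal('=')].
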
    def nt_def(self, nt_type, lhs, eq, rhs_list):
        has_sole_production = (len(rhs_list) == 1)
        production_list = []
        for i, rhs in enumerate(rhs_list):
            if eq == ':':
                # Syntactic grammar. A hack is needed for ASI.
                reducer_was_autogenerated = rhs[1] is None
                p = self.to_production(lhs, i, rhs, has_sole_production)
                if self.needs_asi(lhs, p):
                    production_list += self.apply_asi(
                        p, reducer_was_autogenerated)
                else:
                    production_list.append(p)
            elif eq == '::':
                # Lexical grammar. A hack is needed to replace multicharacter
                # terminals like `!==` with sequences of character terminals.
                rhs = self.expand_lexical_rhs(rhs)
                p = self.to_production(lhs, i, rhs, has_sole_production)
                production_list.append(p)
        return (lhs.name, eq, grammar.NtDef(lhs.args, production_list, nt_type))

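    # The resulting triple feeds finish_grammar() below; e.g. (illustrative)
    # a definition of Statement in the syntactic grammar yields something like
    # ("Statement", ":", grammar.NtDef((), [...productions...], None)).
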
    def nt_def_one_of(self, nt_type, nt_lhs, eq, terminals):
        return self.nt_def(nt_type, nt_lhs, eq,
                           [([t], None, None) for t in terminals])

    def nt_lhs_no_params(self, name):
        return grammar.Nt(name, ())

    def nt_lhs_with_params(self, name, params):
        return grammar.Nt(name, tuple(params))

    def simple_type(self, name):
        return types.Type(name)

    def lifetime_type(self, name):
        return types.Lifetime(name)

    def parameterized_type(self, name, args):
        return types.Type(name, tuple(args))

    def t_list_line(self, terminals):
        return terminals

    def terminal(self, t):
        assert t[0] == "`"
        assert t[-1] == "`"
        return t[1:-1]

    def terminal_chr(self, chr):
        raise ValueError("FAILED: %r" % chr)

    def rhs_line(self, ifdef, rhs, reducer, _prodid):
        return (rhs, reducer, ifdef)

    def rhs_line_prose(self, prose):
        return ([prose], None, None)

    def empty_rhs(self):
        return []

    def expr_match_ref(self, token):
        assert token.startswith('$')
        return int(token[1:])

    def expr_call(self, method, args, fallible):
        # NOTE: Currently, "AstBuilder" methods are made fallible using the
        # fallible_methods list, which is extracted from Rust code and written
        # to a JSON file.
        if self.method_trait == "AstBuilder":
            fallible = None
        return grammar.CallMethod(method, args or (),
                                  types.Type(self.method_trait),
                                  fallible is not None)

    def expr_some(self, expr):
        return grammar.Some(expr)

    def expr_none(self):
        return None

    def ifdef(self, value, nt):
        return nt, value

    def optional(self, nt):
        return grammar.Optional(nt)

    def but_not(self, nt, exclusion):
        _, exclusion = exclusion
        return grammar.Exclude(nt, [exclusion])

    def but_not_one_of(self, nt, exclusion_list):
        exclusion_list = [exclusion for _, exclusion in exclusion_list]
        return grammar.Exclude(nt, exclusion_list)

    def no_line_terminator_here(self, lt):
        if lt not in ('LineTerminator', '|LineTerminator|'):
            raise ValueError("unrecognized directive "
                             + repr("[no " + lt + " here]"))
        return grammar.NoLineTerminatorHere

    def nonterminal(self, name):
        if name in self.terminal_names:
            return name
        return grammar.Nt(name, ())

    def nonterminal_apply(self, name, args):
        if name in self.terminal_names:
            raise ValueError(
                "parameters applied to terminal {!r}".format(name))
        if len(set(k for k, expr in args)) != len(args):
            raise ValueError("parameter passed multiple times")
        return grammar.Nt(name, tuple(args))

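    # Illustrative example (not in the original source): an application such
    # as Expression[+In, ?Await] parses to
    # grammar.Nt("Expression", (("In", True), ("Await", grammar.Var("Await")))).
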
    def arg_expr(self, sigil, argname):
        if sigil == '?':
            return (argname, grammar.Var(argname))
        else:
            return (argname, sigil)

    def sigil_false(self):
        return False

    def sigil_true(self):
        return True

    def exclusion_terminal(self, t):
        return ("t", t)

    def exclusion_nonterminal(self, nt):
        return ("nt", nt)

    def exclusion_chr_range(self, c1, c2):
        return ("range", c1, c2)

    def la_eq(self, t):
        return grammar.LookaheadRule(OrderedFrozenSet([t]), True)

    def la_ne(self, t):
        return grammar.LookaheadRule(OrderedFrozenSet([t]), False)

    def la_not_in_nonterminal(self, nt):
        return grammar.LookaheadRule(OrderedFrozenSet([nt]), False)

    def la_not_in_set(self, lookahead_exclusions):
        if all(len(excl) == 1 for excl in lookahead_exclusions):
            return grammar.LookaheadRule(
                OrderedFrozenSet(excl[0] for excl in lookahead_exclusions),
                False)
        raise ValueError("unsupported: lookahead > 1 token, {!r}"
                         .format(lookahead_exclusions))

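    # For example (illustrative, not from the original source), the spec
    # notation [lookahead ∉ { `let`, `async` }] becomes
    # LookaheadRule(OrderedFrozenSet(['let', 'async']), False).
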
    def chr(self, t):
        assert t[0] == "<" or t[0] == 'U'
        if t[0] == "<":
            assert t[-1] == ">"
            if t not in ECMASCRIPT_CODE_POINTS:
                raise ValueError(
                    "unrecognized character abbreviation {!r}".format(t))
            return ECMASCRIPT_CODE_POINTS[t]
        else:
            assert t[1] == "+"
            return grammar.Literal(chr(int(t[2:], base=16)))


def finish_grammar(nt_defs, goals, variable_terminals, synthetic_terminals,
                   single_grammar=True, extensions=[]):
    nt_grammars = {}
    for nt_name, eq, _ in nt_defs:
        if nt_name in nt_grammars:
            raise ValueError(
                "duplicate definitions for nonterminal {!r}"
                .format(nt_name))
        nt_grammars[nt_name] = eq

    # Figure out which grammar we were trying to get (":" for syntactic,
    # "::" for lexical) based on the goal symbols.
    goals = list(goals)
    if len(goals) == 0:
        raise ValueError("no goal nonterminals specified")
    if single_grammar:
        selected_grammars = set(nt_grammars[goal] for goal in goals)
        assert len(selected_grammars) != 0
        if len(selected_grammars) > 1:
            raise ValueError(
                "all goal nonterminals must be part of the same grammar; "
                "got {!r} (matching these grammars: {!r})"
                .format(set(goals), set(selected_grammars)))
        [selected_grammar] = selected_grammars

    terminal_set = set()

    def hack_production(p):
        for i, e in enumerate(p.body):
            if isinstance(e, str) and e[:1] == "`":
                if len(e) < 3 or e[-1:] != "`":
                    raise ValueError(
                        "Unrecognized grammar symbol: {!r} (in {!r})"
                        .format(e, p))
                p.body[i] = token = e[1:-1]
                terminal_set.add(token)

    nonterminals = {}
    for nt_name, eq, rhs_list_or_lambda in nt_defs:
        if single_grammar and eq != selected_grammar:
            continue

        if isinstance(rhs_list_or_lambda, grammar.NtDef):
            nonterminals[nt_name] = rhs_list_or_lambda
        else:
            rhs_list = rhs_list_or_lambda
            for p in rhs_list:
                if not isinstance(p, grammar.Production):
                    raise ValueError(
                        "invalid grammar: ifdef in non-function-call context")
                hack_production(p)
            if nt_name in nonterminals:
                raise ValueError(
                    "unsupported: multiple definitions for nt " + nt_name)
            nonterminals[nt_name] = rhs_list

    for t in terminal_set:
        if t in nonterminals:
            raise ValueError(
                "grammar contains both a terminal `{}` and nonterminal {}"
                .format(t, t))

    # Add the execution modes used to generate the various functions needed
    # for syntax parsing and full parsing.
    exec_modes = collections.defaultdict(OrderedSet)
    noop_parser = types.Type("ParserTrait",
                             (types.Lifetime("alloc"), types.UnitType))
    token_parser = types.Type("ParserTrait", (
        types.Lifetime("alloc"),
        types.Type("StackValue", (types.Lifetime("alloc"),))))
    ast_builder = types.Type("AstBuilderDelegate", (types.Lifetime("alloc"),))

    # Full parsing takes tokens as input and builds an AST.
    exec_modes["full_actions"].extend([token_parser, ast_builder])

    # Syntax parsing takes tokens as input but skips building the AST.
    # TODO: The syntax parser is commented out for now, as we need something
    # to produce when we cannot call the AstBuilder to produce the values.

    # No-op parsing is used for the simulator, which is so far used for
    # querying whether we can end the incremental input, and for looking up
    # whether a state can accept some kind of token.
    exec_modes["noop_actions"].add(noop_parser)

    # Extensions use Rust-like type names to say which parsers they apply to;
    # this map converts those type names to the corresponding execution modes.
    full_parser = types.Type("FullParser")
    syntax_parser = types.Type("SyntaxParser")
    noop_parser = types.Type("NoopParser")
    type_to_modes = {
        noop_parser: ["noop_actions", "full_actions"],
        syntax_parser: ["full_actions"],
        full_parser: ["full_actions"],
    }

    result = grammar.Grammar(
        nonterminals,
        goal_nts=goals,
        variable_terminals=variable_terminals,
        synthetic_terminals=synthetic_terminals,
        exec_modes=exec_modes,
        type_to_modes=type_to_modes)
    result.patch(extensions)
    return result


def parse_esgrammar(
        text: str,
        *,
        filename: Optional[str] = None,
        extensions: Iterable[Tuple[os.PathLike, int, str]] = (),
        goals: Optional[Iterable[str]] = None,
        terminal_names: Iterable[str] = (),
        synthetic_terminals: Optional[Dict[str, OrderedSet[str]]] = None,
        single_grammar: bool = True
) -> grammar.Grammar:
    if not text.endswith("\n\n"):
        # Horrible hack: add a blank line at the end of the document so that
        # the esgrammar grammar can use newlines as delimiters. :-P
        text += "\n"

    terminal_names = frozenset(terminal_names)
    if synthetic_terminals is None:
        synthetic_terminals = {}

    builder = ESGrammarBuilder(terminal_names)
    parser = ESGrammarParser(builder=builder, goal="grammar")
    lexer = ESGrammarLexer(parser, filename=filename)
    lexer.write(text)
    nt_defs = lexer.close()
    grammar_extensions = []
    for ext_filename, start_lineno, content in extensions:
        builder.reset()
        parser = ESGrammarParser(builder=builder, goal="rust_edsl")
        lexer = ESGrammarLexer(parser, filename=ext_filename)
        builder.lexer = lexer
        lexer.start_lineno = start_lineno
        lexer.write(content)
        result = lexer.close()
        grammar_extensions.append(result)

    if goals is None:
        # Default to the first nonterminal in the input.
        goals = [nt_defs[0][0]]

    return finish_grammar(
        nt_defs,
        goals=goals,
        variable_terminals=terminal_names - frozenset(synthetic_terminals),
        synthetic_terminals=synthetic_terminals,
        single_grammar=single_grammar,
        extensions=grammar_extensions)
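

# Minimal usage sketch (illustrative only; the file name, goal symbol, and
# terminal names below are hypothetical, not part of this module):
#
#     with open("es-simplified.esgrammar") as f:  # hypothetical input file
#         g = parse_esgrammar(
#             f.read(),
#             filename="es-simplified.esgrammar",
#             goals=["Script"],
#             terminal_names=["IdentifierName", "NumericLiteral"])
#     # `g` is a jsparagus.grammar.Grammar, ready for parser-table generation.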