# fune/third_party/rust/jsparagus/js_parser/load_es_grammar.py

""" Functions for loading the ECMAScript lexical and syntactic grammars. """

from jsparagus.ordered import OrderedSet, OrderedFrozenSet
from jsparagus import gen, grammar
from .lexer import ECMASCRIPT_FULL_KEYWORDS, ECMASCRIPT_CONDITIONAL_KEYWORDS
from .parse_esgrammar import parse_esgrammar


# Synthetic terminals handed to the parser when loading the lexical grammar.
ECMASCRIPT_LEXICAL_SYNTHETIC_TERMINALS: grammar.SyntheticTerminalsDict = {
    # In theory `SourceCharacter` covers every Unicode character. Listing
    # them all would cost a lot of memory, and the set is not actually used
    # in practice, so it is deliberately left empty.
    "SourceCharacter": OrderedFrozenSet([]),
}

# Goal nonterminals passed as `goals` when parsing the lexical grammar.
ECMASCRIPT_LEXICAL_GOAL_NTS = [
    "WhiteSpace",
    "InputElementDiv",
    "InputElementRegExp",
]


def load_lexical_grammar(filename):
    """Read and parse the ECMAScript lexical grammar stored in `filename`.

    The file's text is handed to `parse_esgrammar` together with the lexical
    goal nonterminals and the lexical synthetic terminals declared in this
    module. The parsed grammar is then passed through
    `gen.expand_parameterized_nonterminals`, and that result is returned.
    """
    with open(filename) as source:
        text = source.read()

    synthetic = ECMASCRIPT_LEXICAL_SYNTHETIC_TERMINALS
    parsed = parse_esgrammar(
        text,
        filename=filename,
        goals=ECMASCRIPT_LEXICAL_GOAL_NTS,
        synthetic_terminals=synthetic,
        # The synthetic terminal names double as the grammar's terminal names.
        terminal_names=synthetic.keys(),
    )
    return gen.expand_parameterized_nonterminals(parsed)


# Goal nonterminals passed as `goals` when parsing the syntactic grammar.
ECMASCRIPT_SYNTACTIC_GOAL_NTS = [
    "Script",
    "Module",
    # Currently disabled goals:
    # "FormalParameters",
    # "FunctionBody",
]

# Identifiers need special care. A "synthetic terminal" is a shorthand symbol
# standing for any one terminal out of a set. *IdentifierName*, for instance,
# matches every token that looks like an identifier, keywords included.
#
# The sets below must be spelled with the terminal names the lexer emits.
# Apart from `Name`, the lexer output uses the terminal symbols of the
# syntactic grammar, some of which are nonterminals of the lexical grammar.
# In particular the syntactic grammar says `BooleanLiteral` rather than
# `true`/`false`, and `NullLiteral` rather than `null`.
ECMASCRIPT_SYNTHETIC_TERMINALS = {
    "IdentifierName": OrderedSet([
        "Name",
        "BooleanLiteral",
        "NullLiteral",
        "NameWithEscape",
        *ECMASCRIPT_FULL_KEYWORDS,
        *ECMASCRIPT_CONDITIONAL_KEYWORDS,
    ]) - OrderedSet([
        # Represented by `BooleanLiteral` / `NullLiteral` above, so the raw
        # keyword spellings are removed from the set.
        "true",
        "false",
        "null",
    ]),
    "Identifier": OrderedSet([
        "Name",
        "NameWithEscape",
        *ECMASCRIPT_CONDITIONAL_KEYWORDS,
    ]),
}

# Nonterminals of the lexical grammar that the syntactic grammar treats as
# terminals.
ECMASCRIPT_TOKEN_NAMES = [
    "BooleanLiteral",
    "IdentifierName",
    "PrivateIdentifier",
    "NoSubstitutionTemplate",
    "NullLiteral",
    "NumericLiteral",
    "BigIntLiteral",
    "RegularExpressionLiteral",
    "StringLiteral",
    "TemplateHead",
    "TemplateMiddle",
    "TemplateTail",
]

# Every non-keyword terminal produced by our (hand-coded) lexer.
#
# (The lexer's treatment of IdentifierName and related tokens deviates
# slightly from the spec; see `ECMASCRIPT_SYNTHETIC_TERMINALS` above.)
TERMINAL_NAMES_FOR_SYNTACTIC_GRAMMAR = [
    *ECMASCRIPT_TOKEN_NAMES,
    "Identifier",
    "Name",
]


def _extract_grammar_extensions(ext_filename):
    """Collect the body of each `grammar_extension!` macro in `ext_filename`.

    A macro starts on a line beginning with `grammar_extension!` (which must
    end with `{`) and ends on the next line beginning with `}`. The lines in
    between form the macro's content.

    Returns a list of `(ext_filename, start_line, content)` tuples, where
    `start_line` is the 1-based line number of the first content line.
    Macros with an empty body are skipped. A macro that is still open at the
    end of the file is dropped.
    """
    found = []
    with open(ext_filename) as ext_file:
        # `None` means "not inside a macro"; a list (possibly empty) means
        # "inside a macro, these are the content lines seen so far".
        body_lines = None
        start_line = 0
        for lineno, line in enumerate(ext_file):
            if line.startswith("grammar_extension!"):
                assert line.endswith("{\n"), (
                    "grammar_extension! line must end with '{'")
                body_lines = []
                # +2: enumerate starts at 0, while the first line is 1.
                # Also, the first content line is the one after this one.
                start_line = lineno + 2
                continue
            if body_lines is None:
                # Outside any macro: nothing to collect.
                continue
            if line.startswith("}"):
                # Close the macro even when its body is empty. Testing the
                # content for truthiness here would leave an empty macro
                # open and swallow the lines that follow it.
                if body_lines:
                    found.append(
                        (ext_filename, start_line, "".join(body_lines)))
                body_lines = None
                continue
            body_lines.append(line)
    return found


def load_syntactic_grammar(filename, extensions):
    """Load the ECMAScript syntactic grammar.

    `filename` is the path of the grammar file. `extensions` is an iterable
    of paths to files that may contain `grammar_extension!` macros; the body
    of each macro is extracted and passed to `parse_esgrammar` as a
    `(ext_filename, start_line, content)` tuple.

    Returns the grammar produced by `parse_esgrammar`.
    """
    with open(filename) as f:
        grammar_text = f.read()

    extensions_content = []
    for ext_filename in extensions:
        extensions_content.extend(_extract_grammar_extensions(ext_filename))

    g = parse_esgrammar(
        grammar_text,
        filename=filename,
        extensions=extensions_content,
        goals=ECMASCRIPT_SYNTACTIC_GOAL_NTS,
        synthetic_terminals=ECMASCRIPT_SYNTHETIC_TERMINALS,
        terminal_names=TERMINAL_NAMES_FOR_SYNTACTIC_GRAMMAR)

    return g