"""Vague approximation of an ECMAScript lexer.
|
|
|
|
A parser has two levels: the *lexer* scans bytes to produce tokens. The
|
|
*parser* consumes tokens and produces ASTs.
|
|
|
|
In a traditional design, the parser drives the process. It *pulls* one token at
|
|
a time from the lexer. However, for a parser that can accept arbitrary slabs of
|
|
data, scan them, then keep going, it makes more sense for the user to feed
|
|
those slabs to the lexer, which then *pushes* tokens to the parser. So that's
|
|
what we do.
|
|
|
|
Usage:
|
|
|
|
from js_parser.lexer import JSLexer
|
|
from js_parser.parser import JSParser
|
|
|
|
lexer = JSLexer(JSParser())
|
|
lexer.write(some_source_text)
|
|
lexer.write(some_more_source_text)
|
|
ast = lexer.close()
|
|
"""

import re
import jsparagus.lexer


def _get_punctuators():
    punctuators = '''
        &&= ||= ??=
        { ( ) [ ] . ... ; , < > <= >= == != === !== + - * % ** ++ --
        << >> >>> & | ^ ! ~ && || ? : = += -= *= %=
        **= <<= >>= >>>= &= |= ^= =>
    '''.split()

    return '|'.join(
        re.escape(token)
        for token in sorted(punctuators, key=len, reverse=True))


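# A quick illustration (not part of the module) of why the punctuators are
# sorted longest-first: regex alternation tries branches left to right, so
# '===' must come before '==' and '=' to be matched as a single token.
#
#     >>> re.match(_get_punctuators(), '===').group(0)
#     '==='
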
TOKEN_RE = re.compile(r'''(?x)
  (?:
      # WhiteSpace
      [\ \t\v\r\n\u00a0\u2028\u2029\ufeff]
      # SingleLineComment
    | // [^\r\n\u2028\u2029]* (?= [\r\n\u2028\u2029] | \Z )
      # MultiLineComment
    | /\* (?: [^*] | \*+[^/] )* \*+/
  )*
  (
      # Incomplete MultiLineComment
      /\* (?: [^*] | \*+[^/] )* \**
    | # Incomplete SingleLineComment
      // [^\r\n\u2028\u2029]*
    | # IdentifierName
      (?: [$_A-Za-z] | \\ u [0-9A-Fa-f]{4} | \\ u \{ [0-9A-Fa-f]+ \})
      (?: [$_0-9A-Za-z] | \\ u [0-9A-Fa-f]{4} | \\ u \{ [0-9A-Fa-f]+ \})*
    | # NumericLiteral
      [0-9][0-9A-Za-z]*(?:\.[0-9A-Za-z]*)?
    | \.[0-9][0-9A-Za-z]*
    | # Punctuator
      <INSERT_PUNCTUATORS>
    | # The slash special case
      /
    | # The curly brace special case
      }
    | # StringLiteral
      '
      # SingleStringCharacters
      (?:
          # SourceCharacter but not one of ' or \\ or LineTerminator
          # but also allow LINE SEPARATOR or PARAGRAPH SEPARATOR
          [^'\\\r\n]
        | \\ [^0-9xu\r\n\u2028\u2029]  # CharacterEscapeSequence
        | \\ x [0-9A-Fa-f]{2}          # HexEscapeSequence
        | \\ u [0-9A-Fa-f]{4}          # UnicodeEscapeSequence
        | \\ u \{ [0-9A-Fa-f]+ \}
        | \\\r\n?                      # LineContinuation
        | \\[\n\u2028\u2029]
      )*
      '
    | "
      # DoubleStringCharacters
      (?:
          # SourceCharacter but not one of " or \\ or LineTerminator
          # but also allow LINE SEPARATOR or PARAGRAPH SEPARATOR
          [^"\\\r\n]
        | \\ [^0-9xu\r\n\u2028\u2029]  # CharacterEscapeSequence
        | \\ x [0-9A-Fa-f]{2}          # HexEscapeSequence
        | \\ u [0-9A-Fa-f]{4}          # UnicodeEscapeSequence
        | \\ u \{ [0-9A-Fa-f]+ \}
        | \\\r\n?                      # LineContinuation
        | \\[\n\u2028\u2029]
      )*
      "
    | # Template
      ` (?: [^`\\$] | \\. )* (?: \${ | ` )
    | # illegal character or end of input (this branch matches no characters)
  )
'''.replace("<INSERT_PUNCTUATORS>", _get_punctuators()))

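# Sketch of TOKEN_RE in action (illustrative, not used by the module): the
# leading non-capturing group consumes whitespace and comments, and group 1
# captures the token itself.
#
#     >>> TOKEN_RE.match('  /* skip me */ foo = 1').group(1)
#     'foo'
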
DIV_RE = re.compile(r'(/=?)')

REGEXP_RE = re.compile(r'''(?x)
  (
    /
    (?:
        # RegularExpressionFirstChar - implemented using
        # RegularExpressionChars on the theory that we have already
        # ruled out the possibility of a comment.
        # RegularExpressionChars
        (?:
            # RegularExpressionNonTerminator but not one of \\ or / or [
            [^/\\\[\r\n\u2028\u2029]
          | # RegularExpressionBackslashSequence
            \\ [^\r\n\u2028\u2029]
          | # RegularExpressionClass
            \[
                # RegularExpressionClassChars
                (?:
                    # RegularExpressionNonTerminator but not one of ] or \\
                    [^]\\\r\n\u2028\u2029]
                  | # RegularExpressionBackslashSequence
                    \\ [^\r\n\u2028\u2029]
                )*
            \]
        )+
    )
    /
    (?: \w* )
  )
''')

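# The ambiguity these two patterns resolve (illustrative): the same leading
# slash can start a regexp literal or a division operator, and only the
# parser state can tell, hence the can_accept_terminal check in _match below.
#
#     >>> REGEXP_RE.match('/a*b/g').group(1)
#     '/a*b/g'
#     >>> DIV_RE.match('/=').group(1)
#     '/='
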
# Words that never match Identifier. (`await` and `yield` nonetheless
# conditionally match IdentifierReference, BindingIdentifier, and
# LabelIdentifier.)
#
# Technically the term for these is "reserved word", not "keyword", but
# whatever.
ECMASCRIPT_FULL_KEYWORDS = [
    'await',
    'break',
    'case',
    'catch',
    'class',
    'const',
    'continue',
    'debugger',
    'default',
    'delete',
    'do',
    'else',
    'enum',
    'export',
    'extends',
    'finally',
    'for',
    'function',
    'if',
    'import',
    'in',
    'instanceof',
    'new',
    'null',
    'return',
    'super',
    'switch',
    'this',
    'throw',
    'true',
    'false',
    'try',
    'typeof',
    'var',
    'void',
    'while',
    'with',
    'yield',
]

ECMASCRIPT_CONDITIONAL_KEYWORDS = [
    # Words that are identifiers except in strict mode
    'let',  # this one is also banned at the beginning of an ExpressionStatement
    'static',
    'implements',
    'interface',
    'package',
    'private',
    'protected',
    'public',

    # Words that are always allowed as identifiers, but are also keywords in
    # other contexts.
    'as',
    'async',
    'from',
    'get',
    'of',
    'set',
    'target',
]

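# For example (JavaScript, illustrative): `let` is only conditionally a
# keyword, so both of these lines are legal in sloppy mode:
#
#     let x = 1;    // `let` as a declaration keyword
#     var let = 2;  // `let` as an ordinary identifier
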
# Technically this set includes a reserved word that isn't currently being used
# as a keyword in the grammar: `enum`.
ALL_KEYWORDS = set(ECMASCRIPT_FULL_KEYWORDS + ECMASCRIPT_CONDITIONAL_KEYWORDS)


class JSLexer(jsparagus.lexer.FlatStringLexer):
    """Vague approximation of an ECMAScript lexer."""

    def __init__(self, parser, filename=None):
        super().__init__(parser, filename)

    def _match(self, closing):
        match = TOKEN_RE.match(self.src, self.point)
        assert match is not None

        if match.end() == len(self.src) and not closing:
            # The current token runs right up against the end of the current
            # chunk of source and thus might continue in the next chunk. Do not
            # move self.point.
            return None

        token = match.group(1)
        if token == '':
            # Whitespace followed by end of input or illegal character.
            if match.end() == len(self.src):
                # End of input. Success!
                assert closing
                self.point = match.end()
                return None
            else:
                c = self.src[match.end()]
                self.throw("unexpected character: {!r}".format(c))

        c = token[0]
        t = None
        # Note: '.' and '...' are punctuators; only '.5'-style tokens starting
        # with '.' are numeric literals.
        if c.isdigit() or (c == '.' and token not in ('.', '...')):
            t = 'NumericLiteral'
        elif c.isalpha() or c in '$_':
            if token in ALL_KEYWORDS:  # TODO support strict mode
                if token == 'null':
                    t = 'NullLiteral'
                elif token in ('true', 'false'):
                    t = 'BooleanLiteral'
                else:
                    t = token
            else:
                t = 'Name'
        elif c == '/':
            if token.startswith(('/*', '//')):
                # Incomplete comment. (In non-closing mode, this is handled
                # above, immediately after the match.)
                assert match.end() == len(self.src)
                assert closing
                self.point = len(self.src)
                self.throw("incomplete comment at end of source")

            # We choose RegExp vs. division based on what the parser can
            # accept, a literal implementation of the spec.
            #
            # To make this correct in combination with end-of-line ASI, make
            # the parser rewind the lexer one token and ask for it again in
            # that case, so that the lexer asks the can-accept question again.
            point = match.start(1)
            if self.parser.can_accept_terminal(self, 'RegularExpressionLiteral'):
                match = REGEXP_RE.match(self.src, point)
                if match is None:
                    if closing:
                        self.throw("unterminated regexp literal")
                    else:
                        return None
                token = 'RegularExpressionLiteral'
            else:
                match = DIV_RE.match(self.src, point)
                token = match.group(1)

            if not closing and match.end() == len(self.src):
                # At the end of a chunk, `/a*b/` could be the start of
                # `/a*b/g`, and `/` could be the start of `/=`.
                return None

            t = token
        elif c == '`':
            if token.endswith('`'):
                t = 'NoSubstitutionTemplate'
            else:
                t = 'TemplateHead'
        elif c == '"' or c == "'":
            t = 'StringLiteral'
        elif c == '}':
            # TODO: TemplateTail
            t = token
        elif c in '{()[];,~?:.<>=!+-*%&|^':
            t = token
        else:
            assert False

        self._current_match = match
        self.previous_token_end = self.point
        self.current_token_start = match.start(1)
        self.point = match.end()
        return t

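    # Illustrative of the deferrals above (JSParser as in the module
    # docstring; the chunk values here are made up): a token may straddle a
    # chunk boundary, so _match returns None rather than guessing.
    #
    #     lexer = JSLexer(JSParser())
    #     lexer.write('x = 12')  # '12' might continue, e.g. as '123'
    #     lexer.write('3;')      # now the complete token '123' is visible
    #     ast = lexer.close()
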
    def take(self):
        return self._current_match.group(1)

    def saw_line_terminator(self):
        """True if there's a LineTerminator before the current token."""
        i = self.previous_token_end
        j = self.current_token_start
        ws_between = self.src[i:j]
        return any(c in ws_between for c in '\r\n\u2028\u2029')

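    # Illustrative (JavaScript): saw_line_terminator is what makes restricted
    # productions and ASI possible. With a line terminator after `return`,
    # the parser must insert a semicolon:
    #
    #     return      // parsed as `return;`
    #         x + y;  // a separate ExpressionStatement
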
    def can_close(self):
        # Match from self.point, not from 0: only the remaining, unconsumed
        # input matters when deciding whether we can stop here.
        match = TOKEN_RE.match(self.src, self.point)
        return match.group(1) == '' and self.parser.can_close()