forked from mirrors/gecko-dev
1567 lines
50 KiB
Python
Executable file
1567 lines
50 KiB
Python
Executable file
#!/usr/bin/env python
|
|
# -*- coding: utf-8 -*-
|
|
# Based upon makeunicodedata.py
|
|
# (http://hg.python.org/cpython/file/c8192197d23d/Tools/unicode/makeunicodedata.py)
|
|
# written by Fredrik Lundh (fredrik@pythonware.com)
|
|
#
|
|
# Copyright (C) 2011 Tom Schuster <evilpies@gmail.com>
|
|
#
|
|
# This program is free software: you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
import csv
|
|
import io
|
|
import os
|
|
import re
|
|
import sys
|
|
from contextlib import closing
|
|
from functools import partial
|
|
from itertools import chain, tee
|
|
from operator import is_not, itemgetter
|
|
from zipfile import ZipFile
|
|
|
|
if sys.version_info.major == 2:
|
|
from itertools import ifilter as filter
|
|
from itertools import imap as map
|
|
from itertools import izip_longest as zip_longest
|
|
|
|
from urllib2 import urlopen
|
|
|
|
range = xrange
|
|
else:
|
|
from itertools import zip_longest
|
|
from urllib.request import urlopen
|
|
|
|
|
|
class codepoint_dict(dict):
    """A dict mapping a code point to its (upper, lower, name, alias) record,
    with helpers producing human-readable descriptions of entries."""

    def name(self, code_point):
        """Return the character name, with its alias in parentheses if any."""
        entry = self[code_point]
        alias_suffix = " (" + entry[3] + ")" if entry[3] else ""
        return "{}{}".format(entry[2], alias_suffix)

    def full_name(self, code_point):
        """Return the character name prefixed with U+XXXX notation."""
        entry = self[code_point]
        alias_suffix = " (" + entry[3] + ")" if entry[3] else ""
        return "U+{:04X} {}{}".format(code_point, entry[2], alias_suffix)
|
|
|
|
|
|
# ECMAScript 2016
# §11.2 White Space
whitespace = [
    # python doesn't support using control character names :(
    0x9, # CHARACTER TABULATION
    0xB, # LINE TABULATION
    0xC, # FORM FEED
    ord("\N{SPACE}"),
    ord("\N{NO-BREAK SPACE}"),
    ord("\N{ZERO WIDTH NO-BREAK SPACE}"), # also BOM
]

# §11.3 Line Terminators
line_terminator = [
    0xA, # LINE FEED
    0xD, # CARRIAGE RETURN
    ord("\N{LINE SEPARATOR}"),
    ord("\N{PARAGRAPH SEPARATOR}"),
]

# These are also part of IdentifierPart §11.6 Names and Keywords
compatibility_identifier_part = [
    ord("\N{ZERO WIDTH NON-JOINER}"),
    ord("\N{ZERO WIDTH JOINER}"),
]

# Per-code-point flag bits stored in the generated lookup table.
FLAG_SPACE = 1 << 0  # WhiteSpace or LineTerminator
FLAG_UNICODE_ID_START = 1 << 1  # has the ID_Start property
FLAG_UNICODE_ID_CONTINUE_ONLY = 1 << 2  # ID_Continue but not ID_Start

# Highest code point in the Basic Multilingual Plane.
MAX_BMP = 0xFFFF

# Boilerplate headers emitted at the top of generated files.
public_domain = """
/*
 * Any copyright is dedicated to the Public Domain.
 * http://creativecommons.org/licenses/publicdomain/
 */
"""

mpl_license = """\
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
 * vim: set ts=8 sts=4 et sw=4 tw=99:
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
"""

warning_message = """\
/* Generated by make_unicode.py DO NOT MODIFY */
"""

unicode_version_message = """\
/* Unicode version: {0} */
"""
|
|
|
|
|
|
def read_unicode_data(unicode_data):
    """
    Yield one row (a list of semicolon-separated fields) per code point from
    an open UnicodeData.txt file, with the first field already converted from
    hex to an int.  UAX #44 4.2.3 code point ranges (rows named
    "<..., First>" / "<..., Last>") are expanded into one row per contained
    code point.

    If you want to understand how this wonderful file format works checkout
    Unicode Standard Annex #44 - Unicode Character Database
    http://www.unicode.org/reports/tr44/
    """

    reader = csv.reader(unicode_data, delimiter=str(";"))

    while True:
        row = next(reader, None)
        if row is None:
            return
        name = row[1]

        # We need to expand the UAX #44 4.2.3 Code Point Range
        if name.startswith("<") and name.endswith("First>"):
            next_row = next(reader)

            # Strip the leading "<" and the trailing ", First>" (8 chars)
            # to recover the plain range name.
            range_name = name[1:-8]

            # Yield a fresh list per code point.  The previous version
            # mutated and re-yielded a single row object, so callers that
            # retained yielded rows (e.g. list(read_unicode_data(...)))
            # would observe only the last code point of the range.
            for i in range(int(row[0], 16), int(next_row[0], 16) + 1):
                expanded = list(row)
                expanded[0] = i
                expanded[1] = range_name

                yield expanded
        else:
            row[0] = int(row[0], 16)
            yield row
|
|
|
|
|
|
def read_case_folding(case_folding):
    """
    Yield (code, mapping) pairs from an open CaseFolding.txt file.

    File format is:
    <code>; <status>; <mapping>; # <name>

    Only (C)ommon and (S)imple foldings are yielded; (F)ull and (T)urkic
    entries are skipped.
    """

    for line in case_folding:
        # Skip blank lines and comment lines.
        if line.startswith("#") or line == "\n":
            continue
        fields = line.split("; ")
        status = fields[1]
        if status in ("F", "T"):
            continue
        assert status in ["C", "S"], "expect either (C)ommon or (S)imple case foldings"
        yield (int(fields[0], 16), int(fields[2], 16))
|
|
|
|
|
|
def read_derived_core_properties(derived_core_properties):
    """
    Yield (code_point, property_name) pairs from an open
    DerivedCoreProperties.txt file.

    Lines look like "0061..007A ; ID_Start # ..."; ranges written with ".."
    are expanded into one pair per contained code point.
    """

    for line in derived_core_properties:
        # Skip blank lines and comment-only lines.
        if line.startswith("#") or line == "\n":
            continue
        fields = line.split("#")[0].split(";")
        span = fields[0].strip()
        prop = fields[1].strip()
        if ".." in span:
            (first, last) = span.split("..")
            for code in range(int(first, 16), int(last, 16) + 1):
                yield (code, prop)
        else:
            yield (int(span, 16), prop)
|
|
|
|
|
|
def read_special_casing(special_casing):
    """
    Yield (code, lower, upper, languages, contexts) tuples from an open
    SpecialCasing.txt file.

    Format:
    <code>; <lower>; <title>; <upper>; (<condition_list>;)? # <comment>

    |lower| and |upper| are lists of code points; |languages| and |contexts|
    hold the language ids (lowercase, e.g. "tr") and casing contexts
    (capitalized, e.g. "Final_Sigma") parsed from the optional condition list.

    Change: removed a stray no-op `pass` statement that followed the
    condition-parsing loop.
    """
    for line in special_casing:
        if line == "\n" or line.startswith("#"):
            continue
        row = line.split("#")[0].split(";")
        code = int(row[0].strip(), 16)
        lower = row[1].strip()
        lower = [int(c, 16) for c in lower.split(" ")] if lower else []
        upper = row[3].strip()
        upper = [int(c, 16) for c in upper.split(" ")] if upper else []
        languages = []
        contexts = []
        condition = row[4].strip()
        if condition:
            for cond in condition.split(" "):
                # Language ids begin with a lowercase letter, casing
                # contexts with an uppercase one.
                if cond[0].islower():
                    languages.append(cond)
                else:
                    contexts.append(cond)
        yield (code, lower, upper, languages, contexts)
|
|
|
|
|
|
def int_ranges(ints):
    """Yields consecutive ranges (inclusive) from integer values."""
    (current_iter, successor_iter) = tee(sorted(ints))
    start = next(successor_iter)
    # Walk each value alongside its successor; a gap ends the current run.
    for value, successor in zip_longest(current_iter, successor_iter):
        if successor != value + 1:
            yield (start, value)
            start = successor
|
|
|
|
|
|
def utf16_encode(code):
    """Split a non-BMP code point into its UTF-16 (lead, trail) surrogate pair."""
    SupplementaryBase = 0x10000
    LeadBase = 0xD800
    TrailBase = 0xDC00

    offset = code - SupplementaryBase
    return (offset // 1024 + LeadBase, offset % 1024 + TrailBase)
|
|
|
|
|
|
def make_non_bmp_convert_macro(out_file, name, convert_map, codepoint_table):
    """
    Write a FOR_EACH_NON_BMP_<name>(MACRO) X-macro definition to |out_file|.

    |convert_map| maps non-BMP code points to their converted code points.
    Runs of consecutive code points that share the same conversion delta and
    the same lead surrogate are merged into a single MACRO(...) entry.
    """
    # Find continuous range in convert_map.
    convert_list = []
    entry = None
    for code in sorted(convert_map.keys()):
        lead, trail = utf16_encode(code)
        converted = convert_map[code]
        diff = converted - code

        # Extend the current run when this code point continues it with the
        # same delta and does not cross a lead-surrogate boundary.
        if (
            entry
            and code == entry["code"] + entry["length"]
            and diff == entry["diff"]
            and lead == entry["lead"]
        ):
            entry["length"] += 1
            continue

        # Start a new run.
        entry = {
            "code": code,
            "diff": diff,
            "length": 1,
            "lead": lead,
            "trail": trail,
        }
        convert_list.append(entry)

    # Generate macro call for each range.
    lines = []
    comment = []
    for entry in convert_list:
        from_code = entry["code"]
        to_code = entry["code"] + entry["length"] - 1
        diff = entry["diff"]

        lead = entry["lead"]
        from_trail = entry["trail"]
        to_trail = entry["trail"] + entry["length"] - 1

        lines.append(
            " MACRO(0x{:x}, 0x{:x}, 0x{:x}, 0x{:x}, 0x{:x}, {:d})".format(
                from_code, to_code, lead, from_trail, to_trail, diff
            )
        )
        comment.append(
            "// {} .. {}".format(
                codepoint_table.full_name(from_code), codepoint_table.full_name(to_code)
            )
        )

    # Emit the range comments, then the macro definition with continuation
    # backslashes joining the per-range MACRO lines.
    out_file.write("\n".join(comment))
    out_file.write("\n")
    out_file.write("#define FOR_EACH_NON_BMP_{}(MACRO) \\\n".format(name))
    out_file.write(" \\\n".join(lines))
    out_file.write("\n")
|
|
|
|
|
|
def process_derived_core_properties(derived_core_properties):
    """Collect the ID_Start and ID_Continue code point sets from an open
    DerivedCoreProperties.txt file."""
    wanted = {"ID_Start": set(), "ID_Continue": set()}

    for char, prop in read_derived_core_properties(derived_core_properties):
        if prop in wanted:
            wanted[prop].add(char)

    return (wanted["ID_Start"], wanted["ID_Continue"])
|
|
|
|
|
|
def process_unicode_data(unicode_data, derived_core_properties):
    """
    Build the BMP lookup tables and non-BMP maps from UnicodeData.txt.

    Returns a tuple of:
      table                - deduplicated list of (upper_delta, lower_delta,
                             flags) entries; deltas are 16-bit wrapped values
      index                - table index for every BMP code point
      non_bmp_lower_map    - code point -> lower case (non-BMP only)
      non_bmp_upper_map    - code point -> upper case (non-BMP only)
      non_bmp_space_set    - dict used as a set of non-BMP Zs code points
      non_bmp_id_start_set - dict used as a set of non-BMP ID_Start points
      non_bmp_id_cont_set  - dict used as a set of non-BMP ID_Continue points
      codepoint_table      - codepoint_dict of all characters
      test_space_table     - list of every space code point (for tests)
    """
    # Index 0 is a shared dummy entry for code points with no casing/flags.
    dummy = (0, 0, 0)
    table = [dummy]
    cache = {dummy: 0}
    index = [0] * (MAX_BMP + 1)

    codepoint_table = codepoint_dict()
    test_space_table = []

    non_bmp_lower_map = {}
    non_bmp_upper_map = {}
    non_bmp_id_start_set = {}
    non_bmp_id_cont_set = {}
    non_bmp_space_set = {}

    (id_start, id_continue) = process_derived_core_properties(derived_core_properties)

    for row in read_unicode_data(unicode_data):
        code = row[0]
        name = row[1]
        category = row[2]
        # Fields addressed from the end of the row: Unicode_1_Name (alias),
        # Simple_Uppercase_Mapping and Simple_Lowercase_Mapping.
        alias = row[-5]
        uppercase = row[-3]
        lowercase = row[-2]

        # An empty mapping field means the character maps to itself.
        if uppercase:
            upper = int(uppercase, 16)
        else:
            upper = code

        if lowercase:
            lower = int(lowercase, 16)
        else:
            lower = code

        codepoint_table[code] = (upper, lower, name, alias)

        # Non-BMP characters go into separate maps/sets instead of the
        # dense BMP index table.
        if code > MAX_BMP:
            if code != lower:
                non_bmp_lower_map[code] = lower
            if code != upper:
                non_bmp_upper_map[code] = upper
            if category == "Zs":
                non_bmp_space_set[code] = 1
                test_space_table.append(code)
            if code in id_start:
                non_bmp_id_start_set[code] = 1
            if code in id_continue:
                non_bmp_id_cont_set[code] = 1
            continue

        # BMP characters must case-map within the BMP.
        assert lower <= MAX_BMP and upper <= MAX_BMP

        flags = 0

        # we combine whitespace and lineterminators because in practice we don't need them separated
        if category == "Zs" or code in whitespace or code in line_terminator:
            flags |= FLAG_SPACE
            test_space_table.append(code)

        # §11.6 (IdentifierStart)
        if code in id_start:
            flags |= FLAG_UNICODE_ID_START

        # §11.6 (IdentifierPart)
        elif code in id_continue or code in compatibility_identifier_part:
            flags |= FLAG_UNICODE_ID_CONTINUE_ONLY

        # Store case mappings as deltas so identical-delta characters can
        # share one table entry.
        up_d = upper - code
        low_d = lower - code

        assert up_d > -65535 and up_d < 65535
        assert low_d > -65535 and low_d < 65535

        # Wrap the deltas to unsigned 16-bit; consumers add and re-mask.
        upper = up_d & 0xFFFF
        lower = low_d & 0xFFFF

        item = (upper, lower, flags)

        # Deduplicate entries via the cache; index maps code point -> entry.
        i = cache.get(item)
        if i is None:
            assert item not in table
            cache[item] = i = len(table)
            table.append(item)
        index[code] = i

    return (
        table,
        index,
        non_bmp_lower_map,
        non_bmp_upper_map,
        non_bmp_space_set,
        non_bmp_id_start_set,
        non_bmp_id_cont_set,
        codepoint_table,
        test_space_table,
    )
|
|
|
|
|
|
def process_case_folding(case_folding):
    """
    Build the BMP case-folding tables from an open CaseFolding.txt file.

    Returns a tuple of:
      folding_table - deduplicated list of 1-tuples holding the 16-bit
                      wrapped folding delta
      folding_index - folding_table index for every BMP code point
      folding_tests - per-character lists [code, folding?, *reverse_foldings]
                      used by the generated tests
    """
    folding_map = {}
    # Reverse map: folded code point -> all code points folding to it.
    rev_folding_map = {}
    # Index 0 is a shared dummy entry for code points with no folding.
    folding_dummy = (0,)
    folding_table = [folding_dummy]
    folding_cache = {folding_dummy: 0}
    folding_index = [0] * (MAX_BMP + 1)

    folding_tests = []
    folding_codes = set()

    for code, mapping in read_case_folding(case_folding):
        folding_map[code] = mapping

        if mapping not in rev_folding_map:
            rev_folding_map[mapping] = [code]
        else:
            rev_folding_map[mapping].append(code)

        folding_codes.add(code)
        folding_codes.add(mapping)

    for code in sorted(folding_codes):
        # A code point with no entry folds to itself.
        if code in folding_map:
            folding = folding_map[code]
        else:
            folding = code

        # Characters that fold to |code|, or to the same target as |code|
        # (excluding |code| itself).
        if code in rev_folding_map:
            rev_folding = rev_folding_map[code]
        elif folding in rev_folding_map:
            rev_folding = [c for c in rev_folding_map[folding] if c != code]
        else:
            rev_folding = []

        # Record a test row for every character that participates in folding.
        if folding != code or len(rev_folding):
            item = [code]
            if folding != code:
                item.append(folding)
            folding_tests.append(item + rev_folding)

        # Only BMP characters enter the dense index table.
        if code > MAX_BMP:
            continue

        folding_d = folding - code

        assert folding_d > -65535 and folding_d < 65535

        # Wrap the delta to unsigned 16-bit; consumers add and re-mask.
        folding = folding_d & 0xFFFF

        item = (folding,)

        # Deduplicate entries via the cache.
        i = folding_cache.get(item)
        if i is None:
            assert item not in folding_table
            folding_cache[item] = i = len(folding_table)
            folding_table.append(item)
        folding_index[code] = i
    return (folding_table, folding_index, folding_tests)
|
|
|
|
|
|
def process_special_casing(special_casing, table, index):
    """
    Parse SpecialCasing.txt and validate invariants the C++ code relies on.

    |table| and |index| are the BMP tables from process_unicode_data(); they
    supply the default (simple) case mappings used to decide whether an entry
    is actually "special".  Returns (unconditional_tolower,
    unconditional_toupper), mapping code point -> list of code points.
    The numerous asserts pin assumptions that are hard-coded in jsstr.cpp;
    they are expected to fail loudly if a future Unicode version changes them.
    """
    # Unconditional special casing.
    unconditional_tolower = {}
    unconditional_toupper = {}

    # Conditional special casing, language independent.
    conditional_tolower = {}
    conditional_toupper = {}

    # Conditional special casing, language dependent.
    lang_conditional_tolower = {}
    lang_conditional_toupper = {}

    def caseInfo(code):
        # Decode the delta-encoded table entry into (lower, upper).
        (upper, lower, flags) = table[index[code]]
        return ((code + lower) & 0xFFFF, (code + upper) & 0xFFFF)

    for code, lower, upper, languages, contexts in read_special_casing(special_casing):
        assert code <= MAX_BMP, "Unexpected character outside of BMP: %s" % code
        assert len(languages) <= 1, "Expected zero or one language ids: %s" % languages
        assert len(contexts) <= 1, (
            "Expected zero or one casing contexts: %s" % languages
        )

        # An entry is "special" when it differs from the simple mapping.
        (default_lower, default_upper) = caseInfo(code)
        special_lower = len(lower) != 1 or lower[0] != default_lower
        special_upper = len(upper) != 1 or upper[0] != default_upper

        # Invariant: If |code| has casing per UnicodeData.txt, then it also has
        # casing rules in SpecialCasing.txt.
        assert code == default_lower or len(lower) != 1 or code != lower[0]
        assert code == default_upper or len(upper) != 1 or code != upper[0]

        language = languages[0] if languages else None
        context = contexts[0] if contexts else None

        # Bucket the entry by (language?, context?) combination.
        if not language and not context:
            if special_lower:
                unconditional_tolower[code] = lower
            if special_upper:
                unconditional_toupper[code] = upper
        elif not language and context:
            if special_lower:
                conditional_tolower[code] = (lower, context)
            if special_upper:
                conditional_toupper[code] = (upper, context)
        else:
            if language not in lang_conditional_tolower:
                lang_conditional_tolower[language] = {}
                lang_conditional_toupper[language] = {}
            if special_lower:
                lang_conditional_tolower[language][code] = (lower, context)
            if special_upper:
                lang_conditional_toupper[language][code] = (upper, context)

    # Certain special casing rules are inlined in jsstr.cpp, ensure these cases
    # still match the current SpecialCasing.txt file.
    def lowerCase(code):
        # Simple lower-case mapping from the BMP table.
        (lower, _) = caseInfo(code)
        return lower

    def upperCase(code):
        # Simple upper-case mapping from the BMP table.
        (_, upper) = caseInfo(code)
        return upper

    def ascii(char_dict):
        # Keys restricted to the ASCII range.
        return (ch for ch in char_dict.keys() if ch <= 0x7F)

    def latin1(char_dict):
        # Keys restricted to the Latin-1 range.
        return (ch for ch in char_dict.keys() if ch <= 0xFF)

    def is_empty(iterable):
        return not any(True for _ in iterable)

    def is_equals(iter1, iter2):
        return all(x == y for (x, y) in zip_longest(iter1, iter2))

    # Ensure no ASCII characters have special case mappings.
    assert is_empty(ascii(unconditional_tolower))
    assert is_empty(ascii(unconditional_toupper))
    assert is_empty(ascii(conditional_tolower))
    assert is_empty(ascii(conditional_toupper))

    # Ensure no Latin1 characters have special lower case mappings.
    assert is_empty(latin1(unconditional_tolower))
    assert is_empty(latin1(conditional_tolower))

    # Ensure no Latin1 characters have conditional special upper case mappings.
    assert is_empty(latin1(conditional_toupper))

    # Ensure U+00DF is the only Latin1 character with a special upper case mapping.
    assert is_equals([0x00DF], latin1(unconditional_toupper))

    # Ensure U+0130 is the only character with a special lower case mapping.
    assert is_equals([0x0130], unconditional_tolower)

    # Ensure no characters have language independent conditional upper case mappings.
    assert is_empty(conditional_toupper)

    # Ensure U+03A3 is the only character with language independent conditional lower case mapping.
    assert is_equals([0x03A3], conditional_tolower)

    # Verify U+0130 and U+03A3 have simple lower case mappings.
    assert all(ch != lowerCase(ch) for ch in [0x0130, 0x03A3])

    # Ensure Azeri, Lithuanian, and Turkish are the only languages with conditional case mappings.
    assert is_equals(["az", "lt", "tr"], sorted(lang_conditional_tolower.keys()))
    assert is_equals(["az", "lt", "tr"], sorted(lang_conditional_toupper.keys()))

    # Maximum case mapping length is three characters.
    assert (
        max(
            map(
                len,
                chain(
                    unconditional_tolower.values(),
                    unconditional_toupper.values(),
                    map(itemgetter(0), conditional_tolower.values()),
                    map(itemgetter(0), conditional_toupper.values()),
                    map(
                        itemgetter(0),
                        chain.from_iterable(
                            d.values() for d in lang_conditional_tolower.values()
                        ),
                    ),
                    map(
                        itemgetter(0),
                        chain.from_iterable(
                            d.values() for d in lang_conditional_toupper.values()
                        ),
                    ),
                ),
            )
        )
        <= 3
    )

    # Ensure all case mapping contexts are known (see Unicode 9.0, §3.13 Default Case Algorithms).
    assert set(
        [
            "After_I",
            "After_Soft_Dotted",
            "Final_Sigma",
            "More_Above",
            "Not_Before_Dot",
        ]
    ).issuperset(
        set(
            filter(
                partial(is_not, None),
                chain(
                    map(itemgetter(1), conditional_tolower.values()),
                    map(itemgetter(1), conditional_toupper.values()),
                    map(
                        itemgetter(1),
                        chain.from_iterable(
                            d.values() for d in lang_conditional_tolower.values()
                        ),
                    ),
                    map(
                        itemgetter(1),
                        chain.from_iterable(
                            d.values() for d in lang_conditional_toupper.values()
                        ),
                    ),
                ),
            )
        )
    )

    # Special casing for U+00DF (LATIN SMALL LETTER SHARP S).
    assert upperCase(0x00DF) == 0x00DF and unconditional_toupper[0x00DF] == [
        0x0053,
        0x0053,
    ]

    # Special casing for U+0130 (LATIN CAPITAL LETTER I WITH DOT ABOVE).
    assert unconditional_tolower[0x0130] == [0x0069, 0x0307]

    # Special casing for U+03A3 (GREEK CAPITAL LETTER SIGMA).
    assert lowerCase(0x03A3) == 0x03C3 and conditional_tolower[0x03A3] == (
        [0x03C2],
        "Final_Sigma",
    )

    return (unconditional_tolower, unconditional_toupper)
|
|
|
|
|
|
def make_non_bmp_file(version, non_bmp_lower_map, non_bmp_upper_map, codepoint_table):
    """
    Generate UnicodeNonBMP.h containing the FOR_EACH_NON_BMP_LOWERCASE and
    FOR_EACH_NON_BMP_UPPERCASE X-macros for non-BMP case conversion.

    Fix: the emitted header comment labelled the trail surrogate of TO as
    "TRAIL_FROM" (a copy-paste typo); it now reads "TRAIL_TO" to match the
    macro's parameter list.
    """
    file_name = "UnicodeNonBMP.h"
    with io.open(file_name, mode="w", encoding="utf-8") as non_bmp_file:
        non_bmp_file.write(mpl_license)
        non_bmp_file.write("\n")
        non_bmp_file.write(warning_message)
        non_bmp_file.write(unicode_version_message.format(version))
        non_bmp_file.write(
            """
#ifndef util_UnicodeNonBMP_h
#define util_UnicodeNonBMP_h

// |MACRO| receives the following arguments
// MACRO(FROM, TO, LEAD, TRAIL_FROM, TRAIL_TO, DIFF)
// FROM: code point where the range starts
// TO: code point where the range ends
// LEAD: common lead surrogate of FROM and TO
// TRAIL_FROM: trail surrogate of FROM
// TRAIL_TO: trail surrogate of TO
// DIFF: the difference between the code point in the range and
// converted code point

"""
        )

        make_non_bmp_convert_macro(
            non_bmp_file, "LOWERCASE", non_bmp_lower_map, codepoint_table
        )
        non_bmp_file.write("\n")
        make_non_bmp_convert_macro(
            non_bmp_file, "UPPERCASE", non_bmp_upper_map, codepoint_table
        )

        non_bmp_file.write(
            """
#endif /* util_UnicodeNonBMP_h */
"""
        )
|
|
|
|
|
|
def write_special_casing_methods(unconditional_toupper, codepoint_table, println):
    """
    Emit (via |println|) the C++ definitions of
    js::unicode::ChangesWhenUpperCasedSpecialCasing,
    js::unicode::LengthUpperCaseSpecialCasing and
    js::unicode::AppendUpperCaseSpecialCasing, driven by the unconditional
    upper-case special-casing map.
    """
    def hexlit(n):
        """Returns C++ hex-literal for |n|."""
        return "0x{:04X}".format(n)

    def describe_range(ranges, depth):
        # Emit a "// U+xxxx NAME [.. U+yyyy NAME]" comment per range.
        indent = depth * " "
        for start, end in ranges:
            if start == end:
                println(indent, "// {}".format(codepoint_table.full_name(start)))
            else:
                println(
                    indent,
                    "// {} .. {}".format(
                        codepoint_table.full_name(start), codepoint_table.full_name(end)
                    ),
                )

    def out_range(start, end):
        """Tests if the input character isn't a member of the set {x | start <= x <= end}."""
        if start == end:
            return "ch != {}".format(hexlit(start))
        return "ch < {} || ch > {}".format(hexlit(start), hexlit(end))

    def in_range(start, end, parenthesize=False):
        """Tests if the input character is in the set {x | start <= x <= end}."""
        if start == end:
            return "ch == {}".format(hexlit(start))
        (left, right) = ("(", ")") if parenthesize else ("", "")
        return "{}ch >= {} && ch <= {}{}".format(
            left, hexlit(start), hexlit(end), right
        )

    def in_any_range(ranges, spaces):
        """Tests if the input character is included in any of the given ranges."""
        # Greedily pack "||"-joined subexpressions into lines shorter than
        # the 100-column budget, continuing on |spaces|-indented lines.
        lines = [[]]
        for start, end in ranges:
            expr = in_range(start, end, parenthesize=True)
            line = " || ".join(lines[-1] + [expr])
            if len(line) < (100 - len(spaces) - len(" ||")):
                lines[-1].append(expr)
            else:
                lines.append([expr])
        return " ||\n{}".format(spaces).join(" || ".join(t) for t in lines)

    def write_range_accept(parent_list, child_list, depth):
        """Accepts the input character if it matches any code unit in |child_list|."""
        (min_parent, max_parent) = (parent_list[0], parent_list[-1])
        (min_child, max_child) = (child_list[0], child_list[-1])
        assert min_child >= min_parent
        assert max_child <= max_parent
        indent = depth * " "

        child_ranges = list(int_ranges(child_list))
        has_successor = max_child != max_parent

        # If |child_list| is a contiguous list of code units, emit a simple
        # range check: |min_child <= input <= max_child|.
        if len(child_ranges) == 1:
            describe_range(child_ranges, depth)
            if has_successor:
                println(indent, "if (ch <= {}) {{".format(hexlit(max_child)))
                println(indent, " return ch >= {};".format(hexlit(min_child)))
                println(indent, "}")
            else:
                println(indent, "return {};".format(in_range(min_child, max_child)))
            return

        # Otherwise create a disjunction over the subranges in |child_ranges|.
        if not has_successor:
            spaces = indent + len("return ") * " "
        else:
            spaces = indent + len(" return ") * " "
        range_test_expr = in_any_range(child_ranges, spaces)

        if min_child != min_parent:
            println(indent, "if (ch < {}) {{".format(hexlit(min_child)))
            println(indent, " return false;")
            println(indent, "}")

        # If there's no successor block, we can omit the |input <= max_child| check,
        # because it was already checked when we emitted the parent range test.
        if not has_successor:
            describe_range(child_ranges, depth)
            println(indent, "return {};".format(range_test_expr))
        else:
            println(indent, "if (ch <= {}) {{".format(hexlit(max_child)))
            describe_range(child_ranges, depth + 1)
            println(indent, " return {};".format(range_test_expr))
            println(indent, "}")

    def write_ChangesWhenUpperCasedSpecialCasing():
        """Checks if the input has a special upper case mapping."""
        println("bool")
        println("js::unicode::ChangesWhenUpperCasedSpecialCasing(char16_t ch)")
        println("{")

        assert unconditional_toupper, "|unconditional_toupper| is not empty"

        # Sorted list of code units with special upper case mappings.
        code_list = sorted(unconditional_toupper.keys())

        # Fail-fast if the input character isn't a special casing character.
        println(" if ({}) {{".format(out_range(code_list[0], code_list[-1])))
        println(" return false;")
        println(" }")

        # Partition the code units into 4096-wide buckets and emit a
        # nested-range test per non-empty bucket.
        for i in range(0, 16):
            # Check if the input characters is in the range:
            # |start_point <= input < end_point|.
            start_point = i << 12
            end_point = (i + 1) << 12
            matches = [cu for cu in code_list if start_point <= cu < end_point]

            # Skip empty ranges.
            if not matches:
                continue

            # If |matches| consists of only a few characters, directly check
            # the input against the characters in |matches|.
            if len(matches) <= 8:
                write_range_accept(code_list, matches, depth=1)
                continue

            # Otherwise split into further subranges.

            # Only enter the if-block if the input is less-or-equals to the
            # largest value in the current range.
            is_last_block = matches[-1] == code_list[-1]
            if not is_last_block:
                println(" if (ch <= {}) {{".format(hexlit(matches[-1])))
            else:
                println(" if (ch < {}) {{".format(hexlit(matches[0])))
                println(" return false;")
                println(" }")

            # Second-level split into 256-wide buckets.
            for j in range(0, 16):
                inner_start = start_point + (j << 8)
                inner_end = start_point + ((j + 1) << 8)
                inner_matches = [cu for cu in matches if inner_start <= cu < inner_end]

                if inner_matches:
                    d = 1 if is_last_block else 2
                    write_range_accept(matches, inner_matches, depth=d)

            if not is_last_block:
                println(" }")

        println("}")

    def write_LengthUpperCaseSpecialCasing():
        """Slow case: Special casing character was found, returns its mapping length."""
        println("size_t")
        println("js::unicode::LengthUpperCaseSpecialCasing(char16_t ch)")
        println("{")

        println(" switch(ch) {")
        for code, converted in sorted(unconditional_toupper.items(), key=itemgetter(0)):
            println(
                " case {}: return {}; // {}".format(
                    hexlit(code), len(converted), codepoint_table.name(code)
                )
            )
        println(" }")
        println("")
        println(' MOZ_ASSERT_UNREACHABLE("Bad character input.");')
        println(" return 0;")

        println("}")

    def write_AppendUpperCaseSpecialCasing():
        """Slow case: Special casing character was found, append its mapping characters."""
        println("void")
        println(
            "js::unicode::AppendUpperCaseSpecialCasing(char16_t ch, char16_t* elements, size_t* index)"  # NOQA: E501
        )
        println("{")

        println(" switch(ch) {")
        for code, converted in sorted(unconditional_toupper.items(), key=itemgetter(0)):
            println(
                " case {}: // {}".format(hexlit(code), codepoint_table.name(code))
            )
            for ch in converted:
                println(
                    " elements[(*index)++] = {}; // {}".format(
                        hexlit(ch), codepoint_table.name(ch)
                    )
                )
            println(" return;")
        println(" }")
        println("")
        println(' MOZ_ASSERT_UNREACHABLE("Bad character input.");')

        println("}")

    write_ChangesWhenUpperCasedSpecialCasing()
    println("")
    write_LengthUpperCaseSpecialCasing()
    println("")
    write_AppendUpperCaseSpecialCasing()
|
|
|
|
|
|
def write_ascii_lookup_tables(table, index, write, println):
    """
    Emit (via |write|/|println|) the C++ boolean lookup tables js_isidstart,
    js_isident and js_isspace for the ASCII range, derived from the BMP
    flags table.
    """
    def is_id_compat(code):
        # '$' and '_' are identifier characters by ECMAScript grammar even
        # though Unicode does not give them ID_Start.
        return code == ord("\N{DOLLAR SIGN}") or code == ord("\N{LOW LINE}")

    def is_id_start(code):
        (upper, lower, flags) = table[index[code]]
        return (flags & FLAG_UNICODE_ID_START) or is_id_compat(code)

    def is_id_continue(code):
        (upper, lower, flags) = table[index[code]]
        return (flags & FLAG_UNICODE_ID_CONTINUE_ONLY) or is_id_start(code)

    def is_space(code):
        (upper, lower, flags) = table[index[code]]
        return flags & FLAG_SPACE

    def write_entries(name, predicate):
        # 13 rows of 10 entries cover 0..127; "____" expands to false via
        # the #define emitted below.
        println("const bool unicode::{}[] = {{".format(name))
        header = "".join("{0: <6}".format(x) for x in range(0, 10)).rstrip()
        println("/* {} */".format(header))
        for i in range(0, 13):
            write("/* {0: >2} */".format(i))
            for j in range(0, 10):
                code = i * 10 + j
                if code <= 0x7F:
                    write(" {},".format("true" if predicate(code) else "____"))
            println("")
        println("};")

    println("")
    println("#define ____ false")

    println(
        """
/*
 * Identifier start chars:
 * - 36: $
 * - 65..90: A..Z
 * - 95: _
 * - 97..122: a..z
 */"""
    )
    write_entries("js_isidstart", is_id_start)

    println(
        """
/*
 * Identifier chars:
 * - 36: $
 * - 48..57: 0..9
 * - 65..90: A..Z
 * - 95: _
 * - 97..122: a..z
 */"""
    )
    write_entries("js_isident", is_id_continue)

    println(
        """
/* Whitespace chars: '\\t', '\\n', '\\v', '\\f', '\\r', ' '. */"""
    )
    write_entries("js_isspace", is_space)

    println("")
    println("#undef ____")
|
|
|
|
|
|
def write_latin1_lookup_tables(table, index, write, println):
    """Emit (via |write|/|println|) the C++ Latin-1 lower-case lookup table,
    derived from the delta-encoded BMP table."""

    def lookup(code):
        # Decode the delta-encoded table entry back into absolute values.
        assert 0 <= code and code <= MAX_BMP
        (upper_delta, lower_delta, flag_bits) = table[index[code]]
        return (
            (code + upper_delta) & 0xFFFF,
            (code + lower_delta) & 0xFFFF,
            flag_bits,
        )

    def lower_case_of(code):
        (_, lowered, _) = lookup(code)
        assert lowered <= 0xFF, "lower-case of Latin-1 is always Latin-1"
        return lowered

    def emit_table(name, mapper):
        # 16 rows of 16 entries cover 0..255.
        println("const JS::Latin1Char unicode::{}[] = {{".format(name))
        column_labels = "".join("{0: <6}".format(col) for col in range(0, 16)).rstrip()
        println("/* {} */".format(column_labels))
        for row in range(0, 16):
            write("/* {0: >2} */".format(row))
            for col in range(0, 16):
                code = row * 16 + col
                if code <= 0xFF:
                    write(" 0x{:02X},".format(mapper(code)))
            println("")
        println("};")

    println("")
    emit_table("latin1ToLowerCaseTable", lower_case_of)
|
|
|
|
|
|
def make_bmp_mapping_test(
    version, codepoint_table, unconditional_tolower, unconditional_toupper
):
    """
    Generate the jstests file string-upper-lower-mapping.js, which checks
    String.prototype.toUpperCase/toLowerCase for every BMP code unit against
    the simple mappings (overridden by unconditional special casing).
    """
    def unicodeEsc(n):
        # JS \\uXXXX escape for a code unit.
        return "\\u{:04X}".format(n)

    file_name = "../tests/non262/String/string-upper-lower-mapping.js"
    with io.open(file_name, mode="w", encoding="utf-8") as output:
        write = partial(print, file=output, sep="", end="")
        println = partial(print, file=output, sep="", end="\n")

        write(warning_message)
        write(unicode_version_message.format(version))
        write(public_domain)
        println("var mapping = [")
        for code in range(0, MAX_BMP + 1):
            entry = codepoint_table.get(code)

            if entry:
                (upper, lower, _, _) = entry
                # Special casing (multi-code-unit mappings) wins over the
                # simple mapping from UnicodeData.txt.
                upper = (
                    unconditional_toupper[code]
                    if code in unconditional_toupper
                    else [upper]
                )
                lower = (
                    unconditional_tolower[code]
                    if code in unconditional_tolower
                    else [lower]
                )
                println(
                    ' ["{}", "{}"], /* {} */'.format(
                        "".join(map(unicodeEsc, upper)),
                        "".join(map(unicodeEsc, lower)),
                        codepoint_table.name(code),
                    )
                )
            else:
                # Unassigned code points map to themselves.
                println(' ["{0}", "{0}"],'.format(unicodeEsc(code)))
        println("];")
        write(
            """
assertEq(mapping.length, 0x10000);
for (var i = 0; i <= 0xffff; i++) {
var char = String.fromCharCode(i);
var info = mapping[i];

assertEq(char.toUpperCase(), info[0]);
assertEq(char.toLowerCase(), info[1]);
}

if (typeof reportCompare === "function")
reportCompare(true, true);
"""
        )
|
|
|
|
|
|
def make_non_bmp_mapping_test(
    version, non_bmp_upper_map, non_bmp_lower_map, codepoint_table
):
    """
    Generate the jstests file string-code-point-upper-lower-mapping.js, which
    checks toUpperCase/toLowerCase for every non-BMP code point that has a
    case mapping.
    """
    file_name = "../tests/non262/String/string-code-point-upper-lower-mapping.js"
    with io.open(file_name, mode="w", encoding="utf-8") as test_non_bmp_mapping:
        test_non_bmp_mapping.write(warning_message)
        test_non_bmp_mapping.write(unicode_version_message.format(version))
        test_non_bmp_mapping.write(public_domain)

        # One assertEq per upper-case mapping.
        for code in sorted(non_bmp_upper_map.keys()):
            test_non_bmp_mapping.write(
                """\
assertEq(String.fromCodePoint(0x{:04X}).toUpperCase().codePointAt(0), 0x{:04X}); // {}, {}
""".format(
                    code,
                    non_bmp_upper_map[code],
                    codepoint_table.name(code),
                    codepoint_table.name(non_bmp_upper_map[code]),
                )
            )

        # One assertEq per lower-case mapping.
        for code in sorted(non_bmp_lower_map.keys()):
            test_non_bmp_mapping.write(
                """\
assertEq(String.fromCodePoint(0x{:04X}).toLowerCase().codePointAt(0), 0x{:04X}); // {}, {}
""".format(
                    code,
                    non_bmp_lower_map[code],
                    codepoint_table.name(code),
                    codepoint_table.name(non_bmp_lower_map[code]),
                )
            )

        test_non_bmp_mapping.write(
            """
if (typeof reportCompare === "function")
reportCompare(true, true);
"""
        )
|
|
|
|
|
|
def make_space_test(version, test_space_table, codepoint_table):
    """Regenerate the jstest checking that String.prototype.trim strips
    exactly the characters the engine classifies as whitespace.

    test_space_table: iterable of BMP space code points.
    """

    def annotated_hex(cp):
        # One "    0xXXXX /* NAME */" argument line per space character.
        return "    0x{:04X} /* {} */".format(cp, codepoint_table.name(cp))

    # Assertions appended after the onlySpace definition.
    checks = """
assertEq(onlySpace.trim(), "");
assertEq((onlySpace + 'aaaa').trim(), 'aaaa');
assertEq(('aaaa' + onlySpace).trim(), 'aaaa');
assertEq((onlySpace + 'aaaa' + onlySpace).trim(), 'aaaa');

if (typeof reportCompare === "function")
    reportCompare(true, true);
"""

    with io.open(
        "../tests/non262/String/string-space-trim.js", mode="w", encoding="utf-8"
    ) as out:
        out.write(warning_message)
        out.write(unicode_version_message.format(version))
        out.write(public_domain)
        out.write("var onlySpace = String.fromCharCode(\n")
        out.write(",\n".join(annotated_hex(cp) for cp in test_space_table))
        out.write("\n);\n")
        out.write(checks)
|
|
|
|
|
|
def make_regexp_space_test(version, test_space_table, codepoint_table):
    """Regenerate the jstest checking that the RegExp \\s / \\S character
    class escapes match exactly the engine's whitespace set, in both
    normal and Unicode ("u"-flag) modes.

    test_space_table: iterable of space code points.
    """

    def annotated_hex(cp):
        # One "    0xXXXX /* NAME */" argument line per space character.
        return "    0x{:04X} /* {} */".format(cp, codepoint_table.name(cp))

    # Raw string: the backslashes belong to the generated JS regexps.
    checks = r"""
assertEq(/^\s+$/.exec(onlySpace) !== null, true);
assertEq(/^[\s]+$/.exec(onlySpace) !== null, true);
assertEq(/^[^\s]+$/.exec(onlySpace) === null, true);

assertEq(/^\S+$/.exec(onlySpace) === null, true);
assertEq(/^[\S]+$/.exec(onlySpace) === null, true);
assertEq(/^[^\S]+$/.exec(onlySpace) !== null, true);

// Also test with Unicode RegExps.
assertEq(/^\s+$/u.exec(onlySpace) !== null, true);
assertEq(/^[\s]+$/u.exec(onlySpace) !== null, true);
assertEq(/^[^\s]+$/u.exec(onlySpace) === null, true);

assertEq(/^\S+$/u.exec(onlySpace) === null, true);
assertEq(/^[\S]+$/u.exec(onlySpace) === null, true);
assertEq(/^[^\S]+$/u.exec(onlySpace) !== null, true);

if (typeof reportCompare === "function")
    reportCompare(true, true);
"""

    with io.open(
        "../tests/non262/RegExp/character-class-escape-s.js", mode="w", encoding="utf-8"
    ) as out:
        out.write(warning_message)
        out.write(unicode_version_message.format(version))
        out.write(public_domain)
        out.write("var onlySpace = String.fromCodePoint(\n")
        out.write(",\n".join(annotated_hex(cp) for cp in test_space_table))
        out.write("\n);\n")
        out.write(checks)
|
|
|
|
|
|
def make_icase_test(version, folding_tests, codepoint_table):
    """Regenerate the jstest exercising case-insensitive Unicode RegExp
    matching over the case-folding equivalence classes.

    folding_tests: iterable of tuples (code point, *fold-equivalent code
        points); each tuple becomes one generated test(...) call.
    """

    def as_hex(cp):
        return "0x{:04X}".format(cp)

    # JS prologue defining the per-class test() helper.
    preamble = """
var BUGNUMBER = 1135377;
var summary = "Implement RegExp unicode flag -- ignoreCase flag.";

print(BUGNUMBER + ": " + summary);

function test(code, ...equivs) {
  var codeRe = new RegExp(String.fromCodePoint(code) + "+", "iu");
  var ans = String.fromCodePoint(code) + equivs.map(c => String.fromCodePoint(c)).join("");
  assertEqArray(codeRe.exec("<" + ans + ">"), [ans]);
  codeRe = new RegExp("[" + String.fromCodePoint(code) + "]+", "iu");
  assertEqArray(codeRe.exec("<" + ans + ">"), [ans]);
}
"""
    footer = """
if (typeof reportCompare === "function")
    reportCompare(true, true);
"""

    with io.open(
        "../tests/non262/RegExp/unicode-ignoreCase.js", mode="w", encoding="utf-8"
    ) as out:
        out.write(warning_message)
        out.write(unicode_version_message.format(version))
        out.write(public_domain)
        out.write(preamble)
        for args in folding_tests:
            hexes = ", ".join(as_hex(cp) for cp in args)
            names = ", ".join(codepoint_table.name(cp) for cp in args)
            out.write("test({}); // {}\n".format(hexes, names))
        out.write(footer)
|
|
|
|
|
|
def make_unicode_file(
    version,
    table,
    index,
    folding_table,
    folding_index,
    non_bmp_space_set,
    non_bmp_id_start_set,
    non_bmp_id_cont_set,
    unconditional_toupper,
    codepoint_table,
):
    """Generate Unicode.cpp: the two-level character-info and case-folding
    lookup tables, the non-BMP identifier predicates, the special-casing
    methods, and the ASCII/Latin-1 fast-path tables.

    version: Unicode version string embedded in the generated header.
    table/index: character-info records and the char16_t -> record index map.
    folding_table/folding_index: same structure for case folding.
    non_bmp_*_set: dict-backed sets of supplementary-plane code points.
    unconditional_toupper: SpecialCasing.txt 1:n uppercase mappings.
    codepoint_table: lookup used for human-readable names in comments.
    """
    # Compress the flat index into a two-level (index1, index2) table.
    index1, index2, shift = splitbins(index)

    # Don't forget to update CharInfo in Unicode.h if you need to change this
    assert shift == 6

    folding_index1, folding_index2, folding_shift = splitbins(folding_index)

    # Don't forget to update CaseFoldInfo in Unicode.h if you need to change this
    assert folding_shift == 5

    # verify correctness
    for char in index:
        test = table[index[char]]

        idx = index1[char >> shift]
        idx = index2[(idx << shift) + (char & ((1 << shift) - 1))]

        assert test == table[idx]

    # verify correctness
    for char in folding_index:
        test = folding_table[folding_index[char]]

        idx = folding_index1[char >> folding_shift]
        idx = folding_index2[
            (idx << folding_shift) + (char & ((1 << folding_shift) - 1))
        ]

        assert test == folding_table[idx]

    # C comment emitted verbatim into Unicode.cpp explaining the
    # two-level table lookup scheme.  NOTE(review): reconstructed from a
    # whitespace-mangled source; interior indentation follows upstream.
    comment = """
/*
 * So how does indexing work?
 * First let's have a look at a char16_t, 16-bits:
 *              [................]
 * Step 1:
 *  Extracting the upper 11 bits from the char16_t.
 *   upper = char >>  5 ([***********.....])
 * Step 2:
 *  Using these bits to get an reduced index from index1.
 *   index = index1[upper]
 * Step 3:
 *  Combining the index and the bottom 5 bits of the original char16_t.
 *   real_index = index2[(index << 5) + (char & ((1 << 5) - 1))] ([...********+++++])
 *
 * The advantage here is that the biggest number in index1 doesn't need 10 bits,
 * but 7 and we save some memory.
 *
 * Step 4:
 *  Get the character informations by looking up real_index in js_charinfo.
 *
 * Pseudocode of generation:
 *
 * let table be the mapping of char16_t => js_charinfo_index
 * let index1 be an empty array
 * let index2 be an empty array
 * let cache be a hash map
 *
 * while shift is less then maximal amount you can shift 0xffff before it's 0
 *  let chunks be table split in chunks of size 2**shift
 *
 *  for every chunk in chunks
 *   if chunk is in cache
 *    let index be cache[chunk]
 *   else
 *    let index be the max key of index2 + 1
 *    for element in chunk
 *     push element to index2
 *    put index as chunk in cache
 *
 *   push index >> shift to index1
 *
 *  increase shift
 *  stop if you found the best shift
 */
"""

    def dump(data, name, println):
        """Emit `data` as a C `const uint8_t` array named `name`,
        wrapped so generated lines stay within ~99 columns."""
        println("const uint8_t unicode::{}[] = {{".format(name))

        line = pad = " " * 4
        lines = []
        for entry in data:
            # Entries must fit in uint8_t.
            assert entry < 256
            s = str(entry)
            s = s.rjust(3)

            # +5 leaves room for the trailing ", " and closing punctuation.
            if len(line + s) + 5 > 99:
                lines.append(line.rstrip())
                line = pad + s + ", "
            else:
                line = line + s + ", "
        lines.append(line.rstrip())

        println("\n".join(lines))
        println("};")

    def write_table(data_type, name, tbl, idx1_name, idx1, idx2_name, idx2, println):
        """Emit one record table plus its two compressed index arrays."""
        println("const {} unicode::{}[] = {{".format(data_type, name))
        for d in tbl:
            println("    {{ {} }},".format(", ".join(str(e) for e in d)))
        println("};")
        println("")

        dump(idx1, idx1_name, println)
        println("")
        dump(idx2, idx2_name, println)
        println("")

    def write_supplemental_identifier_method(name, group_set, println):
        """Emit a C++ predicate testing membership of a supplementary-plane
        code point in `group_set`, as a chain of range checks."""
        println("bool")
        println("js::unicode::{}(char32_t codePoint)".format(name))
        println("{")
        for from_code, to_code in int_ranges(group_set.keys()):
            println(
                "    if (codePoint >= 0x{:X} && codePoint <= 0x{:X}) {{ // {} .. {}".format(
                    from_code,
                    to_code,
                    codepoint_table.name(from_code),
                    codepoint_table.name(to_code),
                )
            )
            println("        return true;")
            println("    }")
        println("    return false;")
        println("}")
        println("")

    file_name = "Unicode.cpp"
    with io.open(file_name, "w", encoding="utf-8") as data_file:
        # `write` emits text verbatim; `println` appends a newline.
        write = partial(print, file=data_file, sep="", end="")
        println = partial(print, file=data_file, sep="", end="\n")

        write(warning_message)
        write(unicode_version_message.format(version))
        write(public_domain)
        println('#include "util/Unicode.h"')
        println("")
        println("using namespace js;")
        println("using namespace js::unicode;")
        write(comment)

        write_table(
            "CharacterInfo",
            "js_charinfo",
            table,
            "index1",
            index1,
            "index2",
            index2,
            println,
        )

        write_table(
            "FoldingInfo",
            "js_foldinfo",
            folding_table,
            "folding_index1",
            folding_index1,
            "folding_index2",
            folding_index2,
            println,
        )

        # If the following assert fails, it means space character is added to
        # non-BMP area. In that case the following code should be uncommented
        # and the corresponding code should be added to frontend. (At least
        # unicode::IsSpace will require updating to handle this.)
        assert len(non_bmp_space_set.keys()) == 0

        write_supplemental_identifier_method(
            "IsIdentifierStartNonBMP", non_bmp_id_start_set, println
        )

        write_supplemental_identifier_method(
            "IsIdentifierPartNonBMP", non_bmp_id_cont_set, println
        )

        write_special_casing_methods(unconditional_toupper, codepoint_table, println)

        write_ascii_lookup_tables(table, index, write, println)

        write_latin1_lookup_tables(table, index, write, println)
|
|
|
|
|
|
def getsize(data):
    """Return the smallest C integer width in bytes (1, 2 or 4) that can
    hold every value of the given array."""
    largest = max(data)
    # Anything wider than uint32_t is unsupported by the generated tables.
    assert largest < 2**32

    if largest < 256:
        return 1
    if largest < 65536:
        return 2
    return 4
|
|
|
|
|
|
def splitbins(t):
    """t -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  Many equal runs in t allow the table to be
    stored as two smaller arrays: for each i in range(len(t)),

        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]

    where mask isolates the low "shift" bits.  The shift minimizing the
    combined C-array size of t1 and t2 is chosen.  Progress statistics are
    printed to stderr.
    """

    def report(first, second, shift, total):
        # Mirror the stats printed by the CPython makeunicodedata original.
        print(
            "%d+%d bins at shift %d; %d bytes" % (len(first), len(second), shift, total),
            file=sys.stderr,
        )
        print("Size of original table:", len(t) * getsize(t), "bytes", file=sys.stderr)

    # Largest shift that still leaves a non-empty upper index: the number
    # of times the last valid index can be halved before reaching 1.
    top = len(t) - 1
    maxshift = top.bit_length() - 1 if top > 0 else 0

    best_size = sys.maxsize  # smallest combined byte size seen so far
    t = tuple(t)  # tuples are hashable, so chunks can serve as cache keys

    for shift in range(maxshift + 1):
        first = []
        second = []
        chunk_len = 2**shift
        seen = {}

        for start in range(0, len(t), chunk_len):
            chunk = t[start : start + chunk_len]

            where = seen.get(chunk)
            if where is None:
                # New chunk: append its elements and remember the offset.
                where = len(second)
                seen[chunk] = where
                second.extend(chunk)
            first.append(where >> shift)

        # Combined memory footprint of both arrays at this shift.
        combined = len(first) * getsize(first) + len(second) * getsize(second)
        if combined < best_size:
            best = first, second, shift
            best_size = combined

    t1, t2, shift = best

    print("Best:", end=" ", file=sys.stderr)
    report(t1, t2, shift, best_size)

    # Exhaustively verify that the decomposition round-trips.
    mask = 2**shift - 1
    for i in range(len(t)):
        assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
|
|
|
|
|
|
def update_unicode(args):
    """Fetch (optionally) the Unicode data files and regenerate the C++
    tables and jstests derived from them.

    args.version: Unicode version to download from
        <https://unicode.org/Public> ("UNIDATA" for the latest release);
        when None, the local copies of the data files are used as-is.

    Side effects: downloads/extracts the UCD files into the current
    directory, then rewrites Unicode.cpp and several jstest files.
    """
    base_path = os.getcwd()

    version = args.version
    if version is not None:
        baseurl = "https://unicode.org/Public"
        if version == "UNIDATA":
            # "UNIDATA" is the unversioned alias for the latest release.
            url = "%s/%s" % (baseurl, version)
        else:
            url = "%s/%s/ucd" % (baseurl, version)

    print("Arguments:")
    if version is not None:
        print("\tVersion: %s" % version)
        print("\tDownload url: %s" % url)

        request_url = "{}/UCD.zip".format(url)
        with closing(urlopen(request_url)) as downloaded_file:
            downloaded_data = io.BytesIO(downloaded_file.read())

        with ZipFile(downloaded_data) as zip_file:
            for fname in [
                "UnicodeData.txt",
                "CaseFolding.txt",
                "DerivedCoreProperties.txt",
                "SpecialCasing.txt",
            ]:
                zip_file.extract(fname, path=base_path)
    else:
        print("\tUsing local files.")
        print("\tAlways make sure you have the newest Unicode files!")
    print("")

    def version_from_file(f, fname):
        """Extract the x.y.z version from a data file's first line, e.g.
        "# DerivedCoreProperties-13.0.0.txt"."""
        # Fix: the dot before "txt" must be escaped; unescaped it matched
        # any character instead of a literal ".".
        pat_version = re.compile(r"# %s-(?P<version>\d+\.\d+\.\d+)\.txt" % fname)
        return pat_version.match(f.readline()).group("version")

    with io.open(
        os.path.join(base_path, "UnicodeData.txt"), "r", encoding="utf-8"
    ) as unicode_data, io.open(
        os.path.join(base_path, "CaseFolding.txt"), "r", encoding="utf-8"
    ) as case_folding, io.open(
        os.path.join(base_path, "DerivedCoreProperties.txt"), "r", encoding="utf-8"
    ) as derived_core_properties, io.open(
        os.path.join(base_path, "SpecialCasing.txt"), "r", encoding="utf-8"
    ) as special_casing:
        unicode_version = version_from_file(
            derived_core_properties, "DerivedCoreProperties"
        )

        print("Processing...")
        (
            table,
            index,
            non_bmp_lower_map,
            non_bmp_upper_map,
            non_bmp_space_set,
            non_bmp_id_start_set,
            non_bmp_id_cont_set,
            codepoint_table,
            test_space_table,
        ) = process_unicode_data(unicode_data, derived_core_properties)
        (folding_table, folding_index, folding_tests) = process_case_folding(
            case_folding
        )
        (unconditional_tolower, unconditional_toupper) = process_special_casing(
            special_casing, table, index
        )

    print("Generating...")
    make_unicode_file(
        unicode_version,
        table,
        index,
        folding_table,
        folding_index,
        non_bmp_space_set,
        non_bmp_id_start_set,
        non_bmp_id_cont_set,
        unconditional_toupper,
        codepoint_table,
    )
    make_non_bmp_file(
        unicode_version, non_bmp_lower_map, non_bmp_upper_map, codepoint_table
    )

    make_bmp_mapping_test(
        unicode_version, codepoint_table, unconditional_tolower, unconditional_toupper
    )
    make_non_bmp_mapping_test(
        unicode_version, non_bmp_upper_map, non_bmp_lower_map, codepoint_table
    )
    make_space_test(unicode_version, test_space_table, codepoint_table)
    make_regexp_space_test(unicode_version, test_space_table, codepoint_table)
    make_icase_test(unicode_version, folding_tests, codepoint_table)
|
|
|
|
|
|
if __name__ == "__main__":
    import argparse

    # This script must be run from js/src/util to work correctly.
    # (All generated paths — Unicode.cpp and ../tests/... — are relative
    # to that directory.)
    if "/".join(os.path.normpath(os.getcwd()).split(os.sep)[-3:]) != "js/src/util":
        raise RuntimeError("%s must be run from js/src/util" % sys.argv[0])

    parser = argparse.ArgumentParser(description="Update Unicode data.")

    parser.add_argument(
        "--version",
        help='Optional Unicode version number. If specified, downloads the\
        selected version from <https://unicode.org/Public>. If not specified\
        uses the existing local files to generate the Unicode data. The\
        number must match a published Unicode version, e.g. use\
        "--version=8.0.0" to download Unicode 8 files. Alternatively use\
        "--version=UNIDATA" to download the latest published version.',
    )

    # update_unicode is invoked via the parsed args' func attribute.
    parser.set_defaults(func=update_unicode)

    args = parser.parse_args()
    args.func(args)
|