#!/usr/bin/env python3

"""Generate the Unicode data table for the parser."""

import argparse
import io
import os
import re
import sys
from contextlib import closing
from itertools import tee, zip_longest
from urllib.request import urlopen
from zipfile import ZipFile


# These code points are also part of IdentifierPart, per ECMA-262 §11.6
# "Names and Keywords".
compatibility_identifier_part = [
    ord('\N{ZERO WIDTH NON-JOINER}'),
    ord('\N{ZERO WIDTH JOINER}'),
]

FLAG_ID_START = 1 << 0
FLAG_ID_CONTINUE = 1 << 1
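
# The flag bits are OR-ed together per code point: for example, 'A' (U+0041)
# is both ID_Start and ID_Continue, so its table entry is
# FLAG_ID_START | FLAG_ID_CONTINUE == 0b11, while '0' (U+0030) is
# ID_Continue only, giving 0b10.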


def download_derived_core_properties(version):
    """Download UCD.zip for the given version and return the content of
    DerivedCoreProperties.txt as a string."""

    baseurl = 'https://unicode.org/Public'
    if version == 'UNIDATA':
        url = f'{baseurl}/{version}'
    else:
        url = f'{baseurl}/{version}/ucd'

    request_url = f'{url}/UCD.zip'
    with closing(urlopen(request_url)) as downloaded_file:
        downloaded_data = io.BytesIO(downloaded_file.read())

    with ZipFile(downloaded_data) as zip_file:
        return zip_file.read('DerivedCoreProperties.txt').decode()


def read_derived_core_properties(derived_core_properties):
    """Parse DerivedCoreProperties.txt content and yield each
    (code point, property name) pair."""
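    # Data lines have the form (everything after '#' is a comment), e.g.:
    #   0041..005A    ; ID_Start # L&  [26] LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
    #   00AA          ; ID_Start # Lo       FEMININE ORDINAL INDICATOR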
    for line in derived_core_properties.split('\n'):
        if line == '' or line.startswith('#'):
            continue
        row = line.split('#')[0].split(';')
        char_range = row[0].strip()
        char_property = row[1].strip()
        if '..' not in char_range:
            yield (int(char_range, 16), char_property)
        else:
            [start, end] = char_range.split('..')
            for char in range(int(start, 16), int(end, 16) + 1):
                yield (char, char_property)


def process_derived_core_properties(derived_core_properties):
    """Parse DerivedCoreProperties.txt and return its version and the
    sets of characters with ID_Start and ID_Continue."""

    id_start = set()
    id_continue = set()

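    # The file names its own version on its first line, e.g.
    # "# DerivedCoreProperties-13.0.0.txt".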
    m = re.match(r'# DerivedCoreProperties-([0-9.]+)\.txt',
                 derived_core_properties)
    assert m
    version = m.group(1)

    for (char, prop) in read_derived_core_properties(derived_core_properties):
        if prop == 'ID_Start':
            id_start.add(char)
        if prop == 'ID_Continue':
            id_continue.add(char)

    return (version, id_start, id_continue)


def int_ranges(ints):
    """Yield the inclusive ranges of consecutive integers found in ints."""
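    # For example, int_ranges([1, 2, 3, 7, 8, 10]) yields
    # (1, 3), (7, 8), and (10, 10).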
    (a, b) = tee(sorted(ints))
    start = next(b)
    for (curr, succ) in zip_longest(a, b):
        if curr + 1 != succ:
            yield (start, curr)
            start = succ


def process_unicode_data(derived_core_properties):
    MAX_BMP = 0xffff
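
    # `table` collects the distinct flag combinations, `cache` maps a flag
    # combination to its position in `table`, and `index` maps each BMP code
    # point to the position of its flag combination in `table`.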
    dummy = 0
    table = [dummy]
    cache = {dummy: 0}
    index = [0] * (MAX_BMP + 1)
    non_bmp_id_start_set = {}
    non_bmp_id_continue_set = {}

    (version, id_start, id_continue) = process_derived_core_properties(derived_core_properties)
    codes = id_start.union(id_continue)

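    # Code points outside the BMP bypass the two-level table; they are
    # emitted as the range-check functions is_id_start_non_bmp() and
    # is_id_continue_non_bmp() instead.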
    for code in codes:
        if code > MAX_BMP:
            if code in id_start:
                non_bmp_id_start_set[code] = 1
            if code in id_continue:
                non_bmp_id_continue_set[code] = 1
            continue

        flags = 0
        if code in id_start:
            flags |= FLAG_ID_START
        if code in id_continue or code in compatibility_identifier_part:
            flags |= FLAG_ID_CONTINUE

        i = cache.get(flags)
        if i is None:
            assert flags not in table
            cache[flags] = i = len(table)
            table.append(flags)
        index[code] = i

    return (
        version,
        table,
        index,
        id_start,
        id_continue,
        non_bmp_id_start_set,
        non_bmp_id_continue_set,
    )


def getsize(data):
    """Return the smallest element size in bytes (1, 2, or 4) that can hold
    every value in the given array."""
    maxdata = max(data)
    assert maxdata < 2**32

    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4


def splitbins(t):
    """t -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),

        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]

    where mask is a bitmask isolating the last "shift" bits.
    """
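
    # For example, with shift=1 the table [5, 5, 5, 5, 6, 6, 5, 5] is cut
    # into the 2-element bins (5, 5), (5, 5), (6, 6), (5, 5); duplicate bins
    # are shared, giving t2 = [5, 5, 6, 6] and t1 = [0, 0, 1, 0], where each
    # t1 entry is a bin's start offset in t2 divided by 2**shift.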

    def dump(t1, t2, shift, bytes):
        print("%d+%d bins at shift %d; %d bytes" % (
            len(t1), len(t2), shift, bytes), file=sys.stderr)
        print("Size of original table:", len(t) * getsize(t),
              "bytes", file=sys.stderr)

    n = len(t) - 1    # last valid index
    maxshift = 0      # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxsize  # smallest total size so far
    t = tuple(t)         # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}

        for i in range(0, len(t), size):
            bin = t[i:i + size]

            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)

        # determine memory size
        b = len(t1) * getsize(t1) + len(t2) * getsize(t2)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
print("Best:", end=' ', file=sys.stderr)
|
|
dump(t1, t2, shift, bytes)
|
|
|
|
# exhaustively verify that the decomposition is correct
|
|
mask = 2**shift - 1
|
|
for i in range(len(t)):
|
|
assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
|
|
return best
|
|
|
|
|
|


def write_table(f, name, rust_type, table, formatter, per_line):
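    # Writes one Rust constant of the given element type, e.g.:
    #   pub const INDEX1: &'static [u8] = &[
    #      0,   1,   2,   3,
    #   ];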
f.write(f"""
|
|
pub const {name}: &'static [{type}] = &[
|
|
""")
|
|
|
|
i = 0
|
|
for item in table:
|
|
if i == 0:
|
|
f.write(' ')
|
|
f.write(f'{formatter(item)},')
|
|
i += 1
|
|
if i == per_line:
|
|
i = 0
|
|
f.write("""
|
|
""")
|
|
|
|
f.write("""\
|
|
];
|
|
""")
|
|
|
|
|
|


def write_func(f, name, group_set):
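    # Writes one Rust predicate built from the inclusive code point ranges,
    # e.g.:
    #   pub fn is_id_start_non_bmp(c: char) -> bool {
    #       if c >= '\u{10000}' && c <= '\u{1000B}' {
    #           return true;
    #       }
    #       false
    #   }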
f.write(f"""
|
|
pub fn {name}(c: char) -> bool {{""")
|
|
|
|
for (from_code, to_code) in int_ranges(group_set.keys()):
|
|
f.write(f"""
|
|
if c >= \'\\u{{{from_code:X}}}\' && c <= \'\\u{{{to_code:X}}}\' {{
|
|
return true;
|
|
}}""")
|
|
|
|
f.write("""
|
|
false
|
|
}
|
|
""")
|
|
|
|
|
|


def make_unicode_file(version, table, index,
                      id_start, id_continue,
                      non_bmp_id_start_set, non_bmp_id_continue_set,
                      path_to_jsparagus):
    index1, index2, shift = splitbins(index)

    # verify correctness for every BMP code point
    for char in range(len(index)):
        test = table[index[char]]

        idx = index1[char >> shift]
        idx = index2[(idx << shift) + (char & ((1 << shift) - 1))]

        assert test == table[idx]

    with open(os.path.join(path_to_jsparagus,
                           'crates/parser/src/unicode_data.rs'), 'w') as f:
        f.write(f"""\
// Generated by update_unicode.py DO NOT MODIFY
// Unicode version: {version}
""")

        f.write(f"""
const FLAG_ID_START: u8 = {FLAG_ID_START};
const FLAG_ID_CONTINUE: u8 = {FLAG_ID_CONTINUE};
""")
f.write("""
|
|
pub struct CharInfo {
|
|
flags: u8,
|
|
}
|
|
|
|
impl CharInfo {
|
|
pub fn is_id_start(&self) -> bool {
|
|
self.flags & FLAG_ID_START != 0
|
|
}
|
|
|
|
pub fn is_id_continue(&self) -> bool {
|
|
self.flags & FLAG_ID_CONTINUE != 0
|
|
}
|
|
}
|
|
""")
|
|
|
|
        write_table(f, 'CHAR_INFO_TABLE', 'CharInfo', table,
                    lambda flag: f"CharInfo {{ flags: {flag} }}",
                    1)
        write_table(f, 'INDEX1', 'u8', index1,
                    lambda i: f'{i:4d}', 8)
        write_table(f, 'INDEX2', 'u8', index2,
                    lambda i: f'{i:4d}', 8)

        f.write(f"""
const SHIFT: usize = {shift};
""")
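
        # Note: the generated char_info() indexes INDEX1 with
        # `code >> SHIFT`, so it is valid only for BMP code points; code
        # points above U+FFFF must use the is_*_non_bmp() predicates instead.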
f.write("""
|
|
pub fn char_info(c: char) -> &'static CharInfo {
|
|
let code = c as usize;
|
|
let index = INDEX1[code >> SHIFT] as usize;
|
|
let index = INDEX2[(index << SHIFT) + (code & ((1 << SHIFT) - 1))] as usize;
|
|
|
|
&CHAR_INFO_TABLE[index]
|
|
}
|
|
""")
|
|
|
|

        def format_bool(b):
            if b:
                return 'true '
            else:
                return 'false'

        write_table(f, 'IS_ID_START_TABLE', 'bool', range(0, 128),
                    lambda code: format_bool(code in id_start), 8)
        write_table(f, 'IS_ID_CONTINUE_TABLE', 'bool', range(0, 128),
                    lambda code: format_bool(code in id_continue), 8)

        write_func(f, 'is_id_start_non_bmp', non_bmp_id_start_set)
        write_func(f, 'is_id_continue_non_bmp', non_bmp_id_continue_set)
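

# Example invocation ("UNIDATA" downloads the latest published version):
#   python3 update_unicode.py UNIDATA path/to/jsparagus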
parser = argparse.ArgumentParser(
    description='Generate Unicode data table for parser')
parser.add_argument('VERSION',
                    help='Unicode version number to download from '
                         '<https://unicode.org/Public>. The number must '
                         'match a published Unicode version, e.g. "8.0.0". '
                         'Alternatively, use "UNIDATA" to download the '
                         'latest published version.')
parser.add_argument('PATH_TO_JSPARAGUS',
                    help='Path to the jsparagus checkout; '
                         'crates/parser/src/unicode_data.rs is written '
                         'inside it.')
args = parser.parse_args()

derived_core_properties = download_derived_core_properties(args.VERSION)

(
    version,
    table,
    index,
    id_start,
    id_continue,
    non_bmp_id_start_set,
    non_bmp_id_continue_set,
) = process_unicode_data(derived_core_properties)

make_unicode_file(
    version,
    table,
    index,
    id_start,
    id_continue,
    non_bmp_id_start_set,
    non_bmp_id_continue_set,
    args.PATH_TO_JSPARAGUS,
)