forked from mirrors/gecko-dev
		
	
		
			
				
	
	
		
			354 lines
		
	
	
	
		
			9.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			354 lines
		
	
	
	
		
			9.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| #!/usr/bin/env python3
 | |
| 
 | |
| """ Generate Unicode data table for parser
 | |
| """
 | |
| 
 | |
| import argparse
 | |
| import io
 | |
| import re
 | |
| import sys
 | |
| from contextlib import closing
 | |
| from itertools import tee, zip_longest
 | |
| from urllib.request import urlopen
 | |
| from zipfile import ZipFile
 | |
| 
 | |
| 
 | |
# These are also part of IdentifierPart §11.6 Names and Keywords
# (ZWNJ/ZWJ are accepted in identifiers in addition to the ID_Continue
# set — see the `or code in compatibility_identifier_part` check below).
compatibility_identifier_part = [
    ord(u'\N{ZERO WIDTH NON-JOINER}'),
    ord(u'\N{ZERO WIDTH JOINER}'),
]

# Bit flags stored per character in the generated CharInfo table
# (mirrored into the emitted Rust constants of the same names).
FLAG_ID_START = 1 << 0
FLAG_ID_CONTINUE = 1 << 1
 | |
| 
 | |
| 
 | |
def download_derived_core_properties(version):
    """Fetch UCD.zip for the given Unicode version from unicode.org and
    return the text of the DerivedCoreProperties.txt file it contains.

    `version` is either a published version string (e.g. "8.0.0") or the
    literal "UNIDATA" for the latest published version.
    """
    baseurl = 'https://unicode.org/Public'
    # The "UNIDATA" alias lives directly under /Public; numbered releases
    # keep their data files in a ucd/ subdirectory.
    if version == 'UNIDATA':
        url = '%s/%s' % (baseurl, version)
    else:
        url = '%s/%s/ucd' % (baseurl, version)

    request_url = '{}/UCD.zip'.format(url)
    with closing(urlopen(request_url)) as response:
        payload = io.BytesIO(response.read())

    with ZipFile(payload) as archive:
        return archive.read('DerivedCoreProperties.txt').decode()
 | |
| 
 | |
| 
 | |
def read_derived_core_properties(derived_core_properties):
    """Yield (codepoint, property_name) pairs from the content of
    DerivedCoreProperties.txt, expanding "XXXX..YYYY" ranges into one
    pair per code point."""
    for raw in derived_core_properties.split('\n'):
        # Skip blank lines and full-line comments.
        if not raw or raw.startswith('#'):
            continue
        # Strip any trailing comment, then split "range ; property".
        fields = raw.split('#')[0].split(';')
        span = fields[0].strip()
        prop = fields[1].strip()
        if '..' in span:
            lo, hi = span.split('..')
            for cp in range(int(lo, 16), int(hi, 16) + 1):
                yield (cp, prop)
        else:
            yield (int(span, 16), prop)
 | |
| 
 | |
| 
 | |
def process_derived_core_properties(derived_core_properties):
    """Parse DerivedCoreProperties.txt content.

    Returns (version, id_start, id_continue), where `version` is the
    Unicode version string taken from the file's first line and the two
    sets contain the code points carrying the ID_Start / ID_Continue
    property.
    """
    id_start = set()
    id_continue = set()

    # The file starts with a line like "# DerivedCoreProperties-13.0.0.txt".
    # Bug fix: use a raw string so "\." is a regex escape rather than an
    # invalid (deprecated) string escape, and escape the dot before "txt"
    # so it is not a wildcard.
    m = re.match(r'# DerivedCoreProperties-([0-9.]+)\.txt', derived_core_properties)
    assert m
    version = m.group(1)

    for (char, prop) in read_derived_core_properties(derived_core_properties):
        # A code point may carry both properties, so no elif here.
        if prop == 'ID_Start':
            id_start.add(char)
        if prop == 'ID_Continue':
            id_continue.add(char)

    return (version, id_start, id_continue)
 | |
| 
 | |
| 
 | |
def int_ranges(ints):
    """Yield (start, end) pairs (inclusive) describing the consecutive
    runs in the given integer values, in ascending order.

    Bug fix: an empty input now yields nothing; previously the initial
    next() raised StopIteration inside the generator, which PEP 479
    turns into a RuntimeError.
    """
    values = sorted(ints)
    if not values:
        return
    start = values[0]
    # Pair each value with its successor; the final value pairs with
    # None, which always terminates the last run.
    for (curr, succ) in zip_longest(values, values[1:]):
        if curr + 1 != succ:
            yield (start, curr)
            start = succ
 | |
| 
 | |
| 
 | |
def process_unicode_data(derived_core_properties):
    """Build the parser's lookup structures from DerivedCoreProperties.txt.

    Returns a 7-tuple:
      version                 - Unicode version string from the file header
      table                   - deduplicated flag bytes; entry 0 means "no flags"
      index                   - per-BMP-code-point index into `table`
      id_start, id_continue   - raw property sets (all planes)
      non_bmp_id_start_set    - dict keyed by ID_Start code points above U+FFFF
      non_bmp_id_continue_set - dict keyed by ID_Continue code points above U+FFFF
    """
    MAX_BMP = 0xffff

    no_flags = 0
    table = [no_flags]
    slot_of = {no_flags: 0}       # flags value -> its position in `table`
    index = [0] * (MAX_BMP + 1)
    non_bmp_id_start_set = {}
    non_bmp_id_continue_set = {}

    (version, id_start, id_continue) = process_derived_core_properties(derived_core_properties)

    for code in id_start.union(id_continue):
        if code > MAX_BMP:
            # Supplementary-plane code points are kept in plain sets
            # rather than the two-level table.
            if code in id_start:
                non_bmp_id_start_set[code] = 1
            if code in id_continue:
                non_bmp_id_continue_set[code] = 1
            continue

        flags = 0
        if code in id_start:
            flags |= FLAG_ID_START
        # ZWNJ/ZWJ count as IdentifierPart even beyond ID_Continue.
        if code in id_continue or code in compatibility_identifier_part:
            flags |= FLAG_ID_CONTINUE

        slot = slot_of.get(flags)
        if slot is None:
            assert flags not in table
            slot_of[flags] = slot = len(table)
            table.append(flags)
        index[code] = slot

    return (
        version,
        table,
        index,
        id_start,
        id_continue,
        non_bmp_id_start_set,
        non_bmp_id_continue_set,
    )
 | |
| 
 | |
| 
 | |
def getsize(data):
    """Return the smallest integer byte width (1, 2 or 4) that can hold
    every value in the given array."""
    biggest = max(data)
    assert biggest < 2**32

    for (width, limit) in ((1, 256), (2, 65536)):
        if biggest < limit:
            return width
    return 4
 | |
| 
 | |
| 
 | |
def splitbins(t):
    """t -> (t1, t2, shift).  Two-level split of an int table.

    Splits the int sequence `t` into a first-level table t1 and a
    second-level table t2 such that, with mask = (1 << shift) - 1,

        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]

    for every i in range(len(t)).  The shift is chosen to minimize the
    combined byte size of t1 and t2 (as C arrays); this saves space when
    many of the values repeat.  Statistics are printed to stderr.
    """

    def dump(t1, t2, shift, bytes):
        print("%d+%d bins at shift %d; %d bytes" % (
            len(t1), len(t2), shift, bytes), file=sys.stderr)
        print("Size of original table:", len(t) * getsize(t),
              "bytes", file=sys.stderr)

    # Largest shift that still leaves more than one bin.
    last = len(t) - 1
    maxshift = 0
    if last > 0:
        while last >> 1:
            last >>= 1
            maxshift += 1

    best_bytes = sys.maxsize  # smallest combined size seen so far
    t = tuple(t)              # tuples slice to hashable keys for the bin cache
    for shift in range(maxshift + 1):
        binsize = 2**shift
        seen = {}             # bin contents -> offset of first occurrence in t2
        t1 = []
        t2 = []

        for lo in range(0, len(t), binsize):
            chunk = t[lo:lo + binsize]

            offset = seen.get(chunk)
            if offset is None:
                offset = len(t2)
                seen[chunk] = offset
                t2.extend(chunk)
            t1.append(offset >> shift)

        # Keep the first shift achieving the minimal combined size.
        total = len(t1) * getsize(t1) + len(t2) * getsize(t2)
        if total < best_bytes:
            best = t1, t2, shift
            best_bytes = total
    t1, t2, shift = best

    print("Best:", end=' ', file=sys.stderr)
    dump(t1, t2, shift, best_bytes)

    # Exhaustively verify that the decomposition round-trips.
    mask = 2**shift - 1
    for i in range(len(t)):
        assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
 | |
| 
 | |
| 
 | |
def write_table(f, name, type, table, formatter, per_line):
    """Write a Rust `pub const NAME: &'static [TYPE] = &[...];` array to
    the file object `f`.

    Each element of `table` is rendered via `formatter`; `per_line`
    elements are emitted per source line.  (The parameter named `type`
    shadows the builtin, but it is part of the public interface and is
    kept for compatibility.)
    """
    f.write(f"""
pub const {name}: &'static [{type}] = &[
""")

    i = 0
    for item in table:
        if i == 0:
            f.write('    ')
        f.write(f'{formatter(item)},')
        i += 1
        if i == per_line:
            i = 0
            f.write('\n')

    # Bug fix: terminate a partially-filled final row so that "];" lands
    # on its own line instead of being glued to the last elements when
    # len(table) is not a multiple of per_line.
    if i != 0:
        f.write('\n')

    f.write('];\n')
 | |
| 
 | |
| 
 | |
def write_func(f, name, group_set):
    """Write a Rust predicate `pub fn name(c: char) -> bool` to the file
    object `f` that returns true exactly for the code points that are
    keys of `group_set`, using one range check per consecutive run."""
    f.write(f"""
pub fn {name}(c: char) -> bool {{""")

    for (lo, hi) in int_ranges(group_set.keys()):
        f.write(f"""
    if c >= \'\\u{{{lo:X}}}\' && c <= \'\\u{{{hi:X}}}\' {{
        return true;
    }}""")

    f.write("""
    false
}
""")
 | |
| 
 | |
| 
 | |
def make_unicode_file(version, table, index,
                      id_start, id_continue,
                      non_bmp_id_start_set, non_bmp_id_continue_set):
    """Generate crates/parser/src/unicode_data.rs.

    Emits the split two-level BMP CharInfo lookup table, fast ASCII
    boolean tables, and range-check functions for code points above the
    BMP.  `version` is recorded in the file header.
    """
    index1, index2, shift = splitbins(index)

    # Verify correctness: decoding every BMP code point through the split
    # tables must agree with the flat `index`.
    # Bug fix: the original looped `for char in index`, i.e. over the
    # *values* of `index` (a handful of small table slots), so it only
    # spot-checked a few low positions; iterate the positions instead.
    for char in range(len(index)):
        test = table[index[char]]

        idx = index1[char >> shift]
        idx = index2[(idx << shift) + (char & ((1 << shift) - 1))]

        assert test == table[idx]

    with open('crates/parser/src/unicode_data.rs', 'w') as f:
        f.write(f"""\
// Generated by update_unicode.py DO NOT MODIFY
// Unicode version: {version}
""")

        # Keep the Rust flag constants in sync with the Python ones above.
        f.write(f"""
const FLAG_ID_START: u8 = {FLAG_ID_START};
const FLAG_ID_CONTINUE: u8 = {FLAG_ID_CONTINUE};
""")

        f.write("""
pub struct CharInfo {
    flags: u8,
}

impl CharInfo {
    pub fn is_id_start(&self) -> bool {
        self.flags & FLAG_ID_START != 0
    }

    pub fn is_id_continue(&self) -> bool {
        self.flags & FLAG_ID_CONTINUE != 0
    }
}
""")

        # Deduplicated CharInfo entries plus the two-level index tables.
        write_table(f, 'CHAR_INFO_TABLE', 'CharInfo', table,
                    lambda flag: f"CharInfo {{ flags: {flag} }}",
                    1)
        write_table(f, 'INDEX1', 'u8', index1,
                    lambda i: f'{i:4d}', 8)
        write_table(f, 'INDEX2', 'u8', index2,
                    lambda i: f'{i:4d}', 8)

        f.write(f"""
const SHIFT: usize = {shift};
""")

        # Rust-side decoder mirroring the verification arithmetic above.
        f.write("""
pub fn char_info(c: char) -> &'static CharInfo {
    let code = c as usize;
    let index = INDEX1[code >> SHIFT] as usize;
    let index = INDEX2[(index << SHIFT) + (code & ((1 << SHIFT) - 1))] as usize;

    &CHAR_INFO_TABLE[index]
}
""")

        # 'true ' is padded to align with 'false' in the generated source.
        def format_bool(b):
            if b:
                return 'true '
            else:
                return 'false'

        # Direct boolean tables for the ASCII range (fast path).
        write_table(f, 'IS_ID_START_TABLE', 'bool', range(0, 128),
                    lambda code: format_bool(code in id_start), 8)
        write_table(f, 'IS_ID_CONTINUE_TABLE', 'bool', range(0, 128),
                    lambda code: format_bool(code in id_continue), 8)

        # Supplementary planes are handled with range checks, not tables.
        write_func(f, 'is_id_start_non_bmp', non_bmp_id_start_set)
        write_func(f, 'is_id_continue_non_bmp', non_bmp_id_continue_set)
 | |
| 
 | |
| 
 | |
| 
 | |
# Script entry point: parse arguments, download the Unicode data, and
# regenerate crates/parser/src/unicode_data.rs.
parser = argparse.ArgumentParser(description='Generate Unicode data table for parser')
parser.add_argument('VERSION',
                    help='Unicode version number to download from\
                    <https://unicode.org/Public>. The number must match\
                    a published Unicode version, e.g. use\
                    "--version=8.0.0" to download Unicode 8 files. Alternatively use\
                    "--version=UNIDATA" to download the latest published version.')
# NOTE(review): PATH_TO_JSPARAGUS is accepted but never read anywhere in
# this script — confirm whether it is still needed by callers.
parser.add_argument('PATH_TO_JSPARAGUS',
                    help='Path to jsparagus')
args = parser.parse_args()

derived_core_properties = download_derived_core_properties(args.VERSION)

(
    version,
    table,
    index,
    id_start,
    id_continue,
    non_bmp_id_start_set,
    non_bmp_id_continue_set,
) = process_unicode_data(derived_core_properties)

make_unicode_file(
    version,
    table,
    index,
    id_start,
    id_continue,
    non_bmp_id_start_set,
    non_bmp_id_continue_set,
)
 | 
