forked from mirrors/gecko-dev
Bug 1564347 - Part 1: Add a lookup table for Latin-1 lower-case conversion. r=tcampbell
Using a lookup table for Latin-1 lower-case conversion allows the compiler to emit the sequence `mov, cmp, setne` for ChangesWhenLowerCased and two `mov` instructions for ToLowerCase. This makes it faster than the current approach, which requires multiple instructions for both operations. Latin-1 upper-case conversion wasn't changed to use a lookup table, because not all Latin-1 characters have an upper-case representation which is also a single Latin-1 character, cf. the conversion for U+00B5, U+00DF, and U+00FF. Differential Revision: https://phabricator.services.mozilla.com/D37376 --HG-- extra : moz-landing-system : lando
This commit is contained in:
parent
eaccc90075
commit
47de48cd69
4 changed files with 76 additions and 17 deletions
|
|
@ -710,7 +710,7 @@ static size_t ToLowerCaseImpl(CharT* destChars, const CharT* srcChars,
|
||||||
|
|
||||||
size_t j = startIndex;
|
size_t j = startIndex;
|
||||||
for (size_t i = startIndex; i < srcLength; i++) {
|
for (size_t i = startIndex; i < srcLength; i++) {
|
||||||
char16_t c = srcChars[i];
|
CharT c = srcChars[i];
|
||||||
if constexpr (!IsSame<CharT, Latin1Char>::value) {
|
if constexpr (!IsSame<CharT, Latin1Char>::value) {
|
||||||
if (unicode::IsLeadSurrogate(c) && i + 1 < srcLength) {
|
if (unicode::IsLeadSurrogate(c) && i + 1 < srcLength) {
|
||||||
char16_t trail = srcChars[i + 1];
|
char16_t trail = srcChars[i + 1];
|
||||||
|
|
@ -745,8 +745,6 @@ static size_t ToLowerCaseImpl(CharT* destChars, const CharT* srcChars,
|
||||||
}
|
}
|
||||||
|
|
||||||
c = unicode::ToLowerCase(c);
|
c = unicode::ToLowerCase(c);
|
||||||
MOZ_ASSERT_IF((IsSame<CharT, Latin1Char>::value),
|
|
||||||
c <= JSString::MAX_LATIN1_CHAR);
|
|
||||||
destChars[j++] = c;
|
destChars[j++] = c;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -795,8 +793,7 @@ static JSString* ToLowerCase(JSContext* cx, JSLinearString* str) {
|
||||||
// static strings cache.
|
// static strings cache.
|
||||||
if constexpr (IsSame<CharT, Latin1Char>::value) {
|
if constexpr (IsSame<CharT, Latin1Char>::value) {
|
||||||
if (length == 1) {
|
if (length == 1) {
|
||||||
char16_t lower = unicode::ToLowerCase(chars[0]);
|
CharT lower = unicode::ToLowerCase(chars[0]);
|
||||||
MOZ_ASSERT(lower <= JSString::MAX_LATIN1_CHAR);
|
|
||||||
MOZ_ASSERT(StaticStrings::hasUnit(lower));
|
MOZ_ASSERT(StaticStrings::hasUnit(lower));
|
||||||
|
|
||||||
return cx->staticStrings().getUnit(lower);
|
return cx->staticStrings().getUnit(lower);
|
||||||
|
|
|
||||||
|
|
@ -4011,3 +4011,23 @@ const bool unicode::js_isspace[] = {
|
||||||
};
|
};
|
||||||
|
|
||||||
#undef ____
|
#undef ____
|
||||||
|
|
||||||
|
const JS::Latin1Char unicode::latin1ToLowerCaseTable[] = {
|
||||||
|
/* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 */
|
||||||
|
/* 0 */ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
|
||||||
|
/* 1 */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
|
||||||
|
/* 2 */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
|
||||||
|
/* 3 */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F,
|
||||||
|
/* 4 */ 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
|
||||||
|
/* 5 */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
|
||||||
|
/* 6 */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
|
||||||
|
/* 7 */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F,
|
||||||
|
/* 8 */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
|
||||||
|
/* 9 */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F,
|
||||||
|
/* 10 */ 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
|
||||||
|
/* 11 */ 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
|
||||||
|
/* 12 */ 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
|
||||||
|
/* 13 */ 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xD7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF,
|
||||||
|
/* 14 */ 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
|
||||||
|
/* 15 */ 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
|
||||||
|
};
|
||||||
|
|
|
||||||
|
|
@ -68,8 +68,6 @@ const uint8_t UNICODE_ID_CONTINUE = UNICODE_ID_START + UNICODE_ID_CONTINUE_ONLY;
|
||||||
|
|
||||||
constexpr char16_t NO_BREAK_SPACE = 0x00A0;
|
constexpr char16_t NO_BREAK_SPACE = 0x00A0;
|
||||||
constexpr char16_t MICRO_SIGN = 0x00B5;
|
constexpr char16_t MICRO_SIGN = 0x00B5;
|
||||||
constexpr char16_t LATIN_CAPITAL_LETTER_A_WITH_GRAVE = 0x00C0;
|
|
||||||
constexpr char16_t MULTIPLICATION_SIGN = 0x00D7;
|
|
||||||
constexpr char16_t LATIN_SMALL_LETTER_SHARP_S = 0x00DF;
|
constexpr char16_t LATIN_SMALL_LETTER_SHARP_S = 0x00DF;
|
||||||
constexpr char16_t LATIN_SMALL_LETTER_A_WITH_GRAVE = 0x00E0;
|
constexpr char16_t LATIN_SMALL_LETTER_A_WITH_GRAVE = 0x00E0;
|
||||||
constexpr char16_t DIVISION_SIGN = 0x00F7;
|
constexpr char16_t DIVISION_SIGN = 0x00F7;
|
||||||
|
|
@ -298,6 +296,27 @@ inline char16_t ToLowerCase(char16_t ch) {
|
||||||
return uint16_t(ch) + info.lowerCase;
|
return uint16_t(ch) + info.lowerCase;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
extern const JS::Latin1Char latin1ToLowerCaseTable[];
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Returns the simple lower case mapping (possibly the identity mapping; see
|
||||||
|
* ChangesWhenUpperCasedSpecialCasing for details) of the given Latin-1 code
|
||||||
|
* point.
|
||||||
|
*/
|
||||||
|
inline JS::Latin1Char ToLowerCase(JS::Latin1Char ch) {
|
||||||
|
return latin1ToLowerCaseTable[ch];
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Returns the simple lower case mapping (possibly the identity mapping; see
|
||||||
|
* ChangesWhenUpperCasedSpecialCasing for details) of the given ASCII code
|
||||||
|
* point.
|
||||||
|
*/
|
||||||
|
inline char ToLowerCase(char ch) {
|
||||||
|
MOZ_ASSERT(static_cast<unsigned char>(ch) < 128);
|
||||||
|
return latin1ToLowerCaseTable[uint8_t(ch)];
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns true iff ToUpperCase(ch) != ch.
|
* Returns true iff ToUpperCase(ch) != ch.
|
||||||
*
|
*
|
||||||
|
|
@ -344,15 +363,7 @@ inline bool ChangesWhenLowerCased(char16_t ch) {
|
||||||
|
|
||||||
// Returns true iff ToLowerCase(ch) != ch.
|
// Returns true iff ToLowerCase(ch) != ch.
|
||||||
inline bool ChangesWhenLowerCased(JS::Latin1Char ch) {
|
inline bool ChangesWhenLowerCased(JS::Latin1Char ch) {
|
||||||
if (MOZ_LIKELY(ch < 128)) {
|
return latin1ToLowerCaseTable[ch] != ch;
|
||||||
return ch >= 'A' && ch <= 'Z';
|
|
||||||
}
|
|
||||||
|
|
||||||
// U+00C0 to U+00DE, except U+00D7, have a lowercase form.
|
|
||||||
bool hasLower = ((ch & ~0x1F) == LATIN_CAPITAL_LETTER_A_WITH_GRAVE) &&
|
|
||||||
((ch & MULTIPLICATION_SIGN) != MULTIPLICATION_SIGN);
|
|
||||||
MOZ_ASSERT(hasLower == ChangesWhenLowerCased(char16_t(ch)));
|
|
||||||
return hasLower;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#define CHECK_RANGE(FROM, TO, LEAD, TRAIL_FROM, TRAIL_TO, DIFF) \
|
#define CHECK_RANGE(FROM, TO, LEAD, TRAIL_FROM, TRAIL_TO, DIFF) \
|
||||||
|
|
|
||||||
|
|
@ -876,7 +876,8 @@ def write_ascii_lookup_tables(table, index, write, println):
|
||||||
|
|
||||||
def write_entries(name, predicate):
|
def write_entries(name, predicate):
|
||||||
println('const bool unicode::{}[] = {{'.format(name))
|
println('const bool unicode::{}[] = {{'.format(name))
|
||||||
println('/* 0 1 2 3 4 5 6 7 8 9 */')
|
header = "".join("{0: <6}".format(x) for x in range(0, 10)).rstrip()
|
||||||
|
println('/* {} */'.format(header))
|
||||||
for i in range(0, 13):
|
for i in range(0, 13):
|
||||||
write('/* {0: >2} */'.format(i))
|
write('/* {0: >2} */'.format(i))
|
||||||
for j in range(0, 10):
|
for j in range(0, 10):
|
||||||
|
|
@ -918,6 +919,34 @@ def write_ascii_lookup_tables(table, index, write, println):
|
||||||
println('#undef ____')
|
println('#undef ____')
|
||||||
|
|
||||||
|
|
||||||
|
def write_latin1_lookup_tables(table, index, write, println):
|
||||||
|
def case_info(code):
|
||||||
|
assert 0 <= code and code <= MAX_BMP
|
||||||
|
(upper, lower, flags) = table[index[code]]
|
||||||
|
return ((code + upper) & 0xffff, (code + lower) & 0xffff, flags)
|
||||||
|
|
||||||
|
def toLowerCase(code):
|
||||||
|
(_, lower, _) = case_info(code)
|
||||||
|
assert lower <= 0xff, "lower-case of Latin-1 is always Latin-1"
|
||||||
|
return lower
|
||||||
|
|
||||||
|
def write_entries(name, mapper):
|
||||||
|
println('const JS::Latin1Char unicode::{}[] = {{'.format(name))
|
||||||
|
header = "".join("{0: <6}".format(x) for x in range(0, 16)).rstrip()
|
||||||
|
println('/* {} */'.format(header))
|
||||||
|
for i in range(0, 16):
|
||||||
|
write('/* {0: >2} */'.format(i))
|
||||||
|
for j in range(0, 16):
|
||||||
|
code = i * 16 + j
|
||||||
|
if (code <= 0xff):
|
||||||
|
write(' 0x{:02X},'.format(mapper(code)))
|
||||||
|
println('')
|
||||||
|
println('};')
|
||||||
|
|
||||||
|
println('')
|
||||||
|
write_entries('latin1ToLowerCaseTable', toLowerCase)
|
||||||
|
|
||||||
|
|
||||||
def make_bmp_mapping_test(version, codepoint_table, unconditional_tolower, unconditional_toupper):
|
def make_bmp_mapping_test(version, codepoint_table, unconditional_tolower, unconditional_toupper):
|
||||||
def unicodeEsc(n):
|
def unicodeEsc(n):
|
||||||
return '\\u{:04X}'.format(n)
|
return '\\u{:04X}'.format(n)
|
||||||
|
|
@ -1269,6 +1298,8 @@ def make_unicode_file(version,
|
||||||
|
|
||||||
write_ascii_lookup_tables(table, index, write, println)
|
write_ascii_lookup_tables(table, index, write, println)
|
||||||
|
|
||||||
|
write_latin1_lookup_tables(table, index, write, println)
|
||||||
|
|
||||||
|
|
||||||
def getsize(data):
|
def getsize(data):
|
||||||
""" return smallest possible integer size for the given array """
|
""" return smallest possible integer size for the given array """
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue