Bug 1564347 - Part 1: Add a lookup table for Latin-1 lower-case conversion. r=tcampbell

Using a lookup table for Latin-1 lower-case conversion allows the compiler to emit the sequence `mov, cmp, setne` for ChangesWhenLowerCased and two `mov` instructions for ToLowerCase. This makes it faster than the current approach, which requires multiple instructions for both operations. Latin-1 upper-case conversion wasn't changed to use a lookup table, because not all Latin-1 characters have an upper-case representation which is also a single Latin-1 character, cf. the conversion for U+00B5, U+00DF, and U+00FF. Differential Revision: https://phabricator.services.mozilla.com/D37376 --HG-- extra : moz-landing-system : lando
2020-02-28 19:08:17 +00:00 · 2020-02-28 19:08:17 +00:00 · 47de48cd69
commit 47de48cd69
parent eaccc90075
4 changed files with 76 additions and 17 deletions
--- a/js/src/builtin/String.cpp
+++ b/js/src/builtin/String.cpp
@ -710,7 +710,7 @@ static size_t ToLowerCaseImpl(CharT* destChars, const CharT* srcChars,
  size_t j = startIndex;
  for (size_t i = startIndex; i < srcLength; i++) {
-    char16_t c = srcChars[i];
+    CharT c = srcChars[i];
    if constexpr (!IsSame<CharT, Latin1Char>::value) {
      if (unicode::IsLeadSurrogate(c) && i + 1 < srcLength) {
        char16_t trail = srcChars[i + 1];
@ -745,8 +745,6 @@ static size_t ToLowerCaseImpl(CharT* destChars, const CharT* srcChars,
    }
    c = unicode::ToLowerCase(c);
    MOZ_ASSERT_IF((IsSame<CharT, Latin1Char>::value),
                  c <= JSString::MAX_LATIN1_CHAR);
    destChars[j++] = c;
  }
@ -795,8 +793,7 @@ static JSString* ToLowerCase(JSContext* cx, JSLinearString* str) {
    // static strings cache.
    if constexpr (IsSame<CharT, Latin1Char>::value) {
      if (length == 1) {
-        char16_t lower = unicode::ToLowerCase(chars[0]);
+        CharT lower = unicode::ToLowerCase(chars[0]);
        MOZ_ASSERT(lower <= JSString::MAX_LATIN1_CHAR);
        MOZ_ASSERT(StaticStrings::hasUnit(lower));
        return cx->staticStrings().getUnit(lower);
--- a/js/src/util/Unicode.cpp
+++ b/js/src/util/Unicode.cpp
@ -4011,3 +4011,23 @@ const bool unicode::js_isspace[] = {
 };
 #undef ____
 // Simple lower-case mapping for the 256 Latin-1 code units: the entry at
 // index |c| is the lower-case form of |c|. Entries equal to their own index
 // (e.g. 0xD7 and 0xDF) mark code units the mapping leaves unchanged.
 // The row and column markers are decimal, so the entry for |c| sits at row
 // |c / 16|, column |c % 16|.
 // NOTE: this table is emitted by write_latin1_lookup_tables() in
 // make_unicode.py; change the generator rather than editing entries by hand.
 const JS::Latin1Char unicode::latin1ToLowerCaseTable[] = {
 /*       0     1     2     3     4     5     6     7     8     9     10    11    12    13    14    15  */
 /*  0 */ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
 /*  1 */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
 /*  2 */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
 /*  3 */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F,
 /*  4 */ 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
 /*  5 */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
 /*  6 */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
 /*  7 */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F,
 /*  8 */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
 /*  9 */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F,
 /* 10 */ 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
 /* 11 */ 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
 /* 12 */ 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
 /* 13 */ 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xD7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF,
 /* 14 */ 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
 /* 15 */ 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
 };
--- a/js/src/util/Unicode.h
+++ b/js/src/util/Unicode.h
@ -68,8 +68,6 @@ const uint8_t UNICODE_ID_CONTINUE = UNICODE_ID_START + UNICODE_ID_CONTINUE_ONLY;
 constexpr char16_t NO_BREAK_SPACE = 0x00A0;
 constexpr char16_t MICRO_SIGN = 0x00B5;
 constexpr char16_t LATIN_CAPITAL_LETTER_A_WITH_GRAVE = 0x00C0;
 constexpr char16_t MULTIPLICATION_SIGN = 0x00D7;
 constexpr char16_t LATIN_SMALL_LETTER_SHARP_S = 0x00DF;
 constexpr char16_t LATIN_SMALL_LETTER_A_WITH_GRAVE = 0x00E0;
 constexpr char16_t DIVISION_SIGN = 0x00F7;
@ -298,6 +296,27 @@ inline char16_t ToLowerCase(char16_t ch) {
  return uint16_t(ch) + info.lowerCase;
 }
 // Lower-case mapping for Latin-1 code units, indexed by the code unit itself.
 // The table is defined in Unicode.cpp.
 extern const JS::Latin1Char latin1ToLowerCaseTable[];
 /*
 * Looks up the simple lower case mapping of the given Latin-1 code point.
 * The result may be the identity mapping; see
 * ChangesWhenUpperCasedSpecialCasing for details.
 */
 inline JS::Latin1Char ToLowerCase(JS::Latin1Char ch) {
  // The code point is used directly as the table index.
  const JS::Latin1Char lowered = latin1ToLowerCaseTable[ch];
  return lowered;
 }
 /*
 * Looks up the simple lower case mapping of the given ASCII code point.
 * The result may be the identity mapping; see
 * ChangesWhenUpperCasedSpecialCasing for details.
 *
 * The argument must be ASCII (below 128); this is checked with MOZ_ASSERT.
 */
 inline char ToLowerCase(char ch) {
  // Go through an unsigned byte so the value can be range-checked and used as
  // a table index regardless of the signedness of plain |char|.
  const uint8_t index = static_cast<uint8_t>(ch);
  MOZ_ASSERT(index < 128);
  return latin1ToLowerCaseTable[index];
 }
 /**
 * Returns true iff ToUpperCase(ch) != ch.
 *
@ -344,15 +363,7 @@ inline bool ChangesWhenLowerCased(char16_t ch) {
 // Returns true iff ToLowerCase(ch) != ch.
 //
 // latin1ToLowerCaseTable holds the lower-case mapping for every Latin-1 code
 // unit, so one table comparison covers both the ASCII range ('A'..'Z') and
 // the non-ASCII range (U+00C0 to U+00DE, except U+00D7, have a lowercase
 // form) without any range checks or bit tests.
 inline bool ChangesWhenLowerCased(JS::Latin1Char ch) {
  return latin1ToLowerCaseTable[ch] != ch;
 }
 #define CHECK_RANGE(FROM, TO, LEAD, TRAIL_FROM, TRAIL_TO, DIFF) \
--- a/js/src/util/make_unicode.py
+++ b/js/src/util/make_unicode.py
@ -876,7 +876,8 @@ def write_ascii_lookup_tables(table, index, write, println):
    def write_entries(name, predicate):
        println('const bool unicode::{}[] = {{'.format(name))
-        println('/*       0     1     2     3     4     5     6     7     8     9  */')
+        header = "".join("{0: <6}".format(x) for x in range(0, 10)).rstrip()
        println('/*       {}  */'.format(header))
        for i in range(0, 13):
            write('/* {0: >2} */'.format(i))
            for j in range(0, 10):
@ -918,6 +919,34 @@ def write_ascii_lookup_tables(table, index, write, println):
    println('#undef ____')
 def write_latin1_lookup_tables(table, index, write, println):
    """Emit the C++ lookup tables covering the Latin-1 range (0x00-0xFF).

    `table[index[code]]` must yield an `(upper, lower, flags)` record whose
    first two members are offsets added to `code`. `write` emits text without
    a line break and `println` emits a full line.
    """
    def case_info(code):
        # Turn the stored offsets into absolute 16-bit code units.
        assert 0 <= code <= MAX_BMP
        upper_delta, lower_delta, flags = table[index[code]]
        return ((code + upper_delta) & 0xffff, (code + lower_delta) & 0xffff, flags)

    def toLowerCase(code):
        lower = case_info(code)[1]
        assert lower <= 0xff, "lower-case of Latin-1 is always Latin-1"
        return lower

    def write_entries(name, mapper):
        # One 16x16 grid: the row is code // 16, the column is code % 16.
        println('const JS::Latin1Char unicode::{}[] = {{'.format(name))
        column_labels = ["{0: <6}".format(col) for col in range(0, 16)]
        header = "".join(column_labels).rstrip()
        println('/*       {}  */'.format(header))
        for row in range(0, 16):
            write('/* {0: >2} */'.format(row))
            for col in range(0, 16):
                code = row * 16 + col
                if code > 0xff:
                    continue
                write(' 0x{:02X},'.format(mapper(code)))
            println('')
        println('};')

    println('')
    write_entries('latin1ToLowerCaseTable', toLowerCase)
 def make_bmp_mapping_test(version, codepoint_table, unconditional_tolower, unconditional_toupper):
    def unicodeEsc(n):
        return '\\u{:04X}'.format(n)
@ -1269,6 +1298,8 @@ def make_unicode_file(version,
        write_ascii_lookup_tables(table, index, write, println)
        write_latin1_lookup_tables(table, index, write, println)
 def getsize(data):
    """ return smallest possible integer size for the given array """