Bug 1426909 - Abstract out mozilla::DecodeOneUtf8CodePoint for decoding a UTF-8 code point after having consumed a non-ASCII lead unit, with configurable error notification through optional user-provided functors. r=froydnj

--HG-- extra : rebase_source : 25836018b00b545a60969abccf40ce313d4da1af
2018-07-12 17:41:31 -07:00 · 2018-07-12 17:41:31 -07:00 · d243be25b1
commit d243be25b1
parent 8040464690
3 changed files with 902 additions and 62 deletions
--- a/mfbt/Utf8.cpp
+++ b/mfbt/Utf8.cpp
@ -4,6 +4,8 @@
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

+#include "mozilla/Maybe.h"
+#include "mozilla/TextUtils.h"
 #include "mozilla/Types.h"
 #include "mozilla/Utf8.h"

@ -14,66 +16,24 @@ MFBT_API bool
 mozilla::IsValidUtf8(const void* aCodeUnits, size_t aCount)
 {
  const auto* s = static_cast<const unsigned char*>(aCodeUnits);
-  const auto* limit = s + aCount;
+  const auto* const limit = s + aCount;

  while (s < limit) {
-    uint32_t n = *s++;
+    unsigned char c = *s++;

    // If the first byte is ASCII, it's the only one in the code point.  Have a
    // fast path that avoids all the rest of the work and looping in that case.
-    if ((n & 0x80) == 0) {
+    if (IsAscii(c)) {
      continue;
    }

-    // The leading code unit determines the length of the next code point and
-    // the number of bits of the leading code unit that contribute to the code
-    // point's value.
-    uint_fast8_t remaining;
-    uint32_t min;
-    if ((n & 0xE0) == 0xC0) {
-      remaining = 1;
-      min = 0x80;
-      n &= 0x1F;
-    } else if ((n & 0xF0) == 0xE0) {
-      remaining = 2;
-      min = 0x800;
-      n &= 0x0F;
-    } else if ((n & 0xF8) == 0xF0) {
-      remaining = 3;
-      min = 0x10000;
-      n &= 0x07;
-    } else {
-      // UTF-8 used to have a hyper-long encoding form, but it's been removed
-      // for years now.  So in this case, the string is not valid UTF-8.
+    Maybe<char32_t> maybeCodePoint =
+      DecodeOneUtf8CodePoint(Utf8Unit(c), &s, limit);
+    if (maybeCodePoint.isNothing()) {
      return false;
    }
-
-    // If the code point would require more code units than remain, the encoding
-    // is invalid.
-    if (s + remaining > limit) {
-      return false;
-    }
-
-    for (uint_fast8_t i = 0; i < remaining; i++) {
-      // Every non-leading code unit in properly encoded UTF-8 has its high bit
-      // set and the next-highest bit unset.
-      if ((s[i] & 0xC0) != 0x80) {
-        return false;
-      }
-
-      // The code point being encoded is the concatenation of all the
-      // unconstrained bits.
-      n = (n << 6) | (s[i] & 0x3F);
-    }
-
-    // Don't consider code points that are overlong, UTF-16 surrogates, or
-    // exceed the maximum code point to be valid.
-    if (n < min || (0xD800 <= n && n < 0xE000) || n >= 0x110000) {
-      return false;
-    }
-
-    s += remaining;
  }

+  MOZ_ASSERT(s == limit);
  return true;
 }
--- a/mfbt/Utf8.h
+++ b/mfbt/Utf8.h
@ -12,6 +12,10 @@
 #ifndef mozilla_Utf8_h
 #define mozilla_Utf8_h

+#include "mozilla/Casting.h" // for mozilla::AssertedCast
+#include "mozilla/Likely.h" // for MOZ_UNLIKELY
+#include "mozilla/Maybe.h" // for mozilla::Maybe
+#include "mozilla/TextUtils.h" // for mozilla::IsAscii
 #include "mozilla/Types.h" // for MFBT_API

 #include <limits.h> // for CHAR_BIT
@ -205,6 +209,217 @@ public:
 extern MFBT_API bool
 IsValidUtf8(const void* aCodeUnits, size_t aCount);

+/**
+ * Given |aLeadUnit| that is a non-ASCII code unit, a pointer to an |Iter aIter|
+ * that (initially) itself points one unit past |aLeadUnit|, and
+ * |const EndIter aEnd| that denotes the end of the UTF-8 data when compared
+ * against |*aIter| using |aEnd - *aIter|:
+ *
+ * If |aLeadUnit| and subsequent code units computed using |*aIter| (up to
+ * |aEnd|) encode a valid code point -- not exceeding Unicode's range, not a
+ * surrogate, in shortest form -- then return Some(that code point) and advance
+ * |*aIter| past those code units.
+ *
+ * Otherwise rewind |*aIter| to point at |aLeadUnit| (one unit before where it
+ * started), call the handler functor for the error found, and return Nothing().
+ *
+ * |Iter| and |EndIter| are generalized concepts most easily understood as if
+ * they were |const char*|, |const unsigned char*|, or |const Utf8Unit*|:
+ * iterators that when dereferenced can be used to construct a |Utf8Unit| and
+ * that can be compared and modified in certain limited ways.  (Carefully note
+ * that this function mutates |*aIter|.)  |Iter| and |EndIter| are template
+ * parameters to support more-complicated adaptor iterators.
+ *
+ * The template parameters after |EndIter| are functors that let callers
+ * implement custom handling for each form of invalid UTF-8.  An overload that
+ * defaults all such handling to no-ops is defined further below.  To learn how
+ * to define your own handlers, consult the implementation of that overload,
+ * which documents exactly how each handler functor is invoked.
+ *
+ * This function is MOZ_ALWAYS_INLINE: if you don't need that, use the version
+ * of this function without the "Inline" suffix on the name.
+ */
+template<typename Iter,
+         typename EndIter,
+         class OnBadLeadUnit,
+         class OnNotEnoughUnits,
+         class OnBadTrailingUnit,
+         class OnBadCodePoint,
+         class OnNotShortestForm>
+MOZ_ALWAYS_INLINE Maybe<char32_t>
+DecodeOneUtf8CodePointInline(const Utf8Unit aLeadUnit,
+                             Iter* aIter, const EndIter aEnd,
+                             OnBadLeadUnit aOnBadLeadUnit,
+                             OnNotEnoughUnits aOnNotEnoughUnits,
+                             OnBadTrailingUnit aOnBadTrailingUnit,
+                             OnBadCodePoint aOnBadCodePoint,
+                             OnNotShortestForm aOnNotShortestForm)
+{
+  MOZ_ASSERT(Utf8Unit((*aIter)[-1]) == aLeadUnit);
+
+  char32_t n = aLeadUnit.toUint8();
+  MOZ_ASSERT(!IsAscii(n));
+
+  // |aLeadUnit| determines the number of trailing code units in the code point
+  // and the bits of |aLeadUnit| that contribute to the code point's value.
+  uint8_t remaining;
+  uint32_t min;
+  if ((n & 0b1110'0000) == 0b1100'0000) {
+    remaining = 1;
+    min = 0x80;
+    n &= 0b0001'1111;
+  } else if ((n & 0b1111'0000) == 0b1110'0000) {
+    remaining = 2;
+    min = 0x800;
+    n &= 0b0000'1111;
+  } else if ((n & 0b1111'1000) == 0b1111'0000) {
+    remaining = 3;
+    min = 0x10000;
+    n &= 0b0000'0111;
+  } else {
+    *aIter -= 1;
+    aOnBadLeadUnit();
+    return Nothing();
+  }
+
+  // If the code point would require more code units than remain, the encoding
+  // is invalid.
+  auto actual = aEnd - *aIter;
+  if (MOZ_UNLIKELY(actual < remaining)) {
+    *aIter -= 1;
+    aOnNotEnoughUnits(AssertedCast<uint8_t>(actual + 1), remaining + 1);
+    return Nothing();
+  }
+
+  for (uint8_t i = 0; i < remaining; i++) {
+    uint8_t unit = Utf8Unit(*(*aIter)++).toUint8();
+
+    // Every non-leading code unit in properly encoded UTF-8 has its high
+    // bit set and the next-highest bit unset.
+    if (MOZ_UNLIKELY((unit & 0b1100'0000) != 0b1000'0000)) {
+      uint8_t unitsObserved = i + 1 + 1;
+      *aIter -= unitsObserved;
+      aOnBadTrailingUnit(unitsObserved);
+      return Nothing();
+    }
+
+    // The code point being encoded is the concatenation of all the
+    // unconstrained bits.
+    n = (n << 6) | (unit & 0b0011'1111);
+  }
+
+  // UTF-16 surrogates and values outside the Unicode range are invalid.
+  if (MOZ_UNLIKELY(n > 0x10FFFF || (0xD800 <= n && n <= 0xDFFF))) {
+    uint8_t unitsObserved = remaining + 1;
+    *aIter -= unitsObserved;
+    aOnBadCodePoint(n, unitsObserved);
+    return Nothing();
+  }
+
+  // Overlong code points are also invalid.
+  if (MOZ_UNLIKELY(n < min)) {
+    uint8_t unitsObserved = remaining + 1;
+    *aIter -= unitsObserved;
+    aOnNotShortestForm(n, unitsObserved);
+    return Nothing();
+  }
+
+  return Some(n);
+}
+
+/**
+ * Identical to the above function, but not forced to be instantiated inline --
+ * the compiler is permitted to common up separate invocations if it chooses.
+ */
+template<typename Iter,
+         typename EndIter,
+         class OnBadLeadUnit,
+         class OnNotEnoughUnits,
+         class OnBadTrailingUnit,
+         class OnBadCodePoint,
+         class OnNotShortestForm>
+inline Maybe<char32_t>
+DecodeOneUtf8CodePoint(const Utf8Unit aLeadUnit,
+                       Iter* aIter, const EndIter aEnd,
+                       OnBadLeadUnit aOnBadLeadUnit,
+                       OnNotEnoughUnits aOnNotEnoughUnits,
+                       OnBadTrailingUnit aOnBadTrailingUnit,
+                       OnBadCodePoint aOnBadCodePoint,
+                       OnNotShortestForm aOnNotShortestForm)
+{
+  return DecodeOneUtf8CodePointInline(aLeadUnit, aIter, aEnd,
+                                      aOnBadLeadUnit, aOnNotEnoughUnits,
+                                      aOnBadTrailingUnit, aOnBadCodePoint,
+                                      aOnNotShortestForm);
+}
+
+/**
+ * Like the always-inlined function above, but with no-op behavior from all
+ * trailing if-invalid notifier functors.
+ *
+ * This function is MOZ_ALWAYS_INLINE: if you don't need that, use the version
+ * of this function without the "Inline" suffix on the name.
+ */
+template<typename Iter, typename EndIter>
+MOZ_ALWAYS_INLINE Maybe<char32_t>
+DecodeOneUtf8CodePointInline(const Utf8Unit aLeadUnit,
+                             Iter* aIter, const EndIter aEnd)
+{
+  // aOnBadLeadUnit is called when |aLeadUnit| itself is an invalid lead unit in
+  // a multi-unit code point.  It is passed no arguments: the caller already has
+  // |aLeadUnit| on hand, so no need to provide it again.
+  auto onBadLeadUnit = []() {};
+
+  // aOnNotEnoughUnits is called when |aLeadUnit| properly indicates a code
+  // point length, but there aren't enough units from |*aIter| to |aEnd| to
+  // satisfy that length.  It is passed the number of code units actually
+  // available (according to |aEnd - *aIter|) and the number of code units that
+  // |aLeadUnit| indicates are needed.  Both numbers include the contribution
+  // of |aLeadUnit| itself: so |aUnitsAvailable <= 3|, |aUnitsNeeded <= 4|, and
+  // |aUnitsAvailable < aUnitsNeeded|.  As above, it also is not passed the lead
+  // code unit.
+  auto onNotEnoughUnits = [](uint8_t aUnitsAvailable, uint8_t aUnitsNeeded) {};
+
+  // aOnBadTrailingUnit is called when one of the trailing code units implied by
+  // |aLeadUnit| doesn't match the 0b10xx'xxxx bit pattern that all UTF-8
+  // trailing code units must satisfy.  It is passed the total count of units
+  // observed (including |aLeadUnit|).  The bad trailing code unit will
+  // conceptually be at |(*aIter)[aUnitsObserved - 1]| if this functor is
+  // called, and so |aUnitsObserved <= 4|.
+  auto onBadTrailingUnit = [](uint8_t aUnitsObserved) {};
+
+  // aOnBadCodePoint is called when a structurally-correct code point encoding
+  // is found, but the *value* that is encoded is not a valid code point: either
+  // because it exceeded the U+10FFFF Unicode maximum code point, or because it
+  // was a UTF-16 surrogate.  It is passed the non-code point value and the
+  // number of code units used to encode it.
+  auto onBadCodePoint = [](char32_t aBadCodePoint, uint8_t aUnitsObserved) {};
+
+  // aOnNotShortestForm is called when a structurally-correct encoding is found,
+  // but the encoded value should have been encoded in fewer code units (e.g.
+  // mis-encoding U+0000 as 0b1100'0000 0b1000'0000 in two code units instead of
+  // as 0b0000'0000).  It is passed the mis-encoded code point (which will be
+  // valid and not a surrogate) and the count of code units that mis-encoded it.
+  auto onNotShortestForm = [](char32_t aBadCodePoint, uint8_t aUnitsObserved) {};
+
+  return DecodeOneUtf8CodePointInline(aLeadUnit, aIter, aEnd,
+                                      onBadLeadUnit, onNotEnoughUnits,
+                                      onBadTrailingUnit, onBadCodePoint,
+                                      onNotShortestForm);
+}
+
+/**
+ * Identical to the above function, but not forced to be instantiated inline --
+ * the compiler/linker are allowed to common up separate invocations.
+ */
+template<typename Iter, typename EndIter>
+inline Maybe<char32_t>
+DecodeOneUtf8CodePoint(const Utf8Unit aLeadUnit,
+                       Iter* aIter, const EndIter aEnd)
+{
+  return DecodeOneUtf8CodePointInline(aLeadUnit, aIter, aEnd);
+}
+
 } // namespace mozilla

 #endif /* mozilla_Utf8_h */
--- a/mfbt/tests/TestUtf8.cpp
+++ b/mfbt/tests/TestUtf8.cpp
@ -8,8 +8,15 @@

 #include "mozilla/ArrayUtils.h"
 #include "mozilla/Assertions.h"
+#include "mozilla/EnumSet.h"
+#include "mozilla/IntegerRange.h"
+#include "mozilla/TextUtils.h"

 using mozilla::ArrayLength;
+using mozilla::DecodeOneUtf8CodePoint;
+using mozilla::EnumSet;
+using mozilla::IntegerRange;
+using mozilla::IsAscii;
 using mozilla::IsValidUtf8;
 using mozilla::Utf8Unit;

@ -35,6 +42,242 @@ TestUtf8Unit()
  MOZ_RELEASE_ASSERT(first == second);
 }

+template<typename Char>
+struct ToUtf8Units
+{
+public:
+  explicit ToUtf8Units(const Char* aStart, const Char* aEnd)
+    : lead(Utf8Unit(aStart[0]))
+    , iter(aStart + 1)
+    , end(aEnd)
+  {
+    MOZ_RELEASE_ASSERT(!IsAscii(aStart[0]));
+  }
+
+  const Utf8Unit lead;
+  const Char* iter;
+  const Char* const end;
+};
+
+class AssertIfCalled
+{
+public:
+  template<typename... Args>
+  void operator()(Args&&... aArgs) {
+    MOZ_RELEASE_ASSERT(false, "AssertIfCalled instance was called");
+  }
+};
+
+// NOTE: For simplicity in treating |aCharN| identically regardless whether it's
+//       a string literal or a more-generalized array, we require |aCharN| be
+//       null-terminated.
+
+template<typename Char, size_t N>
+static void
+ExpectValidCodePoint(const Char (&aCharN)[N],
+                     char32_t aExpectedCodePoint)
+{
+  MOZ_RELEASE_ASSERT(aCharN[N - 1] == 0,
+                     "array must be null-terminated for |aCharN + N - 1| to "
+                     "compute the value of |aIter| as altered by "
+                     "DecodeOneUtf8CodePoint");
+
+  ToUtf8Units<Char> simpleUnit(aCharN, aCharN + N - 1);
+  auto simple =
+    DecodeOneUtf8CodePoint(simpleUnit.lead, &simpleUnit.iter, simpleUnit.end);
+  MOZ_RELEASE_ASSERT(simple.isSome());
+  MOZ_RELEASE_ASSERT(*simple == aExpectedCodePoint);
+  MOZ_RELEASE_ASSERT(simpleUnit.iter == simpleUnit.end);
+
+  ToUtf8Units<Char> complexUnit(aCharN, aCharN + N - 1);
+  auto complex =
+    DecodeOneUtf8CodePoint(complexUnit.lead, &complexUnit.iter, complexUnit.end,
+                           AssertIfCalled(),
+                           AssertIfCalled(),
+                           AssertIfCalled(),
+                           AssertIfCalled(),
+                           AssertIfCalled());
+  MOZ_RELEASE_ASSERT(complex.isSome());
+  MOZ_RELEASE_ASSERT(*complex == aExpectedCodePoint);
+  MOZ_RELEASE_ASSERT(complexUnit.iter == complexUnit.end);
+}
+
+enum class InvalidUtf8Reason
+{
+  BadLeadUnit,
+  NotEnoughUnits,
+  BadTrailingUnit,
+  BadCodePoint,
+  NotShortestForm,
+};
+
+template<typename Char, size_t N>
+static void
+ExpectInvalidCodePointHelper(const Char (&aCharN)[N],
+                             InvalidUtf8Reason aExpectedReason,
+                             uint8_t aExpectedUnitsAvailable,
+                             uint8_t aExpectedUnitsNeeded,
+                             char32_t aExpectedBadCodePoint,
+                             uint8_t aExpectedUnitsObserved)
+{
+  MOZ_RELEASE_ASSERT(aCharN[N - 1] == 0,
+                     "array must be null-terminated for |aCharN + N - 1| to "
+                     "compute the value of |aIter| as altered by "
+                     "DecodeOneUtf8CodePoint");
+
+  ToUtf8Units<Char> simpleUnit(aCharN, aCharN + N - 1);
+  auto simple =
+    DecodeOneUtf8CodePoint(simpleUnit.lead, &simpleUnit.iter, simpleUnit.end);
+  MOZ_RELEASE_ASSERT(simple.isNothing());
+  MOZ_RELEASE_ASSERT(static_cast<const void*>(simpleUnit.iter) == aCharN);
+
+  EnumSet<InvalidUtf8Reason> reasons;
+  uint8_t unitsAvailable;
+  uint8_t unitsNeeded;
+  char32_t badCodePoint;
+  uint8_t unitsObserved;
+
+  struct OnNotShortestForm
+  {
+    EnumSet<InvalidUtf8Reason>& reasons;
+    char32_t& badCodePoint;
+    uint8_t& unitsObserved;
+
+    void operator()(char32_t aBadCodePoint, uint8_t aUnitsObserved) {
+      reasons += InvalidUtf8Reason::NotShortestForm;
+      badCodePoint = aBadCodePoint;
+      unitsObserved = aUnitsObserved;
+    }
+  };
+
+  ToUtf8Units<Char> complexUnit(aCharN, aCharN + N - 1);
+  auto complex =
+    DecodeOneUtf8CodePoint(complexUnit.lead, &complexUnit.iter, complexUnit.end,
+                           [&reasons]() {
+                             reasons += InvalidUtf8Reason::BadLeadUnit;
+                           },
+                           [&reasons, &unitsAvailable, &unitsNeeded](uint8_t aUnitsAvailable,
+                                                                     uint8_t aUnitsNeeded)
+                           {
+                             reasons += InvalidUtf8Reason::NotEnoughUnits;
+                             unitsAvailable = aUnitsAvailable;
+                             unitsNeeded = aUnitsNeeded;
+                           },
+                           [&reasons, &unitsObserved](uint8_t aUnitsObserved)
+                           {
+                             reasons += InvalidUtf8Reason::BadTrailingUnit;
+                             unitsObserved = aUnitsObserved;
+                           },
+                           [&reasons, &badCodePoint, &unitsObserved](char32_t aBadCodePoint,
+                                                                     uint8_t aUnitsObserved)
+                           {
+                             reasons += InvalidUtf8Reason::BadCodePoint;
+                             badCodePoint = aBadCodePoint;
+                             unitsObserved = aUnitsObserved;
+                           },
+                           [&reasons, &badCodePoint, &unitsObserved](char32_t aBadCodePoint,
+                                                                     uint8_t aUnitsObserved)
+                           {
+                             reasons += InvalidUtf8Reason::NotShortestForm;
+                             badCodePoint = aBadCodePoint;
+                             unitsObserved = aUnitsObserved;
+                           });
+  MOZ_RELEASE_ASSERT(complex.isNothing());
+  MOZ_RELEASE_ASSERT(static_cast<const void*>(complexUnit.iter) == aCharN);
+
+  bool alreadyIterated = false;
+  for (InvalidUtf8Reason reason : reasons) {
+    MOZ_RELEASE_ASSERT(!alreadyIterated);
+    alreadyIterated = true;
+
+    switch (reason) {
+    case InvalidUtf8Reason::BadLeadUnit:
+      break;
+
+    case InvalidUtf8Reason::NotEnoughUnits:
+      MOZ_RELEASE_ASSERT(unitsAvailable == aExpectedUnitsAvailable);
+      MOZ_RELEASE_ASSERT(unitsNeeded == aExpectedUnitsNeeded);
+      break;
+
+    case InvalidUtf8Reason::BadTrailingUnit:
+      MOZ_RELEASE_ASSERT(unitsObserved == aExpectedUnitsObserved);
+      break;
+
+    case InvalidUtf8Reason::BadCodePoint:
+      MOZ_RELEASE_ASSERT(badCodePoint == aExpectedBadCodePoint);
+      MOZ_RELEASE_ASSERT(unitsObserved == aExpectedUnitsObserved);
+      break;
+
+    case InvalidUtf8Reason::NotShortestForm:
+      MOZ_RELEASE_ASSERT(badCodePoint == aExpectedBadCodePoint);
+      MOZ_RELEASE_ASSERT(unitsObserved == aExpectedUnitsObserved);
+      break;
+    }
+  }
+}
+
+// NOTE: For simplicity in treating |aCharN| identically regardless whether it's
+//       a string literal or a more-generalized array, we require |aCharN| be
+//       null-terminated in all these functions.
+
+template<typename Char, size_t N>
+static void
+ExpectBadLeadUnit(const Char (&aCharN)[N])
+{
+  ExpectInvalidCodePointHelper(aCharN,
+                               InvalidUtf8Reason::BadLeadUnit,
+                               0xFF, 0xFF, 0xFFFFFFFF, 0xFF);
+}
+
+template<typename Char, size_t N>
+static void
+ExpectNotEnoughUnits(const Char (&aCharN)[N],
+                     uint8_t aExpectedUnitsAvailable,
+                     uint8_t aExpectedUnitsNeeded)
+{
+  ExpectInvalidCodePointHelper(aCharN,
+                               InvalidUtf8Reason::NotEnoughUnits,
+                               aExpectedUnitsAvailable, aExpectedUnitsNeeded,
+                               0xFFFFFFFF, 0xFF);
+}
+
+template<typename Char, size_t N>
+static void
+ExpectBadTrailingUnit(const Char (&aCharN)[N],
+                      uint8_t aExpectedUnitsObserved)
+{
+  ExpectInvalidCodePointHelper(aCharN,
+                               InvalidUtf8Reason::BadTrailingUnit,
+                               0xFF, 0xFF, 0xFFFFFFFF,
+                               aExpectedUnitsObserved);
+}
+
+template<typename Char, size_t N>
+static void
+ExpectNotShortestForm(const Char (&aCharN)[N],
+                      char32_t aExpectedBadCodePoint,
+                      uint8_t aExpectedUnitsObserved)
+{
+  ExpectInvalidCodePointHelper(aCharN,
+                               InvalidUtf8Reason::NotShortestForm,
+                               0xFF, 0xFF,
+                               aExpectedBadCodePoint,
+                               aExpectedUnitsObserved);
+}
+
+template<typename Char, size_t N>
+static void
+ExpectBadCodePoint(const Char (&aCharN)[N],
+                   char32_t aExpectedBadCodePoint,
+                   uint8_t aExpectedUnitsObserved)
+{
+  ExpectInvalidCodePointHelper(aCharN,
+                               InvalidUtf8Reason::BadCodePoint,
+                               0xFF, 0xFF,
+                               aExpectedBadCodePoint,
+                               aExpectedUnitsObserved);
+}
+
 static void
 TestIsValidUtf8()
 {
@ -62,48 +305,469 @@ TestIsValidUtf8()
  static_assert(twoBytesLen == 3, "U+0606 in two bytes plus nul");
  MOZ_RELEASE_ASSERT(IsValidUtf8(twoBytes, twoBytesLen));

+  ExpectValidCodePoint(twoBytes, 0x0606);
+
  // 3
  static const char threeBytes[] = u8"᨞"; // U+1A1E BUGINESE PALLAWA
  constexpr size_t threeBytesLen = ArrayLength(threeBytes);
  static_assert(threeBytesLen == 4, "U+1A1E in three bytes plus nul");
  MOZ_RELEASE_ASSERT(IsValidUtf8(threeBytes, threeBytesLen));

+  ExpectValidCodePoint(threeBytes, 0x1A1E);
+
  // 4
  static const char fourBytes[] = u8"🁡"; // U+1F061 DOMINO TILE HORIZONTAL-06-06
  constexpr size_t fourBytesLen = ArrayLength(fourBytes);
  static_assert(fourBytesLen == 5, "U+1F061 in four bytes plus nul");
  MOZ_RELEASE_ASSERT(IsValidUtf8(fourBytes, fourBytesLen));

+  ExpectValidCodePoint(fourBytes, 0x1F061);
+
  // Max code point
  static const char maxCodePoint[] = u8"􏿿"; // U+10FFFF
  constexpr size_t maxCodePointLen = ArrayLength(maxCodePoint);
  static_assert(maxCodePointLen == 5, "U+10FFFF in four bytes plus nul");
  MOZ_RELEASE_ASSERT(IsValidUtf8(maxCodePoint, maxCodePointLen));

+  ExpectValidCodePoint(maxCodePoint, 0x10FFFF);
+
  // One past max code point
-  static unsigned const char onePastMaxCodePoint[] = { 0xF4, 0x90, 0x80, 0x80 };
+  static const unsigned char onePastMaxCodePoint[] = { 0xF4, 0x90, 0x80, 0x80, 0x0 };
  constexpr size_t onePastMaxCodePointLen = ArrayLength(onePastMaxCodePoint);
  MOZ_RELEASE_ASSERT(!IsValidUtf8(onePastMaxCodePoint, onePastMaxCodePointLen));

+  ExpectBadCodePoint(onePastMaxCodePoint, 0x110000, 4);
+
  // Surrogate-related testing

-  static const unsigned char justBeforeSurrogates[] = { 0xED, 0x9F, 0xBF };
-  MOZ_RELEASE_ASSERT(IsValidUtf8(justBeforeSurrogates, ArrayLength(justBeforeSurrogates)));
+  // (Note that the various code unit sequences here are null-terminated to
+  // simplify life for ExpectValidCodePoint, which presumes null termination.)

-  static const unsigned char leastSurrogate[] = { 0xED, 0xA0, 0x80 };
-  MOZ_RELEASE_ASSERT(!IsValidUtf8(leastSurrogate, ArrayLength(leastSurrogate)));
+  static const unsigned char justBeforeSurrogates[] = { 0xED, 0x9F, 0xBF, 0x0 };
+  constexpr size_t justBeforeSurrogatesLen = ArrayLength(justBeforeSurrogates) - 1;
+  MOZ_RELEASE_ASSERT(IsValidUtf8(justBeforeSurrogates, justBeforeSurrogatesLen));

-  static const unsigned char arbitraryHighSurrogate[] = { 0xED, 0xA2, 0x87 };
-  MOZ_RELEASE_ASSERT(!IsValidUtf8(arbitraryHighSurrogate, ArrayLength(arbitraryHighSurrogate)));
+  ExpectValidCodePoint(justBeforeSurrogates, 0xD7FF);

-  static const unsigned char arbitraryLowSurrogate[] = { 0xED, 0xB7, 0xAF };
-  MOZ_RELEASE_ASSERT(!IsValidUtf8(arbitraryLowSurrogate, ArrayLength(arbitraryLowSurrogate)));
+  static const unsigned char leastSurrogate[] = { 0xED, 0xA0, 0x80, 0x0 };
+  constexpr size_t leastSurrogateLen = ArrayLength(leastSurrogate) - 1;
+  MOZ_RELEASE_ASSERT(!IsValidUtf8(leastSurrogate, leastSurrogateLen));

-  static const unsigned char greatestSurrogate[] = { 0xED, 0xBF, 0xBF };
-  MOZ_RELEASE_ASSERT(!IsValidUtf8(greatestSurrogate, ArrayLength(greatestSurrogate)));
+  ExpectBadCodePoint(leastSurrogate, 0xD800, 3);

-  static const unsigned char justAfterSurrogates[] = { 0xEE, 0x80, 0x80 };
-  MOZ_RELEASE_ASSERT(IsValidUtf8(justAfterSurrogates, ArrayLength(justAfterSurrogates)));
+  static const unsigned char arbitraryHighSurrogate[] = { 0xED, 0xA2, 0x87, 0x0 };
+  constexpr size_t arbitraryHighSurrogateLen = ArrayLength(arbitraryHighSurrogate) - 1;
+  MOZ_RELEASE_ASSERT(!IsValidUtf8(arbitraryHighSurrogate, arbitraryHighSurrogateLen));
+
+  ExpectBadCodePoint(arbitraryHighSurrogate, 0xD887, 3);
+
+  static const unsigned char arbitraryLowSurrogate[] = { 0xED, 0xB7, 0xAF, 0x0 };
+  constexpr size_t arbitraryLowSurrogateLen = ArrayLength(arbitraryLowSurrogate) - 1;
+  MOZ_RELEASE_ASSERT(!IsValidUtf8(arbitraryLowSurrogate, arbitraryLowSurrogateLen));
+
+  ExpectBadCodePoint(arbitraryLowSurrogate, 0xDDEF, 3);
+
+  static const unsigned char greatestSurrogate[] = { 0xED, 0xBF, 0xBF, 0x0 };
+  constexpr size_t greatestSurrogateLen = ArrayLength(greatestSurrogate) - 1;
+  MOZ_RELEASE_ASSERT(!IsValidUtf8(greatestSurrogate, greatestSurrogateLen));
+
+  ExpectBadCodePoint(greatestSurrogate, 0xDFFF, 3);
+
+  static const unsigned char justAfterSurrogates[] = { 0xEE, 0x80, 0x80, 0x0 };
+  constexpr size_t justAfterSurrogatesLen = ArrayLength(justAfterSurrogates) - 1;
+  MOZ_RELEASE_ASSERT(IsValidUtf8(justAfterSurrogates, justAfterSurrogatesLen));
+
+  ExpectValidCodePoint(justAfterSurrogates, 0xE000);
+}
+
+static void
+TestDecodeOneValidUtf8CodePoint()
+{
+  // NOTE: DecodeOneUtf8CodePoint decodes only *non*-ASCII code points that
+  //       consist of multiple code units, so there are no ASCII tests below.
+
+  // Length two.
+
+  ExpectValidCodePoint(u8"", 0x80); // <control>
+  ExpectValidCodePoint(u8"©", 0xA9); // COPYRIGHT SIGN
+  ExpectValidCodePoint(u8"¶", 0xB6); // PILCROW SIGN
+  ExpectValidCodePoint(u8"¾", 0xBE); // VULGAR FRACTION THREE QUARTERS
+  ExpectValidCodePoint(u8"÷", 0xF7); // DIVISION SIGN
+  ExpectValidCodePoint(u8"ÿ", 0xFF); // LATIN SMALL LETTER Y WITH DIAERESIS
+  ExpectValidCodePoint(u8"Ā", 0x100); // LATIN CAPITAL LETTER A WITH MACRON
+  ExpectValidCodePoint(u8"Ĳ", 0x132); // LATIN CAPITAL LETTER LIGATURE IJ
+  ExpectValidCodePoint(u8"ͼ", 0x37C); // GREEK SMALL DOTTED LUNATE SIGMA SYMBOL
+  ExpectValidCodePoint(u8"Ӝ", 0x4DC); // CYRILLIC CAPITAL LETTER ZHE WITH DIAERESIS
+  ExpectValidCodePoint(u8"۩", 0x6E9); // ARABIC PLACE OF SAJDAH
+  ExpectValidCodePoint(u8"߿", 0x7FF); // <not assigned>
+
+  // Length three.
+
+  ExpectValidCodePoint(u8"ࠀ", 0x800); // SAMARITAN LETTER ALAF
+  ExpectValidCodePoint(u8"ࡁ", 0x841); // MANDAIC LETTER AB
+  ExpectValidCodePoint(u8"ࣿ", 0x8FF); // ARABIC MARK SIDEWAYS NOON GHUNNA
+  ExpectValidCodePoint(u8"ஆ", 0xB86); // TAMIL LETTER AA
+  ExpectValidCodePoint(u8"༃", 0xF03); // TIBETAN MARK GTER YIG MGO -UM GTER TSHEG MA
+  ExpectValidCodePoint(u8"࿉", 0xFC9); // TIBETAN SYMBOL NOR BU (but on my system it really looks like SOFT-SERVE ICE CREAM FROM ABOVE THE PLANE if you ask me)
+  ExpectValidCodePoint(u8"ဪ", 0x102A); // MYANMAR LETTER AU
+  ExpectValidCodePoint(u8"ᚏ", 0x168F); // OGHAM LETTER RUIS
+  ExpectValidCodePoint("\xE2\x80\xA8", 0x2028); // (the hated) LINE SEPARATOR
+  ExpectValidCodePoint("\xE2\x80\xA9", 0x2029); // (the hated) PARAGRAPH SEPARATOR
+  ExpectValidCodePoint(u8"☬", 0x262C); // ADI SHAKTI
+  ExpectValidCodePoint(u8"㊮", 0x32AE); // CIRCLED IDEOGRAPH RESOURCE
+  ExpectValidCodePoint(u8"㏖", 0x33D6); // SQUARE MOL
+  ExpectValidCodePoint(u8"ꔄ", 0xA504); // VAI SYLLABLE WEEN
+  ExpectValidCodePoint(u8"ퟕ", 0xD7D5); // HANGUL JONGSEONG RIEUL-SSANGKIYEOK
+  ExpectValidCodePoint(u8"퟿", 0xD7FF); // <not assigned>
+  ExpectValidCodePoint(u8"", 0xE000); // <Private Use>
+  ExpectValidCodePoint(u8"鱗", 0xF9F2); // CJK COMPATIBILITY IDEOGRAPH-F9F
+  ExpectValidCodePoint(u8"﷽", 0xFDFD); // ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHHHEEEEM
+  ExpectValidCodePoint(u8"", 0xFFFF); // <not assigned>
+
+  // Length four.
+  ExpectValidCodePoint(u8"𐀀", 0x10000); // LINEAR B SYLLABLE B008 A
+  ExpectValidCodePoint(u8"𔑀", 0x14440); // ANATOLIAN HIEROGLYPH A058
+  ExpectValidCodePoint(u8"𝛗", 0x1D6D7); // MATHEMATICAL BOLD SMALL PHI
+  ExpectValidCodePoint(u8"💩", 0x1F4A9); // PILE OF POO
+  ExpectValidCodePoint(u8"🔫", 0x1F52B); // PISTOL
+  ExpectValidCodePoint(u8"🥌", 0x1F94C); // CURLING STONE
+  ExpectValidCodePoint(u8"🥏", 0x1F94F); // FLYING DISC
+  ExpectValidCodePoint(u8"𠍆", 0x20346); // CJK UNIFIED IDEOGRAPH-20346
+  ExpectValidCodePoint(u8"𡠺", 0x2183A); // CJK UNIFIED IDEOGRAPH-2183A
+  ExpectValidCodePoint(u8"񁟶", 0x417F6); // <not assigned>
+  ExpectValidCodePoint(u8"񾠶", 0x7E836); // <not assigned>
+  ExpectValidCodePoint(u8"󾽧", 0xFEF67); // <Plane 15 Private Use>
+  ExpectValidCodePoint(u8"􏿿", 0x10FFFF); //
+}
+
+static void
+TestDecodeBadLeadUnit()
+{
+  // These tests are actually exhaustive.
+
+  unsigned char badLead[] = { '\0', '\0' };
+
+  for (uint8_t lead : IntegerRange(0b1000'0000, 0b1100'0000)) {
+    badLead[0] = lead;
+    ExpectBadLeadUnit(badLead);
+  }
+
+  {
+    uint8_t lead = 0b1111'1000;
+    do {
+      badLead[0] = lead;
+      ExpectBadLeadUnit(badLead);
+      if (lead == 0b1111'1111) {
+        break;
+      }
+
+      lead++;
+    } while (true);
+  }
+}
+
+// Exercise two-, three-, and four-unit sequences that are either truncated
+// (fewer units present than the lead unit calls for) or that contain a unit
+// which is not a valid trailing unit, i.e. not of the form 0b10xx'xxxx.
+//
+// For every lead unit of each length, every invalid value -- [0x00, 0x80) and
+// [0xC0, 0xFF] -- is tried at each trailing position in turn, while the other
+// trailing positions hold the valid value 0b1011'1111.
+//
+// NOTE: IntegerRange(a, b) is half-open, and a uint8_t counter can't hold 256,
+// so the ranges that must include 0xFF are walked with a do-while that checks
+// for the last value *before* incrementing.
+static void
+TestTooFewOrBadTrailingUnits()
+{
+  // Lead unit indicates a two-byte code point.
+
+  char truncatedTwo[] = { '\0', '\0' };
+  char badTrailTwo[] = { '\0', '\0', '\0' };
+
+  for (uint8_t lead : IntegerRange(0b1100'0000, 0b1110'0000)) {
+    // Only the lead unit is present.  (Presumably the trailing arguments are
+    // units-available then units-needed; confirm against the helper.)
+    truncatedTwo[0] = lead;
+    ExpectNotEnoughUnits(truncatedTwo, 1, 2);
+
+    badTrailTwo[0] = lead;
+    for (uint8_t trail : IntegerRange(0b0000'0000, 0b1000'0000)) {
+      badTrailTwo[1] = trail;
+      ExpectBadTrailingUnit(badTrailTwo, 2);
+    }
+    {
+      // Inclusive of 0b1111'1111, matching the three- and four-byte cases
+      // below; a half-open IntegerRange ending at 0b1111'1111 would skip it.
+      uint8_t trail = 0b1100'0000;
+      do {
+        badTrailTwo[1] = trail;
+        ExpectBadTrailingUnit(badTrailTwo, 2);
+        if (trail == 0b1111'1111) {
+          break;
+        }
+
+        trail++;
+      } while (true);
+    }
+  }
+
+  // Lead unit indicates a three-byte code point.
+
+  char truncatedThreeOne[] = { '\0', '\0' };
+  char truncatedThreeTwo[] = { '\0', '\0', '\0' };
+  unsigned char badTrailThree[] = { '\0', '\0', '\0', '\0' };
+
+  for (uint8_t lead : IntegerRange(0b1110'0000, 0b1111'0000)) {
+    truncatedThreeOne[0] = lead;
+    ExpectNotEnoughUnits(truncatedThreeOne, 1, 3);
+
+    truncatedThreeTwo[0] = lead;
+    ExpectNotEnoughUnits(truncatedThreeTwo, 2, 3);
+
+    // Bad second unit, valid third unit.
+    badTrailThree[0] = lead;
+    badTrailThree[2] = 0b1011'1111; // make valid to test overreads
+    for (uint8_t mid : IntegerRange(0b0000'0000, 0b1000'0000)) {
+      badTrailThree[1] = mid;
+      ExpectBadTrailingUnit(badTrailThree, 2);
+    }
+    {
+      uint8_t mid = 0b1100'0000;
+      do {
+        badTrailThree[1] = mid;
+        ExpectBadTrailingUnit(badTrailThree, 2);
+        if (mid == 0b1111'1111) {
+          break;
+        }
+
+        mid++;
+      } while (true);
+    }
+
+    // Valid second unit, bad third unit.
+    badTrailThree[1] = 0b1011'1111;
+    for (uint8_t last : IntegerRange(0b0000'0000, 0b1000'0000)) {
+      badTrailThree[2] = last;
+      ExpectBadTrailingUnit(badTrailThree, 3);
+    }
+    {
+      uint8_t last = 0b1100'0000;
+      do {
+        badTrailThree[2] = last;
+        ExpectBadTrailingUnit(badTrailThree, 3);
+        if (last == 0b1111'1111) {
+          break;
+        }
+
+        last++;
+      } while (true);
+    }
+  }
+
+  // Lead unit indicates a four-byte code point.
+
+  char truncatedFourOne[] = { '\0', '\0' };
+  char truncatedFourTwo[] = { '\0', '\0', '\0' };
+  char truncatedFourThree[] = { '\0', '\0', '\0', '\0' };
+
+  unsigned char badTrailFour[] = { '\0', '\0', '\0', '\0', '\0' };
+
+  for (uint8_t lead : IntegerRange(0b1111'0000, 0b1111'1000)) {
+    truncatedFourOne[0] = lead;
+    ExpectNotEnoughUnits(truncatedFourOne, 1, 4);
+
+    truncatedFourTwo[0] = lead;
+    ExpectNotEnoughUnits(truncatedFourTwo, 2, 4);
+
+    truncatedFourThree[0] = lead;
+    ExpectNotEnoughUnits(truncatedFourThree, 3, 4);
+
+    // Bad second unit; third and fourth units valid.
+    badTrailFour[0] = lead;
+    badTrailFour[2] = badTrailFour[3] = 0b1011'1111; // test for overreads
+    for (uint8_t second : IntegerRange(0b0000'0000, 0b1000'0000)) {
+      badTrailFour[1] = second;
+      ExpectBadTrailingUnit(badTrailFour, 2);
+    }
+    {
+      uint8_t second = 0b1100'0000;
+      do {
+        badTrailFour[1] = second;
+        ExpectBadTrailingUnit(badTrailFour, 2);
+        if (second == 0b1111'1111) {
+          break;
+        }
+
+        second++;
+      } while (true);
+    }
+
+    // Bad third unit; second and fourth units valid.
+    badTrailFour[1] = badTrailFour[3] = 0b1011'1111; // test for overreads
+    for (uint8_t third : IntegerRange(0b0000'0000, 0b1000'0000)) {
+      badTrailFour[2] = third;
+      ExpectBadTrailingUnit(badTrailFour, 3);
+    }
+    {
+      uint8_t third = 0b1100'0000;
+      do {
+        badTrailFour[2] = third;
+        ExpectBadTrailingUnit(badTrailFour, 3);
+        if (third == 0b1111'1111) {
+          break;
+        }
+
+        third++;
+      } while (true);
+    }
+
+    // Bad fourth unit; second and third units valid.
+    badTrailFour[2] = 0b1011'1111;
+    for (uint8_t fourth : IntegerRange(0b0000'0000, 0b1000'0000)) {
+      badTrailFour[3] = fourth;
+      ExpectBadTrailingUnit(badTrailFour, 4);
+    }
+    {
+      uint8_t fourth = 0b1100'0000;
+      do {
+        badTrailFour[3] = fourth;
+        ExpectBadTrailingUnit(badTrailFour, 4);
+        if (fourth == 0b1111'1111) {
+          break;
+        }
+
+        fourth++;
+      } while (true);
+    }
+  }
+}
+
+// Verify that every three-unit encoding of a surrogate code point is rejected
+// as a bad code point, while the code points on either side of the surrogate
+// range decode successfully.
+static void
+TestBadSurrogate()
+{
+  // This walks the entire surrogate range, so coverage is exhaustive.
+
+  // The neighbors of the surrogate range are ordinary, valid code points.
+  ExpectValidCodePoint("\xED\x9F\xBF", 0xD7FF); // last before surrogates
+  ExpectValidCodePoint("\xEE\x80\x80", 0xE000); // first after surrogates
+
+  // All surrogates share the lead unit 0xED: encodings run from
+  // { 0xED, 0xA0, 0x80 } through { 0xED, 0xBF, 0xBF }.  Only the two trailing
+  // units vary; the final element stays '\0'.
+
+  char encoded[] = { '\xED', '\0', '\0', '\0' };
+
+  char32_t surrogate = 0xD800;
+  while (surrogate < 0xE000) {
+    // Split the low twelve bits into two six-bit payloads.
+    const char32_t upperSix = (surrogate & 0b1111'1100'0000) >> 6;
+    const char32_t lowerSix = surrogate & 0b0000'0011'1111;
+
+    encoded[1] = 0b1000'0000 ^ upperSix;
+    encoded[2] = 0b1000'0000 ^ lowerSix;
+
+    ExpectBadCodePoint(encoded, surrogate, 3);
+
+    surrogate++;
+  }
+}
+
+// Verify that every structurally well-formed four-unit sequence encoding a
+// value above U+10FFFF is rejected as a bad code point.
+static void
+TestBadTooBig()
+{
+  // These tests are actually exhaustive.
+
+  ExpectValidCodePoint("\xF4\x8F\xBF\xBF", 0x10'FFFF); // last code point
+
+  // Four-byte code points are
+  //
+  //   0b1111'0xxx 0b10xx'xxxx 0b10xx'xxxx 0b10xx'xxxx
+  //
+  // with 3 + 6 + 6 + 6 == 21 unconstrained bits, so the structurally
+  // representable limit (exclusive) is 2**21 == 2097152.
+
+  char tooLargeCodePoint[] = { '\0', '\0', '\0', '\0', '\0' };
+
+  // Walk every value from the first one past U+10FFFF up to (but excluding)
+  // 2**21, packing its 21 bits into the 3 + 6 + 6 + 6 payload positions.
+  for (char32_t c = 0x11'0000; c < (1 << 21); c++) {
+    tooLargeCodePoint[0] = 0b1111'0000 ^ ((c & 0b1'1100'0000'0000'0000'0000) >> 18);
+    tooLargeCodePoint[1] = 0b1000'0000 ^ ((c & 0b0'0011'1111'0000'0000'0000) >> 12);
+    tooLargeCodePoint[2] = 0b1000'0000 ^ ((c & 0b0'0000'0000'1111'1100'0000) >> 6);
+    tooLargeCodePoint[3] = 0b1000'0000 ^ ((c & 0b0'0000'0000'0000'0011'1111));
+
+    ExpectBadCodePoint(tooLargeCodePoint, c, 4);
+  }
+}
+
+// Run the tests for sequences that encode a value which is not a valid code
+// point: surrogates, and values beyond U+10FFFF.
+static void
+TestBadCodePoint()
+{
+  TestBadSurrogate();
+  TestBadTooBig();
+}
+
+// Verify that overlong encodings -- a code point written with more units than
+// its shortest form requires -- are reported as not-shortest-form.  Every code
+// point of each shorter length is tried in every longer length.
+static void
+TestNotShortestForm()
+{
+  // Each packer writes |aCodePoint| into |aUnits| using a fixed number of
+  // units, regardless of whether that is the shortest encoding.  The final
+  // array element is never written, so it keeps its initial '\0'.
+
+  auto packIntoTwo = [](char (&aUnits)[3], char32_t aCodePoint) {
+    aUnits[0] = 0b1100'0000 ^ ((aCodePoint & 0b0111'1100'0000) >> 6);
+    aUnits[1] = 0b1000'0000 ^ ((aCodePoint & 0b0000'0011'1111));
+  };
+
+  auto packIntoThree = [](char (&aUnits)[4], char32_t aCodePoint) {
+    aUnits[0] = 0b1110'0000 ^ ((aCodePoint & 0b1111'0000'0000'0000) >> 12);
+    aUnits[1] = 0b1000'0000 ^ ((aCodePoint & 0b0000'1111'1100'0000) >> 6);
+    aUnits[2] = 0b1000'0000 ^ ((aCodePoint & 0b0000'0000'0011'1111));
+  };
+
+  auto packIntoFour = [](char (&aUnits)[5], char32_t aCodePoint) {
+    aUnits[0] = 0b1111'0000 ^ ((aCodePoint & 0b1'1100'0000'0000'0000'0000) >> 18);
+    aUnits[1] = 0b1000'0000 ^ ((aCodePoint & 0b0'0011'1111'0000'0000'0000) >> 12);
+    aUnits[2] = 0b1000'0000 ^ ((aCodePoint & 0b0'0000'0000'1111'1100'0000) >> 6);
+    aUnits[3] = 0b1000'0000 ^ ((aCodePoint & 0b0'0000'0000'0000'0011'1111));
+  };
+
+  // One-byte code points, [0, 0x80), in two bytes.
+  char oneAsTwo[] = { '\0', '\0', '\0' };
+  for (char32_t cp = 0; cp < 0x80; cp++) {
+    packIntoTwo(oneAsTwo, cp);
+    ExpectNotShortestForm(oneAsTwo, cp, 2);
+  }
+
+  // One-byte code points in three bytes.
+  char oneAsThree[] = { '\0', '\0', '\0', '\0' };
+  for (char32_t cp = 0; cp < 0x80; cp++) {
+    packIntoThree(oneAsThree, cp);
+    ExpectNotShortestForm(oneAsThree, cp, 3);
+  }
+
+  // One-byte code points in four bytes.
+  char oneAsFour[] = { '\0', '\0', '\0', '\0', '\0' };
+  for (char32_t cp = 0; cp < 0x80; cp++) {
+    packIntoFour(oneAsFour, cp);
+    ExpectNotShortestForm(oneAsFour, cp, 4);
+  }
+
+  // Two-byte code points, [0x80, 0x800), in three bytes.
+  char twoAsThree[] = { '\0', '\0', '\0', '\0' };
+  for (char32_t cp = 0x80; cp < 0x800; cp++) {
+    packIntoThree(twoAsThree, cp);
+    ExpectNotShortestForm(twoAsThree, cp, 3);
+  }
+
+  // Two-byte code points in four bytes.
+  char twoAsFour[] = { '\0', '\0', '\0', '\0', '\0' };
+  for (char32_t cp = 0x80; cp < 0x800; cp++) {
+    packIntoFour(twoAsFour, cp);
+    ExpectNotShortestForm(twoAsFour, cp, 4);
+  }
+
+  // Three-byte code points, [0x800, 0x10000), in four bytes.
+  char threeAsFour[] = { '\0', '\0', '\0', '\0', '\0' };
+  for (char32_t cp = 0x800; cp < 0x1'0000; cp++) {
+    packIntoFour(threeAsFour, cp);
+    ExpectNotShortestForm(threeAsFour, cp, 4);
+  }
+}
+
+// Run every test group that feeds invalid input to the decoder: bad lead
+// units, truncated or malformed trailing units, bad code points, and
+// non-shortest-form encodings.
+static void
+TestDecodeOneInvalidUtf8CodePoint()
+{
+  TestDecodeBadLeadUnit();
+  TestTooFewOrBadTrailingUnits();
+  TestBadCodePoint();
+  TestNotShortestForm();
+}
+
+// Entry point for the single-code-point decoding tests: runs the valid-input
+// tests first, then the invalid-input tests.
+static void
+TestDecodeOneUtf8CodePoint()
+{
+  TestDecodeOneValidUtf8CodePoint();
+  TestDecodeOneInvalidUtf8CodePoint();
 }

 int
@ -111,5 +775,6 @@ main()
 {
  TestUtf8Unit();
  TestIsValidUtf8();
+  TestDecodeOneUtf8CodePoint();
  return 0;
 }