forked from mirrors/gecko-dev
		
	 1b75555dc7
			
		
	
	
		1b75555dc7
		
	
	
	
	
		
			
			`nscore.h` includes `xpcom-config.h` which need not be generated for non-XPCOM consumers. In additon, `nullptr` and `bool` are C++ keywords, so at least some of the comments were dated. The added include lines address transitive consumers of `nscore.h`. Differential Revision: https://phabricator.services.mozilla.com/D82640
		
			
				
	
	
		
			247 lines
		
	
	
	
		
			7.5 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			247 lines
		
	
	
	
		
			7.5 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
| /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
 | |
| /* vim: set ts=8 sts=2 et sw=2 tw=80: */
 | |
| /* This Source Code Form is subject to the terms of the Mozilla Public
 | |
|  * License, v. 2.0. If a copy of the MPL was not distributed with this
 | |
|  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 | |
| #ifndef nsUTF8Utils_h_
 | |
| #define nsUTF8Utils_h_
 | |
| 
 | |
| // NB: This code may be used from non-XPCOM code, in particular, the
 | |
| // standalone updater executable.  That is, this file may be used in
 | |
| // two ways: if MOZILLA_INTERNAL_API is defined, this file will
 | |
| // provide signatures for the Mozilla abstract string types. It will
 | |
| // use XPCOM assertion/debugging macros, etc.
 | |
| 
 | |
| #include <type_traits>
 | |
| 
 | |
| #include "mozilla/Assertions.h"
 | |
| #include "mozilla/EndianUtils.h"
 | |
| 
 | |
| #include "nsCharTraits.h"
 | |
| 
 | |
| #ifdef MOZILLA_INTERNAL_API
 | |
| #  define UTF8UTILS_WARNING(msg) NS_WARNING(msg)
 | |
| #else
 | |
| #  define UTF8UTILS_WARNING(msg)
 | |
| #endif
 | |
| 
 | |
| class UTF8traits {
 | |
|  public:
 | |
|   static bool isASCII(char aChar) { return (aChar & 0x80) == 0x00; }
 | |
|   static bool isInSeq(char aChar) { return (aChar & 0xC0) == 0x80; }
 | |
|   static bool is2byte(char aChar) { return (aChar & 0xE0) == 0xC0; }
 | |
|   static bool is3byte(char aChar) { return (aChar & 0xF0) == 0xE0; }
 | |
|   static bool is4byte(char aChar) { return (aChar & 0xF8) == 0xF0; }
 | |
|   static bool is5byte(char aChar) { return (aChar & 0xFC) == 0xF8; }
 | |
|   static bool is6byte(char aChar) { return (aChar & 0xFE) == 0xFC; }
 | |
|   // return the number of bytes in a sequence beginning with aChar
 | |
|   static int bytes(char aChar) {
 | |
|     if (isASCII(aChar)) {
 | |
|       return 1;
 | |
|     }
 | |
|     if (is2byte(aChar)) {
 | |
|       return 2;
 | |
|     }
 | |
|     if (is3byte(aChar)) {
 | |
|       return 3;
 | |
|     }
 | |
|     if (is4byte(aChar)) {
 | |
|       return 4;
 | |
|     }
 | |
|     MOZ_ASSERT_UNREACHABLE("should not be used for in-sequence characters");
 | |
|     return 1;
 | |
|   }
 | |
| };
 | |
| 
 | |
| /**
 | |
|  * Extract the next Unicode scalar value from the buffer and return it. The
 | |
|  * pointer passed in is advanced to the start of the next character in the
 | |
|  * buffer. Upon error, the return value is 0xFFFD, *aBuffer is advanced
 | |
|  * over the maximal valid prefix and *aErr is set to true (if aErr is not
 | |
|  * null).
 | |
|  *
 | |
|  * Note: This method never sets *aErr to false to allow error accumulation
 | |
|  * across multiple calls.
 | |
|  *
 | |
|  * Precondition: *aBuffer < aEnd
 | |
|  */
 | |
| class UTF8CharEnumerator {
 | |
|  public:
 | |
|   static inline char32_t NextChar(const char** aBuffer, const char* aEnd,
 | |
|                                   bool* aErr = nullptr) {
 | |
|     MOZ_ASSERT(aBuffer, "null buffer pointer pointer");
 | |
|     MOZ_ASSERT(aEnd, "null end pointer");
 | |
| 
 | |
|     const unsigned char* p = reinterpret_cast<const unsigned char*>(*aBuffer);
 | |
|     const unsigned char* end = reinterpret_cast<const unsigned char*>(aEnd);
 | |
| 
 | |
|     MOZ_ASSERT(p, "null buffer");
 | |
|     MOZ_ASSERT(p < end, "Bogus range");
 | |
| 
 | |
|     unsigned char first = *p;
 | |
|     ++p;
 | |
| 
 | |
|     if (MOZ_LIKELY(first < 0x80U)) {
 | |
|       *aBuffer = reinterpret_cast<const char*>(p);
 | |
|       return first;
 | |
|     }
 | |
| 
 | |
|     // Unsigned underflow is defined behavior
 | |
|     if (MOZ_UNLIKELY((p == end) || ((first - 0xC2U) >= (0xF5U - 0xC2U)))) {
 | |
|       *aBuffer = reinterpret_cast<const char*>(p);
 | |
|       if (aErr) {
 | |
|         *aErr = true;
 | |
|       }
 | |
|       return 0xFFFDU;
 | |
|     }
 | |
| 
 | |
|     unsigned char second = *p;
 | |
| 
 | |
|     if (first < 0xE0U) {
 | |
|       // Two-byte
 | |
|       if (MOZ_LIKELY((second & 0xC0U) == 0x80U)) {
 | |
|         ++p;
 | |
|         *aBuffer = reinterpret_cast<const char*>(p);
 | |
|         return ((uint32_t(first) & 0x1FU) << 6) | (uint32_t(second) & 0x3FU);
 | |
|       }
 | |
|       *aBuffer = reinterpret_cast<const char*>(p);
 | |
|       if (aErr) {
 | |
|         *aErr = true;
 | |
|       }
 | |
|       return 0xFFFDU;
 | |
|     }
 | |
| 
 | |
|     if (MOZ_LIKELY(first < 0xF0U)) {
 | |
|       // Three-byte
 | |
|       unsigned char lower = 0x80U;
 | |
|       unsigned char upper = 0xBFU;
 | |
|       if (first == 0xE0U) {
 | |
|         lower = 0xA0U;
 | |
|       } else if (first == 0xEDU) {
 | |
|         upper = 0x9FU;
 | |
|       }
 | |
|       if (MOZ_LIKELY(second >= lower && second <= upper)) {
 | |
|         ++p;
 | |
|         if (MOZ_LIKELY(p != end)) {
 | |
|           unsigned char third = *p;
 | |
|           if (MOZ_LIKELY((third & 0xC0U) == 0x80U)) {
 | |
|             ++p;
 | |
|             *aBuffer = reinterpret_cast<const char*>(p);
 | |
|             return ((uint32_t(first) & 0xFU) << 12) |
 | |
|                    ((uint32_t(second) & 0x3FU) << 6) |
 | |
|                    (uint32_t(third) & 0x3FU);
 | |
|           }
 | |
|         }
 | |
|       }
 | |
|       *aBuffer = reinterpret_cast<const char*>(p);
 | |
|       if (aErr) {
 | |
|         *aErr = true;
 | |
|       }
 | |
|       return 0xFFFDU;
 | |
|     }
 | |
| 
 | |
|     // Four-byte
 | |
|     unsigned char lower = 0x80U;
 | |
|     unsigned char upper = 0xBFU;
 | |
|     if (first == 0xF0U) {
 | |
|       lower = 0x90U;
 | |
|     } else if (first == 0xF4U) {
 | |
|       upper = 0x8FU;
 | |
|     }
 | |
|     if (MOZ_LIKELY(second >= lower && second <= upper)) {
 | |
|       ++p;
 | |
|       if (MOZ_LIKELY(p != end)) {
 | |
|         unsigned char third = *p;
 | |
|         if (MOZ_LIKELY((third & 0xC0U) == 0x80U)) {
 | |
|           ++p;
 | |
|           if (MOZ_LIKELY(p != end)) {
 | |
|             unsigned char fourth = *p;
 | |
|             if (MOZ_LIKELY((fourth & 0xC0U) == 0x80U)) {
 | |
|               ++p;
 | |
|               *aBuffer = reinterpret_cast<const char*>(p);
 | |
|               return ((uint32_t(first) & 0x7U) << 18) |
 | |
|                      ((uint32_t(second) & 0x3FU) << 12) |
 | |
|                      ((uint32_t(third) & 0x3FU) << 6) |
 | |
|                      (uint32_t(fourth) & 0x3FU);
 | |
|             }
 | |
|           }
 | |
|         }
 | |
|       }
 | |
|     }
 | |
|     *aBuffer = reinterpret_cast<const char*>(p);
 | |
|     if (aErr) {
 | |
|       *aErr = true;
 | |
|     }
 | |
|     return 0xFFFDU;
 | |
|   }
 | |
| };
 | |
| 
 | |
| /**
 | |
|  * Extract the next Unicode scalar value from the buffer and return it. The
 | |
|  * pointer passed in is advanced to the start of the next character in the
 | |
|  * buffer. Upon error, the return value is 0xFFFD, *aBuffer is advanced over
 | |
|  * the unpaired surrogate and *aErr is set to true (if aErr is not null).
 | |
|  *
 | |
|  * Note: This method never sets *aErr to false to allow error accumulation
 | |
|  * across multiple calls.
 | |
|  *
 | |
|  * Precondition: *aBuffer < aEnd
 | |
|  */
 | |
| class UTF16CharEnumerator {
 | |
|  public:
 | |
|   static inline char32_t NextChar(const char16_t** aBuffer,
 | |
|                                   const char16_t* aEnd, bool* aErr = nullptr) {
 | |
|     MOZ_ASSERT(aBuffer, "null buffer pointer pointer");
 | |
|     MOZ_ASSERT(aEnd, "null end pointer");
 | |
| 
 | |
|     const char16_t* p = *aBuffer;
 | |
| 
 | |
|     MOZ_ASSERT(p, "null buffer");
 | |
|     MOZ_ASSERT(p < aEnd, "Bogus range");
 | |
| 
 | |
|     char16_t c = *p++;
 | |
| 
 | |
|     // Let's use encoding_rs-style code golf here.
 | |
|     // Unsigned underflow is defined behavior
 | |
|     char16_t cMinusSurrogateStart = c - 0xD800U;
 | |
|     if (MOZ_LIKELY(cMinusSurrogateStart > (0xDFFFU - 0xD800U))) {
 | |
|       *aBuffer = p;
 | |
|       return c;
 | |
|     }
 | |
|     if (MOZ_LIKELY(cMinusSurrogateStart <= (0xDBFFU - 0xD800U))) {
 | |
|       // High surrogate
 | |
|       if (MOZ_LIKELY(p != aEnd)) {
 | |
|         char16_t second = *p;
 | |
|         // Unsigned underflow is defined behavior
 | |
|         if (MOZ_LIKELY((second - 0xDC00U) <= (0xDFFFU - 0xDC00U))) {
 | |
|           *aBuffer = ++p;
 | |
|           return (uint32_t(c) << 10) + uint32_t(second) -
 | |
|                  (((0xD800U << 10) - 0x10000U) + 0xDC00U);
 | |
|         }
 | |
|       }
 | |
|     }
 | |
|     // Unpaired surrogate
 | |
|     *aBuffer = p;
 | |
|     if (aErr) {
 | |
|       *aErr = true;
 | |
|     }
 | |
|     return 0xFFFDU;
 | |
|   }
 | |
| };
 | |
| 
 | |
| template <typename Char, typename UnsignedT>
 | |
| inline UnsignedT RewindToPriorUTF8Codepoint(const Char* utf8Chars,
 | |
|                                             UnsignedT index) {
 | |
|   static_assert(std::is_same_v<Char, char> ||
 | |
|                     std::is_same_v<Char, unsigned char> ||
 | |
|                     std::is_same_v<Char, signed char>,
 | |
|                 "UTF-8 data must be in 8-bit units");
 | |
|   static_assert(std::is_unsigned_v<UnsignedT>, "index type must be unsigned");
 | |
|   while (index > 0 && (utf8Chars[index] & 0xC0) == 0x80) --index;
 | |
| 
 | |
|   return index;
 | |
| }
 | |
| 
 | |
| #undef UTF8UTILS_WARNING
 | |
| 
 | |
| #endif /* !defined(nsUTF8Utils_h_) */
 |