forked from mirrors/gecko-dev
		
	Bug 1719535 - Part 5. Add ICU4X based segmenter modules. r=TYLin,jfkthame
Depends on D167673. Differential Revision: https://phabricator.services.mozilla.com/D167675
This commit is contained in:
		
							parent
							
								
									80aa728f71
								
							
						
					
					
						commit
						d8a2fdf7da
					
				
					 7 changed files with 655 additions and 8 deletions
				
			
		|  | @ -13,6 +13,20 @@ | ||||||
| #include "mozilla/intl/Segmenter.h" | #include "mozilla/intl/Segmenter.h" | ||||||
| #include "mozilla/intl/UnicodeProperties.h" | #include "mozilla/intl/UnicodeProperties.h" | ||||||
| 
 | 
 | ||||||
|  | #if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) | ||||||
|  | #  include "ICU4XDataProvider.h" | ||||||
|  | #  include "ICU4XLineBreakIteratorLatin1.hpp" | ||||||
|  | #  include "ICU4XLineBreakIteratorUtf16.hpp" | ||||||
|  | #  include "ICU4XLineSegmenter.h" | ||||||
|  | #  include "mozilla/CheckedInt.h" | ||||||
|  | #  include "mozilla/ClearOnShutdown.h" | ||||||
|  | #  include "mozilla/intl/ICU4XGeckoDataProvider.h" | ||||||
|  | #  include "mozilla/StaticPrefs_intl.h" | ||||||
|  | #  include "nsThreadUtils.h" | ||||||
|  | 
 | ||||||
|  | #  include <mutex> | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
| using namespace mozilla::unicode; | using namespace mozilla::unicode; | ||||||
| using namespace mozilla::intl; | using namespace mozilla::intl; | ||||||
| 
 | 
 | ||||||
|  | @ -978,9 +992,136 @@ static bool SuppressBreakForKeepAll(uint32_t aPrev, uint32_t aCh) { | ||||||
|          affectedByKeepAll(GetLineBreakClass(aCh)); |          affectedByKeepAll(GetLineBreakClass(aCh)); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | #if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) | ||||||
|  | static capi::ICU4XLineBreakStrictness ConvertLineBreakRuleToICU4X( | ||||||
|  |     LineBreakRule aLevel) { | ||||||
|  |   switch (aLevel) { | ||||||
|  |     case LineBreakRule::Auto: | ||||||
|  |       return capi::ICU4XLineBreakStrictness_Strict; | ||||||
|  |     case LineBreakRule::Strict: | ||||||
|  |       return capi::ICU4XLineBreakStrictness_Strict; | ||||||
|  |     case LineBreakRule::Loose: | ||||||
|  |       return capi::ICU4XLineBreakStrictness_Loose; | ||||||
|  |     case LineBreakRule::Normal: | ||||||
|  |       return capi::ICU4XLineBreakStrictness_Normal; | ||||||
|  |     case LineBreakRule::Anywhere: | ||||||
|  |       return capi::ICU4XLineBreakStrictness_Anywhere; | ||||||
|  |   } | ||||||
|  |   MOZ_ASSERT_UNREACHABLE("should have been handled already"); | ||||||
|  |   return capi::ICU4XLineBreakStrictness_Normal; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static capi::ICU4XLineBreakWordOption ConvertWordBreakRuleToICU4X( | ||||||
|  |     WordBreakRule aWordBreak) { | ||||||
|  |   switch (aWordBreak) { | ||||||
|  |     case WordBreakRule::Normal: | ||||||
|  |       return capi::ICU4XLineBreakWordOption_Normal; | ||||||
|  |     case WordBreakRule::BreakAll: | ||||||
|  |       return capi::ICU4XLineBreakWordOption_BreakAll; | ||||||
|  |     case WordBreakRule::KeepAll: | ||||||
|  |       return capi::ICU4XLineBreakWordOption_KeepAll; | ||||||
|  |   } | ||||||
|  |   MOZ_ASSERT_UNREACHABLE("should have been handled already"); | ||||||
|  |   return capi::ICU4XLineBreakWordOption_Normal; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static capi::ICU4XLineSegmenter* sLineSegmenter = nullptr; | ||||||
|  | 
 | ||||||
|  | static capi::ICU4XLineSegmenter* GetDefaultLineSegmenter() { | ||||||
|  |   static std::once_flag sOnce; | ||||||
|  | 
 | ||||||
|  |   std::call_once(sOnce, [] { | ||||||
|  |     auto result = capi::ICU4XLineSegmenter_create_auto(GetDataProvider()); | ||||||
|  |     MOZ_ASSERT(result.is_ok); | ||||||
|  |     sLineSegmenter = result.ok; | ||||||
|  | 
 | ||||||
|  |     if (NS_IsMainThread()) { | ||||||
|  |       mozilla::RunOnShutdown([] { | ||||||
|  |         if (sLineSegmenter) { | ||||||
|  |           capi::ICU4XLineSegmenter_destroy(sLineSegmenter); | ||||||
|  |         } | ||||||
|  |         sLineSegmenter = nullptr; | ||||||
|  |       }); | ||||||
|  |       return; | ||||||
|  |     } | ||||||
|  |     NS_DispatchToMainThread( | ||||||
|  |         NS_NewRunnableFunction("GetDefaultLineSegmenter", [] { | ||||||
|  |           mozilla::RunOnShutdown([] { | ||||||
|  |             if (sLineSegmenter) { | ||||||
|  |               capi::ICU4XLineSegmenter_destroy(sLineSegmenter); | ||||||
|  |             } | ||||||
|  |             sLineSegmenter = nullptr; | ||||||
|  |           }); | ||||||
|  |         })); | ||||||
|  |   }); | ||||||
|  | 
 | ||||||
|  |   return sLineSegmenter; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static bool UseDefaultLineSegmenter(WordBreakRule aWordBreak, | ||||||
|  |                                     LineBreakRule aLevel, | ||||||
|  |                                     bool aIsChineseOrJapanese) { | ||||||
|  |   return aWordBreak == WordBreakRule::Normal && | ||||||
|  |          (aLevel == LineBreakRule::Strict || aLevel == LineBreakRule::Auto) && | ||||||
|  |          !aIsChineseOrJapanese; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static capi::ICU4XLineSegmenter* GetLineSegmenter(bool aUseDefault, | ||||||
|  |                                                   WordBreakRule aWordBreak, | ||||||
|  |                                                   LineBreakRule aLevel, | ||||||
|  |                                                   bool aIsChineseOrJapanese) { | ||||||
|  |   if (aUseDefault) { | ||||||
|  |     MOZ_ASSERT( | ||||||
|  |         UseDefaultLineSegmenter(aWordBreak, aLevel, aIsChineseOrJapanese)); | ||||||
|  |     return GetDefaultLineSegmenter(); | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   capi::ICU4XLineBreakOptionsV1 options; | ||||||
|  |   options.word_option = ConvertWordBreakRuleToICU4X(aWordBreak); | ||||||
|  |   options.strictness = ConvertLineBreakRuleToICU4X(aLevel); | ||||||
|  |   options.ja_zh = aIsChineseOrJapanese; | ||||||
|  | 
 | ||||||
|  |   auto result = capi::ICU4XLineSegmenter_create_lstm_with_options_v1( | ||||||
|  |       GetDataProvider(), options); | ||||||
|  |   MOZ_ASSERT(result.is_ok); | ||||||
|  |   return result.ok; | ||||||
|  | } | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
| void LineBreaker::ComputeBreakPositions( | void LineBreaker::ComputeBreakPositions( | ||||||
|     const char16_t* aChars, uint32_t aLength, WordBreakRule aWordBreak, |     const char16_t* aChars, uint32_t aLength, WordBreakRule aWordBreak, | ||||||
|     LineBreakRule aLevel, bool aIsChineseOrJapanese, uint8_t* aBreakBefore) { |     LineBreakRule aLevel, bool aIsChineseOrJapanese, uint8_t* aBreakBefore) { | ||||||
|  | #if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) | ||||||
|  |   if (StaticPrefs::intl_icu4x_segmenter_enabled()) { | ||||||
|  |     memset(aBreakBefore, 0, aLength); | ||||||
|  | 
 | ||||||
|  |     CheckedInt<int32_t> length = aLength; | ||||||
|  |     if (!length.isValid()) { | ||||||
|  |       return; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     const bool useDefault = | ||||||
|  |         UseDefaultLineSegmenter(aWordBreak, aLevel, aIsChineseOrJapanese); | ||||||
|  |     capi::ICU4XLineSegmenter* lineSegmenter = | ||||||
|  |         GetLineSegmenter(useDefault, aWordBreak, aLevel, aIsChineseOrJapanese); | ||||||
|  |     ICU4XLineBreakIteratorUtf16 iterator(capi::ICU4XLineSegmenter_segment_utf16( | ||||||
|  |         lineSegmenter, (const uint16_t*)aChars, aLength)); | ||||||
|  | 
 | ||||||
|  |     while (true) { | ||||||
|  |       const int32_t nextPos = iterator.next(); | ||||||
|  |       if (nextPos < 0 || nextPos >= length.value()) { | ||||||
|  |         break; | ||||||
|  |       } | ||||||
|  |       aBreakBefore[nextPos] = 1; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     if (!useDefault) { | ||||||
|  |       capi::ICU4XLineSegmenter_destroy(lineSegmenter); | ||||||
|  |     } | ||||||
|  |     return; | ||||||
|  |   } | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
|   uint32_t cur; |   uint32_t cur; | ||||||
|   int8_t lastClass = CLASS_NONE; |   int8_t lastClass = CLASS_NONE; | ||||||
|   ContextState state(aChars, aLength); |   ContextState state(aChars, aLength); | ||||||
|  | @ -1110,6 +1251,38 @@ void LineBreaker::ComputeBreakPositions(const uint8_t* aChars, uint32_t aLength, | ||||||
|                                         LineBreakRule aLevel, |                                         LineBreakRule aLevel, | ||||||
|                                         bool aIsChineseOrJapanese, |                                         bool aIsChineseOrJapanese, | ||||||
|                                         uint8_t* aBreakBefore) { |                                         uint8_t* aBreakBefore) { | ||||||
|  | #if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) | ||||||
|  |   if (StaticPrefs::intl_icu4x_segmenter_enabled()) { | ||||||
|  |     memset(aBreakBefore, 0, aLength); | ||||||
|  | 
 | ||||||
|  |     CheckedInt<int32_t> length = aLength; | ||||||
|  |     if (!length.isValid()) { | ||||||
|  |       return; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     const bool useDefault = | ||||||
|  |         UseDefaultLineSegmenter(aWordBreak, aLevel, aIsChineseOrJapanese); | ||||||
|  |     capi::ICU4XLineSegmenter* lineSegmenter = | ||||||
|  |         GetLineSegmenter(useDefault, aWordBreak, aLevel, aIsChineseOrJapanese); | ||||||
|  |     ICU4XLineBreakIteratorLatin1 iterator( | ||||||
|  |         capi::ICU4XLineSegmenter_segment_latin1( | ||||||
|  |             lineSegmenter, (const uint8_t*)aChars, aLength)); | ||||||
|  | 
 | ||||||
|  |     while (true) { | ||||||
|  |       const int32_t nextPos = iterator.next(); | ||||||
|  |       if (nextPos < 0 || nextPos >= length.value()) { | ||||||
|  |         break; | ||||||
|  |       } | ||||||
|  |       aBreakBefore[nextPos] = 1; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     if (!useDefault) { | ||||||
|  |       capi::ICU4XLineSegmenter_destroy(lineSegmenter); | ||||||
|  |     } | ||||||
|  |     return; | ||||||
|  |   } | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
|   uint32_t cur; |   uint32_t cur; | ||||||
|   int8_t lastClass = CLASS_NONE; |   int8_t lastClass = CLASS_NONE; | ||||||
|   ContextState state(aChars, aLength); |   ContextState state(aChars, aLength); | ||||||
|  |  | ||||||
|  | @ -11,9 +11,19 @@ | ||||||
| #include "mozilla/intl/LineBreaker.h" | #include "mozilla/intl/LineBreaker.h" | ||||||
| #include "mozilla/intl/WordBreaker.h" | #include "mozilla/intl/WordBreaker.h" | ||||||
| #include "mozilla/intl/UnicodeProperties.h" | #include "mozilla/intl/UnicodeProperties.h" | ||||||
|  | #include "mozilla/StaticPrefs_intl.h" | ||||||
| #include "nsUnicodeProperties.h" | #include "nsUnicodeProperties.h" | ||||||
| #include "nsCharTraits.h" | #include "nsCharTraits.h" | ||||||
| 
 | 
 | ||||||
|  | #if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) | ||||||
|  | #  include "ICU4XDataProvider.h" | ||||||
|  | #  include "ICU4XGraphemeClusterSegmenter.h" | ||||||
|  | #  include "ICU4XLineSegmenter.h" | ||||||
|  | #  include "ICU4XSentenceSegmenter.h" | ||||||
|  | #  include "ICU4XWordSegmenter.h" | ||||||
|  | #  include "mozilla/intl/ICU4XGeckoDataProvider.h" | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
| using namespace mozilla::unicode; | using namespace mozilla::unicode; | ||||||
| 
 | 
 | ||||||
| namespace mozilla::intl { | namespace mozilla::intl { | ||||||
|  | @ -30,9 +40,45 @@ Maybe<uint32_t> SegmentIteratorUtf16::Seek(uint32_t aPos) { | ||||||
| 
 | 
 | ||||||
| LineBreakIteratorUtf16::LineBreakIteratorUtf16(Span<const char16_t> aText, | LineBreakIteratorUtf16::LineBreakIteratorUtf16(Span<const char16_t> aText, | ||||||
|                                                const LineBreakOptions& aOptions) |                                                const LineBreakOptions& aOptions) | ||||||
|     : SegmentIteratorUtf16(aText), mOptions(aOptions) {} |     : SegmentIteratorUtf16(aText), mOptions(aOptions) { | ||||||
|  | #if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) | ||||||
|  |   if (!StaticPrefs::intl_icu4x_segmenter_enabled()) { | ||||||
|  |     return; | ||||||
|  |   } | ||||||
|  |   auto result = | ||||||
|  |       capi::ICU4XLineSegmenter_create_auto(mozilla::intl::GetDataProvider()); | ||||||
|  |   MOZ_RELEASE_ASSERT(result.is_ok); | ||||||
|  |   mSegmenter = result.ok; | ||||||
|  |   mIterator = capi::ICU4XLineSegmenter_segment_utf16( | ||||||
|  |       mSegmenter, (const uint16_t*)mText.Elements(), mText.Length()); | ||||||
|  | #endif | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | LineBreakIteratorUtf16::~LineBreakIteratorUtf16() { | ||||||
|  | #if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) | ||||||
|  |   if (mIterator) { | ||||||
|  |     capi::ICU4XLineBreakIteratorUtf16_destroy(mIterator); | ||||||
|  |   } | ||||||
|  |   if (mSegmenter) { | ||||||
|  |     capi::ICU4XLineSegmenter_destroy(mSegmenter); | ||||||
|  |   } | ||||||
|  | #endif | ||||||
|  | } | ||||||
| 
 | 
 | ||||||
| Maybe<uint32_t> LineBreakIteratorUtf16::Next() { | Maybe<uint32_t> LineBreakIteratorUtf16::Next() { | ||||||
|  | #if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) | ||||||
|  |   if (mIterator) { | ||||||
|  |     const int32_t nextPos = capi::ICU4XLineBreakIteratorUtf16_next(mIterator); | ||||||
|  |     if (nextPos < 0) { | ||||||
|  |       return Nothing(); | ||||||
|  |     } | ||||||
|  |     if (!nextPos) { | ||||||
|  |       return Next(); | ||||||
|  |     } | ||||||
|  |     mPos = nextPos; | ||||||
|  |     return Some(mPos); | ||||||
|  |   } | ||||||
|  | #endif | ||||||
|   const int32_t nextPos = |   const int32_t nextPos = | ||||||
|       LineBreaker::Next(mText.Elements(), mText.Length(), mPos); |       LineBreaker::Next(mText.Elements(), mText.Length(), mPos); | ||||||
|   if (nextPos == NS_LINEBREAKER_NEED_MORE_TEXT) { |   if (nextPos == NS_LINEBREAKER_NEED_MORE_TEXT) { | ||||||
|  | @ -42,10 +88,71 @@ Maybe<uint32_t> LineBreakIteratorUtf16::Next() { | ||||||
|   return Some(mPos); |   return Some(mPos); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | Maybe<uint32_t> LineBreakIteratorUtf16::Seek(uint32_t aPos) { | ||||||
|  | #if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) | ||||||
|  |   if (mIterator) { | ||||||
|  |     if (mPos >= aPos) { | ||||||
|  |       return Next(); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     while (mPos < aPos) { | ||||||
|  |       const int32_t nextPos = capi::ICU4XLineBreakIteratorUtf16_next(mIterator); | ||||||
|  |       if (nextPos < 0) { | ||||||
|  |         return Nothing(); | ||||||
|  |       } | ||||||
|  |       mPos = static_cast<uint32_t>(nextPos); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     if (aPos < mPos) { | ||||||
|  |       return Some(mPos); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     return Next(); | ||||||
|  |   } | ||||||
|  | #endif | ||||||
|  |   return SegmentIteratorUtf16::Seek(aPos); | ||||||
|  | } | ||||||
|  | 
 | ||||||
| WordBreakIteratorUtf16::WordBreakIteratorUtf16(Span<const char16_t> aText) | WordBreakIteratorUtf16::WordBreakIteratorUtf16(Span<const char16_t> aText) | ||||||
|     : SegmentIteratorUtf16(aText) {} |     : SegmentIteratorUtf16(aText) { | ||||||
|  | #if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) | ||||||
|  |   if (!StaticPrefs::intl_icu4x_segmenter_enabled()) { | ||||||
|  |     return; | ||||||
|  |   } | ||||||
|  |   auto result = | ||||||
|  |       capi::ICU4XWordSegmenter_create_auto(mozilla::intl::GetDataProvider()); | ||||||
|  |   MOZ_RELEASE_ASSERT(result.is_ok); | ||||||
|  |   mSegmenter = result.ok; | ||||||
|  |   mIterator = capi::ICU4XWordSegmenter_segment_utf16( | ||||||
|  |       mSegmenter, (const uint16_t*)mText.Elements(), mText.Length()); | ||||||
|  | #endif | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | WordBreakIteratorUtf16::~WordBreakIteratorUtf16() { | ||||||
|  | #if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) | ||||||
|  |   if (mIterator) { | ||||||
|  |     capi::ICU4XWordBreakIteratorUtf16_destroy(mIterator); | ||||||
|  |   } | ||||||
|  |   if (mSegmenter) { | ||||||
|  |     capi::ICU4XWordSegmenter_destroy(mSegmenter); | ||||||
|  |   } | ||||||
|  | #endif | ||||||
|  | } | ||||||
| 
 | 
 | ||||||
| Maybe<uint32_t> WordBreakIteratorUtf16::Next() { | Maybe<uint32_t> WordBreakIteratorUtf16::Next() { | ||||||
|  | #if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) | ||||||
|  |   if (mIterator) { | ||||||
|  |     const int32_t nextPos = capi::ICU4XWordBreakIteratorUtf16_next(mIterator); | ||||||
|  |     if (nextPos < 0) { | ||||||
|  |       return Nothing(); | ||||||
|  |     } | ||||||
|  |     if (!nextPos) { | ||||||
|  |       return Next(); | ||||||
|  |     } | ||||||
|  |     mPos = nextPos; | ||||||
|  |     return Some(mPos); | ||||||
|  |   } | ||||||
|  | #endif | ||||||
|   const int32_t nextPos = |   const int32_t nextPos = | ||||||
|       WordBreaker::Next(mText.Elements(), mText.Length(), mPos); |       WordBreaker::Next(mText.Elements(), mText.Length(), mPos); | ||||||
|   if (nextPos == NS_WORDBREAKER_NEED_MORE_TEXT) { |   if (nextPos == NS_WORDBREAKER_NEED_MORE_TEXT) { | ||||||
|  | @ -55,9 +162,57 @@ Maybe<uint32_t> WordBreakIteratorUtf16::Next() { | ||||||
|   return Some(mPos); |   return Some(mPos); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | Maybe<uint32_t> WordBreakIteratorUtf16::Seek(uint32_t aPos) { | ||||||
|  | #if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) | ||||||
|  |   if (mIterator) { | ||||||
|  |     if (mPos >= aPos) { | ||||||
|  |       return Next(); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     while (mPos < aPos) { | ||||||
|  |       const int32_t nextPos = capi::ICU4XWordBreakIteratorUtf16_next(mIterator); | ||||||
|  |       if (nextPos < 0) { | ||||||
|  |         return Nothing(); | ||||||
|  |       } | ||||||
|  |       mPos = static_cast<uint32_t>(nextPos); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     if (aPos < mPos) { | ||||||
|  |       return Some(mPos); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     return Next(); | ||||||
|  |   } | ||||||
|  | #endif | ||||||
|  |   return SegmentIteratorUtf16::Seek(aPos); | ||||||
|  | } | ||||||
|  | 
 | ||||||
| GraphemeClusterBreakIteratorUtf16::GraphemeClusterBreakIteratorUtf16( | GraphemeClusterBreakIteratorUtf16::GraphemeClusterBreakIteratorUtf16( | ||||||
|     Span<const char16_t> aText) |     Span<const char16_t> aText) | ||||||
|     : SegmentIteratorUtf16(aText) {} |     : SegmentIteratorUtf16(aText) { | ||||||
|  | #if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) | ||||||
|  |   if (!StaticPrefs::intl_icu4x_segmenter_enabled()) { | ||||||
|  |     return; | ||||||
|  |   } | ||||||
|  |   auto result = capi::ICU4XGraphemeClusterSegmenter_create( | ||||||
|  |       mozilla::intl::GetDataProvider()); | ||||||
|  |   MOZ_RELEASE_ASSERT(result.is_ok); | ||||||
|  |   mSegmenter = result.ok; | ||||||
|  |   mIterator = capi::ICU4XGraphemeClusterSegmenter_segment_utf16( | ||||||
|  |       mSegmenter, (const uint16_t*)mText.Elements(), mText.Length()); | ||||||
|  | #endif | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | GraphemeClusterBreakIteratorUtf16::~GraphemeClusterBreakIteratorUtf16() { | ||||||
|  | #if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) | ||||||
|  |   if (mIterator) { | ||||||
|  |     capi::ICU4XGraphemeClusterBreakIteratorUtf16_destroy(mIterator); | ||||||
|  |   } | ||||||
|  |   if (mSegmenter) { | ||||||
|  |     capi::ICU4XGraphemeClusterSegmenter_destroy(mSegmenter); | ||||||
|  |   } | ||||||
|  | #endif | ||||||
|  | } | ||||||
| 
 | 
 | ||||||
| enum HSType { | enum HSType { | ||||||
|   HST_NONE = U_HST_NOT_APPLICABLE, |   HST_NONE = U_HST_NOT_APPLICABLE, | ||||||
|  | @ -75,6 +230,20 @@ static HSType GetHangulSyllableType(uint32_t aCh) { | ||||||
| 
 | 
 | ||||||
| Maybe<uint32_t> GraphemeClusterBreakIteratorUtf16::Next() { | Maybe<uint32_t> GraphemeClusterBreakIteratorUtf16::Next() { | ||||||
|   const auto len = mText.Length(); |   const auto len = mText.Length(); | ||||||
|  | #if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) | ||||||
|  |   if (mIterator) { | ||||||
|  |     const int32_t nextPos = | ||||||
|  |         capi::ICU4XGraphemeClusterBreakIteratorUtf16_next(mIterator); | ||||||
|  |     if (nextPos < 0) { | ||||||
|  |       return Nothing(); | ||||||
|  |     } | ||||||
|  |     if (!nextPos) { | ||||||
|  |       return Next(); | ||||||
|  |     } | ||||||
|  |     mPos = nextPos; | ||||||
|  |     return Some(mPos); | ||||||
|  |   } | ||||||
|  | #endif | ||||||
|   if (mPos >= len) { |   if (mPos >= len) { | ||||||
|     // The iterator has already reached the end.
 |     // The iterator has already reached the end.
 | ||||||
|     return Nothing(); |     return Nothing(); | ||||||
|  | @ -195,6 +364,32 @@ Maybe<uint32_t> GraphemeClusterBreakIteratorUtf16::Next() { | ||||||
|   return Some(mPos); |   return Some(mPos); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | Maybe<uint32_t> GraphemeClusterBreakIteratorUtf16::Seek(uint32_t aPos) { | ||||||
|  | #if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) | ||||||
|  |   if (mIterator) { | ||||||
|  |     if (mPos >= aPos) { | ||||||
|  |       return Next(); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     while (mPos < aPos) { | ||||||
|  |       const int32_t nextPos = | ||||||
|  |           capi::ICU4XGraphemeClusterBreakIteratorUtf16_next(mIterator); | ||||||
|  |       if (nextPos < 0) { | ||||||
|  |         return Nothing(); | ||||||
|  |       } | ||||||
|  |       mPos = static_cast<uint32_t>(nextPos); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     if (aPos < mPos) { | ||||||
|  |       return Some(mPos); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     return Next(); | ||||||
|  |   } | ||||||
|  | #endif | ||||||
|  |   return SegmentIteratorUtf16::Seek(aPos); | ||||||
|  | } | ||||||
|  | 
 | ||||||
| GraphemeClusterBreakReverseIteratorUtf16:: | GraphemeClusterBreakReverseIteratorUtf16:: | ||||||
|     GraphemeClusterBreakReverseIteratorUtf16(Span<const char16_t> aText) |     GraphemeClusterBreakReverseIteratorUtf16(Span<const char16_t> aText) | ||||||
|     : SegmentIteratorUtf16(aText) { |     : SegmentIteratorUtf16(aText) { | ||||||
|  | @ -231,12 +426,77 @@ Maybe<uint32_t> GraphemeClusterBreakReverseIteratorUtf16::Seek(uint32_t aPos) { | ||||||
|   return Next(); |   return Next(); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | #if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) | ||||||
|  | SentenceBreakIteratorUtf16::SentenceBreakIteratorUtf16( | ||||||
|  |     Span<const char16_t> aText) | ||||||
|  |     : SegmentIteratorUtf16(aText) { | ||||||
|  |   auto result = | ||||||
|  |       capi::ICU4XSentenceSegmenter_create(mozilla::intl::GetDataProvider()); | ||||||
|  |   MOZ_RELEASE_ASSERT(result.is_ok); | ||||||
|  |   mSegmenter = result.ok; | ||||||
|  |   mIterator = capi::ICU4XSentenceSegmenter_segment_utf16( | ||||||
|  |       mSegmenter, (const uint16_t*)mText.Elements(), mText.Length()); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | SentenceBreakIteratorUtf16::~SentenceBreakIteratorUtf16() { | ||||||
|  |   if (mIterator) { | ||||||
|  |     capi::ICU4XSentenceBreakIteratorUtf16_destroy(mIterator); | ||||||
|  |   } | ||||||
|  |   if (mSegmenter) { | ||||||
|  |     capi::ICU4XSentenceSegmenter_destroy(mSegmenter); | ||||||
|  |   } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | Maybe<uint32_t> SentenceBreakIteratorUtf16::Seek(uint32_t aPos) { | ||||||
|  |   if (!mIterator) { | ||||||
|  |     return Nothing(); | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   if (mPos >= aPos) { | ||||||
|  |     return Next(); | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   while (mPos < aPos) { | ||||||
|  |     const int32_t nextPos = | ||||||
|  |         capi::ICU4XSentenceBreakIteratorUtf16_next(mIterator); | ||||||
|  |     if (nextPos < 0) { | ||||||
|  |       return Nothing(); | ||||||
|  |     } | ||||||
|  |     mPos = static_cast<uint32_t>(nextPos); | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   if (aPos < mPos) { | ||||||
|  |     return Some(mPos); | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   return Next(); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | Maybe<uint32_t> SentenceBreakIteratorUtf16::Next() { | ||||||
|  |   if (!mIterator) { | ||||||
|  |     return Nothing(); | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   const int32_t nextPos = capi::ICU4XSentenceBreakIteratorUtf16_next(mIterator); | ||||||
|  |   if (nextPos < 0) { | ||||||
|  |     return Nothing(); | ||||||
|  |   } | ||||||
|  |   if (!nextPos) { | ||||||
|  |     return Next(); | ||||||
|  |   } | ||||||
|  |   mPos = nextPos; | ||||||
|  |   return Some(mPos); | ||||||
|  | } | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
| Result<UniquePtr<Segmenter>, ICUError> Segmenter::TryCreate( | Result<UniquePtr<Segmenter>, ICUError> Segmenter::TryCreate( | ||||||
|     Span<const char> aLocale, const SegmenterOptions& aOptions) { |     Span<const char> aLocale, const SegmenterOptions& aOptions) { | ||||||
|  | #if !defined(MOZ_ICU4X) || !defined(JS_HAS_INTL_API) | ||||||
|   if (aOptions.mGranularity == SegmenterGranularity::Sentence) { |   if (aOptions.mGranularity == SegmenterGranularity::Sentence) { | ||||||
|     // Grapheme and Sentence iterator are not yet implemented.
 |     // Grapheme and Sentence iterator are not yet implemented.
 | ||||||
|     return Err(ICUError::InternalError); |     return Err(ICUError::InternalError); | ||||||
|   } |   } | ||||||
|  | #endif | ||||||
|   return MakeUnique<Segmenter>(aLocale, aOptions); |   return MakeUnique<Segmenter>(aLocale, aOptions); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | @ -246,6 +506,11 @@ UniquePtr<SegmentIteratorUtf16> Segmenter::Segment( | ||||||
|     case SegmenterGranularity::Grapheme: |     case SegmenterGranularity::Grapheme: | ||||||
|       return MakeUnique<GraphemeClusterBreakIteratorUtf16>(aText); |       return MakeUnique<GraphemeClusterBreakIteratorUtf16>(aText); | ||||||
|     case SegmenterGranularity::Sentence: |     case SegmenterGranularity::Sentence: | ||||||
|  | #if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) | ||||||
|  |       if (StaticPrefs::intl_icu4x_segmenter_enabled()) { | ||||||
|  |         return MakeUnique<SentenceBreakIteratorUtf16>(aText); | ||||||
|  |       } | ||||||
|  | #endif | ||||||
|       MOZ_ASSERT_UNREACHABLE("Unimplemented yet!"); |       MOZ_ASSERT_UNREACHABLE("Unimplemented yet!"); | ||||||
|       return nullptr; |       return nullptr; | ||||||
|     case SegmenterGranularity::Word: |     case SegmenterGranularity::Word: | ||||||
|  |  | ||||||
|  | @ -15,6 +15,19 @@ | ||||||
| #include "mozilla/Span.h" | #include "mozilla/Span.h" | ||||||
| #include "mozilla/UniquePtr.h" | #include "mozilla/UniquePtr.h" | ||||||
| 
 | 
 | ||||||
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
// Forward declarations of the ICU4X C API types that the iterator classes in
// this header hold by pointer.
namespace capi {
// Segmenters.
struct ICU4XLineSegmenter;
struct ICU4XWordSegmenter;
struct ICU4XGraphemeClusterSegmenter;
struct ICU4XSentenceSegmenter;

// UTF-16 break iterators.
struct ICU4XLineBreakIteratorUtf16;
struct ICU4XWordBreakIteratorUtf16;
struct ICU4XGraphemeClusterBreakIteratorUtf16;
struct ICU4XSentenceBreakIteratorUtf16;
}  // namespace capi
#endif
|  | 
 | ||||||
| namespace mozilla::intl { | namespace mozilla::intl { | ||||||
| 
 | 
 | ||||||
| enum class SegmenterGranularity : uint8_t { | enum class SegmenterGranularity : uint8_t { | ||||||
|  | @ -104,11 +117,18 @@ class LineBreakIteratorUtf16 final : public SegmentIteratorUtf16 { | ||||||
|  public: |  public: | ||||||
|   explicit LineBreakIteratorUtf16(Span<const char16_t> aText, |   explicit LineBreakIteratorUtf16(Span<const char16_t> aText, | ||||||
|                                   const LineBreakOptions& aOptions = {}); |                                   const LineBreakOptions& aOptions = {}); | ||||||
|  |   ~LineBreakIteratorUtf16() override; | ||||||
| 
 | 
 | ||||||
|   Maybe<uint32_t> Next() override; |   Maybe<uint32_t> Next() override; | ||||||
|  |   Maybe<uint32_t> Seek(uint32_t aPos) override; | ||||||
| 
 | 
 | ||||||
|  private: |  private: | ||||||
|   LineBreakOptions mOptions; |   LineBreakOptions mOptions; | ||||||
|  | 
 | ||||||
|  | #ifdef MOZ_ICU4X | ||||||
|  |   capi::ICU4XLineSegmenter* mSegmenter = nullptr; | ||||||
|  |   capi::ICU4XLineBreakIteratorUtf16* mIterator = nullptr; | ||||||
|  | #endif | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| /**
 | /**
 | ||||||
|  | @ -117,8 +137,16 @@ class LineBreakIteratorUtf16 final : public SegmentIteratorUtf16 { | ||||||
| class WordBreakIteratorUtf16 final : public SegmentIteratorUtf16 { | class WordBreakIteratorUtf16 final : public SegmentIteratorUtf16 { | ||||||
|  public: |  public: | ||||||
|   explicit WordBreakIteratorUtf16(Span<const char16_t> aText); |   explicit WordBreakIteratorUtf16(Span<const char16_t> aText); | ||||||
|  |   ~WordBreakIteratorUtf16() override; | ||||||
| 
 | 
 | ||||||
|   Maybe<uint32_t> Next() override; |   Maybe<uint32_t> Next() override; | ||||||
|  |   Maybe<uint32_t> Seek(uint32_t aPos) override; | ||||||
|  | 
 | ||||||
|  | #if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) | ||||||
|  |  private: | ||||||
|  |   capi::ICU4XWordSegmenter* mSegmenter = nullptr; | ||||||
|  |   capi::ICU4XWordBreakIteratorUtf16* mIterator = nullptr; | ||||||
|  | #endif | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| /**
 | /**
 | ||||||
|  | @ -127,8 +155,16 @@ class WordBreakIteratorUtf16 final : public SegmentIteratorUtf16 { | ||||||
| class GraphemeClusterBreakIteratorUtf16 final : public SegmentIteratorUtf16 { | class GraphemeClusterBreakIteratorUtf16 final : public SegmentIteratorUtf16 { | ||||||
|  public: |  public: | ||||||
|   explicit GraphemeClusterBreakIteratorUtf16(Span<const char16_t> aText); |   explicit GraphemeClusterBreakIteratorUtf16(Span<const char16_t> aText); | ||||||
|  |   ~GraphemeClusterBreakIteratorUtf16() override; | ||||||
| 
 | 
 | ||||||
|   Maybe<uint32_t> Next() override; |   Maybe<uint32_t> Next() override; | ||||||
|  |   Maybe<uint32_t> Seek(uint32_t aPos) override; | ||||||
|  | 
 | ||||||
|  | #if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) | ||||||
|  |  private: | ||||||
|  |   capi::ICU4XGraphemeClusterSegmenter* mSegmenter = nullptr; | ||||||
|  |   capi::ICU4XGraphemeClusterBreakIteratorUtf16* mIterator = nullptr; | ||||||
|  | #endif | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| /**
 | /**
 | ||||||
|  | @ -146,6 +182,24 @@ class GraphemeClusterBreakReverseIteratorUtf16 final | ||||||
|   Maybe<uint32_t> Seek(uint32_t aPos) override; |   Maybe<uint32_t> Seek(uint32_t aPos) override; | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
/**
 * Sentence break iterator for UTF-16 text.
 *
 * Only declared when ICU4X and the JS Intl API are both available; it is
 * backed by an ICU4X sentence segmenter and its UTF-16 break iterator.
 */
class SentenceBreakIteratorUtf16 final : public SegmentIteratorUtf16 {
 public:
  explicit SentenceBreakIteratorUtf16(Span<const char16_t> aText);
  ~SentenceBreakIteratorUtf16() override;

  Maybe<uint32_t> Seek(uint32_t aPos) override;
  Maybe<uint32_t> Next() override;

 private:
  // Created in the constructor and released in the destructor.
  capi::ICU4XSentenceSegmenter* mSegmenter{nullptr};
  capi::ICU4XSentenceBreakIteratorUtf16* mIterator{nullptr};
};
#endif
|  | 
 | ||||||
| /**
 | /**
 | ||||||
|  * This component is a Mozilla-focused API for working with segmenters in |  * This component is a Mozilla-focused API for working with segmenters in | ||||||
|  * internationalization code. |  * internationalization code. | ||||||
|  |  | ||||||
|  | @ -10,6 +10,14 @@ | ||||||
| #include "nsTArray.h" | #include "nsTArray.h" | ||||||
| #include "nsUnicodeProperties.h" | #include "nsUnicodeProperties.h" | ||||||
| 
 | 
 | ||||||
|  | #if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) | ||||||
|  | #  include "ICU4XDataProvider.h" | ||||||
|  | #  include "ICU4XWordBreakIteratorUtf16.hpp" | ||||||
|  | #  include "ICU4XWordSegmenter.hpp" | ||||||
|  | #  include "mozilla/intl/ICU4XGeckoDataProvider.h" | ||||||
|  | #  include "mozilla/StaticPrefs_intl.h" | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
| using mozilla::intl::Script; | using mozilla::intl::Script; | ||||||
| using mozilla::intl::UnicodeProperties; | using mozilla::intl::UnicodeProperties; | ||||||
| using mozilla::intl::WordBreaker; | using mozilla::intl::WordBreaker; | ||||||
|  | @ -102,9 +110,34 @@ WordRange WordBreaker::FindWord(const char16_t* aText, uint32_t aLen, | ||||||
|     return {aLen, aLen}; |     return {aLen, aLen}; | ||||||
|   } |   } | ||||||
| 
 | 
 | ||||||
|   WordBreakClass c = GetClass(aText[aPos]); |  | ||||||
|   WordRange range{0, aLen}; |   WordRange range{0, aLen}; | ||||||
| 
 | 
 | ||||||
|  | #if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) | ||||||
|  |   if (StaticPrefs::intl_icu4x_segmenter_enabled()) { | ||||||
|  |     auto result = | ||||||
|  |         capi::ICU4XWordSegmenter_create_auto(mozilla::intl::GetDataProvider()); | ||||||
|  |     MOZ_ASSERT(result.is_ok); | ||||||
|  |     ICU4XWordSegmenter segmenter(result.ok); | ||||||
|  |     ICU4XWordBreakIteratorUtf16 iterator = | ||||||
|  |         segmenter.segment_utf16(diplomat::span((const uint16_t*)aText, aLen)); | ||||||
|  | 
 | ||||||
|  |     uint32_t previousPos = 0; | ||||||
|  |     while (true) { | ||||||
|  |       const int32_t nextPos = iterator.next(); | ||||||
|  |       if (nextPos < 0) { | ||||||
|  |         return {previousPos, aLen}; | ||||||
|  |       } | ||||||
|  |       if ((uint32_t)nextPos > aPos) { | ||||||
|  |         return {previousPos, (uint32_t)nextPos}; | ||||||
|  |       } | ||||||
|  | 
 | ||||||
|  |       previousPos = nextPos; | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
|  |   WordBreakClass c = GetClass(aText[aPos]); | ||||||
|  | 
 | ||||||
|   // Scan forward
 |   // Scan forward
 | ||||||
|   for (uint32_t i = aPos + 1; i <= aLen; i++) { |   for (uint32_t i = aPos + 1; i <= aLen; i++) { | ||||||
|     if (c != GetClass(aText[i])) { |     if (c != GetClass(aText[i])) { | ||||||
|  |  | ||||||
|  | @ -7,11 +7,15 @@ | ||||||
| #include "gtest/gtest.h" | #include "gtest/gtest.h" | ||||||
| 
 | 
 | ||||||
| #include "mozilla/intl/Segmenter.h" | #include "mozilla/intl/Segmenter.h" | ||||||
|  | #include "mozilla/Preferences.h" | ||||||
| 
 | 
 | ||||||
| namespace mozilla::intl { | namespace mozilla::intl { | ||||||
| 
 | 
 | ||||||
| TEST(IntlSegmenter, TestLineBreakIteratorUtf16) | TEST(IntlSegmenter, TestLineBreakIteratorUtf16SeekOld) | ||||||
| { | { | ||||||
|  |   nsresult rv = Preferences::SetBool("intl.icu4x.segmenter.enabled", false); | ||||||
|  |   EXPECT_TRUE(rv == NS_OK); | ||||||
|  | 
 | ||||||
|   const SegmenterOptions options{SegmenterGranularity::Line}; |   const SegmenterOptions options{SegmenterGranularity::Line}; | ||||||
|   auto result = Segmenter::TryCreate("en", options); |   auto result = Segmenter::TryCreate("en", options); | ||||||
|   ASSERT_TRUE(result.isOk()); |   ASSERT_TRUE(result.isOk()); | ||||||
|  | @ -30,7 +34,50 @@ TEST(IntlSegmenter, TestLineBreakIteratorUtf16) | ||||||
|   ASSERT_EQ(segIter->Seek(0u), Nothing()); |   ASSERT_EQ(segIter->Seek(0u), Nothing()); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| TEST(IntlSegmenter, TestWordBreakIteratorUtf16) | TEST(IntlSegmenter, TestLineBreakIteratorUtf16Seek) | ||||||
|  | { | ||||||
|  |   nsresult rv = Preferences::SetBool("intl.icu4x.segmenter.enabled", true); | ||||||
|  |   EXPECT_TRUE(rv == NS_OK); | ||||||
|  | 
 | ||||||
|  |   const SegmenterOptions options{SegmenterGranularity::Line}; | ||||||
|  |   auto result = Segmenter::TryCreate("en", options); | ||||||
|  |   ASSERT_TRUE(result.isOk()); | ||||||
|  |   auto lineSegmenter = result.unwrap(); | ||||||
|  | 
 | ||||||
|  |   const char16_t text[] = u"hello world"; | ||||||
|  |   UniquePtr<SegmentIteratorUtf16> segIter = | ||||||
|  |       lineSegmenter->Segment(MakeStringSpan(text)); | ||||||
|  | 
 | ||||||
|  |   // Seek to space between "hello" and "world".
 | ||||||
|  |   // UAX#14 rule returns before "w".
 | ||||||
|  |   ASSERT_EQ(segIter->Seek(5u), Some(6u)); | ||||||
|  | 
 | ||||||
|  |   ASSERT_EQ(segIter->Next(), Some(11u)); | ||||||
|  | 
 | ||||||
|  |   ASSERT_EQ(segIter->Next(), Nothing()); | ||||||
|  | 
 | ||||||
|  |   // Same as calling Next().
 | ||||||
|  |   ASSERT_EQ(segIter->Seek(0u), Nothing()); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | TEST(IntlSegmenter, TestWordBreakIteratorUtf16Simple) | ||||||
|  | { | ||||||
|  |   const SegmenterOptions options{SegmenterGranularity::Word}; | ||||||
|  |   auto result = Segmenter::TryCreate("en", options); | ||||||
|  |   ASSERT_TRUE(result.isOk()); | ||||||
|  |   auto wordSegmenter = result.unwrap(); | ||||||
|  | 
 | ||||||
|  |   const char16_t text[] = u"hello world"; | ||||||
|  |   UniquePtr<SegmentIteratorUtf16> segIter = | ||||||
|  |       wordSegmenter->Segment(MakeStringSpan(text)); | ||||||
|  | 
 | ||||||
|  |   ASSERT_EQ(segIter->Next(), Some(5u)); | ||||||
|  |   ASSERT_EQ(segIter->Next(), Some(6u)); | ||||||
|  |   ASSERT_EQ(segIter->Next(), Some(11u)); | ||||||
|  |   ASSERT_EQ(segIter->Next(), Nothing()); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | TEST(IntlSegmenter, TestWordBreakIteratorUtf16Seek) | ||||||
| { | { | ||||||
|   const SegmenterOptions options{SegmenterGranularity::Word}; |   const SegmenterOptions options{SegmenterGranularity::Word}; | ||||||
|   auto result = Segmenter::TryCreate("en", options); |   auto result = Segmenter::TryCreate("en", options); | ||||||
|  | @ -51,7 +98,32 @@ TEST(IntlSegmenter, TestWordBreakIteratorUtf16) | ||||||
|   ASSERT_EQ(segIter->Seek(0u), Nothing()); |   ASSERT_EQ(segIter->Seek(0u), Nothing()); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| TEST(IntlSegmenter, TestGraphemeClusterBreakIteratorUtf16) | TEST(IntlSegmenter, TestGraphemeClusterBreakIteratorUtf16Simple) | ||||||
|  | { | ||||||
|  |   SegmenterOptions options{SegmenterGranularity::Grapheme}; | ||||||
|  |   auto result = Segmenter::TryCreate("en", options); | ||||||
|  |   ASSERT_TRUE(result.isOk()); | ||||||
|  |   auto graphemeClusterSegmenter = result.unwrap(); | ||||||
|  | 
 | ||||||
|  |   const char16_t text[] = u"hello world"; | ||||||
|  |   UniquePtr<SegmentIteratorUtf16> segIter = | ||||||
|  |       graphemeClusterSegmenter->Segment(MakeStringSpan(text)); | ||||||
|  | 
 | ||||||
|  |   ASSERT_EQ(segIter->Next(), Some(1u)); | ||||||
|  |   ASSERT_EQ(segIter->Next(), Some(2u)); | ||||||
|  |   ASSERT_EQ(segIter->Next(), Some(3u)); | ||||||
|  |   ASSERT_EQ(segIter->Next(), Some(4u)); | ||||||
|  |   ASSERT_EQ(segIter->Next(), Some(5u)); | ||||||
|  |   ASSERT_EQ(segIter->Next(), Some(6u)); | ||||||
|  |   ASSERT_EQ(segIter->Next(), Some(7u)); | ||||||
|  |   ASSERT_EQ(segIter->Next(), Some(8u)); | ||||||
|  |   ASSERT_EQ(segIter->Next(), Some(9u)); | ||||||
|  |   ASSERT_EQ(segIter->Next(), Some(10u)); | ||||||
|  |   ASSERT_EQ(segIter->Next(), Some(11u)); | ||||||
|  |   ASSERT_EQ(segIter->Next(), Nothing()); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | TEST(IntlSegmenter, TestGraphemeClusterBreakIteratorUtf16Seek) | ||||||
| { | { | ||||||
|   SegmenterOptions options{SegmenterGranularity::Grapheme}; |   SegmenterOptions options{SegmenterGranularity::Grapheme}; | ||||||
|   auto result = Segmenter::TryCreate("en", options); |   auto result = Segmenter::TryCreate("en", options); | ||||||
|  | @ -97,9 +169,41 @@ TEST(IntlSegmenter, TestGraphemeClusterBreakReverseIteratorUtf16) | ||||||
| 
 | 
 | ||||||
| TEST(IntlSegmenter, TestSentenceBreakIteratorUtf16) | TEST(IntlSegmenter, TestSentenceBreakIteratorUtf16) | ||||||
| { | { | ||||||
|  |   nsresult rv = Preferences::SetBool("intl.icu4x.segmenter.enabled", true); | ||||||
|  |   EXPECT_TRUE(rv == NS_OK); | ||||||
|  | 
 | ||||||
|   SegmenterOptions options{SegmenterGranularity::Sentence}; |   SegmenterOptions options{SegmenterGranularity::Sentence}; | ||||||
|   auto result = Segmenter::TryCreate("en", options); |   auto result = Segmenter::TryCreate("en", options); | ||||||
|   ASSERT_TRUE(result.isErr()); |   ASSERT_TRUE(result.isOk()); | ||||||
|  |   auto sentenceSegmenter = result.unwrap(); | ||||||
|  | 
 | ||||||
|  |   const char16_t text[] = u"Hello world. Hello world."; | ||||||
|  |   UniquePtr<SegmentIteratorUtf16> segIter = | ||||||
|  |       sentenceSegmenter->Segment(MakeStringSpan(text)); | ||||||
|  | 
 | ||||||
|  |   ASSERT_EQ(segIter->Next(), Some(13u)); | ||||||
|  |   ASSERT_EQ(segIter->Next(), Some(25u)); | ||||||
|  |   ASSERT_EQ(segIter->Next(), Nothing()); | ||||||
|  | 
 | ||||||
|  |   // Same as calling Next().
 | ||||||
|  |   ASSERT_EQ(segIter->Seek(0u), Nothing()); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | TEST(IntlSegmenter, TestSentenceBreakIteratorUtf16Seek) | ||||||
|  | { | ||||||
|  |   nsresult rv = Preferences::SetBool("intl.icu4x.segmenter.enabled", true); | ||||||
|  |   EXPECT_TRUE(rv == NS_OK); | ||||||
|  | 
 | ||||||
|  |   SegmenterOptions options{SegmenterGranularity::Sentence}; | ||||||
|  |   auto result = Segmenter::TryCreate("en", options); | ||||||
|  |   ASSERT_TRUE(result.isOk()); | ||||||
|  |   auto sentenceSegmenter = result.unwrap(); | ||||||
|  | 
 | ||||||
|  |   const char16_t text[] = u"Hello world. Hello world."; | ||||||
|  |   UniquePtr<SegmentIteratorUtf16> segIter = | ||||||
|  |       sentenceSegmenter->Segment(MakeStringSpan(text)); | ||||||
|  | 
 | ||||||
|  |   ASSERT_EQ(segIter->Seek(5u), Some(13u)); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| }  // namespace mozilla::intl
 | }  // namespace mozilla::intl
 | ||||||
|  |  | ||||||
|  | @ -42,4 +42,16 @@ else: | ||||||
|         "rulebrk.c", |         "rulebrk.c", | ||||||
|     ] |     ] | ||||||
| 
 | 
 | ||||||
|  | if CONFIG["JS_HAS_INTL_API"] and CONFIG["MOZ_ICU4X"]: | ||||||
|  |     LOCAL_INCLUDES += [ | ||||||
|  |         "/third_party/rust/icu_capi/cpp/include", | ||||||
|  |     ] | ||||||
|  |     # Disable warnings when including C++ headers of ICU4X. | ||||||
|  |     # - https://github.com/rust-diplomat/diplomat/issues/277 | ||||||
|  |     # - https://github.com/rust-diplomat/diplomat/issues/335 | ||||||
|  |     CXXFLAGS += [ | ||||||
|  |         "-Wno-mismatched-tags", | ||||||
|  |         "-Wno-pessimizing-move", | ||||||
|  |     ] | ||||||
|  | 
 | ||||||
| FINAL_LIBRARY = "xul" | FINAL_LIBRARY = "xul" | ||||||
|  |  | ||||||
|  | @ -7304,6 +7304,12 @@ | ||||||
|   mirror: always |   mirror: always | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
|  | # If true, we use UAX14/29 compatible segmenter rules using ICU4X | ||||||
|  | - name: intl.icu4x.segmenter.enabled | ||||||
|  |   type: RelaxedAtomicBool | ||||||
|  |   value: false | ||||||
|  |   mirror: always | ||||||
|  | 
 | ||||||
| #--------------------------------------------------------------------------- | #--------------------------------------------------------------------------- | ||||||
| # Prefs starting with "javascript." | # Prefs starting with "javascript." | ||||||
| # | # | ||||||
|  |  | ||||||
		Loading…
	
		Reference in a new issue
	
	 Makoto Kato
						Makoto Kato