forked from mirrors/gecko-dev
Bug 1719535 - Part 5. Add ICU4X based segmenter modules. r=TYLin,jfkthame
Depends on D167673 Differential Revision: https://phabricator.services.mozilla.com/D167675
This commit is contained in:
parent
80aa728f71
commit
d8a2fdf7da
7 changed files with 655 additions and 8 deletions
|
|
@ -13,6 +13,20 @@
|
||||||
#include "mozilla/intl/Segmenter.h"
|
#include "mozilla/intl/Segmenter.h"
|
||||||
#include "mozilla/intl/UnicodeProperties.h"
|
#include "mozilla/intl/UnicodeProperties.h"
|
||||||
|
|
||||||
|
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
|
||||||
|
# include "ICU4XDataProvider.h"
|
||||||
|
# include "ICU4XLineBreakIteratorLatin1.hpp"
|
||||||
|
# include "ICU4XLineBreakIteratorUtf16.hpp"
|
||||||
|
# include "ICU4XLineSegmenter.h"
|
||||||
|
# include "mozilla/CheckedInt.h"
|
||||||
|
# include "mozilla/ClearOnShutdown.h"
|
||||||
|
# include "mozilla/intl/ICU4XGeckoDataProvider.h"
|
||||||
|
# include "mozilla/StaticPrefs_intl.h"
|
||||||
|
# include "nsThreadUtils.h"
|
||||||
|
|
||||||
|
# include <mutex>
|
||||||
|
#endif
|
||||||
|
|
||||||
using namespace mozilla::unicode;
|
using namespace mozilla::unicode;
|
||||||
using namespace mozilla::intl;
|
using namespace mozilla::intl;
|
||||||
|
|
||||||
|
|
@ -978,9 +992,136 @@ static bool SuppressBreakForKeepAll(uint32_t aPrev, uint32_t aCh) {
|
||||||
affectedByKeepAll(GetLineBreakClass(aCh));
|
affectedByKeepAll(GetLineBreakClass(aCh));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
|
||||||
|
// Translates a Gecko LineBreakRule into the corresponding ICU4X strictness
// value. LineBreakRule::Auto is mapped to the same value as
// LineBreakRule::Strict.
static capi::ICU4XLineBreakStrictness ConvertLineBreakRuleToICU4X(
    LineBreakRule aLevel) {
  switch (aLevel) {
    case LineBreakRule::Auto:
    case LineBreakRule::Strict:
      return capi::ICU4XLineBreakStrictness_Strict;
    case LineBreakRule::Normal:
      return capi::ICU4XLineBreakStrictness_Normal;
    case LineBreakRule::Loose:
      return capi::ICU4XLineBreakStrictness_Loose;
    case LineBreakRule::Anywhere:
      return capi::ICU4XLineBreakStrictness_Anywhere;
  }
  // Reached only when aLevel holds a value that is not one of the cases
  // listed above.
  MOZ_ASSERT_UNREACHABLE("should have been handled already");
  return capi::ICU4XLineBreakStrictness_Normal;
}
|
||||||
|
|
||||||
|
// Translates a Gecko WordBreakRule into the corresponding ICU4X word option.
static capi::ICU4XLineBreakWordOption ConvertWordBreakRuleToICU4X(
    WordBreakRule aWordBreak) {
  switch (aWordBreak) {
    case WordBreakRule::KeepAll:
      return capi::ICU4XLineBreakWordOption_KeepAll;
    case WordBreakRule::BreakAll:
      return capi::ICU4XLineBreakWordOption_BreakAll;
    case WordBreakRule::Normal:
      return capi::ICU4XLineBreakWordOption_Normal;
  }
  // Reached only when aWordBreak holds a value that is not one of the cases
  // listed above.
  MOZ_ASSERT_UNREACHABLE("should have been handled already");
  return capi::ICU4XLineBreakWordOption_Normal;
}
|
||||||
|
|
||||||
|
// Shared line segmenter created on first use by GetDefaultLineSegmenter() and
// destroyed (and reset to nullptr) by the shutdown hook registered there.
static capi::ICU4XLineSegmenter* sLineSegmenter = nullptr;

// Returns the shared line segmenter built with
// ICU4XLineSegmenter_create_auto(), creating it on the first call.
//
// The returned pointer is owned by this file; callers must not destroy it.
static capi::ICU4XLineSegmenter* GetDefaultLineSegmenter() {
  static std::once_flag sOnce;

  std::call_once(sOnce, [] {
    auto result = capi::ICU4XLineSegmenter_create_auto(GetDataProvider());
    // result.ok is only meaningful when is_ok is set, so check it in release
    // builds as well (the Segmenter iterators do the same) rather than
    // storing an invalid pointer that every later caller would use.
    MOZ_RELEASE_ASSERT(result.is_ok);
    sLineSegmenter = result.ok;

    // Registers the hook that releases the shared segmenter at shutdown.
    // It is always invoked on the main thread: directly when we are already
    // there, otherwise through a runnable dispatched to it.
    auto registerShutdownCleanup = [] {
      mozilla::RunOnShutdown([] {
        if (sLineSegmenter) {
          capi::ICU4XLineSegmenter_destroy(sLineSegmenter);
        }
        sLineSegmenter = nullptr;
      });
    };

    if (NS_IsMainThread()) {
      registerShutdownCleanup();
      return;
    }
    NS_DispatchToMainThread(NS_NewRunnableFunction("GetDefaultLineSegmenter",
                                                   registerShutdownCleanup));
  });

  return sLineSegmenter;
}
|
||||||
|
|
||||||
|
// Reports whether the shared default line segmenter can serve a request:
// word-break must be Normal, the line-break level must be Strict or Auto, and
// the text must not be flagged as Chinese or Japanese.
static bool UseDefaultLineSegmenter(WordBreakRule aWordBreak,
                                    LineBreakRule aLevel,
                                    bool aIsChineseOrJapanese) {
  if (aWordBreak != WordBreakRule::Normal) {
    return false;
  }
  const bool strictOrAuto =
      aLevel == LineBreakRule::Strict || aLevel == LineBreakRule::Auto;
  return strictOrAuto && !aIsChineseOrJapanese;
}
|
||||||
|
|
||||||
|
// Returns the line segmenter to use for the given break rules.
//
// When aUseDefault is true the shared segmenter from
// GetDefaultLineSegmenter() is returned and the caller must not destroy it.
// Otherwise a new segmenter configured from the arguments is created on every
// call, and the caller is responsible for releasing it with
// capi::ICU4XLineSegmenter_destroy().
static capi::ICU4XLineSegmenter* GetLineSegmenter(bool aUseDefault,
                                                  WordBreakRule aWordBreak,
                                                  LineBreakRule aLevel,
                                                  bool aIsChineseOrJapanese) {
  if (aUseDefault) {
    MOZ_ASSERT(
        UseDefaultLineSegmenter(aWordBreak, aLevel, aIsChineseOrJapanese));
    return GetDefaultLineSegmenter();
  }

  // Value-initialize so no field is ever passed across the FFI boundary
  // uninitialized.
  capi::ICU4XLineBreakOptionsV1 options{};
  options.word_option = ConvertWordBreakRuleToICU4X(aWordBreak);
  options.strictness = ConvertLineBreakRuleToICU4X(aLevel);
  options.ja_zh = aIsChineseOrJapanese;

  auto result = capi::ICU4XLineSegmenter_create_lstm_with_options_v1(
      GetDataProvider(), options);
  // result.ok is only meaningful when is_ok is set; check in release builds
  // too (as the Segmenter iterators do) instead of handing an invalid pointer
  // back to the caller.
  MOZ_RELEASE_ASSERT(result.is_ok);
  return result.ok;
}
|
||||||
|
#endif
|
||||||
|
|
||||||
void LineBreaker::ComputeBreakPositions(
|
void LineBreaker::ComputeBreakPositions(
|
||||||
const char16_t* aChars, uint32_t aLength, WordBreakRule aWordBreak,
|
const char16_t* aChars, uint32_t aLength, WordBreakRule aWordBreak,
|
||||||
LineBreakRule aLevel, bool aIsChineseOrJapanese, uint8_t* aBreakBefore) {
|
LineBreakRule aLevel, bool aIsChineseOrJapanese, uint8_t* aBreakBefore) {
|
||||||
|
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
|
||||||
|
if (StaticPrefs::intl_icu4x_segmenter_enabled()) {
|
||||||
|
memset(aBreakBefore, 0, aLength);
|
||||||
|
|
||||||
|
CheckedInt<int32_t> length = aLength;
|
||||||
|
if (!length.isValid()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
const bool useDefault =
|
||||||
|
UseDefaultLineSegmenter(aWordBreak, aLevel, aIsChineseOrJapanese);
|
||||||
|
capi::ICU4XLineSegmenter* lineSegmenter =
|
||||||
|
GetLineSegmenter(useDefault, aWordBreak, aLevel, aIsChineseOrJapanese);
|
||||||
|
ICU4XLineBreakIteratorUtf16 iterator(capi::ICU4XLineSegmenter_segment_utf16(
|
||||||
|
lineSegmenter, (const uint16_t*)aChars, aLength));
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
const int32_t nextPos = iterator.next();
|
||||||
|
if (nextPos < 0 || nextPos >= length.value()) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
aBreakBefore[nextPos] = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!useDefault) {
|
||||||
|
capi::ICU4XLineSegmenter_destroy(lineSegmenter);
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
uint32_t cur;
|
uint32_t cur;
|
||||||
int8_t lastClass = CLASS_NONE;
|
int8_t lastClass = CLASS_NONE;
|
||||||
ContextState state(aChars, aLength);
|
ContextState state(aChars, aLength);
|
||||||
|
|
@ -1110,6 +1251,38 @@ void LineBreaker::ComputeBreakPositions(const uint8_t* aChars, uint32_t aLength,
|
||||||
LineBreakRule aLevel,
|
LineBreakRule aLevel,
|
||||||
bool aIsChineseOrJapanese,
|
bool aIsChineseOrJapanese,
|
||||||
uint8_t* aBreakBefore) {
|
uint8_t* aBreakBefore) {
|
||||||
|
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
|
||||||
|
if (StaticPrefs::intl_icu4x_segmenter_enabled()) {
|
||||||
|
memset(aBreakBefore, 0, aLength);
|
||||||
|
|
||||||
|
CheckedInt<int32_t> length = aLength;
|
||||||
|
if (!length.isValid()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
const bool useDefault =
|
||||||
|
UseDefaultLineSegmenter(aWordBreak, aLevel, aIsChineseOrJapanese);
|
||||||
|
capi::ICU4XLineSegmenter* lineSegmenter =
|
||||||
|
GetLineSegmenter(useDefault, aWordBreak, aLevel, aIsChineseOrJapanese);
|
||||||
|
ICU4XLineBreakIteratorLatin1 iterator(
|
||||||
|
capi::ICU4XLineSegmenter_segment_latin1(
|
||||||
|
lineSegmenter, (const uint8_t*)aChars, aLength));
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
const int32_t nextPos = iterator.next();
|
||||||
|
if (nextPos < 0 || nextPos >= length.value()) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
aBreakBefore[nextPos] = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!useDefault) {
|
||||||
|
capi::ICU4XLineSegmenter_destroy(lineSegmenter);
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
uint32_t cur;
|
uint32_t cur;
|
||||||
int8_t lastClass = CLASS_NONE;
|
int8_t lastClass = CLASS_NONE;
|
||||||
ContextState state(aChars, aLength);
|
ContextState state(aChars, aLength);
|
||||||
|
|
|
||||||
|
|
@ -11,9 +11,19 @@
|
||||||
#include "mozilla/intl/LineBreaker.h"
|
#include "mozilla/intl/LineBreaker.h"
|
||||||
#include "mozilla/intl/WordBreaker.h"
|
#include "mozilla/intl/WordBreaker.h"
|
||||||
#include "mozilla/intl/UnicodeProperties.h"
|
#include "mozilla/intl/UnicodeProperties.h"
|
||||||
|
#include "mozilla/StaticPrefs_intl.h"
|
||||||
#include "nsUnicodeProperties.h"
|
#include "nsUnicodeProperties.h"
|
||||||
#include "nsCharTraits.h"
|
#include "nsCharTraits.h"
|
||||||
|
|
||||||
|
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
|
||||||
|
# include "ICU4XDataProvider.h"
|
||||||
|
# include "ICU4XGraphemeClusterSegmenter.h"
|
||||||
|
# include "ICU4XLineSegmenter.h"
|
||||||
|
# include "ICU4XSentenceSegmenter.h"
|
||||||
|
# include "ICU4XWordSegmenter.h"
|
||||||
|
# include "mozilla/intl/ICU4XGeckoDataProvider.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
using namespace mozilla::unicode;
|
using namespace mozilla::unicode;
|
||||||
|
|
||||||
namespace mozilla::intl {
|
namespace mozilla::intl {
|
||||||
|
|
@ -30,9 +40,45 @@ Maybe<uint32_t> SegmentIteratorUtf16::Seek(uint32_t aPos) {
|
||||||
|
|
||||||
// Builds a line break iterator over aText. When ICU4X support is compiled in
// and the intl.icu4x.segmenter pref is enabled, an ICU4X line segmenter and
// iterator are created; otherwise both handles stay null.
// NOTE(review): the ICU4X segmenter is created with create_auto() and does
// not consult mOptions here - confirm that is intended.
LineBreakIteratorUtf16::LineBreakIteratorUtf16(Span<const char16_t> aText,
                                               const LineBreakOptions& aOptions)
    : SegmentIteratorUtf16(aText), mOptions(aOptions) {
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
  if (StaticPrefs::intl_icu4x_segmenter_enabled()) {
    auto created =
        capi::ICU4XLineSegmenter_create_auto(mozilla::intl::GetDataProvider());
    MOZ_RELEASE_ASSERT(created.is_ok);
    mSegmenter = created.ok;

    const auto* utf16 = reinterpret_cast<const uint16_t*>(mText.Elements());
    mIterator = capi::ICU4XLineSegmenter_segment_utf16(mSegmenter, utf16,
                                                       mText.Length());
  }
#endif
}
|
||||||
|
|
||||||
|
// Releases the ICU4X handles, if any were created by the constructor. The
// iterator is destroyed before the segmenter it was obtained from.
LineBreakIteratorUtf16::~LineBreakIteratorUtf16() {
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
  if (mIterator != nullptr) {
    capi::ICU4XLineBreakIteratorUtf16_destroy(mIterator);
  }
  if (mSegmenter != nullptr) {
    capi::ICU4XLineSegmenter_destroy(mSegmenter);
  }
#endif
}
|
||||||
|
|
||||||
Maybe<uint32_t> LineBreakIteratorUtf16::Next() {
|
Maybe<uint32_t> LineBreakIteratorUtf16::Next() {
|
||||||
|
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
|
||||||
|
if (mIterator) {
|
||||||
|
const int32_t nextPos = capi::ICU4XLineBreakIteratorUtf16_next(mIterator);
|
||||||
|
if (nextPos < 0) {
|
||||||
|
return Nothing();
|
||||||
|
}
|
||||||
|
if (!nextPos) {
|
||||||
|
return Next();
|
||||||
|
}
|
||||||
|
mPos = nextPos;
|
||||||
|
return Some(mPos);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
const int32_t nextPos =
|
const int32_t nextPos =
|
||||||
LineBreaker::Next(mText.Elements(), mText.Length(), mPos);
|
LineBreaker::Next(mText.Elements(), mText.Length(), mPos);
|
||||||
if (nextPos == NS_LINEBREAKER_NEED_MORE_TEXT) {
|
if (nextPos == NS_LINEBREAKER_NEED_MORE_TEXT) {
|
||||||
|
|
@ -42,10 +88,71 @@ Maybe<uint32_t> LineBreakIteratorUtf16::Next() {
|
||||||
return Some(mPos);
|
return Some(mPos);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Advances to the first line break strictly after aPos and returns it, or
// Nothing() when the iterator runs out of breaks first. If the current
// position is already at or past aPos this is the same as calling Next().
// Without an ICU4X iterator the base-class Seek() is used.
Maybe<uint32_t> LineBreakIteratorUtf16::Seek(uint32_t aPos) {
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
  if (mIterator) {
    if (aPos <= mPos) {
      return Next();
    }

    // mPos < aPos holds here, so at least one break has to be consumed.
    do {
      const int32_t boundary =
          capi::ICU4XLineBreakIteratorUtf16_next(mIterator);
      if (boundary < 0) {
        return Nothing();
      }
      mPos = static_cast<uint32_t>(boundary);
    } while (mPos < aPos);

    // A break exactly at aPos is skipped; the caller wants the one after it.
    if (mPos == aPos) {
      return Next();
    }
    return Some(mPos);
  }
#endif
  return SegmentIteratorUtf16::Seek(aPos);
}
|
||||||
|
|
||||||
// Builds a word break iterator over aText. When ICU4X support is compiled in
// and the intl.icu4x.segmenter pref is enabled, an ICU4X word segmenter and
// iterator are created; otherwise both handles stay null.
WordBreakIteratorUtf16::WordBreakIteratorUtf16(Span<const char16_t> aText)
    : SegmentIteratorUtf16(aText) {
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
  if (StaticPrefs::intl_icu4x_segmenter_enabled()) {
    auto created =
        capi::ICU4XWordSegmenter_create_auto(mozilla::intl::GetDataProvider());
    MOZ_RELEASE_ASSERT(created.is_ok);
    mSegmenter = created.ok;

    const auto* utf16 = reinterpret_cast<const uint16_t*>(mText.Elements());
    mIterator = capi::ICU4XWordSegmenter_segment_utf16(mSegmenter, utf16,
                                                       mText.Length());
  }
#endif
}
|
||||||
|
|
||||||
|
// Releases the ICU4X handles, if any were created by the constructor. The
// iterator is destroyed before the segmenter it was obtained from.
WordBreakIteratorUtf16::~WordBreakIteratorUtf16() {
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
  if (mIterator != nullptr) {
    capi::ICU4XWordBreakIteratorUtf16_destroy(mIterator);
  }
  if (mSegmenter != nullptr) {
    capi::ICU4XWordSegmenter_destroy(mSegmenter);
  }
#endif
}
|
||||||
|
|
||||||
Maybe<uint32_t> WordBreakIteratorUtf16::Next() {
|
Maybe<uint32_t> WordBreakIteratorUtf16::Next() {
|
||||||
|
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
|
||||||
|
if (mIterator) {
|
||||||
|
const int32_t nextPos = capi::ICU4XWordBreakIteratorUtf16_next(mIterator);
|
||||||
|
if (nextPos < 0) {
|
||||||
|
return Nothing();
|
||||||
|
}
|
||||||
|
if (!nextPos) {
|
||||||
|
return Next();
|
||||||
|
}
|
||||||
|
mPos = nextPos;
|
||||||
|
return Some(mPos);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
const int32_t nextPos =
|
const int32_t nextPos =
|
||||||
WordBreaker::Next(mText.Elements(), mText.Length(), mPos);
|
WordBreaker::Next(mText.Elements(), mText.Length(), mPos);
|
||||||
if (nextPos == NS_WORDBREAKER_NEED_MORE_TEXT) {
|
if (nextPos == NS_WORDBREAKER_NEED_MORE_TEXT) {
|
||||||
|
|
@ -55,9 +162,57 @@ Maybe<uint32_t> WordBreakIteratorUtf16::Next() {
|
||||||
return Some(mPos);
|
return Some(mPos);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Advances to the first word break strictly after aPos and returns it, or
// Nothing() when the iterator runs out of breaks first. If the current
// position is already at or past aPos this is the same as calling Next().
// Without an ICU4X iterator the base-class Seek() is used.
Maybe<uint32_t> WordBreakIteratorUtf16::Seek(uint32_t aPos) {
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
  if (mIterator) {
    if (aPos <= mPos) {
      return Next();
    }

    // mPos < aPos holds here, so at least one break has to be consumed.
    do {
      const int32_t boundary =
          capi::ICU4XWordBreakIteratorUtf16_next(mIterator);
      if (boundary < 0) {
        return Nothing();
      }
      mPos = static_cast<uint32_t>(boundary);
    } while (mPos < aPos);

    // A break exactly at aPos is skipped; the caller wants the one after it.
    if (mPos == aPos) {
      return Next();
    }
    return Some(mPos);
  }
#endif
  return SegmentIteratorUtf16::Seek(aPos);
}
|
||||||
|
|
||||||
// Builds a grapheme cluster break iterator over aText. When ICU4X support is
// compiled in and the intl.icu4x.segmenter pref is enabled, an ICU4X grapheme
// cluster segmenter and iterator are created; otherwise both handles stay
// null.
GraphemeClusterBreakIteratorUtf16::GraphemeClusterBreakIteratorUtf16(
    Span<const char16_t> aText)
    : SegmentIteratorUtf16(aText) {
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
  if (StaticPrefs::intl_icu4x_segmenter_enabled()) {
    auto created = capi::ICU4XGraphemeClusterSegmenter_create(
        mozilla::intl::GetDataProvider());
    MOZ_RELEASE_ASSERT(created.is_ok);
    mSegmenter = created.ok;

    const auto* utf16 = reinterpret_cast<const uint16_t*>(mText.Elements());
    mIterator = capi::ICU4XGraphemeClusterSegmenter_segment_utf16(
        mSegmenter, utf16, mText.Length());
  }
#endif
}
|
||||||
|
|
||||||
|
// Releases the ICU4X handles, if any were created by the constructor. The
// iterator is destroyed before the segmenter it was obtained from.
GraphemeClusterBreakIteratorUtf16::~GraphemeClusterBreakIteratorUtf16() {
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
  if (mIterator != nullptr) {
    capi::ICU4XGraphemeClusterBreakIteratorUtf16_destroy(mIterator);
  }
  if (mSegmenter != nullptr) {
    capi::ICU4XGraphemeClusterSegmenter_destroy(mSegmenter);
  }
#endif
}
|
||||||
|
|
||||||
enum HSType {
|
enum HSType {
|
||||||
HST_NONE = U_HST_NOT_APPLICABLE,
|
HST_NONE = U_HST_NOT_APPLICABLE,
|
||||||
|
|
@ -75,6 +230,20 @@ static HSType GetHangulSyllableType(uint32_t aCh) {
|
||||||
|
|
||||||
Maybe<uint32_t> GraphemeClusterBreakIteratorUtf16::Next() {
|
Maybe<uint32_t> GraphemeClusterBreakIteratorUtf16::Next() {
|
||||||
const auto len = mText.Length();
|
const auto len = mText.Length();
|
||||||
|
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
|
||||||
|
if (mIterator) {
|
||||||
|
const int32_t nextPos =
|
||||||
|
capi::ICU4XGraphemeClusterBreakIteratorUtf16_next(mIterator);
|
||||||
|
if (nextPos < 0) {
|
||||||
|
return Nothing();
|
||||||
|
}
|
||||||
|
if (!nextPos) {
|
||||||
|
return Next();
|
||||||
|
}
|
||||||
|
mPos = nextPos;
|
||||||
|
return Some(mPos);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
if (mPos >= len) {
|
if (mPos >= len) {
|
||||||
// The iterator has already reached the end.
|
// The iterator has already reached the end.
|
||||||
return Nothing();
|
return Nothing();
|
||||||
|
|
@ -195,6 +364,32 @@ Maybe<uint32_t> GraphemeClusterBreakIteratorUtf16::Next() {
|
||||||
return Some(mPos);
|
return Some(mPos);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Advances to the first grapheme cluster break strictly after aPos and
// returns it, or Nothing() when the iterator runs out of breaks first. If the
// current position is already at or past aPos this is the same as calling
// Next(). Without an ICU4X iterator the base-class Seek() is used.
Maybe<uint32_t> GraphemeClusterBreakIteratorUtf16::Seek(uint32_t aPos) {
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
  if (mIterator) {
    if (aPos <= mPos) {
      return Next();
    }

    // mPos < aPos holds here, so at least one break has to be consumed.
    do {
      const int32_t boundary =
          capi::ICU4XGraphemeClusterBreakIteratorUtf16_next(mIterator);
      if (boundary < 0) {
        return Nothing();
      }
      mPos = static_cast<uint32_t>(boundary);
    } while (mPos < aPos);

    // A break exactly at aPos is skipped; the caller wants the one after it.
    if (mPos == aPos) {
      return Next();
    }
    return Some(mPos);
  }
#endif
  return SegmentIteratorUtf16::Seek(aPos);
}
|
||||||
|
|
||||||
GraphemeClusterBreakReverseIteratorUtf16::
|
GraphemeClusterBreakReverseIteratorUtf16::
|
||||||
GraphemeClusterBreakReverseIteratorUtf16(Span<const char16_t> aText)
|
GraphemeClusterBreakReverseIteratorUtf16(Span<const char16_t> aText)
|
||||||
: SegmentIteratorUtf16(aText) {
|
: SegmentIteratorUtf16(aText) {
|
||||||
|
|
@ -231,12 +426,77 @@ Maybe<uint32_t> GraphemeClusterBreakReverseIteratorUtf16::Seek(uint32_t aPos) {
|
||||||
return Next();
|
return Next();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
|
||||||
|
// Builds a sentence break iterator over aText backed by an ICU4X sentence
// segmenter. Unlike the other iterators, this constructor does not check the
// intl.icu4x.segmenter pref itself.
SentenceBreakIteratorUtf16::SentenceBreakIteratorUtf16(
    Span<const char16_t> aText)
    : SegmentIteratorUtf16(aText) {
  auto created =
      capi::ICU4XSentenceSegmenter_create(mozilla::intl::GetDataProvider());
  MOZ_RELEASE_ASSERT(created.is_ok);
  mSegmenter = created.ok;

  const auto* utf16 = reinterpret_cast<const uint16_t*>(mText.Elements());
  mIterator = capi::ICU4XSentenceSegmenter_segment_utf16(mSegmenter, utf16,
                                                         mText.Length());
}
|
||||||
|
|
||||||
|
// Releases the ICU4X handles. The iterator is destroyed before the segmenter
// it was obtained from.
SentenceBreakIteratorUtf16::~SentenceBreakIteratorUtf16() {
  if (mIterator != nullptr) {
    capi::ICU4XSentenceBreakIteratorUtf16_destroy(mIterator);
  }
  if (mSegmenter != nullptr) {
    capi::ICU4XSentenceSegmenter_destroy(mSegmenter);
  }
}
|
||||||
|
|
||||||
|
// Advances to the first sentence break strictly after aPos and returns it, or
// Nothing() when there is no iterator or it runs out of breaks first. If the
// current position is already at or past aPos this is the same as calling
// Next().
Maybe<uint32_t> SentenceBreakIteratorUtf16::Seek(uint32_t aPos) {
  if (mIterator == nullptr) {
    return Nothing();
  }
  if (aPos <= mPos) {
    return Next();
  }

  // mPos < aPos holds here, so at least one break has to be consumed.
  do {
    const int32_t boundary =
        capi::ICU4XSentenceBreakIteratorUtf16_next(mIterator);
    if (boundary < 0) {
      return Nothing();
    }
    mPos = static_cast<uint32_t>(boundary);
  } while (mPos < aPos);

  // A break exactly at aPos is skipped; the caller wants the one after it.
  if (mPos == aPos) {
    return Next();
  }
  return Some(mPos);
}
|
||||||
|
|
||||||
|
// Returns the next sentence break position, or Nothing() when there is no
// iterator or it reports a negative value. A reported position of 0 is
// skipped and the following one is used instead.
Maybe<uint32_t> SentenceBreakIteratorUtf16::Next() {
  if (mIterator == nullptr) {
    return Nothing();
  }

  for (;;) {
    const int32_t boundary =
        capi::ICU4XSentenceBreakIteratorUtf16_next(mIterator);
    if (boundary < 0) {
      return Nothing();
    }
    if (boundary != 0) {
      mPos = boundary;
      return Some(mPos);
    }
    // boundary == 0: keep pulling from the iterator.
  }
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Creates a Segmenter for aLocale with the given options.
//
// Returns ICUError::InternalError for SegmenterGranularity::Sentence whenever
// no sentence iterator is available: either ICU4X support is not compiled in,
// or it is compiled in but the intl.icu4x.segmenter pref is disabled. In the
// latter case Segment() has no implementation to hand out, so fail here
// instead of returning a Segmenter whose Segment() call would yield nullptr.
Result<UniquePtr<Segmenter>, ICUError> Segmenter::TryCreate(
    Span<const char> aLocale, const SegmenterOptions& aOptions) {
  if (aOptions.mGranularity == SegmenterGranularity::Sentence) {
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
    // The sentence iterator exists only on the ICU4X path, which Segment()
    // takes only while the pref is enabled.
    if (!StaticPrefs::intl_icu4x_segmenter_enabled()) {
      return Err(ICUError::InternalError);
    }
#else
    // Sentence iterator is not implemented without ICU4X.
    return Err(ICUError::InternalError);
#endif
  }
  return MakeUnique<Segmenter>(aLocale, aOptions);
}
|
||||||
|
|
||||||
|
|
@ -246,6 +506,11 @@ UniquePtr<SegmentIteratorUtf16> Segmenter::Segment(
|
||||||
case SegmenterGranularity::Grapheme:
|
case SegmenterGranularity::Grapheme:
|
||||||
return MakeUnique<GraphemeClusterBreakIteratorUtf16>(aText);
|
return MakeUnique<GraphemeClusterBreakIteratorUtf16>(aText);
|
||||||
case SegmenterGranularity::Sentence:
|
case SegmenterGranularity::Sentence:
|
||||||
|
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
|
||||||
|
if (StaticPrefs::intl_icu4x_segmenter_enabled()) {
|
||||||
|
return MakeUnique<SentenceBreakIteratorUtf16>(aText);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
MOZ_ASSERT_UNREACHABLE("Unimplemented yet!");
|
MOZ_ASSERT_UNREACHABLE("Unimplemented yet!");
|
||||||
return nullptr;
|
return nullptr;
|
||||||
case SegmenterGranularity::Word:
|
case SegmenterGranularity::Word:
|
||||||
|
|
|
||||||
|
|
@ -15,6 +15,19 @@
|
||||||
#include "mozilla/Span.h"
|
#include "mozilla/Span.h"
|
||||||
#include "mozilla/UniquePtr.h"
|
#include "mozilla/UniquePtr.h"
|
||||||
|
|
||||||
|
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
|
||||||
|
// Forward declarations of the opaque ICU4X C API types referenced by the
// iterator classes in this header, so the ICU4X headers need not be included
// here.
namespace capi {
// Segmenters.
struct ICU4XLineSegmenter;
struct ICU4XWordSegmenter;
struct ICU4XGraphemeClusterSegmenter;
struct ICU4XSentenceSegmenter;
// UTF-16 break iterators produced by the segmenters above.
struct ICU4XLineBreakIteratorUtf16;
struct ICU4XWordBreakIteratorUtf16;
struct ICU4XGraphemeClusterBreakIteratorUtf16;
struct ICU4XSentenceBreakIteratorUtf16;
}  // namespace capi
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace mozilla::intl {
|
namespace mozilla::intl {
|
||||||
|
|
||||||
enum class SegmenterGranularity : uint8_t {
|
enum class SegmenterGranularity : uint8_t {
|
||||||
|
|
@ -104,11 +117,18 @@ class LineBreakIteratorUtf16 final : public SegmentIteratorUtf16 {
|
||||||
public:
|
public:
|
||||||
explicit LineBreakIteratorUtf16(Span<const char16_t> aText,
|
explicit LineBreakIteratorUtf16(Span<const char16_t> aText,
|
||||||
const LineBreakOptions& aOptions = {});
|
const LineBreakOptions& aOptions = {});
|
||||||
|
~LineBreakIteratorUtf16() override;
|
||||||
|
|
||||||
Maybe<uint32_t> Next() override;
|
Maybe<uint32_t> Next() override;
|
||||||
|
Maybe<uint32_t> Seek(uint32_t aPos) override;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
LineBreakOptions mOptions;
|
LineBreakOptions mOptions;
|
||||||
|
|
||||||
|
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
  // ICU4X handles; null unless the constructor set up the ICU4X path. The
  // guard matches the one around the capi forward declarations in this
  // header, so these types are always declared when the members exist.
  capi::ICU4XLineSegmenter* mSegmenter = nullptr;
  capi::ICU4XLineBreakIteratorUtf16* mIterator = nullptr;
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -117,8 +137,16 @@ class LineBreakIteratorUtf16 final : public SegmentIteratorUtf16 {
|
||||||
class WordBreakIteratorUtf16 final : public SegmentIteratorUtf16 {
|
class WordBreakIteratorUtf16 final : public SegmentIteratorUtf16 {
|
||||||
public:
|
public:
|
||||||
explicit WordBreakIteratorUtf16(Span<const char16_t> aText);
|
explicit WordBreakIteratorUtf16(Span<const char16_t> aText);
|
||||||
|
~WordBreakIteratorUtf16() override;
|
||||||
|
|
||||||
Maybe<uint32_t> Next() override;
|
Maybe<uint32_t> Next() override;
|
||||||
|
Maybe<uint32_t> Seek(uint32_t aPos) override;
|
||||||
|
|
||||||
|
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
|
||||||
|
private:
|
||||||
|
capi::ICU4XWordSegmenter* mSegmenter = nullptr;
|
||||||
|
capi::ICU4XWordBreakIteratorUtf16* mIterator = nullptr;
|
||||||
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -127,8 +155,16 @@ class WordBreakIteratorUtf16 final : public SegmentIteratorUtf16 {
|
||||||
class GraphemeClusterBreakIteratorUtf16 final : public SegmentIteratorUtf16 {
|
class GraphemeClusterBreakIteratorUtf16 final : public SegmentIteratorUtf16 {
|
||||||
public:
|
public:
|
||||||
explicit GraphemeClusterBreakIteratorUtf16(Span<const char16_t> aText);
|
explicit GraphemeClusterBreakIteratorUtf16(Span<const char16_t> aText);
|
||||||
|
~GraphemeClusterBreakIteratorUtf16() override;
|
||||||
|
|
||||||
Maybe<uint32_t> Next() override;
|
Maybe<uint32_t> Next() override;
|
||||||
|
Maybe<uint32_t> Seek(uint32_t aPos) override;
|
||||||
|
|
||||||
|
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
|
||||||
|
private:
|
||||||
|
capi::ICU4XGraphemeClusterSegmenter* mSegmenter = nullptr;
|
||||||
|
capi::ICU4XGraphemeClusterBreakIteratorUtf16* mIterator = nullptr;
|
||||||
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -146,6 +182,24 @@ class GraphemeClusterBreakReverseIteratorUtf16 final
|
||||||
Maybe<uint32_t> Seek(uint32_t aPos) override;
|
Maybe<uint32_t> Seek(uint32_t aPos) override;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
|
||||||
|
/**
|
||||||
|
* Sentence break iterator for UTF-16 text.
|
||||||
|
*/
|
||||||
|
class SentenceBreakIteratorUtf16 final : public SegmentIteratorUtf16 {
|
||||||
|
public:
|
||||||
|
explicit SentenceBreakIteratorUtf16(Span<const char16_t> aText);
|
||||||
|
~SentenceBreakIteratorUtf16() override;
|
||||||
|
|
||||||
|
Maybe<uint32_t> Next() override;
|
||||||
|
Maybe<uint32_t> Seek(uint32_t aPos) override;
|
||||||
|
|
||||||
|
private:
|
||||||
|
capi::ICU4XSentenceSegmenter* mSegmenter = nullptr;
|
||||||
|
capi::ICU4XSentenceBreakIteratorUtf16* mIterator = nullptr;
|
||||||
|
};
|
||||||
|
#endif
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This component is a Mozilla-focused API for working with segmenters in
|
* This component is a Mozilla-focused API for working with segmenters in
|
||||||
* internationalization code.
|
* internationalization code.
|
||||||
|
|
|
||||||
|
|
@ -10,6 +10,14 @@
|
||||||
#include "nsTArray.h"
|
#include "nsTArray.h"
|
||||||
#include "nsUnicodeProperties.h"
|
#include "nsUnicodeProperties.h"
|
||||||
|
|
||||||
|
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
|
||||||
|
# include "ICU4XDataProvider.h"
|
||||||
|
# include "ICU4XWordBreakIteratorUtf16.hpp"
|
||||||
|
# include "ICU4XWordSegmenter.hpp"
|
||||||
|
# include "mozilla/intl/ICU4XGeckoDataProvider.h"
|
||||||
|
# include "mozilla/StaticPrefs_intl.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
using mozilla::intl::Script;
|
using mozilla::intl::Script;
|
||||||
using mozilla::intl::UnicodeProperties;
|
using mozilla::intl::UnicodeProperties;
|
||||||
using mozilla::intl::WordBreaker;
|
using mozilla::intl::WordBreaker;
|
||||||
|
|
@ -102,9 +110,34 @@ WordRange WordBreaker::FindWord(const char16_t* aText, uint32_t aLen,
|
||||||
return {aLen, aLen};
|
return {aLen, aLen};
|
||||||
}
|
}
|
||||||
|
|
||||||
WordBreakClass c = GetClass(aText[aPos]);
|
|
||||||
WordRange range{0, aLen};
|
WordRange range{0, aLen};
|
||||||
|
|
||||||
|
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
|
||||||
|
if (StaticPrefs::intl_icu4x_segmenter_enabled()) {
|
||||||
|
auto result =
|
||||||
|
capi::ICU4XWordSegmenter_create_auto(mozilla::intl::GetDataProvider());
|
||||||
|
MOZ_ASSERT(result.is_ok);
|
||||||
|
ICU4XWordSegmenter segmenter(result.ok);
|
||||||
|
ICU4XWordBreakIteratorUtf16 iterator =
|
||||||
|
segmenter.segment_utf16(diplomat::span((const uint16_t*)aText, aLen));
|
||||||
|
|
||||||
|
uint32_t previousPos = 0;
|
||||||
|
while (true) {
|
||||||
|
const int32_t nextPos = iterator.next();
|
||||||
|
if (nextPos < 0) {
|
||||||
|
return {previousPos, aLen};
|
||||||
|
}
|
||||||
|
if ((uint32_t)nextPos > aPos) {
|
||||||
|
return {previousPos, (uint32_t)nextPos};
|
||||||
|
}
|
||||||
|
|
||||||
|
previousPos = nextPos;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
WordBreakClass c = GetClass(aText[aPos]);
|
||||||
|
|
||||||
// Scan forward
|
// Scan forward
|
||||||
for (uint32_t i = aPos + 1; i <= aLen; i++) {
|
for (uint32_t i = aPos + 1; i <= aLen; i++) {
|
||||||
if (c != GetClass(aText[i])) {
|
if (c != GetClass(aText[i])) {
|
||||||
|
|
|
||||||
|
|
@ -7,11 +7,15 @@
|
||||||
#include "gtest/gtest.h"
|
#include "gtest/gtest.h"
|
||||||
|
|
||||||
#include "mozilla/intl/Segmenter.h"
|
#include "mozilla/intl/Segmenter.h"
|
||||||
|
#include "mozilla/Preferences.h"
|
||||||
|
|
||||||
namespace mozilla::intl {
|
namespace mozilla::intl {
|
||||||
|
|
||||||
TEST(IntlSegmenter, TestLineBreakIteratorUtf16)
|
TEST(IntlSegmenter, TestLineBreakIteratorUtf16SeekOld)
|
||||||
{
|
{
|
||||||
|
nsresult rv = Preferences::SetBool("intl.icu4x.segmenter.enabled", false);
|
||||||
|
EXPECT_TRUE(rv == NS_OK);
|
||||||
|
|
||||||
const SegmenterOptions options{SegmenterGranularity::Line};
|
const SegmenterOptions options{SegmenterGranularity::Line};
|
||||||
auto result = Segmenter::TryCreate("en", options);
|
auto result = Segmenter::TryCreate("en", options);
|
||||||
ASSERT_TRUE(result.isOk());
|
ASSERT_TRUE(result.isOk());
|
||||||
|
|
@ -30,7 +34,50 @@ TEST(IntlSegmenter, TestLineBreakIteratorUtf16)
|
||||||
ASSERT_EQ(segIter->Seek(0u), Nothing());
|
ASSERT_EQ(segIter->Seek(0u), Nothing());
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(IntlSegmenter, TestWordBreakIteratorUtf16)
|
// Exercises Seek()/Next() on the line break iterator with the
// intl.icu4x.segmenter.enabled pref switched on.
TEST(IntlSegmenter, TestLineBreakIteratorUtf16Seek)
{
  EXPECT_TRUE(Preferences::SetBool("intl.icu4x.segmenter.enabled", true) ==
              NS_OK);

  const SegmenterOptions lineOptions{SegmenterGranularity::Line};
  auto created = Segmenter::TryCreate("en", lineOptions);
  ASSERT_TRUE(created.isOk());
  auto segmenter = created.unwrap();

  const char16_t text[] = u"hello world";
  UniquePtr<SegmentIteratorUtf16> iter =
      segmenter->Segment(MakeStringSpan(text));

  // Seek to space between "hello" and "world".
  // UAX#14 rule returns before "w".
  ASSERT_EQ(iter->Seek(5u), Some(6u));

  // The next break is the end of the text, after which nothing is left.
  ASSERT_EQ(iter->Next(), Some(11u));
  ASSERT_EQ(iter->Next(), Nothing());

  // Same as calling Next().
  ASSERT_EQ(iter->Seek(0u), Nothing());
}
|
||||||
|
|
||||||
|
TEST(IntlSegmenter, TestWordBreakIteratorUtf16Simple)
|
||||||
|
{
|
||||||
|
const SegmenterOptions options{SegmenterGranularity::Word};
|
||||||
|
auto result = Segmenter::TryCreate("en", options);
|
||||||
|
ASSERT_TRUE(result.isOk());
|
||||||
|
auto wordSegmenter = result.unwrap();
|
||||||
|
|
||||||
|
const char16_t text[] = u"hello world";
|
||||||
|
UniquePtr<SegmentIteratorUtf16> segIter =
|
||||||
|
wordSegmenter->Segment(MakeStringSpan(text));
|
||||||
|
|
||||||
|
ASSERT_EQ(segIter->Next(), Some(5u));
|
||||||
|
ASSERT_EQ(segIter->Next(), Some(6u));
|
||||||
|
ASSERT_EQ(segIter->Next(), Some(11u));
|
||||||
|
ASSERT_EQ(segIter->Next(), Nothing());
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(IntlSegmenter, TestWordBreakIteratorUtf16Seek)
|
||||||
{
|
{
|
||||||
const SegmenterOptions options{SegmenterGranularity::Word};
|
const SegmenterOptions options{SegmenterGranularity::Word};
|
||||||
auto result = Segmenter::TryCreate("en", options);
|
auto result = Segmenter::TryCreate("en", options);
|
||||||
|
|
@ -51,7 +98,32 @@ TEST(IntlSegmenter, TestWordBreakIteratorUtf16)
|
||||||
ASSERT_EQ(segIter->Seek(0u), Nothing());
|
ASSERT_EQ(segIter->Seek(0u), Nothing());
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(IntlSegmenter, TestGraphemeClusterBreakIteratorUtf16)
|
TEST(IntlSegmenter, TestGraphemeClusterBreakIteratorUtf16Simple)
|
||||||
|
{
|
||||||
|
SegmenterOptions options{SegmenterGranularity::Grapheme};
|
||||||
|
auto result = Segmenter::TryCreate("en", options);
|
||||||
|
ASSERT_TRUE(result.isOk());
|
||||||
|
auto graphemeClusterSegmenter = result.unwrap();
|
||||||
|
|
||||||
|
const char16_t text[] = u"hello world";
|
||||||
|
UniquePtr<SegmentIteratorUtf16> segIter =
|
||||||
|
graphemeClusterSegmenter->Segment(MakeStringSpan(text));
|
||||||
|
|
||||||
|
ASSERT_EQ(segIter->Next(), Some(1u));
|
||||||
|
ASSERT_EQ(segIter->Next(), Some(2u));
|
||||||
|
ASSERT_EQ(segIter->Next(), Some(3u));
|
||||||
|
ASSERT_EQ(segIter->Next(), Some(4u));
|
||||||
|
ASSERT_EQ(segIter->Next(), Some(5u));
|
||||||
|
ASSERT_EQ(segIter->Next(), Some(6u));
|
||||||
|
ASSERT_EQ(segIter->Next(), Some(7u));
|
||||||
|
ASSERT_EQ(segIter->Next(), Some(8u));
|
||||||
|
ASSERT_EQ(segIter->Next(), Some(9u));
|
||||||
|
ASSERT_EQ(segIter->Next(), Some(10u));
|
||||||
|
ASSERT_EQ(segIter->Next(), Some(11u));
|
||||||
|
ASSERT_EQ(segIter->Next(), Nothing());
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(IntlSegmenter, TestGraphemeClusterBreakIteratorUtf16Seek)
|
||||||
{
|
{
|
||||||
SegmenterOptions options{SegmenterGranularity::Grapheme};
|
SegmenterOptions options{SegmenterGranularity::Grapheme};
|
||||||
auto result = Segmenter::TryCreate("en", options);
|
auto result = Segmenter::TryCreate("en", options);
|
||||||
|
|
@ -97,9 +169,41 @@ TEST(IntlSegmenter, TestGraphemeClusterBreakReverseIteratorUtf16)
|
||||||
|
|
||||||
TEST(IntlSegmenter, TestSentenceBreakIteratorUtf16)
|
TEST(IntlSegmenter, TestSentenceBreakIteratorUtf16)
|
||||||
{
|
{
|
||||||
|
nsresult rv = Preferences::SetBool("intl.icu4x.segmenter.enabled", true);
|
||||||
|
EXPECT_TRUE(rv == NS_OK);
|
||||||
|
|
||||||
SegmenterOptions options{SegmenterGranularity::Sentence};
|
SegmenterOptions options{SegmenterGranularity::Sentence};
|
||||||
auto result = Segmenter::TryCreate("en", options);
|
auto result = Segmenter::TryCreate("en", options);
|
||||||
ASSERT_TRUE(result.isErr());
|
ASSERT_TRUE(result.isOk());
|
||||||
|
auto sentenceSegmenter = result.unwrap();
|
||||||
|
|
||||||
|
const char16_t text[] = u"Hello world. Hello world.";
|
||||||
|
UniquePtr<SegmentIteratorUtf16> segIter =
|
||||||
|
sentenceSegmenter->Segment(MakeStringSpan(text));
|
||||||
|
|
||||||
|
ASSERT_EQ(segIter->Next(), Some(13u));
|
||||||
|
ASSERT_EQ(segIter->Next(), Some(25u));
|
||||||
|
ASSERT_EQ(segIter->Next(), Nothing());
|
||||||
|
|
||||||
|
// Same as calling Next().
|
||||||
|
ASSERT_EQ(segIter->Seek(0u), Nothing());
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(IntlSegmenter, TestSentenceBreakIteratorUtf16Seek)
|
||||||
|
{
|
||||||
|
nsresult rv = Preferences::SetBool("intl.icu4x.segmenter.enabled", true);
|
||||||
|
EXPECT_TRUE(rv == NS_OK);
|
||||||
|
|
||||||
|
SegmenterOptions options{SegmenterGranularity::Sentence};
|
||||||
|
auto result = Segmenter::TryCreate("en", options);
|
||||||
|
ASSERT_TRUE(result.isOk());
|
||||||
|
auto sentenceSegmenter = result.unwrap();
|
||||||
|
|
||||||
|
const char16_t text[] = u"Hello world. Hello world.";
|
||||||
|
UniquePtr<SegmentIteratorUtf16> segIter =
|
||||||
|
sentenceSegmenter->Segment(MakeStringSpan(text));
|
||||||
|
|
||||||
|
ASSERT_EQ(segIter->Seek(5u), Some(13u));
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace mozilla::intl
|
} // namespace mozilla::intl
|
||||||
|
|
|
||||||
|
|
@ -42,4 +42,16 @@ else:
|
||||||
"rulebrk.c",
|
"rulebrk.c",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
if CONFIG["JS_HAS_INTL_API"] and CONFIG["MOZ_ICU4X"]:
|
||||||
|
LOCAL_INCLUDES += [
|
||||||
|
"/third_party/rust/icu_capi/cpp/include",
|
||||||
|
]
|
||||||
|
# Disable warnings when including C++ headers of ICU4X.
|
||||||
|
# - https://github.com/rust-diplomat/diplomat/issues/277
|
||||||
|
# - https://github.com/rust-diplomat/diplomat/issues/335
|
||||||
|
CXXFLAGS += [
|
||||||
|
"-Wno-mismatched-tags",
|
||||||
|
"-Wno-pessimizing-move",
|
||||||
|
]
|
||||||
|
|
||||||
FINAL_LIBRARY = "xul"
|
FINAL_LIBRARY = "xul"
|
||||||
|
|
|
||||||
|
|
@ -7304,6 +7304,12 @@
|
||||||
mirror: always
|
mirror: always
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
# If true, we use UAX14/29 compatible segmenter rules using ICU4X
|
||||||
|
- name: intl.icu4x.segmenter.enabled
|
||||||
|
type: RelaxedAtomicBool
|
||||||
|
value: false
|
||||||
|
mirror: always
|
||||||
|
|
||||||
#---------------------------------------------------------------------------
|
#---------------------------------------------------------------------------
|
||||||
# Prefs starting with "javascript."
|
# Prefs starting with "javascript."
|
||||||
#
|
#
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue