mirror of
https://github.com/mozilla/gecko-dev.git
synced 2025-11-02 17:28:50 +02:00
Includes the examples from the report as a testcase, though there is not yet any formal spec for the exact behavior of segment break transformation. (But nevertheless there is an existing collection of tests, so this just adds one for the punctuation case.) Differential Revision: https://phabricator.services.mozilla.com/D231476
441 lines
16 KiB
C++
441 lines
16 KiB
C++
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
|
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
|
|
/* This Source Code Form is subject to the terms of the Mozilla Public
|
|
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
|
|
|
#include "nsTextFrameUtils.h"
|
|
|
|
#include "mozilla/dom/Text.h"
|
|
#include "nsBidiUtils.h"
|
|
#include "nsCharTraits.h"
|
|
#include "nsIContent.h"
|
|
#include "nsStyleStruct.h"
|
|
#include "nsTextFragment.h"
|
|
#include "nsUnicharUtils.h"
|
|
#include "nsUnicodeProperties.h"
|
|
#include <algorithm>
|
|
|
|
using namespace mozilla;
|
|
using namespace mozilla::dom;
|
|
using namespace mozilla::unicode;
|
|
|
|
// static
|
|
bool nsTextFrameUtils::IsSpaceCombiningSequenceTail(const char16_t* aChars,
|
|
int32_t aLength) {
|
|
return aLength > 0 &&
|
|
(IsClusterExtenderExcludingJoiners(aChars[0]) ||
|
|
(IsBidiControl(aChars[0]) &&
|
|
IsSpaceCombiningSequenceTail(aChars + 1, aLength - 1)));
|
|
}
|
|
|
|
static bool IsDiscardable(char16_t ch, nsTextFrameUtils::Flags* aFlags) {
|
|
// Unlike IS_DISCARDABLE, we don't discard \r. \r will be ignored by
|
|
// gfxTextRun and discarding it would force us to copy text in many cases of
|
|
// preformatted text containing \r\n.
|
|
if (ch == CH_SHY) {
|
|
*aFlags |= nsTextFrameUtils::Flags::HasShy;
|
|
return true;
|
|
}
|
|
return IsBidiControl(ch);
|
|
}
|
|
|
|
static bool IsDiscardable(uint8_t ch, nsTextFrameUtils::Flags* aFlags) {
|
|
if (ch == CH_SHY) {
|
|
*aFlags |= nsTextFrameUtils::Flags::HasShy;
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
// A segment break, for the purposes of this file, is exactly a line feed.
static bool IsSegmentBreak(char16_t aCh) {
  const bool isLineFeed = (aCh == u'\n');
  return isLineFeed;
}
|
|
|
|
// True for U+0020 SPACE and U+0009 TAB only.
static bool IsSpaceOrTab(char16_t aCh) {
  switch (aCh) {
    case u' ':
    case u'\t':
      return true;
    default:
      return false;
  }
}
|
|
|
|
static bool IsSpaceOrTabOrSegmentBreak(char16_t aCh) {
|
|
return IsSpaceOrTab(aCh) || IsSegmentBreak(aCh);
|
|
}
|
|
|
|
template <typename CharT>
|
|
/* static */
|
|
bool nsTextFrameUtils::IsSkippableCharacterForTransformText(CharT aChar) {
|
|
return aChar == ' ' || aChar == '\t' || aChar == '\n' || aChar == CH_SHY ||
|
|
(aChar > 0xFF && IsBidiControl(aChar));
|
|
}
|
|
|
|
#ifdef DEBUG
// Debug-only consistency check: walks aSkipChars from original offset
// aSkipCharsOffset to its end and asserts that every character marked as
// skipped is one that IsSkippableCharacterForTransformText accepts.
// aText[0] corresponds to original offset aSkipCharsOffset.
template <typename CharT>
static void AssertSkippedExpectedChars(const CharT* aText,
                                       const gfxSkipChars& aSkipChars,
                                       int32_t aSkipCharsOffset) {
  gfxSkipCharsIterator iter(aSkipChars);
  for (iter.AdvanceOriginal(aSkipCharsOffset);
       iter.GetOriginalOffset() < iter.GetOriginalEnd();
       iter.AdvanceOriginal(1)) {
    const CharT current = aText[iter.GetOriginalOffset() - aSkipCharsOffset];
    MOZ_ASSERT(
        !iter.IsOriginalCharSkipped() ||
            nsTextFrameUtils::IsSkippableCharacterForTransformText(current),
        "skipped unexpected character; need to update "
        "IsSkippableCharacterForTransformText?");
  }
}
#endif
|
|
|
|
template <class CharT>
|
|
static CharT* TransformWhiteSpaces(
|
|
const CharT* aText, uint32_t aLength, uint32_t aBegin, uint32_t aEnd,
|
|
bool aHasSegmentBreak, bool& aInWhitespace, CharT* aOutput,
|
|
nsTextFrameUtils::Flags& aFlags,
|
|
nsTextFrameUtils::CompressionMode aCompression, gfxSkipChars* aSkipChars,
|
|
bool aLangIsJapaneseOrChinese) {
|
|
MOZ_ASSERT(aCompression == nsTextFrameUtils::COMPRESS_WHITESPACE ||
|
|
aCompression == nsTextFrameUtils::COMPRESS_WHITESPACE_NEWLINE,
|
|
"whitespaces should be skippable!!");
|
|
// Get the context preceding/following this white space range.
|
|
// For 8-bit text (sizeof CharT == 1), the checks here should get optimized
|
|
// out, and isSegmentBreakSkippable should be initialized to be 'false'.
|
|
bool isSegmentBreakSkippable =
|
|
sizeof(CharT) > 1 &&
|
|
((aBegin > 0 && IS_ZERO_WIDTH_SPACE(aText[aBegin - 1])) ||
|
|
(aEnd < aLength && IS_ZERO_WIDTH_SPACE(aText[aEnd])));
|
|
if (sizeof(CharT) > 1 && !isSegmentBreakSkippable && aBegin > 0 &&
|
|
aEnd < aLength) {
|
|
// Get the characters before and after the segment break, skipping past
|
|
// any default-ignorable characters (e.g. variation selectors, various
|
|
// invisible control chars, etc)
|
|
uint32_t ucs4before, ucs4after;
|
|
uint32_t pos = aBegin;
|
|
do {
|
|
if (pos > 1 && NS_IS_SURROGATE_PAIR(aText[pos - 2], aText[pos - 1])) {
|
|
ucs4before = SURROGATE_TO_UCS4(aText[pos - 2], aText[pos - 1]);
|
|
pos -= 2;
|
|
} else {
|
|
ucs4before = aText[pos - 1];
|
|
pos -= 1;
|
|
}
|
|
} while (IsDefaultIgnorable(ucs4before) && pos > 0);
|
|
|
|
pos = aEnd;
|
|
do {
|
|
if (pos + 1 < aLength &&
|
|
NS_IS_SURROGATE_PAIR(aText[pos], aText[pos + 1])) {
|
|
ucs4after = SURROGATE_TO_UCS4(aText[pos], aText[pos + 1]);
|
|
pos += 2;
|
|
} else {
|
|
ucs4after = aText[pos];
|
|
pos += 1;
|
|
}
|
|
} while (IsDefaultIgnorable(ucs4after) && pos < aLength);
|
|
|
|
// Discard newlines between characters that have F, W, or H EastAsianWidth
|
|
// property and neither side is Hangul.
|
|
// For Japanese/Chinese, also discard if *either* character is a fullwidth/
|
|
// wide punctuation character.
|
|
isSegmentBreakSkippable =
|
|
(IsSegmentBreakSkipChar(ucs4before) &&
|
|
IsSegmentBreakSkipChar(ucs4after)) ||
|
|
(aLangIsJapaneseOrChinese && (IsEastAsianPunctuation(ucs4before) ||
|
|
IsEastAsianPunctuation(ucs4after)));
|
|
}
|
|
|
|
for (uint32_t i = aBegin; i < aEnd; ++i) {
|
|
CharT ch = aText[i];
|
|
bool keepChar = false;
|
|
bool keepTransformedWhiteSpace = false;
|
|
if (IsDiscardable(ch, &aFlags)) {
|
|
aSkipChars->SkipChar();
|
|
continue;
|
|
}
|
|
if (IsSpaceOrTab(ch)) {
|
|
if (aHasSegmentBreak) {
|
|
// If white-space is set to normal, nowrap, or pre-line, white space
|
|
// characters are considered collapsible and all spaces and tabs
|
|
// immediately preceding or following a segment break are removed.
|
|
aSkipChars->SkipChar();
|
|
continue;
|
|
}
|
|
|
|
if (aInWhitespace) {
|
|
aSkipChars->SkipChar();
|
|
continue;
|
|
} else {
|
|
keepTransformedWhiteSpace = true;
|
|
}
|
|
} else {
|
|
// Apply Segment Break Transformation Rules (CSS Text 3 - 4.1.2) for
|
|
// segment break characters.
|
|
if (aCompression == nsTextFrameUtils::COMPRESS_WHITESPACE ||
|
|
// XXX: According to CSS Text 3, a lone CR should not always be
|
|
// kept, but still go through the Segment Break Transformation
|
|
// Rules. However, this is what current modern browser engines
|
|
// (webkit/blink/edge) do. So, once we can get some clarity
|
|
// from the specification issue, we should either remove the
|
|
// lone CR condition here, or leave it here with this comment
|
|
// being rephrased.
|
|
// Please see https://github.com/w3c/csswg-drafts/issues/855.
|
|
ch == '\r') {
|
|
keepChar = true;
|
|
} else {
|
|
// aCompression == COMPRESS_WHITESPACE_NEWLINE
|
|
|
|
// Any collapsible segment break immediately following another
|
|
// collapsible segment break is removed. Then the remaining segment
|
|
// break is either transformed into a space (U+0020) or removed
|
|
// depending on the context before and after the break.
|
|
if (isSegmentBreakSkippable || aInWhitespace) {
|
|
aSkipChars->SkipChar();
|
|
continue;
|
|
}
|
|
isSegmentBreakSkippable = true;
|
|
keepTransformedWhiteSpace = true;
|
|
}
|
|
}
|
|
|
|
if (keepChar) {
|
|
*aOutput++ = ch;
|
|
aSkipChars->KeepChar();
|
|
aInWhitespace = IsSpaceOrTab(ch);
|
|
} else if (keepTransformedWhiteSpace) {
|
|
*aOutput++ = ' ';
|
|
aSkipChars->KeepChar();
|
|
aInWhitespace = true;
|
|
} else {
|
|
MOZ_ASSERT_UNREACHABLE("Should've skipped the character!!");
|
|
}
|
|
}
|
|
return aOutput;
|
|
}
|
|
|
|
template <class CharT>
|
|
CharT* nsTextFrameUtils::TransformText(
|
|
const CharT* aText, uint32_t aLength, CharT* aOutput,
|
|
CompressionMode aCompression, uint8_t* aIncomingFlags,
|
|
gfxSkipChars* aSkipChars, Flags* aAnalysisFlags, const nsAtom* aLanguage) {
|
|
Flags flags = Flags();
|
|
#ifdef DEBUG
|
|
int32_t skipCharsOffset = aSkipChars->GetOriginalCharCount();
|
|
#endif
|
|
|
|
bool lastCharArabic = false;
|
|
if (aCompression == COMPRESS_NONE ||
|
|
aCompression == COMPRESS_NONE_TRANSFORM_TO_SPACE) {
|
|
// Skip discardables.
|
|
uint32_t i;
|
|
for (i = 0; i < aLength; ++i) {
|
|
CharT ch = aText[i];
|
|
if (IsDiscardable(ch, &flags)) {
|
|
aSkipChars->SkipChar();
|
|
} else {
|
|
aSkipChars->KeepChar();
|
|
if (ch > ' ') {
|
|
lastCharArabic = IS_ARABIC_CHAR(ch);
|
|
} else if (aCompression == COMPRESS_NONE_TRANSFORM_TO_SPACE) {
|
|
if (ch == '\t' || ch == '\n') {
|
|
ch = ' ';
|
|
}
|
|
} else {
|
|
// aCompression == COMPRESS_NONE
|
|
if (ch == '\t') {
|
|
flags |= Flags::HasTab;
|
|
} else if (ch == '\n') {
|
|
flags |= Flags::HasNewline;
|
|
}
|
|
}
|
|
*aOutput++ = ch;
|
|
}
|
|
}
|
|
if (lastCharArabic) {
|
|
*aIncomingFlags |= INCOMING_ARABICCHAR;
|
|
} else {
|
|
*aIncomingFlags &= ~INCOMING_ARABICCHAR;
|
|
}
|
|
*aIncomingFlags &= ~INCOMING_WHITESPACE;
|
|
} else {
|
|
bool langIsJapaneseOrChinese = [=]() {
|
|
if (!aLanguage || aLanguage->GetLength() < 2) {
|
|
return false;
|
|
}
|
|
const char16_t* text = aLanguage->GetUTF16String();
|
|
if ((ToLowerCaseASCII(text[0]) == char16_t('j') &&
|
|
ToLowerCaseASCII(text[1]) == char16_t('a')) ||
|
|
(ToLowerCaseASCII(text[0]) == char16_t('z') &&
|
|
ToLowerCaseASCII(text[1]) == char16_t('h'))) {
|
|
return aLanguage->GetLength() == 2 || text[2] == '-';
|
|
}
|
|
return false;
|
|
}();
|
|
bool inWhitespace = (*aIncomingFlags & INCOMING_WHITESPACE) != 0;
|
|
uint32_t i;
|
|
for (i = 0; i < aLength; ++i) {
|
|
CharT ch = aText[i];
|
|
// CSS Text 3 - 4.1. The White Space Processing Rules
|
|
// White space processing in CSS affects only the document white space
|
|
// characters: spaces (U+0020), tabs (U+0009), and segment breaks.
|
|
// Since we need the context of segment breaks and their surrounding
|
|
// white spaces to proceed the white space processing, a consecutive run
|
|
// of spaces/tabs/segment breaks is collected in a first pass loop, then
|
|
// we apply the collapsing and transformation rules to this run in a
|
|
// second pass loop.
|
|
if (IsSpaceOrTabOrSegmentBreak(ch)) {
|
|
bool keepLastSpace = false;
|
|
bool hasSegmentBreak = IsSegmentBreak(ch);
|
|
uint32_t countTrailingDiscardables = 0;
|
|
uint32_t j;
|
|
for (j = i + 1; j < aLength && (IsSpaceOrTabOrSegmentBreak(aText[j]) ||
|
|
IsDiscardable(aText[j], &flags));
|
|
j++) {
|
|
if (IsSegmentBreak(aText[j])) {
|
|
hasSegmentBreak = true;
|
|
}
|
|
}
|
|
// Exclude trailing discardables before checking space combining
|
|
// sequence tail.
|
|
for (; IsDiscardable(aText[j - 1], &flags); j--) {
|
|
countTrailingDiscardables++;
|
|
}
|
|
// If the last white space is followed by a combining sequence tail,
|
|
// exclude it from the range of TransformWhiteSpaces.
|
|
if (sizeof(CharT) > 1 && aText[j - 1] == ' ' && j < aLength &&
|
|
IsSpaceCombiningSequenceTail(&aText[j], aLength - j)) {
|
|
keepLastSpace = true;
|
|
j--;
|
|
}
|
|
if (j > i) {
|
|
aOutput = TransformWhiteSpaces(
|
|
aText, aLength, i, j, hasSegmentBreak, inWhitespace, aOutput,
|
|
flags, aCompression, aSkipChars, langIsJapaneseOrChinese);
|
|
}
|
|
// We need to keep KeepChar()/SkipChar() in order, so process the
|
|
// last white space first, then process the trailing discardables.
|
|
if (keepLastSpace) {
|
|
keepLastSpace = false;
|
|
*aOutput++ = ' ';
|
|
aSkipChars->KeepChar();
|
|
lastCharArabic = false;
|
|
j++;
|
|
}
|
|
for (; countTrailingDiscardables > 0; countTrailingDiscardables--) {
|
|
aSkipChars->SkipChar();
|
|
j++;
|
|
}
|
|
i = j - 1;
|
|
continue;
|
|
}
|
|
// Process characters other than the document white space characters.
|
|
if (IsDiscardable(ch, &flags)) {
|
|
aSkipChars->SkipChar();
|
|
} else {
|
|
*aOutput++ = ch;
|
|
aSkipChars->KeepChar();
|
|
}
|
|
lastCharArabic = IS_ARABIC_CHAR(ch);
|
|
inWhitespace = false;
|
|
}
|
|
|
|
if (lastCharArabic) {
|
|
*aIncomingFlags |= INCOMING_ARABICCHAR;
|
|
} else {
|
|
*aIncomingFlags &= ~INCOMING_ARABICCHAR;
|
|
}
|
|
if (inWhitespace) {
|
|
*aIncomingFlags |= INCOMING_WHITESPACE;
|
|
} else {
|
|
*aIncomingFlags &= ~INCOMING_WHITESPACE;
|
|
}
|
|
}
|
|
|
|
*aAnalysisFlags = flags;
|
|
|
|
#ifdef DEBUG
|
|
AssertSkippedExpectedChars(aText, *aSkipChars, skipCharsOffset);
|
|
#endif
|
|
return aOutput;
|
|
}
|
|
|
|
/*
|
|
* NOTE: The TransformText and IsSkippableCharacterForTransformText template
|
|
* functions are part of the public API of nsTextFrameUtils, while
|
|
* their function bodies are not available in the header. They may stop working
|
|
* (fail to resolve symbol in link time) once their callsites are moved to a
|
|
* different translation unit (e.g. a different unified source file).
|
|
* Explicit instantiating this function template with `uint8_t` and `char16_t`
|
|
* could prevent us from the potential risk.
|
|
*/
|
|
template uint8_t* nsTextFrameUtils::TransformText(
|
|
const uint8_t* aText, uint32_t aLength, uint8_t* aOutput,
|
|
CompressionMode aCompression, uint8_t* aIncomingFlags,
|
|
gfxSkipChars* aSkipChars, Flags* aAnalysisFlags, const nsAtom* aLanguage);
|
|
template char16_t* nsTextFrameUtils::TransformText(
|
|
const char16_t* aText, uint32_t aLength, char16_t* aOutput,
|
|
CompressionMode aCompression, uint8_t* aIncomingFlags,
|
|
gfxSkipChars* aSkipChars, Flags* aAnalysisFlags, const nsAtom* aLanguage);
|
|
template bool nsTextFrameUtils::IsSkippableCharacterForTransformText(
|
|
uint8_t aChar);
|
|
template bool nsTextFrameUtils::IsSkippableCharacterForTransformText(
|
|
char16_t aChar);
|
|
|
|
template <typename CharT>
|
|
static uint32_t DoComputeApproximateLengthWithWhitespaceCompression(
|
|
const CharT* aChars, uint32_t aLength, const nsStyleText* aStyleText) {
|
|
// This is an approximation so we don't really need anything
|
|
// too fancy here.
|
|
uint32_t len;
|
|
if (aStyleText->WhiteSpaceIsSignificant()) {
|
|
return aLength;
|
|
}
|
|
bool prevWS = true; // more important to ignore blocks with
|
|
// only whitespace than get inline boundaries
|
|
// exactly right
|
|
len = 0;
|
|
for (uint32_t i = 0; i < aLength; ++i) {
|
|
CharT c = aChars[i];
|
|
if (c == ' ' || c == '\n' || c == '\t' || c == '\r') {
|
|
if (!prevWS) {
|
|
++len;
|
|
}
|
|
prevWS = true;
|
|
} else {
|
|
++len;
|
|
prevWS = false;
|
|
}
|
|
}
|
|
return len;
|
|
}
|
|
|
|
// Text-node overload: picks the 16-bit or 8-bit buffer of the node's text
// fragment and forwards to the shared template implementation.
uint32_t nsTextFrameUtils::ComputeApproximateLengthWithWhitespaceCompression(
    Text* aText, const nsStyleText* aStyleText) {
  const nsTextFragment& fragment = aText->TextFragment();
  if (!fragment.Is2b()) {
    return DoComputeApproximateLengthWithWhitespaceCompression(
        fragment.Get1b(), fragment.GetLength(), aStyleText);
  }
  return DoComputeApproximateLengthWithWhitespaceCompression(
      fragment.Get2b(), fragment.GetLength(), aStyleText);
}
|
|
|
|
uint32_t nsTextFrameUtils::ComputeApproximateLengthWithWhitespaceCompression(
|
|
const nsAString& aString, const nsStyleText* aStyleText) {
|
|
return DoComputeApproximateLengthWithWhitespaceCompression(
|
|
aString.BeginReading(), aString.Length(), aStyleText);
|
|
}
|
|
|
|
bool nsSkipCharsRunIterator::NextRun() {
|
|
do {
|
|
if (mRunLength) {
|
|
mIterator.AdvanceOriginal(mRunLength);
|
|
NS_ASSERTION(mRunLength > 0,
|
|
"No characters in run (initial length too large?)");
|
|
if (!mSkipped || mLengthIncludesSkipped) {
|
|
mRemainingLength -= mRunLength;
|
|
}
|
|
}
|
|
if (!mRemainingLength) {
|
|
return false;
|
|
}
|
|
int32_t length;
|
|
mSkipped = mIterator.IsOriginalCharSkipped(&length);
|
|
mRunLength = std::min(length, mRemainingLength);
|
|
} while (!mVisitSkipped && mSkipped);
|
|
|
|
return true;
|
|
}
|