Bug 1402247 - Use encoding_rs for XPCOM string encoding conversions. r=Nika,erahm,froydnj.

Correctness improvements:

 * UTF errors are handled safely per spec instead of dangerously truncating
   strings.

 * There are fewer converter implementations.

Performance improvements:

 * The old code did exact buffer length math, which meant doing UTF math twice
   on each input string (once for length calculation and another time for
   conversion). Exact length math is more complicated when handling errors
   properly, which the old code didn't do. The new code does UTF math on the
   string content only once (when converting) but risks allocating more than
   once. There are heuristics in place to lower the probability of
   reallocation in cases where the double math avoidance isn't enough of a
   saving to absorb an allocation and memcpy.

 * Previously, in UTF-16 <-> UTF-8 conversions, an ASCII prefix was optimized
   but a single non-ASCII code point pessimized the rest of the string. The
   new code tries to get back on the fast ASCII path.

 * UTF-16 to Latin1 conversion guarantees less about handling of out-of-range
   input to eliminate an operation from the inner loop on x86/x86_64.

 * When assigning to a pre-existing string, the new code tries to reuse the
   old buffer instead of first releasing the old buffer and then allocating a
   new one.

 * When reallocating from the new code, the memcpy covers only the data that
   is part of the logical length of the old string instead of memcpying the
   whole capacity. (For old callers, the old excess-memcpy behavior is
   preserved because some of those callers are bogus. See bug 1472113.)

 * UTF-8 strings in XPConnect that are in the Latin1 range are passed to
   SpiderMonkey as Latin1.

New features:

 * Conversion between UTF-8 and Latin1 is added in order to enable faster
   future interop between Rust code (or otherwise UTF-8-using code) and text
   node and SpiderMonkey code that uses Latin1.

MozReview-Commit-ID: JaJuExfILM9
This commit is contained in:
Henri Sivonen 2018-07-06 10:44:43 +03:00
parent 4a5309765f
commit 3edc601325
71 changed files with 4059 additions and 2480 deletions

1
Cargo.lock generated
View file

@ -1458,6 +1458,7 @@ name = "nsstring"
version = "0.1.0"
dependencies = [
"bitflags 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
"encoding_rs 0.8.4 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]

View file

@ -4773,7 +4773,7 @@ nsDocShell::LoadErrorPage(nsIURI* aURI, const char16_t* aURL,
nsresult rv = aURI->GetSpec(url);
NS_ENSURE_SUCCESS(rv, rv);
} else if (aURL) {
CopyUTF16toUTF8(aURL, url);
CopyUTF16toUTF8(MakeStringSpan(aURL), url);
} else {
return NS_ERROR_INVALID_POINTER;
}

View file

@ -239,7 +239,7 @@ Exception::GetName(nsAString& aName)
nsXPCException::NameAndFormatForNSResult(mResult, &name, nullptr);
if (name) {
CopyUTF8toUTF16(name, aName);
CopyUTF8toUTF16(mozilla::MakeStringSpan(name), aName);
}
}
}

View file

@ -16,7 +16,6 @@
#include "nsMemory.h"
#include "nsBidiUtils.h"
#include "nsUnicharUtils.h"
#include "nsUTF8Utils.h"
#include "mozilla/CheckedInt.h"
#include "mozilla/MemoryReporting.h"
#include "mozilla/SSE.h"
@ -319,8 +318,8 @@ nsTextFragment::SetTo(const char16_t* aBuffer, int32_t aLength,
}
// Copy data
LossyConvertEncoding16to8 converter(buff);
copy_string(aBuffer, aBuffer+aLength, converter);
LossyConvertUTF16toLatin1(MakeSpan(aBuffer, aLength),
MakeSpan(buff, aLength));
m1b = buff;
mState.mIs2b = false;
}
@ -351,9 +350,7 @@ nsTextFragment::CopyTo(char16_t *aDest, int32_t aOffset, int32_t aCount)
memcpy(aDest, Get2b() + aOffset, sizeof(char16_t) * aCount);
} else {
const char *cp = m1b + aOffset;
const char *end = cp + aCount;
LossyConvertEncoding8to16 converter(aDest);
copy_string(cp, end, converter);
ConvertLatin1toUTF16(MakeSpan(cp, aCount), MakeSpan(aDest, aCount));
}
}
}
@ -440,8 +437,8 @@ nsTextFragment::Append(const char16_t* aBuffer, uint32_t aLength,
// Copy data into buff
char16_t* data = static_cast<char16_t*>(buff->Data());
LossyConvertEncoding8to16 converter(data);
copy_string(m1b, m1b+mState.mLength, converter);
ConvertLatin1toUTF16(MakeSpan(m1b, mState.mLength),
MakeSpan(data, mState.mLength));
memcpy(data + mState.mLength, aBuffer, aLength * sizeof(char16_t));
mState.mLength += aLength;
@ -483,8 +480,8 @@ nsTextFragment::Append(const char16_t* aBuffer, uint32_t aLength,
}
// Copy aBuffer into buff.
LossyConvertEncoding16to8 converter(buff + mState.mLength);
copy_string(aBuffer, aBuffer + aLength, converter);
LossyConvertUTF16toLatin1(MakeSpan(aBuffer, aLength),
MakeSpan(buff + mState.mLength, aLength));
m1b = buff;
mState.mLength += aLength;

View file

@ -592,9 +592,7 @@ void
nsXMLContentSerializer::GenerateNewPrefix(nsAString& aPrefix)
{
aPrefix.Assign('a');
char buf[128];
SprintfLiteral(buf, "%d", mPrefixIndex++);
AppendASCIItoUTF16(buf, aPrefix);
aPrefix.AppendInt(mPrefixIndex++);
}
bool
@ -1251,7 +1249,10 @@ nsXMLContentSerializer::AppendAndTranslateEntities(const nsAString& aStr,
NS_ENSURE_TRUE(aOutputStr.Append(fragmentStart, advanceLength, mozilla::fallible), false);
if (entityText) {
NS_ENSURE_TRUE(AppendASCIItoUTF16(entityText, aOutputStr, mozilla::fallible), false);
NS_ENSURE_TRUE(AppendASCIItoUTF16(mozilla::MakeStringSpan(entityText),
aOutputStr,
mozilla::fallible),
false);
advanceLength++;
}
}

View file

@ -1142,7 +1142,7 @@ Event::GetWidgetEventType(WidgetEvent* aEvent, nsAString& aType)
const char* name = GetEventName(aEvent->mMessage);
if (name) {
CopyASCIItoUTF16(name, aType);
CopyASCIItoUTF16(mozilla::MakeStringSpan(name), aType);
return;
} else if (aEvent->mMessage == eUnidentifiedEvent &&
aEvent->mSpecifiedEventType) {

View file

@ -316,7 +316,7 @@ nsDOMOfflineResourceList::IndexedGetter(uint32_t aIndex, bool& aFound,
}
aFound = true;
CopyUTF8toUTF16(mCachedKeys[aIndex], aURI);
CopyUTF8toUTF16(mozilla::MakeStringSpan(mCachedKeys[aIndex]), aURI);
}
void

View file

@ -606,15 +606,15 @@ ThrowJSExceptionASCII(JSContext *cx, const char *message)
nsAutoString ucex;
if (message) {
AppendASCIItoUTF16(message, ucex);
AppendASCIItoUTF16(mozilla::MakeStringSpan(message), ucex);
AppendASCIItoUTF16(" [plugin exception: ", ucex);
ucex.AppendLiteral(" [plugin exception: ");
}
AppendUTF8toUTF16(ex, ucex);
AppendUTF8toUTF16(mozilla::MakeStringSpan(ex), ucex);
if (message) {
AppendASCIItoUTF16("].", ucex);
ucex.AppendLiteral("].");
}
JSString *str = ::JS_NewUCStringCopyN(cx, ucex.get(), ucex.Length());

View file

@ -855,7 +855,7 @@ nsNPAPIPluginInstance::GetFormValue(nsAString& aValue)
if (NS_FAILED(rv) || !value)
return NS_ERROR_FAILURE;
CopyUTF8toUTF16(value, aValue);
CopyUTF8toUTF16(MakeStringSpan(value), aValue);
// NPPVformValue allocates with NPN_MemAlloc(), which uses
// nsMemory.

View file

@ -216,7 +216,7 @@ PresentationRequest::StartWithDevice(const nsAString& aDeviceId,
char buffer[NSID_LENGTH];
uuid.ToProvidedString(buffer);
nsAutoString id;
CopyASCIItoUTF16(buffer, id);
CopyASCIItoUTF16(MakeSpan(buffer, NSID_LENGTH - 1), id);
nsCOMPtr<nsIPresentationService> service =
do_GetService(PRESENTATION_SERVICE_CONTRACTID);

View file

@ -106,12 +106,11 @@ SRICheck::IntegrityMetadata(const nsAString& aMetadataList,
}
// put a reasonable bound on the length of the metadata
NS_LossyConvertUTF16toASCII metadataList(aMetadataList);
NS_ConvertUTF16toUTF8 metadataList(aMetadataList);
if (metadataList.Length() > SRICheck::MAX_METADATA_LENGTH) {
metadataList.Truncate(SRICheck::MAX_METADATA_LENGTH);
}
SRILOG(("SRICheck::IntegrityMetadata, metadataList=%s", metadataList.get()));
MOZ_ASSERT(metadataList.Length() <= aMetadataList.Length());
// the integrity attribute is a list of whitespace-separated hashes
// and options so we need to look at them one by one and pick the

View file

@ -1534,7 +1534,7 @@ nsWebBrowserPersist::GetExtensionForContentType(const char16_t *aContentType, ch
}
nsAutoCString contentType;
LossyCopyUTF16toASCII(aContentType, contentType);
LossyCopyUTF16toASCII(MakeStringSpan(aContentType), contentType);
nsAutoCString ext;
rv = mMIMEService->GetPrimaryExtension(contentType, EmptyCString(), ext);
if (NS_SUCCEEDED(rv))

View file

@ -312,8 +312,9 @@ txEXSLTFunctionCall::evaluate(txIEvalContext *aContext,
rv = aContext->recycler()->getStringResult(getter_AddRefs(strRes));
NS_ENSURE_SUCCESS(rv, rv);
AppendASCIItoUTF16(sTypes[exprResult->getResultType()],
strRes->mValue);
AppendASCIItoUTF16(
MakeStringSpan(sTypes[exprResult->getResultType()]),
strRes->mValue);
NS_ADDREF(*aResult = strRes);

View file

@ -204,11 +204,16 @@ void txRomanCounter::appendNumber(int32_t aNumber, nsAString& aDest)
// Hundreds
posValue = aNumber / 100;
aNumber %= 100;
AppendASCIItoUTF16(kTxRomanNumbers[posValue + mTableOffset], aDest);
AppendASCIItoUTF16(
mozilla::MakeStringSpan(kTxRomanNumbers[posValue + mTableOffset]), aDest);
// Tens
posValue = aNumber / 10;
aNumber %= 10;
AppendASCIItoUTF16(kTxRomanNumbers[10 + posValue + mTableOffset], aDest);
AppendASCIItoUTF16(
mozilla::MakeStringSpan(kTxRomanNumbers[10 + posValue + mTableOffset]),
aDest);
// Ones
AppendASCIItoUTF16(kTxRomanNumbers[20 + aNumber + mTableOffset], aDest);
AppendASCIItoUTF16(
mozilla::MakeStringSpan(kTxRomanNumbers[20 + aNumber + mTableOffset]),
aDest);
}

View file

@ -65,7 +65,7 @@ void ProcessDefaultValue(const nsAString* aInputString,
const char* aPrependString,
const char* aAppendString)
{
CopyASCIItoUTF16(aDefaultValueString, aOutputString);
CopyASCIItoUTF16(MakeStringSpan(aDefaultValueString), aOutputString);
}
static
@ -92,11 +92,11 @@ void ProcessExtendedValue(const nsAString* aInputString,
aOutputString.Truncate();
if (aInputString) {
if (aPrependString) {
AppendASCIItoUTF16(aPrependString, aOutputString);
AppendASCIItoUTF16(MakeStringSpan(aPrependString), aOutputString);
}
aOutputString.Append(*aInputString);
if (aAppendString) {
AppendASCIItoUTF16(aAppendString, aOutputString);
AppendASCIItoUTF16(MakeStringSpan(aAppendString), aOutputString);
}
}
}

View file

@ -428,10 +428,10 @@ CreateNamedFontEntry(FT_Face aFace, const char* aFilename, uint8_t aIndex)
return nullptr;
}
nsAutoString fontName;
AppendUTF8toUTF16(aFace->family_name, fontName);
AppendUTF8toUTF16(mozilla::MakeStringSpan(aFace->family_name), fontName);
if (aFace->style_name && strcmp("Regular", aFace->style_name)) {
fontName.Append(' ');
AppendUTF8toUTF16(aFace->style_name, fontName);
AppendUTF8toUTF16(mozilla::MakeStringSpan(aFace->style_name), fontName);
}
return FT2FontEntry::CreateFontEntry(aFace, aFilename, aIndex, fontName);
}

View file

@ -130,14 +130,14 @@ GetFaceNames(FcPattern* aFont, const nsAString& aFamilyName,
// get the Postscript name
FcChar8* psname;
if (FcPatternGetString(aFont, FC_POSTSCRIPT_NAME, 0, &psname) == FcResultMatch) {
AppendUTF8toUTF16(ToCharPtr(psname), aPostscriptName);
AppendUTF8toUTF16(MakeStringSpan(ToCharPtr(psname)), aPostscriptName);
}
// get the canonical fullname (i.e. en name or first name)
uint32_t en = FindCanonicalNameIndex(aFont, FC_FULLNAMELANG);
FcChar8* fullname;
if (FcPatternGetString(aFont, FC_FULLNAME, en, &fullname) == FcResultMatch) {
AppendUTF8toUTF16(ToCharPtr(fullname), aFullname);
AppendUTF8toUTF16(MakeStringSpan(ToCharPtr(fullname)), aFullname);
}
// if have fullname, done
@ -154,7 +154,7 @@ GetFaceNames(FcPattern* aFont, const nsAString& aFamilyName,
FcChar8* stylename = nullptr;
FcPatternGetString(aFont, FC_STYLE, en, &stylename);
if (stylename) {
AppendUTF8toUTF16(ToCharPtr(stylename), style);
AppendUTF8toUTF16(MakeStringSpan(ToCharPtr(stylename)), style);
}
if (!style.IsEmpty() && !style.EqualsLiteral("Regular")) {
@ -1578,7 +1578,7 @@ gfxFcPlatformFontList::AddPatternToFontList(FcPattern* aFont,
// add new family if one doesn't already exist
aFamilyName.Truncate();
AppendUTF8toUTF16(ToCharPtr(canonical), aFamilyName);
AppendUTF8toUTF16(MakeStringSpan(ToCharPtr(canonical)), aFamilyName);
nsAutoString keyName(aFamilyName);
ToLowerCase(keyName);
@ -1810,7 +1810,7 @@ GetSystemFontList(nsTArray<nsString>& aListOfFonts, nsAtom *aLangGroup)
// Remove duplicates...
nsAutoString strFamily;
AppendUTF8toUTF16(family, strFamily);
AppendUTF8toUTF16(MakeStringSpan(family), strFamily);
if (aListOfFonts.Contains(strFamily)) {
continue;
}

View file

@ -12,13 +12,13 @@
// third_party/rust/encoding_rs/.
extern crate encoding_rs;
extern crate nsstring;
extern crate nserror;
extern crate nsstring;
use std::slice;
use encoding_rs::*;
use nsstring::*;
use nserror::*;
use nsstring::*;
use std::slice;
// nsStringBuffer's internal bookkeeping takes 8 bytes from
// the allocation. Plus one for termination.
@ -590,3 +590,95 @@ fn checked_min(one: Option<usize>, other: Option<usize>) -> Option<usize> {
/// FFI wrapper around `encoding_rs::mem::is_utf16_bidi`.
///
/// Views `buffer`/`len` as a `&[u16]` and returns that function's result.
///
/// # Safety
///
/// The pointer/length pair is handed to `slice::from_raw_parts` unchecked,
/// so the caller must uphold that function's requirements.
pub unsafe extern "C" fn encoding_mem_is_utf16_bidi(buffer: *const u16, len: usize) -> bool {
    let code_units = ::std::slice::from_raw_parts(buffer, len);
    encoding_rs::mem::is_utf16_bidi(code_units)
}
/// FFI wrapper around `encoding_rs::mem::is_ascii`.
///
/// Views `buffer`/`len` as a `&[u8]` and returns that function's result.
///
/// # Safety
///
/// The pointer/length pair is handed to `slice::from_raw_parts` unchecked,
/// so the caller must uphold that function's requirements.
#[no_mangle]
pub unsafe extern "C" fn encoding_mem_is_ascii(buffer: *const u8, len: usize) -> bool {
    let bytes = ::std::slice::from_raw_parts(buffer, len);
    encoding_rs::mem::is_ascii(bytes)
}
/// FFI wrapper around `encoding_rs::mem::is_basic_latin`.
///
/// Views `buffer`/`len` as a `&[u16]` and returns that function's result.
///
/// # Safety
///
/// The pointer/length pair is handed to `slice::from_raw_parts` unchecked,
/// so the caller must uphold that function's requirements.
#[no_mangle]
pub unsafe extern "C" fn encoding_mem_is_basic_latin(buffer: *const u16, len: usize) -> bool {
    let code_units = ::std::slice::from_raw_parts(buffer, len);
    encoding_rs::mem::is_basic_latin(code_units)
}
/// FFI wrapper around `encoding_rs::mem::is_utf8_latin1`.
///
/// Views `buffer`/`len` as a `&[u8]` and returns that function's result.
///
/// # Safety
///
/// The pointer/length pair is handed to `slice::from_raw_parts` unchecked,
/// so the caller must uphold that function's requirements.
#[no_mangle]
pub unsafe extern "C" fn encoding_mem_is_utf8_latin1(buffer: *const u8, len: usize) -> bool {
    let bytes = ::std::slice::from_raw_parts(buffer, len);
    encoding_rs::mem::is_utf8_latin1(bytes)
}
/// FFI wrapper around `encoding_rs::mem::is_utf16_latin1`.
///
/// Views `buffer`/`len` as a `&[u16]` and returns that function's result.
///
/// # Safety
///
/// The pointer/length pair is handed to `slice::from_raw_parts` unchecked,
/// so the caller must uphold that function's requirements.
#[no_mangle]
pub unsafe extern "C" fn encoding_mem_is_utf16_latin1(buffer: *const u16, len: usize) -> bool {
    let code_units = ::std::slice::from_raw_parts(buffer, len);
    encoding_rs::mem::is_utf16_latin1(code_units)
}
/// FFI wrapper around `encoding_rs::mem::is_str_latin1`.
///
/// Views `buffer`/`len` as a byte slice, reinterprets it as `&str` without
/// validation, and returns the wrapped function's result.
///
/// # Safety
///
/// The pointer/length pair is handed to `slice::from_raw_parts` unchecked,
/// and the bytes go through `str::from_utf8_unchecked`, so the caller must
/// guarantee both a valid slice and valid UTF-8 content.
#[no_mangle]
pub unsafe extern "C" fn encoding_mem_is_str_latin1(buffer: *const u8, len: usize) -> bool {
    let bytes = ::std::slice::from_raw_parts(buffer, len);
    let text = ::std::str::from_utf8_unchecked(bytes);
    encoding_rs::mem::is_str_latin1(text)
}
/// FFI wrapper around `encoding_rs::mem::convert_utf16_to_latin1_lossy`.
///
/// Views `src`/`src_len` as the input `&[u16]` and `dst`/`dst_len` as the
/// output `&mut [u8]`, then runs the conversion. Nothing is returned.
///
/// # Safety
///
/// Both pointer/length pairs are handed to `slice::from_raw_parts` /
/// `slice::from_raw_parts_mut` unchecked, so the caller must uphold those
/// functions' requirements.
#[no_mangle]
pub unsafe extern "C" fn encoding_mem_convert_utf16_to_latin1_lossy(
    src: *const u16,
    src_len: usize,
    dst: *mut u8,
    dst_len: usize,
) {
    let input = ::std::slice::from_raw_parts(src, src_len);
    let output = ::std::slice::from_raw_parts_mut(dst, dst_len);
    encoding_rs::mem::convert_utf16_to_latin1_lossy(input, output);
}
/// FFI wrapper around `encoding_rs::mem::convert_utf8_to_latin1_lossy`.
///
/// Views `src`/`src_len` as the input `&[u8]` and `dst`/`dst_len` as the
/// output `&mut [u8]`, runs the conversion, and returns the `usize` that
/// the wrapped function reports.
///
/// # Safety
///
/// Both pointer/length pairs are handed to `slice::from_raw_parts` /
/// `slice::from_raw_parts_mut` unchecked, so the caller must uphold those
/// functions' requirements.
#[no_mangle]
pub unsafe extern "C" fn encoding_mem_convert_utf8_to_latin1_lossy(
    src: *const u8,
    src_len: usize,
    dst: *mut u8,
    dst_len: usize,
) -> usize {
    let input = ::std::slice::from_raw_parts(src, src_len);
    let output = ::std::slice::from_raw_parts_mut(dst, dst_len);
    encoding_rs::mem::convert_utf8_to_latin1_lossy(input, output)
}
/// FFI wrapper around `encoding_rs::mem::convert_latin1_to_utf16`.
///
/// Views `src`/`src_len` as the input `&[u8]` and `dst`/`dst_len` as the
/// output `&mut [u16]`, then runs the conversion. Nothing is returned.
///
/// # Safety
///
/// Both pointer/length pairs are handed to `slice::from_raw_parts` /
/// `slice::from_raw_parts_mut` unchecked, so the caller must uphold those
/// functions' requirements.
#[no_mangle]
pub unsafe extern "C" fn encoding_mem_convert_latin1_to_utf16(
    src: *const u8,
    src_len: usize,
    dst: *mut u16,
    dst_len: usize,
) {
    let input = ::std::slice::from_raw_parts(src, src_len);
    let output = ::std::slice::from_raw_parts_mut(dst, dst_len);
    encoding_rs::mem::convert_latin1_to_utf16(input, output);
}
/// FFI wrapper around `encoding_rs::mem::convert_utf16_to_utf8`.
///
/// Views `src`/`src_len` as the input `&[u16]` and `dst`/`dst_len` as the
/// output `&mut [u8]`, runs the conversion, and returns the `usize` that
/// the wrapped function reports.
///
/// # Safety
///
/// Both pointer/length pairs are handed to `slice::from_raw_parts` /
/// `slice::from_raw_parts_mut` unchecked, so the caller must uphold those
/// functions' requirements.
#[no_mangle]
pub unsafe extern "C" fn encoding_mem_convert_utf16_to_utf8(
    src: *const u16,
    src_len: usize,
    dst: *mut u8,
    dst_len: usize,
) -> usize {
    let input = ::std::slice::from_raw_parts(src, src_len);
    let output = ::std::slice::from_raw_parts_mut(dst, dst_len);
    encoding_rs::mem::convert_utf16_to_utf8(input, output)
}
/// FFI wrapper around `encoding_rs::mem::convert_utf8_to_utf16`.
///
/// Views `src`/`src_len` as the input `&[u8]` and `dst`/`dst_len` as the
/// output `&mut [u16]`, runs the conversion, and returns the `usize` that
/// the wrapped function reports.
///
/// # Safety
///
/// Both pointer/length pairs are handed to `slice::from_raw_parts` /
/// `slice::from_raw_parts_mut` unchecked, so the caller must uphold those
/// functions' requirements.
#[no_mangle]
pub unsafe extern "C" fn encoding_mem_convert_utf8_to_utf16(
    src: *const u8,
    src_len: usize,
    dst: *mut u16,
    dst_len: usize,
) -> usize {
    let input = ::std::slice::from_raw_parts(src, src_len);
    let output = ::std::slice::from_raw_parts_mut(dst, dst_len);
    encoding_rs::mem::convert_utf8_to_utf16(input, output)
}

View file

@ -42,11 +42,13 @@ NS_GetComplexLineBreaks(const char16_t* aText, uint32_t aLength,
aBreakBefore[++u16Offset] = false; // Skip high surrogate
++u16Offset;
bool err;
uint32_t ch = UTF8CharEnumerator::NextChar(&p, end, &err);
// We're iterating over text obtained from NS_ConvertUTF16toUTF8,
// so we know we have valid UTF-8 and don't need to check for
// errors.
uint32_t ch = UTF8CharEnumerator::NextChar(&p, end);
++attr;
if (ch == 0 || err) {
if (!ch) {
// pango_break (pango 1.16.2) only analyses text before the
// first NUL (but sets one extra attr). Workaround loop to call
// pango_break again to analyse after the NUL is done somewhere else

View file

@ -477,7 +477,10 @@ GeckoChildProcessHost::GetChildLogName(const char* origLogName,
// points or symlinks or the sandbox will reject rules to allow writing.
std::wstring resolvedPath(NS_ConvertUTF8toUTF16(absPath).get());
if (widget::WinUtils::ResolveJunctionPointsAndSymLinks(resolvedPath)) {
AppendUTF16toUTF8(resolvedPath.c_str(), buffer);
AppendUTF16toUTF8(
MakeSpan(reinterpret_cast<const char16_t*>(resolvedPath.data()),
resolvedPath.size()),
buffer);
} else
#endif
{

View file

@ -2104,7 +2104,7 @@ nsXPCComponents_Utils::ReportError(HandleValue error, HandleValue stack, JSConte
if (err) {
// It's a proper JS Error
nsAutoString fileUni;
CopyUTF8toUTF16(err->filename, fileUni);
CopyUTF8toUTF16(mozilla::MakeStringSpan(err->filename), fileUni);
uint32_t column = err->tokenOffset();

View file

@ -282,36 +282,77 @@ XPCConvert::NativeData2JS(MutableHandleValue d, const void* s,
return true;
}
const uint32_t len = CalcUTF8ToUnicodeLength(*utf8String);
// The cString is not empty at this point, but the calculated
// UTF-16 length is zero, meaning no valid conversion exists.
if (!len)
return false;
const size_t buffer_size = (len + 1) * sizeof(char16_t);
char16_t* buffer =
static_cast<char16_t*>(JS_malloc(cx, buffer_size));
if (!buffer)
return false;
uint32_t copied;
if (!UTF8ToUnicodeBuffer(*utf8String, buffer, &copied) ||
len != copied) {
// Copy or conversion during copy failed. Did not copy the
// whole string.
JS_free(cx, buffer);
uint32_t len = utf8String->Length();
auto allocLen = CheckedUint32(len) + 1;
if (!allocLen.isValid()) {
return false;
}
// JS_NewUCString takes ownership on success, i.e. a
// Usage of UTF-8 in XPConnect is mostly for things that are
// almost always ASCII, so the inexact allocations below
// should be fine.
if (IsUTF8Latin1(*utf8String)) {
char* buffer = static_cast<char*>(JS_malloc(cx, allocLen.value()));
if (!buffer) {
return false;
}
size_t written =
LossyConvertUTF8toLatin1(*utf8String, MakeSpan(buffer, len));
buffer[written] = 0;
// JS_NewLatin1String takes ownership on success, i.e. a
// successful call will make it the responsibility of the JS VM
// to free the buffer.
// written can never exceed len, so the truncation is OK.
JSString* str = JS_NewLatin1String(
cx, reinterpret_cast<JS::Latin1Char*>(buffer), written);
if (!str) {
JS_free(cx, buffer);
return false;
}
d.setString(str);
return true;
}
// 1-byte sequences decode to 1 UTF-16 code unit
// 2-byte sequences decode to 1 UTF-16 code unit
// 3-byte sequences decode to 1 UTF-16 code unit
// 4-byte sequences decode to 2 UTF-16 code units
// So the number of output code units never exceeds
// the number of input code units (but see the comment
// below). allocLen already takes the zero terminator
// into account.
allocLen *= sizeof(char16_t);
if (!allocLen.isValid()) {
return false;
}
char16_t* buffer =
static_cast<char16_t*>(JS_malloc(cx, allocLen.value()));
if (!buffer) {
return false;
}
// For its internal simplicity, ConvertUTF8toUTF16 requires the
// destination to be one code unit longer than the source, but
// it never actually writes more code units than the number of
// code units in the source. That's why it's OK to claim the
// output buffer has len + 1 space but then still expect to
// have space for the zero terminator.
//
// The span length is counted in char16_t code units, not bytes.
// allocLen has already been scaled by sizeof(char16_t) for
// JS_malloc, so it must not be used as the span length: that
// would describe a destination twice as large as the allocation.
// len + 1 cannot overflow because CheckedUint32(len) + 1 was
// validated before the allocation.
size_t written =
    ConvertUTF8toUTF16(*utf8String, MakeSpan(buffer, len + 1));
MOZ_RELEASE_ASSERT(written <= len);
buffer[written] = 0;
// JS_NewUCStringDontDeflate takes ownership on success, i.e. a
// successful call will make it the responsibility of the JS VM
// to free the buffer.
JSString* str = JS_NewUCString(cx, buffer, len);
JSString* str = JS_NewUCStringDontDeflate(cx, buffer, written);
if (!str) {
JS_free(cx, buffer);
return false;
}
d.setString(str);
return true;
}
@ -1223,9 +1264,10 @@ JSErrorToXPCException(const char* toStringResult,
if (report) {
nsAutoString bestMessage;
if (report && report->message()) {
CopyUTF8toUTF16(report->message().c_str(), bestMessage);
CopyUTF8toUTF16(mozilla::MakeStringSpan(report->message().c_str()),
bestMessage);
} else if (toStringResult) {
CopyUTF8toUTF16(toStringResult, bestMessage);
CopyUTF8toUTF16(mozilla::MakeStringSpan(toStringResult), bestMessage);
} else {
bestMessage.AssignLiteral("JavaScript Error");
}

View file

@ -193,7 +193,7 @@ xpc::ErrorBase::Init(JSErrorBase* aReport)
if (!aReport->filename)
mFileName.SetIsVoid(true);
else
CopyASCIItoUTF16(aReport->filename, mFileName);
CopyUTF8toUTF16(mozilla::MakeStringSpan(aReport->filename), mFileName);
mLineNumber = aReport->lineno;
mColumn = aReport->column;
@ -218,7 +218,7 @@ xpc::ErrorReport::Init(JSErrorReport* aReport, const char* aToStringResult,
ErrorReportToMessageString(aReport, mErrorMsg);
if (mErrorMsg.IsEmpty() && aToStringResult) {
AppendUTF8toUTF16(aToStringResult, mErrorMsg);
AppendUTF8toUTF16(mozilla::MakeStringSpan(aToStringResult), mErrorMsg);
}
mSourceLine.Assign(aReport->linebuf(), aReport->linebufLength());
@ -270,11 +270,11 @@ static LazyLogModule gJSDiagnostics("JSDiagnostics");
void
xpc::ErrorBase::AppendErrorDetailsTo(nsCString& error)
{
error.Append(NS_LossyConvertUTF16toASCII(mFileName));
AppendUTF16toUTF8(mFileName, error);
error.AppendLiteral(", line ");
error.AppendInt(mLineNumber, 10);
error.AppendLiteral(": ");
error.Append(NS_LossyConvertUTF16toASCII(mErrorMsg));
AppendUTF16toUTF8(mErrorMsg, error);
}
void
@ -344,8 +344,8 @@ xpc::ErrorReport::LogToConsoleWithStack(JS::HandleObject aStack,
MOZ_LOG(gJSDiagnostics,
JSREPORT_IS_WARNING(mFlags) ? LogLevel::Warning : LogLevel::Error,
("file %s, line %u\n%s", NS_LossyConvertUTF16toASCII(mFileName).get(),
mLineNumber, NS_LossyConvertUTF16toASCII(mErrorMsg).get()));
("file %s, line %u\n%s", NS_ConvertUTF16toUTF8(mFileName).get(),
mLineNumber, NS_ConvertUTF16toUTF8(mErrorMsg).get()));
// Log to the console. We do this last so that we can simply return if
// there's no console service without affecting the other reporting

View file

@ -102,7 +102,7 @@ static void logMessage(nsIContent* aContent,
void Area::ParseCoords(const nsAString& aSpec)
{
char* cp = ToNewCString(aSpec);
char* cp = ToNewUTF8String(aSpec);
if (cp) {
char *tptr;
char *n_str;

View file

@ -2460,7 +2460,7 @@ HttpBaseChannel::NotifySetCookie(char const *aCookie)
nsCOMPtr<nsIObserverService> obs = services::GetObserverService();
if (obs) {
nsAutoString cookie;
CopyASCIItoUTF16(aCookie, cookie);
CopyASCIItoUTF16(mozilla::MakeStringSpan(aCookie), cookie);
obs->NotifyObservers(static_cast<nsIChannel*>(this),
"http-on-response-set-cookie",
cookie.get());

View file

@ -77,11 +77,9 @@ nsHttpBasicAuth::GenerateCredentials(nsIHttpAuthenticableChannel *authChannel,
// we work with UTF-8 around here
nsAutoCString userpass;
CopyUTF16toUTF8(user, userpass);
CopyUTF16toUTF8(mozilla::MakeStringSpan(user), userpass);
userpass.Append(':'); // always send a ':' (see bug 129565)
if (password) {
AppendUTF16toUTF8(password, userpass);
}
AppendUTF16toUTF8(mozilla::MakeStringSpan(password), userpass);
nsAutoCString authString;
nsresult rv = Base64Encode(userpass, authString);

View file

@ -6345,15 +6345,15 @@ nsHttpChannel::BeginConnect()
do_GetService(NS_CONSOLESERVICE_CONTRACTID);
if (consoleService) {
nsAutoString message(NS_LITERAL_STRING("Alternate Service Mapping found: "));
AppendASCIItoUTF16(scheme.get(), message);
AppendASCIItoUTF16(scheme, message);
message.AppendLiteral(u"://");
AppendASCIItoUTF16(host.get(), message);
AppendASCIItoUTF16(host, message);
message.AppendLiteral(u":");
message.AppendInt(port);
message.AppendLiteral(u" to ");
AppendASCIItoUTF16(scheme.get(), message);
AppendASCIItoUTF16(scheme, message);
message.AppendLiteral(u"://");
AppendASCIItoUTF16(mapping->AlternateHost().get(), message);
AppendASCIItoUTF16(mapping->AlternateHost(), message);
message.AppendLiteral(u":");
message.AppendInt(mapping->AlternatePort());
consoleService->LogStringMessage(message.get());

View file

@ -170,8 +170,7 @@ nsHtml5String::FromLiteral(const char* aLiteral)
MOZ_CRASH("Out of memory.");
}
char16_t* data = reinterpret_cast<char16_t*>(buffer->Data());
LossyConvertEncoding8to16 converter(data);
converter.write(aLiteral, length);
ConvertLatin1toUTF16(MakeSpan(aLiteral, length), MakeSpan(data, length));
data[length] = 0;
return nsHtml5String(reinterpret_cast<uintptr_t>(buffer.forget().take()) |
eStringBuffer);

View file

@ -31,6 +31,7 @@
using mozilla::fallible;
using mozilla::LogLevel;
using mozilla::MakeStringSpan;
#define kExpatSeparatorChar 0xFFFF
@ -578,11 +579,11 @@ nsExpatDriver::HandleExternalEntityRef(const char16_t *openEntityNames,
if (NS_FAILED(rv)) {
#ifdef DEBUG
nsCString message("Failed to open external DTD: publicId \"");
AppendUTF16toUTF8(publicId, message);
AppendUTF16toUTF8(MakeStringSpan(publicId), message);
message += "\" systemId \"";
AppendUTF16toUTF8(systemId, message);
AppendUTF16toUTF8(MakeStringSpan(systemId), message);
message += "\" base \"";
AppendUTF16toUTF8(base, message);
AppendUTF16toUTF8(MakeStringSpan(base), message);
message += "\" URL \"";
AppendUTF16toUTF8(absURL, message);
message += "\"";

View file

@ -909,7 +909,8 @@ AppendBMPtoUTF16(const UniquePLArenaPool& arena,
false, data, len, utf8Val, utf8ValLen, &utf8ValLen)) {
return NS_ERROR_FAILURE;
}
AppendUTF8toUTF16((char*)utf8Val, text);
AppendUTF8toUTF16(MakeSpan(reinterpret_cast<char*>(utf8Val), utf8ValLen),
text);
return NS_OK;
}

View file

@ -367,7 +367,7 @@ NS_IMETHODIMP
nsNSSCertificate::GetEmailAddress(nsAString& aEmailAddress)
{
if (mCert->emailAddr) {
LossyUTF8ToUTF16(mCert->emailAddr, strlen(mCert->emailAddr), aEmailAddress);
CopyUTF8toUTF16(MakeStringSpan(mCert->emailAddr), aEmailAddress);
} else {
GetPIPNSSBundleString("CertNoEmailAddress", aEmailAddress);
}

View file

@ -2124,7 +2124,7 @@ fn static_assert() {
};
for (servo, gecko) in v.0.areas.into_iter().zip(refptr.mNamedAreas.iter_mut()) {
gecko.mName.assign_utf8(&*servo.name);
gecko.mName.assign_str(&*servo.name);
gecko.mColumnStart = servo.columns.start;
gecko.mColumnEnd = servo.columns.end;
gecko.mRowStart = servo.rows.start;
@ -2132,7 +2132,7 @@ fn static_assert() {
}
for (servo, gecko) in v.0.strings.into_iter().zip(refptr.mTemplates.iter_mut()) {
gecko.assign_utf8(&*servo);
gecko.assign_str(&*servo);
}
self.gecko.mGridTemplateAreas.set_move(refptr.get())
@ -4189,8 +4189,8 @@ fn static_assert() {
};
for (servo, gecko) in other.0.into_iter().zip(refptr.mQuotePairs.iter_mut()) {
gecko.first.assign_utf8(&servo.0);
gecko.second.assign_utf8(&servo.1);
gecko.first.assign_str(&servo.0);
gecko.second.assign_str(&servo.1);
}
self.gecko.mQuotes.set_move(refptr.get())
@ -4728,7 +4728,7 @@ fn static_assert() {
(structs::NS_STYLE_TEXT_EMPHASIS_STYLE_STRING, &**s)
},
};
self.gecko.mTextEmphasisStyleString.assign_utf8(s);
self.gecko.mTextEmphasisStyleString.assign_str(s);
self.gecko.mTextEmphasisStyle = te as u8;
}
@ -4829,7 +4829,7 @@ fn static_assert() {
TextOverflowSide::Clip => structs::NS_STYLE_TEXT_OVERFLOW_CLIP,
TextOverflowSide::Ellipsis => structs::NS_STYLE_TEXT_OVERFLOW_ELLIPSIS,
TextOverflowSide::String(ref s) => {
side.mString.assign_utf8(s);
side.mString.assign_str(s);
structs::NS_STYLE_TEXT_OVERFLOW_STRING
}
};
@ -5461,7 +5461,7 @@ clip-path
};
counter_func.mIdent.assign(name.0.as_slice());
if content_type == StyleContentType::Counters {
counter_func.mSeparator.assign_utf8(sep);
counter_func.mSeparator.assign_str(sep);
}
style.to_gecko_value(&mut counter_func.mCounterStyle, device);
}

View file

@ -3520,7 +3520,7 @@ pub extern "C" fn Servo_DeclarationBlock_GetNthProperty(
read_locked_arc(declarations, |decls: &PropertyDeclarationBlock| {
if let Some(decl) = decls.declarations().get(index as usize) {
let result = unsafe { result.as_mut().unwrap() };
result.assign_utf8(&decl.id().name());
result.assign_str(&decl.id().name());
true
} else {
false

View file

@ -10,4 +10,4 @@ gecko_debug = []
[dependencies]
bitflags = "1.0"
encoding_rs = "0.8.0"

View file

@ -0,0 +1,712 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
extern crate encoding_rs;
use std::slice;
use super::nsACString;
use super::nsAString;
use super::nsCStringLike;
use super::BulkWriteOk;
use super::Gecko_FallibleAssignCString;
use super::Latin1StringLike;
use conversions::encoding_rs::mem::*;
use conversions::encoding_rs::Encoding;
/// Worst-case output length math stated in the docs of
/// `convert_utf16_to_utf8()`: `a * 3 + 1`.
///
/// Returns `None` if either the multiplication or the addition would
/// overflow `usize`.
#[inline(always)]
fn times_three_plus_one(a: usize) -> Option<usize> {
    let tripled = a.checked_mul(3)?;
    tripled.checked_add(1)
}
/// Length math for conversions that need exactly as many output code
/// units as there are input code units: always `Some(a)`.
///
/// Returns `Option` so that it has the same shape as the other length
/// math helpers.
#[inline(always)]
fn identity(a: usize) -> Option<usize> {
    Option::<usize>::from(a)
}
/// Length math for conversions that want one extra code unit of output
/// space: `a + 1`, or `None` if that would overflow `usize`.
#[inline(always)]
fn plus_one(a: usize) -> Option<usize> {
    usize::checked_add(a, 1)
}
/// Typical cache line size per
/// https://stackoverflow.com/questions/14707803/line-size-of-l1-and-l2-caches
///
/// For consistent behavior, not trying to use 128 on aarch64
/// or other fanciness like that.
const CACHE_LINE: usize = 64;
/// Bit mask (`CACHE_LINE - 1`) that, ANDed with an address, yields the
/// address's byte offset within a `CACHE_LINE`-sized line. This works as
/// a mask because `CACHE_LINE` is a power of two.
const CACHE_LINE_MASK: usize = CACHE_LINE - 1;
/// Cheap heuristic: checks whether the leading part of `buffer` passes
/// `is_ascii`.
///
/// Buffers of at most `CACHE_LINE` bytes are examined in full. Longer
/// buffers are examined only up to the end of the cache line that
/// contains the first byte, to keep the check minimally disruptive.
#[inline(always)]
fn starts_with_ascii(buffer: &[u8]) -> bool {
    let len = buffer.len();
    let prefix_len = if len > CACHE_LINE {
        // Offset of the first byte within its cache line.
        let misalignment = (buffer.as_ptr() as usize) & CACHE_LINE_MASK;
        CACHE_LINE - misalignment
    } else {
        len
    };
    is_ascii(&buffer[..prefix_len])
}
/// Cheap heuristic: checks whether the leading part of `buffer` passes
/// `is_basic_latin`.
///
/// Buffers of at most `CACHE_LINE` code units are examined in full. For
/// longer buffers we look at two cache lines' worth of bytes (less the
/// offset of the first code unit within its line). One cache line is not
/// enough in the UTF-16 case, because it would not catch non-ASCII Latin
/// with high enough probability for Latin-script languages that have
/// relatively infrequent non-ASCII characters.
#[inline(always)]
fn starts_with_basic_latin(buffer: &[u16]) -> bool {
    let len = buffer.len();
    let prefix_len = if len > CACHE_LINE {
        // Offset of the first code unit within its cache line, in bytes.
        let misalignment = (buffer.as_ptr() as usize) & CACHE_LINE_MASK;
        // Byte budget halved to turn it into a count of two-byte code units.
        (CACHE_LINE * 2 - misalignment) / 2
    } else {
        len
    };
    is_basic_latin(&buffer[..prefix_len])
}
// Ignoring the copy avoidance complications of conversions between Latin1 and
// UTF-8, a conversion function has the outward form of
// `fn F(&mut self, other: &[T], old_len: usize) -> Result<BulkWriteOk, ()>`,
// where `T` is either `u8` or `u16`. `other` is the slice whose converted
// contents are to be appended to `self` and `old_len` indicates how many
// code units of `self` are to be preserved (0 for the assignment case and
// `self.len()` for the appending case).
//
// As implementation parameters, a conversion function needs to know the
// math for computing the worst case conversion length in code units given
// the input length in code units. For a _constant conversion_ the number
// of code units the conversion produces equals the number of code units
// in the input. For a _shrinking conversion_ the maximum number of code
// units the conversion can produce equals the number of code units in
// the input, but the conversion can produce fewer code units. Still, due
// to implementation details, the function might want _one_ unit more of
// output space. For an _expanding conversion_ (no need for macro), the
// minimum number of code units produced by the conversion is the number
// of code units in the input, but the conversion can produce more.
//
// Copy avoidance conversions avoid copying a refcounted buffer when it's
// ASCII-only.
//
// Internally, a conversion function needs to know the underlying
// encoding_rs conversion function, the math for computing the required
// output buffer size and, depending on the case, the underlying
// encoding_rs ASCII prefix handling function.
/// Generates a method for a conversion whose output may contain fewer code
/// units than its input.
///
/// The generated method has the form
/// `fn $name(&mut self, other: $other_ty, old_len: usize) -> Result<BulkWriteOk, ()>`.
/// It asks `bulk_write` for `old_len + $math(other.len())` code units,
/// converts `other` into the part of the buffer starting at `old_len`, and
/// finishes the write at `old_len + written`. Overflow in the length math
/// yields `Err(())`.
///
/// `$name` is the name of the function to generate
/// `$convert` is the underlying `encoding_rs::mem` function to use
/// `$other_ty` is the type of the input slice
/// `$math` is the worst-case length math that `$convert` expects
macro_rules! shrinking_conversion {
    (name = $name:ident,
     convert = $convert:ident,
     other_ty = $other_ty:ty,
     math = $math:ident) => (
        fn $name(&mut self, other: $other_ty, old_len: usize) -> Result<BulkWriteOk, ()> {
            // Worst-case number of code units the conversion may need.
            let worst_case = $math(other.len()).ok_or(())?;
            let capacity = old_len.checked_add(worst_case).ok_or(())?;
            let mut handle = unsafe { self.bulk_write(capacity, old_len, false)? };
            let written = {
                // Only the region past the preserved prefix is written to.
                let tail = &mut handle.as_mut_slice()[old_len..];
                $convert(other, tail)
            };
            Ok(handle.finish(old_len + written, true))
        }
    )
}
/// Generates a method for a conversion that produces exactly as many code
/// units as it consumes.
///
/// Macro arguments:
///
/// `$name` is the name of the method to generate
/// `$convert` is the underlying `encoding_rs::mem` function to use
/// `$other_ty` is the type of the input slice
///
/// The generated method preserves the first `old_len` code units of `self`,
/// writes the conversion result right after them and returns `Err(())` when
/// the resulting length overflows or when `bulk_write` fails.
macro_rules! constant_conversion {
    (name = $name:ident,
     convert = $convert:ident,
     other_ty = $other_ty:ty) => (
        fn $name(&mut self, other: $other_ty, old_len: usize) -> Result<BulkWriteOk, ()> {
            // The final length is known up front: one output unit per input unit.
            let total_len = match old_len.checked_add(other.len()) {
                Some(sum) => sum,
                None => return Err(()),
            };
            let mut handle = unsafe { self.bulk_write(total_len, old_len, true)? };
            // The number of units written is implied by the input length, so
            // the return value of `$convert` is not needed.
            $convert(other, &mut handle.as_mut_slice()[old_len..]);
            Ok(handle.finish(total_len, false))
        }
    )
}
/// Generates a front-end method that tries to avoid a copy: when `other` is
/// an abstract string (as opposed to a borrowed slice), no old data of
/// `self` is to be kept (`old_len == 0`) and `other` is entirely ASCII,
/// `other` is handed to `Gecko_FallibleAssignCString` instead of being
/// converted.
///
/// `$name` is the name of the method to generate
/// `$implementation` is the underlying conversion method that takes a slice
/// and that is used when the incoming buffer can't just be assigned as-is
/// `$string_like` is the trait bound on the kind of input taken
macro_rules! ascii_copy_avoidance {
    (name = $name:ident,
     implementation = $implementation:ident,
     string_like = $string_like:ident) => (
        fn $name<T: $string_like + ?Sized>(&mut self, other: &T, old_len: usize) -> Result<BulkWriteOk, ()> {
            let adapter = other.adapt();
            let other_slice = adapter.as_ref();
            // The ASCII prefix length is only computed when assigning the
            // buffer as-is is an option at all.
            let num_ascii = if adapter.is_abstract() && old_len == 0 {
                Some(Encoding::ascii_valid_up_to(other_slice))
            } else {
                None
            };
            if num_ascii == Some(other_slice.len()) {
                // All ASCII. Calling something whose argument can be
                // obtained from a fresh adapter rather than from an
                // nsStringLike keeps nsStringLike and Latin1StringLike free
                // of lifetime interdependencies.
                let assigned = unsafe {
                    Gecko_FallibleAssignCString(self, other.adapt().as_ptr())
                };
                return if assigned { Ok(BulkWriteOk{}) } else { Err(()) };
            }
            // Not adoptable: convert, passing along the ASCII prefix length
            // if it was computed above.
            self.$implementation(other_slice, old_len, num_ascii)
        }
    )
}
impl nsAString {
    // Valid UTF-8 to UTF-16.
    // Documentation says the destination buffer needs to have
    // as many code units as the input.
    shrinking_conversion!(
        name = fallible_append_str_impl,
        convert = convert_str_to_utf16,
        other_ty = &str,
        math = identity
    );

    /// Replaces the content of this string with the UTF-16 conversion of
    /// `other`, which is valid UTF-8.
    ///
    /// # Panics
    ///
    /// Panics with "Out of memory" if the fallible conversion fails.
    pub fn assign_str(&mut self, other: &str) {
        self.fallible_assign_str(other).expect("Out of memory")
    }

    /// Fallibly replaces the content of this string with the UTF-16
    /// conversion of `other`, which is valid UTF-8.
    pub fn fallible_assign_str(&mut self, other: &str) -> Result<(), ()> {
        self.fallible_append_str_impl(other, 0)?;
        Ok(())
    }

    /// Appends the UTF-16 conversion of `other`, which is valid UTF-8, to
    /// this string.
    ///
    /// # Panics
    ///
    /// Panics with "Out of memory" if the fallible conversion fails.
    pub fn append_str(&mut self, other: &str) {
        self.fallible_append_str(other).expect("Out of memory")
    }

    /// Fallibly appends the UTF-16 conversion of `other`, which is valid
    /// UTF-8, to this string.
    pub fn fallible_append_str(&mut self, other: &str) -> Result<(), ()> {
        let old_len = self.len();
        self.fallible_append_str_impl(other, old_len)?;
        Ok(())
    }

    // Potentially-invalid UTF-8 to UTF-16.
    // Documentation says the destination buffer needs to have
    // one more code unit than the input.
    shrinking_conversion!(
        name = fallible_append_utf8_impl,
        convert = convert_utf8_to_utf16,
        other_ty = &[u8],
        math = plus_one
    );

    /// Replaces the content of this string with the UTF-16 conversion of the
    /// potentially-invalid UTF-8 in `other`; invalid sequences become the
    /// REPLACEMENT CHARACTER.
    ///
    /// # Panics
    ///
    /// Panics with "Out of memory" if the fallible conversion fails.
    pub fn assign_utf8(&mut self, other: &[u8]) {
        self.fallible_assign_utf8(other).expect("Out of memory")
    }

    /// Fallibly replaces the content of this string with the UTF-16
    /// conversion of the potentially-invalid UTF-8 in `other`; invalid
    /// sequences become the REPLACEMENT CHARACTER.
    pub fn fallible_assign_utf8(&mut self, other: &[u8]) -> Result<(), ()> {
        self.fallible_append_utf8_impl(other, 0)?;
        Ok(())
    }

    /// Appends the UTF-16 conversion of the potentially-invalid UTF-8 in
    /// `other` to this string; invalid sequences become the REPLACEMENT
    /// CHARACTER.
    ///
    /// # Panics
    ///
    /// Panics with "Out of memory" if the fallible conversion fails.
    pub fn append_utf8(&mut self, other: &[u8]) {
        self.fallible_append_utf8(other).expect("Out of memory")
    }

    /// Fallibly appends the UTF-16 conversion of the potentially-invalid
    /// UTF-8 in `other` to this string; invalid sequences become the
    /// REPLACEMENT CHARACTER.
    pub fn fallible_append_utf8(&mut self, other: &[u8]) -> Result<(), ()> {
        let old_len = self.len();
        self.fallible_append_utf8_impl(other, old_len)?;
        Ok(())
    }

    // Latin1 to UTF-16
    constant_conversion!(
        name = fallible_append_latin1_impl,
        convert = convert_latin1_to_utf16,
        other_ty = &[u8]
    );

    /// Replaces the content of this string with the UTF-16 conversion of
    /// the Latin1 (i.e. byte value equals scalar value; not windows-1252!)
    /// input `other`.
    ///
    /// # Panics
    ///
    /// Panics with "Out of memory" if the fallible conversion fails.
    pub fn assign_latin1(&mut self, other: &[u8]) {
        self.fallible_assign_latin1(other).expect("Out of memory")
    }

    /// Fallibly replaces the content of this string with the UTF-16
    /// conversion of the Latin1 (i.e. byte value equals scalar value; not
    /// windows-1252!) input `other`.
    pub fn fallible_assign_latin1(&mut self, other: &[u8]) -> Result<(), ()> {
        self.fallible_append_latin1_impl(other, 0)?;
        Ok(())
    }

    /// Appends the UTF-16 conversion of the Latin1 (i.e. byte value equals
    /// scalar value; not windows-1252!) input `other` to this string.
    ///
    /// # Panics
    ///
    /// Panics with "Out of memory" if the fallible conversion fails.
    pub fn append_latin1(&mut self, other: &[u8]) {
        self.fallible_append_latin1(other).expect("Out of memory")
    }

    /// Fallibly appends the UTF-16 conversion of the Latin1 (i.e. byte value
    /// equals scalar value; not windows-1252!) input `other` to this string.
    pub fn fallible_append_latin1(&mut self, other: &[u8]) -> Result<(), ()> {
        let old_len = self.len();
        self.fallible_append_latin1_impl(other, old_len)?;
        Ok(())
    }
}
impl nsACString {
    // UTF-16 to UTF-8

    /// Converts `other` to UTF-8 and writes the result after the first
    /// `old_len` code units of `self`.
    ///
    /// Returns `Err(())` if the length math overflows or if obtaining the
    /// buffer fails.
    fn fallible_append_utf16_to_utf8_impl(
        &mut self,
        other: &[u16],
        old_len: usize,
    ) -> Result<BulkWriteOk, ()> {
        // The buffer is first sized for ASCII if the first cache line is
        // ASCII. If that turns out not to be enough, it is re-sized for the
        // worst case given the length of the input remaining at that point.
        let (write_start, consumed, mut handle) = if !starts_with_basic_latin(other) {
            // Started with non-ASCII. Size for the worst case right away.
            let worst_case = times_three_plus_one(other.len()).ok_or(())?;
            let capacity = old_len.checked_add(worst_case).ok_or(())?;
            let handle = unsafe { self.bulk_write(capacity, old_len, false)? };
            (old_len, 0, handle)
        } else {
            let optimistic_len = old_len.checked_add(other.len()).ok_or(())?;
            let mut handle = unsafe { self.bulk_write(optimistic_len, old_len, false)? };
            let ascii_len =
                copy_basic_latin_to_ascii(other, &mut handle.as_mut_slice()[old_len..]);
            let write_start = old_len + ascii_len;
            let remaining = other.len() - ascii_len;
            if remaining == 0 {
                // The whole input was copied as ASCII.
                return Ok(handle.finish(write_start, true));
            }
            let worst_case = times_three_plus_one(remaining).ok_or(())?;
            let capacity = write_start.checked_add(worst_case).ok_or(())?;
            unsafe {
                // Keep what has been written so far and make room for the rest.
                handle.restart_bulk_write(capacity, write_start, false)?;
            }
            (write_start, ascii_len, handle)
        };
        let written = convert_utf16_to_utf8(
            &other[consumed..],
            &mut handle.as_mut_slice()[write_start..],
        );
        Ok(handle.finish(write_start + written, true))
    }

    /// Replaces the content of this string with the UTF-8 conversion of the
    /// potentially-invalid UTF-16 in `other`; invalid sequences become the
    /// REPLACEMENT CHARACTER.
    ///
    /// # Panics
    ///
    /// Panics with "Out of memory" if the fallible conversion fails.
    pub fn assign_utf16_to_utf8(&mut self, other: &[u16]) {
        self.fallible_assign_utf16_to_utf8(other).expect("Out of memory")
    }

    /// Fallibly replaces the content of this string with the UTF-8
    /// conversion of the potentially-invalid UTF-16 in `other`; invalid
    /// sequences become the REPLACEMENT CHARACTER.
    pub fn fallible_assign_utf16_to_utf8(&mut self, other: &[u16]) -> Result<(), ()> {
        self.fallible_append_utf16_to_utf8_impl(other, 0)?;
        Ok(())
    }

    /// Appends the UTF-8 conversion of the potentially-invalid UTF-16 in
    /// `other` to this string; invalid sequences become the REPLACEMENT
    /// CHARACTER.
    ///
    /// # Panics
    ///
    /// Panics with "Out of memory" if the fallible conversion fails.
    pub fn append_utf16_to_utf8(&mut self, other: &[u16]) {
        self.fallible_append_utf16_to_utf8(other).expect("Out of memory")
    }

    /// Fallibly appends the UTF-8 conversion of the potentially-invalid
    /// UTF-16 in `other` to this string; invalid sequences become the
    /// REPLACEMENT CHARACTER.
    pub fn fallible_append_utf16_to_utf8(&mut self, other: &[u16]) -> Result<(), ()> {
        let old_len = self.len();
        self.fallible_append_utf16_to_utf8_impl(other, old_len)?;
        Ok(())
    }

    // UTF-16 to Latin1
    constant_conversion!(
        name = fallible_append_utf16_to_latin1_lossy_impl,
        convert = convert_utf16_to_latin1_lossy,
        other_ty = &[u16]
    );

    /// Replaces the content of this string with the Latin1 (scalar value is
    /// byte value; not windows-1252!) conversion of `other`, a UTF-16 string
    /// all of whose code points are below U+0100.
    ///
    /// # Panics
    ///
    /// If the input contains code points above U+00FF or is not valid UTF-16,
    /// panics in debug mode and produces garbage in a memory-safe way in
    /// release builds. The nature of the garbage may differ based on CPU
    /// architecture and must not be relied upon.
    ///
    /// Also panics with "Out of memory" if the fallible conversion fails.
    pub fn assign_utf16_to_latin1_lossy(&mut self, other: &[u16]) {
        self.fallible_assign_utf16_to_latin1_lossy(other)
            .expect("Out of memory")
    }

    /// Fallibly replaces the content of this string with the Latin1 (scalar
    /// value is byte value; not windows-1252!) conversion of `other`, a
    /// UTF-16 string all of whose code points are below U+0100.
    ///
    /// # Panics
    ///
    /// If the input contains code points above U+00FF or is not valid UTF-16,
    /// panics in debug mode and produces garbage in a memory-safe way in
    /// release builds. The nature of the garbage may differ based on CPU
    /// architecture and must not be relied upon.
    pub fn fallible_assign_utf16_to_latin1_lossy(&mut self, other: &[u16]) -> Result<(), ()> {
        self.fallible_append_utf16_to_latin1_lossy_impl(other, 0)?;
        Ok(())
    }

    /// Appends to this string the Latin1 (scalar value is byte value; not
    /// windows-1252!) conversion of `other`, a UTF-16 string all of whose
    /// code points are below U+0100.
    ///
    /// # Panics
    ///
    /// If the input contains code points above U+00FF or is not valid UTF-16,
    /// panics in debug mode and produces garbage in a memory-safe way in
    /// release builds. The nature of the garbage may differ based on CPU
    /// architecture and must not be relied upon.
    ///
    /// Also panics with "Out of memory" if the fallible conversion fails.
    pub fn append_utf16_to_latin1_lossy(&mut self, other: &[u16]) {
        self.fallible_append_utf16_to_latin1_lossy(other)
            .expect("Out of memory")
    }

    /// Fallibly appends to this string the Latin1 (scalar value is byte
    /// value; not windows-1252!) conversion of `other`, a UTF-16 string all
    /// of whose code points are below U+0100.
    ///
    /// # Panics
    ///
    /// If the input contains code points above U+00FF or is not valid UTF-16,
    /// panics in debug mode and produces garbage in a memory-safe way in
    /// release builds. The nature of the garbage may differ based on CPU
    /// architecture and must not be relied upon.
    pub fn fallible_append_utf16_to_latin1_lossy(&mut self, other: &[u16]) -> Result<(), ()> {
        let old_len = self.len();
        self.fallible_append_utf16_to_latin1_lossy_impl(other, old_len)?;
        Ok(())
    }

    // UTF-8 to Latin1
    ascii_copy_avoidance!(
        name = fallible_append_utf8_to_latin1_lossy_check,
        implementation = fallible_append_utf8_to_latin1_lossy_impl,
        string_like = nsCStringLike
    );

    /// Converts the UTF-8 in `other` to Latin1 and writes the result after
    /// the first `old_len` code units of `self`.
    ///
    /// `maybe_num_ascii`, when present, is the length of a prefix of `other`
    /// that the caller has already found to be ASCII; that prefix is copied
    /// verbatim and only the rest goes through the converter.
    fn fallible_append_utf8_to_latin1_lossy_impl(
        &mut self,
        other: &[u8],
        old_len: usize,
        maybe_num_ascii: Option<usize>,
    ) -> Result<BulkWriteOk, ()> {
        let capacity = old_len.checked_add(other.len()).ok_or(())?;
        let ascii_len = match maybe_num_ascii {
            Some(known) => known,
            None => 0,
        };
        // Already checked for overflow above, so this can't overflow.
        let convert_start = old_len + ascii_len;
        let mut handle = unsafe { self.bulk_write(capacity, old_len, false)? };
        let written = {
            let dest = &mut handle.as_mut_slice()[old_len..];
            let (ascii_dest, converted_dest) = dest.split_at_mut(ascii_len);
            let (ascii_src, remaining_src) = other.split_at(ascii_len);
            if !ascii_src.is_empty() {
                ascii_dest.copy_from_slice(ascii_src);
            }
            convert_utf8_to_latin1_lossy(remaining_src, converted_dest)
        };
        Ok(handle.finish(convert_start + written, true))
    }

    /// Replaces the content of this string with the Latin1 (scalar value is
    /// byte value; not windows-1252!) conversion of `other`, a UTF-8 string
    /// all of whose code points are below U+0100.
    ///
    /// # Panics
    ///
    /// If the input contains code points above U+00FF or is not valid UTF-8,
    /// panics in debug mode and produces garbage in a memory-safe way in
    /// release builds. The nature of the garbage may differ based on CPU
    /// architecture and must not be relied upon.
    ///
    /// Also panics with "Out of memory" if the fallible conversion fails.
    pub fn assign_utf8_to_latin1_lossy<T: nsCStringLike + ?Sized>(&mut self, other: &T) {
        self.fallible_assign_utf8_to_latin1_lossy(other)
            .expect("Out of memory")
    }

    /// Fallibly replaces the content of this string with the Latin1 (scalar
    /// value is byte value; not windows-1252!) conversion of `other`, a
    /// UTF-8 string all of whose code points are below U+0100.
    ///
    /// # Panics
    ///
    /// If the input contains code points above U+00FF or is not valid UTF-8,
    /// panics in debug mode and produces garbage in a memory-safe way in
    /// release builds. The nature of the garbage may differ based on CPU
    /// architecture and must not be relied upon.
    pub fn fallible_assign_utf8_to_latin1_lossy<T: nsCStringLike + ?Sized>(
        &mut self,
        other: &T,
    ) -> Result<(), ()> {
        self.fallible_append_utf8_to_latin1_lossy_check(other, 0)?;
        Ok(())
    }

    /// Appends to this string the Latin1 (scalar value is byte value; not
    /// windows-1252!) conversion of `other`, a UTF-8 string all of whose
    /// code points are below U+0100.
    ///
    /// # Panics
    ///
    /// If the input contains code points above U+00FF or is not valid UTF-8,
    /// panics in debug mode and produces garbage in a memory-safe way in
    /// release builds. The nature of the garbage may differ based on CPU
    /// architecture and must not be relied upon.
    ///
    /// Also panics with "Out of memory" if the fallible conversion fails.
    pub fn append_utf8_to_latin1_lossy<T: nsCStringLike + ?Sized>(&mut self, other: &T) {
        self.fallible_append_utf8_to_latin1_lossy(other)
            .expect("Out of memory")
    }

    /// Fallibly appends to this string the Latin1 (scalar value is byte
    /// value; not windows-1252!) conversion of `other`, a UTF-8 string all
    /// of whose code points are below U+0100.
    ///
    /// # Panics
    ///
    /// If the input contains code points above U+00FF or is not valid UTF-8,
    /// panics in debug mode and produces garbage in a memory-safe way in
    /// release builds. The nature of the garbage may differ based on CPU
    /// architecture and must not be relied upon.
    pub fn fallible_append_utf8_to_latin1_lossy<T: nsCStringLike + ?Sized>(
        &mut self,
        other: &T,
    ) -> Result<(), ()> {
        let old_len = self.len();
        self.fallible_append_utf8_to_latin1_lossy_check(other, old_len)?;
        Ok(())
    }

    // Latin1 to UTF-8 CString
    ascii_copy_avoidance!(
        name = fallible_append_latin1_to_utf8_check,
        implementation = fallible_append_latin1_to_utf8_impl,
        string_like = Latin1StringLike
    );

    /// Converts the Latin1 in `other` to UTF-8 and writes the result after
    /// the first `old_len` code units of `self`.
    ///
    /// `maybe_num_ascii`, when present, is the length of a prefix of `other`
    /// that the caller has already found to be ASCII. When absent, this
    /// method looks for an ASCII prefix itself.
    fn fallible_append_latin1_to_utf8_impl(
        &mut self,
        other: &[u8],
        old_len: usize,
        maybe_num_ascii: Option<usize>,
    ) -> Result<BulkWriteOk, ()> {
        let (write_start, consumed, mut handle) = match maybe_num_ascii {
            Some(ascii_len) => {
                // The wrapper checked for ASCII: copy the known prefix and
                // size the buffer for the worst case of the remainder.
                let remaining = other.len() - ascii_len;
                let write_start = old_len + ascii_len;
                let worst_case = remaining.checked_mul(2).ok_or(())?;
                let capacity = write_start.checked_add(worst_case).ok_or(())?;
                let mut handle = unsafe { self.bulk_write(capacity, old_len, false)? };
                if ascii_len != 0 {
                    handle.as_mut_slice()[old_len..write_start]
                        .copy_from_slice(&other[..ascii_len]);
                }
                (write_start, ascii_len, handle)
            }
            None if starts_with_ascii(other) => {
                // The wrapper didn't check for ASCII, but `other` starts
                // with ASCII, so first size the buffer with optimism that
                // it's ASCII-only.
                let optimistic_len = old_len.checked_add(other.len()).ok_or(())?;
                let mut handle = unsafe { self.bulk_write(optimistic_len, old_len, false)? };
                let ascii_len = copy_ascii_to_ascii(other, &mut handle.as_mut_slice()[old_len..]);
                let remaining = other.len() - ascii_len;
                let write_start = old_len + ascii_len;
                if remaining == 0 {
                    // `other` was all ASCII
                    return Ok(handle.finish(write_start, true));
                }
                let worst_case = remaining.checked_mul(2).ok_or(())?;
                let capacity = write_start.checked_add(worst_case).ok_or(())?;
                unsafe {
                    // Keep what has been written so far and make room for the rest.
                    handle.restart_bulk_write(capacity, write_start, false)?;
                }
                (write_start, ascii_len, handle)
            }
            None => {
                // Started with non-ASCII. Assume worst case.
                let worst_case = other.len().checked_mul(2).ok_or(())?;
                let capacity = old_len.checked_add(worst_case).ok_or(())?;
                let handle = unsafe { self.bulk_write(capacity, old_len, false)? };
                (old_len, 0, handle)
            }
        };
        let written = convert_latin1_to_utf8(
            &other[consumed..],
            &mut handle.as_mut_slice()[write_start..],
        );
        Ok(handle.finish(write_start + written, true))
    }

    /// Replaces the content of this string with the UTF-8 conversion of the
    /// Latin1 (i.e. byte value equals scalar value; not windows-1252!) input
    /// `other`.
    ///
    /// # Panics
    ///
    /// Panics with "Out of memory" if the fallible conversion fails.
    pub fn assign_latin1_to_utf8<T: Latin1StringLike + ?Sized>(&mut self, other: &T) {
        self.fallible_assign_latin1_to_utf8(other)
            .expect("Out of memory")
    }

    /// Fallibly replaces the content of this string with the UTF-8
    /// conversion of the Latin1 (i.e. byte value equals scalar value; not
    /// windows-1252!) input `other`.
    pub fn fallible_assign_latin1_to_utf8<T: Latin1StringLike + ?Sized>(
        &mut self,
        other: &T,
    ) -> Result<(), ()> {
        self.fallible_append_latin1_to_utf8_check(other, 0)?;
        Ok(())
    }

    /// Appends the UTF-8 conversion of the Latin1 (i.e. byte value equals
    /// scalar value; not windows-1252!) input `other` to this string.
    ///
    /// # Panics
    ///
    /// Panics with "Out of memory" if the fallible conversion fails.
    pub fn append_latin1_to_utf8<T: Latin1StringLike + ?Sized>(&mut self, other: &T) {
        self.fallible_append_latin1_to_utf8(other)
            .expect("Out of memory")
    }

    /// Fallibly appends the UTF-8 conversion of the Latin1 (i.e. byte value
    /// equals scalar value; not windows-1252!) input `other` to this string.
    pub fn fallible_append_latin1_to_utf8<T: Latin1StringLike + ?Sized>(
        &mut self,
        other: &T,
    ) -> Result<(), ()> {
        let old_len = self.len();
        self.fallible_append_latin1_to_utf8_check(other, old_len)?;
        Ok(())
    }
}
/// C entry point for `nsAString::fallible_append_utf8_impl`.
///
/// Returns `true` if the conversion succeeded and `false` otherwise.
///
/// # Safety
///
/// `this` is dereferenced as a mutable `nsAString`, and `other`/`other_len`
/// are passed to `slice::from_raw_parts`, so the caller must uphold the
/// requirements of both operations.
#[no_mangle]
pub unsafe extern "C" fn nsstring_fallible_append_utf8_impl(
    this: *mut nsAString,
    other: *const u8,
    other_len: usize,
    old_len: usize,
) -> bool {
    let dest = &mut *this;
    let source = slice::from_raw_parts(other, other_len);
    dest.fallible_append_utf8_impl(source, old_len).is_ok()
}
/// C entry point for `nsAString::fallible_append_latin1_impl`.
///
/// Returns `true` if the conversion succeeded and `false` otherwise.
///
/// # Safety
///
/// `this` is dereferenced as a mutable `nsAString`, and `other`/`other_len`
/// are passed to `slice::from_raw_parts`, so the caller must uphold the
/// requirements of both operations.
#[no_mangle]
pub unsafe extern "C" fn nsstring_fallible_append_latin1_impl(
    this: *mut nsAString,
    other: *const u8,
    other_len: usize,
    old_len: usize,
) -> bool {
    let dest = &mut *this;
    let source = slice::from_raw_parts(other, other_len);
    dest.fallible_append_latin1_impl(source, old_len).is_ok()
}
/// C entry point for `nsACString::fallible_append_utf16_to_utf8_impl`.
///
/// Returns `true` if the conversion succeeded and `false` otherwise.
///
/// # Safety
///
/// `this` is dereferenced as a mutable `nsACString`, and `other`/`other_len`
/// are passed to `slice::from_raw_parts`, so the caller must uphold the
/// requirements of both operations.
#[no_mangle]
pub unsafe extern "C" fn nscstring_fallible_append_utf16_to_utf8_impl(
    this: *mut nsACString,
    other: *const u16,
    other_len: usize,
    old_len: usize,
) -> bool {
    let dest = &mut *this;
    let source = slice::from_raw_parts(other, other_len);
    dest.fallible_append_utf16_to_utf8_impl(source, old_len)
        .is_ok()
}
/// C entry point for
/// `nsACString::fallible_append_utf16_to_latin1_lossy_impl`.
///
/// Returns `true` if the conversion succeeded and `false` otherwise.
///
/// # Safety
///
/// `this` is dereferenced as a mutable `nsACString`, and `other`/`other_len`
/// are passed to `slice::from_raw_parts`, so the caller must uphold the
/// requirements of both operations.
#[no_mangle]
pub unsafe extern "C" fn nscstring_fallible_append_utf16_to_latin1_lossy_impl(
    this: *mut nsACString,
    other: *const u16,
    other_len: usize,
    old_len: usize,
) -> bool {
    let dest = &mut *this;
    let source = slice::from_raw_parts(other, other_len);
    dest.fallible_append_utf16_to_latin1_lossy_impl(source, old_len)
        .is_ok()
}
/// C entry point for
/// `nsACString::fallible_append_utf8_to_latin1_lossy_check`.
///
/// Returns `true` if the operation succeeded and `false` otherwise.
///
/// # Safety
///
/// `this` is dereferenced as a mutable `nsACString` and `other` as a shared
/// `nsACString`, so both pointers must be valid for those accesses.
#[no_mangle]
pub unsafe extern "C" fn nscstring_fallible_append_utf8_to_latin1_lossy_check(
    this: *mut nsACString,
    other: *const nsACString,
    old_len: usize,
) -> bool {
    let outcome = (*this).fallible_append_utf8_to_latin1_lossy_check(&*other, old_len);
    outcome.is_ok()
}
/// C entry point for `nsACString::fallible_append_latin1_to_utf8_check`.
///
/// Returns `true` if the operation succeeded and `false` otherwise.
///
/// # Safety
///
/// `this` is dereferenced as a mutable `nsACString` and `other` as a shared
/// `nsACString`, so both pointers must be valid for those accesses.
#[no_mangle]
pub unsafe extern "C" fn nscstring_fallible_append_latin1_to_utf8_check(
    this: *mut nsACString,
    other: *const nsACString,
    old_len: usize,
) -> bool {
    let outcome = (*this).fallible_append_latin1_to_utf8_check(&*other, old_len);
    outcome.is_ok()
}

View file

@ -124,10 +124,26 @@ use std::marker::PhantomData;
use std::mem;
use std::ops::{Deref, DerefMut};
use std::os::raw::c_void;
use std::ptr;
use std::slice;
use std::str;
use std::u32;
mod conversions;
pub use self::conversions::nscstring_fallible_append_latin1_to_utf8_check;
pub use self::conversions::nscstring_fallible_append_utf16_to_latin1_lossy_impl;
pub use self::conversions::nscstring_fallible_append_utf16_to_utf8_impl;
pub use self::conversions::nscstring_fallible_append_utf8_to_latin1_lossy_check;
pub use self::conversions::nsstring_fallible_append_latin1_impl;
pub use self::conversions::nsstring_fallible_append_utf8_impl;
/// A type for showing that `finish()` was called on a `BulkWriteHandle`.
/// Instantiating this type from elsewhere is basically an assertion that
/// there is no `BulkWriteHandle` around, so be very careful with instantiating
/// this type!
///
/// The conversion methods return it inside a `Result` as their success
/// value; it carries no data of its own.
pub struct BulkWriteOk;
///////////////////////////////////
// Internal Implementation Flags //
///////////////////////////////////
@ -168,6 +184,146 @@ use data_flags::DataFlags;
// Generic String Bindings Macros //
////////////////////////////////////
macro_rules! string_like {
    {
        char_t = $char_t: ty;

        AString = $AString: ident;
        String = $String: ident;
        Str = $Str: ident;

        StringLike = $StringLike: ident;
        StringAdapter = $StringAdapter: ident;
    } => {
        /// Implemented by types that are `ns[C]String`-like, meaning that
        /// they can be turned into a borrowed `&nsA[C]String` at very low
        /// cost. The intermediate `ns[C]StringAdapter` type is unfortunately
        /// needed because types like `&[u8]` have to be (cheaply) wrapped in
        /// a `nsCString` on the stack in order to create the `&nsACString`.
        ///
        /// The trait exists so that the methods on `nsA[C]String` can DWIM.
        pub trait $StringLike {
            fn adapt(&self) -> $StringAdapter;
        }

        // References to string-like things are string-like themselves.
        impl<'r, S> $StringLike for &'r S
        where
            S: $StringLike + ?Sized,
        {
            fn adapt(&self) -> $StringAdapter {
                <S as $StringLike>::adapt(*self)
            }
        }

        // A `Cow` adapts through whatever it currently refers to.
        impl<'c, B: $StringLike + borrow::ToOwned + ?Sized> $StringLike for borrow::Cow<'c, B> {
            fn adapt(&self) -> $StringAdapter {
                <B as $StringLike>::adapt(self.as_ref())
            }
        }

        // The string types proper are passed through as the abstract type.
        impl $StringLike for $AString {
            fn adapt(&self) -> $StringAdapter {
                $StringAdapter::Abstract(self)
            }
        }

        impl<'s> $StringLike for $Str<'s> {
            fn adapt(&self) -> $StringAdapter {
                $StringAdapter::Abstract(self)
            }
        }

        impl $StringLike for $String {
            fn adapt(&self) -> $StringAdapter {
                $StringAdapter::Abstract(self)
            }
        }

        // Plain code unit containers get wrapped in a borrowing `$Str`.
        impl $StringLike for [$char_t] {
            fn adapt(&self) -> $StringAdapter {
                let wrapped = $Str::from(self);
                $StringAdapter::Borrowed(wrapped)
            }
        }

        impl $StringLike for Vec<$char_t> {
            fn adapt(&self) -> $StringAdapter {
                let wrapped = $Str::from(self.as_slice());
                $StringAdapter::Borrowed(wrapped)
            }
        }

        impl $StringLike for Box<[$char_t]> {
            fn adapt(&self) -> $StringAdapter {
                let wrapped = $Str::from(&**self);
                $StringAdapter::Borrowed(wrapped)
            }
        }
    }
}
impl<'a> Drop for nsAStringBulkWriteHandle<'a> {
    /// Failure path only: on success `finish()` calls `forget(self)`, so
    /// this never runs in that case.
    ///
    /// Leaves the string holding a single U+FFFD followed by a zero
    /// terminator, unless the capacity is zero.
    fn drop(&mut self) {
        // A capacity of 0 means the string is a zero-length string, which
        // needs no fixing up.
        if self.capacity != 0 {
            // The old zero terminator may have been overwritten by now, so a
            // new one has to be written somewhere and the length made to
            // match. Any length between 1 and self.capacity would do, and it
            // seems prudent to overwrite the uninitialized memory; length 1
            // leaves the shortest memory to overwrite. U+FFFD is the safest
            // placeholder. Merely truncating the string to a zero-length
            // string might be dangerous in some scenarios. See
            // https://www.unicode.org/reports/tr36/#Substituting_for_Ill_Formed_Subsequences
            // for closely related scenario.
            unsafe {
                let mut repr = self.string.as_repr();
                let data = repr.as_mut().data.as_ptr();
                *data = 0xFFFDu16;
                *data.offset(1isize) = 0;
                repr.as_mut().length = 1u32;
            }
        }
    }
}
impl<'a> Drop for nsACStringBulkWriteHandle<'a> {
/// This only runs in error cases. In success cases, `finish()`
/// calls `forget(self)`.
fn drop(&mut self) {
if self.capacity == 0 {
// If capacity is 0, the string is a zero-length
// string, so we have nothing to do.
return;
}
// The old zero terminator may be gone by now, so we need
// to write a new one somewhere and make length match.
// We can use a length between 1 and self.capacity.
// Seems prudent to overwrite the uninitialized memory.
// Using the length 1 leaves the shortest memory to overwrite.
// U+FFFD is the safest placeholder, but when it doesn't fit,
// let's use ASCII substitute. Merely truncating the
// string to a zero-length string might be dangerous in some
// scenarios. See
// https://www.unicode.org/reports/tr36/#Substituting_for_Ill_Formed_Subsequences
// for closely related scenario.
unsafe {
let mut this = self.string.as_repr();
if self.capacity >= 3 {
this.as_mut().length = 3u32;
*(this.as_mut().data.as_mut()) = 0xEFu8;
*(this.as_mut().data.as_ptr().offset(1isize)) = 0xBFu8;
*(this.as_mut().data.as_ptr().offset(2isize)) = 0xBDu8;
*(this.as_mut().data.as_ptr().offset(3isize)) = 0;
} else {
this.as_mut().length = 1u32;
*(this.as_mut().data.as_mut()) = 0x1Au8; // U+FFFD doesn't fit
*(this.as_mut().data.as_ptr().offset(1isize)) = 0;
}
}
}
}
macro_rules! define_string_types {
{
char_t = $char_t: ty;
@ -181,12 +337,15 @@ macro_rules! define_string_types {
StringRepr = $StringRepr: ident;
BulkWriteHandle = $BulkWriteHandle: ident;
drop = $drop: ident;
assign = $assign: ident, $fallible_assign: ident;
take_from = $take_from: ident, $fallible_take_from: ident;
append = $append: ident, $fallible_append: ident;
set_length = $set_length: ident, $fallible_set_length: ident;
begin_writing = $begin_writing: ident, $fallible_begin_writing: ident;
start_bulk_write = $start_bulk_write: ident;
} => {
/// The representation of a ns[C]String type in C++. This type is
/// used internally by our definition of ns[C]String to ensure layout
@ -201,7 +360,7 @@ macro_rules! define_string_types {
#[repr(C)]
#[derive(Debug)]
pub struct $StringRepr {
data: *const $char_t,
data: ptr::NonNull<$char_t>,
length: u32,
dataflags: DataFlags,
classflags: ClassFlags,
@ -211,7 +370,7 @@ macro_rules! define_string_types {
fn new(classflags: ClassFlags) -> $StringRepr {
static NUL: $char_t = 0;
$StringRepr {
data: &NUL,
data: unsafe { ptr::NonNull::new_unchecked(&NUL as *const _ as *mut _) },
length: 0,
dataflags: DataFlags::TERMINATED | DataFlags::LITERAL,
classflags: classflags,
@ -236,6 +395,63 @@ macro_rules! define_string_types {
}
}
/// Handle for writing into the buffer of a `$AString` in bulk.
///
/// It holds the mutable borrow of the string together with the number of
/// code units exposed by `as_mut_slice()`. Calling `finish()` consumes the
/// handle and sets the string's length and zero terminator.
pub struct $BulkWriteHandle<'a> {
// The string whose buffer is being written to.
string: &'a mut $AString,
// Length, in code units, of the slice returned by `as_mut_slice()`.
capacity: usize,
}
impl<'a> $BulkWriteHandle<'a> {
    /// Wraps `string`, recording `capacity` as the number of code units
    /// that `as_mut_slice()` exposes.
    fn new(string: &'a mut $AString, capacity: usize) -> Self {
        $BulkWriteHandle { string, capacity }
    }

    /// Starts the bulk write over with a new requested `capacity`, asking
    /// for the first `units_to_preserve` code units to be kept.
    ///
    /// On success the handle's capacity is replaced by the value reported by
    /// `start_bulk_write_impl`; on failure the handle's capacity is left as
    /// it was and `Err(())` is returned.
    pub unsafe fn restart_bulk_write(&mut self,
                                     capacity: usize,
                                     units_to_preserve: usize,
                                     allow_shrinking: bool) -> Result<(), ()> {
        let granted = self.string.start_bulk_write_impl(capacity,
                                                        units_to_preserve,
                                                        allow_shrinking)?;
        self.capacity = granted;
        Ok(())
    }

    /// Ends the bulk write, making `length` the logical length of the
    /// string and writing the zero terminator after it.
    ///
    /// # Panics
    ///
    /// Panics if `length` exceeds the capacity of the handle.
    pub fn finish(mut self, length: usize, allow_shrinking: bool) -> BulkWriteOk {
        // NOTE: Drop is implemented outside the macro earlier in this file,
        // because it needs to deal with different code unit representations
        // for the REPLACEMENT CHARACTER in the UTF-16 and UTF-8 cases and
        // needs to deal with a REPLACEMENT CHARACTER not fitting in the
        // buffer in the UTF-8 case.
        assert!(length <= self.capacity);
        if length != 0 {
            if allow_shrinking {
                // The outcome of the shrinking attempt is deliberately
                // ignored.
                let _ = unsafe { self.restart_bulk_write(length, length, true) };
            }
            unsafe {
                // Read the data pointer only after the potential restart above.
                let mut repr = self.string.as_repr();
                let data = repr.as_mut().data.as_ptr();
                repr.as_mut().length = length as u32;
                *data.offset(length as isize) = 0;
            }
        } else {
            // `truncate()` is OK even when the string
            // is in invalid state.
            self.string.truncate();
        }
        mem::forget(self); // Don't run the failure path in drop()
        BulkWriteOk{}
    }

    /// Returns the writable buffer as a slice whose length is the capacity
    /// of the handle.
    pub fn as_mut_slice(&mut self) -> &mut [$char_t] {
        let len = self.capacity;
        unsafe {
            let mut repr = self.string.as_repr();
            let data = repr.as_mut().data.as_ptr();
            slice::from_raw_parts_mut(data, len)
        }
    }
}
/// This type is the abstract type which is used for interacting with
/// strings in rust. Each string type can derefence to an instance of
/// this type, which provides the useful operations on strings.
@ -345,8 +561,8 @@ macro_rules! define_string_types {
unsafe {
let len = self.len();
if len == 0 {
// Use an arbitrary non-null value as the pointer
slice::from_raw_parts_mut(0x1 as *mut $char_t, 0)
// Use an arbitrary but aligned non-null value as the pointer
slice::from_raw_parts_mut(ptr::NonNull::<$char_t>::dangling().as_ptr(), 0)
} else {
slice::from_raw_parts_mut($begin_writing(self), len)
}
@ -363,8 +579,9 @@ macro_rules! define_string_types {
unsafe {
let len = self.len();
if len == 0 {
// Use an arbitrary non-null value as the pointer
Ok(slice::from_raw_parts_mut(0x1 as *mut $char_t, 0))
// Use an arbitrary but aligned non-null value as the pointer
Ok(slice::from_raw_parts_mut(
ptr::NonNull::<$char_t>::dangling().as_ptr() as *mut $char_t, 0))
} else {
let ptr = $fallible_begin_writing(self);
if ptr.is_null() {
@ -376,6 +593,46 @@ macro_rules! define_string_types {
}
}
/// Unshares the buffer of the string and hands back a handle from which a
/// writable slice can be obtained. The length of that slice is the
/// rounded-up capacity.
///
/// `capacity` is the requested capacity, `units_to_preserve` the number of
/// leading code units to keep and `allow_shrinking` is forwarded to the
/// underlying bulk write start.
///
/// Also fails if the new length doesn't fit in 32 bits.
///
/// # Safety
///
/// Unsafe because of exposure of uninitialized memory.
pub unsafe fn bulk_write(&mut self,
                         capacity: usize,
                         units_to_preserve: usize,
                         allow_shrinking: bool) -> Result<$BulkWriteHandle, ()> {
    match self.start_bulk_write_impl(capacity, units_to_preserve, allow_shrinking) {
        Ok(granted) => Ok($BulkWriteHandle::new(self, granted)),
        Err(()) => Err(()),
    }
}
/// Calls `$start_bulk_write` and translates its result.
///
/// Returns `Err(())` without making the call when `capacity` does not fit
/// in a `u32`, and also when the call returns `u32::max_value()`.
/// Otherwise returns the value reported by the call as a `usize`.
unsafe fn start_bulk_write_impl(&mut self,
                                capacity: usize,
                                units_to_preserve: usize,
                                allow_shrinking: bool) -> Result<usize, ()> {
    if capacity > u32::max_value() as usize {
        return Err(());
    }
    let granted = $start_bulk_write(self,
                                    capacity as u32,
                                    units_to_preserve as u32,
                                    allow_shrinking);
    if granted == u32::max_value() {
        Err(())
    } else {
        Ok(granted as usize)
    }
}
/// Reinterprets `self` as a non-null pointer to the `$StringRepr` that
/// backs it.
fn as_repr(&mut self) -> ptr::NonNull<$StringRepr> {
    let raw = self as *mut Self as *mut $StringRepr;
    // `raw` comes from a reference, so it is not null.
    unsafe { ptr::NonNull::new_unchecked(raw) }
}
}
impl Deref for $AString {
@ -387,13 +644,7 @@ macro_rules! define_string_types {
// into $StringRepr to get the reference to the underlying
// data.
let this: &$StringRepr = mem::transmute(self);
if this.data.is_null() {
debug_assert_eq!(this.length, 0);
// Use an arbitrary non-null value as the pointer
slice::from_raw_parts(0x1 as *const $char_t, 0)
} else {
slice::from_raw_parts(this.data, this.length as usize)
}
slice::from_raw_parts(this.data.as_ptr(), this.length as usize)
}
}
}
@ -478,7 +729,7 @@ macro_rules! define_string_types {
}
$Str {
hdr: $StringRepr {
data: s.as_ptr(),
data: unsafe { ptr::NonNull::new_unchecked(s.as_ptr() as *mut _) },
length: s.len() as u32,
dataflags: DataFlags::empty(),
classflags: ClassFlags::empty(),
@ -638,14 +889,14 @@ macro_rules! define_string_types {
// because in the Gecko tree, we use the same allocator for
// Rust code as for C++ code, meaning that our box can be
// legally freed with libc::free().
let ptr = s.as_ptr();
let ptr = s.as_mut_ptr();
mem::forget(s);
unsafe {
Gecko_IncrementStringAdoptCount(ptr as *mut _);
}
$String {
hdr: $StringRepr {
data: ptr,
data: unsafe { ptr::NonNull::new_unchecked(ptr) },
length: length,
dataflags: DataFlags::OWNED | DataFlags::TERMINATED,
classflags: ClassFlags::NULL_TERMINATED,
@ -727,66 +978,25 @@ macro_rules! define_string_types {
}
}
/// This trait is implemented on types which are `ns[C]String`-like, in
/// that they can at very low cost be converted to a borrowed
/// `&nsA[C]String`. Unfortunately, the intermediate type
/// `ns[C]StringAdapter` is required as well due to types like `&[u8]`
/// needing to be (cheaply) wrapped in a `nsCString` on the stack to
/// create the `&nsACString`.
///
/// This trait is used to DWIM when calling the methods on
/// `nsA[C]String`.
pub trait $StringLike {
fn adapt(&self) -> $StringAdapter;
}
impl<'a, T: $StringLike + ?Sized> $StringLike for &'a T {
fn adapt(&self) -> $StringAdapter {
<T as $StringLike>::adapt(*self)
impl<'a> $StringAdapter<'a> {
    /// Returns `true` for the `Abstract` variant and `false` for the
    /// `Borrowed` variant.
    #[allow(dead_code)]
    fn is_abstract(&self) -> bool {
        if let $StringAdapter::Abstract(_) = *self {
            true
        } else {
            false
        }
    }
}
impl<'a, T> $StringLike for borrow::Cow<'a, T>
where T: $StringLike + borrow::ToOwned + ?Sized {
fn adapt(&self) -> $StringAdapter {
<T as $StringLike>::adapt(self.as_ref())
}
}
string_like! {
char_t = $char_t;
impl $StringLike for $AString {
fn adapt(&self) -> $StringAdapter {
$StringAdapter::Abstract(self)
}
}
AString = $AString;
String = $String;
Str = $Str;
impl<'a> $StringLike for $Str<'a> {
fn adapt(&self) -> $StringAdapter {
$StringAdapter::Abstract(self)
}
}
impl $StringLike for $String {
fn adapt(&self) -> $StringAdapter {
$StringAdapter::Abstract(self)
}
}
impl $StringLike for [$char_t] {
fn adapt(&self) -> $StringAdapter {
$StringAdapter::Borrowed($Str::from(self))
}
}
impl $StringLike for Vec<$char_t> {
fn adapt(&self) -> $StringAdapter {
$StringAdapter::Borrowed($Str::from(&self[..]))
}
}
impl $StringLike for Box<[$char_t]> {
fn adapt(&self) -> $StringAdapter {
$StringAdapter::Borrowed($Str::from(&self[..]))
}
StringLike = $StringLike;
StringAdapter = $StringAdapter;
}
}
}
@ -807,39 +1017,18 @@ define_string_types! {
StringRepr = nsCStringRepr;
BulkWriteHandle = nsACStringBulkWriteHandle;
drop = Gecko_FinalizeCString;
assign = Gecko_AssignCString, Gecko_FallibleAssignCString;
take_from = Gecko_TakeFromCString, Gecko_FallibleTakeFromCString;
append = Gecko_AppendCString, Gecko_FallibleAppendCString;
set_length = Gecko_SetLengthCString, Gecko_FallibleSetLengthCString;
begin_writing = Gecko_BeginWritingCString, Gecko_FallibleBeginWritingCString;
start_bulk_write = Gecko_StartBulkWriteCString;
}
impl nsACString {
pub fn assign_utf16<T: nsStringLike + ?Sized>(&mut self, other: &T) {
self.truncate();
self.append_utf16(other);
}
pub fn fallible_assign_utf16<T: nsStringLike + ?Sized>(&mut self, other: &T) -> Result<(), ()> {
self.truncate();
self.fallible_append_utf16(other)
}
pub fn append_utf16<T: nsStringLike + ?Sized>(&mut self, other: &T) {
unsafe {
Gecko_AppendUTF16toCString(self, other.adapt().as_ptr());
}
}
pub fn fallible_append_utf16<T: nsStringLike + ?Sized>(&mut self, other: &T) -> Result<(), ()> {
if unsafe { Gecko_FallibleAppendUTF16toCString(self, other.adapt().as_ptr()) } {
Ok(())
} else {
Err(())
}
}
pub unsafe fn as_str_unchecked(&self) -> &str {
str::from_utf8_unchecked(self)
}
@ -925,6 +1114,23 @@ impl nsCStringLike for Box<str> {
}
}
/// This trait is implemented on types which are Latin1 `nsCString`-like,
/// in that they can at very low cost be converted to a borrowed
/// `&nsACString` and do not denote UTF-8ness in the Rust type system.
///
/// This trait is used to DWIM when calling the methods on
/// `nsACString`.
string_like! {
char_t = u8;
AString = nsACString;
String = nsCString;
Str = nsCStr;
StringLike = Latin1StringLike;
StringAdapter = nsCStringAdapter;
}
///////////////////////////////////////////
// Bindings for nsString (u16 char type) //
///////////////////////////////////////////
@ -941,38 +1147,15 @@ define_string_types! {
StringRepr = nsStringRepr;
BulkWriteHandle = nsAStringBulkWriteHandle;
drop = Gecko_FinalizeString;
assign = Gecko_AssignString, Gecko_FallibleAssignString;
take_from = Gecko_TakeFromString, Gecko_FallibleTakeFromString;
append = Gecko_AppendString, Gecko_FallibleAppendString;
set_length = Gecko_SetLengthString, Gecko_FallibleSetLengthString;
begin_writing = Gecko_BeginWritingString, Gecko_FallibleBeginWritingString;
}
impl nsAString {
pub fn assign_utf8<T: nsCStringLike + ?Sized>(&mut self, other: &T) {
self.truncate();
self.append_utf8(other);
}
pub fn fallible_assign_utf8<T: nsCStringLike + ?Sized>(&mut self, other: &T) -> Result<(), ()> {
self.truncate();
self.fallible_append_utf8(other)
}
pub fn append_utf8<T: nsCStringLike + ?Sized>(&mut self, other: &T) {
unsafe {
Gecko_AppendUTF8toString(self, other.adapt().as_ptr());
}
}
pub fn fallible_append_utf8<T: nsCStringLike + ?Sized>(&mut self, other: &T) -> Result<(), ()> {
if unsafe { Gecko_FallibleAppendUTF8toString(self, other.adapt().as_ptr()) } {
Ok(())
} else {
Err(())
}
}
start_bulk_write = Gecko_StartBulkWriteString;
}
// NOTE: The From impl for a string slice for nsString produces a <'static>
@ -994,7 +1177,7 @@ impl fmt::Write for nsAString {
fn write_str(&mut self, s: &str) -> Result<(), fmt::Error> {
// Directly invoke gecko's routines for appending utf8 strings to
// nsAString values, to avoid as much overhead as possible
self.append_utf8(s);
self.append_str(s);
Ok(())
}
}
@ -1038,6 +1221,12 @@ extern "C" {
fn Gecko_FallibleAppendCString(this: *mut nsACString, other: *const nsACString) -> bool;
fn Gecko_FallibleSetLengthCString(this: *mut nsACString, length: u32) -> bool;
fn Gecko_FallibleBeginWritingCString(this: *mut nsACString) -> *mut u8;
fn Gecko_StartBulkWriteCString(
this: *mut nsACString,
capacity: u32,
units_to_preserve: u32,
allow_shrinking: bool,
) -> u32;
fn Gecko_FinalizeString(this: *mut nsAString);
@ -1051,12 +1240,12 @@ extern "C" {
fn Gecko_FallibleAppendString(this: *mut nsAString, other: *const nsAString) -> bool;
fn Gecko_FallibleSetLengthString(this: *mut nsAString, length: u32) -> bool;
fn Gecko_FallibleBeginWritingString(this: *mut nsAString) -> *mut u16;
// Gecko implementation in nsReadableUtils.cpp
fn Gecko_AppendUTF16toCString(this: *mut nsACString, other: *const nsAString);
fn Gecko_AppendUTF8toString(this: *mut nsAString, other: *const nsACString);
fn Gecko_FallibleAppendUTF16toCString(this: *mut nsACString, other: *const nsAString) -> bool;
fn Gecko_FallibleAppendUTF8toString(this: *mut nsAString, other: *const nsACString) -> bool;
fn Gecko_StartBulkWriteString(
this: *mut nsAString,
capacity: u32,
units_to_preserve: u32,
allow_shrinking: bool,
) -> u32;
}
//////////////////////////////////////
@ -1070,10 +1259,10 @@ pub mod test_helpers {
//! It is public to ensure that these testing functions are available to
//! gtest code.
use std::mem;
use super::{ClassFlags, DataFlags};
use super::{nsCStr, nsCString, nsCStringRepr};
use super::{nsStr, nsString, nsStringRepr};
use super::{ClassFlags, DataFlags};
use std::mem;
/// Generates an #[no_mangle] extern "C" function which returns the size and
/// alignment of the given type with the given name.

View file

@ -2,10 +2,3 @@
max-asserts: 4
[Fragment navigation: encoding]
expected: FAIL
[Invalid percent-encoded UTF-8 byte should decode as U+FFFD]
expected: FAIL
[Percent-encoded UTF-8 BOM followed by invalid UTF-8 byte should decode as U+FEFF U+FFFD]
expected: FAIL

View file

@ -685,7 +685,7 @@ nsWindowWatcher::OpenWindowInternal(mozIDOMWindowProxy* aParent,
bool nameSpecified = false;
if (aName) {
CopyUTF8toUTF16(aName, name);
CopyUTF8toUTF16(MakeStringSpan(aName), name);
nameSpecified = true;
} else {
name.SetIsVoid(true);

View file

@ -2208,7 +2208,7 @@ ShowProfileManager(nsIToolkitProfileService* aProfileSvc,
rv = ioParamBlock->GetString(0, &profileNamePtr);
NS_ENSURE_SUCCESS(rv, rv);
CopyUTF16toUTF8(profileNamePtr, profileName);
CopyUTF16toUTF8(MakeStringSpan(profileNamePtr), profileName);
free(profileNamePtr);
lock->Unlock();

View file

@ -522,7 +522,9 @@ struct MessageWindow {
// Construct a narrow UTF8 buffer <commandline>\0<workingdir>\0
NS_ConvertUTF16toUTF8 utf8buffer(cmd);
utf8buffer.Append('\0');
AppendUTF16toUTF8(cwd, utf8buffer);
WCHAR* cwdPtr = cwd;
AppendUTF16toUTF8(MakeStringSpan(reinterpret_cast<char16_t*>(cwdPtr)),
utf8buffer);
utf8buffer.Append('\0');
// We used to set dwData to zero, when we didn't send the working dir.

View file

@ -28,14 +28,15 @@ static char16_t*
AllocConvertUTF8toUTF16(const char *arg)
{
// UTF16 can't be longer in units than UTF8
int len = strlen(arg);
size_t len = strlen(arg);
char16_t *s = new char16_t[(len + 1) * sizeof(char16_t)];
if (!s)
return nullptr;
ConvertUTF8toUTF16 convert(s);
convert.write(arg, len);
convert.write_terminator();
size_t dstLen = ::MultiByteToWideChar(
CP_UTF8, 0, arg, len, reinterpret_cast<wchar_t*>(s), len);
s[dstLen] = 0;
return s;
}

View file

@ -77,14 +77,17 @@ static char*
AllocConvertUTF16toUTF8(char16ptr_t arg)
{
// be generous... UTF16 units can expand up to 3 UTF8 units
int len = wcslen(arg);
char *s = new char[len * 3 + 1];
size_t len = wcslen(arg);
// ConvertUTF16toUTF8 requires +1. Let's do that here, too, lacking
// knowledge of Windows internals.
size_t dstLen = len * 3 + 1;
char* s = new char[dstLen + 1]; // Another +1 for zero terminator
if (!s)
return nullptr;
ConvertUTF16toUTF8 convert(s);
convert.write(arg, len);
convert.write_terminator();
int written =
::WideCharToMultiByte(CP_UTF8, 0, arg, len, s, dstLen, nullptr, nullptr);
s[written] = 0;
return s;
}

View file

@ -2791,7 +2791,7 @@ NS_IMETHODIMP nsExternalHelperAppService::GetTypeFromFile(nsIFile* aFile, nsACSt
{
if (fileName[i] == char16_t('.'))
{
CopyUTF16toUTF8(fileName.get() + i + 1, fileExt);
CopyUTF16toUTF8(Substring(fileName, i + 1), fileExt);
break;
}
}

View file

@ -1878,7 +1878,7 @@ IMContextWrapper::GetCompositionString(GtkIMContext* aContext,
gtk_im_context_get_preedit_string(aContext, &preedit_string,
&feedback_list, &cursor_pos);
if (preedit_string && *preedit_string) {
CopyUTF8toUTF16(preedit_string, aCompositionString);
CopyUTF8toUTF16(MakeStringSpan(preedit_string), aCompositionString);
} else {
aCompositionString.Truncate();
}

View file

@ -438,7 +438,7 @@ nsPrinterEnumeratorGTK::InitPrintSettingsFromPrinter(const nsAString& aPrinterNa
}
if (path) {
CopyUTF8toUTF16(path, filename);
CopyUTF8toUTF16(MakeStringSpan(path), filename);
filename.AppendLiteral("/mozilla.pdf");
} else {
filename.AssignLiteral("mozilla.pdf");

View file

@ -3293,11 +3293,8 @@ case _value: eventName.AssignLiteral(_name) ; break
default:
{
char buf[32];
SprintfLiteral(buf,"UNKNOWN: %d",aGuiEvent->mMessage);
CopyASCIItoUTF16(buf, eventName);
eventName.AssignLiteral("UNKNOWN: ");
eventName.AppendInt(aGuiEvent->mMessage);
}
break;
}

View file

@ -1661,7 +1661,7 @@ CycleCollectedJSRuntime::ErrorInterceptor::interceptError(JSContext* cx, const J
nsContentUtils::ExtractErrorValues(cx, value, details.mFilename, &details.mLine, &details.mColumn, details.mMessage);
JS::UniqueChars buf = JS::FormatStackDump(cx, nullptr, /* showArgs = */ false, /* showLocals = */ false, /* showThisProps = */ false);
CopyUTF8toUTF16(buf.get(), details.mStack);
CopyUTF8toUTF16(mozilla::MakeStringSpan(buf.get()), details.mStack);
mThrownError.emplace(std::move(details));
}

View file

@ -59,17 +59,4 @@ NS_COUNT(InputIterator& aFirst, const InputIterator& aLast, const T& aValue)
return result;
}
template <class InputIterator, class OutputIterator>
inline OutputIterator&
copy_string(const InputIterator& aFirst, const InputIterator& aLast,
OutputIterator& aResult)
{
typedef nsCharSourceTraits<InputIterator> source_traits;
typedef nsCharSinkTraits<OutputIterator> sink_traits;
sink_traits::write(aResult, source_traits::read(aFirst),
source_traits::readable_distance(aFirst, aLast));
return aResult;
}
#endif // !defined(nsAlgorithm_h___)

View file

@ -196,18 +196,15 @@ struct AtomTableKey
*aHashOut = mHash;
}
AtomTableKey(const char* aUTF8String, uint32_t aLength, uint32_t* aHashOut)
AtomTableKey(const char* aUTF8String,
uint32_t aLength,
uint32_t* aHashOut,
bool* aErr)
: mUTF16String(nullptr)
, mUTF8String(aUTF8String)
, mLength(aLength)
{
bool err;
mHash = HashUTF8AsUTF16(mUTF8String, mLength, &err);
if (err) {
mUTF8String = nullptr;
mLength = 0;
mHash = 0;
}
mHash = HashUTF8AsUTF16(mUTF8String, mLength, aErr);
*aHashOut = mHash;
}
@ -333,10 +330,12 @@ AtomTableMatchKey(const PLDHashEntryHdr* aEntry, const void* aKey)
const AtomTableKey* k = static_cast<const AtomTableKey*>(aKey);
if (k->mUTF8String) {
return
CompareUTF8toUTF16(nsDependentCSubstring(k->mUTF8String,
k->mUTF8String + k->mLength),
nsDependentAtomString(he->mAtom)) == 0;
bool err = false;
return (CompareUTF8toUTF16(nsDependentCSubstring(
k->mUTF8String, k->mUTF8String + k->mLength),
nsDependentAtomString(he->mAtom),
&err) == 0) &&
!err;
}
return he->mAtom->Equals(k->mUTF16String, k->mLength);
@ -687,7 +686,16 @@ already_AddRefed<nsAtom>
nsAtomTable::Atomize(const nsACString& aUTF8String)
{
uint32_t hash;
AtomTableKey key(aUTF8String.Data(), aUTF8String.Length(), &hash);
bool err;
AtomTableKey key(aUTF8String.Data(), aUTF8String.Length(), &hash, &err);
if (MOZ_UNLIKELY(err)) {
MOZ_ASSERT_UNREACHABLE("Tried to atomize invalid UTF-8.");
// The input was invalid UTF-8. Let's replace the errors with U+FFFD
// and atomize the result.
nsString str;
CopyUTF8toUTF16(aUTF8String, str);
return Atomize(str);
}
nsAtomSubTable& table = SelectSubTable(key);
MutexAutoLock lock(table.mLock);
AtomTableEntry* he = table.Add(key);

View file

@ -829,7 +829,7 @@ nsDiscriminatedUnion::ConvertToAString(nsAString& aResult) const
CopyUTF8toUTF16(*u.mUTF8StringValue, aResult);
return NS_OK;
case nsIDataType::VTYPE_CHAR_STR:
CopyASCIItoUTF16(u.str.mStringValue, aResult);
CopyASCIItoUTF16(mozilla::MakeStringSpan(u.str.mStringValue), aResult);
return NS_OK;
case nsIDataType::VTYPE_WCHAR_STR:
aResult.Assign(u.wstr.mWStringValue);
@ -924,7 +924,7 @@ nsDiscriminatedUnion::ConvertToAUTF8String(nsAUTF8String& aResult) const
aResult);
return NS_OK;
case nsIDataType::VTYPE_WCHAR_STR:
CopyUTF16toUTF8(u.wstr.mWStringValue, aResult);
CopyUTF16toUTF8(mozilla::MakeStringSpan(u.wstr.mWStringValue), aResult);
return NS_OK;
case nsIDataType::VTYPE_STRING_SIZE_IS:
// XXX Extra copy, can be removed if we're sure CHAR_STR can

View file

@ -172,7 +172,7 @@ mozilla::GetLibraryName(mozilla::pathstr_t aDirectory, const char* aLib)
fullName.Assign(aDirectory);
fullName.Append('\\');
}
AppendUTF8toUTF16(aLib, fullName);
AppendUTF8toUTF16(MakeStringSpan(aLib), fullName);
if (!strstr(aLib, ".dll")) {
fullName.AppendLiteral(".dll");
}

View file

@ -49,16 +49,4 @@ UNIFIED_SOURCES += [
'precompiled_templates.cpp',
]
# Are we targeting x86 or x86-64? If so, compile the SSE2 functions for
# nsUTF8Utils.cpp and nsReadableUtils.cpp.
if CONFIG['INTEL_ARCHITECTURE']:
SOURCES += ['nsUTF8UtilsSSE2.cpp']
SOURCES['nsUTF8UtilsSSE2.cpp'].flags += CONFIG['SSE2_FLAGS']
SOURCES += ['nsReadableUtilsSSE2.cpp']
SOURCES['nsReadableUtilsSSE2.cpp'].flags += CONFIG['SSE2_FLAGS']
if CONFIG['HAVE_ARM_NEON'] or CONFIG['CPU_ARCH'] == 'aarch64':
SOURCES += ['nsUTF8UtilsNEON.cpp']
SOURCES['nsUTF8UtilsNEON.cpp'].flags += CONFIG['NEON_FLAGS']
FINAL_LIBRARY = 'xul'

File diff suppressed because it is too large Load diff

View file

@ -18,10 +18,172 @@
#include "nsTArrayForwardDeclare.h"
// Can't include mozilla/Encoding.h here
// Can't include mozilla/Encoding.h here. The implementations are in
// the encoding_rs and encoding_glue crates.
extern "C" {
size_t encoding_utf8_valid_up_to(uint8_t const* buffer, size_t buffer_len);
size_t encoding_ascii_valid_up_to(uint8_t const* buffer, size_t buffer_len);
size_t
encoding_utf8_valid_up_to(uint8_t const* buffer, size_t buffer_len);
bool
encoding_mem_is_ascii(uint8_t const* buffer, size_t buffer_len);
bool
encoding_mem_is_basic_latin(char16_t const* buffer, size_t buffer_len);
bool
encoding_mem_is_utf8_latin1(uint8_t const* buffer, size_t buffer_len);
bool
encoding_mem_is_str_latin1(uint8_t const* buffer, size_t buffer_len);
bool
encoding_mem_is_utf16_latin1(char16_t const* buffer, size_t buffer_len);
void
encoding_mem_convert_utf16_to_latin1_lossy(const char16_t* src,
size_t src_len,
char* dst,
size_t dst_len);
size_t
encoding_mem_convert_utf8_to_latin1_lossy(const char* src,
size_t src_len,
char* dst,
size_t dst_len);
void
encoding_mem_convert_latin1_to_utf16(const char* src,
size_t src_len,
char16_t* dst,
size_t dst_len);
size_t
encoding_mem_convert_utf16_to_utf8(const char16_t* src,
size_t src_len,
char* dst,
size_t dst_len);
size_t
encoding_mem_convert_utf8_to_utf16(const char* src,
size_t src_len,
char16_t* dst,
size_t dst_len);
}
// From the nsstring crate
//
// Fallible convert-and-append helpers. Each returns a bool; the inline
// wrappers in this header treat a false return as an allocation failure.
// For the *_impl functions, aOther/aOtherLen describe the source code units
// and aOldLen is the destination length the wrappers pass in: 0 from the
// Copy* wrappers and aDest.Length() from the Append* wrappers.
extern "C" {
bool
nsstring_fallible_append_utf8_impl(nsAString* aThis,
                                   const char* aOther,
                                   size_t aOtherLen,
                                   size_t aOldLen);

bool
nsstring_fallible_append_latin1_impl(nsAString* aThis,
                                     const char* aOther,
                                     size_t aOtherLen,
                                     size_t aOldLen);

bool
nscstring_fallible_append_utf16_to_utf8_impl(nsACString* aThis,
                                             const char16_t* aOther,
                                             size_t aOtherLen,
                                             size_t aOldLen);

bool
nscstring_fallible_append_utf16_to_latin1_lossy_impl(nsACString* aThis,
                                                     const char16_t* aOther,
                                                     size_t aOtherLen,
                                                     size_t aOldLen);

bool
nscstring_fallible_append_utf8_to_latin1_lossy_check(nsACString* aThis,
                                                     const nsACString* aOther,
                                                     size_t aOldLen);

bool
nscstring_fallible_append_latin1_to_utf8_check(nsACString* aThis,
                                               const nsACString* aOther,
                                               size_t aOldLen);
}
/**
 * Narrows UTF-16 to Latin1, where the unsigned byte value is the Unicode
 * scalar value (this is not windows-1252).
 *
 * The result is only meaningful when every code point in the input is below
 * U+0100. If the input contains code points above U+00FF, the output is
 * garbage produced in a memory-safe way; the nature of the garbage depends on
 * the CPU architecture and must not be relied upon, and future debug builds
 * will likely assert on such input.
 *
 * The length of aDest must not be less than the length of aSource.
 */
inline void
LossyConvertUTF16toLatin1(mozilla::Span<const char16_t> aSource,
                          mozilla::Span<char> aDest)
{
  const char16_t* const src = aSource.Elements();
  const size_t srcLen = aSource.Length();
  char* const dst = aDest.Elements();
  const size_t dstLen = aDest.Length();
  encoding_mem_convert_utf16_to_latin1_lossy(src, srcLen, dst, dstLen);
}
/**
 * Converts UTF-8 to Latin1, where the unsigned byte value is the Unicode
 * scalar value (this is not windows-1252).
 *
 * The result is only meaningful when every code point in the input is below
 * U+0100. If the input contains code points above U+00FF, debug builds assert
 * and release builds produce garbage in a memory-safe way; the nature of the
 * garbage may depend on the CPU architecture and must not be relied upon.
 *
 * The length of aDest must not be less than the length of aSource.
 *
 * Returns the value reported by the underlying converter (presumably the
 * number of bytes written to aDest -- confirm against encoding_rs).
 */
inline size_t
LossyConvertUTF8toLatin1(mozilla::Span<const char> aSource,
                         mozilla::Span<char> aDest)
{
  const char* const src = aSource.Elements();
  const size_t srcLen = aSource.Length();
  char* const dst = aDest.Elements();
  const size_t dstLen = aDest.Length();
  return encoding_mem_convert_utf8_to_latin1_lossy(src, srcLen, dst, dstLen);
}
/**
 * Widens Latin1 to UTF-16: each unsigned byte value is interpreted as a
 * Unicode scalar value (i.e. the input is not treated as windows-1252).
 *
 * The length of aDest must not be less than the length of aSource.
 */
inline void
ConvertLatin1toUTF16(mozilla::Span<const char> aSource,
                     mozilla::Span<char16_t> aDest)
{
  const char* const src = aSource.Elements();
  const size_t srcLen = aSource.Length();
  char16_t* const dst = aDest.Elements();
  const size_t dstLen = aDest.Length();
  encoding_mem_convert_latin1_to_utf16(src, srcLen, dst, dstLen);
}
/**
 * Converts UTF-16 to UTF-8. Lone surrogates are replaced with the
 * REPLACEMENT CHARACTER.
 *
 * The length of aDest must be at least three times the length of aSource
 * _plus one_.
 *
 * Returns the number of code units written.
 */
inline size_t
ConvertUTF16toUTF8(mozilla::Span<const char16_t> aSource,
                   mozilla::Span<char> aDest)
{
  const char16_t* const src = aSource.Elements();
  const size_t srcLen = aSource.Length();
  char* const dst = aDest.Elements();
  const size_t dstLen = aDest.Length();
  return encoding_mem_convert_utf16_to_utf8(src, srcLen, dst, dstLen);
}
/**
 * Converts UTF-8 to UTF-16. Malformed byte sequences are replaced with the
 * REPLACEMENT CHARACTER.
 *
 * The length of aDest must be at least one greater than the length of
 * aSource.
 *
 * Returns the number of code units written.
 */
inline size_t
ConvertUTF8toUTF16(mozilla::Span<const char> aSource,
                   mozilla::Span<char16_t> aDest)
{
  const char* const src = aSource.Elements();
  const size_t srcLen = aSource.Length();
  char16_t* const dst = aDest.Elements();
  const size_t dstLen = aDest.Length();
  return encoding_mem_convert_utf8_to_utf16(src, srcLen, dst, dstLen);
}
inline size_t
@ -31,6 +193,7 @@ Distance(const nsReadingIterator<char16_t>& aStart,
MOZ_ASSERT(aStart.get() <= aEnd.get());
return static_cast<size_t>(aEnd.get() - aStart.get());
}
inline size_t
Distance(const nsReadingIterator<char>& aStart,
const nsReadingIterator<char>& aEnd)
@ -39,65 +202,185 @@ Distance(const nsReadingIterator<char>& aStart,
return static_cast<size_t>(aEnd.get() - aStart.get());
}
void LossyCopyUTF16toASCII(const nsAString& aSource, nsACString& aDest);
void CopyASCIItoUTF16(const nsACString& aSource, nsAString& aDest);
MOZ_MUST_USE bool CopyASCIItoUTF16(const nsACString& aSource, nsAString& aDest,
const mozilla::fallible_t&);
// UTF-8 to UTF-16
// Invalid UTF-8 byte sequences are replaced with the REPLACEMENT CHARACTER.
void LossyCopyUTF16toASCII(const char16ptr_t aSource, nsACString& aDest);
void CopyASCIItoUTF16(const char* aSource, nsAString& aDest);
// Fallible UTF-8 to UTF-16 copy into aDest. Delegates to the append helper
// with an old length of 0 (the Append variant passes aDest.Length() instead).
// Returns the helper's result; the infallible overload treats false as an
// allocation failure.
inline MOZ_MUST_USE bool
CopyUTF8toUTF16(mozilla::Span<const char> aSource,
                nsAString& aDest,
                const mozilla::fallible_t&)
{
  const char* const src = aSource.Elements();
  const size_t srcLen = aSource.Length();
  return nsstring_fallible_append_utf8_impl(&aDest, src, srcLen, 0);
}
void CopyUTF16toUTF8(const nsAString& aSource, nsACString& aDest);
MOZ_MUST_USE bool CopyUTF16toUTF8(const nsAString& aSource, nsACString& aDest,
const mozilla::fallible_t&);
void CopyUTF8toUTF16(const nsACString& aSource, nsAString& aDest);
// Infallible UTF-8 to UTF-16 copy into aDest. Runs the fallible overload and,
// if it reports failure, calls aDest.AllocFailed() with the source length.
inline void
CopyUTF8toUTF16(mozilla::Span<const char> aSource, nsAString& aDest)
{
  const bool ok = CopyUTF8toUTF16(aSource, aDest, mozilla::fallible);
  if (MOZ_UNLIKELY(!ok)) {
    aDest.AllocFailed(aSource.Length());
  }
}
void CopyUTF16toUTF8(const char16ptr_t aSource, nsACString& aDest);
void CopyUTF8toUTF16(const char* aSource, nsAString& aDest);
// Fallible UTF-8 to UTF-16 append onto aDest. Delegates to the append helper,
// passing aDest's current length as the old length. Returns the helper's
// result; the infallible overload treats false as an allocation failure.
inline MOZ_MUST_USE bool
AppendUTF8toUTF16(mozilla::Span<const char> aSource,
                  nsAString& aDest,
                  const mozilla::fallible_t&)
{
  const char* const src = aSource.Elements();
  const size_t srcLen = aSource.Length();
  const size_t oldLen = aDest.Length();
  return nsstring_fallible_append_utf8_impl(&aDest, src, srcLen, oldLen);
}
void LossyAppendUTF16toASCII(const nsAString& aSource, nsACString& aDest);
void AppendASCIItoUTF16(const nsACString& aSource, nsAString& aDest);
MOZ_MUST_USE bool AppendASCIItoUTF16(const nsACString& aSource,
nsAString& aDest,
const mozilla::fallible_t&);
// Infallible UTF-8 to UTF-16 append onto aDest. Runs the fallible overload
// and, if it reports failure, calls aDest.AllocFailed() with the sum of the
// destination and source lengths.
inline void
AppendUTF8toUTF16(mozilla::Span<const char> aSource, nsAString& aDest)
{
  const bool ok = AppendUTF8toUTF16(aSource, aDest, mozilla::fallible);
  if (MOZ_UNLIKELY(!ok)) {
    aDest.AllocFailed(aDest.Length() + aSource.Length());
  }
}
void LossyAppendUTF16toASCII(const char16ptr_t aSource, nsACString& aDest);
MOZ_MUST_USE bool AppendASCIItoUTF16(const char* aSource,
nsAString& aDest,
const mozilla::fallible_t&);
void AppendASCIItoUTF16(const char* aSource, nsAString& aDest);
// Latin1 to UTF-16
// Interpret each incoming unsigned byte value as a Unicode scalar value (not
// windows-1252!). The function names say "ASCII" instead of "Latin1" for
// legacy reasons.
void AppendUTF16toUTF8(const nsAString& aSource, nsACString& aDest);
MOZ_MUST_USE bool AppendUTF16toUTF8(const nsAString& aSource,
nsACString& aDest,
const mozilla::fallible_t&);
void AppendUTF8toUTF16(const nsACString& aSource, nsAString& aDest);
MOZ_MUST_USE bool AppendUTF8toUTF16(const nsACString& aSource,
nsAString& aDest,
const mozilla::fallible_t&);
// Fallible Latin1 to UTF-16 copy into aDest (the name says "ASCII" for
// legacy reasons). Delegates to the Latin1 append helper with an old length
// of 0. Returns the helper's result; the infallible overload treats false as
// an allocation failure.
inline MOZ_MUST_USE bool
CopyASCIItoUTF16(mozilla::Span<const char> aSource,
                 nsAString& aDest,
                 const mozilla::fallible_t&)
{
  const char* const src = aSource.Elements();
  const size_t srcLen = aSource.Length();
  return nsstring_fallible_append_latin1_impl(&aDest, src, srcLen, 0);
}
void AppendUTF16toUTF8(const char16ptr_t aSource, nsACString& aDest);
void AppendUTF8toUTF16(const char* aSource, nsAString& aDest);
// Infallible Latin1 to UTF-16 copy into aDest. Runs the fallible overload
// and, if it reports failure, calls aDest.AllocFailed() with the source
// length.
inline void
CopyASCIItoUTF16(mozilla::Span<const char> aSource, nsAString& aDest)
{
  const bool ok = CopyASCIItoUTF16(aSource, aDest, mozilla::fallible);
  if (MOZ_UNLIKELY(!ok)) {
    aDest.AllocFailed(aSource.Length());
  }
}
// Fallible Latin1 to UTF-16 append onto aDest (the name says "ASCII" for
// legacy reasons). Delegates to the Latin1 append helper, passing aDest's
// current length as the old length. Returns the helper's result.
inline MOZ_MUST_USE bool
AppendASCIItoUTF16(mozilla::Span<const char> aSource,
                   nsAString& aDest,
                   const mozilla::fallible_t&)
{
  const char* const src = aSource.Elements();
  const size_t srcLen = aSource.Length();
  const size_t oldLen = aDest.Length();
  return nsstring_fallible_append_latin1_impl(&aDest, src, srcLen, oldLen);
}
// Infallible Latin1 to UTF-16 append onto aDest. Runs the fallible overload
// and, if it reports failure, calls aDest.AllocFailed() with the sum of the
// destination and source lengths.
inline void
AppendASCIItoUTF16(mozilla::Span<const char> aSource, nsAString& aDest)
{
  const bool ok = AppendASCIItoUTF16(aSource, aDest, mozilla::fallible);
  if (MOZ_UNLIKELY(!ok)) {
    aDest.AllocFailed(aDest.Length() + aSource.Length());
  }
}
// UTF-16 to UTF-8
// Unpaired surrogates are replaced with the REPLACEMENT CHARACTER.
// Fallible UTF-16 to UTF-8 copy into aDest. Delegates to the UTF-16-to-UTF-8
// append helper with an old length of 0. Returns the helper's result; the
// infallible overload treats false as an allocation failure.
inline MOZ_MUST_USE bool
CopyUTF16toUTF8(mozilla::Span<const char16_t> aSource,
                nsACString& aDest,
                const mozilla::fallible_t&)
{
  const char16_t* const src = aSource.Elements();
  const size_t srcLen = aSource.Length();
  return nscstring_fallible_append_utf16_to_utf8_impl(&aDest, src, srcLen, 0);
}
// Infallible UTF-16 to UTF-8 copy into aDest. Runs the fallible overload and,
// if it reports failure, calls aDest.AllocFailed() with the source length.
inline void
CopyUTF16toUTF8(mozilla::Span<const char16_t> aSource, nsACString& aDest)
{
  const bool ok = CopyUTF16toUTF8(aSource, aDest, mozilla::fallible);
  if (MOZ_UNLIKELY(!ok)) {
    aDest.AllocFailed(aSource.Length());
  }
}
// Fallible UTF-16 to UTF-8 append onto aDest. Delegates to the
// UTF-16-to-UTF-8 append helper, passing aDest's current length as the old
// length. Returns the helper's result.
inline MOZ_MUST_USE bool
AppendUTF16toUTF8(mozilla::Span<const char16_t> aSource,
                  nsACString& aDest,
                  const mozilla::fallible_t&)
{
  const char16_t* const src = aSource.Elements();
  const size_t srcLen = aSource.Length();
  const size_t oldLen = aDest.Length();
  return nscstring_fallible_append_utf16_to_utf8_impl(
    &aDest, src, srcLen, oldLen);
}
// Infallible UTF-16 to UTF-8 append onto aDest. Runs the fallible overload
// and, if it reports failure, calls aDest.AllocFailed() with the sum of the
// destination and source lengths.
inline void
AppendUTF16toUTF8(mozilla::Span<const char16_t> aSource, nsACString& aDest)
{
  const bool ok = AppendUTF16toUTF8(aSource, aDest, mozilla::fallible);
  if (MOZ_UNLIKELY(!ok)) {
    aDest.AllocFailed(aDest.Length() + aSource.Length());
  }
}
// UTF-16 to Latin1
// If all code points in the input are below U+0100, represents each scalar
// value as an unsigned byte. (This is not windows-1252!) If there are code
// points above U+00FF, memory-safely produces garbage and will likely start
// asserting in future debug builds. The nature of the garbage may differ
// based on CPU architecture and must not be relied upon. The names say
// "ASCII" instead of "Latin1" for legacy reasons.
// Fallible lossy UTF-16 to Latin1 copy into aDest (the name says "ASCII" for
// legacy reasons). Delegates to the lossy UTF-16-to-Latin1 append helper with
// an old length of 0. Returns the helper's result; the infallible overload
// treats false as an allocation failure.
inline MOZ_MUST_USE bool
LossyCopyUTF16toASCII(mozilla::Span<const char16_t> aSource,
                      nsACString& aDest,
                      const mozilla::fallible_t&)
{
  const char16_t* const src = aSource.Elements();
  const size_t srcLen = aSource.Length();
  return nscstring_fallible_append_utf16_to_latin1_lossy_impl(
    &aDest, src, srcLen, 0);
}
// Infallible lossy UTF-16 to Latin1 copy into aDest. Runs the fallible
// overload and, if it reports failure, calls aDest.AllocFailed() with the
// source length.
inline void
LossyCopyUTF16toASCII(mozilla::Span<const char16_t> aSource, nsACString& aDest)
{
  const bool ok = LossyCopyUTF16toASCII(aSource, aDest, mozilla::fallible);
  if (MOZ_UNLIKELY(!ok)) {
    aDest.AllocFailed(aSource.Length());
  }
}
// Fallible lossy UTF-16 to Latin1 append onto aDest (the name says "ASCII"
// for legacy reasons). Delegates to the lossy UTF-16-to-Latin1 append helper,
// passing aDest's current length as the old length. Returns the helper's
// result.
inline MOZ_MUST_USE bool
LossyAppendUTF16toASCII(mozilla::Span<const char16_t> aSource,
                        nsACString& aDest,
                        const mozilla::fallible_t&)
{
  const char16_t* const src = aSource.Elements();
  const size_t srcLen = aSource.Length();
  const size_t oldLen = aDest.Length();
  return nscstring_fallible_append_utf16_to_latin1_lossy_impl(
    &aDest, src, srcLen, oldLen);
}
// Infallible lossy UTF-16 to Latin1 append onto aDest. Runs the fallible
// overload and, if it reports failure, calls aDest.AllocFailed() with the sum
// of the destination and source lengths.
inline void
LossyAppendUTF16toASCII(mozilla::Span<const char16_t> aSource,
                        nsACString& aDest)
{
  const bool ok = LossyAppendUTF16toASCII(aSource, aDest, mozilla::fallible);
  if (MOZ_UNLIKELY(!ok)) {
    aDest.AllocFailed(aDest.Length() + aSource.Length());
  }
}
/**
* Returns a new |char| buffer containing a zero-terminated copy of |aSource|.
*
* Allocates and returns a new |char| buffer which you must free with |free|.
* Performs a lossy encoding conversion by chopping 16-bit wide characters down to 8-bits wide while copying |aSource| to your new buffer.
* This conversion is not well defined; but it reproduces legacy string behavior.
* The new buffer is zero-terminated, but that may not help you if |aSource| contains embedded nulls.
* Performs a conversion with LossyConvertUTF16toLatin1() writing into the
* newly-allocated buffer.
*
* The new buffer is zero-terminated, but that may not help you if |aSource|
* contains embedded nulls.
*
* @param aSource a 16-bit wide string
* @return a new |char| buffer you must free with |free|.
*/
char* ToNewCString(const nsAString& aSource);
/**
* Returns a new |char| buffer containing a zero-terminated copy of |aSource|.
*
* Allocates and returns a new |char| buffer which you must free with |free|.
* The new buffer is zero-terminated, but that may not help you if |aSource| contains embedded nulls.
*
* The new buffer is zero-terminated, but that may not help you if |aSource|
* contains embedded nulls.
*
* @param aSource an 8-bit wide string
* @return a new |char| buffer you must free with |free|.
@ -109,8 +392,10 @@ char* ToNewCString(const nsACString& aSource);
*
* Allocates and returns a new |char| buffer which you must free with
* |free|.
* Performs an encoding conversion from a UTF-16 string to a UTF-8 string
* copying |aSource| to your new buffer.
* Performs an encoding conversion from a UTF-16 string to a UTF-8 string with
* unpaired surrogates replaced with the REPLACEMENT CHARACTER copying
* |aSource| to your new buffer.
*
* The new buffer is zero-terminated, but that may not help you if |aSource|
* contains embedded nulls.
*
@ -128,6 +413,7 @@ char* ToNewUTF8String(const nsAString& aSource, uint32_t* aUTF8Count = nullptr);
*
* Allocates and returns a new |char16_t| buffer which you must free with
* |free|.
*
* The new buffer is zero-terminated, but that may not help you if |aSource|
* contains embedded nulls.
*
@ -138,59 +424,35 @@ char16_t* ToNewUnicode(const nsAString& aSource);
/**
* Returns a new |char16_t| buffer containing a zero-terminated copy of |aSource|.
* Returns a new |char16_t| buffer containing a zero-terminated copy of
* |aSource|.
*
* Allocates and returns a new |char16_t| buffer which you must free with |free|.
* Performs an encoding conversion by 0-padding 8-bit wide characters up to 16-bits wide while copying |aSource| to your new buffer.
* This conversion is not well defined; but it reproduces legacy string behavior.
* The new buffer is zero-terminated, but that may not help you if |aSource| contains embedded nulls.
* Allocates and returns a new |char16_t| buffer which you must free with
* |free|.
*
* @param aSource an 8-bit wide string (a C-string, NOT UTF-8)
* Performs an encoding conversion by 0-padding 8-bit wide characters up to
* 16-bits wide (i.e. Latin1 to UTF-16 conversion) while copying |aSource|
* to your new buffer.
*
* The new buffer is zero-terminated, but that may not help you if |aSource|
* contains embedded nulls.
*
* @param aSource a Latin1 string
* @return a new |char16_t| buffer you must free with |free|.
*/
char16_t* ToNewUnicode(const nsACString& aSource);
/**
* Returns the required length for a char16_t buffer holding
* a copy of aSource, using UTF-8 to UTF-16 conversion.
* The length does NOT include any space for zero-termination.
*
* @param aSource an 8-bit wide string, UTF-8 encoded
* @return length of UTF-16 encoded string copy, not zero-terminated
*/
uint32_t CalcUTF8ToUnicodeLength(const nsACString& aSource);
/**
* Copies the source string into the specified buffer, converting UTF-8 to
* UTF-16 in the process. The conversion is well defined for valid UTF-8
* strings.
* The copied string will be zero-terminated! Any embedded nulls will be
* copied nonetheless. It is the caller's responsiblity to ensure the buffer
* is large enough to hold the string copy plus one char16_t for
* zero-termination!
*
* @see CalcUTF8ToUnicodeLength( const nsACString& )
* @see UTF8ToNewUnicode( const nsACString&, uint32_t* )
*
* @param aSource an 8-bit wide string, UTF-8 encoded
* @param aBuffer the buffer holding the converted string copy
* @param aUTF16Count receiving optionally the number of 16-bit units that
* were copied
* @return aBuffer pointer, for convenience
*/
char16_t* UTF8ToUnicodeBuffer(const nsACString& aSource,
char16_t* aBuffer,
uint32_t* aUTF16Count = nullptr);
/**
* Returns a new |char16_t| buffer containing a zero-terminated copy
* of |aSource|.
*
* Allocates and returns a new |char| buffer which you must free with
* |free|. Performs an encoding conversion from UTF-8 to UTF-16
* while copying |aSource| to your new buffer. This conversion is well defined
* for a valid UTF-8 string. The new buffer is zero-terminated, but that
* may not help you if |aSource| contains embedded nulls.
* while copying |aSource| to your new buffer. Malformed byte sequences
* are replaced with the REPLACEMENT CHARACTER.
*
* The new buffer is zero-terminated, but that may not help you if |aSource|
* contains embedded nulls.
*
* @param aSource an 8-bit wide string, UTF-8 encoded
* @param aUTF16Count the number of 16-bit units that was returned
@ -217,68 +479,136 @@ char16_t* CopyUnicodeTo(const nsAString& aSource,
char16_t* aDest,
uint32_t aLength);
/**
* Copies 16-bit characters between iterators |aSrcStart| and
* |aSrcEnd| to the writable string |aDest|. Similar to the
* |nsString::Mid| method.
*
* After this operation |aDest| is not null terminated.
*
* @param aSrcStart start source iterator
* @param aSrcEnd end source iterator
* @param aDest destination for the copy
*/
void CopyUnicodeTo(const nsAString::const_iterator& aSrcStart,
const nsAString::const_iterator& aSrcEnd,
nsAString& aDest);
/**
* Appends 16-bit characters between iterators |aSrcStart| and
* |aSrcEnd| to the writable string |aDest|.
*
* After this operation |aDest| is not null terminated.
*
* @param aSrcStart start source iterator
* @param aSrcEnd end source iterator
* @param aDest destination for the copy
*/
void AppendUnicodeTo(const nsAString::const_iterator& aSrcStart,
const nsAString::const_iterator& aSrcEnd,
nsAString& aDest);
/**
* Returns |true| if |aString| contains only ASCII characters, that is, characters in the range (0x00, 0x7F).
* Returns |true| if |aString| contains only ASCII characters, that is,
* characters in the range (0x00, 0x7F).
*
* @param aString a 16-bit wide string to scan
*/
bool IsASCII(const nsAString& aString);
/**
* Returns |true| if |aString| contains only ASCII characters, that is, characters in the range (0x00, 0x7F).
*
* @param aString a 8-bit wide string to scan
*/
inline bool IsASCII(const nsACString& aString)
inline bool
IsASCII(mozilla::Span<const char16_t> aString)
{
size_t length = aString.Length();
const uint8_t* ptr = reinterpret_cast<const uint8_t*>(aString.BeginReading());
const char16_t* ptr = aString.Elements();
// For short strings, calling into Rust is a pessimization, and the SIMD
// code won't have a chance to kick in anyway. Additionally, handling the
// case of the empty string here makes null-checking ptr unnecessary.
// (Passing nullptr to Rust would technically be UB.)
// code won't have a chance to kick in anyway.
if (length < 16) {
size_t accu = 0;
char16_t accu = 0;
for (size_t i = 0; i < length; i++) {
accu |= ptr[i];
}
return accu < 0x80;
return accu < 0x80U;
}
// This is not quite optimal, because it's not fail-fast when the by-register
// check already finds non-ASCII. Also, input to this function is almost
// always ASCII, so even the by-register check wouldn't need to be fail-fast
// and could be more like the loop above.
return length == encoding_ascii_valid_up_to(ptr, length);
return encoding_mem_is_basic_latin(ptr, length);
}
/**
 * Checks whether every code unit of |aString| is ASCII, that is, lies in the
 * inclusive range 0x00 to 0x7F. An empty span counts as ASCII.
 *
 * @param aString an 8-bit wide string to scan
 * @return |true| if no byte of |aString| is 0x80 or greater
 */
inline bool
IsASCII(mozilla::Span<const char> aString)
{
  const size_t byteCount = aString.Length();
  const uint8_t* bytes = reinterpret_cast<const uint8_t*>(aString.Elements());
  if (byteCount >= 16) {
    // Long enough for the call into Rust (and its SIMD code) to pay off.
    return encoding_mem_is_ascii(bytes, byteCount);
  }
  // Short input: calling into Rust would be a pessimization, so OR all the
  // bytes together and inspect the high bit once at the end.
  uint8_t combined = 0;
  for (size_t pos = 0; pos < byteCount; ++pos) {
    combined |= bytes[pos];
  }
  return (combined & 0x80U) == 0;
}
/**
 * Checks whether every code unit of |aString| is in the Latin1 range, that
 * is, lies in the inclusive range U+0000 to U+00FF. An empty span counts as
 * Latin1.
 *
 * @param aString a potentially-invalid UTF-16 string to scan
 * @return |true| if no code unit of |aString| is 0x100 or greater
 */
inline bool
IsUTF16Latin1(mozilla::Span<const char16_t> aString)
{
  const size_t unitCount = aString.Length();
  const char16_t* units = aString.Elements();
  if (unitCount >= 16) {
    // Long enough for the call into Rust (and its SIMD code) to pay off.
    return encoding_mem_is_utf16_latin1(units, unitCount);
  }
  // Short input: calling into Rust would be a pessimization, so OR all the
  // code units together and inspect the upper byte once at the end.
  char16_t combined = 0;
  for (size_t pos = 0; pos < unitCount; ++pos) {
    combined |= units[pos];
  }
  return (combined & 0xFF00U) == 0;
}
/**
 * Checks whether |aString| contains only Latin1 characters, that is,
 * characters in the inclusive range U+0000 to U+00FF.
 *
 * If you know that the argument is always absolutely guaranteed to be valid
 * UTF-8, use the faster UnsafeIsValidUTF8Latin1() instead.
 *
 * @param aString potentially-invalid UTF-8 string to scan
 * @return |true| for an all-ASCII short string; otherwise whatever
 *         encoding_mem_is_utf8_latin1() reports for the examined bytes
 */
inline bool
IsUTF8Latin1(mozilla::Span<const char> aString)
{
  size_t remaining = aString.Length();
  const uint8_t* cursor = reinterpret_cast<const uint8_t*>(aString.Elements());
  // For short strings, calling into Rust is a pessimization, and the SIMD
  // code won't have a chance to kick in anyway, so skip the ASCII prefix here.
  if (remaining < 16) {
    while (remaining != 0 && *cursor < 0x80U) {
      ++cursor;
      --remaining;
    }
    if (remaining == 0) {
      // Every byte was ASCII (or the string was empty).
      return true;
    }
    // The loop above can't handle non-ASCII, but the Rust code can, so fall
    // through and let it examine the rest of the buffer, starting at the
    // non-ASCII byte.
  }
  return encoding_mem_is_utf8_latin1(cursor, remaining);
}
/**
 * Checks whether |aString| contains only Latin1 characters, that is,
 * characters in the inclusive range U+0000 to U+00FF.
 *
 * The argument MUST be valid UTF-8. If you are at all unsure, use IsUTF8Latin1
 * instead!
 *
 * @param aString known-valid UTF-8 string to scan
 * @return |true| for an all-ASCII short string; otherwise whatever
 *         encoding_mem_is_str_latin1() reports for the examined bytes
 */
inline bool
UnsafeIsValidUTF8Latin1(mozilla::Span<const char> aString)
{
  size_t remaining = aString.Length();
  const uint8_t* cursor = reinterpret_cast<const uint8_t*>(aString.Elements());
  // For short strings, calling into Rust is a pessimization, and the SIMD
  // code won't have a chance to kick in anyway, so skip the ASCII prefix here.
  if (remaining < 16) {
    while (remaining != 0 && *cursor < 0x80U) {
      ++cursor;
      --remaining;
    }
    if (remaining == 0) {
      // Every byte was ASCII (or the string was empty).
      return true;
    }
    // Hand the rest of the buffer, starting at the first non-ASCII byte, to
    // the Rust code.
  }
  return encoding_mem_is_str_latin1(cursor, remaining);
}
/**
@ -289,17 +619,16 @@ inline bool IsASCII(const nsACString& aString)
*
* @param aString an 8-bit wide string to scan
*/
inline bool IsUTF8(const nsACString& aString)
inline bool
IsUTF8(mozilla::Span<const char> aString)
{
size_t length = aString.Length();
const uint8_t* ptr = reinterpret_cast<const uint8_t*>(aString.BeginReading());
const uint8_t* ptr = reinterpret_cast<const uint8_t*>(aString.Elements());
// For short strings, calling into Rust is a pessimization, and the SIMD
// code won't have a chance to kick in anyway. Additionally, handling the
// case of the empty string here makes null-checking ptr unnecessary.
// (Passing nullptr to Rust would technically be UB.)
// code won't have a chance to kick in anyway.
if (length < 16) {
for (size_t i = 0; i < length; i++) {
if (ptr[i] >= 0x80) {
if (ptr[i] >= 0x80U) {
ptr += i;
length -= i;
goto end;
@ -333,12 +662,16 @@ void ToUpperCase(const nsACString& aSource, nsACString& aDest);
void ToLowerCase(const nsACString& aSource, nsACString& aDest);
/**
* Finds the leftmost occurrence of |aPattern|, if any in the range |aSearchStart|..|aSearchEnd|.
* Finds the leftmost occurrence of |aPattern|, if any in the range
* |aSearchStart|..|aSearchEnd|.
*
* Returns |true| if a match was found, and adjusts |aSearchStart| and |aSearchEnd| to
* point to the match. If no match was found, returns |false| and makes |aSearchStart == aSearchEnd|.
* Returns |true| if a match was found, and adjusts |aSearchStart| and
* |aSearchEnd| to point to the match. If no match was found, returns |false|
* and makes |aSearchStart == aSearchEnd|.
*
* Currently, this is equivalent to the O(m*n) implementation previously on
* |ns[C]String|.
*
* Currently, this is equivalent to the O(m*n) implementation previously on |ns[C]String|.
* If we need something faster, then we can implement that later.
*/
@ -378,9 +711,9 @@ bool CaseInsensitiveFindInReadable(const nsACString& aPattern,
/**
* Finds the rightmost occurrence of |aPattern|
* Returns |true| if a match was found, and adjusts |aSearchStart| and |aSearchEnd| to
* point to the match. If no match was found, returns |false| and makes |aSearchStart == aSearchEnd|.
*
* Returns |true| if a match was found, and adjusts |aSearchStart| and
* |aSearchEnd| to point to the match. If no match was found, returns |false|
* and makes |aSearchStart == aSearchEnd|.
*/
bool RFindInReadable(const nsAString& aPattern, nsAString::const_iterator&,
nsAString::const_iterator&,
@ -430,15 +763,18 @@ const nsString& VoidString();
const nsCString& VoidCString();
/**
* Compare a UTF-8 string to an UTF-16 string.
*
* Returns 0 if the strings are equal, -1 if aUTF8String is less
* than aUTF16String, and 1 in the reverse case. In case of fatal
* error (eg the strings are not valid UTF8 and UTF16 respectively),
* this method will return INT32_MIN.
*/
int32_t CompareUTF8toUTF16(const nsACString& aUTF8String,
const nsAString& aUTF16String);
* Compare a UTF-8 string to an UTF-16 string.
*
* Returns 0 if the strings are equal, -1 if aUTF8String is less
* than aUTF16String, and 1 in the reverse case. Errors are replaced
* with U+FFFD and then the U+FFFD is compared as if it had occurred
* in the input. If aErr is not nullptr, *aErr is set to true if
* either string had malformed sequences.
*/
int32_t
CompareUTF8toUTF16(const nsACString& aUTF8String,
const nsAString& aUTF16String,
bool* aErr = nullptr);
void AppendUCS4ToUTF16(const uint32_t aSource, nsAString& aDest);

View file

@ -1,54 +0,0 @@
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include <stdint.h>
namespace mozilla {
/**
 * Returns |true| if the UTF-16 code unit |aChar| is ASCII, i.e. none of the
 * bits above the low seven are set.
 */
inline bool IsASCII(char16_t aChar) {
  return aChar < 0x80;
}
/**
 * Rounds |aPtr| down to an alignment boundary.
 *
 * @param aPtr  the pointer to round down
 * @param aMask the address bits to clear (e.g. 0xf for 16-byte alignment)
 * @return a pointer before or equal to |aPtr| whose address has every bit of
 *         |aMask| cleared
 */
inline const char16_t* aligned(const char16_t* aPtr, const uintptr_t aMask)
{
  const uintptr_t address = reinterpret_cast<uintptr_t>(aPtr);
  const uintptr_t roundedDown = address & ~aMask;
  return reinterpret_cast<const char16_t*>(roundedDown);
}
/**
 * Constants for word-at-a-time ASCII checking of UTF-16 strings, selected by
 * the size in bytes of the machine word (4 or 8):
 *
 *  - mask():               bit pattern with 0xff80 repeated for every
 *                          char16_t in a word; a word ANDed with it is
 *                          non-zero iff some code unit in it is non-ASCII
 *  - alignMask():          low address bits that must be zero for a
 *                          word-aligned pointer
 *  - numUnicharsPerWord(): how many char16_t code units fit in one word
 */
template<size_t size> struct NonASCIIParameters;

template<>
struct NonASCIIParameters<4>
{
  static inline size_t mask() { return 0xff80ff80; }
  static inline uintptr_t alignMask() { return 0x3; }
  static inline size_t numUnicharsPerWord() { return 2; }
};

template<>
struct NonASCIIParameters<8>
{
  static inline size_t mask()
  {
    // The 64-bit value has to be cast to size_t explicitly, or else compilers
    // for 32-bit platforms will warn about it being too large to fit in the
    // size_t return type. (Fortunately, this code isn't actually invoked on
    // 32-bit platforms -- they'll use the <4> specialization above. So it is,
    // in fact, OK that this value is too large for a 32-bit size_t.)
    return static_cast<size_t>(UINT64_C(0xff80ff80ff80ff80));
  }
  static inline uintptr_t alignMask() { return 0x7; }
  static inline size_t numUnicharsPerWord() { return 4; }
};
namespace SSE2 {
int32_t FirstNonASCII(const char16_t* aBegin, const char16_t* aEnd);
} // namespace SSE2
} // namespace mozilla

View file

@ -1,70 +0,0 @@
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include <emmintrin.h>
#include "nsReadableUtilsImpl.h"
namespace mozilla {
namespace SSE2 {
/**
 * Returns |true| if all 16 bytes of the vector |x| are zero.
 */
static inline bool
is_zero (__m128i x)
{
  // Each byte lane becomes 0xff where x is zero; movemask then gathers the
  // top bit of every lane into the low 16 bits of an int.
  const __m128i zeroByteFlags = _mm_cmpeq_epi8(x, _mm_setzero_si128());
  const int flagBits = _mm_movemask_epi8(zeroByteFlags);
  return flagBits == 0xffff;
}
/**
 * Scans the UTF-16 code units in [aBegin, aEnd) for non-ASCII content.
 *
 * The scan proceeds in four phases: single code units until |idx| reaches a
 * 16-byte boundary, then one XMM register at a time, then one machine word at
 * a time, then the remaining code units one by one.
 *
 * @param aBegin start of the range to scan
 * @param aEnd   one past the last code unit to scan
 * @return -1 if every code unit is ASCII. Otherwise an offset from |aBegin|
 *         such that all code units before it are ASCII: the exact offset of
 *         the non-ASCII code unit when it is found by a per-unit phase, or
 *         the offset of the start of the vector/word that contains it when
 *         it is found by a vector or word phase.
 */
int32_t
FirstNonASCII(const char16_t* aBegin, const char16_t* aEnd)
{
  const size_t kNumUnicharsPerVector = sizeof(__m128i) / sizeof(char16_t);
  typedef NonASCIIParameters<sizeof(size_t)> p;
  const size_t kMask = p::mask();
  const uintptr_t kXmmAlignMask = 0xf;
  const uint16_t kShortMask = 0xff80;
  const size_t kNumUnicharsPerWord = p::numUnicharsPerWord();

  const char16_t* idx = aBegin;

  // Align ourselves to a 16-byte boundary as required by _mm_load_si128
  for (; idx != aEnd && ((uintptr_t(idx) & kXmmAlignMask) != 0); idx++) {
    if (!IsASCII(*idx)) {
      return idx - aBegin;
    }
  }

  // If the input ended before a 16-byte boundary was reached, |idx| is still
  // unaligned and lies past aligned(aEnd, kXmmAlignMask). The
  // |idx != vectWalkEnd| loop below would then never terminate and would read
  // past the end of the buffer, so stop here: everything was ASCII.
  if (idx == aEnd) {
    return -1;
  }

  // From here on |idx| is 16-byte aligned and idx < aEnd, so each of the
  // rounded-down end pointers below is at or after |idx| and reachable in
  // whole steps.

  // Check one XMM register (16 bytes) at a time.
  const char16_t* vectWalkEnd = aligned(aEnd, kXmmAlignMask);
  __m128i vectmask = _mm_set1_epi16(static_cast<int16_t>(kShortMask));
  for (; idx != vectWalkEnd; idx += kNumUnicharsPerVector) {
    const __m128i vect = *reinterpret_cast<const __m128i*>(idx);
    if (!is_zero(_mm_and_si128(vect, vectmask))) {
      return idx - aBegin;
    }
  }

  // Check one word at a time.
  const char16_t* wordWalkEnd = aligned(aEnd, p::alignMask());
  for (; idx != wordWalkEnd; idx += kNumUnicharsPerWord) {
    const size_t word = *reinterpret_cast<const size_t*>(idx);
    if (word & kMask) {
      return idx - aBegin;
    }
  }

  // Take care of the remainder one character at a time.
  for (; idx != aEnd; idx++) {
    if (!IsASCII(*idx)) {
      return idx - aBegin;
    }
  }

  return -1;
}
} // namespace SSE2
} // namespace mozilla

View file

@ -47,7 +47,7 @@ class NS_LossyConvertUTF16toASCII : public nsAutoCString
public:
explicit NS_LossyConvertUTF16toASCII(const char16ptr_t aString)
{
LossyAppendUTF16toASCII(aString, *this);
LossyAppendUTF16toASCII(mozilla::MakeStringSpan(aString), *this);
}
NS_LossyConvertUTF16toASCII(const char16ptr_t aString, uint32_t aLength)
@ -71,7 +71,7 @@ class NS_ConvertASCIItoUTF16 : public nsAutoString
public:
explicit NS_ConvertASCIItoUTF16(const char* aCString)
{
AppendASCIItoUTF16(aCString, *this);
AppendASCIItoUTF16(mozilla::MakeStringSpan(aCString), *this);
}
NS_ConvertASCIItoUTF16(const char* aCString, uint32_t aLength)
@ -98,7 +98,7 @@ class NS_ConvertUTF16toUTF8 : public nsAutoCString
public:
explicit NS_ConvertUTF16toUTF8(const char16ptr_t aString)
{
AppendUTF16toUTF8(aString, *this);
AppendUTF16toUTF8(mozilla::MakeStringSpan(aString), *this);
}
NS_ConvertUTF16toUTF8(const char16ptr_t aString, uint32_t aLength)
@ -122,7 +122,7 @@ class NS_ConvertUTF8toUTF16 : public nsAutoString
public:
explicit NS_ConvertUTF8toUTF16(const char* aCString)
{
AppendUTF8toUTF16(aCString, *this);
AppendUTF8toUTF16(mozilla::MakeStringSpan(aCString), *this);
}
NS_ConvertUTF8toUTF16(const char* aCString, uint32_t aLength)

View file

@ -459,6 +459,15 @@ char* Gecko_FallibleBeginWritingCString(nsACString* aThis)
return aThis->BeginWriting(mozilla::fallible);
}
/**
 * FFI entry point forwarding to nsACString::StartBulkWrite().
 *
 * @return the value carried by the StartBulkWrite() result, or UINT32_MAX if
 *         StartBulkWrite() returned an error
 */
uint32_t
Gecko_StartBulkWriteCString(nsACString* aThis,
                            uint32_t aCapacity,
                            uint32_t aUnitsToPreserve,
                            bool aAllowShrinking)
{
  mozilla::Result<uint32_t, nsresult> outcome =
    aThis->StartBulkWrite(aCapacity, aUnitsToPreserve, aAllowShrinking);
  // UINT32_MAX is the in-band error marker for callers of this function.
  return outcome.unwrapOr(UINT32_MAX);
}
void Gecko_FinalizeString(nsAString* aThis)
{
aThis->~nsAString();
@ -514,4 +523,13 @@ char16_t* Gecko_FallibleBeginWritingString(nsAString* aThis)
return aThis->BeginWriting(mozilla::fallible);
}
/**
 * FFI entry point forwarding to nsAString::StartBulkWrite().
 *
 * @return the value carried by the StartBulkWrite() result, or UINT32_MAX if
 *         StartBulkWrite() returned an error
 */
uint32_t
Gecko_StartBulkWriteString(nsAString* aThis,
                           uint32_t aCapacity,
                           uint32_t aUnitsToPreserve,
                           bool aAllowShrinking)
{
  mozilla::Result<uint32_t, nsresult> outcome =
    aThis->StartBulkWrite(aCapacity, aUnitsToPreserve, aAllowShrinking);
  // UINT32_MAX is the in-band error marker for callers of this function.
  return outcome.unwrapOr(UINT32_MAX);
}
} // extern "C"

View file

@ -325,14 +325,11 @@ nsTString<T>::ReplaceSubstring(const self_type& aTarget,
// Note that we always allocate at least an this->mLength sized buffer, because the
// rest of the algorithm relies on having access to all of the original
// string. In other words, we over-allocate in the shrinking case.
char_type* oldData;
DataFlags oldFlags;
if (!this->MutatePrep(XPCOM_MAX(this->mLength, newLength.value()), &oldData, &oldFlags))
uint32_t oldLen = this->mLength;
mozilla::Result<uint32_t, nsresult> r =
this->StartBulkWrite(XPCOM_MAX(oldLen, newLength.value()), oldLen);
if (r.isErr()) {
return false;
if (oldData) {
// Copy all of the old data to the new buffer.
char_traits::copy(this->mData, oldData, this->mLength);
::ReleaseData(oldData, oldFlags);
}
if (aTarget.Length() >= aNewValue.Length()) {
@ -370,8 +367,7 @@ nsTString<T>::ReplaceSubstring(const self_type& aTarget,
}
// Adjust the length and make sure the string is null terminated.
this->mLength = newLength.value();
this->mData[this->mLength] = char_type(0);
this->FinishBulkWrite(newLength.value());
return true;
}

View file

@ -12,6 +12,24 @@
#include "nsASCIIMask.h"
// It's not worthwhile to reallocate the buffer and memcpy the
// contents over when the size difference isn't large. With
// power-of-two allocation buckets and 64 as the typical inline
// capacity, considering that above 1000 the performance aspects
// of realloc and memcpy seem to be absorbed, relative to the old
// code, by the performance benefits of the new code being exact,
// we need to choose which transitions of 256 to 128, 512 to 256
// and 1024 to 512 to allow. As a guess, let's pick the middle
// one as the largest potential transition that we forgo. So
// we'll shrink from 1024 bucket to 512 bucket but not from 512
// bucket to 256 bucket. We'll decide by comparing the difference
// of capacities. As bucket differences, the differences are 256
// and 512. Since the capacities have various overheads, we
// can't compare with 256 or 512 exactly but it's easier to
// compare to some number that's between the two, so it's
// far away from either to ignore the overheads.
const uint32_t kNsStringBufferShrinkingThreshold = 384;
using double_conversion::DoubleToStringConverter;
template <typename T>
@ -47,45 +65,71 @@ AsAutoString(const nsTSubstring<T>* aStr)
return static_cast<const nsTAutoString<T>*>(aStr);
}
/**
* this function is called to prepare mData for writing. the given capacity
* indicates the required minimum storage size for mData, in sizeof(char_type)
* increments. this function returns true if the operation succeeds. it also
* returns the old data and old flags members if mData is newly allocated.
* the old data must be released by the caller.
*/
template <typename T>
bool
nsTSubstring<T>::MutatePrep(size_type aCapacity, char_type** aOldData,
DataFlags* aOldDataFlags)
template<typename T>
mozilla::Result<uint32_t, nsresult>
nsTSubstring<T>::StartBulkWrite(size_type aCapacity,
size_type aPrefixToPreserve,
bool aAllowShrinking,
size_type aSuffixLength,
size_type aOldSuffixStart,
size_type aNewSuffixStart)
{
// initialize to no old data
*aOldData = nullptr;
*aOldDataFlags = DataFlags(0);
// Note! Capacity does not include room for the terminating null char.
size_type curCapacity = Capacity();
MOZ_ASSERT(aPrefixToPreserve <= aCapacity,
"Requested preservation of an overlong prefix.");
MOZ_ASSERT(aNewSuffixStart + aSuffixLength <= aCapacity,
"Requesed move of suffix to out-of-bounds location.");
// Can't assert aOldSuffixStart, because mLength may not be valid anymore,
// since this method allows itself to be called more than once.
// If |aCapacity > kMaxCapacity|, then our doubling algorithm may not be
// able to allocate it. Just bail out in cases like that. We don't want
// to be allocating 2GB+ strings anyway.
static_assert((sizeof(nsStringBuffer) & 0x1) == 0,
"bad size for nsStringBuffer");
if (!CheckCapacity(aCapacity)) {
return false;
// If zero capacity is requested, set the string to the special empty
// string.
if (MOZ_UNLIKELY(!aCapacity)) {
::ReleaseData(this->mData, this->mDataFlags);
SetToEmptyBuffer();
return 0;
}
// Note! Capacity() returns 0 when the string is immutable.
size_type curCapacity = Capacity();
// We've established that aCapacity > 0.
// |curCapacity == 0| means that the buffer is immutable or 0-sized, so we
// need to allocate a new buffer. We cannot use the existing buffer even
// though it might be large enough.
if (curCapacity != 0) {
if (aCapacity <= curCapacity) {
this->mDataFlags &= ~DataFlags::VOIDED; // mutation clears voided flag
return true;
}
if (!aAllowShrinking && aCapacity <= curCapacity) {
char_traits::move(this->mData + aNewSuffixStart,
this->mData + aOldSuffixStart,
aSuffixLength);
return curCapacity;
}
if (curCapacity < aCapacity) {
char_type* oldData = this->mData;
DataFlags oldFlags = this->mDataFlags;
char_type* newData;
DataFlags newDataFlags;
size_type newCapacity;
// If this is an nsTAutoStringN, it's possible that we can use the inline
// buffer.
if ((this->mClassFlags & ClassFlags::INLINE) &&
(aCapacity <= AsAutoString(this)->mInlineCapacity)) {
newCapacity = AsAutoString(this)->mInlineCapacity;
newData = (char_type*)AsAutoString(this)->mStorage;
newDataFlags = DataFlags::TERMINATED | DataFlags::INLINE;
} else {
// If |aCapacity > kMaxCapacity|, then our doubling algorithm may not be
// able to allocate it. Just bail out in cases like that. We don't want
// to be allocating 2GB+ strings anyway.
static_assert((sizeof(nsStringBuffer) & 0x1) == 0,
"bad size for nsStringBuffer");
if (MOZ_UNLIKELY(!CheckCapacity(aCapacity))) {
return mozilla::Err(NS_ERROR_OUT_OF_MEMORY);
}
// We increase our capacity so that the allocated buffer grows
// exponentially, which gives us amortized O(1) appending. Below the
// threshold, we use powers-of-two. Above the threshold, we grow by at
@ -113,77 +157,60 @@ nsTSubstring<T>::MutatePrep(size_type aCapacity, char_type** aOldData,
mozilla::RoundUpPow2(aCapacity + neededExtraSpace) - neededExtraSpace;
}
MOZ_ASSERT(XPCOM_MIN(temp, kMaxCapacity) >= aCapacity,
newCapacity = XPCOM_MIN(temp, kMaxCapacity);
MOZ_ASSERT(newCapacity >= aCapacity,
"should have hit the early return at the top");
aCapacity = XPCOM_MIN(temp, kMaxCapacity);
}
//
// several cases:
//
// (1) we have a refcounted shareable buffer (this->mDataFlags &
// DataFlags::REFCOUNTED)
// (2) we have an owned buffer (this->mDataFlags & DataFlags::OWNED)
// (3) we have an inline buffer (this->mDataFlags & DataFlags::INLINE)
// (4) we have a readonly buffer
//
// requiring that we in some cases preserve the data before creating
// a new buffer complicates things just a bit ;-)
//
size_type storageSize = (aCapacity + 1) * sizeof(char_type);
// case #1
if (this->mDataFlags & DataFlags::REFCOUNTED) {
nsStringBuffer* hdr = nsStringBuffer::FromData(this->mData);
if (!hdr->IsReadonly()) {
nsStringBuffer* newHdr = nsStringBuffer::Realloc(hdr, storageSize);
// Avoid shrinking if the new buffer size is close to the old. Note that
// unsigned underflow is defined behavior.
if ((curCapacity - newCapacity) <= kNsStringBufferShrinkingThreshold &&
(this->mDataFlags & DataFlags::REFCOUNTED)) {
MOZ_ASSERT(aAllowShrinking, "How come we didn't return earlier?");
// We're already close enough to the right size.
newData = oldData;
} else {
size_type storageSize = (newCapacity + 1) * sizeof(char_type);
// Since we allocate only by powers of 2 we always fit into a full mozjemalloc
// bucket, it's not useful to use realloc, which may spend time uselessly
// copying too much.
nsStringBuffer* newHdr = nsStringBuffer::Alloc(storageSize).take();
if (!newHdr) {
return false; // out-of-memory (original header left intact)
return mozilla::Err(NS_ERROR_OUT_OF_MEMORY); // we are still in a consistent state
}
hdr = newHdr;
this->mData = (char_type*)hdr->Data();
this->mDataFlags &= ~DataFlags::VOIDED; // mutation clears voided flag
return true;
newData = (char_type*)newHdr->Data();
}
}
char_type* newData;
DataFlags newDataFlags;
// If this is an nsTAutoStringN whose inline buffer is sufficiently large,
// then use it. This helps avoid heap allocations.
if ((this->mClassFlags & ClassFlags::INLINE) &&
(aCapacity < AsAutoString(this)->mInlineCapacity)) {
newData = (char_type*)AsAutoString(this)->mStorage;
newDataFlags = DataFlags::TERMINATED | DataFlags::INLINE;
} else {
// if we reach here then, we must allocate a new buffer. we cannot
// make use of our DataFlags::OWNED or DataFlags::INLINE buffers because
// they are not large enough.
nsStringBuffer* newHdr =
nsStringBuffer::Alloc(storageSize).take();
if (!newHdr) {
return false; // we are still in a consistent state
}
newData = (char_type*)newHdr->Data();
newDataFlags = DataFlags::TERMINATED | DataFlags::REFCOUNTED;
}
// save old data and flags
*aOldData = this->mData;
*aOldDataFlags = this->mDataFlags;
this->mData = newData;
this->mDataFlags = newDataFlags;
// this->mLength does not change
SetData(newData, this->mLength, newDataFlags);
if (oldData == newData) {
char_traits::move(
newData + aNewSuffixStart, oldData + aOldSuffixStart, aSuffixLength);
} else {
char_traits::copy(newData, oldData, aPrefixToPreserve);
char_traits::copy(
newData + aNewSuffixStart, oldData + aOldSuffixStart, aSuffixLength);
::ReleaseData(oldData, oldFlags);
}
// though we are not necessarily terminated at the moment, now is probably
// still the best time to set DataFlags::TERMINATED.
return newCapacity;
}
return true;
/**
 * Sets the string's length to |aLength| and zero-terminates it there. A zero
 * |aLength| instead releases the current buffer and switches to the empty
 * buffer.
 *
 * @param aLength the new length in code units; must not be UINT32_MAX
 */
template<typename T>
void
nsTSubstring<T>::FinishBulkWrite(size_type aLength)
{
  MOZ_ASSERT(aLength != UINT32_MAX, "OOM magic value passed as length.");
  if (!aLength) {
    // Nothing to keep: drop the buffer in favor of the empty buffer.
    ::ReleaseData(this->mData, this->mDataFlags);
    SetToEmptyBuffer();
  } else {
    this->mLength = aLength;
    this->mData[aLength] = char_type(0);
  }
  AssertValid();
}
template <typename T>
@ -225,48 +252,16 @@ bool
nsTSubstring<T>::ReplacePrepInternal(index_type aCutStart, size_type aCutLen,
size_type aFragLen, size_type aNewLen)
{
char_type* oldData;
DataFlags oldFlags;
if (!MutatePrep(aNewLen, &oldData, &oldFlags)) {
return false; // out-of-memory
size_type newSuffixStart = aCutStart + aFragLen;
size_type oldSuffixStart = aCutStart + aCutLen;
size_type suffixLength = this->mLength - oldSuffixStart;
mozilla::Result<uint32_t, nsresult> r = StartBulkWrite(
aNewLen, aCutStart, false, suffixLength, oldSuffixStart, newSuffixStart);
if (r.isErr()) {
return false;
}
if (oldData) {
// determine whether or not we need to copy part of the old string
// over to the new string.
if (aCutStart > 0) {
// copy prefix from old string
char_traits::copy(this->mData, oldData, aCutStart);
}
if (aCutStart + aCutLen < this->mLength) {
// copy suffix from old string to new offset
size_type from = aCutStart + aCutLen;
size_type fromLen = this->mLength - from;
uint32_t to = aCutStart + aFragLen;
char_traits::copy(this->mData + to, oldData + from, fromLen);
}
::ReleaseData(oldData, oldFlags);
} else {
// original data remains intact
// determine whether or not we need to move part of the existing string
// to make room for the requested hole.
if (aFragLen != aCutLen && aCutStart + aCutLen < this->mLength) {
uint32_t from = aCutStart + aCutLen;
uint32_t fromLen = this->mLength - from;
uint32_t to = aCutStart + aFragLen;
char_traits::move(this->mData + to, this->mData + from, fromLen);
}
}
// add null terminator (mutable this->mData always has room for the null-
// terminator).
this->mData[aNewLen] = char_type(0);
this->mLength = aNewLen;
FinishBulkWrite(aNewLen);
return true;
}
@ -553,20 +548,14 @@ nsTSubstring<T>::Assign(const substring_tuple_type& aTuple,
size_type length = aTuple.Length();
// don't use ReplacePrep here because it changes the length
char_type* oldData;
DataFlags oldFlags;
if (!MutatePrep(length, &oldData, &oldFlags)) {
mozilla::Result<uint32_t, nsresult> r = StartBulkWrite(length);
if (r.isErr()) {
return false;
}
if (oldData) {
::ReleaseData(oldData, oldFlags);
}
aTuple.WriteTo(this->mData, length);
this->mData[length] = 0;
this->mLength = length;
FinishBulkWrite(length);
return true;
}
@ -762,40 +751,53 @@ nsTSubstring<T>::SetCapacity(size_type aCapacity, const fallible_t&)
{
// capacity does not include room for the terminating null char
// if our capacity is reduced to zero, then free our buffer.
if (aCapacity == 0) {
::ReleaseData(this->mData, this->mDataFlags);
SetToEmptyBuffer();
return true;
// Sadly, existing callers assume that it's valid to
// first call SetCapacity(), then write past mLength
// and then call SetLength() with the assumption that
// SetLength still preserves the written data past
// mLength!!!
size_type preserve;
if (this->mDataFlags & DataFlags::REFCOUNTED) {
nsStringBuffer* hdr = nsStringBuffer::FromData(this->mData);
preserve = (hdr->StorageSize() / sizeof(char_type)) - 1;
} else if (this->mDataFlags & DataFlags::INLINE) {
preserve = AsAutoString(this)->mInlineCapacity;
} else {
preserve = this->mLength;
}
char_type* oldData;
DataFlags oldFlags;
if (!MutatePrep(aCapacity, &oldData, &oldFlags)) {
return false; // out-of-memory
if (preserve > aCapacity) {
preserve = aCapacity;
}
// compute new string length
size_type newLen = XPCOM_MIN(this->mLength, aCapacity);
mozilla::Result<uint32_t, nsresult> r = StartBulkWrite(aCapacity, preserve);
if (r.isErr()) {
return false;
}
if (r.unwrap()) {
// In the zero case StartBulkWrite already put the string
// in a valid state.
if (oldData) {
// preserve old data
if (this->mLength > 0) {
char_traits::copy(this->mData, oldData, newLen);
// Otherwise, instead of calling FinishBulkWrite,
// intentionally replicate the legacy semantics of
// this method:
// If requested capacity was smaller than the pre-existing
// length, set length to the requested capacity and
// zero-terminate there. Otherwise, zero-terminate at
// the requested capacity. (This latter behavior was
// designated as a legacy compatibility measure by the
// previous implementation of this method.)
if (aCapacity < this->mLength) {
// aCapacity not capacity for legacy reasons;
// maybe capacity would work, too.
this->mLength = aCapacity;
}
::ReleaseData(oldData, oldFlags);
// Note that we can't write a terminator at
// mData[mLength], because doing so would overwrite
// data when this method is called from SetLength.
this->mData[aCapacity] = char_type(0);
}
// adjust this->mLength if our buffer shrunk down in size
if (newLen < this->mLength) {
this->mLength = newLen;
}
// always null-terminate here, even if the buffer got longer. this is
// for backwards compat with the old string implementation.
this->mData[aCapacity] = char_type(0);
return true;
}

View file

@ -13,6 +13,7 @@
#include "mozilla/UniquePtr.h"
#include "mozilla/MemoryReporting.h"
#include "mozilla/IntegerTypeTraits.h"
#include "mozilla/Result.h"
#include "mozilla/Span.h"
#include "nsTStringRepr.h"
@ -886,26 +887,70 @@ protected:
*/
void NS_FASTCALL Finalize();
public:
/**
* this function prepares mData to be mutated.
* THIS IS NOT REALLY A PUBLIC METHOD! DO NOT CALL FROM OUTSIDE
* THE STRING IMPLEMENTATION. (It's public only because friend
* declarations don't allow extern or static and this needs to
* be called from Rust FFI glue.)
*
* @param aCapacity specifies the required capacity of mData
* @param aOldData returns null or the old value of mData
* @param aOldFlags returns 0 or the old value of mDataFlags
* Prepares mData to be mutated such that the capacity of the string
* (not counting the zero-terminator) is at least aCapacity.
* Returns the actual capacity, which may be larger than what was
* requested or Err(NS_ERROR_OUT_OF_MEMORY) on allocation failure.
*
* if mData is already mutable and of sufficient capacity, then this
* function will return immediately. otherwise, it will either resize
* mData or allocate a new shared buffer. if it needs to allocate a
* new buffer, then it will return the old buffer and the corresponding
* flags. this allows the caller to decide when to free the old data.
* mLength is ignored by this method. If the buffer is reallocated,
* aPrefixToPreserve specifies how many code units to copy over to
* the new buffer. The old buffer is freed if applicable.
*
* this function returns false if is unable to allocate sufficient
* memory.
* Unless the return value is Err(NS_ERROR_OUT_OF_MEMORY) to signal
* failure or 0 to signal that the string has been set to
* the special empty state, this method leaves the string in an
* invalid state! The caller is responsible for calling
* FinishBulkWrite() (or in Rust calling
* nsA[C]StringBulkWriteHandle::finish()), which put the string
* into a valid state by setting mLength and zero-terminating.
* This method sets the flag to claim that the string is
* zero-terminated before it actually is.
*
* Once this method has been called and before FinishBulkWrite()
* has been called, only accessing mData or calling this method
* again are valid operations. Do not call any other methods or
* access other fields between calling this method and
* FinishBulkWrite().
*
* @param aCapacity The requested capacity. The return value
* will be greater than or equal to this value.
* @param aPrefixToPreserve The number of code units at the start
* of the old buffer to copy into the
* new buffer.
* @param aAllowShrinking If true, an allocation may be performed
* if the requested capacity is smaller
* than the current capacity.
* @param aSuffixLength The length, in code units, of a suffix
* to move.
* @param aOldSuffixStart The old start index of the suffix to
* move.
* @param aNewSuffixStart The new start index of the suffix to
* move.
*
* XXX we should expose a way for subclasses to free old_data.
*/
bool NS_FASTCALL MutatePrep(size_type aCapacity,
char_type** aOldData, DataFlags* aOldDataFlags);
mozilla::Result<uint32_t, nsresult>
NS_FASTCALL StartBulkWrite(size_type aCapacity,
size_type aPrefixToPreserve = 0,
bool aAllowShrinking = true,
size_type aSuffixLength = 0,
size_type aOldSuffixStart = 0,
size_type aNewSuffixStart = 0);
protected:
/**
* Restores the string to a valid state after a call to StartBulkWrite()
* that returned a non-error result. The argument to this method
* must be less than or equal to the value returned by the most recent
* StartBulkWrite() call.
*/
void NS_FASTCALL FinishBulkWrite(size_type aLength);
/**
* this function prepares a section of mData to be modified. if

View file

@ -11,10 +11,8 @@
// use XPCOM assertion/debugging macros, etc.
#include "nscore.h"
#include "mozilla/arm.h"
#include "mozilla/Assertions.h"
#include "mozilla/EndianUtils.h"
#include "mozilla/SSE.h"
#include "mozilla/TypeTraits.h"
#include "nsCharTraits.h"
@ -71,711 +69,186 @@ public:
if (is4byte(aChar)) {
return 4;
}
if (is5byte(aChar)) {
return 5;
}
if (is6byte(aChar)) {
return 6;
}
MOZ_ASSERT_UNREACHABLE("should not be used for in-sequence characters");
return 1;
}
};
/**
 * Extract the next Unicode scalar value from the buffer and return it. The
 * pointer passed in is advanced to the start of the next character in the
 * buffer. Upon error, the return value is 0xFFFD, *aBuffer is advanced
 * over the maximal valid prefix and *aErr is set to true (if aErr is not
 * null).
 *
 * Note: This method never sets *aErr to false to allow error accumulation
 * across multiple calls.
 *
 * No byte at or beyond aEnd is ever read: a sequence that is cut off by the
 * end of the buffer is reported as an error and *aBuffer is set to aEnd.
 *
 * Precondition: *aBuffer < aEnd
 */
class UTF8CharEnumerator
{
public:
  static inline char32_t NextChar(const char** aBuffer,
                                  const char* aEnd,
                                  bool* aErr = nullptr)
  {
    MOZ_ASSERT(aBuffer, "null buffer pointer pointer");
    MOZ_ASSERT(aEnd, "null end pointer");

    const unsigned char* p = reinterpret_cast<const unsigned char*>(*aBuffer);
    const unsigned char* end = reinterpret_cast<const unsigned char*>(aEnd);

    MOZ_ASSERT(p, "null buffer");
    MOZ_ASSERT(p < end, "Bogus range");

    unsigned char first = *p++;

    if (MOZ_LIKELY(first < 0x80U)) {
      // ASCII
      *aBuffer = reinterpret_cast<const char*>(p);
      return first;
    }

    // Reject lead bytes outside 0xC2..0xF4 (continuation bytes, overlong
    // two-byte leads and leads beyond U+10FFFF) as well as a lead byte that
    // is the last byte of the buffer.
    // Unsigned underflow is defined behavior
    if (MOZ_UNLIKELY((p == end) || ((first - 0xC2U) >= (0xF5U - 0xC2U)))) {
      *aBuffer = reinterpret_cast<const char*>(p);
      if (aErr) {
        *aErr = true;
      }
      return 0xFFFDU;
    }

    // p < end is known here, so reading the second byte is in bounds. p is
    // deliberately not advanced yet: if the second byte is invalid it must
    // be left for the next call.
    unsigned char second = *p;

    if (first < 0xE0U) {
      // Two-byte
      if (MOZ_LIKELY((second & 0xC0U) == 0x80U)) {
        *aBuffer = reinterpret_cast<const char*>(++p);
        return ((uint32_t(first) & 0x1FU) << 6) | (uint32_t(second) & 0x3FU);
      }
      *aBuffer = reinterpret_cast<const char*>(p);
      if (aErr) {
        *aErr = true;
      }
      return 0xFFFDU;
    }

    if (MOZ_LIKELY(first < 0xF0U)) {
      // Three-byte
      // The valid range of the second byte depends on the lead byte: 0xE0
      // must not encode an overlong form and 0xED must not encode a
      // surrogate.
      unsigned char lower = 0x80U;
      unsigned char upper = 0xBFU;
      if (first == 0xE0U) {
        lower = 0xA0U;
      } else if (first == 0xEDU) {
        upper = 0x9FU;
      }
      if (MOZ_LIKELY(second >= lower && second <= upper)) {
        // Step over the accepted second byte *before* comparing against the
        // end, so that the third byte is only read when it is in bounds.
        if (MOZ_LIKELY(++p != end)) {
          unsigned char third = *p;
          if (MOZ_LIKELY((third & 0xC0U) == 0x80U)) {
            *aBuffer = reinterpret_cast<const char*>(++p);
            return ((uint32_t(first) & 0xFU) << 12) |
                   ((uint32_t(second) & 0x3FU) << 6) |
                   (uint32_t(third) & 0x3FU);
          }
        }
      }
      *aBuffer = reinterpret_cast<const char*>(p);
      if (aErr) {
        *aErr = true;
      }
      return 0xFFFDU;
    }

    // Four-byte
    // 0xF0 must not encode an overlong form and 0xF4 must not exceed
    // U+10FFFF.
    unsigned char lower = 0x80U;
    unsigned char upper = 0xBFU;
    if (first == 0xF0U) {
      lower = 0x90U;
    } else if (first == 0xF4U) {
      upper = 0x8FU;
    }
    if (MOZ_LIKELY(second >= lower && second <= upper)) {
      // As above: advance first, then bounds-check, then read.
      if (MOZ_LIKELY(++p != end)) {
        unsigned char third = *p;
        if (MOZ_LIKELY((third & 0xC0U) == 0x80U)) {
          if (MOZ_LIKELY(++p != end)) {
            unsigned char fourth = *p;
            if (MOZ_LIKELY((fourth & 0xC0U) == 0x80U)) {
              *aBuffer = reinterpret_cast<const char*>(++p);
              return ((uint32_t(first) & 0x7U) << 18) |
                     ((uint32_t(second) & 0x3FU) << 12) |
                     ((uint32_t(third) & 0x3FU) << 6) |
                     (uint32_t(fourth) & 0x3FU);
            }
          }
        }
      }
    }
    *aBuffer = reinterpret_cast<const char*>(p);
    if (aErr) {
      *aErr = true;
    }
    return 0xFFFDU;
  }
};
/**
 * Extract the next Unicode scalar value from the buffer and return it. The
 * pointer passed in is advanced to the start of the next character in the
 * buffer. Upon error, the return value is 0xFFFD, *aBuffer is advanced over
 * the unpaired surrogate and *aErr is set to true (if aErr is not null).
 *
 * Note: This method never sets *aErr to false to allow error accumulation
 * across multiple calls.
 *
 * Precondition: *aBuffer < aEnd
 */
class UTF16CharEnumerator
{
public:
  static inline char32_t NextChar(const char16_t** aBuffer,
                                  const char16_t* aEnd,
                                  bool* aErr = nullptr)
  {
    MOZ_ASSERT(aBuffer, "null buffer pointer pointer");
    MOZ_ASSERT(aEnd, "null end pointer");

    const char16_t* p = *aBuffer;

    MOZ_ASSERT(p, "null buffer");
    MOZ_ASSERT(p < aEnd, "Bogus range");

    char16_t c = *p++;

    // Let's use encoding_rs-style code golf here.
    // Subtracting the start of the surrogate range makes every
    // non-surrogate wrap around to a value above the width of the range, so
    // a single comparison classifies the code unit.
    // Unsigned underflow is defined behavior
    char16_t cMinusSurrogateStart = c - 0xD800U;
    if (MOZ_LIKELY(cMinusSurrogateStart > (0xDFFFU - 0xD800U))) {
      // Not a surrogate: the code unit is the scalar value.
      *aBuffer = p;
      return c;
    }
    if (MOZ_LIKELY(cMinusSurrogateStart <= (0xDBFFU - 0xD800U))) {
      // High surrogate
      if (MOZ_LIKELY(p != aEnd)) {
        // Peek only; the second unit is consumed just when it completes
        // the pair, otherwise it is left for the next call.
        char16_t second = *p;
        // Unsigned underflow is defined behavior
        if (MOZ_LIKELY((second - 0xDC00U) <= (0xDFFFU - 0xDC00U))) {
          *aBuffer = ++p;
          // Same as 0x10000 + ((c - 0xD800) << 10) + (second - 0xDC00) with
          // the constants folded into one subtraction.
          return (uint32_t(c) << 10) + uint32_t(second) -
                 (((0xD800U << 10) - 0x10000U) + 0xDC00U);
        }
      }
    }
    // Unpaired surrogate
    *aBuffer = p;
    if (aErr) {
      *aErr = true;
    }
    return 0xFFFDU;
  }
};
/**
 * A character sink (see |copy_string| in nsAlgorithm.h) for converting
 * UTF-8 to UTF-16.
 *
 * Conversion stops at the first decoding error; once an error has been
 * seen, later write() calls are ignored and ErrorEncountered() returns
 * true. The destination buffer is supplied by the caller and is not
 * bounds-checked here.
 */
class ConvertUTF8toUTF16
{
public:
  typedef char value_type;
  typedef char16_t buffer_type;

  explicit ConvertUTF8toUTF16(buffer_type* aBuffer)
    : mStart(aBuffer), mBuffer(aBuffer), mErrorEncountered(false)
  {
  }

  /** Number of UTF-16 code units written so far. */
  size_t Length() const
  {
    return mBuffer - mStart;
  }

  /** True once any write() call has hit a decoding error. */
  bool ErrorEncountered() const
  {
    return mErrorEncountered;
  }

  /**
   * Decode aN bytes starting at aStart and append the result to the
   * destination buffer.
   */
  void write(const value_type* aStart, uint32_t aN)
  {
    if (mErrorEncountered) {
      return;
    }

    // algorithm assumes utf8 units won't
    // be spread across fragments
    const value_type* p = aStart;
    const value_type* end = aStart + aN;
    buffer_type* out = mBuffer;
    while (p != end /* && *p */) {
      // Must start out false: the enumerator is documented to only ever
      // set the flag to true, never to clear it.
      bool err = false;
      uint32_t ucs4 = UTF8CharEnumerator::NextChar(&p, end, &err);

      if (err) {
        mErrorEncountered = true;
        mBuffer = out;
        return;
      }

      if (ucs4 >= PLANE1_BASE) {
        // Outside the BMP: emit a surrogate pair.
        *out++ = (buffer_type)H_SURROGATE(ucs4);
        *out++ = (buffer_type)L_SURROGATE(ucs4);
      } else {
        *out++ = ucs4;
      }
    }
    mBuffer = out;
  }

  /** Write a terminating zero without advancing the write position. */
  void write_terminator()
  {
    *mBuffer = buffer_type(0);
  }

private:
  buffer_type* const mStart;
  buffer_type* mBuffer;
  bool mErrorEncountered;
};
/**
 * A character sink (see |copy_string| in nsAlgorithm.h) for computing
 * the length of the UTF-16 string equivalent to a UTF-8 string.
 *
 * Counting stops at the first invalid lead byte or at a sequence that is
 * cut off by the end of the fragment; that sequence is not counted and all
 * later write() calls are ignored.
 */
class CalculateUTF8Length
{
public:
  typedef char value_type;

  CalculateUTF8Length()
    : mLength(0), mErrorEncountered(false)
  {
  }

  /** Number of UTF-16 code units counted so far. */
  size_t Length() const
  {
    return mLength;
  }

  void write(const value_type* aStart, uint32_t aN)
  {
    // ignore any further requests
    if (mErrorEncountered) {
      return;
    }

    // algorithm assumes utf8 units won't
    // be spread across fragments
    const value_type* p = aStart;
    const value_type* end = aStart + aN;
    bool malformed = false;

    while (p < end /* && *p */) {
      // Sequence length announced by the lead byte.
      size_t seqLen;
      if (UTF8traits::isASCII(*p)) {
        seqLen = 1;
      } else if (UTF8traits::is2byte(*p)) {
        seqLen = 2;
      } else if (UTF8traits::is3byte(*p)) {
        seqLen = 3;
      } else if (UTF8traits::is4byte(*p)) {
        seqLen = 4;
      } else if (UTF8traits::is5byte(*p)) {
        seqLen = 5;
      } else if (UTF8traits::is6byte(*p)) {
        seqLen = 6;
      } else { // error
        malformed = true;
        break;
      }

      // Compare lengths instead of advancing p first: forming a pointer
      // beyond one-past-the-end of the fragment is undefined behavior.
      if (seqLen > size_t(end - p)) {
        // The last multi-byte char wasn't complete, discard it.
        malformed = true;
        break;
      }

      if (seqLen == 4) {
        // Because a UTF-8 sequence of 4 bytes represents a codepoint
        // greater than 0xFFFF, it will become a surrogate pair in the
        // UTF-16 string, so add 1 more to mLength.
        // This doesn't happen with is5byte and is6byte because they
        // are illegal UTF-8 sequences (greater than 0x10FFFF) so get
        // converted to a single replacement character.
        //
        // However, a properly encoded 4 byte sequence with an invalid
        // value (too small or too big) results in a single replacement
        // character, i.e. just one UTF-16 code unit. Therefore only add
        // the extra unit if the decoded value is greater than or equal to
        // 0x010000 and less than 0x110000.
        //
        // A 4byte UTF8 character is encoded as
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        // Bit 1-3 on the first byte, and bit 5-6 on the second byte,
        // map to bit 17-21 in the final result. The code below reads
        // these bits out and assigns them to c, but shifted up 4 bits to
        // avoid having to shift twice.
        uint32_t c = ((uint32_t)(p[0] & 0x07)) << 6 |
                     ((uint32_t)(p[1] & 0x30));
        if (c >= 0x010 && c < 0x110) {
          ++mLength;
        }
      }

      p += seqLen;
      ++mLength;
    }

    if (malformed) {
      NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
      mErrorEncountered = true;
    }
  }

private:
  size_t mLength;
  bool mErrorEncountered;
};
/**
 * A character sink (see |copy_string| in nsAlgorithm.h) that encodes
 * UTF-16 input as UTF-8. Every unpaired surrogate is written as U+FFFD
 * (bytes EF BF BD). The destination buffer is supplied by the caller and
 * is not bounds-checked here.
 */
class ConvertUTF16toUTF8
{
public:
  typedef char16_t value_type;
  typedef char buffer_type;

  // The error handling here is more lenient than that in
  // |ConvertUTF8toUTF16|, but it's that way for backwards
  // compatibility.

  explicit ConvertUTF16toUTF8(buffer_type* aBuffer)
    : mStart(aBuffer), mBuffer(aBuffer)
  {
  }

  /** Number of bytes written so far. */
  size_t Size() const
  {
    return mBuffer - mStart;
  }

  /** Encode aN code units starting at aStart. */
  void write(const value_type* aStart, uint32_t aN)
  {
    // Work on a local copy of the write position and store it back once.
    buffer_type* dest = mBuffer;
    const value_type* cur = aStart;
    const value_type* const limit = aStart + aN;

    while (cur < limit) {
      const value_type unit = *cur++;

      if (!(unit & 0xFF80)) { // U+0000 - U+007F
        *dest++ = static_cast<char>(unit);
        continue;
      }

      if (!(unit & 0xF800)) { // U+0080 - U+07FF
        *dest++ = static_cast<char>(0xC0 | (unit >> 6));
        *dest++ = static_cast<char>(0x80 | (0x003F & unit));
        continue;
      }

      if (!IS_SURROGATE(unit)) { // U+0800 - U+D7FF,U+E000 - U+FFFF
        *dest++ = static_cast<char>(0xE0 | (unit >> 12));
        *dest++ = static_cast<char>(0x80 | (0x003F & (unit >> 6)));
        *dest++ = static_cast<char>(0x80 | (0x003F & unit));
        continue;
      }

      if (!NS_IS_HIGH_SURROGATE(unit)) { // U+DC00 - U+DFFF
        // A low surrogate with no high surrogate in front of it.
        WriteReplacement(dest);
        UTF8UTILS_WARNING("got a low Surrogate but no high surrogate");
        continue;
      }

      // U+D800 - U+DBFF: high surrogate, needs a partner.
      if (cur == limit) {
        WriteReplacement(dest);
        UTF8UTILS_WARNING("String ending in half a surrogate pair!");
        break;
      }

      const value_type next = *cur;
      if (NS_IS_LOW_SURROGATE(next)) {
        // N = (H - D800) *400 + 10000 + ( L - DC00 )
        ++cur;
        const uint32_t ucs4 = SURROGATE_TO_UCS4(unit, next);
        // 0001 0000-001F FFFF
        *dest++ = static_cast<char>(0xF0 | (ucs4 >> 18));
        *dest++ = static_cast<char>(0x80 | (0x003F & (ucs4 >> 12)));
        *dest++ = static_cast<char>(0x80 | (0x003F & (ucs4 >> 6)));
        *dest++ = static_cast<char>(0x80 | (0x003F & ucs4));
      } else {
        // Only the high surrogate is replaced; |next| is not consumed and
        // gets processed on its own by the following iteration (Unicode
        // 5.0.0 Chapter 3 C10 / D91: only the first code unit of an
        // ill-formed sequence is treated as ill-formed).
        WriteReplacement(dest);
        UTF8UTILS_WARNING("got a High Surrogate but no low surrogate");
      }
    }

    mBuffer = dest;
  }

  /** Write a terminating zero without advancing the write position. */
  void write_terminator()
  {
    *mBuffer = buffer_type(0);
  }

private:
  // Appends U+FFFD encoded as UTF-8 (0xEFBFBD) and advances aOut.
  static void WriteReplacement(buffer_type*& aOut)
  {
    *aOut++ = '\xEF';
    *aOut++ = '\xBF';
    *aOut++ = '\xBD';
  }

  buffer_type* const mStart;
  buffer_type* mBuffer;
};
/**
 * A character sink (see |copy_string| in nsAlgorithm.h) that adds up how
 * many bytes UTF-16 input would take once encoded as UTF-8. Every unpaired
 * surrogate is counted as U+FFFD, i.e. three bytes (0xEFBFBD).
 */
class CalculateUTF8Size
{
public:
  typedef char16_t value_type;

  CalculateUTF8Size()
    : mSize(0)
  {
  }

  /** Number of UTF-8 bytes counted so far. */
  size_t Size() const
  {
    return mSize;
  }

  void write(const value_type* aStart, uint32_t aN)
  {
    // Assume UCS2 surrogate pairs won't be spread across fragments.
    const value_type* cur = aStart;
    const value_type* const limit = aStart + aN;

    while (cur < limit) {
      const value_type unit = *cur++;

      if ((unit & 0xFF80) == 0) { // U+0000 - U+007F
        mSize += 1;
        continue;
      }

      if ((unit & 0xF800) == 0) { // U+0080 - U+07FF
        mSize += 2;
        continue;
      }

      if ((unit & 0xF800) != 0xD800) { // U+0800 - U+D7FF,U+E000 - U+FFFF
        mSize += 3;
        continue;
      }

      if ((unit & 0xFC00) != 0xD800) { // U+DC00 - U+DFFF
        // Low surrogate with nothing in front of it: replacement character.
        mSize += 3;
        UTF8UTILS_WARNING("got a low Surrogate but no high surrogate");
        continue;
      }

      // U+D800 - U+DBFF: high surrogate, needs a partner.
      if (cur == limit) {
        // Replacement character for the dangling high surrogate.
        mSize += 3;
        UTF8UTILS_WARNING("String ending in half a surrogate pair!");
        break;
      }

      if ((*cur & 0xFC00) == 0xDC00) {
        // Proper pair: one supplementary-plane character.
        ++cur;
        mSize += 4;
      } else {
        // Only the high surrogate is counted as a replacement character;
        // the following code unit is not consumed and is counted on its
        // own by the next iteration (Unicode 5.0.0 Chapter 3 C10 / D91).
        mSize += 3;
        UTF8UTILS_WARNING("got a high Surrogate but no low surrogate");
      }
    }
  }

private:
  size_t mSize;
};
#ifdef MOZILLA_INTERNAL_API
/**
 * A character sink that performs a |reinterpret_cast|-style conversion
 * from char to char16_t: every input byte is zero-extended to one UTF-16
 * code unit. The destination buffer is supplied by the caller and is not
 * bounds-checked here.
 */
class LossyConvertEncoding8to16
{
public:
  typedef char value_type;
  typedef char input_type;
  typedef char16_t output_type;

public:
  explicit LossyConvertEncoding8to16(char16_t* aDestination) :
    mDestination(aDestination)
  {
  }

  /**
   * Widen aSourceLength bytes starting at aSource and advance the write
   * position. Uses a SIMD implementation when one is compiled in and the
   * CPU supports it, otherwise a plain scalar loop.
   */
  void
  write(const char* aSource, uint32_t aSourceLength)
  {
#ifdef MOZILLA_MAY_SUPPORT_SSE2
    if (mozilla::supports_sse2()) {
      write_sse2(aSource, aSourceLength);
      return;
    }
#endif
#if defined(MOZILLA_MAY_SUPPORT_NEON) && defined(MOZ_LITTLE_ENDIAN)
    if (mozilla::supports_neon()) {
      write_neon(aSource, aSourceLength);
      return;
    }
#endif
    const char* done_writing = aSource + aSourceLength;
    while (aSource < done_writing) {
      // Go through unsigned char so that bytes >= 0x80 are zero-extended
      // rather than sign-extended.
      *mDestination++ = (char16_t)(unsigned char)(*aSource++);
    }
  }

  // Guarded the same way as the call site above and as the matching
  // declaration in LossyConvertEncoding16to8.
#ifdef MOZILLA_MAY_SUPPORT_SSE2
  void
  write_sse2(const char* aSource, uint32_t aSourceLength);
#endif
#if defined(MOZILLA_MAY_SUPPORT_NEON) && defined(MOZ_LITTLE_ENDIAN)
  void
  write_neon(const char* aSource, uint32_t aSourceLength);
#endif

  /** Write a terminating zero without advancing the write position. */
  void
  write_terminator()
  {
    *mDestination = (char16_t)(0);
  }

private:
  char16_t* mDestination;
};
/**
 * A character sink that narrows char16_t input to char in the manner of a
 * |reinterpret_cast|: each code unit is converted with a plain cast, with
 * no range checking. The destination buffer is supplied by the caller and
 * is not bounds-checked here.
 */
class LossyConvertEncoding16to8
{
public:
  typedef char16_t value_type;
  typedef char16_t input_type;
  typedef char output_type;

  explicit LossyConvertEncoding16to8(char* aDestination)
    : mDestination(aDestination)
  {
  }

  /**
   * Narrow aSourceLength code units starting at aSource and advance the
   * write position. A SIMD implementation is used when one is compiled in
   * and supported by the CPU; otherwise the units are converted one by one.
   */
  void
  write(const char16_t* aSource, uint32_t aSourceLength)
  {
#ifdef MOZILLA_MAY_SUPPORT_SSE2
    if (mozilla::supports_sse2()) {
      write_sse2(aSource, aSourceLength);
      return;
    }
#endif
#if defined(MOZILLA_MAY_SUPPORT_NEON) && defined(MOZ_LITTLE_ENDIAN)
    if (mozilla::supports_neon()) {
      write_neon(aSource, aSourceLength);
      return;
    }
#endif
    // Scalar fallback.
    for (uint32_t idx = 0; idx < aSourceLength; ++idx) {
      *mDestination = static_cast<char>(aSource[idx]);
      ++mDestination;
    }
  }

#ifdef MOZILLA_MAY_SUPPORT_SSE2
  void
  write_sse2(const char16_t* aSource, uint32_t aSourceLength);
#endif
#if defined(MOZILLA_MAY_SUPPORT_NEON) && defined(MOZ_LITTLE_ENDIAN)
  void
  write_neon(const char16_t* aSource, uint32_t aSourceLength);
#endif

  /** Write a terminating NUL without advancing the write position. */
  void
  write_terminator()
  {
    *mDestination = '\0';
  }

private:
  char* mDestination;
};
#endif // MOZILLA_INTERNAL_API
template<typename Char, typename UnsignedT>
inline UnsignedT
RewindToPriorUTF8Codepoint(const Char* utf8Chars, UnsignedT index)

View file

@ -1,129 +0,0 @@
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "nscore.h"
#include "nsAlgorithm.h"
#include "nsUTF8Utils.h"
#include <arm_neon.h>
/**
 * NEON implementation of LossyConvertEncoding16to8::write(): narrows
 * aSourceLength UTF-16 code units to bytes (vmovn_u16 narrows each 16-bit
 * lane to 8 bits) and advances mDestination by the number of units written.
 *
 * Structure: scalar prologue until the pointers are suitably aligned,
 * 8-unit steps until the destination is 16-byte aligned, 16-unit main loop,
 * at most one more 8-unit step, scalar epilogue.
 */
void
LossyConvertEncoding16to8::write_neon(const char16_t* aSource,
uint32_t aSourceLength)
{
char* dest = mDestination;
// Align source to a 16-byte boundary and destination to 8-bytes boundary.
// If both conditions are never met at the same index, this loop simply
// converts the whole input one unit at a time.
uint32_t i = 0;
while (((reinterpret_cast<uintptr_t>(aSource + i) & 0xf) ||
(reinterpret_cast<uintptr_t>(dest + i) & 0x7)) &&
i < aSourceLength) {
dest[i] = static_cast<unsigned char>(aSource[i]);
i++;
}
// i <= aSourceLength from here on, so the unsigned subtractions below
// cannot wrap.
while ((reinterpret_cast<uintptr_t>(dest + i) & 0xf) &&
aSourceLength - i > 7) {
// source is aligned, but destination isn't aligned by 16-byte yet
// Each step reads 16 source bytes and writes 8 destination bytes.
uint16x8_t s =
vld1q_u16(reinterpret_cast<const uint16_t*>(
__builtin_assume_aligned(aSource + i, 16)));
vst1_u8(reinterpret_cast<uint8_t*>(
__builtin_assume_aligned(dest + i, 8)),
vmovn_u16(s));
i += 8;
}
// Main loop: source and destination are both treated as 16-byte aligned;
// two 8-unit loads are narrowed and combined into one 16-byte store.
while (aSourceLength - i > 15) {
uint16x8_t low =
vld1q_u16(reinterpret_cast<const uint16_t*>(
__builtin_assume_aligned(aSource + i, 16)));
uint16x8_t high =
vld1q_u16(reinterpret_cast<const uint16_t*>(
__builtin_assume_aligned(aSource + i + 8, 16)));
vst1q_u8(reinterpret_cast<uint8_t*>(
__builtin_assume_aligned(dest + i, 16)),
vcombine_u8(vmovn_u16(low), vmovn_u16(high)));
i += 16;
}
// Fewer than 16 units are left; handle one more block of 8 if possible.
if (aSourceLength - i > 7) {
uint16x8_t s = vld1q_u16(reinterpret_cast<const uint16_t*>(
__builtin_assume_aligned(aSource + i, 16)));
vst1_u8(reinterpret_cast<uint8_t*>(
__builtin_assume_aligned(dest + i, 8)),
vmovn_u16(s));
i += 8;
}
// Finish up the rest.
for (; i < aSourceLength; ++i) {
dest[i] = static_cast<unsigned char>(aSource[i]);
}
// i == aSourceLength here (or was already, after the prologue).
mDestination += i;
}
/**
 * NEON implementation of LossyConvertEncoding8to16::write(): widens
 * aSourceLength bytes to UTF-16 code units (vmovl_u8 zero-extends each
 * 8-bit lane to 16 bits) and advances mDestination by the number of units
 * written.
 *
 * Structure: scalar prologue until the pointers are suitably aligned, at
 * most one 8-byte step to bring the source to 16-byte alignment, 16-byte
 * main loop, at most one more 8-byte step, scalar epilogue.
 */
void
LossyConvertEncoding8to16::write_neon(const char* aSource,
uint32_t aSourceLength)
{
char16_t* dest = mDestination;
// Align source to a 8-byte boundary and destination to 16-bytes boundary.
// If both conditions are never met at the same index, this loop simply
// converts the whole input one byte at a time.
uint32_t i = 0;
while (((reinterpret_cast<uintptr_t>(aSource + i) & 0x7) ||
(reinterpret_cast<uintptr_t>(dest + i) & 0xf)) &&
i < aSourceLength) {
dest[i] = static_cast<unsigned char>(aSource[i]);
i++;
}
// i <= aSourceLength from here on, so the unsigned subtractions below
// cannot wrap.
if ((uintptr_t(aSource + i) & 0xf) && aSourceLength - i > 7) {
// destination is aligned, but source isn't aligned by 16-byte yet
// One step reads 8 source bytes and writes 16 destination bytes.
uint8x8_t s =
vld1_u8(reinterpret_cast<const uint8_t*>(
__builtin_assume_aligned(aSource + i, 8)));
vst1q_u16(reinterpret_cast<uint16_t*>(
__builtin_assume_aligned(dest + i, 16)),
vmovl_u8(s));
i += 8;
}
// Main loop: source and destination are both treated as 16-byte aligned;
// one 16-byte load is split in halves, widened and written as two stores.
while (aSourceLength - i > 15) {
uint8x16_t s =
vld1q_u8(reinterpret_cast<const uint8_t*>(
__builtin_assume_aligned(aSource + i, 16)));
uint16x8_t low = vmovl_u8(vget_low_u8(s));
uint16x8_t high = vmovl_u8(vget_high_u8(s));
vst1q_u16(reinterpret_cast<uint16_t*>(
__builtin_assume_aligned(dest + i, 16)),
low);
vst1q_u16(reinterpret_cast<uint16_t*>(
__builtin_assume_aligned(dest + i + 8, 16)),
high);
i += 16;
}
// Fewer than 16 bytes are left; handle one more block of 8 if possible.
if (aSourceLength - i > 7) {
uint8x8_t s =
vld1_u8(reinterpret_cast<const uint8_t*>(
__builtin_assume_aligned(aSource + i, 8)));
vst1q_u16(reinterpret_cast<uint16_t*>(
__builtin_assume_aligned(dest + i, 16)),
vmovl_u8(s));
i += 8;
}
// Finish up whatever's left.
for (; i < aSourceLength; ++i) {
dest[i] = static_cast<unsigned char>(aSource[i]);
}
// i == aSourceLength here (or was already, after the prologue).
mDestination += i;
}

View file

@ -1,105 +0,0 @@
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "nscore.h"
#include "nsAlgorithm.h"
#include <emmintrin.h>
#include <nsUTF8Utils.h>
/**
 * SSE2 implementation of LossyConvertEncoding16to8::write(): keeps the low
 * byte of each of the aSourceLength UTF-16 code units and advances
 * mDestination by the number of units written.
 *
 * Structure: scalar prologue up to the next 16-byte boundary of the source,
 * main loop over 32 code units per iteration, scalar epilogue.
 */
void
LossyConvertEncoding16to8::write_sse2(const char16_t* aSource,
uint32_t aSourceLength)
{
char* dest = mDestination;
// Align source to a 16-byte boundary.
// (-address & 0xf) is the byte distance to that boundary; dividing by
// sizeof(char16_t) turns it into a number of code units. Capped at the
// input length so short inputs are handled entirely by this loop.
uint32_t i = 0;
uint32_t alignLen =
XPCOM_MIN<uint32_t>(aSourceLength,
uint32_t(-NS_PTR_TO_INT32(aSource) & 0xf) / sizeof(char16_t));
for (; i < alignLen; ++i) {
dest[i] = static_cast<unsigned char>(aSource[i]);
}
// Walk 64 bytes (four XMM registers) at a time.
// i <= aSourceLength here, so the unsigned subtraction cannot wrap.
__m128i vectmask = _mm_set1_epi16(0x00ff);
for (; aSourceLength - i > 31; i += 32) {
// Aligned loads; the mask clears the high byte of every 16-bit lane.
__m128i source1 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i));
source1 = _mm_and_si128(source1, vectmask);
__m128i source2 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 8));
source2 = _mm_and_si128(source2, vectmask);
__m128i source3 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 16));
source3 = _mm_and_si128(source3, vectmask);
__m128i source4 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 24));
source4 = _mm_and_si128(source4, vectmask);
// Pack the source data. SSE2 views this as a saturating uint16_t to
// uint8_t conversion, but since we masked off the high-order byte of every
// uint16_t, we're really just grabbing the low-order bytes of source1 and
// source2.
__m128i packed1 = _mm_packus_epi16(source1, source2);
__m128i packed2 = _mm_packus_epi16(source3, source4);
// This store needs to be unaligned since there's no guarantee that the
// alignment we did above for the source will align the destination.
_mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i), packed1);
_mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 16), packed2);
}
// Finish up the rest.
for (; i < aSourceLength; ++i) {
dest[i] = static_cast<unsigned char>(aSource[i]);
}
// i == aSourceLength here.
mDestination += i;
}
/**
 * SSE2 implementation of LossyConvertEncoding8to16::write(): zero-extends
 * each of the aSourceLength input bytes to one UTF-16 code unit and
 * advances mDestination by the number of units written.
 *
 * Structure: scalar prologue up to the next 16-byte boundary of the source,
 * main loop over 32 bytes per iteration, scalar epilogue.
 */
void
LossyConvertEncoding8to16::write_sse2(const char* aSource,
uint32_t aSourceLength)
{
char16_t* dest = mDestination;
// Align source to a 16-byte boundary. We choose to align source rather than
// dest because we'd rather have our loads than our stores be fast. You have
// to wait for a load to complete, but you can keep on moving after issuing a
// store.
// (-address & 0xf) is the number of bytes up to that boundary, capped at
// the input length so short inputs are handled entirely by this loop.
uint32_t i = 0;
uint32_t alignLen = XPCOM_MIN(aSourceLength,
uint32_t(-NS_PTR_TO_INT32(aSource) & 0xf));
for (; i < alignLen; ++i) {
dest[i] = static_cast<unsigned char>(aSource[i]);
}
// Walk 32 bytes (two XMM registers) at a time.
// i <= aSourceLength here, so the unsigned subtraction cannot wrap.
for (; aSourceLength - i > 31; i += 32) {
__m128i source1 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i));
__m128i source2 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 16));
// Interleave 0s in with the bytes of source to create lo and hi.
__m128i lo1 = _mm_unpacklo_epi8(source1, _mm_setzero_si128());
__m128i hi1 = _mm_unpackhi_epi8(source1, _mm_setzero_si128());
__m128i lo2 = _mm_unpacklo_epi8(source2, _mm_setzero_si128());
__m128i hi2 = _mm_unpackhi_epi8(source2, _mm_setzero_si128());
// store lo and hi into dest.
// Unaligned stores: only the source was aligned above.
_mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i), lo1);
_mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 8), hi1);
_mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 16), lo2);
_mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 24), hi2);
}
// Finish up whatever's left.
for (; i < aSourceLength; ++i) {
dest[i] = static_cast<unsigned char>(aSource[i]);
}
// i == aSourceLength here.
mDestination += i;
}

View file

@ -82,7 +82,8 @@ TEST(Atoms, Invalid)
EXPECT_EQ(count, NS_GetNumberOfAtoms());
}
#ifndef DEBUG
// Don't run this test in debug builds as that intentionally asserts.
for (unsigned int i = 0; i < ArrayLength(Invalid8Strings); ++i) {
nsrefcnt count = NS_GetNumberOfAtoms();
@ -96,15 +97,15 @@ TEST(Atoms, Invalid)
EXPECT_EQ(count, NS_GetNumberOfAtoms());
}
// Don't run this test in debug builds as that intentionally asserts.
#ifndef DEBUG
RefPtr<nsAtom> emptyAtom = NS_Atomize("");
for (unsigned int i = 0; i < ArrayLength(Malformed8Strings); ++i) {
nsrefcnt count = NS_GetNumberOfAtoms();
RefPtr<nsAtom> atom8 = NS_Atomize(Malformed8Strings[i]);
EXPECT_EQ(atom8, emptyAtom);
{
RefPtr<nsAtom> atom8 = NS_Atomize(Malformed8Strings[i].m8);
RefPtr<nsAtom> atom16 = NS_Atomize(Malformed8Strings[i].m16);
EXPECT_EQ(atom8, atom16);
}
EXPECT_EQ(count, NS_GetNumberOfAtoms());
}
#endif

View file

@ -42,8 +42,8 @@ static void Check(const char* s1, const char* s2, size_t n)
}
nsAutoString t1,t2;
CopyASCIItoUTF16(s1, t1);
CopyASCIItoUTF16(s2, t2);
CopyASCIItoUTF16(mozilla::MakeStringSpan(s1), t1);
CopyASCIItoUTF16(mozilla::MakeStringSpan(s2), t2);
const char16_t* us1 = t1.get();
const char16_t* us2 = t2.get();

View file

@ -769,12 +769,10 @@ TEST_F(Strings, replace_substr)
TEST_F(Strings, replace_substr_2)
{
const char *oldName = nullptr;
const char *newName = "user";
nsString acctName; acctName.AssignLiteral("forums.foo.com");
nsAutoString newAcctName, oldVal, newVal;
CopyASCIItoUTF16(oldName, oldVal);
CopyASCIItoUTF16(newName, newVal);
CopyASCIItoUTF16(mozilla::MakeStringSpan(newName), newVal);
newAcctName.Assign(acctName);
// here, oldVal is empty. we are testing that this function
@ -1287,6 +1285,47 @@ TEST(String, strip_chars)
NS_LITERAL_STRING("foo"));
}
// After SetCapacity(100) the string must report a different buffer than
// before, and appending 100 single code units must keep using that buffer.
TEST_F(Strings, append_with_capacity)
{
  nsAutoString str;
  const char16_t* const initialBuf = str.BeginReading();

  str.SetCapacity(100);
  const char16_t* const reservedBuf = str.BeginReading();
  EXPECT_NE(initialBuf, reservedBuf);

  for (int appended = 1; appended <= 100; ++appended) {
    str.Append(u'a');
    // Same buffer as right after SetCapacity(), one unit longer each time.
    EXPECT_EQ(str.BeginReading(), reservedBuf);
    EXPECT_EQ(str.Length(), uint32_t(appended));
  }
}
// Same as append_with_capacity, but appends a two-unit string 100 times
// into a string whose capacity was set to 200 up front.
TEST_F(Strings, append_string_with_capacity)
{
  nsAutoString chunk;
  chunk.Append(u'a');
  chunk.Append(u'a');

  nsAutoString str;
  const char16_t* const initialBuf = str.BeginReading();

  str.SetCapacity(200);
  const char16_t* const reservedBuf = str.BeginReading();
  EXPECT_NE(initialBuf, reservedBuf);

  for (int rounds = 1; rounds <= 100; ++rounds) {
    str.Append(chunk);
    // Same buffer as right after SetCapacity(), two units longer each time.
    EXPECT_EQ(str.BeginReading(), reservedBuf);
    EXPECT_EQ(str.Length(), uint32_t(2 * rounds));
  }
}
// Bytes written directly through BeginWriting() after SetCapacity() must
// still be the string's content once SetLength() is called.
TEST_F(Strings, legacy_set_length_semantics)
{
  const char* expected = "foobar";
  const size_t expectedLen = strlen(expected);

  nsCString str;
  str.SetCapacity(2048);
  memcpy(str.BeginWriting(), expected, expectedLen);
  str.SetLength(expectedLen);

  EXPECT_TRUE(str.EqualsASCII(expected));
}
TEST_F(Strings, huge_capacity)
{
nsString a, b, c, d, e, f, g, h, i, j, k, l, m, n;

View file

@ -17,7 +17,6 @@ TEST(TextFormatter, Tests)
char16_t buf[256];
nsTextFormatter::snprintf(buf, 256, fmt.get(), d, 333, utf8, ucs2);
nsAutoString out(buf);
ASSERT_STREQ("Hello World", NS_LossyConvertUTF16toASCII(out).get());
const char16_t *uout = out.get();
const char16_t expected[] = {0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x20,

View file

@ -14,6 +14,7 @@
#include "UTFStrings.h"
#include "nsUnicharUtils.h"
#include "mozilla/HashFunctions.h"
#include "nsUTF8Utils.h"
#include "gtest/gtest.h"
@ -77,20 +78,18 @@ TEST(UTF, Invalid8)
// Each Malformed8Strings entry pairs a malformed UTF-8 input (m8) with the
// UTF-16 text it is expected to convert to (m16). Conversion, appending
// and comparison must all agree with that expected text.
TEST(UTF, Malformed8)
{
  for (unsigned int i = 0; i < ArrayLength(Malformed8Strings); ++i) {
    nsDependentString str16(Malformed8Strings[i].m16);
    nsDependentCString str8(Malformed8Strings[i].m8);

    // Converting yields the expected UTF-16 text.
    EXPECT_TRUE(NS_ConvertUTF8toUTF16(str8).Equals(str16));

    // Appending keeps the existing prefix and adds the converted text.
    nsString tmp16(NS_LITERAL_STRING("string "));
    AppendUTF8toUTF16(str8, tmp16);
    EXPECT_TRUE(tmp16.Equals(NS_LITERAL_STRING("string ") + str16));

    // The comparison treats the input as equal to the expected text.
    EXPECT_EQ(CompareUTF8toUTF16(str8, str16), 0);
  }
}
TEST(UTF, Hash16)
@ -106,20 +105,16 @@ TEST(UTF, Hash16)
for (unsigned int i = 0; i < ArrayLength(Invalid8Strings); ++i) {
nsDependentCString str8(Invalid8Strings[i].m8);
bool err;
EXPECT_EQ(HashString(Invalid8Strings[i].m16),
HashUTF8AsUTF16(str8.get(), str8.Length(), &err));
EXPECT_FALSE(err);
EXPECT_EQ(HashUTF8AsUTF16(str8.get(), str8.Length(), &err), 0u);
EXPECT_TRUE(err);
}
// Don't run this test in debug builds as that intentionally asserts.
#ifndef DEBUG
for (unsigned int i = 0; i < ArrayLength(Malformed8Strings); ++i) {
nsDependentCString str8(Malformed8Strings[i]);
nsDependentCString str8(Malformed8Strings[i].m8);
bool err;
EXPECT_EQ(HashUTF8AsUTF16(str8.get(), str8.Length(), &err), 0u);
EXPECT_TRUE(err);
}
#endif
}
/**
@ -178,14 +173,76 @@ void NonASCII16_helper(const size_t aStrSize)
}
}
TEST(UTF, NonASCII16)
// Walks a 49-byte buffer mixing valid and invalid UTF-8 sequences with
// UTF8CharEnumerator::NextChar and checks the value returned by each call
// (U+FFFD where the input is invalid), then checks three inputs that end in
// the middle of a multi-byte sequence.
TEST(UTF, UTF8CharEnumerator)
{
// NOTE(review): the NonASCII16_helper calls below look like leftovers of the
// removed NonASCII16 test interleaved by the diff rendering -- confirm
// whether they belong in this test.
// Test with various string sizes to catch any special casing.
NonASCII16_helper(1);
NonASCII16_helper(8);
NonASCII16_helper(16);
NonASCII16_helper(32);
NonASCII16_helper(512);
const char* p = "\x61\xC0\xC2\xC2\x80\xE0\x80\x80\xE0\xA0\x80\xE1\x80\x80\xED\xBF\xBF\xED\x9F\xBF\xEE\x80\x80\xEE\x80\xFF\xF0\x90\x80\x80\xF0\x80\x80\x80\xF1\x80\x80\x80\xF4\x8F\xBF\xF4\x8F\xBF\xBF\xF4\xBF\xBF\xBF";
// 49 is the byte length of the literal above (terminating NUL excluded).
const char* end = p + 49;
// 61: ASCII 'a'.
EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0x0061U);
// C0: byte that is never valid in UTF-8.
EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
// C2 followed by another lead byte instead of a continuation byte.
EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
// C2 80: smallest two-byte sequence.
EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0x0080U);
// E0 80 80: overlong three-byte form; three replacement characters expected.
EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
// E0 A0 80: smallest valid three-byte sequence.
EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0x0800U);
// E1 80 80.
EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0x1000U);
// ED BF BF: would encode the surrogate U+DFFF; three replacement characters
// expected.
EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
// ED 9F BF: last scalar value below the surrogate range.
EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xD7FFU);
// EE 80 80: first scalar value above the surrogate range.
EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xE000U);
// EE 80 FF: two replacement characters expected for these three bytes.
EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
// F0 90 80 80: smallest valid four-byte sequence.
EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0x10000U);
// F0 80 80 80: overlong four-byte form; four replacement characters expected.
EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
// F1 80 80 80.
EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0x40000U);
// F4 8F BF: four-byte sequence cut short by the next lead byte; a single
// replacement character expected.
EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
// F4 8F BF BF: highest valid code point.
EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0x10FFFFU);
// F4 BF BF BF: would be above U+10FFFF; four replacement characters expected.
EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
// The whole buffer must have been consumed.
EXPECT_EQ(p, end);
// Input ending after the lead byte of a two-byte sequence.
p = "\xC2";
end = p + 1;
EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
EXPECT_EQ(p, end);
// Input ending one byte short of a three-byte sequence.
p = "\xE1\x80";
end = p + 2;
EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
EXPECT_EQ(p, end);
// Input ending one byte short of a four-byte sequence.
p = "\xF1\x80\x80";
end = p + 3;
EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
EXPECT_EQ(p, end);
}
// Checks UTF16CharEnumerator::NextChar on a well-formed string and on inputs
// containing unpaired surrogates, which are expected to come back as U+FFFD.
// Each scenario also verifies that the read pointer ends up at the end of the
// input.
TEST(UTF, UTF16CharEnumerator)
{
  // Well-formed: one BMP code unit followed by a surrogate pair (3 units).
  {
    const char16_t* cursor = u"\u0061\U0001F4A9";
    const char16_t* limit = cursor + 3;
    EXPECT_EQ(UTF16CharEnumerator::NextChar(&cursor, limit), 0x0061U);
    EXPECT_EQ(UTF16CharEnumerator::NextChar(&cursor, limit), 0x1F4A9U);
    EXPECT_EQ(cursor, limit);
  }

  // A high surrogate that is the entire input.
  {
    const char16_t loneHigh = 0xD83D;
    const char16_t* cursor = &loneHigh;
    const char16_t* limit = cursor + 1;
    EXPECT_EQ(UTF16CharEnumerator::NextChar(&cursor, limit), 0xFFFDU);
    EXPECT_EQ(cursor, limit);
  }

  // A low surrogate that is the entire input.
  {
    const char16_t loneLow = 0xDCA9;
    const char16_t* cursor = &loneLow;
    const char16_t* limit = cursor + 1;
    EXPECT_EQ(UTF16CharEnumerator::NextChar(&cursor, limit), 0xFFFDU);
    EXPECT_EQ(cursor, limit);
  }

  // A high surrogate followed by a non-surrogate: the following unit must
  // still be returned on its own.
  {
    const char16_t loneHighStr[] = { 0xD83D, 0x0061 };
    const char16_t* cursor = loneHighStr;
    const char16_t* limit = cursor + 2;
    EXPECT_EQ(UTF16CharEnumerator::NextChar(&cursor, limit), 0xFFFDU);
    EXPECT_EQ(UTF16CharEnumerator::NextChar(&cursor, limit), 0x0061U);
    EXPECT_EQ(cursor, limit);
  }
}
} // namespace TestUTF

View file

@ -61,52 +61,68 @@ static const UTFStringsStringPair Invalid16Strings[] =
// Table of invalid UTF-8 inputs. Each entry pairs a char16_t sequence (read
// by the tests as .m16) with the UTF-8 bytes (read as .m8) it is compared
// against.
// NOTE(review): every byte row below is preceded by two candidate char16_t
// rows, which looks like removed and added diff lines interleaved without
// +/- markers. In each pair the second row carries one 0xFFFD per invalid
// input byte -- confirm which row is live.
static const UTFStringsStringPair Invalid8Strings[] =
{
{ { 'a', 0xFFFD, 'b' },
{ { 'a', 0xFFFD, 0xFFFD, 'b' },
{ 'a', char(0xC0), char(0x80), 'b' } },
{ { 0xFFFD, 0x80 },
{ { 0xFFFD, 0xFFFD, 0x80 },
{ char(0xC1), char(0xBF), char(0xC2), char(0x80) } },
{ { 0xFFFD },
{ { 0xFFFD, 0xFFFD },
{ char(0xC1), char(0xBF) } },
{ { 0xFFFD, 'x', 0x0800 },
{ { 0xFFFD, 0xFFFD, 0xFFFD, 'x', 0x0800 },
{ char(0xE0), char(0x80), char(0x80), 'x', char(0xE0), char(0xA0), char(0x80) } },
{ { 0xFFFD, 'x', 0xFFFD },
{ { 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 'x', 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD },
{ char(0xF0), char(0x80), char(0x80), char(0x80), 'x', char(0xF0), char(0x80), char(0x8F), char(0x80) } },
{ { 0xFFFD, 0xFFFD },
{ { 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD },
{ char(0xF4), char(0x90), char(0x80), char(0x80), char(0xF7), char(0xBF), char(0xBF), char(0xBF) } },
{ { 0xFFFD, 'x', 0xD800, 0xDC00, 0xFFFD },
{ { 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 'x', 0xD800, 0xDC00, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD },
{ char(0xF0), char(0x8F), char(0xBF), char(0xBF), 'x', char(0xF0), char(0x90), char(0x80), char(0x80), char(0xF0), char(0x8F), char(0xBF), char(0xBF) } },
{ { 0xFFFD, 'x', 0xFFFD },
{ { 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 'x', 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD },
{ char(0xF8), char(0x80), char(0x80), char(0x80), char(0x80), 'x', char(0xF8), char(0x88), char(0x80), char(0x80), char(0x80) } },
{ { 0xFFFD, 0xFFFD },
{ { 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD },
{ char(0xFB), char(0xBF), char(0xBF), char(0xBF), char(0xBF), char(0xFC), char(0xA0), char(0x80), char(0x80), char(0x80), char(0x80) } },
{ { 0xFFFD, 0xFFFD },
{ { 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD },
{ char(0xFC), char(0x80), char(0x80), char(0x80), char(0x80), char(0x80), char(0xFD), char(0xBF), char(0xBF), char(0xBF), char(0xBF), char(0xBF) } },
};
// Don't use this array in debug builds as that intentionally asserts.
#ifndef DEBUG
// Table of malformed UTF-8 inputs used by the Malformed8 and Hash16 tests.
// NOTE(review): two declarators appear back to back (a plain char[][16] table
// and a UTFStringsStringPair table), followed first by bare byte rows and then
// by { char16_t row, byte row } pairs. This looks like removed and added diff
// lines interleaved without +/- markers -- confirm which form is live. In the
// paired form, the first row of each entry holds char16_t values with 0xFFFD
// at the malformed positions, and the second row holds the UTF-8 bytes.
static const char Malformed8Strings[][16] =
static const UTFStringsStringPair Malformed8Strings[] =
{
{ char(0x80) },
{ 'a', char(0xC8), 'c' },
{ 'a', char(0xC0) },
{ 'a', char(0xE8), 'c' },
{ 'a', char(0xE8), char(0x80), 'c' },
{ 'a', char(0xE8), char(0x80) },
{ char(0xE8), 0x7F, char(0x80) },
{ 'a', char(0xE8), char(0xE8), char(0x80) },
{ 'a', char(0xF4) },
{ 'a', char(0xF4), char(0x80), char(0x80), 'c', 'c' },
{ 'a', char(0xF4), char(0x80), 'x', char(0x80) },
{ char(0xF4), char(0x80), char(0x80), char(0x80), char(0x80) },
{ 'a', char(0xFA), 'c' },
{ 'a', char(0xFA), char(0x80), char(0x80), 0x7F, char(0x80), 'c' },
{ 'a', char(0xFA), char(0x80), char(0x80), char(0x80), char(0x80), char(0x80), 'c' },
{ 'a', char(0xFD) },
{ 'a', char(0xFD), char(0x80), char(0x80), char(0x80), char(0x80), 'c' },
{ 'a', char(0xFD), char(0x80), char(0x80), char(0x80), char(0x80), char(0x80), char(0x80) },
{ 'a', char(0xFC), char(0x80), char(0x80), 0x40, char(0x80), char(0x80), 'c' },
{ { 0xFFFD },
{ char(0x80) } },
{ { 'a', 0xFFFD, 'c' },
{ 'a', char(0xC8), 'c' } },
{ { 'a', 0xFFFD },
{ 'a', char(0xC8) } },
{ { 'a', 0xFFFD, 'c' },
{ 'a', char(0xE8), 'c' } },
{ { 'a', 0xFFFD, 'c' },
{ 'a', char(0xE8), char(0x80), 'c' } },
{ { 'a', 0xFFFD },
{ 'a', char(0xE8), char(0x80) } },
{ { 0xFFFD, 0x7F, 0xFFFD },
{ char(0xE8), 0x7F, char(0x80) } },
{ { 'a', 0xFFFD, 0xFFFD },
{ 'a', char(0xE8), char(0xE8), char(0x80) } },
{ { 'a', 0xFFFD },
{ 'a', char(0xF4) } },
{ { 'a', 0xFFFD, 'c', 'c' },
{ 'a', char(0xF4), char(0x80), char(0x80), 'c', 'c' } },
{ { 'a', 0xFFFD, 'x', 0xFFFD },
{ 'a', char(0xF4), char(0x80), 'x', char(0x80) } },
{ { 0xDBC0, 0xDC00, 0xFFFD },
{ char(0xF4), char(0x80), char(0x80), char(0x80), char(0x80) } },
{ { 'a', 0xFFFD, 'c' },
{ 'a', char(0xFA), 'c' } },
{ { 'a', 0xFFFD, 0xFFFD, 0xFFFD, 0x7F, 0xFFFD, 'c' },
{ 'a', char(0xFA), char(0x80), char(0x80), 0x7F, char(0x80), 'c' } },
{ { 'a', 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 'c' },
{ 'a', char(0xFA), char(0x80), char(0x80), char(0x80), char(0x80), char(0x80), 'c' } },
{ { 'a', 0xFFFD },
{ 'a', char(0xFD) } },
{ { 'a', 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 'c' },
{ 'a', char(0xFD), char(0x80), char(0x80), char(0x80), char(0x80), 'c' } },
{ { 'a', 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD },
{ 'a', char(0xFD), char(0x80), char(0x80), char(0x80), char(0x80), char(0x80), char(0x80) } },
{ { 'a', 0xFFFD, 0xFFFD, 0xFFFD, 0x40, 0xFFFD, 0xFFFD, 'c' },
{ 'a', char(0xFD), char(0x80), char(0x80), 0x40, char(0x80), char(0x80), 'c' } },
};
#endif
#endif