diff --git a/.gitignore b/.gitignore
index 9919ac1f0d1a..2b16924f1047 100644
--- a/.gitignore
+++ b/.gitignore
@@ -177,3 +177,8 @@ testing/raptor/.raptor-venv
 testing/raptor/raptor-venv
 testing/raptor/raptor/tests/json/
 testing/raptor/webext/raptor/auto_gen_test_config.js
+
+# Ignore ICU4X experimentation data files.
+# See intl/ICU4X.md for more details.
+config/external/icu4x/icu4x.postcard
+config/external/icu4x/ICU4X-GIT-INFO
diff --git a/.hgignore b/.hgignore
index e9958123af6b..0c8f3a1860cd 100644
--- a/.hgignore
+++ b/.hgignore
@@ -221,3 +221,8 @@ toolkit/components/certviewer/content/package-lock.json
 
 # Ignore Rust/Cargo output from running `cargo` directly for image_builder docker image
 ^taskcluster/docker/image_builder/build-image/target
+
+# Ignore ICU4X experimentation data files.
+# See intl/ICU4X.md for more details.
+^config/external/icu4x/icu4x\.postcard$
+^config/external/icu4x/ICU4X-GIT-INFO$
diff --git a/config/external/icu4x/icu4x_data.S b/config/external/icu4x/icu4x_data.S
new file mode 100644
index 000000000000..95a647f9ef02
--- /dev/null
+++ b/config/external/icu4x/icu4x_data.S
@@ -0,0 +1,32 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+// Embeds the file named by ICU4X_DATA_FILE into read-only data under the
+// exported symbol ICU4X_DATA_SYMBOL. Both macros are defined in moz.build.
+
+#if defined(_WIN32) && defined(__i386__)
+// Mark the object as SAFESEH-enabled.
+.def @feat.00;
+.scl 3;
+.type 0;
+.endef
+.global @feat.00
+.set @feat.00, 1
+#endif
+
+.global ICU4X_DATA_SYMBOL
+#if defined(__APPLE__)
+.data
+.const
+#elif defined(__wasi__)
+.section .rodata,"",@
+#else
+.section .rodata
+#endif
+.balign 16
+ICU4X_DATA_SYMBOL:
+.incbin ICU4X_DATA_FILE
+#ifdef __wasi__
+.size ICU4X_DATA_SYMBOL, . - ICU4X_DATA_SYMBOL
+#endif
diff --git a/config/external/icu4x/moz.build b/config/external/icu4x/moz.build
new file mode 100644
index 000000000000..63273b83ff5f
--- /dev/null
+++ b/config/external/icu4x/moz.build
@@ -0,0 +1,35 @@
+# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
+# vim: set filetype=python:
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+# Build the ICU4X data directly into the binary file. This is an experiment that can
+# be enabled by adding `ac_add_options --enable-icu4x` to your mozconfig.
+# See `intl/ICU4X.md`.
+
+if CONFIG["MOZ_ICU4X"]:
+    DEFINES["MOZ_ICU4X"] = 1
+    Library("icu4xdata")
+    LOCAL_INCLUDES += ["."]
+
+    # The "mangled" symbol gets prefixed by a "_" in certain platforms.
+    symbol_prefix = ""
+    if (CONFIG["OS_ARCH"] == "WINNT" and CONFIG["CPU_ARCH"] == "x86") or CONFIG[
+        "OS_ARCH"
+    ] == "Darwin":
+        symbol_prefix = "_"
+
+    # To re-generate this file run: intl/update-icu4x.sh
+    DEFINES["ICU4X_DATA_FILE"] = '"icu4x.postcard"'
+
+    # In C++ this data will be available via:
+    #
+    #   extern uint8_t icu4x_static_locale_data;
+    #   uint8_t firstByte = (&icu4x_static_locale_data)[0];
+    DEFINES["ICU4X_DATA_SYMBOL"] = "%s%s" % (symbol_prefix, "icu4x_static_locale_data")
+
+    # This is assembly which has instructions to include the binary locale data directly.
+    SOURCES += [
+        "icu4x_data.S",
+    ]
diff --git a/intl/ICU4X.md b/intl/ICU4X.md
new file mode 100644
index 000000000000..b3ab912e92ec
--- /dev/null
+++ b/intl/ICU4X.md
@@ -0,0 +1,31 @@
+# Experimentation with ICU4X
+
+We're currently conducting some experiments with using [ICU4X](https://github.com/unicode-org/icu4x) in Gecko rather than ICU4C. This file documents the procedures for building with ICU4X. The current implementation is incomplete, and hopefully we can begin to land code incrementally.
This document also records the status of the experiment and what has landed in the tree so far. + +## Enabling ICU4X + +To enable the ICU4X experimentation: + + 1. Add `ac_add_options --enable-icu4x` to your mozconfig. + 2. Generate the locale data by running `./intl/update-icu4x.sh` with the ICU4X repository URL, the release tag, and the CLDR version, for example `./intl/update-icu4x.sh https://github.com/unicode-org/icu4x.git icu@0.3.0 39.0.0`. + 3. Do a full build. + +## Pieces of the ICU4X integration + +#### Bundle the ICU4X data + +The data is bundled directly into the binary in the `config/external/icu4x` directory. "icu4xdata" is a separate library, which consists of an assembly file that directly includes the ICU4X locale binary data. Eventually this binary data will be directly accessed via ICU4X's `StaticDataProvider`. + +The script `intl/update-icu4x.sh` can generate and update this binary data. At the time of this writing, this data is not checked in to source control since it is quite large and there is not yet a way to prune it by exporting only certain keys. See [unicode-org/icu4x#192](https://github.com/unicode-org/icu4x/issues/192) for the work on splitting out keys. + +#### Vendored ICU4X + +At the time of this writing, ICU4X has been [added to the allow list for vendored code](https://searchfox.org/mozilla-central/rev/b24799980a929597dcc553cb0854aa6c960c82b5/python/mozbuild/mozbuild/vendor/vendor_rust.py#284-293) but the files still need to be vendored in. This is currently blocked on some large testdata being included. There is a tracking issue [unicode-org/icu4x#849](https://github.com/unicode-org/icu4x/issues/849) for the ICU4X integration, but we might also find a way to prune the files in the Gecko vendoring code. + +#### Static Data Provider + +The data provider in ICU4X is the mechanism that loads the locale-specific data. In [previous experiments](https://bugzilla.mozilla.org/show_bug.cgi?id=1713136) we used the `FsDataProvider`, which hits the file system every time the APIs need any data. Future integrations should look into using the `StaticDataProvider`. 
+
+#### Building with FFIs
+
+The final step in the ICU4X integration is to actually build and include the C++ FFI files. In the Summer 2021 experimentation we used manually built FFIs, but future experiments will rely on the [Diplomat](https://github.com/rust-diplomat/diplomat)-built FFIs.
diff --git a/intl/update-icu4x.sh b/intl/update-icu4x.sh
new file mode 100755
index 000000000000..fdb2a20c0552
--- /dev/null
+++ b/intl/update-icu4x.sh
@@ -0,0 +1,92 @@
+#!/bin/sh
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+set -e
+
+# Regenerate the ICU4X locale data file in config/external/icu4x (see intl/ICU4X.md).
+#
+# Update the icu4x binary data for a given release:
+#   Usage: update-icu4x.sh <URL of ICU4X git> <release tag name> <CLDR version>
+#   update-icu4x.sh https://github.com/unicode-org/icu4x.git icu@0.3.0 39.0.0
+#
+# Update to the main branch:
+#   Usage: update-icu4x.sh <URL of ICU4X git> <branch> <CLDR version>
+#   update-icu4x.sh https://github.com/unicode-org/icu4x.git main 39.0.0
+
+# All three arguments are required: the CLDR version ($3) is handed to
+# `--cldr-tag` below, and leaving it out would run the tool with that option missing its value.
+if [ $# -lt 3 ]; then
+  echo "Usage: update-icu4x.sh <URL of ICU4X git> <release tag name> <CLDR version>" >&2
+  echo "Example: update-icu4x.sh https://github.com/unicode-org/icu4x.git icu@0.3.0 39.0.0" >&2
+  exit 1
+fi
+
+icu4x_repo=$1
+icu4x_ref=$2
+cldr_version=$3
+
+# Make a log function so the output is easy to read.
+log() {
+  # The message is passed as an argument instead of being spliced into the
+  # format string, so a '%' or '\' inside it is printed literally.
+  printf '\033[0;36m[update-icu4x]\033[0m %s\n' "$*"
+}
+
+# Specify locale and time zone information for consistent output and reproducibility.
+export TZ=UTC
+export LANG=en_US.UTF-8
+export LANGUAGE=en_US
+export LC_ALL=en_US.UTF-8
+
+# Define all of the paths.
+original_pwd=$(pwd)
+top_src_dir=$(cd -- "$(dirname "$0")/.." >/dev/null 2>&1 ; pwd -P)
+data_dir=${top_src_dir}/config/external/icu4x
+data_file=${data_dir}/icu4x.postcard
+git_info_file=${data_dir}/ICU4X-GIT-INFO
+
+log "Remove the old data"
+rm -f "${data_file}"
+
+log "Clone ICU4X"
+tmpclonedir=$(mktemp -d)
+# Also remove the temporary clone when a later command fails under `set -e`.
+trap 'rm -rf "${tmpclonedir}"' EXIT
+git clone --depth 1 --branch "${icu4x_ref}" "${icu4x_repo}" "${tmpclonedir}"
+
+log "Change the directory to the cloned repo"
+log "${tmpclonedir}"
+cd "${tmpclonedir}"
+
+log "Run the icu4x-datagen tool to regenerate the data."
+log "Saving the data to: ${data_file}"
+
+# TODO(Bug 1741262) - Should locales be filtered as well? It doesn't appear that the existing ICU
+# data builder is using any locale filtering.
+
+# TODO(Bug 1741264) - Keys are not supported yet: https://github.com/unicode-org/icu4x/issues/192
+#     --keys <KEYS>...
+#         Include this resource key in the output. Accepts multiple arguments.
+#     --key-file <KEY_FILE>
+#         Path to text file with resource keys to include, one per line. Empty lines and
+#         lines starting with '#' are ignored.
+cargo run --bin icu4x-datagen -- \
+  --cldr-tag "${cldr_version}" \
+  --all-keys \
+  --all-locales \
+  --format blob \
+  --out "${data_file}" \
+  -v
+
+log "Record the current cloned git information to:"
+log "${git_info_file}"
+# (This ensures that if ICU modifications are performed properly, it's always
+# possible to run the command at the top of this script and make no changes to
+# the tree.)
+git -C "${tmpclonedir}" log -1 > "${git_info_file}"
+
+log "Clean up the tmp directory"
+cd "${original_pwd}"
+rm -rf "${tmpclonedir}"
diff --git a/js/src/build/moz.build b/js/src/build/moz.build
index 23f9184c2545..d0409845252f 100644
--- a/js/src/build/moz.build
+++ b/js/src/build/moz.build
@@ -48,6 +48,9 @@ if CONFIG["JS_HAS_INTL_API"]:
         "icu",
     ]
 
+if CONFIG["MOZ_ICU4X"]:
+    USE_LIBS += ["icu4xdata"]
+
 USE_LIBS += [
     "nspr",
     "zlib",
diff --git a/moz.configure b/moz.configure
index 45d7b2900f09..5008454b45ad 100755
--- a/moz.configure
+++ b/moz.configure
@@ -676,6 +676,13 @@ option(
 
 set_config("MOZ_UI_LOCALE", depends("--enable-ui-locale")(lambda x: x))
 
+option(
+    "--enable-icu4x",
+    help="An experiment to use ICU4X instead of ICU4C. See intl/ICU4X.md",
+)
+
+set_config("MOZ_ICU4X", True, when="--enable-icu4x")
+
 # clang-plugin location
 # ==============================================================
 