diff --git a/.gitignore b/.gitignore
index 78441b64b2eb..b10285e79d49 100644
--- a/.gitignore
+++ b/.gitignore
@@ -71,6 +71,9 @@ browser/components/newtab/content-src/asrouter/schemas/corpus/PanelTestProvider_
 # Ignore Pocket component build and dev assets
 browser/components/pocket/content/panels/css/main.compiled.css.map
 
+# Ignore downloaded thirdparty build artifacts.
+toolkit/components/translations/bergamot-translator/thirdparty
+
 # Build directories for js shell
 *_DBG.OBJ/
 *_OPT.OBJ/
diff --git a/.hgignore b/.hgignore
index 305143ba2030..372191c60bea 100644
--- a/.hgignore
+++ b/.hgignore
@@ -69,6 +69,9 @@ compile_commands\.json
 # Ignore Pocket component build and dev assets
 browser/components/pocket/content/panels/css/main.compiled.css.map
 
+# Ignore downloaded thirdparty build artifacts.
+toolkit/components/translations/bergamot-translator/thirdparty
+
 # Build directories for js shell
 _DBG\.OBJ/
 _OPT\.OBJ/
diff --git a/.prettierignore b/.prettierignore
index 1503e305ff0c..cf6a55b9f66f 100644
--- a/.prettierignore
+++ b/.prettierignore
@@ -1424,7 +1424,8 @@ toolkit/components/normandy/vendor/
 toolkit/components/passwordmgr/PasswordRulesParser.sys.mjs
 toolkit/components/protobuf/
 toolkit/components/translation/cld2/
-toolkit/components/translations/bergamot-translator
+toolkit/components/translations/bergamot-translator/thirdparty
+toolkit/components/translations/bergamot-translator/bergamot-translator.js
 toolkit/components/url-classifier/chromium/
 toolkit/components/utils/mozjexl.js
 toolkit/components/viaduct/fetch_msg_types.pb.cc
diff --git a/toolkit/components/translations/bergamot-translator/build-bergamot.py b/toolkit/components/translations/bergamot-translator/build-bergamot.py
new file mode 100755
index 000000000000..4692292a34c0
--- /dev/null
+++ b/toolkit/components/translations/bergamot-translator/build-bergamot.py
@@ -0,0 +1,308 @@
+#!/usr/bin/env python3
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+"""
+Builds the Bergamot translations engine for integration with Firefox.
+
+If you wish to test the Bergamot engine locally, then uncomment the .wasm line in
+the toolkit/components/translations/jar.mn after building the file. Just make sure
+not to check the code change in.
+"""
+
+import argparse
+import multiprocessing
+import os
+import shutil
+import subprocess
+from collections import namedtuple
+
+import yaml
+
+DIR_PATH = os.path.realpath(os.path.dirname(__file__))
+THIRD_PARTY_PATH = os.path.join(DIR_PATH, "thirdparty")
+MOZ_YAML_PATH = os.path.join(DIR_PATH, "moz.yaml")
+PATCHES_PATH = os.path.join(DIR_PATH, "patches")
+BERGAMOT_PATH = os.path.join(THIRD_PARTY_PATH, "bergamot-translator")
+MARIAN_PATH = os.path.join(BERGAMOT_PATH, "3rd_party/marian-dev")
+GEMM_SCRIPT = os.path.join(BERGAMOT_PATH, "wasm/patch-artifacts-import-gemm-module.sh")
+BUILD_PATH = os.path.join(THIRD_PARTY_PATH, "build-wasm")
+EMSDK_PATH = os.path.join(THIRD_PARTY_PATH, "emsdk")
+EMSDK_ENV_PATH = os.path.join(EMSDK_PATH, "emsdk_env.sh")
+WASM_PATH = os.path.join(BUILD_PATH, "bergamot-translator-worker.wasm")
+JS_PATH = os.path.join(BUILD_PATH, "bergamot-translator-worker.js")
+FINAL_JS_PATH = os.path.join(DIR_PATH, "bergamot-translator.js")
+ROOT_PATH = os.path.join(DIR_PATH, "../../../..")
+
+# 3.1.47 had an error compiling sentencepiece.
+EMSDK_VERSION = "3.1.8"
+EMSDK_REVISION = "2346baa7bb44a4a0571cc75f1986ab9aaa35aa03"
+
+patches = [
+    (BERGAMOT_PATH, os.path.join(PATCHES_PATH, "allocation-bergamot.patch")),
+    (MARIAN_PATH, os.path.join(PATCHES_PATH, "allocation-marian.patch")),
+]
+
+parser = argparse.ArgumentParser(
+    description=__doc__,
+    # Preserves whitespace in the help text.
+    formatter_class=argparse.RawTextHelpFormatter,
+)
+parser.add_argument(
+    "--clobber", action="store_true", help="Clobber the build artifacts"
+)
+parser.add_argument(
+    "--debug",
+    action="store_true",
+    help="Build with debug symbols, useful for profiling",
+)
+
+ArgNamespace = namedtuple("ArgNamespace", ["clobber", "debug"])
+
+
+def git_clone_update(name: str, repo_path: str, repo_url: str, revision: str):
+    """
+    Clone the repo into repo_path if it is missing, then make sure that the
+    requested revision and its submodules are checked out.
+    """
+    if not os.path.exists(repo_path):
+        print(f"\n⬇️ Clone the {name} repo into {repo_path}\n")
+        # Pass the destination explicitly so that the clone always lands in
+        # repo_path, rather than in a directory name derived from the URL.
+        subprocess.check_call(
+            ["git", "clone", repo_url, repo_path],
+            cwd=THIRD_PARTY_PATH,
+        )
+
+    local_head = subprocess.check_output(
+        ["git", "rev-parse", "HEAD"],
+        cwd=repo_path,
+        text=True,
+    ).strip()
+
+    def run(command):
+        return subprocess.check_call(command, cwd=repo_path)
+
+    if local_head != revision:
+        print(f"The head ({local_head}) and revision ({revision}) don't match.")
+        print(f"\n🔎 Fetching the latest from {name}.\n")
+        run(["git", "fetch", "--recurse-submodules"])
+
+        print(f"🛒 Checking out the revision {revision}")
+        run(["git", "checkout", revision])
+        run(["git", "submodule", "update", "--init", "--recursive"])
+
+
+def install_and_activate_emscripten(args: ArgNamespace):
+    """
+    Check out the pinned emsdk revision, then install and activate the pinned
+    Emscripten version. The args are currently unused.
+    """
+    git_clone_update(
+        name="emsdk",
+        repo_path=EMSDK_PATH,
+        repo_url="https://github.com/emscripten-core/emsdk.git",
+        revision=EMSDK_REVISION,
+    )
+
+    # Run these commands in the shell so that the configuration is saved.
+    def run_shell(command):
+        return subprocess.run(command, cwd=EMSDK_PATH, shell=True, check=True)
+
+    print(f"\n🛠️ Installing EMSDK version {EMSDK_VERSION}\n")
+    run_shell("./emsdk install " + EMSDK_VERSION)
+
+    print("\n🛠️ Activating emsdk\n")
+    run_shell("./emsdk activate " + EMSDK_VERSION)
+
+
+def install_bergamot():
+    """
+    Check out the Bergamot repo at the url and revision listed under "origin"
+    in moz.yaml.
+    """
+    with open(MOZ_YAML_PATH, "r", encoding="utf8") as file:
+        text = file.read()
+
+    moz_yaml = yaml.safe_load(text)
+
+    git_clone_update(
+        name="bergamot",
+        repo_path=BERGAMOT_PATH,
+        repo_url=moz_yaml["origin"]["url"],
+        revision=moz_yaml["origin"]["revision"],
+    )
+
+
+def to_human_readable(size):
+    """Convert sizes to human-readable format"""
+    size_in_mb = size / 1048576
+    return f"{size_in_mb:.2f}M ({size} bytes)"
+
+
+def apply_git_patch(repo_path, patch_path):
+    """Apply the patch file to the git checkout at repo_path."""
+    print(f"Applying patch {patch_path} to {os.path.basename(repo_path)}")
+    subprocess.check_call(["git", "apply", "--reject", patch_path], cwd=repo_path)
+
+
+def revert_git_patch(repo_path, patch_path):
+    """Reverse a previously applied patch in the git checkout at repo_path."""
+    print(f"Reverting patch {patch_path} from {os.path.basename(repo_path)}")
+    subprocess.check_call(["git", "apply", "-R", "--reject", patch_path], cwd=repo_path)
+
+
+def build_bergamot(args: ArgNamespace):
+    """
+    Build the Bergamot wasm artifacts in BUILD_PATH and report their sizes.
+
+    The source patches are only applied for the duration of the build, and are
+    reverted afterwards, even when the build fails.
+    """
+    if args.clobber and os.path.exists(BUILD_PATH):
+        shutil.rmtree(BUILD_PATH)
+
+    os.makedirs(BUILD_PATH, exist_ok=True)
+
+    # These commands require the emsdk environment variables to be set up.
+    def run_shell(command):
+        if '"' in command or "'" in command:
+            raise Exception("This run_shell utility does not support quotes.")
+
+        return subprocess.run(
+            # "source" is not available in all shells, so explicitly run the
+            # command through bash.
+            f"bash -c 'source {EMSDK_ENV_PATH} && {command}'",
+            cwd=BUILD_PATH,
+            shell=True,
+            check=True,
+        )
+
+    # Track the applied patches so that a failure part of the way through only
+    # reverts the patches that actually landed.
+    applied_patches = []
+    try:
+        print("\n 🖌️ Applying source code patches\n")
+        for repo_path, patch_path in patches:
+            apply_git_patch(repo_path, patch_path)
+            applied_patches.append((repo_path, patch_path))
+
+        flags = ""
+        if args.debug:
+            flags = "-DCMAKE_BUILD_TYPE=RelWithDebInfo"
+
+        print("\n 🏃 Running CMake for Bergamot\n")
+        run_shell(f"emcmake cmake -DCOMPILE_WASM=on {flags} {BERGAMOT_PATH}")
+
+        print("\n 🏃 Building Bergamot with emmake\n")
+        run_shell(f"emmake make -j {multiprocessing.cpu_count()}")
+
+        print("\n 🪚 Patching Bergamot for gemm support\n")
+        subprocess.check_call(["bash", GEMM_SCRIPT, BUILD_PATH])
+
+        print("\n✅ Build complete\n")
+        print(" " + JS_PATH)
+        print(" " + WASM_PATH)
+
+        # Get the sizes of the build artifacts. The gzip output is captured
+        # directly rather than piped through a shell, so that paths containing
+        # spaces are handled.
+        wasm_size = os.path.getsize(WASM_PATH)
+        gzip_size = len(
+            subprocess.run(
+                ["gzip", "-c", WASM_PATH],
+                check=True,
+                capture_output=True,
+            ).stdout
+        )
+        print(f" Uncompressed wasm size: {to_human_readable(wasm_size)}")
+        print(f" Compressed wasm size: {to_human_readable(gzip_size)}")
+    finally:
+        print("\n🖌️ Reverting the source code patches\n")
+        for repo_path, patch_path in reversed(applied_patches):
+            revert_git_patch(repo_path, patch_path)
+
+
+def write_final_bergamot_js_file():
+    """
+    The generated JS file requires some light patching for integration.
+
+    The build output is wrapped in a loadBergamot function, its logging is
+    routed through Module.print, and the result is formatted with eslint before
+    it is moved to FINAL_JS_PATH.
+    """
+
+    header = "\n".join(
+        [
+            "/* This Source Code Form is subject to the terms of the Mozilla Public",
+            " * License, v. 2.0. If a copy of the MPL was not distributed with this",
+            " * file, You can obtain one at http://mozilla.org/MPL/2.0/. */",
+            "",
+            "function loadBergamot(Module) {",
+            "",
+        ]
+    )
+
+    with open(JS_PATH, "r", encoding="utf8") as file:
+        body = "".join(" " + line for line in file)
+
+    source = header + body + " return Module;\n}"
+
+    # Use the Module's printing.
+    source = source.replace("console.log(", "Module.print(")
+
+    # Add some instrumentation to the module's memory size.
+    source = source.replace(
+        "function updateGlobalBufferAndViews(buf) {",
+        """
+        function updateGlobalBufferAndViews(buf) {
+          const mb = (buf.byteLength / 1_000_000).toFixed();
+          Module.print(
+            `Growing wasm buffer to ${mb}MB (${buf.byteLength} bytes).`
+          );
+        """,
+    )
+
+    print("\n Formatting the final bergamot file")
+    # Create the file outside of this directory so it's not ignored by eslint.
+    temp_path = os.path.join(DIR_PATH, "../temp-bergamot.js")
+    try:
+        with open(temp_path, "w", encoding="utf8") as file:
+            file.write(source)
+
+        result = subprocess.run(
+            [os.path.join(ROOT_PATH, "mach"), "eslint", "--fix", temp_path],
+            cwd=ROOT_PATH,
+            capture_output=True,
+            text=True,
+        )
+        if result.returncode != 0:
+            # The output is captured, so surface it before raising.
+            print(result.stdout)
+            print(result.stderr)
+            result.check_returncode()
+
+        print(f"\n Writing out final bergamot file: {FINAL_JS_PATH}")
+        shutil.move(temp_path, FINAL_JS_PATH)
+    finally:
+        # Do not leave the temporary file behind when a step above fails.
+        if os.path.exists(temp_path):
+            os.remove(temp_path)
+
+
+def main():
+    """Parse the command line arguments, then run each build step in order."""
+    args: ArgNamespace = parser.parse_args()
+
+    os.makedirs(THIRD_PARTY_PATH, exist_ok=True)
+
+    install_and_activate_emscripten(args)
+    install_bergamot()
+    build_bergamot(args)
+    write_final_bergamot_js_file()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/toolkit/components/translations/bergamot-translator/moz.yaml b/toolkit/components/translations/bergamot-translator/moz.yaml
index 427e7cdc0418..23a4cc8c27f6 100644
--- a/toolkit/components/translations/bergamot-translator/moz.yaml
+++ b/toolkit/components/translations/bergamot-translator/moz.yaml
@@ -16,15 +16,15 @@ origin:
 
   # Full URL for the package's homepage/etc
   # Usually different from repository url
-  url: https://github.com/mozilla/bergamot-translator/
+  url: https://github.com/browsermt/bergamot-translator.git
 
   # Human-readable identifier for this version/release
   # Generally "version NNN", "tag SSS", "bookmark SSS"
-  release: v0.4.4
+  release: v0.4.5
 
   # Revision to pull in
   # Must be a long or short commit SHA (long preferred)
-  revision: 5ae1b1ebb3fa9a3eabed8a64ca6798154bd486eb
+  revision: 05a87784973b6e1cc591f1f1a9a05c5873d9971e
 
   # The package's license, where possible using the mnemonic from
   # https://spdx.org/licenses/
diff --git a/toolkit/components/translations/bergamot-translator/patches/allocation-bergamot.patch b/toolkit/components/translations/bergamot-translator/patches/allocation-bergamot.patch
new file mode 100644
index 000000000000..a8dca5b7e1d5
--- /dev/null
+++ b/toolkit/components/translations/bergamot-translator/patches/allocation-bergamot.patch
@@ -0,0 +1,26 @@
+commit dfa705777729fd084f0187a90f9712eb76ea9209
+parent 05a87784973b6e1cc591f1f1a9a05c5873d9971e
+Author: Greg Tatum
+Date:   Tue Nov 7 10:57:07 2023 -0600
+
+    Change allocation strategy
+
+    This fixes an issue where the memory would grow to 500mb by pre-allocating large
+    workspaces. For some reason the "workspace" configuration for the Wasm build wasn't
+    fixing this, but hard-coding the value does. Perhaps the configuration file in Bergamot
+    is not working correctly, or it was just a mistake on the author's part. Empirically
+    this value keeps memory from growing too rapidly, and does not degrade Wasm performance.
+
+diff --git a/src/translator/translation_model.cpp b/src/translator/translation_model.cpp
+index 3f91ebb..61a299f 100644
+--- a/src/translator/translation_model.cpp
++++ b/src/translator/translation_model.cpp
+@@ -59,7 +59,7 @@ void TranslationModel::loadBackend(size_t idx) {
+   graph->setDefaultElementType(typeFromString(prec[0]));
+   graph->setDevice(device_);
+   graph->getBackend()->configureDevice(options_);
+-  graph->reserveWorkspaceMB(options_->get("workspace"));
++  graph->reserveWorkspaceMB(5);
+ 
+   // Marian Model: Load from memoryBundle or shortList
+   if (memory_.model.size() > 0 &&
diff --git a/toolkit/components/translations/bergamot-translator/patches/allocation-marian.patch b/toolkit/components/translations/bergamot-translator/patches/allocation-marian.patch
new file mode 100644
index 000000000000..4fe2616d0784
--- /dev/null
+++ b/toolkit/components/translations/bergamot-translator/patches/allocation-marian.patch
@@ -0,0 +1,25 @@
+commit 31a05b47381a5b22b57fe9af7805fa40a5c5e384
+parent 11c6ae7c46be21ef96ed10c60f28022fa968939f
+Author: Greg Tatum
+Date:   Mon Nov 6 14:01:32 2023 -0600
+
+    Change allocation strategy for tensors
+
+    When tensors grow, they would pre-emptively allocate large amounts of memory, and
+    would allocate ~500mb of memory for a single translation. Adjusting this value
+    down appears to fix this issue. Empirically this value keeps memory from growing too
+    rapidly, and does not degrade Wasm performance.
+
+diff --git a/src/tensors/tensor_allocator.h b/src/tensors/tensor_allocator.h
+index e3bc79f9..66f8e44d 100644
+--- a/src/tensors/tensor_allocator.h
++++ b/src/tensors/tensor_allocator.h
+@@ -13,7 +13,7 @@ class TensorAllocator {
+ private:
+   const size_t CHUNK = 128;
+   const size_t MBYTE = 1024 * 1024;
+-  const size_t GROW = CHUNK * MBYTE;
++  const size_t GROW = MBYTE;
+   const size_t ALIGN = 256;
+ 
+   Ptr backend_;
diff --git a/toolkit/components/translations/content/translations-engine-worker.js b/toolkit/components/translations/content/translations-engine-worker.js
index 7d3954164e58..9459ace7a24d 100644
--- a/toolkit/components/translations/content/translations-engine-worker.js
+++ b/toolkit/components/translations/content/translations-engine-worker.js
@@ -493,19 +493,19 @@ class BergamotUtils {
    *
    * https://github.com/mozilla/bergamot-translator/
    *
-   * @param {ArrayBuffer} wasmBinary
+   * @param {ArrayBuffer} wasm
    * @returns {Promise}
    */
-  static initializeWasm(wasmBinary) {
+  static initializeWasm(wasm) {
     return new Promise((resolve, reject) => {
       /** @type {number} */
       let start = performance.now();
 
       /** @type {Bergamot} */
       const bergamot = loadBergamot({
-        // This is the amount of memory that a simple run of Bergamot uses, in byte.
-        INITIAL_MEMORY: 459_276_288,
-        preRun: [],
+        // This is the amount of memory that a simple run of Bergamot uses, in bytes.
+        INITIAL_MEMORY: 234_291_200,
+        print: log,
         onAbort() {
           reject(new Error("Error loading Bergamot wasm module."));
         },
@@ -519,7 +519,7 @@ class BergamotUtils {
           await Promise.resolve();
           resolve(bergamot);
         },
-        wasmBinary,
+        wasmBinary: wasm,
       });
     });
   }
diff --git a/tools/rewriting/ThirdPartyPaths.txt b/tools/rewriting/ThirdPartyPaths.txt
index 79eaa0b0f627..a0ac0e35b9c1 100644
--- a/tools/rewriting/ThirdPartyPaths.txt
+++ b/tools/rewriting/ThirdPartyPaths.txt
@@ -179,7 +179,8 @@ toolkit/components/normandy/vendor/
 toolkit/components/passwordmgr/PasswordRulesParser.sys.mjs
 toolkit/components/protobuf/
 toolkit/components/translation/cld2/
-toolkit/components/translations/bergamot-translator
+toolkit/components/translations/bergamot-translator/thirdparty
+toolkit/components/translations/bergamot-translator/bergamot-translator.js
 toolkit/components/url-classifier/chromium/
 toolkit/components/utils/mozjexl.js
 toolkit/components/viaduct/fetch_msg_types.pb.cc