Bug 1863793 - Add a Bergamot translator build script; r=translations-reviewers,nordzilla

Differential Revision: https://phabricator.services.mozilla.com/D193559
This commit is contained in:
Greg Tatum 2023-11-28 17:32:48 +00:00
parent 39d85a0ba9
commit d740b0a3ec
9 changed files with 339 additions and 11 deletions

3
.gitignore vendored
View file

@ -71,6 +71,9 @@ browser/components/newtab/content-src/asrouter/schemas/corpus/PanelTestProvider_
# Ignore Pocket component build and dev assets
browser/components/pocket/content/panels/css/main.compiled.css.map
# Ignore downloaded thirdparty build artifacts.
toolkit/components/translations/bergamot-translator/thirdparty
# Build directories for js shell
*_DBG.OBJ/
*_OPT.OBJ/

View file

@ -69,6 +69,9 @@ compile_commands\.json
# Ignore Pocket component build and dev assets
browser/components/pocket/content/panels/css/main.compiled.css.map
# Ignore downloaded thirdparty build artifacts.
toolkit/components/translations/bergamot-translator/thirdparty
# Build directories for js shell
_DBG\.OBJ/
_OPT\.OBJ/

View file

@ -1424,7 +1424,8 @@ toolkit/components/normandy/vendor/
toolkit/components/passwordmgr/PasswordRulesParser.sys.mjs
toolkit/components/protobuf/
toolkit/components/translation/cld2/
toolkit/components/translations/bergamot-translator
toolkit/components/translations/bergamot-translator/thirdparty
toolkit/components/translations/bergamot-translator/bergamot-translator.js
toolkit/components/url-classifier/chromium/
toolkit/components/utils/mozjexl.js
toolkit/components/viaduct/fetch_msg_types.pb.cc

View file

@ -0,0 +1,269 @@
#!/usr/bin/env python3
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""
Builds the Bergamot translations engine for integration with Firefox.
If you wish to test the Bergamot engine locally, then uncomment the .wasm line in
the toolkit/components/translations/jar.mn after building the file. Just make sure
not to check the code change in.
"""
import argparse
import multiprocessing
import os
import shutil
import subprocess
from collections import namedtuple
import yaml
# Directory containing this script, with symlinks resolved.
DIR_PATH = os.path.realpath(os.path.dirname(__file__))
# Scratch directory that holds the cloned repositories and the wasm build output.
THIRD_PARTY_PATH = os.path.join(DIR_PATH, "thirdparty")
# The moz.yaml supplies the Bergamot repository URL and the pinned revision.
MOZ_YAML_PATH = os.path.join(DIR_PATH, "moz.yaml")
# Local patch files that are applied to the checkouts for the duration of a build.
PATCHES_PATH = os.path.join(DIR_PATH, "patches")
BERGAMOT_PATH = os.path.join(THIRD_PARTY_PATH, "bergamot-translator")
MARIAN_PATH = os.path.join(BERGAMOT_PATH, "3rd_party/marian-dev")
# Shell script from the Bergamot checkout that is run on the build directory
# after compilation.
GEMM_SCRIPT = os.path.join(BERGAMOT_PATH, "wasm/patch-artifacts-import-gemm-module.sh")
BUILD_PATH = os.path.join(THIRD_PARTY_PATH, "build-wasm")
EMSDK_PATH = os.path.join(THIRD_PARTY_PATH, "emsdk")
# Sourced in bash before every emcmake/emmake invocation.
EMSDK_ENV_PATH = os.path.join(EMSDK_PATH, "emsdk_env.sh")
# Build artifacts produced in BUILD_PATH.
WASM_PATH = os.path.join(BUILD_PATH, "bergamot-translator-worker.wasm")
JS_PATH = os.path.join(BUILD_PATH, "bergamot-translator-worker.js")
# The wrapped and formatted JS file that is written next to this script.
FINAL_JS_PATH = os.path.join(DIR_PATH, "bergamot-translator.js")
# Four directories up from this script; used as the working directory for ./mach.
ROOT_PATH = os.path.join(DIR_PATH, "../../../..")

# 3.1.47 had an error compiling sentencepiece.
EMSDK_VERSION = "3.1.8"
# Commit of the emsdk repository to check out before installing EMSDK_VERSION.
# NOTE(review): presumably chosen to match EMSDK_VERSION; confirm when bumping either.
EMSDK_REVISION = "2346baa7bb44a4a0571cc75f1986ab9aaa35aa03"

# (repository checkout, patch file) pairs. They are applied in this order before
# the build and reverted in reverse order afterwards.
patches = [
    (BERGAMOT_PATH, os.path.join(PATCHES_PATH, "allocation-bergamot.patch")),
    (MARIAN_PATH, os.path.join(PATCHES_PATH, "allocation-marian.patch")),
]
# Command line interface. The module docstring doubles as the --help description.
parser = argparse.ArgumentParser(
    description=__doc__,
    # Preserves whitespace in the help text.
    formatter_class=argparse.RawTextHelpFormatter,
)
# When set, the existing build directory is deleted before building.
parser.add_argument(
    "--clobber", action="store_true", help="Clobber the build artifacts"
)
# When set, CMake is configured with the RelWithDebInfo build type.
parser.add_argument(
    "--debug",
    action="store_true",
    help="Build with debug symbols, useful for profiling",
)

# Describes the attributes available on the parsed arguments. It is only used as
# a type annotation in this file; parser.parse_args() itself returns an
# argparse.Namespace.
ArgNamespace = namedtuple("ArgNamespace", ["clobber", "debug"])
def git_clone_update(name: str, repo_path: str, repo_url: str, revision: str):
    """
    Ensure a git checkout exists at repo_path and is positioned at revision.

    The repository is cloned when repo_path does not exist yet. When the
    checked out HEAD does not match the requested revision, the latest commits
    are fetched and the revision is checked out. Submodules are always brought
    in sync with the checked out commit.

    Args:
        name: Human readable name of the repository, only used for logging.
        repo_path: Directory of the checkout.
        repo_url: URL to clone from when the checkout is missing.
        revision: The commit SHA to check out. A short SHA is accepted.

    Raises:
        subprocess.CalledProcessError: When any of the git commands fail.
    """
    if not os.path.exists(repo_path):
        print(f"\n⬇️ Clone the {name} repo into {repo_path}\n")
        subprocess.check_call(
            # Name the destination explicitly. Without it git derives the
            # directory name from the URL, which is not guaranteed to match
            # repo_path.
            ["git", "clone", repo_url, repo_path],
            cwd=THIRD_PARTY_PATH,
        )

    def run(command):
        return subprocess.check_call(command, cwd=repo_path)

    local_head = subprocess.check_output(
        ["git", "rev-parse", "HEAD"],
        cwd=repo_path,
        text=True,
    ).strip()

    # rev-parse always reports the full SHA, while the requested revision may be
    # a short SHA. Compare by prefix so that a short SHA does not force a fetch
    # and checkout on every run.
    at_revision = bool(revision) and local_head.startswith(revision)

    if not at_revision:
        print(f"The head ({local_head}) and revision ({revision}) don't match.")
        print(f"\n🔎 Fetching the latest from {name}.\n")
        run(["git", "fetch", "--recurse-submodules"])

        print(f"🛒 Checking out the revision {revision}")
        run(["git", "checkout", revision])

    # Sync the submodules even when the HEAD already matched. A fresh clone whose
    # default HEAD happens to be the requested revision would otherwise be left
    # without its submodules initialized.
    run(["git", "submodule", "update", "--init", "--recursive"])
def install_and_activate_emscripten(args: ArgNamespace):
    """
    Check out the pinned emsdk revision, then install and activate EMSDK_VERSION.

    Args:
        args: The parsed command line arguments. Not read by this step.

    Raises:
        subprocess.CalledProcessError: When git or an emsdk command fails.
    """
    git_clone_update(
        "emsdk",
        EMSDK_PATH,
        "https://github.com/emscripten-core/emsdk.git",
        EMSDK_REVISION,
    )

    steps = (
        (f"\n🛠️ Installing EMSDK version {EMSDK_VERSION}\n", "./emsdk install "),
        ("\n🛠️ Activating emsdk\n", "./emsdk activate "),
    )
    for banner, emsdk_command in steps:
        print(banner)
        # Run these commands in the shell so that the configuration is saved.
        subprocess.run(
            emsdk_command + EMSDK_VERSION,
            cwd=EMSDK_PATH,
            shell=True,
            check=True,
        )
def install_bergamot():
    """
    Clone or update the Bergamot checkout to the origin described in moz.yaml.

    The repository URL and the revision are both read from the "origin" section
    of the moz.yaml file.
    """
    with open(MOZ_YAML_PATH, "r", encoding="utf8") as yaml_file:
        yaml_text = yaml_file.read()

    origin = yaml.safe_load(yaml_text)["origin"]

    git_clone_update(
        name="bergamot",
        repo_path=BERGAMOT_PATH,
        repo_url=origin["url"],
        revision=origin["revision"],
    )
def to_human_readable(size):
    """
    Describe a byte count in megabytes, followed by the exact number of bytes.

    Args:
        size: The size in bytes.

    Returns:
        A string such as "1.50M (1572864 bytes)", where one megabyte is
        1024 * 1024 bytes and the value is rounded to two decimal places.
    """
    bytes_per_megabyte = 1024 * 1024
    return "{:.2f}M ({} bytes)".format(size / bytes_per_megabyte, size)
def apply_git_patch(repo_path, patch_path):
    """
    Apply a patch file to the working tree of a git checkout.

    Args:
        repo_path: The checkout to run `git apply` in.
        patch_path: Path to the patch file.

    Raises:
        subprocess.CalledProcessError: When `git apply` exits with an error.
    """
    repo_name = os.path.basename(repo_path)
    print(f"Applying patch {patch_path} to {repo_name}")

    git_command = ["git", "apply", "--reject", patch_path]
    subprocess.check_call(git_command, cwd=repo_path)
def revert_git_patch(repo_path, patch_path):
    """
    Undo a previously applied patch file in the working tree of a git checkout.

    Args:
        repo_path: The checkout to run `git apply -R` in.
        patch_path: Path to the patch file.

    Raises:
        subprocess.CalledProcessError: When `git apply` exits with an error.
    """
    repo_name = os.path.basename(repo_path)
    print(f"Reverting patch {patch_path} from {repo_name}")

    git_command = ["git", "apply", "-R", "--reject", patch_path]
    subprocess.check_call(git_command, cwd=repo_path)
def build_bergamot(args: ArgNamespace):
    """
    Compile Bergamot to wasm inside of BUILD_PATH and report the artifact sizes.

    The local patches are applied to the checkouts for the duration of the build
    and are reverted afterwards, whether or not the build succeeded.

    Args:
        args: The parsed command line arguments. `clobber` removes any existing
            build directory first, and `debug` configures CMake with the
            RelWithDebInfo build type.

    Raises:
        subprocess.CalledProcessError: When a patch, CMake, make, or the gemm
            script fails.
        Exception: When a command handed to the internal shell helper contains
            quotes.
    """
    if args.clobber and os.path.exists(BUILD_PATH):
        shutil.rmtree(BUILD_PATH)

    if not os.path.exists(BUILD_PATH):
        os.mkdir(BUILD_PATH)

    # These commands require the emsdk environment variables to be set up.
    def run_shell(command):
        # The command is embedded in a single-quoted bash string below, so a
        # quote inside of it would break out of that string.
        if '"' in command or "'" in command:
            raise Exception("This run_shell utility does not support quotes.")

        return subprocess.run(
            # "source" is not available in all shells so explicitly run the
            # command through bash.
            f"bash -c 'source {EMSDK_ENV_PATH} && {command}'",
            cwd=BUILD_PATH,
            shell=True,
            check=True,
        )

    # Only the patches that were applied cleanly are recorded here, so that the
    # cleanup below never tries to revert a patch that was not applied.
    applied_patches = []

    try:
        # Apply the patches inside of the try block. If a later patch fails, the
        # earlier ones are still reverted. The failing patch itself is left as
        # git left it (see the `--reject` option) for manual inspection.
        print("\n 🖌️ Applying source code patches\n")
        for repo_path, patch_path in patches:
            apply_git_patch(repo_path, patch_path)
            applied_patches.append((repo_path, patch_path))

        flags = ""
        if args.debug:
            flags = "-DCMAKE_BUILD_TYPE=RelWithDebInfo"

        print("\n 🏃 Running CMake for Bergamot\n")
        run_shell(f"emcmake cmake -DCOMPILE_WASM=on {flags} {BERGAMOT_PATH}")

        print("\n 🏃 Building Bergamot with emmake\n")
        run_shell(f"emmake make -j {multiprocessing.cpu_count()}")

        print("\n 🪚 Patching Bergamot for gemm support\n")
        subprocess.check_call(["bash", GEMM_SCRIPT, BUILD_PATH])

        print("\n✅ Build complete\n")
        print(" " + JS_PATH)
        print(" " + WASM_PATH)

        # Get the sizes of the build artifacts.
        wasm_size = os.path.getsize(WASM_PATH)
        gzip_size = int(
            subprocess.run(
                f"gzip -c {WASM_PATH} | wc -c",
                check=True,
                shell=True,
                capture_output=True,
            ).stdout.strip()
        )
        print(f" Uncompressed wasm size: {to_human_readable(wasm_size)}")
        print(f" Compressed wasm size: {to_human_readable(gzip_size)}")
    finally:
        print("\n🖌️ Reverting the source code patches\n")
        # Revert in the opposite order of application.
        for repo_path, patch_path in reversed(applied_patches):
            revert_git_patch(repo_path, patch_path)
def write_final_bergamot_js_file():
    """
    The generated JS file requires some light patching for integration.

    The build output is wrapped in a `loadBergamot(Module)` function, its logging
    is routed through `Module.print`, and a log message is injected into
    `updateGlobalBufferAndViews`. The result is formatted with
    `./mach eslint --fix` and then moved to FINAL_JS_PATH.

    Raises:
        subprocess.CalledProcessError: When the eslint run fails. The captured
            eslint output is printed first, and the temporary file is removed.
    """
    header = "\n".join(
        [
            "/* This Source Code Form is subject to the terms of the Mozilla Public",
            " * License, v. 2.0. If a copy of the MPL was not distributed with this",
            " * file, You can obtain one at http://mozilla.org/MPL/2.0/. */",
            "",
            "function loadBergamot(Module) {",
            "",
        ]
    )

    # Nest the generated code inside of the wrapping function. The pieces are
    # joined once rather than repeatedly appended to a growing string.
    with open(JS_PATH, "r", encoding="utf8") as file:
        body = "".join(" " + line for line in file)

    source = header + body + " return Module;\n}"

    # Use the Module's printing.
    source = source.replace("console.log(", "Module.print(")

    # Add some instrumentation to the module's memory size.
    source = source.replace(
        "function updateGlobalBufferAndViews(buf) {",
        """
function updateGlobalBufferAndViews(buf) {
const mb = (buf.byteLength / 1_000_000).toFixed();
Module.print(
`Growing wasm buffer to ${mb}MB (${buf.byteLength} bytes).`
);
""",
    )

    print("\n Formatting the final bergamot file")

    # Create the file outside of this directory so it's not ignored by eslint.
    temp_path = os.path.join(DIR_PATH, "../temp-bergamot.js")

    try:
        with open(temp_path, "w", encoding="utf8") as file:
            file.write(source)

        subprocess.run(
            f"./mach eslint --fix {temp_path}",
            cwd=ROOT_PATH,
            check=True,
            shell=True,
            capture_output=True,
        )

        print(f"\n Writing out final bergamot file: {FINAL_JS_PATH}")
        shutil.move(temp_path, FINAL_JS_PATH)
    except subprocess.CalledProcessError as error:
        # The output was captured, so surface it here. Otherwise the reason for
        # the failure would be lost.
        for captured in (error.stdout, error.stderr):
            if captured:
                print(captured.decode("utf8", errors="replace"))
        raise
    finally:
        # After a successful move the temporary file no longer exists. On any
        # failure, remove it so that a stray file is not left in the tree.
        if os.path.exists(temp_path):
            os.remove(temp_path)
def main():
    """
    Run the full pipeline: set up emscripten, fetch Bergamot, build it, and
    write out the final JS file.
    """
    cli_args: ArgNamespace = parser.parse_args()

    # Every later step works inside of the thirdparty directory.
    third_party_exists = os.path.exists(THIRD_PARTY_PATH)
    if not third_party_exists:
        os.mkdir(THIRD_PARTY_PATH)

    install_and_activate_emscripten(cli_args)
    install_bergamot()
    build_bergamot(cli_args)
    write_final_bergamot_js_file()


if __name__ == "__main__":
    main()

View file

@ -16,15 +16,15 @@ origin:
# Full URL for the package's homepage/etc
# Usually different from repository url
url: https://github.com/mozilla/bergamot-translator/
url: https://github.com/browsermt/bergamot-translator.git
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
release: v0.4.4
release: v0.4.5
# Revision to pull in
# Must be a long or short commit SHA (long preferred)
revision: 5ae1b1ebb3fa9a3eabed8a64ca6798154bd486eb
revision: 05a87784973b6e1cc591f1f1a9a05c5873d9971e
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/

View file

@ -0,0 +1,26 @@
commit dfa705777729fd084f0187a90f9712eb76ea9209
parent 05a87784973b6e1cc591f1f1a9a05c5873d9971e
Author: Greg Tatum <tatum.creative@gmail.com>
Date: Tue Nov 7 10:57:07 2023 -0600
Change allocation strategy
This fixes an issue where the memory would grow to 500mb by pre-allocating large
workspaces. For some reason the "workspace" configuration for the Wasm build wasn't
fixing this, but hard-coding the value does. Perhaps the configuration file in Bergamot
is not working correctly, or it was just a mistake on the author's part. Empirically
this value keeps memory from growing too rapidly, and does not degrade Wasm performance.
diff --git a/src/translator/translation_model.cpp b/src/translator/translation_model.cpp
index 3f91ebb..61a299f 100644
--- a/src/translator/translation_model.cpp
+++ b/src/translator/translation_model.cpp
@@ -59,7 +59,7 @@ void TranslationModel::loadBackend(size_t idx) {
graph->setDefaultElementType(typeFromString(prec[0]));
graph->setDevice(device_);
graph->getBackend()->configureDevice(options_);
- graph->reserveWorkspaceMB(options_->get<size_t>("workspace"));
+ graph->reserveWorkspaceMB(5);
// Marian Model: Load from memoryBundle or shortList
if (memory_.model.size() > 0 &&

View file

@ -0,0 +1,25 @@
commit 31a05b47381a5b22b57fe9af7805fa40a5c5e384
parent 11c6ae7c46be21ef96ed10c60f28022fa968939f
Author: Greg Tatum <tatum.creative@gmail.com>
Date: Mon Nov 6 14:01:32 2023 -0600
Change allocation strategy for tensors
When tensors grow, they would pre-emptively allocate large amounts of memory, and
would allocate ~500mb of memory for a single translation. Adjusting this value
down appears to fix this issue. Empirically this value keeps memory from growing too
rapidly, and does not degrade Wasm performance.
diff --git a/src/tensors/tensor_allocator.h b/src/tensors/tensor_allocator.h
index e3bc79f9..66f8e44d 100644
--- a/src/tensors/tensor_allocator.h
+++ b/src/tensors/tensor_allocator.h
@@ -13,7 +13,7 @@ class TensorAllocator {
private:
const size_t CHUNK = 128;
const size_t MBYTE = 1024 * 1024;
- const size_t GROW = CHUNK * MBYTE;
+ const size_t GROW = MBYTE;
const size_t ALIGN = 256;
Ptr<Backend> backend_;

View file

@ -493,19 +493,19 @@ class BergamotUtils {
*
* https://github.com/mozilla/bergamot-translator/
*
* @param {ArrayBuffer} wasmBinary
* @param {ArrayBuffer} wasm
* @returns {Promise<Bergamot>}
*/
static initializeWasm(wasmBinary) {
static initializeWasm(wasm) {
return new Promise((resolve, reject) => {
/** @type {number} */
let start = performance.now();
/** @type {Bergamot} */
const bergamot = loadBergamot({
// This is the amount of memory that a simple run of Bergamot uses, in byte.
INITIAL_MEMORY: 459_276_288,
preRun: [],
// This is the amount of memory that a simple run of Bergamot uses, in bytes.
INITIAL_MEMORY: 234_291_200,
print: log,
onAbort() {
reject(new Error("Error loading Bergamot wasm module."));
},
@ -519,7 +519,7 @@ class BergamotUtils {
await Promise.resolve();
resolve(bergamot);
},
wasmBinary,
wasm,
});
});
}

View file

@ -179,7 +179,8 @@ toolkit/components/normandy/vendor/
toolkit/components/passwordmgr/PasswordRulesParser.sys.mjs
toolkit/components/protobuf/
toolkit/components/translation/cld2/
toolkit/components/translations/bergamot-translator
toolkit/components/translations/bergamot-translator/thirdparty
toolkit/components/translations/bergamot-translator/bergamot-translator.js
toolkit/components/url-classifier/chromium/
toolkit/components/utils/mozjexl.js
toolkit/components/viaduct/fetch_msg_types.pb.cc