fune/toolkit/components/translations/content/language-id-engine-worker.js
Cristina Horotan 2e51c47d14 Backed out 5 changesets (bug 1861516) for causing generate failure. CLOSED TREE
Backed out changeset 59284ad6706a (bug 1861516)
Backed out changeset f523baf65417 (bug 1861516)
Backed out changeset a765b373c3f1 (bug 1861516)
Backed out changeset 2aab5a2ea289 (bug 1861516)
Backed out changeset 96624994d2cb (bug 1861516)
2023-11-09 02:23:16 +02:00

327 lines
11 KiB
JavaScript

/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
/* eslint-env worker */
"use strict";
// Re-throw the reason of any unhandled Promise rejection so that the failure
// is visible in the console instead of being silently dropped.
self.addEventListener("unhandledrejection", ({ reason }) => {
  throw reason;
});
/* global addOnPostRun FastText loadFastText */
importScripts(
"chrome://global/content/translations/fasttext.js",
"chrome://global/content/translations/fasttext_wasm.js"
);
/**
 * The number of languages that should be returned when the model analyzes text.
 * It is passed to the model's predict() call by LanguageIdEngine.identifyLanguage.
 *
 * A value of 1 means only the most-likely language will be returned.
 * A value of 5 would mean that the top 5 most-likely languages will be returned.
 *
 * Note: identifyLanguage only reads the first prediction (index 0), so raising
 * this value alone does not change what the worker reports.
 */
const LANGUAGE_COUNT = 1;
/**
 * The threshold of likelihood in range [0.0, 1.0] that must pass
 * for a language to be returned from the model. It is passed to the model's
 * predict() call by LanguageIdEngine.identifyLanguage.
 *
 * A value of 0.0 would mean that a language is always returned with any confidence.
 * A value of 0.5 would mean that a language is only returned if the model
 * is 50% confident that the analyzed text could be that language.
 */
const CONFIDENCE_THRESHOLD = 0.0;
// Respect the preference "browser.translations.logLevel": logging stays off
// until the "initialize" message reports that the preference enables it.
let _isLoggingEnabled = false;

/**
 * Logs the given values to the console with a "Translations:" prefix.
 * Nothing is logged while logging is disabled.
 *
 * @param {...any} args - The values to forward to console.log.
 */
function log(...args) {
  if (_isLoggingEnabled) {
    console.log("Translations:", ...args);
  }
}
// Wait for the initialization request. Until it has been handled, incoming
// messages go to handleInitializationMessage, which throws for any message
// whose type is not "initialize".
addEventListener("message", handleInitializationMessage);
/**
 * Initialize the engine, and get it ready to handle language identification requests.
 * The "initialize" message must be received before any other message handling
 * requests will be processed.
 *
 * On success the regular message handler is installed and an "initialization-success"
 * message is posted. On failure the error is logged and an "initialization-error"
 * message carrying the error's message is posted. In both cases this listener
 * removes itself afterwards.
 *
 * @param {Object} event
 * @param {Object} event.data
 * @param {string} event.data.type - The message type, expects "initialize".
 * @param {ArrayBuffer} event.data.wasmBuffer - The buffer containing the wasm binary.
 * @param {ArrayBuffer} event.data.modelBuffer - The buffer containing the language-id model binary.
 * @param {null | string} [event.data.mockedLangTag] - The mocked language tag value (only set when mocking).
 * @param {null | number} [event.data.mockedConfidence] - The mocked confidence value (only set when mocking).
 * @param {boolean} event.data.isLoggingEnabled
 * @returns {Promise<void>} Rejects if the message type is not "initialize".
 */
async function handleInitializationMessage({ data }) {
  if (data.type !== "initialize") {
    throw new Error(
      "The LanguageIdEngine worker received a message before it was initialized."
    );
  }

  try {
    // Respect the "browser.translations.logLevel" preference in both directions:
    // logging is enabled only when the payload asks for it.
    _isLoggingEnabled = Boolean(data.isLoggingEnabled);

    // Treat absent mock fields the same as explicit nulls. Otherwise a payload
    // that simply omits them would build a mocked engine around undefined values.
    const { mockedLangTag = null, mockedConfidence = null } = data;
    const isMocked = mockedLangTag !== null && mockedConfidence !== null;

    /** @type {LanguageIdEngine | MockedLanguageIdEngine} */
    const languageIdEngine = isMocked
      ? // Don't actually use the engine as it is mocked.
        new MockedLanguageIdEngine(mockedLangTag, mockedConfidence)
      : await initializeLanguageIdEngine(data);

    handleMessages(languageIdEngine);
    postMessage({ type: "initialization-success" });
  } catch (error) {
    console.error(error);
    postMessage({ type: "initialization-error", error: error?.message });
  }

  removeEventListener("message", handleInitializationMessage);
}
/**
 * Initializes the fastText wasm runtime and resolves with the loaded fastText model.
 *
 * The returned promise rejects if the wasm module aborts while loading, or if
 * constructing the FastText wrapper or loading the model binary throws.
 *
 * @param {ArrayBuffer} modelBuffer - The buffer containing the language-id model binary.
 * @param {ArrayBuffer} wasmBuffer - The buffer containing the wasm binary.
 * @returns {Promise<FastTextModel>}
 */
function initializeFastTextModel(modelBuffer, wasmBuffer) {
  return new Promise((resolve, reject) => {
    const initialModule = {
      onAbort() {
        reject(new Error("Error loading the fastText Wasm Module"));
      },
      onRuntimeInitialized() {
        addOnPostRun(() => {
          // This callback runs outside of the Promise executor, so a throw here
          // would otherwise leave the promise pending forever. Surface it as a
          // rejection so that the caller can report the initialization error.
          try {
            const ft = new FastText(initialModule);
            resolve(ft.loadModelBinary(modelBuffer));
          } catch (error) {
            reject(error);
          }
        });
      },
      wasmBinary: wasmBuffer,
    };
    loadFastText(initialModule);
  });
}
/**
 * Builds a LanguageIdEngine from the initialization payload: validates that both
 * binaries are present, loads the fastText wasm runtime and model, and wraps the
 * resulting model in an engine.
 *
 * @param {Object} data
 * @param {ArrayBuffer} data.wasmBuffer - The buffer containing the wasm binary.
 * @param {ArrayBuffer} data.modelBuffer - The buffer containing the language-id model binary.
 * @returns {Promise<LanguageIdEngine>}
 * @throws {Error} When either buffer is missing from the payload.
 */
async function initializeLanguageIdEngine(data) {
  const modelBinary = data.modelBuffer;
  const wasmBinary = data.wasmBuffer;

  if (!modelBinary) {
    throw new Error('LanguageIdEngine initialization missing "modelBuffer"');
  }
  if (!wasmBinary) {
    throw new Error('LanguageIdEngine initialization missing "wasmBuffer"');
  }

  const fastTextModel = await initializeFastTextModel(modelBinary, wasmBinary);
  return new LanguageIdEngine(fastTextModel);
}
/**
 * Installs the "message" listener that serves requests once the worker has
 * been initialized.
 *
 * @param {LanguageIdEngine | MockedLanguageIdEngine} languageIdEngine
 */
function handleMessages(languageIdEngine) {
  /**
   * Identifies the language of a single request and posts either a
   * "language-id-response" or, if identification fails, a "language-id-error".
   *
   * @param {Object} request
   * @param {string} request.message - The message text to identify the language of.
   * @param {number} request.messageId - The ID of the message.
   */
  const respondToLanguageIdRequest = ({ message, messageId }) => {
    try {
      const [confidence, langTag] = languageIdEngine.identifyLanguage(message);
      postMessage({
        type: "language-id-response",
        langTag,
        confidence,
        messageId,
      });
    } catch (error) {
      console.error(error);
      postMessage({
        type: "language-id-error",
        messageId,
      });
    }
  };

  /**
   * Handle any message after the initialization message.
   *
   * @param {Object} event
   * @param {Object} event.data
   * @param {string} event.data.type - The message type.
   */
  const onMessage = ({ data }) => {
    try {
      const { type } = data;

      if (type === "initialize") {
        throw new Error(
          "The language-identification engine must not be re-initialized."
        );
      }

      if (type === "language-id-request") {
        respondToLanguageIdRequest(data);
        return;
      }

      console.warn("Unknown message type:", type);
    } catch (error) {
      // Ensure the unexpected errors are surfaced in the console.
      console.error(error);
    }
  };

  addEventListener("message", onMessage);
}
/**
 * The LanguageIdEngine wraps around a machine-learning model that can identify text
 * as being written in a given human language. The engine is responsible for invoking
 * the model and returning the language tag in the format that is expected by the
 * Firefox translations code.
 */
class LanguageIdEngine {
  /**
   * Language tags reported by fastText that must be rewritten before they are
   * handed to Firefox. Any tag that is not listed here is passed through as is.
   *
   * fastText is capable of returning any of a predetermined set of 176 langTags:
   * https://fasttext.cc/docs/en/language-identification.html
   *
   * These tags come from ISO639-3:
   * https://iso639-3.sil.org/code_tables/deprecated_codes/data
   *
   * Each of these tags have been cross checked for compatibility with the IANA
   * language subtag registry, which is used by BCP 47, and the edge cases are listed here.
   * https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
   *
   * @type {Map<string, string>}
   */
  static #langTagOverrides = new Map([
    // fastText may return "eml" which is a deprecated ISO639-3 language tag for the
    // language Emiliano-Romagnolo. It was split into two separate tags "egl" and "rgn":
    // https://iso639-3.sil.org/request/2008-040
    //
    // "eml" was once requested to be added to the IANA registry, but it was denied:
    // https://www.alvestrand.no/pipermail/ietf-languages/2009-December/009754.html
    //
    // Ideally this would map to either "egl" or "rgn", but the fastText model does not
    // distinguish between the two, so "egl" is used because it is alphabetically first.
    //
    // If Firefox Translations ever supports either of these languages, a way to
    // further distinguish between the two should be considered at that time.
    ["eml", "egl"],
    // The fastText model returns "no" for Norwegian Bokmål.
    //
    // According to advice from https://r12a.github.io/app-subtags/
    // "no" is a macro language that encompasses the more specific primary language
    // subtags "nb" and "nn". It is recommended to use more specific language subtags
    // as long as it does not break legacy usage of an application, so "nb" is
    // reported for Norwegian Bokmål instead of "no".
    ["no", "nb"],
  ]);

  /** @type {FastTextModel} */
  #model;

  /**
   * @param {FastTextModel} model
   */
  constructor(model) {
    this.#model = model;
  }

  /**
   * Formats the language tag returned by the language-identification model so that
   * it conforms to the format used internally by Firefox.
   *
   * This function is currently configured to handle the fastText language-identification
   * model. Updating the language-identification model or moving to something other than
   * fastText in the future will likely require updating this function.
   *
   * @param {string} langTag
   * @returns {string} The correctly formatted langTag
   */
  #formatLangTag(langTag) {
    // The fastText language model returns values of the format "__label__{langTag}",
    // so strip the "__label__" prefix to leave only the langTag.
    const strippedTag = langTag.replace("__label__", "");
    return LanguageIdEngine.#langTagOverrides.get(strippedTag) ?? strippedTag;
  }

  /**
   * Identifies the human language in which the message is written and returns
   * the BCP 47 language tag of the language it is determined to be, along with
   * a rating of how confident the model is that the label is correct.
   *
   * @param {string} message
   * @returns {Array<number | string>} An array containing the confidence and language tag.
   *   The confidence is a number between 0 and 1, representing a percentage.
   *   The language tag is a BCP 47 language tag such as "en" for English.
   *
   *   e.g. [0.87, "en"]
   * @throws {Error} When the model yields no prediction for the message.
   */
  identifyLanguage(message) {
    const predictions = this.#model.predict(
      message.trim(),
      LANGUAGE_COUNT,
      CONFIDENCE_THRESHOLD
    );
    const topPrediction = predictions.get(0);

    // A prediction is expected to be present as long as
    // LANGUAGE_COUNT >= 1 && CONFIDENCE_THRESHOLD === 0.0
    if (!topPrediction) {
      throw new Error("Unable to identify a language");
    }

    const [confidence, rawLabel] = topPrediction;
    return [confidence, this.#formatLangTag(rawLabel)];
  }
}
/**
 * A fully mocked engine for testing purposes. It allows for easy integration
 * testing of the UI, without having to rely on downloading remote models and
 * remote wasm binaries.
 */
class MockedLanguageIdEngine {
  /**
   * The fixed result reported for every message, as [confidence, langTag].
   *
   * @type {Array<number | string>}
   */
  #mockedResult;

  /**
   * @param {string} langTag - The language tag to report for every message.
   * @param {number} confidence - The confidence to report for every message.
   */
  constructor(langTag, confidence) {
    this.#mockedResult = [confidence, langTag];
  }

  /**
   * Mocks identifying a language by returning the mocked engine's pre-determined
   * confidence and language tag, ignoring the message.
   *
   * @param {string} _message - Unused.
   * @returns {Array<number | string>} A new [confidence, langTag] array on each call.
   */
  identifyLanguage(_message) {
    return this.#mockedResult.slice();
  }
}