Backed out 5 changesets (bug 1861516) for causing generate failure. CLOSED TREE

Backed out changeset 59284ad6706a (bug 1861516)
Backed out changeset f523baf65417 (bug 1861516)
Backed out changeset a765b373c3f1 (bug 1861516)
Backed out changeset 2aab5a2ea289 (bug 1861516)
Backed out changeset 96624994d2cb (bug 1861516)
This commit is contained in:
Cristina Horotan 2023-11-09 02:23:16 +02:00
parent 6813086faf
commit 2e51c47d14
25 changed files with 1932 additions and 29 deletions

View file

@ -1424,6 +1424,8 @@ toolkit/components/passwordmgr/PasswordRulesParser.sys.mjs
toolkit/components/protobuf/ toolkit/components/protobuf/
toolkit/components/translation/cld2/ toolkit/components/translation/cld2/
toolkit/components/translations/bergamot-translator toolkit/components/translations/bergamot-translator
toolkit/components/translations/fasttext/fasttext.js
toolkit/components/translations/fasttext/fasttext_wasm.js
toolkit/components/url-classifier/chromium/ toolkit/components/url-classifier/chromium/
toolkit/components/utils/mozjexl.js toolkit/components/utils/mozjexl.js
toolkit/components/viaduct/fetch_msg_types.pb.cc toolkit/components/viaduct/fetch_msg_types.pb.cc

View file

@ -72,6 +72,7 @@ const intermittently_loaded_scripts = {
// Translations code which may be preffed on. // Translations code which may be preffed on.
"resource://gre/actors/TranslationsChild.sys.mjs", "resource://gre/actors/TranslationsChild.sys.mjs",
"resource://gre/modules/translation/LanguageDetector.sys.mjs", "resource://gre/modules/translation/LanguageDetector.sys.mjs",
"chrome://global/content/translations/language-id-engine.sys.mjs",
"resource://gre/modules/ConsoleAPIStorage.sys.mjs", // Logging related. "resource://gre/modules/ConsoleAPIStorage.sys.mjs", // Logging related.
// Session store. // Session store.

View file

@ -117,8 +117,13 @@ add_task(async function test_about_preferences_manage_languages() {
"All models were downloaded." "All models were downloaded."
); );
Assert.deepEqual( Assert.deepEqual(
await remoteClients.translationsWasm.resolvePendingDownloads(1), await remoteClients.languageIdModels.resolvePendingDownloads(1),
["bergamot-translator"], ["lid.176.ftz"],
"Language ID model was downloaded."
);
Assert.deepEqual(
await remoteClients.translationsWasm.resolvePendingDownloads(2),
["bergamot-translator", "fasttext-wasm"],
"Wasm was downloaded." "Wasm was downloaded."
); );
@ -154,6 +159,7 @@ add_task(async function test_about_preferences_manage_languages() {
); );
remoteClients.translationsWasm.assertNoNewDownloads(); remoteClients.translationsWasm.assertNoNewDownloads();
remoteClients.languageIdModels.assertNoNewDownloads();
await assertVisibility({ await assertVisibility({
message: "Everything is downloaded again.", message: "Everything is downloaded again.",

View file

@ -3627,6 +3627,13 @@ pref("browser.translations.simulateUnsupportedEngine", false);
pref("browser.translations.chaos.errors", false); pref("browser.translations.chaos.errors", false);
pref("browser.translations.chaos.timeoutMS", 0); pref("browser.translations.chaos.timeoutMS", 0);
// A pref to manage the use of fastText for language detection in Translations.
// The feature was initially built using fastText, but we are now putting it
// behind a pref while we investigate some performance improvements.
// In the meantime, we will use CLD2, which is already available in tree.
// See https://bugzilla.mozilla.org/show_bug.cgi?id=1836974
pref("browser.translations.languageIdentification.useFastText", false);
// When a user cancels this number of authentication dialogs coming from // When a user cancels this number of authentication dialogs coming from
// a single web page in a row, all following authentication dialogs will // a single web page in a row, all following authentication dialogs will
// be blocked (automatically canceled) for that page. The counter resets // be blocked (automatically canceled) for that page. The counter resets

View file

@ -2,6 +2,8 @@
* License, v. 2.0. If a copy of the MPL was not distributed with this * License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
import { XPCOMUtils } from "resource://gre/modules/XPCOMUtils.sys.mjs";
const lazy = {}; const lazy = {};
ChromeUtils.defineLazyGetter(lazy, "console", () => { ChromeUtils.defineLazyGetter(lazy, "console", () => {
@ -16,7 +18,14 @@ ChromeUtils.defineESModuleGetters(lazy, {
"resource://gre/modules/translation/LanguageDetector.sys.mjs", "resource://gre/modules/translation/LanguageDetector.sys.mjs",
}); });
XPCOMUtils.defineLazyPreferenceGetter(
lazy,
"useFastTextPref",
"browser.translations.languageIdentification.useFastText"
);
/** /**
* @typedef {import("./TranslationsChild.sys.mjs").LanguageIdEngine} LanguageIdEngine
* @typedef {import("./TranslationsChild.sys.mjs").TranslationsEngine} TranslationsEngine * @typedef {import("./TranslationsChild.sys.mjs").TranslationsEngine} TranslationsEngine
* @typedef {import("./TranslationsChild.sys.mjs").SupportedLanguages} SupportedLanguages * @typedef {import("./TranslationsChild.sys.mjs").SupportedLanguages} SupportedLanguages
*/ */
@ -26,6 +35,9 @@ ChromeUtils.defineESModuleGetters(lazy, {
* are exposed to the un-privileged scope of the about:translations page. * are exposed to the un-privileged scope of the about:translations page.
*/ */
export class AboutTranslationsChild extends JSWindowActorChild { export class AboutTranslationsChild extends JSWindowActorChild {
/** @type {LanguageIdEngine | null} */
languageIdEngine = null;
/** /**
* The translations engine uses text translations by default in about:translations, * The translations engine uses text translations by default in about:translations,
* but it can be changed to translate HTML by setting this pref to true. This is * but it can be changed to translate HTML by setting this pref to true. This is
@ -143,6 +155,7 @@ export class AboutTranslationsChild extends JSWindowActorChild {
"AT_getSupportedLanguages", "AT_getSupportedLanguages",
"AT_isTranslationEngineSupported", "AT_isTranslationEngineSupported",
"AT_isHtmlTranslation", "AT_isHtmlTranslation",
"AT_createLanguageIdEngine",
"AT_createTranslationsPort", "AT_createTranslationsPort",
"AT_identifyLanguage", "AT_identifyLanguage",
"AT_getScriptDirection", "AT_getScriptDirection",
@ -211,6 +224,32 @@ export class AboutTranslationsChild extends JSWindowActorChild {
return this.#isHtmlTranslation; return this.#isHtmlTranslation;
} }
/**
* Creates the LanguageIdEngine which attempts to identify in which
* human language a string is written.
*
* Unlike TranslationsEngine, which handles only a single language pair
* and must be rebuilt to handle a new language pair, the LanguageIdEngine
* is a one-to-many engine that can recognize all of its supported languages.
*
* Subsequent calls to this function after the engine is initialized will do nothing
* instead of rebuilding the engine.
*
* @returns {Promise<void>}
*/
AT_createLanguageIdEngine() {
if (this.languageIdEngine) {
return this.#convertToContentPromise(Promise.resolve());
}
return this.#convertToContentPromise(
this.#getTranslationsChild()
.getOrCreateLanguageIdEngine()
.then(engine => {
this.languageIdEngine = engine;
})
);
}
/** /**
* Requests a port to the TranslationsEngine process. An engine will be created on * Requests a port to the TranslationsEngine process. An engine will be created on
* the fly for translation requests through this port. This port is unique to its * the fly for translation requests through this port. This port is unique to its
@ -231,11 +270,26 @@ export class AboutTranslationsChild extends JSWindowActorChild {
/** /**
* Attempts to identify the human language in which the message is written. * Attempts to identify the human language in which the message is written.
* @see LanguageIdEngine#identifyLanguage for more detailed documentation.
* *
* @param {string} message * @param {string} message
* @returns {Promise<{ langTag: string, confidence: number }>} * @returns {Promise<{ langTag: string, confidence: number }>}
*/ */
AT_identifyLanguage(message) { AT_identifyLanguage(message) {
if (lazy.useFastTextPref) {
if (!this.languageIdEngine) {
const { Promise, Error } = this.contentWindow;
return Promise.reject(
new Error("The language identification was not created.")
);
}
return this.#convertToContentPromise(
this.languageIdEngine
.identifyLanguage(message)
.then(data => Cu.cloneInto(data, this.contentWindow))
);
}
return this.#convertToContentPromise( return this.#convertToContentPromise(
lazy.LanguageDetector.detectLanguage(message).then(data => lazy.LanguageDetector.detectLanguage(message).then(data =>
Cu.cloneInto( Cu.cloneInto(

View file

@ -6,6 +6,10 @@ const lazy = {};
ChromeUtils.defineESModuleGetters(lazy, { ChromeUtils.defineESModuleGetters(lazy, {
TranslationsDocument: TranslationsDocument:
"chrome://global/content/translations/translations-document.sys.mjs", "chrome://global/content/translations/translations-document.sys.mjs",
// The fastText languageIdEngine
LanguageIdEngine:
"chrome://global/content/translations/language-id-engine.sys.mjs",
// The CLD2 language detector
LanguageDetector: LanguageDetector:
"resource://gre/modules/translation/LanguageDetector.sys.mjs", "resource://gre/modules/translation/LanguageDetector.sys.mjs",
}); });
@ -74,6 +78,16 @@ export class TranslationsChild extends JSWindowActorChild {
} }
try { try {
// Try to use the fastText engine if directed to do so.
if (data.useFastText) {
const engine = await this.getOrCreateLanguageIdEngine();
if (!engine) {
return null;
}
return engine.identifyLanguageFromDocument(this.document);
}
// Use the CLD2 language detector otherwise.
return lazy.LanguageDetector.detectLanguageFromDocument( return lazy.LanguageDetector.detectLanguageFromDocument(
this.document this.document
); );
@ -90,4 +104,13 @@ export class TranslationsChild extends JSWindowActorChild {
throw new Error("Unknown message.", name); throw new Error("Unknown message.", name);
} }
} }
getOrCreateLanguageIdEngine() {
return lazy.LanguageIdEngine.getOrCreate(() => {
if (!this.manager || !this.manager.isCurrentGlobal) {
throw new Error("The page was already hidden.");
}
return this.sendQuery("Translations:GetLanguageIdEnginePayload");
});
}
} }

View file

@ -120,6 +120,12 @@ XPCOMUtils.defineLazyPreferenceGetter(
"browser.translations.simulateUnsupportedEngine" "browser.translations.simulateUnsupportedEngine"
); );
XPCOMUtils.defineLazyPreferenceGetter(
lazy,
"useFastTextPref",
"browser.translations.languageIdentification.useFastText"
);
// At this time the signatures of the files are not being checked when they are being // At this time the signatures of the files are not being checked when they are being
// loaded from disk. This signature check involves hitting the network, and translations // loaded from disk. This signature check involves hitting the network, and translations
// are explicitly an offline-capable feature. See Bug 1827265 for re-enabling this // are explicitly an offline-capable feature. See Bug 1827265 for re-enabling this
@ -129,11 +135,13 @@ const VERIFY_SIGNATURES_FROM_FS = false;
/** /**
* @typedef {import("../translations").TranslationModelRecord} TranslationModelRecord * @typedef {import("../translations").TranslationModelRecord} TranslationModelRecord
* @typedef {import("../translations").RemoteSettingsClient} RemoteSettingsClient * @typedef {import("../translations").RemoteSettingsClient} RemoteSettingsClient
* @typedef {import("../translations").LanguageIdEngineMockedPayload} LanguageIdEngineMockedPayload
* @typedef {import("../translations").LanguageTranslationModelFiles} LanguageTranslationModelFiles * @typedef {import("../translations").LanguageTranslationModelFiles} LanguageTranslationModelFiles
* @typedef {import("../translations").WasmRecord} WasmRecord * @typedef {import("../translations").WasmRecord} WasmRecord
* @typedef {import("../translations").LangTags} LangTags * @typedef {import("../translations").LangTags} LangTags
* @typedef {import("../translations").LanguagePair} LanguagePair * @typedef {import("../translations").LanguagePair} LanguagePair
* @typedef {import("../translations").SupportedLanguages} SupportedLanguages * @typedef {import("../translations").SupportedLanguages} SupportedLanguages
* @typedef {import("../translations").LanguageIdModelRecord} LanguageIdModelRecord
* @typedef {import("../translations").TranslationErrors} TranslationErrors * @typedef {import("../translations").TranslationErrors} TranslationErrors
*/ */
@ -219,6 +227,13 @@ export class TranslationsParent extends JSWindowActorParent {
} }
} }
/**
* The remote settings client that retrieves the language-identification model binary.
*
* @type {RemoteSettingsClient | null}
*/
static #languageIdModelsRemoteClient = null;
/** /**
* A map of the TranslationModelRecord["id"] to the record of the model in Remote Settings. * A map of the TranslationModelRecord["id"] to the record of the model in Remote Settings.
* Used to coordinate the downloads. * Used to coordinate the downloads.
@ -255,6 +270,22 @@ export class TranslationsParent extends JSWindowActorParent {
*/ */
static #isTranslationsEngineMocked = false; static #isTranslationsEngineMocked = false;
/**
* The language identification engine can be mocked for testing
* by pre-defining this value.
*
* @type {string | null}
*/
static #mockedLangTag = null;
/**
* The language identification engine can be mocked for testing
* by pre-defining this value.
*
* @type {number | null}
*/
static #mockedLanguageIdConfidence = null;
/** /**
* @type {null | Promise<boolean>} * @type {null | Promise<boolean>}
*/ */
@ -753,6 +784,18 @@ export class TranslationsParent extends JSWindowActorParent {
async receiveMessage({ name, data }) { async receiveMessage({ name, data }) {
switch (name) { switch (name) {
case "Translations:GetLanguageIdEnginePayload": {
const [modelBuffer, wasmBuffer] = await Promise.all([
TranslationsParent.#getLanguageIdModelArrayBuffer(),
TranslationsParent.#getLanguageIdWasmArrayBuffer(),
]);
return {
modelBuffer,
wasmBuffer,
mockedConfidence: TranslationsParent.#mockedLanguageIdConfidence,
mockedLangTag: TranslationsParent.#mockedLangTag,
};
}
case "Translations:ReportLangTags": { case "Translations:ReportLangTags": {
const { documentElementLang, href } = data; const { documentElementLang, href } = data;
const detectedLanguages = await this.getDetectedLanguages( const detectedLanguages = await this.getDetectedLanguages(
@ -919,6 +962,152 @@ export class TranslationsParent extends JSWindowActorParent {
return TranslationsParent.shouldAlwaysTranslateLanguage(langTags); return TranslationsParent.shouldAlwaysTranslateLanguage(langTags);
} }
/** @type {Promise<LanguageIdModelRecord> | null} */
static #languageIdModelRecord = null;
/**
* Retrieves the language-identification model binary from remote settings.
*
* The record lookup is stored in TranslationsParent.#languageIdModelRecord as a
* promise. It is assigned before anything is awaited, so overlapping calls share
* one lookup. The attachment itself is requested through
* client.attachments.download() on every call. If awaiting the record or the
* download fails, the stored record promise is cleared and the error is rethrown.
*
* @returns {Promise<ArrayBuffer>} The `buffer` of the downloaded attachment.
* @throws {Error} When no model record is found, or when the download fails.
*/
static async #getLanguageIdModelArrayBuffer() {
lazy.console.log("Getting language-identification model array buffer.");
// Only used to compute the duration that is logged after a successful download.
const now = Date.now();
const client = TranslationsParent.#getLanguageIdModelRemoteClient();
if (!TranslationsParent.#languageIdModelRecord) {
// Place the records into a promise to prevent any races.
TranslationsParent.#languageIdModelRecord = (async () => {
/** @type {LanguageIdModelRecord[]} */
let modelRecords = await TranslationsParent.getMaxVersionRecords(
client
);
if (modelRecords.length === 0) {
throw new Error(
"Unable to get language-identification model record from remote settings"
);
}
// More than one record is reported as an error, but it is not fatal:
// the first record is still used below.
if (modelRecords.length > 1) {
TranslationsParent.reportError(
new Error(
"Expected the language-identification model collection to have only 1 record."
),
modelRecords
);
}
return modelRecords[0];
})();
}
// NOTE(review): chaosMode is defined outside this view; presumably it injects
// artificial failures or delays for testing (see the browser.translations.chaos.*
// prefs) — confirm. It runs outside of the try block, so a failure here does not
// clear the stored record promise.
await chaosMode(1 / 3);
try {
/** @type {{buffer: ArrayBuffer}} */
const { buffer } = await client.attachments.download(
await TranslationsParent.#languageIdModelRecord
);
const duration = (Date.now() - now) / 1000;
lazy.console.log(
`Remote language-identification model loaded in ${duration} seconds.`
);
return buffer;
} catch (error) {
// Forget the stored record promise so that the next call looks the record up again.
TranslationsParent.#languageIdModelRecord = null;
throw error;
}
}
/**
* Initializes the RemoteSettingsClient for the language-identification model binary.
*
* @returns {RemoteSettingsClient}
*/
static #getLanguageIdModelRemoteClient() {
if (TranslationsParent.#languageIdModelsRemoteClient) {
return TranslationsParent.#languageIdModelsRemoteClient;
}
/** @type {RemoteSettingsClient} */
const client = lazy.RemoteSettings("translations-identification-models");
TranslationsParent.#languageIdModelsRemoteClient = client;
return client;
}
/**
* The pending or resolved lookup of the "fasttext-wasm" record. It is cleared
* again when awaiting it or downloading its attachment fails.
*
* @type {Promise<WasmRecord> | null}
*/
static #languageIdWasmRecord = null;
/**
* Retrieves the language-identification wasm binary from remote settings.
*
* The record lookup (filtered to the name "fasttext-wasm") is stored in
* TranslationsParent.#languageIdWasmRecord as a promise. It is assigned before
* anything is awaited, so overlapping calls share one lookup. The attachment is
* requested through client.attachments.download() on every call and the resulting
* buffer is not retained here.
*
* @returns {Promise<ArrayBuffer>} The `buffer` of the downloaded attachment.
* @throws {Error} When no "fasttext-wasm" record is found, or when the download fails.
*/
static async #getLanguageIdWasmArrayBuffer() {
// Only used to compute the duration that is logged after a successful download.
const start = Date.now();
const client = TranslationsParent.#getTranslationsWasmRemoteClient();
// Load the wasm binary from remote settings, if it hasn't been already.
lazy.console.log(`Getting remote language-identification wasm binary.`);
if (!TranslationsParent.#languageIdWasmRecord) {
// Place the records into a promise to prevent any races.
TranslationsParent.#languageIdWasmRecord = (async () => {
/** @type {WasmRecord[]} */
let wasmRecords = await TranslationsParent.getMaxVersionRecords(
client,
{
filters: { name: "fasttext-wasm" },
}
);
if (wasmRecords.length === 0) {
// The remote settings client provides an empty list of records when there is
// an error.
throw new Error(
'Unable to get "fasttext-wasm" language-identification wasm binary from Remote Settings.'
);
}
// More than one record is reported as an error, but it is not fatal:
// the first record is still used below.
if (wasmRecords.length > 1) {
TranslationsParent.reportError(
new Error(
'Expected the "fasttext-wasm" language-identification wasm collection to only have 1 record.'
),
wasmRecords
);
}
return wasmRecords[0];
})();
}
try {
// Unlike the models, greedily download the wasm. It will pull it from a local
// cache on disk if it's already been downloaded. Do not retain a copy, as
// this will be running in the parent process. It's not worth holding onto
// this much memory, so reload it every time it is needed.
// NOTE(review): chaosMode is defined outside this view; presumably it injects
// artificial failures or delays for testing — confirm. Here it runs inside the
// try block, so a failure from it also clears the stored record promise.
await chaosMode(1 / 3);
/** @type {{buffer: ArrayBuffer}} */
const { buffer } = await client.attachments.download(
await TranslationsParent.#languageIdWasmRecord
);
const duration = (Date.now() - start) / 1000;
lazy.console.log(
`Remote language-identification wasm binary loaded in ${duration} seconds.`
);
return buffer;
} catch (error) {
// Forget the stored record promise so that the next call looks the record up again.
TranslationsParent.#languageIdWasmRecord = null;
throw error;
}
}
/** /**
* Creates a lookup key that is unique to each fromLanguage-toLanguage pair. * Creates a lookup key that is unique to each fromLanguage-toLanguage pair.
* *
@ -1138,7 +1327,7 @@ export class TranslationsParent extends JSWindowActorParent {
* This function should take a record as input and return a string that represents the lookup key for the record. * This function should take a record as input and return a string that represents the lookup key for the record.
* For most record types, the name (default) is sufficient, however if a collection contains records with * For most record types, the name (default) is sufficient, however if a collection contains records with
* non-unique name values, it may be necessary to provide an alternative function here. * non-unique name values, it may be necessary to provide an alternative function here.
* @returns {Array<TranslationModelRecord | WasmRecord>} * @returns {Array<TranslationModelRecord | LanguageIdModelRecord | WasmRecord>}
*/ */
static async getMaxVersionRecords( static async getMaxVersionRecords(
remoteSettingsClient, remoteSettingsClient,
@ -1505,6 +1694,12 @@ export class TranslationsParent extends JSWindowActorParent {
queue.push({ queue.push({
download: () => TranslationsParent.#getBergamotWasmArrayBuffer(), download: () => TranslationsParent.#getBergamotWasmArrayBuffer(),
}); });
queue.push({
download: () => TranslationsParent.#getLanguageIdModelArrayBuffer(),
});
queue.push({
download: () => TranslationsParent.#getLanguageIdWasmArrayBuffer(),
});
return downloadManager(queue); return downloadManager(queue);
} }
@ -1755,10 +1950,13 @@ export class TranslationsParent extends JSWindowActorParent {
// Records. // Records.
TranslationsParent.#bergamotWasmRecord = null; TranslationsParent.#bergamotWasmRecord = null;
TranslationsParent.#translationModelRecords = null; TranslationsParent.#translationModelRecords = null;
TranslationsParent.#languageIdModelRecord = null;
TranslationsParent.#languageIdWasmRecord = null;
// Clients. // Clients.
TranslationsParent.#translationModelsRemoteClient = null; TranslationsParent.#translationModelsRemoteClient = null;
TranslationsParent.#translationsWasmRemoteClient = null; TranslationsParent.#translationsWasmRemoteClient = null;
TranslationsParent.#languageIdModelsRemoteClient = null;
// Derived data. // Derived data.
TranslationsParent.#preferredLanguages = null; TranslationsParent.#preferredLanguages = null;
@ -1782,6 +1980,33 @@ export class TranslationsParent extends JSWindowActorParent {
TranslationsParent.#isTranslationsEngineMocked = false; TranslationsParent.#isTranslationsEngineMocked = false;
} }
/**
* For testing purposes, allow the LanguageIdEngine to be mocked. If called
* with `null` in each argument, the mock is removed.
*
* @param {string} langTag - The BCP 47 language tag.
* @param {number} confidence - The confidence score of the detected language.
* @param {RemoteSettingsClient} client
*/
static mockLanguageIdentification(langTag, confidence, client) {
lazy.console.log("Mocking language identification.", {
langTag,
confidence,
});
TranslationsParent.#mockedLangTag = langTag;
TranslationsParent.#mockedLanguageIdConfidence = confidence;
TranslationsParent.#languageIdModelsRemoteClient = client;
}
/**
* Remove the mocks for the language identification, make sure and call clearCache after
* to remove the cached values.
*/
static unmockLanguageIdentification() {
lazy.console.log("Removing language identification mock.");
TranslationsParent.#mockedLangTag = null;
TranslationsParent.#mockedLanguageIdConfidence = null;
}
/** /**
* Report an error. Having this as a method allows tests to check that an error * Report an error. Having this as a method allows tests to check that an error
* was properly reported. * was properly reported.
@ -1932,11 +2157,13 @@ export class TranslationsParent extends JSWindowActorParent {
async queryIdentifyLanguage() { async queryIdentifyLanguage() {
if ( if (
TranslationsParent.isInAutomation() && TranslationsParent.isInAutomation() &&
!TranslationsParent.#isTranslationsEngineMocked !TranslationsParent.#mockedLangTag
) { ) {
return null; return null;
} }
return this.sendQuery("Translations:IdentifyLanguage").catch(error => { return this.sendQuery("Translations:IdentifyLanguage", {
useFastText: lazy.useFastTextPref,
}).catch(error => {
if (this.#isDestroyed) { if (this.#isDestroyed) {
// The actor was destroyed while this message was still being resolved. // The actor was destroyed while this message was still being resolved.
return null; return null;
@ -2042,7 +2269,8 @@ export class TranslationsParent extends JSWindowActorParent {
} }
} }
} else { } else {
// If the document's markup had no specified langTag, attempt to identify the page's language. // If the document's markup had no specified langTag, attempt
// to identify the page's language using the LanguageIdEngine.
langTags.docLangTag = await this.queryIdentifyLanguage(); langTags.docLangTag = await this.queryIdentifyLanguage();
if (this.#isDestroyed) { if (this.#isDestroyed) {
return null; return null;

View file

@ -0,0 +1,327 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
/* eslint-env worker */
"use strict";
// Throw Promise rejection errors so that they are visible in the console.
// The listener rethrows the rejection's reason rather than handling it.
self.addEventListener("unhandledrejection", event => {
throw event.reason;
});
/* global addOnPostRun FastText loadFastText */
importScripts(
"chrome://global/content/translations/fasttext.js",
"chrome://global/content/translations/fasttext_wasm.js"
);
/**
* The number of languages that should be returned when the model analyzes text.
*
* A value of 1 means only the most-likely language will be returned.
* A value of 5 would mean that the top 5 most-likely languages will be returned.
*
* This is passed as the second argument to the model's predict() call in
* LanguageIdEngine#identifyLanguage, which only reads the first returned entry.
*/
const LANGUAGE_COUNT = 1;
/**
* The threshold of likelihood in range [0.0, 1.0] that must pass
* for a language to be returned from the model.
*
* A value of 0.0 would mean that a language is always returned with any confidence.
* A value of 0.5 would mean that a language is only returned if the model
* is 50% confident that the analyzed text could be that language.
*
* This is passed as the third argument to the model's predict() call in
* LanguageIdEngine#identifyLanguage.
*/
const CONFIDENCE_THRESHOLD = 0.0;
// Respect the preference "browser.translations.logLevel".
// Logging starts out disabled and is only switched on when the "initialize" message
// asks for it. Defaulting to `true` would make it impossible to turn logging off.
let _isLoggingEnabled = false;

/**
 * Logs to the console with a "Translations:" prefix, but only while logging is enabled.
 *
 * @param {...any} args - The values to log.
 */
function log(...args) {
  if (_isLoggingEnabled) {
    console.log("Translations:", ...args);
  }
}
// Wait for the initialization request. handleInitializationMessage removes itself
// as a listener once it has processed an "initialize" message.
addEventListener("message", handleInitializationMessage);
/**
 * Initialize the engine, and get it ready to handle language identification requests.
 * The "initialize" message must be received before any other message handling
 * requests will be processed.
 *
 * On success the regular message handling is installed and an
 * "initialization-success" message is posted. On failure the error is logged and an
 * "initialization-error" message is posted. In both cases this listener is removed
 * afterwards.
 *
 * @param {Object} event
 * @param {Object} event.data
 * @param {string} event.data.type - The message type, expects "initialize".
 * @param {ArrayBuffer} event.data.wasmBuffer - The buffer containing the wasm binary.
 * @param {ArrayBuffer} event.data.modelBuffer - The buffer containing the language-id model binary.
 * @param {null | string} [event.data.mockedLangTag] - The mocked language tag value (only present when mocking).
 * @param {null | number} [event.data.mockedConfidence] - The mocked confidence value (only present when mocking).
 * @param {boolean} event.data.isLoggingEnabled
 * @throws {Error} When the message type is anything other than "initialize".
 */
async function handleInitializationMessage({ data }) {
  if (data.type !== "initialize") {
    throw new Error(
      "The LanguageIdEngine worker received a message before it was initialized."
    );
  }

  try {
    // Respect the "browser.translations.logLevel" preference in both directions:
    // the flag must also be able to turn logging off, not only on.
    _isLoggingEnabled = Boolean(data.isLoggingEnabled);

    const { mockedLangTag, mockedConfidence } = data;
    // The mocked values are only present when mocking. Treat an absent (undefined)
    // value the same as null so that a payload without these keys builds the real
    // engine instead of a mock that reports undefined values.
    const isMocked = mockedLangTag != null && mockedConfidence != null;

    /** @type {LanguageIdEngine | MockedLanguageIdEngine} */
    const languageIdEngine = isMocked
      ? // Don't actually use the engine as it is mocked.
        new MockedLanguageIdEngine(mockedLangTag, mockedConfidence)
      : await initializeLanguageIdEngine(data);

    handleMessages(languageIdEngine);
    postMessage({ type: "initialization-success" });
  } catch (error) {
    console.error(error);
    postMessage({ type: "initialization-error", error: error?.message });
  }

  removeEventListener("message", handleInitializationMessage);
}
/**
 * Initializes the fastText wasm runtime and loads the language-identification model.
 *
 * @param {ArrayBuffer} modelBuffer - The buffer containing the language-id model binary.
 * @param {ArrayBuffer} wasmBuffer - The buffer containing the wasm binary.
 * @returns {Promise<FastTextModel>} Resolves with the value returned by
 *   FastText#loadModelBinary. Rejects if the wasm module aborts, or if constructing
 *   FastText or loading the model throws.
 */
function initializeFastTextModel(modelBuffer, wasmBuffer) {
  return new Promise((resolve, reject) => {
    const initialModule = {
      onAbort() {
        reject(new Error("Error loading the fastText Wasm Module"));
      },
      onRuntimeInitialized() {
        addOnPostRun(() => {
          // When this callback runs after the Promise executor has returned, an
          // exception thrown here is not turned into a rejection automatically and
          // the promise would stay pending. Route it to reject() explicitly.
          try {
            const ft = new FastText(initialModule);
            resolve(ft.loadModelBinary(modelBuffer));
          } catch (error) {
            reject(error);
          }
        });
      },
      wasmBinary: wasmBuffer,
    };
    loadFastText(initialModule);
  });
}
/**
 * Builds a LanguageIdEngine from the initialization payload: both buffers are
 * validated, the fastText wasm runtime and model are loaded, and the resulting
 * model is wrapped in an engine.
 *
 * @param {Object} data
 * @param {ArrayBuffer} data.wasmBuffer - The buffer containing the wasm binary.
 * @param {ArrayBuffer} data.modelBuffer - The buffer containing the language-id model binary.
 * @returns {Promise<LanguageIdEngine>}
 * @throws {Error} When either buffer is missing from the payload.
 */
async function initializeLanguageIdEngine(data) {
  const modelBuffer = data.modelBuffer;
  const wasmBuffer = data.wasmBuffer;

  if (!modelBuffer) {
    throw new Error('LanguageIdEngine initialization missing "modelBuffer"');
  }
  if (!wasmBuffer) {
    throw new Error('LanguageIdEngine initialization missing "wasmBuffer"');
  }

  const fastTextModel = await initializeFastTextModel(modelBuffer, wasmBuffer);
  return new LanguageIdEngine(fastTextModel);
}
/**
 * Installs the worker's "message" listener for everything that arrives after the
 * initialization message.
 *
 * Each message is expected to carry:
 *  - `type` {string}: the message type.
 *  - `message` {string}: the text to identify the language of.
 *  - `messageId` {number}: the ID of the message, echoed back in the response.
 *
 * A "language-id-request" is answered with either a "language-id-response" or a
 * "language-id-error" message. Unknown types are only warned about, and a repeated
 * "initialize" message is logged as an error.
 *
 * @param {LanguageIdEngine | MockedLanguageIdEngine} languageIdEngine
 */
function handleMessages(languageIdEngine) {
  /**
   * Runs the engine for a single request and posts the outcome.
   *
   * @param {{ message: string, messageId: number }} request
   */
  const answerLanguageIdRequest = request => {
    const { message, messageId } = request;
    try {
      const [confidence, langTag] = languageIdEngine.identifyLanguage(message);
      postMessage({
        type: "language-id-response",
        langTag,
        confidence,
        messageId,
      });
    } catch (error) {
      console.error(error);
      postMessage({
        type: "language-id-error",
        messageId,
      });
    }
  };

  addEventListener("message", ({ data }) => {
    try {
      const { type } = data;

      if (type === "initialize") {
        throw new Error(
          "The language-identification engine must not be re-initialized."
        );
      }

      if (type === "language-id-request") {
        answerLanguageIdRequest(data);
        return;
      }

      console.warn("Unknown message type:", type);
    } catch (error) {
      // Ensure the unexpected errors are surfaced in the console.
      console.error(error);
    }
  });
}
/**
* The LanguageIdEngine wraps around a machine-learning model that can identify text
* as being written in a given human language. The engine is responsible for invoking
* model and returning the language tag in the format that is expected by firefox
* translations code.
*/
class LanguageIdEngine {
/** @type {FastTextModel} */
#model;
/**
* @param {FastTextModel} model
*/
constructor(model) {
this.#model = model;
}
/**
* Formats the language tag returned by the language-identification model to match
* conform to the format used internally by Firefox.
*
* This function is currently configured to handle the fastText language-identification
* model. Updating the language-identification model or moving to something other than
* fastText in the future will likely require updating this function.
*
* @param {string} langTag
* @returns {string} The correctly formatted langTag
*/
#formatLangTag(langTag) {
// The fastText language model returns values of the format "__label__{langTag}".
// As such, this function strips the "__label__" prefix, leaving only the langTag.
let formattedTag = langTag.replace("__label__", "");
// fastText is capable of returning any of a predetermined set of 176 langTags:
// https://fasttext.cc/docs/en/language-identification.html
//
// These tags come from ISO639-3:
// https://iso639-3.sil.org/code_tables/deprecated_codes/data
//
// Each of these tags have been cross checked for compatibility with the IANA
// language subtag registry, which is used by BCP 47, and any edge cases are handled below.
// https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
switch (formattedTag) {
// fastText may return "eml" which is a deprecated ISO639-3 language tag for the language
// Emiliano-Romagnolo. It was split into two separate tags "egl" and "rgn":
// https://iso639-3.sil.org/request/2008-040
//
// "eml" was once requested to be added to the IANA registry, but it was denied:
// https://www.alvestrand.no/pipermail/ietf-languages/2009-December/009754.html
//
// This case should return either "egl" or "rgn", given that the "eml" tag was split.
// However, given that the fastText model does not distinguish between the two by using
// the deprecated tag, this function will default to "egl" because it is alphabetically first.
//
// At such a time that Firefox Translations may support either of these languages, we should consider
// a way to further distinguish between the two languages at that time.
case "eml": {
formattedTag = "egl";
break;
}
// The fastText model returns "no" for Norwegian Bokmål.
//
// According to advice from https://r12a.github.io/app-subtags/
// "no" is a macro language that encompasses the following more specific primary language subtags: "nb" "nn".
// It is recommended to use more specific language subtags as long as it does not break legacy usage of an application.
// As such, this function will return "nb" for Norwegian Bokmål instead of "no" as reported by fastText.
case "no": {
formattedTag = "nb";
break;
}
}
return formattedTag;
}
/**
* Identifies the human language in which the message is written and returns
* the BCP 47 language tag of the language it is determined to be along
* with a rating of how confident the model is that the label is correct.
*
* @param {string} message
* @returns {Array<number | string>} An array containing the confidence and language tag.
* The confidence is a number between 0 and 1, representing a percentage.
* The language tag is a BCP 47 language tag such as "en" for English.
*
* e.g. [0.87, "en"]
*/
identifyLanguage(message) {
const mostLikelyLanguageData = this.#model
.predict(message.trim(), LANGUAGE_COUNT, CONFIDENCE_THRESHOLD)
.get(0);
// This should never fail as long as
// LANGUAGE_COUNT > 1 && CONFIDENCE_THRESHOLD === 0.0
if (!mostLikelyLanguageData) {
throw new Error("Unable to identify a language");
}
const [confidence, langTag] = mostLikelyLanguageData;
return [confidence, this.#formatLangTag(langTag)];
}
}
/**
* For testing purposes, provide a fully mocked engine. This allows for easy integration
* testing of the UI, without having to rely on downloading remote models and remote
* wasm binaries.
*/
class MockedLanguageIdEngine {
  /**
   * The language tag handed back for every message.
   *
   * @type {string}
   */
  #mockedLangTag;

  /**
   * The confidence handed back for every message.
   *
   * @type {number}
   */
  #mockedConfidence;

  /**
   * Remembers the canned result that this engine will always report.
   *
   * @param {string} langTag
   * @param {number} confidence
   */
  constructor(langTag, confidence) {
    this.#mockedConfidence = confidence;
    this.#mockedLangTag = langTag;
  }

  /**
   * Pretends to identify a language: the message is ignored and the values
   * given to the constructor are returned as a fresh [confidence, langTag] pair.
   *
   * @param {string} _message - Unused.
   * @returns {Array<number | string>}
   */
  identifyLanguage(_message) {
    const confidence = this.#mockedConfidence;
    const langTag = this.#mockedLangTag;
    return [confidence, langTag];
  }
}

View file

@ -0,0 +1,224 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
// Holder object for lazily-defined getters; the define* calls below attach
// their properties to it.
const lazy = {};
import { XPCOMUtils } from "resource://gre/modules/XPCOMUtils.sys.mjs";
// lazy.logLevel is backed by the "browser.translations.logLevel" preference.
XPCOMUtils.defineLazyPreferenceGetter(
lazy,
"logLevel",
"browser.translations.logLevel"
);
// lazy.console is a console instance prefixed with "Translations" whose maximum
// log level is driven by the same preference.
ChromeUtils.defineLazyGetter(lazy, "console", () => {
return console.createInstance({
maxLogLevelPref: "browser.translations.logLevel",
prefix: "Translations",
});
});
// lazy.setTimeout and lazy.clearTimeout are both provided by Timer.sys.mjs.
ChromeUtils.defineESModuleGetters(lazy, {
setTimeout: "resource://gre/modules/Timer.sys.mjs",
clearTimeout: "resource://gre/modules/Timer.sys.mjs",
});
/**
* The threshold that the language-identification confidence
* value must meet or exceed in order to provide the detected language
* tag for translations.
*
* This value should ideally be one that does not allow false positives
* while also not being too restrictive.
*
* At this time, this value is not driven by statistical data or analysis.
*/
const DOC_LANGUAGE_DETECTION_THRESHOLD = 0.65;
/**
* The maximum length of the text to pull from the document for language
* identification.
*
* This value should ideally be one that is large enough to yield a confident
* identification result without being too large or expensive to extract.
*
* At this time, this value is not driven by statistical data or analysis.
*
* For the moment, while we investigate which language identification library
* we would like to use, keep this logic in sync with LanguageDetector.sys.mjs
*/
const DOC_TEXT_TO_IDENTIFY_LENGTH = 1024;
export class LanguageIdEngine {
  /** @type {Worker} */
  #languageIdWorker;

  // Multiple messages can be sent before a response is received. This ID is used to keep
  // track of the messages. It is incremented on every use.
  #messageId = 0;

  /**
   * The pending or completed creation of the shared engine, or null when
   * nothing is cached.
   *
   * @type {Promise<LanguageIdEngine | null> | null}
   */
  static #cachedEngine = null;

  /** The id of the timer that will evict the cached engine, or null when none is scheduled. */
  static #cachedEngineTimeoutId = null;

  /** How long, in milliseconds, the cached engine is kept after it was last used. */
  static #cachedEngineTimeoutMS = 30_000;

  /**
   * Gets a cached engine, or creates a new one. The returned promise resolves
   * to `null` when the engine payload fails to download.
   *
   * Only successful (or still pending) creations stay cached. A creation that
   * resolves to `null` or rejects is dropped from the cache so that the next
   * call retries instead of being handed the stale failure indefinitely.
   *
   * @param {() => Object} getPayload
   * @returns {Promise<LanguageIdEngine | null>}
   */
  static getOrCreate(getPayload) {
    if (!LanguageIdEngine.#cachedEngine) {
      const pendingEngine = LanguageIdEngine.#create(getPayload);
      LanguageIdEngine.#cachedEngine = pendingEngine;

      const forgetFailedAttempt = () => {
        // Only evict this attempt; a newer one may already have replaced it.
        if (LanguageIdEngine.#cachedEngine === pendingEngine) {
          LanguageIdEngine.#cachedEngine = null;
        }
      };
      // This derived promise only maintains the cache. Callers still observe
      // the original resolution or rejection through `pendingEngine`.
      pendingEngine.then(engine => {
        if (!engine) {
          forgetFailedAttempt();
        }
      }, forgetFailedAttempt);
    }
    return LanguageIdEngine.#cachedEngine;
  }

  /**
   * Fetches the payload, constructs the engine and waits for its worker to
   * finish initializing.
   *
   * @param {() => Object} getPayload
   * @returns {Promise<LanguageIdEngine | null>} Resolves to `null` when the
   *   payload could not be retrieved. Rejects when the worker reports an
   *   initialization error.
   */
  static async #create(getPayload) {
    let payload;
    try {
      payload = await getPayload();
    } catch (error) {
      // The payload may not be able to be downloaded. Report this as a normal
      // console.log, as this is the default behavior in automation.
      lazy.console.log(
        "The language id payload was unable to be downloaded.",
        error
      );
      return null;
    }

    const engine = new LanguageIdEngine(payload);
    await engine.isReady;
    LanguageIdEngine.#resetCacheTimeout();
    return engine;
  }

  /**
   * (Re)starts the countdown after which the cached engine is evicted.
   */
  static #resetCacheTimeout() {
    if (LanguageIdEngine.#cachedEngineTimeoutId !== null) {
      lazy.clearTimeout(LanguageIdEngine.#cachedEngineTimeoutId);
    }
    LanguageIdEngine.#cachedEngineTimeoutId = lazy.setTimeout(
      LanguageIdEngine.#clearEngineCache,
      LanguageIdEngine.#cachedEngineTimeoutMS
    );
  }

  /**
   * Drops the cached engine so that the next `getOrCreate` builds a new one.
   */
  static #clearEngineCache() {
    lazy.console.log("Clearing the engine cache");
    LanguageIdEngine.#cachedEngine = null;
    LanguageIdEngine.#cachedEngineTimeoutId = null;
  }

  /**
   * Construct and initialize the language-id worker.
   *
   * `this.isReady` settles once the worker reports the outcome of its
   * initialization.
   *
   * @param {Object} data
   * @param {string} data.type - The message type, expects "initialize".
   * @param {ArrayBuffer} data.wasmBuffer - The buffer containing the wasm binary.
   * @param {ArrayBuffer} data.modelBuffer - The buffer containing the language-id model binary.
   * @param {null | string} data.mockedLangTag - The mocked language tag value (only present when mocking).
   * @param {null | number} data.mockedConfidence - The mocked confidence value (only present when mocking).
   * @param {boolean} data.isLoggingEnabled
   */
  constructor(data) {
    this.#languageIdWorker = new Worker(
      "chrome://global/content/translations/language-id-engine-worker.js"
    );

    this.isReady = new Promise((resolve, reject) => {
      const onMessage = ({ data: response }) => {
        switch (response.type) {
          case "initialization-success":
            resolve();
            break;
          case "initialization-error":
            reject(response.error);
            break;
          default:
            // Not an initialization result: keep listening, otherwise this
            // promise could never settle.
            return;
        }
        this.#languageIdWorker.removeEventListener("message", onMessage);
      };
      this.#languageIdWorker.addEventListener("message", onMessage);
    });

    // Make sure the ArrayBuffers are transferred, not cloned.
    // https://developer.mozilla.org/en-US/docs/Web/API/Web_Workers_API/Transferable_objects
    const transferables = [data.wasmBuffer, data.modelBuffer];

    this.#languageIdWorker.postMessage(
      {
        type: "initialize",
        isLoggingEnabled: lazy.logLevel === "All",
        // Spread last: keys present on `data` take precedence over the two above.
        ...data,
      },
      transferables
    );
  }

  /**
   * Attempts to identify the human language in which the message is written.
   * Generally, the longer a message is, the higher the likelihood that the
   * identified language will be correct. Shorter messages increase the chance
   * of false identification.
   *
   * The returned confidence is a number between 0.0 and 1.0 of how confident
   * the language identification model was that it identified the correct language.
   *
   * Every call also restarts the engine cache's eviction countdown.
   *
   * @param {string} message
   * @returns {Promise<{ langTag: string, confidence: number }>} Rejects with the
   *   worker-provided error when the worker reports a "language-id-error".
   */
  identifyLanguage(message) {
    LanguageIdEngine.#resetCacheTimeout();
    const messageId = this.#messageId++;

    return new Promise((resolve, reject) => {
      const onMessage = ({ data: response }) => {
        if (response.messageId !== messageId) {
          // Multiple requests can be sent before a response is received.
          // Ensure that the response received here is the correct one.
          return;
        }
        switch (response.type) {
          case "language-id-response": {
            const { langTag, confidence } = response;
            resolve({ langTag, confidence });
            break;
          }
          case "language-id-error":
            reject(response.error);
            break;
          default:
            // Unknown message type: keep listening so the promise can still settle.
            return;
        }
        this.#languageIdWorker.removeEventListener("message", onMessage);
      };
      this.#languageIdWorker.addEventListener("message", onMessage);
      this.#languageIdWorker.postMessage({
        type: "language-id-request",
        message,
        messageId,
      });
    });
  }

  /**
   * Attempts to determine the language in which the document's content is written.
   *
   * For the moment, while we investigate which language identification library
   * we would like to use, keep this logic in sync with LanguageDetector.sys.mjs
   *
   * @param {Document} document
   * @returns {Promise<string | null>} The detected language tag, or `null` when
   *   the confidence is below DOC_LANGUAGE_DETECTION_THRESHOLD.
   */
  async identifyLanguageFromDocument(document) {
    // Grab a selection of text.
    let encoder = Cu.createDocumentEncoder("text/plain");
    encoder.init(document, "text/plain", encoder.SkipInvisibleContent);
    // Flatten the sample onto a single line before handing it to the model.
    let text = encoder
      .encodeToStringWithMaxLength(DOC_TEXT_TO_IDENTIFY_LENGTH)
      .replaceAll("\r", "")
      .replaceAll("\n", " ");

    let { langTag, confidence } = await this.identifyLanguage(text);

    lazy.console.log(
      `${langTag}(${confidence.toFixed(2)}) Detected Page Language`
    );
    return confidence >= DOC_LANGUAGE_DETECTION_THRESHOLD ? langTag : null;
  }
}

View file

@ -8,7 +8,7 @@
/* global AT_getSupportedLanguages, AT_log, AT_getScriptDirection, /* global AT_getSupportedLanguages, AT_log, AT_getScriptDirection,
AT_logError, AT_createTranslationsPort, AT_isHtmlTranslation, AT_logError, AT_createTranslationsPort, AT_isHtmlTranslation,
AT_isTranslationEngineSupported, AT_identifyLanguage */ AT_isTranslationEngineSupported, AT_createLanguageIdEngine, AT_identifyLanguage */
// Allow tests to override this value so that they can run faster. // Allow tests to override this value so that they can run faster.
// This is the delay in milliseconds. // This is the delay in milliseconds.
@ -79,6 +79,14 @@ class TranslationsState {
*/ */
this.isTranslationEngineSupported = isSupported; this.isTranslationEngineSupported = isSupported;
/**
* Allow code to wait for the engine to be created.
* @type {Promise<void>}
*/
this.languageIdEngineCreated = isSupported
? AT_createLanguageIdEngine()
: Promise.resolve();
/** /**
* @type {SupportedLanguages} * @type {SupportedLanguages}
*/ */
@ -90,12 +98,11 @@ class TranslationsState {
this.ui.setup(); this.ui.setup();
// Set the UI as ready after all of the state promises have settled. // Set the UI as ready after all of the state promises have settled.
this.supportedLanguages Promise.allSettled([
.then(() => { this.languageIdEngineCreated,
this.supportedLanguages,
]).then(() => {
this.ui.setAsReady(); this.ui.setAsReady();
})
.catch(error => {
AT_logError("Failed to load the supported languages", error);
}); });
} }
@ -108,6 +115,7 @@ class TranslationsState {
* @param {string} message * @param {string} message
*/ */
async identifyLanguage(message) { async identifyLanguage(message) {
await this.languageIdEngineCreated;
const start = performance.now(); const start = performance.now();
const { langTag, confidence } = await AT_identifyLanguage(message); const { langTag, confidence } = await AT_identifyLanguage(message);
const duration = performance.now() - start; const duration = performance.now() - start;

View file

@ -80,16 +80,20 @@ architecture to identify content as being written in a detected language.
### Technology ### Technology
Firefox Translations utilizes a [CLD2] language detector to identify in which language content is written. Firefox Translations utilizes a [WASM] version of the [fastText] library to identify in which
language content is written.
### Models ### Models
No models are currently used for language identification, since [CLD2] exists in the Firefox source tree. Unlike the language translations models in the [section](#language-translations) above, the [fastText]
model is a one-to-many model that is capable of detecting all of our supported languages
from the single model.
--- ---
## Remote Settings ## Remote Settings
Remote Settings is not currently used for language identification, since [CLD2] exists in the Firefox source tree. Firefox Translations utilizes [Remote Settings] to download [WASM] binaries, [Language Translation](#language-translation)
models and [Language Identification](#language-identification) models to use locally on your system.
--- ---
## Using Firefox Translations ## Using Firefox Translations
@ -135,7 +139,7 @@ It is, however, useful and fun, so it is documented here.
<!-- Hyperlinks --> <!-- Hyperlinks -->
[Bergamot]: https://browser.mt/ [Bergamot]: https://browser.mt/
[CLD2]: https://github.com/CLD2Owners/cld2 [fastText]: https://fasttext.cc/
[Firefox Nightly]: https://www.mozilla.org/en-US/firefox/channel/desktop/ [Firefox Nightly]: https://www.mozilla.org/en-US/firefox/channel/desktop/
[Marian]: https://aclanthology.org/P18-4020/ [Marian]: https://aclanthology.org/P18-4020/
[Remote Settings]: https://remote-settings.readthedocs.io/en/latest/ [Remote Settings]: https://remote-settings.readthedocs.io/en/latest/

View file

@ -13,11 +13,11 @@ to provide helpful information regarding contributing to Firefox Translations.
- [Versioning](#versioning) - [Versioning](#versioning)
- [Non-Breaking Changes](#non-breaking-changes) - [Non-Breaking Changes](#non-breaking-changes)
- [Breaking Changes](#breaking-changes) - [Breaking Changes](#breaking-changes)
- [Building fastText](#building-fasttext)
- [Downloading The Models](#downloading-the-models) - [Downloading The Models](#downloading-the-models)
- [Building the WASM Binary](#building-the-wasm-binary) - [Building the WASM Binary](#building-the-wasm-binary)
- [Dependencies](#dependencies) - [Dependencies](#dependencies)
- [Modifying the EMCXXFLAGS](#modifying-the-emcxxflags) - [Modifying the EMCXXFLAGS](#modifying-the-emcxxflags)
- [Language Identification](#language-identification)
- [Building Bergamot](#building-bergamot) - [Building Bergamot](#building-bergamot)
--- ---
@ -127,11 +127,290 @@ Tying breaking changes to releases in this way frees up Firefox Translations to
switching one third-party library for another in the compiled source code, while allowing older versions of Firefox to continue utilizing the old library and allowing newer versions of Firefox to utilize the new library. switching one third-party library for another in the compiled source code, while allowing older versions of Firefox to continue utilizing the old library and allowing newer versions of Firefox to utilize the new library.
--- ---
## Language Identification ## Building fastText
Translations currently uses the [CLD2] language detector. ### Downloading the Models
We have previously experimented with using the [fastText] language detector, but we opted to use [CLD2] due to complications with [fastText] [WASM] runtime performance. The benefit of the [CLD2] language detector is that it already exists in the Firefox source tree. In the future, we would still like to explore moving to a more modern language detector such as [CLD3], or perhaps something else. The fastText model that we use can be downloaded directly from the fastText website:<br>
> [https://fasttext.cc/docs/en/language-identification.html](https://fasttext.cc/docs/en/language-identification.html)
Firefox Translations uses the compressed, **`lid.176.ftz`** model.
### Building the WASM Binary
To build the fastText [WASM] binary, we can follow the steps in the [Requirements] section of the fastText website.
#### Dependencies
**C++ Compiler**<br>
Any of the C++ compilers from [Getting Set Up To Work On The Firefox Codebase] will be sufficient for this.
**emsdk**<br>
Follow the [Download and Install] instructions for setting up the emscripten sdk.
#### Modifying the EMCXXFLAGS
At the time of writing, the latest commit on the fastText repo ([3697152e0fd772d9185697fdbd4a1d340ca5571d])
is not compatible by default with the latest version of [emscripten (3.1.35)].
A few changes need to be made to the Makefile in order to generate the fastText [WASM] for use in Firefox.
**1) Disable DYNAMIC_EXECUTION**<br>
In the `Makefile` for the fastText repo, there is a variable called **`EMCXXFLAGS`**.<br>
We need to add the following flag to this variable:
```
-s "DYNAMIC_EXECUTION=0"
```
If this flag is not set to **`0`**, then emscripten will [generate functions] that use the [eval()] function.
[eval()] is not allowed in the context in which fastText runs in Firefox, for security reasons.
**2) Rename EXTRA_EXPORTED_RUNTIME_METHODS**<br>
In [emscripten (2.0.18)], **`EXTRA_EXPORTED_RUNTIME_METHODS`** was deprecated in favor of **`EXPORTED_RUNTIME_METHODS`**.
The fastText Makefile still has the old flag, so we need to update the name.
**3) Use the -r Flag When Appropriate**<br>
In [emscripten (2.0.3)] the following change was made:
> "The default output format is now executable JavaScript. Previously we would default to output objecting files unless, for example, the output name ended in **`.js`**. This is contrary to behavior of clang and gcc. Now emscripten will always produce and executable unless the **`-c`**, **`-r`** or **`-shared`** flags are given. This is true even when the name of the output file ends in **`.o`**. e.g, **`emcc foo.c -o foo.o`** will produce a JavaScript file called **`foo.o`**. This might surprise some users (although it matches the behavior of existing toolchains) so we now produce a warning in this case."
The Makefile needs to be modified to use the **`-r`** flag when appropriate. These changes are modeled after comments on this [GitHub Issue].
**Cumulative Changes**<br>
Here is a diff of the full changes needed for the Makefile at the time of writing:
```diff
diff --git a/Makefile b/Makefile
index e246f79..396ae0b 100644
--- a/Makefile
+++ b/Makefile
@@ -73,7 +73,9 @@ clean:
EMCXX = em++
-EMCXXFLAGS = --bind --std=c++11 -s WASM=1 -s ALLOW_MEMORY_GROWTH=1 -s "EXTRA_EXPORTED_RUNTIME_METHODS=['addOnPostRun', 'FS']" -s "DISABLE_EXCEPTION_CATCHING=0" -s "EXCEPTION_DEBUG=1" -s "FORCE_FILESYSTEM=1" -s "MODULARIZE=1" -s "EXPORT_ES6=1" -s 'EXPORT_NAME="FastTextModule"' -Isrc/
+EMCXXFLAGS_BASE = --bind --std=c++11 -s WASM=1 -s ALLOW_MEMORY_GROWTH=1 -s "EXPORTED_RUNTIME_METHODS=['addOnPostRun', 'FS']" -s "DISABLE_EXCEPTION_CATCHING=0" -s "EXCEPTION_DEBUG=0" -s "DYNAMIC_EXECUTION=0" -s "FORCE_FILESYSTEM=1" -s "MODULARIZE=1" -s "EXPORT_ES6=1" -s 'EXPORT_NAME="FastTextModule"' -Isrc/
+EMCXXFLAGS = $(EMCXXFLAGS_BASE) -r
+EMCXXFLAGS_JS = $(EMCXXFLAGS_BASE)
EMOBJS = args.bc autotune.bc matrix.bc dictionary.bc loss.bc productquantizer.bc densematrix.bc quantmatrix.bc vector.bc model.bc utils.bc meter.bc fasttext.bc main.bc
@@ -120,6 +122,6 @@ fasttext.bc: src/fasttext.cc src/*.h
$(EMCXX) $(EMCXXFLAGS) src/fasttext.cc -o fasttext.bc
webassembly/fasttext_wasm.js: $(EMOBJS) webassembly/fasttext_wasm.cc Makefile
- $(EMCXX) $(EMCXXFLAGS) $(EMOBJS) -o webassembly/fasttext_wasm.js
+ $(EMCXX) $(EMCXXFLAGS_JS) $(EMOBJS) -o webassembly/fasttext_wasm.js
```
After modifying the Makefile in the previous section, running **`make wasm`** in the fastText repo should run without warnings or errors and the following files will be generated in the **`webassembly`** directory:
```
webassembly
├── fasttext.js
├── fasttext_wasm.js
└── fasttext_wasm.wasm
```
#### Modifying fasttext_wasm.js
There are a few changes we need to make to the **`fasttext_wasm.js`** file to make it compatible with use in Firefox.
**1) Define a function, not a module**<br>
The generated code exports a module, but this needs to be modified into a function for use in [importScripts()] in a worker.
At the top of the file we need to make the following changes:
```diff
diff --git a/toolkit/components/translations/fasttext/fasttext_wasm.js b/toolkit/components/translations/fasttext/fasttext_wasm.js
index 64c6184a85851..4802343da2a03 100644
--- a/toolkit/components/translations/fasttext/fasttext_wasm.js
+++ b/toolkit/components/translations/fasttext/fasttext_wasm.js
@@ -1,9 +1,6 @@
-var FastTextModule = (() => {
- var _scriptDir = import.meta.url;
-
- return (
-async function(FastTextModule = {}) {
+async function loadFastTextModule(FastTextModule = {}) {
+ const _scriptDir = null;
// include: shell.js
// The Module object: Our interface to the outside world. We import
```
Here we are defining a function rather than a variable, and we are setting **`_scriptDir`** to null
because **`import.meta.url`** is only available for use within modules.
Next we need to modify the bottom of the file to match these changes:
```diff
diff --git a/toolkit/components/translations/fasttext/fasttext_wasm.js b/toolkit/components/translations/fasttext/fasttext_wasm.js
index 64c6184a85851..0a6fca3f524e4 100644
--- a/toolkit/components/translations/fasttext/fasttext_wasm.js
+++ b/toolkit/components/translations/fasttext/fasttext_wasm.js
@@ -8287,7 +8287,3 @@ run();
return FastTextModule.ready
}
-
-);
-})();
-export default FastTextModule;
```
**2) Remove unneeded environment checks**<br>
Next we need to remove unneeded checks for different environments:
```JavaScript
if (ENVIRONMENT_IS_NODE) {
// ...
} else
if (ENVIRONMENT_IS_SHELL) {
// ...
} else
if (ENVIRONMENT_IS_WEB || ENVIRONMENT_IS_WORKER) {
// ...
} else
{
throw new Error('environment detection error');
}
```
Since this code will only be run inside of a worker, we want to delete the blocks that deal with **`ENVIRONMENT_IS_NODE`** and **`ENVIRONMENT_IS_SHELL`**. In fact, this code will fail to be imported by [importScripts()] if we don't do this.
**3) Remove the use of `import.meta.url`**<br>
Finally, there is a use of **`import.meta.url`** that we need to remove.
```diff
diff --git a/toolkit/components/translations/fasttext/fasttext_wasm.js b/toolkit/components/translations/fasttext/fasttext_wasm.js
index 64c6184a85851..746cbae2ec952 100644
--- a/toolkit/components/translations/fasttext/fasttext_wasm.js
+++ b/toolkit/components/translations/fasttext/fasttext_wasm.js
@@ -746,7 +746,7 @@ if (Module['locateFile']) {
}
} else {
// Use bundler-friendly `new URL(..., import.meta.url)` pattern; works in browsers too.
- wasmBinaryFile = new URL('fasttext_wasm.wasm', import.meta.url).href;
+ wasmBinaryFile = null;
}
function getBinary(file) {
```
As mentioned before, **`import.meta.url`** is not allowed outside of modules and cannot be used with [importScripts()]
in the worker code that we are creating.
It is okay to set this to null here, because we will be providing the **`wasmBinaryFile`** via [Remote Settings].
**4) Minifying the file**<br>
The generated **`fasttext_wasm.js`** file is very large. To minimize the impact on the size of the code in the Firefox source tree, we want to minify the file using the [minify] tool.
```
Size Name
291k ├── fasttext_wasm.js (original)
109k └── fasttext_wasm.js (minified)
```
**5) Adding the license**<br>
Finally, we should add a copy of the current fastText MIT license to the top of the minified **`fasttext_wasm.js`** file.
You should be able to paste this from the generated **`fasttext.js`** file.
#### Modifying fasttext.js
```{note}
It is likely that the source file in tree already has these changes and is already sufficient,
even if **`fasttext_wasm.js`** has been recently updated. Try running it first as-is before replacing
and re-modifying.
```
Next we need to modify **`fasttext.js`** to utilize the changes that we made to **`fasttext_wasm.js`** and also to
not be a module so that we can import it using [importScripts()].
These changes do the following:
1) Define a variable called **`fastTextModule`** for use in the worker scripts.
2) Utilize the **`loadFastTextModule()`** function that we defined in **`fasttext_wasm.js`**
3) Add a function **`loadModelBinary()`** that takes the model binary directly, which we will provide through [Remote Settings].
4) Remove any module exports.
```diff
diff --git a/toolkit/components/translations/fasttext/fasttext.js b/toolkit/components/translations/fasttext/fasttext.js
index 86600b9ac9e28..2c49b3faaeedc 100644
--- a/toolkit/components/translations/fasttext/fasttext.js
+++ b/toolkit/components/translations/fasttext/fasttext.js
@@ -6,20 +6,30 @@
* LICENSE file in the root directory of this source tree.
*/
-import fastTextModularized from './fasttext_wasm.js';
-const fastTextModule = fastTextModularized();
+let fastTextModule;
+
+const _initFastTextModule = async function (wasmModule) {
+ try {
+ fastTextModule = await loadFastTextModule(wasmModule);
+ } catch(e) {
+ console.error(e);
+ }
+ return true
+}
let postRunFunc = null;
const addOnPostRun = function(func) {
postRunFunc = func;
};
-fastTextModule.addOnPostRun(() => {
- if (postRunFunc) {
- postRunFunc();
- }
-});
+const loadFastText = (wasmModule) => {
+ _initFastTextModule(wasmModule).then((res) => {
+ if (postRunFunc) {
+ postRunFunc();
+ }
+ })
+}
const thisModule = this;
const trainFileInWasmFs = 'train.txt';
const testFileInWasmFs = 'test.txt';
@@ -41,7 +51,7 @@ const getFloat32ArrayFromHeap = (len) => {
const heapToFloat32 = (r) => new Float32Array(r.buffer, r.ptr, r.size);
class FastText {
- constructor() {
+ constructor(fastTextModule) {
this.f = new fastTextModule.FastText();
}
@@ -77,6 +87,15 @@ class FastText {
});
}
+ loadModelBinary(buffer) {
+ const fastTextNative = this.f;
+ const byteArray = new Uint8Array(buffer);
+ const FS = fastTextModule.FS;
+ FS.writeFile(modelFileInWasmFs, byteArray);
+ fastTextNative.loadModel(modelFileInWasmFs);
+ return new FastTextModel(fastTextNative);
+ }
+
_train(url, modelName, kwargs = {}, callback = null) {
const fetchFunc = (thisModule && thisModule.fetch) || fetch;
const fastTextNative = this.f;
@@ -515,6 +534,3 @@ class FastTextModel {
});
}
}
-
-
-export {FastText, addOnPostRun};
```
--- ---
## Building Bergamot ## Building Bergamot
@ -140,21 +419,20 @@ TODO
<!-- Hyperlinks --> <!-- Hyperlinks -->
[3697152e0fd772d9185697fdbd4a1d340ca5571d]: https://github.com/facebookresearch/fastText/tree/3697152e0fd772d9185697fdbd4a1d340ca5571d
[Bugzilla]: https://bugzilla.mozilla.org/enter_bug.cgi?product=Cloud%20Services&component=Server%3A%20Remote%20Settings [Bugzilla]: https://bugzilla.mozilla.org/enter_bug.cgi?product=Cloud%20Services&component=Server%3A%20Remote%20Settings
[Child]: https://searchfox.org/mozilla-central/search?q=TranslationsChild [Child]: https://searchfox.org/mozilla-central/search?q=TranslationsChild
[CLD2]: https://github.com/CLD2Owners/cld2
[CLD3]: https://github.com/google/cld3
[Download and Install]: https://emscripten.org/docs/getting_started/downloads.html#download-and-install [Download and Install]: https://emscripten.org/docs/getting_started/downloads.html#download-and-install
[emscripten (2.0.3)]: https://github.com/emscripten-core/emscripten/blob/main/ChangeLog.md#203-09102020 [emscripten (2.0.3)]: https://github.com/emscripten-core/emscripten/blob/main/ChangeLog.md#203-09102020
[emscripten (2.0.18)]: https://github.com/emscripten-core/emscripten/blob/main/ChangeLog.md#2018-04232021 [emscripten (2.0.18)]: https://github.com/emscripten-core/emscripten/blob/main/ChangeLog.md#2018-04232021
[emscripten (3.1.35)]: https://github.com/emscripten-core/emscripten/blob/main/ChangeLog.md#3135---040323 [emscripten (3.1.35)]: https://github.com/emscripten-core/emscripten/blob/main/ChangeLog.md#3135---040323
[Environments]: https://remote-settings.readthedocs.io/en/latest/getting-started.html#environments [Environments]: https://remote-settings.readthedocs.io/en/latest/getting-started.html#environments
[eval()]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/eval [eval()]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/eval
[fastText]: https://fasttext.cc/
[Filter Expressions]: https://remote-settings.readthedocs.io/en/latest/target-filters.html#filter-expressions [Filter Expressions]: https://remote-settings.readthedocs.io/en/latest/target-filters.html#filter-expressions
[Firefox Release Schedule]: https://wiki.mozilla.org/Release_Management/Calendar [Firefox Release Schedule]: https://wiki.mozilla.org/Release_Management/Calendar
[generate functions]: https://emscripten.org/docs/api_reference/emscripten.h.html?highlight=dynamic_execution#functions [generate functions]: https://emscripten.org/docs/api_reference/emscripten.h.html?highlight=dynamic_execution#functions
[Getting Set Up To Work On The Firefox Codebase]: https://firefox-source-docs.mozilla.org/setup/index.html [Getting Set Up To Work On The Firefox Codebase]: https://firefox-source-docs.mozilla.org/setup/index.html
[GitHub Issue]: https://github.com/facebookresearch/fastText/pull/1227#issuecomment-1353830003
[importScripts()]: https://developer.mozilla.org/en-US/docs/Web/API/WorkerGlobalScope/importScripts [importScripts()]: https://developer.mozilla.org/en-US/docs/Web/API/WorkerGlobalScope/importScripts
[JSWindowActors]: https://firefox-source-docs.mozilla.org/dom/ipc/jsactors.html#jswindowactor [JSWindowActors]: https://firefox-source-docs.mozilla.org/dom/ipc/jsactors.html#jswindowactor
[minify]: https://github.com/tdewolff/minify [minify]: https://github.com/tdewolff/minify
@ -162,6 +440,7 @@ TODO
[Step 3]: https://remote-settings.readthedocs.io/en/latest/getting-started.html#create-a-new-official-type-of-remote-settings [Step 3]: https://remote-settings.readthedocs.io/en/latest/getting-started.html#create-a-new-official-type-of-remote-settings
[remote-settings-devtools]: https://github.com/mozilla-extensions/remote-settings-devtools/releases [remote-settings-devtools]: https://github.com/mozilla-extensions/remote-settings-devtools/releases
[Remote Settings]: https://remote-settings.readthedocs.io/en/latest/ [Remote Settings]: https://remote-settings.readthedocs.io/en/latest/
[Requirements]: https://fasttext.cc/docs/en/webassembly-module.html#requirements
[toolkit/components/translations]: https://searchfox.org/mozilla-central/search?q=toolkit%2Fcomponents%2Ftranslations [toolkit/components/translations]: https://searchfox.org/mozilla-central/search?q=toolkit%2Fcomponents%2Ftranslations
[WASM]: https://webassembly.org/ [WASM]: https://webassembly.org/
[Workers]: https://searchfox.org/mozilla-central/search?q=%2Ftranslations.*worker&path=&case=false&regexp=true [Workers]: https://searchfox.org/mozilla-central/search?q=%2Ftranslations.*worker&path=&case=false&regexp=true

View file

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2016-present, Facebook, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View file

@ -0,0 +1,536 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
let fastTextModule;
/**
 * Loads the fastText WASM module and stores it in `fastTextModule`.
 *
 * A loading failure is logged with console.error rather than propagated, in
 * which case `fastTextModule` keeps its previous value.
 *
 * @param {Object} wasmModule - Passed through to loadFastTextModule().
 * @return {Promise<boolean>} Always resolves to true, even after a failure.
 */
const _initFastTextModule = async (wasmModule) => {
  try {
    const loadedModule = await loadFastTextModule(wasmModule);
    fastTextModule = loadedModule;
  } catch (error) {
    console.error(error);
  }
  return true;
};
// The callback to invoke once module initialization has been attempted, if any.
let postRunFunc = null;

/**
 * Registers the callback to run after initialization, replacing any callback
 * registered earlier.
 *
 * @param {Function} func
 */
const addOnPostRun = (func) => {
  postRunFunc = func;
};
/**
 * Starts initializing the fastText module and, once that attempt has finished,
 * invokes the callback registered through addOnPostRun (when there is one).
 *
 * Nothing is returned; completion is only observable through that callback.
 *
 * @param {Object} wasmModule - Passed through to _initFastTextModule().
 */
const loadFastText = (wasmModule) => {
  // `postRunFunc` is read when initialization finishes, not when this function
  // is called, so a callback registered in the meantime is still honoured.
  const notifyPostRun = () => {
    if (!postRunFunc) {
      return;
    }
    postRunFunc();
  };

  _initFastTextModule(wasmModule).then(notifyPostRun);
};
// Captures the top-level `this`; later code checks it for a `fetch` before
// falling back to the global `fetch`.
// NOTE(review): presumably the worker global when loaded via importScripts() — confirm.
const thisModule = this;
// File names used when writing data into the module's FS (see the FS.writeFile calls).
const trainFileInWasmFs = 'train.txt';
// NOTE(review): no use of this name is visible in this part of the file — confirm it is still needed.
const testFileInWasmFs = 'test.txt';
const modelFileInWasmFs = 'model.bin';
/**
 * Allocates room for `len` 32-bit floats with the wasm module's `_malloc`
 * and describes the allocation.
 *
 * NOTE(review): nothing in this file releases the allocation again.
 *
 * @param {int} len
 *     number of float elements to reserve.
 *
 * @return {{ptr: number, size: number, buffer: ArrayBuffer}}
 *     `ptr` is the byte offset of the allocation, `size` is `len`, and
 *     `buffer` is the buffer behind `fastTextModule.HEAPU8`.
 */
const getFloat32ArrayFromHeap = (len) => {
  const byteLength = len * Float32Array.BYTES_PER_ELEMENT;
  const address = fastTextModule._malloc(byteLength);
  // Building the byte view also range-checks the allocation against the heap.
  const heapView = new Uint8Array(
    fastTextModule.HEAPU8.buffer,
    address,
    byteLength
  );
  const { byteOffset: ptr, buffer } = heapView;
  return { ptr, size: len, buffer };
};
/**
 * Builds a `Float32Array` view over a region described by an object with
 * `buffer`, `ptr` (byte offset) and `size` (element count) properties, such
 * as the one returned by `getFloat32ArrayFromHeap`. No data is copied.
 */
const heapToFloat32 = ({ buffer, ptr, size }) =>
  new Float32Array(buffer, ptr, size);
class FastText {
  /**
   * Wraps a native fastText object created from the given wasm module.
   *
   * Note that the methods below reach the wasm file system and enums through
   * the module-level `fastTextModule` variable, not through this argument.
   *
   * @param {object} fastTextModule
   *     wasm module whose `FastText` binding is instantiated.
   */
  constructor(fastTextModule) {
    this.f = new fastTextModule.FastText();
  }

  /**
   * loadModel
   *
   * Loads the model file from the specified url, and returns the
   * corresponding `FastTextModel` object.
   *
   * @param {string} url
   *     the url of the model file.
   *
   * @return {Promise} promise object that resolves to a `FastTextModel`.
   *     It rejects when fetching, writing or loading the model fails.
   *
   */
  loadModel(url) {
    const fetchFunc = (thisModule && thisModule.fetch) || fetch;
    const fastTextNative = this.f;
    return (async () => {
      const response = await fetchFunc(url);
      const bytes = await response.arrayBuffer();
      // The native loader reads from the wasm file system, so stage the
      // downloaded bytes there first.
      fastTextModule.FS.writeFile(modelFileInWasmFs, new Uint8Array(bytes));
      fastTextNative.loadModel(modelFileInWasmFs);
      return new FastTextModel(fastTextNative);
    })();
  }

  /**
   * loadModelBinary
   *
   * Loads a model from bytes that are already in memory.
   *
   * @param {ArrayBuffer} buffer
   *     the contents of the model file.
   *
   * @return {FastTextModel} the loaded model
   *
   */
  loadModelBinary(buffer) {
    const fastTextNative = this.f;
    const byteArray = new Uint8Array(buffer);
    const FS = fastTextModule.FS;
    FS.writeFile(modelFileInWasmFs, byteArray);
    fastTextNative.loadModel(modelFileInWasmFs);
    return new FastTextModel(fastTextNative);
  }

  /**
   * _train
   *
   * Shared implementation of `trainSupervised` and `trainUnsupervised`:
   * downloads the input file, builds the native arguments and trains.
   *
   * @param {string} url
   *     the url of the input file.
   *
   * @param {string} modelName
   *     key into `fastTextModule.ModelName`.
   *
   * @param {dict} kwargs
   *     train parameters. `loss`, when given, is a key into
   *     `fastTextModule.LossName`; otherwise `hs` is used.
   *
   * @param {function} callback
   *     train callback function, passed to the native `train`.
   *
   * @return {Promise} promise object that resolves to a `FastTextModel`
   *
   */
  _train(url, modelName, kwargs = {}, callback = null) {
    const fetchFunc = (thisModule && thisModule.fetch) || fetch;
    const fastTextNative = this.f;
    return (async () => {
      const response = await fetchFunc(url);
      const bytes = await response.arrayBuffer();
      fastTextModule.FS.writeFile(trainFileInWasmFs, new Uint8Array(bytes));

      const argsList = ['lr', 'lrUpdateRate', 'dim', 'ws', 'epoch',
        'minCount', 'minCountLabel', 'neg', 'wordNgrams', 'loss',
        'model', 'bucket', 'minn', 'maxn', 't', 'label', 'verbose',
        'pretrainedVectors', 'saveOutput', 'seed', 'qout', 'retrain',
        'qnorm', 'cutoff', 'dsub', 'autotuneValidationFile',
        'autotuneMetric', 'autotunePredictions', 'autotuneDuration',
        'autotuneModelSize'];
      const args = new fastTextModule.Args();
      for (const key of argsList) {
        if (key in kwargs) {
          args[key] = kwargs[key];
        }
      }
      // `model` and `loss` are enums on the native side, so both are always
      // resolved through the module's lookup tables. This includes the
      // default loss, which previously was assigned as the raw string 'hs'.
      args.model = fastTextModule.ModelName[modelName];
      const lossName = ('loss' in kwargs) ? kwargs['loss'] : 'hs';
      args.loss = fastTextModule.LossName[lossName];
      args.thread = 1;
      args.input = trainFileInWasmFs;
      fastTextNative.train(args, callback);
      return new FastTextModel(fastTextNative);
    })();
  }

  /**
   * trainSupervised
   *
   * Downloads the input file from the specified url, trains a supervised
   * model and returns a `FastTextModel` object.
   *
   * @param {string} url
   *     the url of the input file.
   *     The input file must contain at least one label per line. For an
   *     example consult the example datasets which are part of the fastText
   *     repository such as the dataset pulled by classification-example.sh.
   *
   * @param {dict} kwargs
   *     train parameters.
   *     For example {'lr': 0.5, 'epoch': 5}
   *
   * @param {function} callback
   *     train callback function
   *     `callback` function is called regularly from the train loop:
   *     `callback(progress, loss, wordsPerSec, learningRate, eta)`
   *
   * @return {Promise} promise object that resolves to a `FastTextModel`
   *
   */
  trainSupervised(url, kwargs = {}, callback) {
    return this._train(url, 'supervised', kwargs, callback);
  }

  /**
   * trainUnsupervised
   *
   * Downloads the input file from the specified url, trains an unsupervised
   * model and returns a `FastTextModel` object.
   *
   * @param {string} url
   *     the url of the input file.
   *     The input file must not contain any labels or use the specified label
   *     prefix unless it is ok for those words to be ignored. For an example
   *     consult the dataset pulled by the example script
   *     word-vector-example.sh which is part of the fastText repository.
   *
   * @param {string} modelName
   *     Model to be used for unsupervised learning. `cbow` or `skipgram`.
   *
   * @param {dict} kwargs
   *     train parameters.
   *     For example {'lr': 0.5, 'epoch': 5}
   *
   * @param {function} callback
   *     train callback function
   *     `callback` function is called regularly from the train loop:
   *     `callback(progress, loss, wordsPerSec, learningRate, eta)`
   *
   * @return {Promise} promise object that resolves to a `FastTextModel`
   *
   */
  trainUnsupervised(url, modelName, kwargs = {}, callback) {
    return this._train(url, modelName, kwargs, callback);
  }
}
class FastTextModel {
  /**
   * `FastTextModel` represents a trained model.
   *
   * @constructor
   *
   * @param {object} fastTextNative
   *     webassembly object that makes the bridge between js and C++
   */
  constructor(fastTextNative) {
    this.f = fastTextNative;
  }

  /**
   * isQuant
   *
   * @return {bool} true if the model is quantized
   *
   */
  isQuant() {
    return this.f.isQuant;
  }

  /**
   * getDimension
   *
   * @return {int} the dimension (size) of a lookup vector (hidden layer)
   *
   */
  getDimension() {
    return this.f.args.dim;
  }

  /**
   * getWordVector
   *
   * @param {string} word
   *
   * @return {Float32Array} the vector representation of `word`.
   *
   */
  getWordVector(word) {
    const b = getFloat32ArrayFromHeap(this.getDimension());
    this.f.getWordVector(b, word);
    return heapToFloat32(b);
  }

  /**
   * getSentenceVector
   *
   * @param {string} text
   *     a single line of text; it must not contain a newline.
   *
   * @return {Float32Array} the vector representation of `text`.
   *
   * @throws {Error} if `text` contains a newline character.
   *
   */
  getSentenceVector(text) {
    // The original had the message as a bare string statement, so the check
    // had no effect; reject multi-line input as the message intends.
    if (text.includes('\n')) {
      throw new Error(
        "sentence vector processes one line at a time (remove '\\n')"
      );
    }
    text += '\n';
    const b = getFloat32ArrayFromHeap(this.getDimension());
    this.f.getSentenceVector(b, text);
    return heapToFloat32(b);
  }

  /**
   * getNearestNeighbors
   *
   * returns the nearest `k` neighbors of `word`.
   *
   * @param {string} word
   * @param {int} k
   *
   * @return {Array.<Pair.<number, string>>}
   *     words and their corresponding cosine similarities.
   *
   */
  getNearestNeighbors(word, k = 10) {
    return this.f.getNN(word, k);
  }

  /**
   * getAnalogies
   *
   * returns the nearest `k` neighbors of the operation
   * `wordA - wordB + wordC`.
   *
   * @param {string} wordA
   * @param {string} wordB
   * @param {string} wordC
   * @param {int} k, defaults to 10 like `getNearestNeighbors`
   *
   * @return {Array.<Pair.<number, string>>}
   *     words and their corresponding cosine similarities
   *
   */
  getAnalogies(wordA, wordB, wordC, k = 10) {
    // The native binding takes `k` first.
    return this.f.getAnalogies(k, wordA, wordB, wordC);
  }

  /**
   * getWordId
   *
   * Given a word, get the word id within the dictionary.
   * Returns -1 if word is not in the dictionary.
   *
   * @return {int} word id
   *
   */
  getWordId(word) {
    return this.f.getWordId(word);
  }

  /**
   * getSubwordId
   *
   * Given a subword, return the index (within input matrix) it hashes to.
   *
   * @return {int} subword id
   *
   */
  getSubwordId(subword) {
    return this.f.getSubwordId(subword);
  }

  /**
   * getSubwords
   *
   * returns the subwords and their indices.
   *
   * @param {string} word
   *
   * @return {Pair.<Array.<string>, Array.<int>>}
   *     words and their corresponding indices
   *
   */
  getSubwords(word) {
    return this.f.getSubwords(word);
  }

  /**
   * getInputVector
   *
   * Given an index, get the corresponding vector of the Input Matrix.
   *
   * @param {int} ind
   *
   * @return {Float32Array} the vector of the `ind`'th index
   *
   */
  getInputVector(ind) {
    const b = getFloat32ArrayFromHeap(this.getDimension());
    this.f.getInputVector(b, ind);
    return heapToFloat32(b);
  }

  /**
   * predict
   *
   * Given a string, get a list of labels and a list of corresponding
   * probabilities. k controls the number of returned labels.
   *
   * @param {string} text
   * @param {int} k, the number of predictions to be returned
   * @param {number} probability threshold
   *
   * @return {Array.<Pair.<number, string>>}
   *     labels and their probabilities
   *
   */
  predict(text, k = 1, threshold = 0.0) {
    return this.f.predict(text, k, threshold);
  }

  /**
   * getInputMatrix
   *
   * Get a reference to the full input matrix of a Model. This only
   * works if the model is not quantized.
   *
   * @return {DenseMatrix}
   *     densematrix with functions: `rows`, `cols`, `at(i,j)`
   *
   * @throws {Error} if the model is quantized.
   *
   * example:
   *     let inputMatrix = model.getInputMatrix();
   *     let value = inputMatrix.at(1, 2);
   */
  getInputMatrix() {
    if (this.isQuant()) {
      throw new Error("Can't get quantized Matrix");
    }
    return this.f.getInputMatrix();
  }

  /**
   * getOutputMatrix
   *
   * Get a reference to the full output matrix of a Model. This only
   * works if the model is not quantized.
   *
   * @return {DenseMatrix}
   *     densematrix with functions: `rows`, `cols`, `at(i,j)`
   *
   * @throws {Error} if the model is quantized.
   *
   * example:
   *     let outputMatrix = model.getOutputMatrix();
   *     let value = outputMatrix.at(1, 2);
   */
  getOutputMatrix() {
    if (this.isQuant()) {
      throw new Error("Can't get quantized Matrix");
    }
    return this.f.getOutputMatrix();
  }

  /**
   * getWords
   *
   * Get the entire list of words of the dictionary including the frequency
   * of the individual words. This does not include any subwords. For that
   * please consult the function `getSubwords`.
   *
   * @return {Pair.<Array.<string>, Array.<int>>}
   *     words and their corresponding frequencies
   *
   */
  getWords() {
    return this.f.getWords();
  }

  /**
   * getLabels
   *
   * Get the entire list of labels of the dictionary including the frequency
   * of the individual labels.
   *
   * @return {Pair.<Array.<string>, Array.<int>>}
   *     labels and their corresponding frequencies
   *
   */
  getLabels() {
    return this.f.getLabels();
  }

  /**
   * getLine
   *
   * Split a line of text into words and labels. Labels must start with
   * the prefix used to create the model (__label__ by default).
   *
   * @param {string} text
   *
   * @return {Pair.<Array.<string>, Array.<string>>}
   *     words and labels
   *
   */
  getLine(text) {
    return this.f.getLine(text);
  }

  /**
   * saveModel
   *
   * Saves the model file in web assembly in-memory FS and returns a blob
   *
   * @return {Blob} blob data of the file saved in web assembly FS
   *
   */
  saveModel() {
    this.f.saveModel(modelFileInWasmFs);
    const content = fastTextModule.FS.readFile(modelFileInWasmFs,
      { encoding: 'binary' });
    // The MIME type previously carried a stray leading space.
    return new Blob(
      [new Uint8Array(content)],
      { type: 'application/octet-stream' }
    );
  }

  /**
   * test
   *
   * Downloads the test file from the specified url, evaluates the supervised
   * model with it.
   *
   * @param {string} url
   * @param {int} k, the number of predictions to be returned (default 1)
   * @param {number} probability threshold (default 0.0)
   *
   * @return {Promise} promise object that resolves to a `Meter` object
   *
   * example:
   *     model.test("/absolute/url/to/test.txt", 1, 0.0).then((meter) => {
   *       console.log(meter.precision);
   *       console.log(meter.recall);
   *       console.log(meter.f1Score);
   *       console.log(meter.nexamples());
   *     });
   *
   */
  test(url, k = 1, threshold = 0.0) {
    const fetchFunc = (thisModule && thisModule.fetch) || fetch;
    const fastTextNative = this.f;
    return (async () => {
      const response = await fetchFunc(url);
      const bytes = await response.arrayBuffer();
      // The native evaluator reads its input from the wasm file system.
      fastTextModule.FS.writeFile(testFileInWasmFs, new Uint8Array(bytes));
      return fastTextNative.test(testFileInWasmFs, k, threshold);
    })();
  }
}

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1,44 @@
# Version of this schema
schema: 1
bugzilla:
# Bugzilla product and component for this directory and subdirectories
product: Firefox
component: Translation
# Document the source of externally hosted code
origin:
# Short name of the package/library
name: fasttext
description: The JavaScript emscripten worker to run fastText
# Full URL for the package's homepage/etc
# Usually different from repository url
url: https://github.com/facebookresearch/fastText
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
release: v0.9.2
# Revision to pull in
# Must be a long or short commit SHA (long preferred)
revision: 3697152e0fd772d9185697fdbd4a1d340ca5571d
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/
# Multiple licenses can be specified (as a YAML list)
# A "LICENSE" file must exist containing the full license text
license: MIT
notes: >
This code was generated from the fastText repository on the following revision:
3697152e0fd772d9185697fdbd4a1d340ca5571d
https://github.com/facebookresearch/fastText
There are detailed instructions in the Firefox Source Docs on how to build these
dependencies locally.
https://firefox-source-docs.mozilla.org/toolkit/components/translations/resources/02_contributing.html#building-fasttext

View file

@ -4,6 +4,10 @@
toolkit.jar: toolkit.jar:
content/global/translations/bergamot-translator.js (bergamot-translator/bergamot-translator.js) content/global/translations/bergamot-translator.js (bergamot-translator/bergamot-translator.js)
content/global/translations/fasttext.js (fasttext/fasttext.js)
content/global/translations/fasttext_wasm.js (fasttext/fasttext_wasm.js)
content/global/translations/language-id-engine.sys.mjs (content/language-id-engine.sys.mjs)
content/global/translations/language-id-engine-worker.js (content/language-id-engine-worker.js)
content/global/translations/simd-detect-worker.js (content/simd-detect-worker.js) content/global/translations/simd-detect-worker.js (content/simd-detect-worker.js)
content/global/translations/translations-document.sys.mjs (content/translations-document.sys.mjs) content/global/translations/translations-document.sys.mjs (content/translations-document.sys.mjs)
content/global/translations/translations-engine.html (content/translations-engine.html) content/global/translations/translations-engine.html (content/translations-engine.html)

View file

@ -173,10 +173,13 @@ add_task(async function test_about_translations_html() {
add_task(async function test_about_translations_language_identification() { add_task(async function test_about_translations_language_identification() {
await openAboutTranslations({ await openAboutTranslations({
detectedLangTag: "en",
detectedLanguageConfidence: "0.98",
languagePairs: [ languagePairs: [
{ fromLang: "en", toLang: "fr" }, { fromLang: "en", toLang: "fr" },
{ fromLang: "fr", toLang: "en" }, { fromLang: "fr", toLang: "en" },
], ],
prefs: [["browser.translations.languageIdentification.useFastText", true]],
runInPage: async ({ selectors }) => { runInPage: async ({ selectors }) => {
const { document, window } = content; const { document, window } = content;
Cu.waiveXrays(window).DEBOUNCE_DELAY = 5; // Make the timer run faster for tests. Cu.waiveXrays(window).DEBOUNCE_DELAY = 5; // Make the timer run faster for tests.
@ -218,7 +221,7 @@ add_task(async function test_about_translations_language_identification() {
is( is(
translation, translation,
translationResult.innerText, translationResult.innerText,
"The language identification correctly informs the translation." "The language identification engine correctly informs the translation."
); );
} }

View file

@ -4,9 +4,11 @@
"use strict"; "use strict";
add_task(async function test_detected_language() { add_task(async function test_detected_language() {
const detectedLangTag = "en";
const { cleanup, tab } = await loadTestPage({ const { cleanup, tab } = await loadTestPage({
// This page will get its language changed by the test. // This page will get its language changed by the test.
page: ENGLISH_PAGE_URL, page: ENGLISH_PAGE_URL,
detectedLangTag,
autoDownloadFromRemoteSettings: true, autoDownloadFromRemoteSettings: true,
languagePairs: [ languagePairs: [
// Spanish // Spanish
@ -75,7 +77,7 @@ add_task(async function test_detected_language() {
Assert.deepEqual( Assert.deepEqual(
await getDetectedLanguagesFor("gibberish"), await getDetectedLanguagesFor("gibberish"),
{ {
docLangTag: "en", docLangTag: detectedLangTag,
userLangTag: null, userLangTag: null,
isDocLangTagSupported: true, isDocLangTagSupported: true,
}, },

View file

@ -8,9 +8,12 @@
* issues. * issues.
*/ */
add_task(async function test_detected_language() { add_task(async function test_detected_language() {
const detectedLangTag = "en";
const { cleanup, tab } = await loadTestPage({ const { cleanup, tab } = await loadTestPage({
// This page will get its language changed by the test. // This page will get its language changed by the test.
page: ENGLISH_PAGE_URL, page: ENGLISH_PAGE_URL,
detectedLangTag,
autoDownloadFromRemoteSettings: true, autoDownloadFromRemoteSettings: true,
// Empty out the accept languages. // Empty out the accept languages.
languagePairs: [ languagePairs: [

View file

@ -98,6 +98,10 @@ add_task(async function test_about_translations_enabled() {
add_task(async function test_language_identification_for_page_translation() { add_task(async function test_language_identification_for_page_translation() {
await autoTranslatePage({ await autoTranslatePage({
page: NO_LANGUAGE_URL, page: NO_LANGUAGE_URL,
detectedLangTag: "es",
detectedLanguageConfidence: 0.95,
resolveLanguageIdDownloads: true,
prefs: [["browser.translations.languageIdentification.useFastText", true]],
languagePairs: [ languagePairs: [
{ fromLang: "es", toLang: "en" }, { fromLang: "es", toLang: "en" },
{ fromLang: "en", toLang: "es" }, { fromLang: "en", toLang: "es" },

View file

@ -60,6 +60,14 @@ const NEVER_TRANSLATE_LANGS_PREF =
* @param {boolean} [options.disabled] * @param {boolean} [options.disabled]
* Disable the panel through a pref. * Disable the panel through a pref.
* *
* @param {number} [options.detectedLanguageConfidence]
* This is the value for the MockedLanguageIdEngine to give as a confidence score for
* the mocked detected language.
*
* @param {string} [options.detectedLangTag]
* This is the BCP 47 language tag for the MockedLanguageIdEngine to return as
* the mocked detected language.
*
* @param {Array<{ fromLang: string, toLang: string }>} options.languagePairs * @param {Array<{ fromLang: string, toLang: string }>} options.languagePairs
* The translation languages pairs to mock for the test. * The translation languages pairs to mock for the test.
* *
@ -70,6 +78,8 @@ async function openAboutTranslations({
dataForContent, dataForContent,
disabled, disabled,
runInPage, runInPage,
detectedLanguageConfidence,
detectedLangTag,
languagePairs = LANGUAGE_PAIRS, languagePairs = LANGUAGE_PAIRS,
prefs, prefs,
}) { }) {
@ -108,6 +118,8 @@ async function openAboutTranslations({
// TODO(Bug 1814168) - Do not test download behavior as this is not robustly // TODO(Bug 1814168) - Do not test download behavior as this is not robustly
// handled for about:translations yet. // handled for about:translations yet.
autoDownloadFromRemoteSettings: true, autoDownloadFromRemoteSettings: true,
detectedLangTag,
detectedLanguageConfidence,
}); });
// Now load the about:translations page, since the actor could be mocked. // Now load the about:translations page, since the actor could be mocked.
@ -117,7 +129,10 @@ async function openAboutTranslations({
); );
await BrowserTestUtils.browserLoaded(tab.linkedBrowser); await BrowserTestUtils.browserLoaded(tab.linkedBrowser);
await remoteClients.translationsWasm.resolvePendingDownloads(1); // Resolve the files.
await remoteClients.languageIdModels.resolvePendingDownloads(1);
// The language id and translation engine each have a wasm file, so expect 2 downloads.
await remoteClients.translationsWasm.resolvePendingDownloads(2);
await remoteClients.translationModels.resolvePendingDownloads( await remoteClients.translationModels.resolvePendingDownloads(
languagePairs.length * FILES_PER_LANGUAGE_PAIR languagePairs.length * FILES_PER_LANGUAGE_PAIR
); );
@ -389,6 +404,8 @@ async function closeTranslationsPanelIfOpen() {
async function setupActorTest({ async function setupActorTest({
languagePairs, languagePairs,
prefs, prefs,
detectedLanguageConfidence,
detectedLangTag,
autoDownloadFromRemoteSettings = false, autoDownloadFromRemoteSettings = false,
}) { }) {
await SpecialPowers.pushPrefEnv({ await SpecialPowers.pushPrefEnv({
@ -402,6 +419,8 @@ async function setupActorTest({
const { remoteClients, removeMocks } = await createAndMockRemoteSettings({ const { remoteClients, removeMocks } = await createAndMockRemoteSettings({
languagePairs, languagePairs,
detectedLangTag,
detectedLanguageConfidence,
autoDownloadFromRemoteSettings, autoDownloadFromRemoteSettings,
}); });
@ -429,6 +448,8 @@ async function setupActorTest({
async function createAndMockRemoteSettings({ async function createAndMockRemoteSettings({
languagePairs = LANGUAGE_PAIRS, languagePairs = LANGUAGE_PAIRS,
detectedLanguageConfidence = 0.5,
detectedLangTag = "en",
autoDownloadFromRemoteSettings = false, autoDownloadFromRemoteSettings = false,
}) { }) {
const remoteClients = { const remoteClients = {
@ -439,6 +460,9 @@ async function createAndMockRemoteSettings({
translationsWasm: await createTranslationsWasmRemoteClient( translationsWasm: await createTranslationsWasmRemoteClient(
autoDownloadFromRemoteSettings autoDownloadFromRemoteSettings
), ),
languageIdModels: await createLanguageIdModelsRemoteClient(
autoDownloadFromRemoteSettings
),
}; };
// The TranslationsParent will pull the language pair values from the JSON dump // The TranslationsParent will pull the language pair values from the JSON dump
@ -450,13 +474,23 @@ async function createAndMockRemoteSettings({
remoteClients.translationsWasm.client remoteClients.translationsWasm.client
); );
TranslationsParent.mockLanguageIdentification(
detectedLangTag,
detectedLanguageConfidence,
remoteClients.languageIdModels.client
);
return { return {
async removeMocks() { async removeMocks() {
await remoteClients.translationModels.client.attachments.deleteAll(); await remoteClients.translationModels.client.attachments.deleteAll();
await remoteClients.translationsWasm.client.attachments.deleteAll();
await remoteClients.languageIdModels.client.attachments.deleteAll();
await remoteClients.translationModels.client.db.clear(); await remoteClients.translationModels.client.db.clear();
await remoteClients.translationsWasm.client.db.clear(); await remoteClients.translationsWasm.client.db.clear();
await remoteClients.languageIdModels.client.db.clear();
TranslationsParent.unmockTranslationsEngine(); TranslationsParent.unmockTranslationsEngine();
TranslationsParent.unmockLanguageIdentification();
TranslationsParent.clearCache(); TranslationsParent.clearCache();
}, },
remoteClients, remoteClients,
@ -466,6 +500,8 @@ async function createAndMockRemoteSettings({
async function loadTestPage({ async function loadTestPage({
languagePairs, languagePairs,
autoDownloadFromRemoteSettings = false, autoDownloadFromRemoteSettings = false,
detectedLanguageConfidence,
detectedLangTag,
page, page,
prefs, prefs,
autoOffer, autoOffer,
@ -506,6 +542,8 @@ async function loadTestPage({
const { remoteClients, removeMocks } = await createAndMockRemoteSettings({ const { remoteClients, removeMocks } = await createAndMockRemoteSettings({
languagePairs, languagePairs,
detectedLanguageConfidence,
detectedLangTag,
autoDownloadFromRemoteSettings, autoDownloadFromRemoteSettings,
}); });
@ -544,6 +582,11 @@ async function loadTestPage({
); );
}, },
async resolveLanguageIdDownloads() {
await remoteClients.translationsWasm.resolvePendingDownloads(1);
await remoteClients.languageIdModels.resolvePendingDownloads(1);
},
/** /**
* @returns {Promise<void>} * @returns {Promise<void>}
*/ */
@ -817,7 +860,7 @@ async function createTranslationModelsRemoteClient(
async function createTranslationsWasmRemoteClient( async function createTranslationsWasmRemoteClient(
autoDownloadFromRemoteSettings autoDownloadFromRemoteSettings
) { ) {
const records = ["bergamot-translator"].map(name => ({ const records = ["bergamot-translator", "fasttext-wasm"].map(name => ({
id: crypto.randomUUID(), id: crypto.randomUUID(),
name, name,
version: "1.0", version: "1.0",
@ -843,6 +886,43 @@ async function createTranslationsWasmRemoteClient(
); );
} }
/**
 * Builds a RemoteSettings client for tests that is pre-populated with a single
 * language-id model record ("lid.176.ftz"), then wraps it with
 * `createAttachmentMock`.
 *
 * Each call uses a distinct client name (a shared counter is appended), while
 * the collection name given to the attachment mock is always the same.
 *
 * @param {boolean} autoDownloadFromRemoteSettings
 *   Forwarded unchanged to `createAttachmentMock`.
 * @returns {Promise<object>}
 *   Whatever `createAttachmentMock` returns for the new client.
 */
async function createLanguageIdModelsRemoteClient(
  autoDownloadFromRemoteSettings
) {
  const mockedCollectionName = "test-language-id-models";

  const languageIdRecord = {
    id: crypto.randomUUID(),
    name: "lid.176.ftz",
    version: "1.0",
    last_modified: Date.now(),
    schema: Date.now(),
  };

  const { RemoteSettings } = ChromeUtils.importESModule(
    "resource://services-settings/remote-settings.sys.mjs"
  );
  const client = RemoteSettings(
    `${mockedCollectionName}${_remoteSettingsMockId++}`
  );

  // Start from an empty local database so only the mocked record is present.
  await client.db.clear();
  await client.db.importChanges({}, Date.now(), [languageIdRecord]);

  return createAttachmentMock(
    client,
    mockedCollectionName,
    autoDownloadFromRemoteSettings
  );
}
async function selectAboutPreferencesElements() { async function selectAboutPreferencesElements() {
const document = gBrowser.selectedBrowser.contentDocument; const document = gBrowser.selectedBrowser.contentDocument;

View file

@ -24,6 +24,25 @@ export interface Attachment {
mimetype: string; mimetype: string;
} }
/**
* The JSON that is synced from Remote Settings for the language-id models.
* Each record describes one model file, which is carried as an attachment.
*/
export interface LanguageIdModelRecord {
// The record's identifier, e.g. "0d4db293-a17c-4085-9bd8-e2e146c85000"
id: string;
// The full model name, e.g. "lid.176.ftz"
name: string;
// The semver number, used for handling future format changes. e.g. "1.0"
version: string;
// The file attachment for this record
attachment: Attachment;
// e.g. 1673455932527
// NOTE(review): the example is numeric while the declared type is string; confirm which is intended.
last_modified: string;
// A JEXL expression to determine whether this record should be pulled from Remote Settings
// See: https://remote-settings.readthedocs.io/en/latest/target-filters.html#filter-expressions
filter_expression: string;
}
/** /**
* The JSON that is synced from Remote Settings for the translation models. * The JSON that is synced from Remote Settings for the translation models.
*/ */
@ -242,6 +261,18 @@ interface TranslationsEnginePayload {
isMocked: boolean, isMocked: boolean,
} }
/**
* These are the files that are downloaded from Remote Settings that are necessary
* to start the language-identification engine. These may not be available if running
* in tests.
*/
interface LanguageIdEnginePayload {
// The bytes of the wasm binary for the engine.
wasmBuffer: ArrayBuffer,
// The bytes of the language-id model file.
modelBuffer: ArrayBuffer,
// Presumably the confidence to report when the engine is mocked in tests, null otherwise. TODO confirm
mockedConfidence: null | number,
// Presumably the language tag to report when the engine is mocked in tests, null otherwise. TODO confirm
mockedLangTag: null | string,
}
/** /**
* Nodes that are being translated are given priority according to their visibility. * Nodes that are being translated are given priority according to their visibility.
*/ */

View file

@ -3657,6 +3657,8 @@ SOFTWARE.
<li><code>third_party/js/cfworker/json-schema.js</code></li> <li><code>third_party/js/cfworker/json-schema.js</code></li>
<li><code>security/nss/lib/freebl/ecl/ecp_secp384r1.c</code> and <li><code>security/nss/lib/freebl/ecl/ecp_secp384r1.c</code> and
<code>security/nss/lib/freebl/ecl/ecp_secp521r1.c</code></li> <code>security/nss/lib/freebl/ecl/ecp_secp521r1.c</code></li>
<li><code>toolkit/components/translations/fasttext/fasttext.js</code> and
<code>toolkit/components/translations/fasttext/fasttext_wasm.js</code></li>
<li><code>security/nss/lib/freebl/ecl/curve25519_32.c</code>, <li><code>security/nss/lib/freebl/ecl/curve25519_32.c</code>,
<code>security/nss/lib/freebl/ecl/ecp_secp384r1.c</code> and <code>security/nss/lib/freebl/ecl/ecp_secp384r1.c</code> and
<code>security/nss/lib/freebl/ecl/ecp_secp521r1.c</code></li> <code>security/nss/lib/freebl/ecl/ecp_secp521r1.c</code></li>

View file

@ -180,6 +180,8 @@ toolkit/components/passwordmgr/PasswordRulesParser.sys.mjs
toolkit/components/protobuf/ toolkit/components/protobuf/
toolkit/components/translation/cld2/ toolkit/components/translation/cld2/
toolkit/components/translations/bergamot-translator toolkit/components/translations/bergamot-translator
toolkit/components/translations/fasttext/fasttext.js
toolkit/components/translations/fasttext/fasttext_wasm.js
toolkit/components/url-classifier/chromium/ toolkit/components/url-classifier/chromium/
toolkit/components/utils/mozjexl.js toolkit/components/utils/mozjexl.js
toolkit/components/viaduct/fetch_msg_types.pb.cc toolkit/components/viaduct/fetch_msg_types.pb.cc