forked from mirrors/gecko-dev
		
	
		
			
				
	
	
		
			192 lines
		
	
	
	
		
			6.3 KiB
		
	
	
	
		
			JavaScript
		
	
	
	
	
	
			
		
		
	
	
			192 lines
		
	
	
	
		
			6.3 KiB
		
	
	
	
		
			JavaScript
		
	
	
	
	
	
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this file,
 * You can obtain one at http://mozilla.org/MPL/2.0/. */
 | 
						|
 | 
						|
// workerManager is exported for tests.
 | 
						|
import { clearTimeout, setTimeout } from "resource://gre/modules/Timer.sys.mjs";
 | 
						|
 | 
						|
// Location of the worker script that workerManager loads to perform language
// detection.
const WORKER_URL = "resource://gre/modules/translation/cld-worker.js";
 | 
						|
 | 
						|
/**
 * The length of the substring to pull from the document's text for language
 * identification.
 *
 * This value should ideally be one that is large enough to yield a confident
 * identification result without being too large or expensive to extract.
 *
 * At this time, this value is not driven by statistical data or analysis.
 *
 * For the moment, while we investigate which language identification library
 * we would like to use, keep this logic in sync with language-id-engine.sys.mjs
 */
 | 
						|
const DOC_TEXT_TO_IDENTIFY_LENGTH = 1024;
 | 
						|
 | 
						|
/**
 * Owns the language-detection worker: creates it lazily, forwards detection
 * requests to it, matches replies to requests in FIFO order, and tears the
 * worker down again once it has been idle after processing a large input.
 */
export var workerManager = {
  // Since Emscripten can handle heap growth, but not heap shrinkage, we
  // need to refresh the worker after we've processed a particularly large
  // string in order to prevent unnecessary resident memory growth.
  //
  // These values define the cut-off string length and the idle timeout
  // (in milliseconds) before destroying a worker. Once a string of the
  // maximum size has been processed, the worker is marked for
  // destruction, and is terminated as soon as it has been idle for the
  // given timeout.
  //
  // 1.5MB. This is the approximate string length that forces heap growth
  // for a 2MB heap.
  LARGE_STRING: 1.5 * 1024 * 1024,
  IDLE_TIMEOUT: 10 * 1000,

  // FIFO of `{ resolve, reject }` entries, one per request that has been
  // posted to the worker and is still waiting for its reply. Replies are
  // matched to requests purely by order.
  detectionQueue: [],

  /**
   * Sends a detection request to the worker.
   *
   * @param {object} aParams
   *   The message posted to the worker. Its `text` property is also read
   *   here to decide whether the worker should be recycled.
   * @returns {Promise<*>}
   *   Resolves with the data of the worker's reply. Rejects if the worker
   *   cannot be created, if the request cannot be posted, or if the worker
   *   reports an error while the request is pending.
   */
  detectLanguage(aParams) {
    return this.workerReady
      .then(worker => {
        return new Promise((resolve, reject) => {
          const entry = { resolve, reject };
          this.detectionQueue.push(entry);
          try {
            worker.postMessage(aParams);
          } catch (ex) {
            // The worker never saw this request, so no reply will arrive
            // for it. Drop the entry again; otherwise the next reply would
            // be handed to this (already failed) request and every later
            // caller would wait forever.
            const index = this.detectionQueue.indexOf(entry);
            if (index !== -1) {
              this.detectionQueue.splice(index, 1);
            }
            throw ex;
          }
        });
      })
      .then(result => {
        // We have our asynchronous result from the worker.
        //
        // Determine if our input was large enough to trigger heap growth,
        // or if we're already waiting to destroy the worker when it's
        // idle. If so, schedule termination after the idle timeout.
        if (
          aParams.text.length >= this.LARGE_STRING ||
          this._idleTimeout != null
        ) {
          this.flushWorker();
        }

        return result;
      });
  },

  // The live worker, or null when none exists.
  _worker: null,
  // Cached promise for the live worker; null when no worker exists.
  _workerReadyPromise: null,

  /**
   * Promise for a worker that has announced itself with a "ready" message.
   * The worker is created on first access and re-created after it has been
   * terminated.
   *
   * @returns {Promise<Worker>}
   */
  get workerReady() {
    if (!this._workerReadyPromise) {
      let worker;
      try {
        worker = new Worker(WORKER_URL);
      } catch (ex) {
        // Do not cache the failure, so that a later call can retry.
        return Promise.reject(ex);
      }

      this._worker = worker;
      this._workerReadyPromise = new Promise((resolve, reject) => {
        worker.onmessage = aMsg => {
          if (aMsg.data == "ready") {
            resolve(worker);
            return;
          }
          // Any other message is the reply to the oldest pending request.
          // A reply with nothing pending is ignored rather than throwing.
          const pending = this.detectionQueue.shift();
          pending?.resolve(aMsg.data);
        };
        worker.onerror = aEvent => {
          const error = new Error(
            `Language detection worker failed: ${
              aEvent?.message ?? "unknown error"
            }`
          );
          // Only has an effect if the worker failed before becoming ready.
          reject(error);
          this._abandonWorker(worker, error);
        };
      });
    }

    return this._workerReadyPromise;
  },

  /**
   * Discards a worker that reported an error. After an error the
   * reply-by-order matching can no longer be trusted, so every pending
   * request is rejected and the next request starts a fresh worker.
   *
   * @param {Worker} aWorker The worker that failed.
   * @param {Error} aError The reason handed to the pending requests.
   */
  _abandonWorker(aWorker, aError) {
    if (this._worker !== aWorker) {
      // A worker we already replaced or terminated; nothing to clean up.
      return;
    }

    aWorker.terminate();
    this._worker = null;
    this._workerReadyPromise = null;

    if (this._idleTimeout != null) {
      clearTimeout(this._idleTimeout);
      this._idleTimeout = null;
    }

    // splice() empties the array in place, keeping its identity.
    for (const pending of this.detectionQueue.splice(0)) {
      pending.reject?.(aError);
    }
  },

  // Holds the ID of the current pending idle cleanup setTimeout.
  _idleTimeout: null,

  // Schedule the current worker to be terminated after the idle timeout.
  // Calling this again restarts the countdown.
  flushWorker() {
    if (this._idleTimeout != null) {
      clearTimeout(this._idleTimeout);
    }

    this._idleTimeout = setTimeout(
      this._flushWorker.bind(this),
      this.IDLE_TIMEOUT
    );
  },

  // Immediately terminate the worker, as long as there are no pending
  // results. Otherwise, reschedule termination until after the next
  // idle timeout.
  _flushWorker() {
    if (this.detectionQueue.length) {
      this.flushWorker();
      return;
    }

    if (this._worker) {
      this._worker.terminate();
    }

    this._worker = null;
    this._workerReadyPromise = null;
    this._idleTimeout = null;
  },
};
 | 
						|
 | 
						|
export var LanguageDetector = {
  /**
   * Detect the language of a given string.
   *
   * The argument may be either a string containing the text to analyze,
   * or an object with the following properties:
   *
   *  - 'text' The text to analyze.
   *
   *  - 'isHTML' (optional) A boolean, indicating whether the text
   *      should be analyzed as HTML rather than plain text.
   *
   *  - 'language' (optional) A string indicating the expected language.
   *      For text extracted from HTTP documents, this is expected to
   *      come from the Content-Language header.
   *
   *  - 'tld' (optional) A string indicating the top-level domain of the
   *      document the text was extracted from.
   *
   *  - 'encoding' (optional) A string describing the encoding of the
   *      document the string was extracted from. Note that, regardless
   *      of the value of this property, the 'text' property must be a
   *      UTF-16 JavaScript string.
   *
   * @returns {Promise<Object>}
   * @resolves When detection is finished, with an object containing
   * these fields:
   *  - 'language' (string with a language code)
   *  - 'confident' (boolean) Whether the detector is confident of the
   *      result.
   *  - 'languages' (array) An array of up to three elements, containing
   *      the most prevalent languages detected. It contains a
   *      'languageCode' property, containing the ISO language code of
   *      the language, and a 'percent' property, describing the
   *      approximate percentage of the input which is in that language.
   *      For text of an unknown language, the result may contain an
   *      entry with the language code 'un', indicating the percent of
   *      the text which is unknown.
   */
  detectLanguage(aParams) {
    // A bare string is shorthand for `{ text: <string> }`.
    const request = typeof aParams === "string" ? { text: aParams } : aParams;
    return workerManager.detectLanguage(request);
  },

  /**
   * Attempts to determine the language in which the document's content is written.
   *
   * For the moment, while we investigate which language identification library
   * we would like to use, keep this logic in sync with language-id-engine.sys.mjs
   *
   * @param {Document} aDocument The document whose text is sampled.
   * @returns {Promise<string | null>}
   *   The detected language when the detector reports a confident result,
   *   otherwise null.
   */
  async detectLanguageFromDocument(aDocument) {
    // Serialize the document as plain text, leaving out invisible content.
    const textEncoder = Cu.createDocumentEncoder("text/plain");
    textEncoder.init(aDocument, "text/plain", textEncoder.SkipInvisibleContent);

    // Take only the leading sample and flatten it onto a single line.
    const sample = textEncoder.encodeToStringWithMaxLength(
      DOC_TEXT_TO_IDENTIFY_LENGTH
    );
    const text = sample.replaceAll("\r", "").replaceAll("\n", " ");

    const { language: detected, confident: isConfident } =
      await workerManager.detectLanguage({ text });

    // Schedule the worker for termination once it has been idle.
    workerManager.flushWorker();

    if (!isConfident) {
      return null;
    }
    return detected;
  },
};
 |