Bug 1897117 - Change the request structure for the ML image-to-text pipeline - r=gregtatum

Differential Revision: https://phabricator.services.mozilla.com/D210659
Tarek Ziadé 2024-05-16 16:56:18 +00:00
parent acb920569c
commit 175bc1a8f6
2 changed files with 16 additions and 18 deletions

@@ -62,9 +62,11 @@ async function echo(request, _model, _tokenizer, _processor) {
  *
  * @async
  * @param {object} request - The request object containing image data.
- * @param {string} [request.imageUrl] - The URL of the image to process. Either `imageUrl` or `data` must be provided, but not both.
- * @param {ArrayBuffer} [request.data] - The raw image data to process. Either `data` or `imageUrl` must be provided, but not both.
- * @param {string} request.mimeType - The MIME type of the image data.
+ * @param {string} [request.url] - The URL of the image to process. If `url` is not provided, other fields are used.
+ * @param {ArrayBuffer} [request.data] - The raw image data to process. Ignored if `url` is provided.
+ * @param {number} [request.width] - The image width. Ignored if `url` is provided.
+ * @param {number} [request.height] - The image height. Ignored if `url` is provided.
+ * @param {number} [request.channels] - The image channels. Can be 1, 2, 3 or 4. Defaults to 4. Ignored if `url` is provided.
  * @param {object} model - The model used for inference.
  * @param {object} tokenizer - The tokenizer used for decoding.
  * @param {object} processor - The processor used for preparing image data.
@@ -80,11 +82,15 @@ async function imageToText(request, model, tokenizer, processor) {
   let start = Date.now();
   let rawImage;
-  if ("imageUrl" in request) {
-    rawImage = await RawImage.fromUrl(request.imageUrl);
+  if ("url" in request) {
+    rawImage = await RawImage.fromURL(request.url);
   } else {
-    const blob = new Blob([request.data], { type: request.mimeType });
-    rawImage = await RawImage.fromBlob(blob);
+    rawImage = new RawImage(
+      request.data,
+      request.width,
+      request.height,
+      request.channels || 4
+    );
   }
   debug("Image loaded in ", Date.now() - start);

@@ -21,18 +21,10 @@ In the example below, an image is converted to text using the `image-to-text` ta
 // We then create the engine object, using the options
 const engine = engineParent.getEngine(options);
-// At this point we are ready to do some inference.
-// We need to get the image as an array buffer and wrap it into a request object
-const response = await fetch("https://huggingface.co/datasets/mishig/sample_images/resolve/main/football-match.jpg");
-const buffer = await response.arrayBuffer();
-const mimeType = response.headers.get('Content-Type');
-const request = {
-  data: buffer,
-  mimeType: mimeType
-};
+// Preparing a request
+const request = {url: "https://huggingface.co/datasets/mishig/sample_images/resolve/main/football-match.jpg"};
 // Finally, we run the engine with the request object
 const res = await engine.run(request);
 // The result is a string containing the text extracted from the image
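
For anyone migrating existing callers, the request now takes one of two shapes, summarized in the sketch below; the `pixels` buffer and the 640x480 dimensions are placeholders for illustration, not values from this patch.

// Shape 1: pass a URL and let the worker fetch and decode the image.
const byUrl = {
  url: "https://huggingface.co/datasets/mishig/sample_images/resolve/main/football-match.jpg",
};

// Shape 2: pass already-decoded pixels; mimeType is no longer part of the request.
// `pixels` is assumed to be an ArrayBuffer of RGBA data; 640x480 are placeholders.
const byData = { data: pixels, width: 640, height: 480, channels: 4 };

const result = await engine.run(byUrl); // or: await engine.run(byData)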