gecko-dev/browser/components/genai/LinkPreviewChild.sys.mjs

/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

const lazy = {};
ChromeUtils.defineESModuleGetters(lazy, {
  NetUtil: "resource://gre/modules/NetUtil.sys.mjs",
  ReaderMode: "moz-src:///toolkit/components/reader/ReaderMode.sys.mjs",
  Readerable: "resource://gre/modules/Readerable.sys.mjs",
  isProbablyReaderable: "resource://gre/modules/Readerable.sys.mjs",
});

/**
 * Represents a child actor for handling link previews in the browser.
 * Interacts with content windows and handles events related to link previews.
 *
 * @class LinkPreviewChild
 * @augments {JSWindowActorChild}
 */
export class LinkPreviewChild extends JSWindowActorChild {
  /**
   * Handles incoming messages from the parent actor.
   *
   * @param {object} message - The message object containing name and data.
   * @param {string} message.name - The name of the message.
   * @param {object} message.data - Data associated with the message.
   * @returns {Promise<object>|undefined} The result of fetchPageData if applicable.
   */
  async receiveMessage({ name, data }) {
    if (name === "LinkPreview:FetchPageData") {
      return this.fetchPageData(data.url);
    }
    //expected a return value.  consistent-return (eslint)
    return undefined;
  }

  /**
   * Fetches the HTML content from the given URL.
   *
   * @param {string} url - The URL to fetch.
   * @returns {Promise<string>} The HTML content as a string.
   * @throws {Error} If the fetch fails or the content type is invalid.
   */
  fetchHTML(url) {
    const uri = lazy.NetUtil.newURI(url);
    if (!uri.schemeIs("https")) {
      throw Components.Exception(
        "Only handling https",
        Cr.NS_ERROR_UNKNOWN_PROTOCOL
      );
    }

    // Make requests with a channel to automatically get safe browsing checks.
    // Use null principals in combination with anonymous for now ahead of
    // fetching content with cookies to handle sites requiring login.
    const principal = Services.scriptSecurityManager.createNullPrincipal({});
    const channel = lazy.NetUtil.newChannel({
      contentPolicyType: Ci.nsIContentPolicy.TYPE_DOCUMENT,
      loadingPrincipal: principal,
      securityFlags: Ci.nsILoadInfo.SEC_ALLOW_CROSS_ORIGIN_INHERITS_SEC_CONTEXT,
      triggeringPrincipal: principal,
      uri,
    }).QueryInterface(Ci.nsIHttpChannel);
    channel.loadFlags = Ci.nsIRequest.LOAD_ANONYMOUS;

    // Specially identify this request, e.g., for publishers to opt out
    channel.setRequestHeader("x-firefox-ai", "1", false);

    const { promise, resolve, reject } = Promise.withResolvers();
    const MAX_CONTENT_LENGTH = 5 * 1024 * 1024; // 5 MB limit

    let charset = "utf-8";
    const byteChunks = [];
    let totalLength = 0;
    channel.asyncOpen({
      onDataAvailable(request, stream, offset, count) {
        totalLength += count;
        if (totalLength > MAX_CONTENT_LENGTH) {
          request.cancel(Cr.NS_ERROR_FILE_TOO_BIG);
        } else {
          byteChunks.push(lazy.NetUtil.readInputStream(stream, count));
        }
      },
      onStartRequest(request) {
        const http = request.QueryInterface(Ci.nsIHttpChannel);

        // Enforce text/html if provided by server
        let contentType = "";
        try {
          contentType = http.getResponseHeader("content-type");
        } catch (ex) {}
        if (contentType && !contentType.startsWith("text/html")) {
          request.cancel(Cr.NS_ERROR_FILE_UNKNOWN_TYPE);
        }

        // Save charset without quotes or spaces for TextDecoder
        const match = contentType.match(/charset=["' ]*([^;"' ]+)/i);
        if (match) {
          charset = match[1];
        }

        // Enforce max length if provided by server
        try {
          if (http.getResponseHeader("content-length") > MAX_CONTENT_LENGTH) {
            request.cancel(Cr.NS_ERROR_FILE_TOO_BIG);
          }
        } catch (ex) {}
      },
      onStopRequest(_request, status) {
        if (Components.isSuccessCode(status)) {
          const bytes = new Uint8Array(totalLength);
          let offset = 0;
          for (const chunk of byteChunks) {
            bytes.set(new Uint8Array(chunk), offset);
            offset += chunk.byteLength;
          }

          const decoder = new TextDecoder(charset);
          resolve(decoder.decode(bytes));
        } else {
          reject(Components.Exception("Failed to fetch HTML", status));
        }
      },
    });
    return promise;
  }

  /**
   * Fetches HTML content from a URL and parses its meta tags and page text.
   *
   * @param {string} url - The URL to fetch and parse.
   * @returns {Promise<object>} An object containing meta information, page text, and HTML code.
   */
  async fetchPageData(url) {
    const ret = {
      article: {},
      rawMetaInfo: {},
      url,
    };
    try {
      const htmlCode = await this.fetchHTML(url);
      ret.urlComponents = this.extractUrlComponents(url);

      const parser = new DOMParser();
      const doc = parser.parseFromString(htmlCode, "text/html");
      ret.rawMetaInfo = this.parseMetaTagsFromDoc(doc);

      if (
        !lazy.Readerable.shouldCheckUri(lazy.NetUtil.newURI(url)) ||
        !lazy.isProbablyReaderable(doc)
      ) {
        // Add normalized metadata even if the document isn't reader-able
        ret.meta = this.extractNormalizedMetadata(ret.rawMetaInfo);
        return ret;
      }

      ret.article = await this.getArticleDataFromDoc(doc);

      ret.meta = this.extractNormalizedMetadata(ret.rawMetaInfo, ret.article);
    } catch (error) {
      console.error(`Failed to fetch and parse page data: ${error}`);
      ret.error = { message: error.message, result: error.result };
      // Add empty normalized metadata in case of error
      ret.meta = this.extractNormalizedMetadata();
    }
    return ret;
  }

  /**
   * Extracts and normalizes metadata from the page's meta tags and article content.
   *
   * @param {object} metaData - Metadata extracted from the page's meta tags (Open Graph, Twitter, HTML)
   * @param {object} articleData - Data extracted from the article content using ReaderMode
   * @returns {object} Normalized metadata containing:
   *   - title: Page title prioritizing Open Graph, Twitter, then HTML title
   *   - description: Content excerpt or meta description from various sources
   *   - imageUrl: HTTPS-only URL of the page's primary image
   *   - isMissingMetadata: Boolean flag indicating if description is missing
   */
  extractNormalizedMetadata(metaData = {}, articleData = {}) {
    const title =
      metaData["og:title"] ||
      metaData["twitter:title"] ||
      metaData["html:title"] ||
      "";

    const description =
      articleData.excerpt ||
      metaData["og:description"] ||
      metaData["twitter:description"] ||
      metaData.description ||
      "";

    let imageUrl = metaData["og:image"] || metaData["twitter:image:src"] || "";

    if (!imageUrl.startsWith("https://")) {
      imageUrl = "";
    }

    return {
      title,
      description,
      imageUrl,
    };
  }

  /**
   * Extracts URL components including domain and filename.
   *
   * @param {string} url - The URL to extract information from.
   * @returns {object} Object containing domain and filename.
   */
  extractUrlComponents(url) {
    try {
      const urlObj = new URL(url);
      const domain = urlObj.hostname;

      // Extract the filename (last part of pathname)
      let pathname = urlObj.pathname;
      // Remove trailing slash if present
      if (pathname.endsWith("/")) {
        pathname = pathname.slice(0, -1);
      }

      // Get last segment of path
      const pathParts = pathname.split("/");
      const filename = pathParts[pathParts.length - 1] || domain;

      return { domain, filename };
    } catch (e) {
      // Return both properties with same fallback value if URL is invalid
      return { domain: url, filename: url };
    }
  }

  /**
   * Parses meta tags from the provided Document into a key-value object.
   * Also extracts the title if available.
   *
   * @param {Document} doc - The parsed HTML document.
   * @returns {object} An object containing meta tag key-value pairs.
   */
  parseMetaTagsFromDoc(doc) {
    const metaTags = doc.querySelectorAll("meta");
    const metaInfo = {};

    // TODO: Define the meta tags we are interested in
    const desiredMetaNames = [
      "description",
      "og:image",
      "title",
      "og:title",
      "twitter:title",
      "og:description",
      "twitter:description",
      "twitter:image:src",
    ];

    metaTags.forEach(tag => {
      const name = tag.getAttribute("name") || tag.getAttribute("property");
      const content = tag.getAttribute("content");
      if (name && content) {
        if (desiredMetaNames.includes(name.toLowerCase())) {
          metaInfo[name] = content;
        }
      }
    });

    const title = doc.querySelector("title")?.textContent;
    if (title) {
      metaInfo["html:title"] = title;
    }

    return metaInfo;
  }

  /**
   * Extracts article data from the provided Document using ReaderMode.
   *
   * @param {Document} doc - The parsed HTML document.
   * @returns {Promise<object>} The extracted article data including specified fields.
   */
  async getArticleDataFromDoc(doc) {
    try {
      const article = await lazy.ReaderMode.parseDocument(doc);
      if (article) {
        const {
          title,
          byline,
          content,
          detectedLanguage,
          length,
          siteName,
          excerpt,
          readingTimeMinsSlow,
          readingTimeMinsFast,
        } = article;

        // parseDocument return a `textContent` that strips structure and newlines, which we need for the model.
        // So we convert the HTML `content` to plain text directly, preserving formatting and newlines.
        const textContent = Cc["@mozilla.org/parserutils;1"]
          .getService(Ci.nsIParserUtils)
          .convertToPlainText(
            content,
            null,
            0 // No line-wrapping
          );

        return {
          title,
          byline,
          textContent,
          detectedLanguage,
          length,
          siteName,
          excerpt,
          readingTimeMinsFast,
          readingTimeMinsSlow,
        };
      }
    } catch (error) {
      console.error("Error parsing document with ReaderMode:", error);
    }

    return {};
  }
}