From 22d4bb007403762128e1f3f09c604b1f99cfbc42 Mon Sep 17 00:00:00 2001 From: Kris Maglione Date: Fri, 17 Aug 2018 23:10:59 -0700 Subject: [PATCH] Bug 1484413: Split ReaderMode.jsm into separate checking and loading components. r=Gijs Most of the ReaderMode.jsm and Readability.js code is only needed when we actually need to render a document in reader mode, but also winds up loaded into any process where we ever check if a page is readerable. This winds up wasting a huge amount of memory (and probably a huge amount of CPU time) loading code which is almost never used. This patch splits ReaderMode.jsm into two modules, one for checking readability, one for actually entering reader mode. It also separates out the isProbablyReaderable checks from Readability.js, since the overhead of loading that script before it's needed is unsupportable. This means we're probably going to need some effort to keep Readerable.jsm and Readability.js in sync, but the code in question is pretty trivial, so it shouldn't be too difficult. Differential Revision: https://phabricator.services.mozilla.com/D3687 --HG-- rename : toolkit/components/reader/Readability.js => toolkit/components/reader/Readability-readerable.js rename : toolkit/components/reader/ReaderMode.jsm => toolkit/components/reader/Readerable.js extra : rebase_source : 66712057591ae20dd66234e3dc78fbba90a6914e extra : amend_source : f908f62f49ea54b9099ddb87d9f2fc11f12d4dee --- .eslintignore | 1 + browser/actors/AboutReaderChild.jsm | 6 +- .../performance/browser_startup_content.js | 2 +- mobile/android/chrome/content/content.js | 6 +- .../reader/Readability-readerable.js | 96 ++++++++++++++ toolkit/components/reader/Readability.js | 69 +--------- toolkit/components/reader/ReaderMode.jsm | 124 ++---------------- toolkit/components/reader/Readerable.js | 79 +++++++++++ toolkit/components/reader/Readerable.jsm | 10 ++ toolkit/components/reader/moz.build | 8 +- 10 files changed, 213 insertions(+), 188 deletions(-) create mode 100644 toolkit/components/reader/Readability-readerable.js create mode 100644 toolkit/components/reader/Readerable.js create mode 100644 toolkit/components/reader/Readerable.jsm diff --git a/.eslintignore b/.eslintignore index 4d1468b70833..d70943eafaff 100644 --- a/.eslintignore +++ b/.eslintignore @@ -370,6 +370,7 @@ toolkit/components/reader/Readability.js toolkit/components/reader/JSDOMParser.js # Uses preprocessing +toolkit/components/reader/Readerable.jsm toolkit/content/widgets/wizard.xml toolkit/modules/AppConstants.jsm toolkit/mozapps/update/tests/data/xpcshellConstantsPP.js diff --git a/browser/actors/AboutReaderChild.jsm b/browser/actors/AboutReaderChild.jsm index 35300c8b7700..0981e7bcf60f 100644 --- a/browser/actors/AboutReaderChild.jsm +++ b/browser/actors/AboutReaderChild.jsm @@ -12,6 +12,8 @@ ChromeUtils.defineModuleGetter(this, "AboutReader", "resource://gre/modules/AboutReader.jsm"); ChromeUtils.defineModuleGetter(this, "ReaderMode", "resource://gre/modules/ReaderMode.jsm"); +ChromeUtils.defineModuleGetter(this, "Readerable", + "resource://gre/modules/Readerable.jsm"); class AboutReaderChild extends ActorChild { constructor(mm) { @@ -102,7 +104,7 @@ class AboutReaderChild extends ActorChild { * painted is not going to work. */ updateReaderButton(forceNonArticle) { - if (!ReaderMode.isEnabledForParseOnLoad || this.isAboutReader || + if (!Readerable.isEnabledForParseOnLoad || this.isAboutReader || !this.content || !(this.content.document instanceof this.content.HTMLDocument) || this.content.document.mozSyntheticDocument) { return; @@ -141,7 +143,7 @@ class AboutReaderChild extends ActorChild { this.cancelPotentialPendingReadabilityCheck(); // Only send updates when there are articles; there's no point updating with // |false| all the time. - if (ReaderMode.isProbablyReaderable(this.content.document)) { + if (Readerable.isProbablyReaderable(this.content.document)) { this.mm.sendAsyncMessage("Reader:UpdateReaderButton", { isArticle: true }); } else if (forceNonArticle) { this.mm.sendAsyncMessage("Reader:UpdateReaderButton", { isArticle: false }); diff --git a/browser/base/content/test/performance/browser_startup_content.js b/browser/base/content/test/performance/browser_startup_content.js index 76c9aaac185e..ed35685b88ce 100644 --- a/browser/base/content/test/performance/browser_startup_content.js +++ b/browser/base/content/test/performance/browser_startup_content.js @@ -56,7 +56,7 @@ const whitelist = { "resource://gre/modules/ActorChild.jsm", "resource://gre/modules/ActorManagerChild.jsm", "resource://gre/modules/E10SUtils.jsm", - "resource://gre/modules/ReaderMode.jsm", + "resource://gre/modules/Readerable.jsm", "resource://gre/modules/WebProgressChild.jsm", // Pocket diff --git a/mobile/android/chrome/content/content.js b/mobile/android/chrome/content/content.js index 31108508aad8..7d4f66ef1b79 100644 --- a/mobile/android/chrome/content/content.js +++ b/mobile/android/chrome/content/content.js @@ -8,6 +8,7 @@ ChromeUtils.import("resource://gre/modules/XPCOMUtils.jsm"); ChromeUtils.defineModuleGetter(this, "AboutReader", "resource://gre/modules/AboutReader.jsm"); ChromeUtils.defineModuleGetter(this, "ReaderMode", "resource://gre/modules/ReaderMode.jsm"); +ChromeUtils.defineModuleGetter(this, "Readerable", "resource://gre/modules/Readerable.jsm"); ChromeUtils.defineModuleGetter(this, "LoginManagerContent", "resource://gre/modules/LoginManagerContent.jsm"); XPCOMUtils.defineLazyGetter(this, "gPipNSSBundle", function() { @@ -486,7 +487,7 @@ var AboutReaderListener = { // Do not show Reader View icon on error pages (bug 1320900) if (this.isErrorPage) { sendAsyncMessage("Reader:UpdateReaderButton", { isArticle: false }); - } else if (!ReaderMode.isEnabledForParseOnLoad || this.isAboutReader || + } else if (!Readerable.isEnabledForParseOnLoad || this.isAboutReader || !(content.document instanceof content.HTMLDocument) || content.document.mozSyntheticDocument) { @@ -522,11 +523,12 @@ var AboutReaderListener = { return; } + Services.console.logStringMessage(`ON PAINT WHEN WAITED FOR\n`); this.cancelPotentialPendingReadabilityCheck(); // Only send updates when there are articles; there's no point updating with // |false| all the time. - if (ReaderMode.isProbablyReaderable(content.document)) { + if (Readerable.isProbablyReaderable(content.document)) { sendAsyncMessage("Reader:UpdateReaderButton", { isArticle: true }); } else if (forceNonArticle) { sendAsyncMessage("Reader:UpdateReaderButton", { isArticle: false }); diff --git a/toolkit/components/reader/Readability-readerable.js b/toolkit/components/reader/Readability-readerable.js new file mode 100644 index 000000000000..f38b418a061b --- /dev/null +++ b/toolkit/components/reader/Readability-readerable.js @@ -0,0 +1,96 @@ +/* eslint-env es6:false */ +/* globals exports */ +/* + * Copyright (c) 2010 Arc90 Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * This code is heavily based on Arc90's readability.js (1.7.1) script + * available at: http://code.google.com/p/arc90labs-readability + */ + +var REGEXPS = { + // NOTE: These two regular expressions are duplicated in + // Readability.js. Please keep both copies in sync. + unlikelyCandidates: /-ad-|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i, + okMaybeItsACandidate: /and|article|body|column|main|shadow/i, +}; + +function isNodeVisible(node) { + return node.style.display != "none" && !node.hasAttribute("hidden"); +} + +/** + * Decides whether or not the document is reader-able without parsing the whole thing. + * + * @return boolean Whether or not we suspect Readability.parse() will suceeed at returning an article object. + */ +function isProbablyReaderable(doc, isVisible) { + if (!isVisible) { + isVisible = isNodeVisible; + } + + var nodes = doc.querySelectorAll("p, pre"); + + // Get
nodes which have
node(s) and append them into the `nodes` variable. + // Some articles' DOM structures might look like + //
+ // Sentences
+ //
+ // Sentences
+ //
+ var brNodes = doc.querySelectorAll("div > br"); + if (brNodes.length) { + var set = new Set(nodes); + [].forEach.call(brNodes, function(node) { + set.add(node.parentNode); + }); + nodes = Array.from(set); + } + + var score = 0; + // This is a little cheeky, we use the accumulator 'score' to decide what to return from + // this callback: + return [].some.call(nodes, function(node) { + if (!isVisible(node)) + return false; + + var matchString = node.className + " " + node.id; + if (REGEXPS.unlikelyCandidates.test(matchString) && + !REGEXPS.okMaybeItsACandidate.test(matchString)) { + return false; + } + + if (node.matches("li p")) { + return false; + } + + var textContentLength = node.textContent.trim().length; + if (textContentLength < 140) { + return false; + } + + score += Math.sqrt(textContentLength - 140); + + if (score > 20) { + return true; + } + return false; + }); +} + +if (typeof exports === "object") { + exports.isProbablyReaderable = isProbablyReaderable; +} diff --git a/toolkit/components/reader/Readability.js b/toolkit/components/reader/Readability.js index da89277e3718..deac15904d55 100644 --- a/toolkit/components/reader/Readability.js +++ b/toolkit/components/reader/Readability.js @@ -1,11 +1,4 @@ /*eslint-env es6:false*/ -/* - * DO NOT MODIFY THIS FILE DIRECTLY! - * - * This is a shared library that is maintained in an external repo: - * https://github.com/mozilla/readability - */ - /* * Copyright (c) 2010 Arc90 Inc * @@ -118,8 +111,11 @@ Readability.prototype = { // All of the regular expressions in use within readability. // Defined up here so we don't instantiate them repeatedly in loops. REGEXPS: { + // NOTE: These two regular expressions are duplicated in + // Readability-readerable.js. Please keep both copies in sync. unlikelyCandidates: /-ad-|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i, okMaybeItsACandidate: /and|article|body|column|main|shadow/i, + positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i, negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i, extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i, @@ -1711,65 +1707,6 @@ Readability.prototype = { return node.style.display != "none" && !node.hasAttribute("hidden"); }, - /** - * Decides whether or not the document is reader-able without parsing the whole thing. - * - * @return boolean Whether or not we suspect parse() will suceeed at returning an article object. - */ - isProbablyReaderable: function(helperIsVisible) { - var nodes = this._getAllNodesWithTag(this._doc, ["p", "pre"]); - - // Get
nodes which have
node(s) and append them into the `nodes` variable. - // Some articles' DOM structures might look like - //
- // Sentences
- //
- // Sentences
- //
- var brNodes = this._getAllNodesWithTag(this._doc, ["div > br"]); - if (brNodes.length) { - var set = new Set(); - [].forEach.call(brNodes, function(node) { - set.add(node.parentNode); - }); - nodes = [].concat.apply(Array.from(set), nodes); - } - - if (!helperIsVisible) { - helperIsVisible = this._isProbablyVisible; - } - - var score = 0; - // This is a little cheeky, we use the accumulator 'score' to decide what to return from - // this callback: - return this._someNode(nodes, function(node) { - if (helperIsVisible && !helperIsVisible(node)) - return false; - var matchString = node.className + " " + node.id; - - if (this.REGEXPS.unlikelyCandidates.test(matchString) && - !this.REGEXPS.okMaybeItsACandidate.test(matchString)) { - return false; - } - - if (node.matches && node.matches("li p")) { - return false; - } - - var textContentLength = node.textContent.trim().length; - if (textContentLength < 140) { - return false; - } - - score += Math.sqrt(textContentLength - 140); - - if (score > 20) { - return true; - } - return false; - }); - }, - /** * Runs readability. * diff --git a/toolkit/components/reader/ReaderMode.jsm b/toolkit/components/reader/ReaderMode.jsm index f147d5a95520..134ca7a6b1af 100644 --- a/toolkit/components/reader/ReaderMode.jsm +++ b/toolkit/components/reader/ReaderMode.jsm @@ -41,13 +41,7 @@ ChromeUtils.defineModuleGetter(this, "EventDispatcher", "resource://gre/modules/ ChromeUtils.defineModuleGetter(this, "OS", "resource://gre/modules/osfile.jsm"); ChromeUtils.defineModuleGetter(this, "ReaderWorker", "resource://gre/modules/reader/ReaderWorker.jsm"); ChromeUtils.defineModuleGetter(this, "LanguageDetector", "resource:///modules/translation/LanguageDetector.jsm"); - -XPCOMUtils.defineLazyGetter(this, "Readability", function() { - let scope = {}; - scope.dump = this.dump; - Services.scriptloader.loadSubScript("resource://gre/modules/reader/Readability.js", scope); - return scope.Readability; -}); +ChromeUtils.defineModuleGetter(this, "Readerable", "resource://gre/modules/Readerable.jsm"); const gIsFirefoxDesktop = Services.appinfo.ID == "{ec8030f7-c20a-464f-9b0e-13a3a9e97384}"; @@ -57,42 +51,6 @@ var ReaderMode = { DEBUG: 0, - // Don't try to parse the page if it has too many elements (for memory and - // performance reasons) - get maxElemsToParse() { - delete this.parseNodeLimit; - - Services.prefs.addObserver("reader.parse-node-limit", this); - return this.parseNodeLimit = Services.prefs.getIntPref("reader.parse-node-limit"); - }, - - get isEnabledForParseOnLoad() { - delete this.isEnabledForParseOnLoad; - - // Listen for future pref changes. - Services.prefs.addObserver("reader.parse-on-load.", this); - - return this.isEnabledForParseOnLoad = this._getStateForParseOnLoad(); - }, - - _getStateForParseOnLoad() { - let isEnabled = Services.prefs.getBoolPref("reader.parse-on-load.enabled"); - let isForceEnabled = Services.prefs.getBoolPref("reader.parse-on-load.force-enabled"); - return isForceEnabled || isEnabled; - }, - - observe(aMessage, aTopic, aData) { - switch (aTopic) { - case "nsPref:changed": - if (aData.startsWith("reader.parse-on-load.")) { - this.isEnabledForParseOnLoad = this._getStateForParseOnLoad(); - } else if (aData === "reader.parse-node-limit") { - this.parseNodeLimit = Services.prefs.getIntPref(aData); - } - break; - } - }, - /** * Enter the reader mode by going forward one step in history if applicable, * if not, append the about:reader page in the history instead. @@ -197,39 +155,6 @@ var ReaderMode = { return null; }, - /** - * Decides whether or not a document is reader-able without parsing the whole thing. - * - * @param doc A document to parse. - * @return boolean Whether or not we should show the reader mode button. - */ - isProbablyReaderable(doc) { - // Only care about 'real' HTML documents: - if (doc.mozSyntheticDocument || !(doc instanceof doc.defaultView.HTMLDocument)) { - return false; - } - - let uri = Services.io.newURI(doc.location.href); - if (!this._shouldCheckUri(uri)) { - return false; - } - - let utils = this.getUtilsForWin(doc.defaultView); - // We pass in a helper function to determine if a node is visible, because - // it uses gecko APIs that the engine-agnostic readability code can't rely - // upon. - return new Readability(doc).isProbablyReaderable(this.isNodeVisible.bind(this, utils)); - }, - - isNodeVisible(utils, node) { - let bounds = utils.getBoundsWithoutFlushing(node); - return bounds.height > 0 && bounds.width > 0; - }, - - getUtilsForWin(win) { - return win.windowUtils; - }, - /** * Gets an article from a loaded browser's document. This method will not attempt * to parse certain URIs (e.g. about: URIs). @@ -239,7 +164,8 @@ var ReaderMode = { * @resolves JS object representing the article, or null if no article is found. */ parseDocument(doc) { - if (!this._shouldCheckUri(doc.documentURIObject) || !this._shouldCheckUri(doc.baseURIObject, true)) { + if (!Readerable.shouldCheckUri(doc.documentURIObject) || + !Readerable.shouldCheckUri(doc.baseURIObject, true)) { this.log("Reader mode disabled for URI"); return null; } @@ -259,7 +185,8 @@ var ReaderMode = { if (!doc) { return null; } - if (!this._shouldCheckUri(doc.documentURIObject) || !this._shouldCheckUri(doc.baseURIObject, true)) { + if (!Readerable.shouldCheckUri(doc.documentURIObject) || + !Readerable.shouldCheckUri(doc.baseURIObject, true)) { this.log("Reader mode disabled for URI"); return null; } @@ -269,7 +196,7 @@ var ReaderMode = { _downloadDocument(url) { try { - if (!this._shouldCheckUri(Services.io.newURI(url))) { + if (!Readerable.shouldCheckUri(Services.io.newURI(url))) { return null; } } catch (ex) { @@ -415,42 +342,6 @@ var ReaderMode = { dump("Reader: " + msg); }, - _blockedHosts: [ - "amazon.com", - "github.com", - "mail.google.com", - "pinterest.com", - "reddit.com", - "twitter.com", - "youtube.com", - ], - - _shouldCheckUri(uri, isBaseUri = false) { - if (!(uri.schemeIs("http") || uri.schemeIs("https"))) { - this.log("Not parsing URI scheme: " + uri.scheme); - return false; - } - - try { - uri.QueryInterface(Ci.nsIURL); - } catch (ex) { - // If this doesn't work, presumably the URL is not well-formed or something - return false; - } - // Sadly, some high-profile pages have false positives, so bail early for those: - let asciiHost = uri.asciiHost; - if (!isBaseUri && this._blockedHosts.some(blockedHost => asciiHost.endsWith(blockedHost))) { - return false; - } - - if (!isBaseUri && (!uri.filePath || uri.filePath == "/")) { - this.log("Not parsing home page: " + uri.spec); - return false; - } - - return true; - }, - /** * Attempts to parse a document into an article. Heavy lifting happens * in readerWorker.js. @@ -641,3 +532,6 @@ var ReaderMode = { return readingSpeed.get(lang) || readingSpeed.get("en"); }, }; + +XPCOMUtils.defineLazyPreferenceGetter( + ReaderMode, "maxElemsToParse", "reader.parse-node-limit", 0); diff --git a/toolkit/components/reader/Readerable.js b/toolkit/components/reader/Readerable.js new file mode 100644 index 000000000000..509e53d3f761 --- /dev/null +++ b/toolkit/components/reader/Readerable.js @@ -0,0 +1,79 @@ +// -*- indent-tabs-mode: nil; js-indent-level: 2 -*- +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this file, + * You can obtain one at http://mozilla.org/MPL/2.0/. */ +"use strict"; + +// This file and Readability-readerable.js are merged together into +// Readerable.jsm. + +/* exported Readerable */ +/* import-globals-from Readability-readerable.js */ + +ChromeUtils.import("resource://gre/modules/Services.jsm"); +ChromeUtils.import("resource://gre/modules/XPCOMUtils.jsm"); + +function isNodeVisible(node) { + return node.clientHeight > 0 && node.clientWidth > 0; +} + +var Readerable = { + get isEnabledForParseOnLoad() { + return this.isEnabled || this.isForceEnabled; + }, + + /** + * Decides whether or not a document is reader-able without parsing the whole thing. + * + * @param doc A document to parse. + * @return boolean Whether or not we should show the reader mode button. + */ + isProbablyReaderable(doc) { + // Only care about 'real' HTML documents: + if (doc.mozSyntheticDocument || !(doc instanceof doc.defaultView.HTMLDocument)) { + return false; + } + + let uri = Services.io.newURI(doc.location.href); + if (!this.shouldCheckUri(uri)) { + return false; + } + + return isProbablyReaderable(doc, isNodeVisible); + }, + + _blockedHosts: [ + "amazon.com", + "github.com", + "mail.google.com", + "pinterest.com", + "reddit.com", + "twitter.com", + "youtube.com", + ], + + shouldCheckUri(uri, isBaseUri = false) { + if (!["http", "https"].includes(uri.scheme)) { + return false; + } + + if (!isBaseUri) { + // Sadly, some high-profile pages have false positives, so bail early for those: + let {host} = uri; + if (this._blockedHosts.some(blockedHost => host.endsWith(blockedHost))) { + return false; + } + + if (uri.filePath == "/") { + return false; + } + } + + return true; + }, +}; + +XPCOMUtils.defineLazyPreferenceGetter( + Readerable, "isEnabled", "reader.parse-on-load.enabled", true); +XPCOMUtils.defineLazyPreferenceGetter( + Readerable, "isForceEnabled", "reader.parse-on-load.force-enabled", false); diff --git a/toolkit/components/reader/Readerable.jsm b/toolkit/components/reader/Readerable.jsm new file mode 100644 index 000000000000..2268487e4296 --- /dev/null +++ b/toolkit/components/reader/Readerable.jsm @@ -0,0 +1,10 @@ +// -*- indent-tabs-mode: nil; js-indent-level: 2 -*- +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this file, + * You can obtain one at http://mozilla.org/MPL/2.0/. */ +"use strict"; + +var EXPORTED_SYMBOLS = ["Readerable"]; + +#include Readability-readerable.js +#include Readerable.js diff --git a/toolkit/components/reader/moz.build b/toolkit/components/reader/moz.build index 6863d6542747..c2364ce8deb4 100644 --- a/toolkit/components/reader/moz.build +++ b/toolkit/components/reader/moz.build @@ -8,14 +8,18 @@ JAR_MANIFESTS += ['jar.mn'] EXTRA_JS_MODULES += [ 'AboutReader.jsm', - 'ReaderMode.jsm' + 'ReaderMode.jsm', +] + +EXTRA_PP_JS_MODULES += [ + 'Readerable.jsm', ] EXTRA_JS_MODULES.reader = [ 'JSDOMParser.js', 'Readability.js', 'ReaderWorker.js', - 'ReaderWorker.jsm' + 'ReaderWorker.jsm', ] BROWSER_CHROME_MANIFESTS += [