Bug 1484413: Split ReaderMode.jsm into separate checking and loading components. r=Gijs

Most of the ReaderMode.jsm and Readability.js code is only needed when we
actually need to render a document in reader mode, but also winds up loaded
into any process where we ever check if a page is readerable. This winds up
wasting a huge amount of memory (and probably a huge amount of CPU time)
loading code which is almost never used.

This patch splits ReaderMode.jsm into two modules, one for checking
readability, one for actually entering reader mode. It also separates out the
isProbablyReaderable checks from Readability.js, since the overhead of loading
that script before it's needed is unsupportable.

This means we're probably going to need some effort to keep Readerable.jsm and
Readability.js in sync, but the code in question is pretty trivial, so it
shouldn't be too difficult.

Differential Revision: https://phabricator.services.mozilla.com/D3687

--HG--
rename : toolkit/components/reader/Readability.js => toolkit/components/reader/Readability-readerable.js
rename : toolkit/components/reader/ReaderMode.jsm => toolkit/components/reader/Readerable.js
extra : rebase_source : 66712057591ae20dd66234e3dc78fbba90a6914e
extra : amend_source : f908f62f49ea54b9099ddb87d9f2fc11f12d4dee
This commit is contained in:
Kris Maglione 2018-08-17 23:10:59 -07:00
parent 7fa99adb7e
commit 22d4bb0074
10 changed files with 213 additions and 188 deletions

View file

@ -370,6 +370,7 @@ toolkit/components/reader/Readability.js
toolkit/components/reader/JSDOMParser.js
# Uses preprocessing
toolkit/components/reader/Readerable.jsm
toolkit/content/widgets/wizard.xml
toolkit/modules/AppConstants.jsm
toolkit/mozapps/update/tests/data/xpcshellConstantsPP.js

View file

@ -12,6 +12,8 @@ ChromeUtils.defineModuleGetter(this, "AboutReader",
"resource://gre/modules/AboutReader.jsm");
ChromeUtils.defineModuleGetter(this, "ReaderMode",
"resource://gre/modules/ReaderMode.jsm");
ChromeUtils.defineModuleGetter(this, "Readerable",
"resource://gre/modules/Readerable.jsm");
class AboutReaderChild extends ActorChild {
constructor(mm) {
@ -102,7 +104,7 @@ class AboutReaderChild extends ActorChild {
* painted is not going to work.
*/
updateReaderButton(forceNonArticle) {
if (!ReaderMode.isEnabledForParseOnLoad || this.isAboutReader ||
if (!Readerable.isEnabledForParseOnLoad || this.isAboutReader ||
!this.content || !(this.content.document instanceof this.content.HTMLDocument) ||
this.content.document.mozSyntheticDocument) {
return;
@ -141,7 +143,7 @@ class AboutReaderChild extends ActorChild {
this.cancelPotentialPendingReadabilityCheck();
// Only send updates when there are articles; there's no point updating with
// |false| all the time.
if (ReaderMode.isProbablyReaderable(this.content.document)) {
if (Readerable.isProbablyReaderable(this.content.document)) {
this.mm.sendAsyncMessage("Reader:UpdateReaderButton", { isArticle: true });
} else if (forceNonArticle) {
this.mm.sendAsyncMessage("Reader:UpdateReaderButton", { isArticle: false });

View file

@ -56,7 +56,7 @@ const whitelist = {
"resource://gre/modules/ActorChild.jsm",
"resource://gre/modules/ActorManagerChild.jsm",
"resource://gre/modules/E10SUtils.jsm",
"resource://gre/modules/ReaderMode.jsm",
"resource://gre/modules/Readerable.jsm",
"resource://gre/modules/WebProgressChild.jsm",
// Pocket

View file

@ -8,6 +8,7 @@ ChromeUtils.import("resource://gre/modules/XPCOMUtils.jsm");
ChromeUtils.defineModuleGetter(this, "AboutReader", "resource://gre/modules/AboutReader.jsm");
ChromeUtils.defineModuleGetter(this, "ReaderMode", "resource://gre/modules/ReaderMode.jsm");
ChromeUtils.defineModuleGetter(this, "Readerable", "resource://gre/modules/Readerable.jsm");
ChromeUtils.defineModuleGetter(this, "LoginManagerContent", "resource://gre/modules/LoginManagerContent.jsm");
XPCOMUtils.defineLazyGetter(this, "gPipNSSBundle", function() {
@ -486,7 +487,7 @@ var AboutReaderListener = {
// Do not show Reader View icon on error pages (bug 1320900)
if (this.isErrorPage) {
sendAsyncMessage("Reader:UpdateReaderButton", { isArticle: false });
} else if (!ReaderMode.isEnabledForParseOnLoad || this.isAboutReader ||
} else if (!Readerable.isEnabledForParseOnLoad || this.isAboutReader ||
!(content.document instanceof content.HTMLDocument) ||
content.document.mozSyntheticDocument) {
@ -522,11 +523,12 @@ var AboutReaderListener = {
return;
}
Services.console.logStringMessage(`ON PAINT WHEN WAITED FOR\n`);
this.cancelPotentialPendingReadabilityCheck();
// Only send updates when there are articles; there's no point updating with
// |false| all the time.
if (ReaderMode.isProbablyReaderable(content.document)) {
if (Readerable.isProbablyReaderable(content.document)) {
sendAsyncMessage("Reader:UpdateReaderButton", { isArticle: true });
} else if (forceNonArticle) {
sendAsyncMessage("Reader:UpdateReaderButton", { isArticle: false });

View file

@ -0,0 +1,96 @@
/* eslint-env es6:false */
/* globals exports */
/*
* Copyright (c) 2010 Arc90 Inc
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* This code is heavily based on Arc90's readability.js (1.7.1) script
* available at: http://code.google.com/p/arc90labs-readability
*/
var REGEXPS = {
// NOTE: These two regular expressions are duplicated in
// Readability.js. Please keep both copies in sync.
unlikelyCandidates: /-ad-|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
};
function isNodeVisible(node) {
return node.style.display != "none" && !node.hasAttribute("hidden");
}
/**
* Decides whether or not the document is reader-able without parsing the whole thing.
*
* @return boolean Whether or not we suspect Readability.parse() will suceeed at returning an article object.
*/
function isProbablyReaderable(doc, isVisible) {
if (!isVisible) {
isVisible = isNodeVisible;
}
var nodes = doc.querySelectorAll("p, pre");
// Get <div> nodes which have <br> node(s) and append them into the `nodes` variable.
// Some articles' DOM structures might look like
// <div>
// Sentences<br>
// <br>
// Sentences<br>
// </div>
var brNodes = doc.querySelectorAll("div > br");
if (brNodes.length) {
var set = new Set(nodes);
[].forEach.call(brNodes, function(node) {
set.add(node.parentNode);
});
nodes = Array.from(set);
}
var score = 0;
// This is a little cheeky, we use the accumulator 'score' to decide what to return from
// this callback:
return [].some.call(nodes, function(node) {
if (!isVisible(node))
return false;
var matchString = node.className + " " + node.id;
if (REGEXPS.unlikelyCandidates.test(matchString) &&
!REGEXPS.okMaybeItsACandidate.test(matchString)) {
return false;
}
if (node.matches("li p")) {
return false;
}
var textContentLength = node.textContent.trim().length;
if (textContentLength < 140) {
return false;
}
score += Math.sqrt(textContentLength - 140);
if (score > 20) {
return true;
}
return false;
});
}
if (typeof exports === "object") {
exports.isProbablyReaderable = isProbablyReaderable;
}

View file

@ -1,11 +1,4 @@
/*eslint-env es6:false*/
/*
* DO NOT MODIFY THIS FILE DIRECTLY!
*
* This is a shared library that is maintained in an external repo:
* https://github.com/mozilla/readability
*/
/*
* Copyright (c) 2010 Arc90 Inc
*
@ -118,8 +111,11 @@ Readability.prototype = {
// All of the regular expressions in use within readability.
// Defined up here so we don't instantiate them repeatedly in loops.
REGEXPS: {
// NOTE: These two regular expressions are duplicated in
// Readability-readerable.js. Please keep both copies in sync.
unlikelyCandidates: /-ad-|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,
negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
@ -1711,65 +1707,6 @@ Readability.prototype = {
return node.style.display != "none" && !node.hasAttribute("hidden");
},
/**
* Decides whether or not the document is reader-able without parsing the whole thing.
*
* @return boolean Whether or not we suspect parse() will suceeed at returning an article object.
*/
isProbablyReaderable: function(helperIsVisible) {
var nodes = this._getAllNodesWithTag(this._doc, ["p", "pre"]);
// Get <div> nodes which have <br> node(s) and append them into the `nodes` variable.
// Some articles' DOM structures might look like
// <div>
// Sentences<br>
// <br>
// Sentences<br>
// </div>
var brNodes = this._getAllNodesWithTag(this._doc, ["div > br"]);
if (brNodes.length) {
var set = new Set();
[].forEach.call(brNodes, function(node) {
set.add(node.parentNode);
});
nodes = [].concat.apply(Array.from(set), nodes);
}
if (!helperIsVisible) {
helperIsVisible = this._isProbablyVisible;
}
var score = 0;
// This is a little cheeky, we use the accumulator 'score' to decide what to return from
// this callback:
return this._someNode(nodes, function(node) {
if (helperIsVisible && !helperIsVisible(node))
return false;
var matchString = node.className + " " + node.id;
if (this.REGEXPS.unlikelyCandidates.test(matchString) &&
!this.REGEXPS.okMaybeItsACandidate.test(matchString)) {
return false;
}
if (node.matches && node.matches("li p")) {
return false;
}
var textContentLength = node.textContent.trim().length;
if (textContentLength < 140) {
return false;
}
score += Math.sqrt(textContentLength - 140);
if (score > 20) {
return true;
}
return false;
});
},
/**
* Runs readability.
*

View file

@ -41,13 +41,7 @@ ChromeUtils.defineModuleGetter(this, "EventDispatcher", "resource://gre/modules/
ChromeUtils.defineModuleGetter(this, "OS", "resource://gre/modules/osfile.jsm");
ChromeUtils.defineModuleGetter(this, "ReaderWorker", "resource://gre/modules/reader/ReaderWorker.jsm");
ChromeUtils.defineModuleGetter(this, "LanguageDetector", "resource:///modules/translation/LanguageDetector.jsm");
XPCOMUtils.defineLazyGetter(this, "Readability", function() {
let scope = {};
scope.dump = this.dump;
Services.scriptloader.loadSubScript("resource://gre/modules/reader/Readability.js", scope);
return scope.Readability;
});
ChromeUtils.defineModuleGetter(this, "Readerable", "resource://gre/modules/Readerable.jsm");
const gIsFirefoxDesktop = Services.appinfo.ID == "{ec8030f7-c20a-464f-9b0e-13a3a9e97384}";
@ -57,42 +51,6 @@ var ReaderMode = {
DEBUG: 0,
// Don't try to parse the page if it has too many elements (for memory and
// performance reasons)
get maxElemsToParse() {
delete this.parseNodeLimit;
Services.prefs.addObserver("reader.parse-node-limit", this);
return this.parseNodeLimit = Services.prefs.getIntPref("reader.parse-node-limit");
},
get isEnabledForParseOnLoad() {
delete this.isEnabledForParseOnLoad;
// Listen for future pref changes.
Services.prefs.addObserver("reader.parse-on-load.", this);
return this.isEnabledForParseOnLoad = this._getStateForParseOnLoad();
},
_getStateForParseOnLoad() {
let isEnabled = Services.prefs.getBoolPref("reader.parse-on-load.enabled");
let isForceEnabled = Services.prefs.getBoolPref("reader.parse-on-load.force-enabled");
return isForceEnabled || isEnabled;
},
observe(aMessage, aTopic, aData) {
switch (aTopic) {
case "nsPref:changed":
if (aData.startsWith("reader.parse-on-load.")) {
this.isEnabledForParseOnLoad = this._getStateForParseOnLoad();
} else if (aData === "reader.parse-node-limit") {
this.parseNodeLimit = Services.prefs.getIntPref(aData);
}
break;
}
},
/**
* Enter the reader mode by going forward one step in history if applicable,
* if not, append the about:reader page in the history instead.
@ -197,39 +155,6 @@ var ReaderMode = {
return null;
},
/**
* Decides whether or not a document is reader-able without parsing the whole thing.
*
* @param doc A document to parse.
* @return boolean Whether or not we should show the reader mode button.
*/
isProbablyReaderable(doc) {
// Only care about 'real' HTML documents:
if (doc.mozSyntheticDocument || !(doc instanceof doc.defaultView.HTMLDocument)) {
return false;
}
let uri = Services.io.newURI(doc.location.href);
if (!this._shouldCheckUri(uri)) {
return false;
}
let utils = this.getUtilsForWin(doc.defaultView);
// We pass in a helper function to determine if a node is visible, because
// it uses gecko APIs that the engine-agnostic readability code can't rely
// upon.
return new Readability(doc).isProbablyReaderable(this.isNodeVisible.bind(this, utils));
},
isNodeVisible(utils, node) {
let bounds = utils.getBoundsWithoutFlushing(node);
return bounds.height > 0 && bounds.width > 0;
},
getUtilsForWin(win) {
return win.windowUtils;
},
/**
* Gets an article from a loaded browser's document. This method will not attempt
* to parse certain URIs (e.g. about: URIs).
@ -239,7 +164,8 @@ var ReaderMode = {
* @resolves JS object representing the article, or null if no article is found.
*/
parseDocument(doc) {
if (!this._shouldCheckUri(doc.documentURIObject) || !this._shouldCheckUri(doc.baseURIObject, true)) {
if (!Readerable.shouldCheckUri(doc.documentURIObject) ||
!Readerable.shouldCheckUri(doc.baseURIObject, true)) {
this.log("Reader mode disabled for URI");
return null;
}
@ -259,7 +185,8 @@ var ReaderMode = {
if (!doc) {
return null;
}
if (!this._shouldCheckUri(doc.documentURIObject) || !this._shouldCheckUri(doc.baseURIObject, true)) {
if (!Readerable.shouldCheckUri(doc.documentURIObject) ||
!Readerable.shouldCheckUri(doc.baseURIObject, true)) {
this.log("Reader mode disabled for URI");
return null;
}
@ -269,7 +196,7 @@ var ReaderMode = {
_downloadDocument(url) {
try {
if (!this._shouldCheckUri(Services.io.newURI(url))) {
if (!Readerable.shouldCheckUri(Services.io.newURI(url))) {
return null;
}
} catch (ex) {
@ -415,42 +342,6 @@ var ReaderMode = {
dump("Reader: " + msg);
},
_blockedHosts: [
"amazon.com",
"github.com",
"mail.google.com",
"pinterest.com",
"reddit.com",
"twitter.com",
"youtube.com",
],
_shouldCheckUri(uri, isBaseUri = false) {
if (!(uri.schemeIs("http") || uri.schemeIs("https"))) {
this.log("Not parsing URI scheme: " + uri.scheme);
return false;
}
try {
uri.QueryInterface(Ci.nsIURL);
} catch (ex) {
// If this doesn't work, presumably the URL is not well-formed or something
return false;
}
// Sadly, some high-profile pages have false positives, so bail early for those:
let asciiHost = uri.asciiHost;
if (!isBaseUri && this._blockedHosts.some(blockedHost => asciiHost.endsWith(blockedHost))) {
return false;
}
if (!isBaseUri && (!uri.filePath || uri.filePath == "/")) {
this.log("Not parsing home page: " + uri.spec);
return false;
}
return true;
},
/**
* Attempts to parse a document into an article. Heavy lifting happens
* in readerWorker.js.
@ -641,3 +532,6 @@ var ReaderMode = {
return readingSpeed.get(lang) || readingSpeed.get("en");
},
};
XPCOMUtils.defineLazyPreferenceGetter(
ReaderMode, "maxElemsToParse", "reader.parse-node-limit", 0);

View file

@ -0,0 +1,79 @@
// -*- indent-tabs-mode: nil; js-indent-level: 2 -*-
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this file,
* You can obtain one at http://mozilla.org/MPL/2.0/. */
"use strict";
// This file and Readability-readerable.js are merged together into
// Readerable.jsm.
/* exported Readerable */
/* import-globals-from Readability-readerable.js */
ChromeUtils.import("resource://gre/modules/Services.jsm");
ChromeUtils.import("resource://gre/modules/XPCOMUtils.jsm");
function isNodeVisible(node) {
return node.clientHeight > 0 && node.clientWidth > 0;
}
var Readerable = {
get isEnabledForParseOnLoad() {
return this.isEnabled || this.isForceEnabled;
},
/**
* Decides whether or not a document is reader-able without parsing the whole thing.
*
* @param doc A document to parse.
* @return boolean Whether or not we should show the reader mode button.
*/
isProbablyReaderable(doc) {
// Only care about 'real' HTML documents:
if (doc.mozSyntheticDocument || !(doc instanceof doc.defaultView.HTMLDocument)) {
return false;
}
let uri = Services.io.newURI(doc.location.href);
if (!this.shouldCheckUri(uri)) {
return false;
}
return isProbablyReaderable(doc, isNodeVisible);
},
_blockedHosts: [
"amazon.com",
"github.com",
"mail.google.com",
"pinterest.com",
"reddit.com",
"twitter.com",
"youtube.com",
],
shouldCheckUri(uri, isBaseUri = false) {
if (!["http", "https"].includes(uri.scheme)) {
return false;
}
if (!isBaseUri) {
// Sadly, some high-profile pages have false positives, so bail early for those:
let {host} = uri;
if (this._blockedHosts.some(blockedHost => host.endsWith(blockedHost))) {
return false;
}
if (uri.filePath == "/") {
return false;
}
}
return true;
},
};
XPCOMUtils.defineLazyPreferenceGetter(
Readerable, "isEnabled", "reader.parse-on-load.enabled", true);
XPCOMUtils.defineLazyPreferenceGetter(
Readerable, "isForceEnabled", "reader.parse-on-load.force-enabled", false);

View file

@ -0,0 +1,10 @@
// -*- indent-tabs-mode: nil; js-indent-level: 2 -*-
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this file,
* You can obtain one at http://mozilla.org/MPL/2.0/. */
"use strict";
var EXPORTED_SYMBOLS = ["Readerable"];
#include Readability-readerable.js
#include Readerable.js

View file

@ -8,14 +8,18 @@ JAR_MANIFESTS += ['jar.mn']
EXTRA_JS_MODULES += [
'AboutReader.jsm',
'ReaderMode.jsm'
'ReaderMode.jsm',
]
EXTRA_PP_JS_MODULES += [
'Readerable.jsm',
]
EXTRA_JS_MODULES.reader = [
'JSDOMParser.js',
'Readability.js',
'ReaderWorker.js',
'ReaderWorker.jsm'
'ReaderWorker.jsm',
]
BROWSER_CHROME_MANIFESTS += [