/*
 * forked from mirrors/gecko-dev
 *
 * Most of the ReaderMode.jsm and Readability.js code is only needed when we
 * actually need to render a document in reader mode, but also winds up loaded
 * into any process where we ever check if a page is readerable. This winds up
 * wasting a huge amount of memory (and probably a huge amount of CPU time)
 * loading code which is almost never used.
 *
 * This patch splits ReaderMode.jsm into two modules, one for checking
 * readability, one for actually entering reader mode. It also separates out
 * the isProbablyReaderable checks from Readability.js, since the overhead of
 * loading that script before it's needed is unsupportable.
 *
 * This means we're probably going to need some effort to keep Readerable.jsm
 * and Readability.js in sync, but the code in question is pretty trivial, so
 * it shouldn't be too difficult.
 *
 * Differential Revision: https://phabricator.services.mozilla.com/D3687
 *
 * --HG--
 * rename : toolkit/components/reader/Readability.js => toolkit/components/reader/Readability-readerable.js
 * rename : toolkit/components/reader/ReaderMode.jsm => toolkit/components/reader/Readerable.js
 * extra : rebase_source : 66712057591ae20dd66234e3dc78fbba90a6914e
 * extra : amend_source : f908f62f49ea54b9099ddb87d9f2fc11f12d4dee
 *
 * 96 lines, 2.9 KiB, JavaScript
 */
/* eslint-env es6:false */
/* globals exports */
/*
 * Copyright (c) 2010 Arc90 Inc
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * This code is heavily based on Arc90's readability.js (1.7.1) script
 * available at: http://code.google.com/p/arc90labs-readability
 */

/**
 * Patterns matched against a node's "className id" string when deciding
 * whether the node can count towards the readerable score.
 *
 * - unlikelyCandidates: names that suggest page chrome (ads, menus, footers,
 *   comments, ...) rather than article content.
 * - okMaybeItsACandidate: names that rescue a node even though it also
 *   matched unlikelyCandidates.
 *
 * Both patterns are case-insensitive and carry no `g` flag, so `.test()` is
 * stateless and the literals can be shared safely.
 */
var REGEXPS = {
  // NOTE: These two regular expressions are duplicated in
  // Readability.js. Please keep both copies in sync.
  unlikelyCandidates: /-ad-|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
  okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
};

/**
 * Default visibility check: a node counts as visible unless its inline style
 * sets `display: none` or it carries the `hidden` attribute.
 *
 * Only the inline `style` object is consulted; stylesheet rules are not
 * resolved here.
 *
 * @param {Element} node The node to test; must provide hasAttribute().
 * @return {boolean} True when the node is not hidden by either mechanism.
 */
function isNodeVisible(node) {
  // A node without a `style` object cannot be hidden through inline
  // `display`, so treat it as visible instead of throwing a TypeError.
  var hiddenByDisplay = Boolean(node.style) && node.style.display === "none";
  return !hiddenByDisplay && !node.hasAttribute("hidden");
}

/**
 * Decides whether or not the document is reader-able without parsing the whole thing.
 *
 * Candidate nodes are every <p> and <pre>, plus every <div> that has a <br>
 * as a direct child. A candidate is ignored when it is not visible, when its
 * "className id" string looks like page chrome (and is not rescued by
 * okMaybeItsACandidate), when it is a <p> inside an <li>, or when its trimmed
 * text is shorter than 140 characters. Each remaining candidate adds
 * sqrt(length - 140) to a running score, and the document is considered
 * readerable as soon as that score exceeds 20.
 *
 * @param {Document} doc The document to inspect; only querySelectorAll() is
 *        called on it.
 * @param {function(Element): boolean} [isVisible] Optional visibility check
 *        called once per candidate node. Defaults to isNodeVisible.
 * @return {boolean} Whether or not we suspect Readability.parse() will succeed
 *         at returning an article object.
 */
function isProbablyReaderable(doc, isVisible) {
  // Nodes with less trimmed text than this contribute nothing to the score.
  var MIN_CONTENT_LENGTH = 140;
  // The document is readerable once the accumulated score exceeds this.
  var MIN_SCORE = 20;

  var checkVisibility = isVisible || isNodeVisible;

  var nodes = doc.querySelectorAll("p, pre");

  // Get <div> nodes which have <br> node(s) and append them into the `nodes` variable.
  // Some articles' DOM structures might look like
  // <div>
  //   Sentences<br>
  //   <br>
  //   Sentences<br>
  // </div>
  var brNodes = doc.querySelectorAll("div > br");
  if (brNodes.length) {
    // A Set keeps each parent <div> only once, however many <br>s it holds.
    var set = new Set(nodes);
    [].forEach.call(brNodes, function(node) {
      set.add(node.parentNode);
    });
    nodes = Array.from(set);
  }

  var score = 0;
  // This is a little cheeky, we use the accumulator 'score' to decide what to return from
  // this callback: some() stops at the first node that pushes the score over the threshold.
  return [].some.call(nodes, function(node) {
    if (!checkVisibility(node)) {
      return false;
    }

    var matchString = node.className + " " + node.id;
    if (REGEXPS.unlikelyCandidates.test(matchString) &&
        !REGEXPS.okMaybeItsACandidate.test(matchString)) {
      return false;
    }

    // Paragraphs nested in list items are not counted.
    if (node.matches("li p")) {
      return false;
    }

    var textContentLength = node.textContent.trim().length;
    if (textContentLength < MIN_CONTENT_LENGTH) {
      return false;
    }

    score += Math.sqrt(textContentLength - MIN_CONTENT_LENGTH);

    return score > MIN_SCORE;
  });
}

// Expose isProbablyReaderable when the loader provides an `exports` object
// (e.g. CommonJS-style environments). `typeof null` is also "object", so
// null is excluded explicitly to avoid throwing on the assignment.
if (typeof exports === "object" && exports !== null) {
  exports.isProbablyReaderable = isProbablyReaderable;
}