fune/browser/components/urlbar/UrlbarTokenizer.jsm
Marco Bonardo 45b356558e Bug 1583616 - Some numbers are searched and not autofilled. r=Standard8
The tokenizer currently thinks large numbers are "broken" IPs, thus it says they
can't be an origin. Unfortunately we use that same code path to identify possible
origin prefixes, and origins can start with a number. Thus we end up searching
rather than autofilling the origin.

For now, fix the heuristic for IPs; in the future we may evaluate splitting these
code paths.
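
For example (illustrative, not from the patch itself): with the fixed heuristic a
standalone number like "12345" is still classified as a possible origin, so an
origin starting with those digits can be autofilled instead of falling back to
search.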

Differential Revision: https://phabricator.services.mozilla.com/D48187

--HG--
extra : moz-landing-system : lando
2019-10-07 14:16:02 +00:00


/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

"use strict";

/**
 * This module exports a tokenizer to be used by the urlbar model.
 * Emitted tokens are objects in the shape { type, value }, where type is one
 * of UrlbarTokenizer.TYPE.
 */

var EXPORTED_SYMBOLS = ["UrlbarTokenizer"];

const { XPCOMUtils } = ChromeUtils.import(
  "resource://gre/modules/XPCOMUtils.jsm"
);
const { Services } = ChromeUtils.import("resource://gre/modules/Services.jsm");

ChromeUtils.defineModuleGetter(this, "Log", "resource://gre/modules/Log.jsm");

XPCOMUtils.defineLazyGetter(this, "logger", () =>
  Log.repository.getLogger("Urlbar.Tokenizer")
);
var UrlbarTokenizer = {
  // Regex matching on whitespaces.
  REGEXP_SPACES: /\s+/,

  // Regexes used to guess url-like strings.
  // These are not expected to be 100% correct; we accept some user mistypes,
  // and we're unlikely to be able to cover 100% of the cases.
  REGEXP_LIKE_PROTOCOL: /^[A-Z+.-]+:\/*(?!\/)/i,
  REGEXP_USERINFO_INVALID_CHARS: /[^\w.~%!$&'()*+,;=:-]/,
  REGEXP_HOSTPORT_INVALID_CHARS: /[^\[\]A-Z0-9.:-]/i,
  REGEXP_SINGLE_WORD_HOST: /^[^.:]+$/i,
  REGEXP_HOSTPORT_IP_LIKE: /^(?=(.*[.:].*){2})[a-f0-9\.\[\]:]+$/i,
  // This accepts partial IPv4.
  REGEXP_HOSTPORT_INVALID_IP: /\.{2,}|\d{5,}|\d{4,}(?![:\]])|^\.|^(\d+\.){4,}\d+$|^\d{4,}$/,
  // This only accepts complete IPv4.
  REGEXP_HOSTPORT_IPV4: /^(\d{1,3}\.){3}\d{1,3}(:\d+)?$/,
  // This accepts partial IPv6.
  REGEXP_HOSTPORT_IPV6: /^\[([0-9a-f]{0,4}:){0,7}[0-9a-f]{0,4}\]?$/i,
  REGEXP_COMMON_EMAIL: /^[\w!#$%&'*+\/=?^`{|}~-]+@[\[\]A-Z0-9.-]+$/i,
  // Regex matching a percent encoded char at the beginning of a string.
  REGEXP_PERCENT_ENCODED_START: /^(%[0-9a-f]{2}){2,}/i,
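
  // Examples (illustrative): "127.0.0.1" matches REGEXP_HOSTPORT_IPV4 and
  // "[::1]" matches REGEXP_HOSTPORT_IPV6, while "1.2.3.4.5" matches the
  // ^(\d+\.){4,}\d+$ branch of REGEXP_HOSTPORT_INVALID_IP and is thus
  // treated as an invalid IP.
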
  TYPE: {
    TEXT: 1,
    POSSIBLE_ORIGIN: 2, // It may be an IP, a domain, or even just a single word used as host.
    POSSIBLE_URL: 3, // Consumers should still check this with a fixup.
    RESTRICT_HISTORY: 4,
    RESTRICT_BOOKMARK: 5,
    RESTRICT_TAG: 6,
    RESTRICT_OPENPAGE: 7,
    RESTRICT_SEARCH: 8,
    RESTRICT_TITLE: 9,
    RESTRICT_URL: 10,
  },

  // The special characters below can be typed into the urlbar to restrict
  // the search to a certain category, like history, bookmarks or open pages;
  // or to force a match on just the title or url.
  // These restriction characters can be typed alone, or at word boundaries,
  // provided their meaning cannot be confused: for example, # could be
  // present in a valid url, and thus should not be interpreted as a
  // restriction there.
  RESTRICT: {
    HISTORY: "^",
    BOOKMARK: "*",
    TAG: "+",
    OPENPAGE: "%",
    SEARCH: "?",
    TITLE: "#",
    URL: "$",
  },
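
  // For example (illustrative): typing "firefox *" or "* firefox" restricts
  // matching to bookmarked pages, while a trailing "#" as in "firefox#"
  // restricts matching to page titles.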

  /**
   * Returns whether the passed in token looks like a URL.
   * This is based on guessing and heuristics, which means that if this
   * function returns false, it's surely not a URL; if it returns true, the
   * result must still be verified through URIFixup.
   *
   * @param {string} token
   *        The string token to verify
   * @param {object} options {
   *          requirePath: the url must have a path
   *        }
   * @returns {boolean} whether the token looks like a URL.
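   *
   * Examples (illustrative, not exhaustive):
   *   looksLikeUrl("https://mozilla.org")  // true, has a protocol
   *   looksLikeUrl("mozilla.org/firefox")  // true, origin followed by a path
   *   looksLikeUrl("hello world")          // false, contains whitespace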
   */
  looksLikeUrl(token, options = {}) {
    if (token.length < 2) {
      return false;
    }

    // It should be a single word.
    if (this.REGEXP_SPACES.test(token)) {
      return false;
    }

    // If it starts with something that looks like a protocol, it's likely a url.
    if (this.REGEXP_LIKE_PROTOCOL.test(token)) {
      return true;
    }

    // Guess path and prePath. At this point we should be analyzing strings not
    // having a protocol.
    let slashIndex = token.indexOf("/");
    let prePath = slashIndex != -1 ? token.slice(0, slashIndex) : token;
    if (!this.looksLikeOrigin(prePath)) {
      return false;
    }

    let path = slashIndex != -1 ? token.slice(slashIndex) : "";
    logger.debug("path", path);
    if (options.requirePath && !path) {
      return false;
    }

    // If there are both path and userinfo, it's likely a url.
    let atIndex = prePath.indexOf("@");
    let userinfo = atIndex != -1 ? prePath.slice(0, atIndex) : "";
    if (path.length && userinfo.length) {
      return true;
    }

    // If the first character after the slash in the path is a letter, then the
    // token may be an "abc/def" url.
    if (/^\/[a-z]/i.test(path)) {
      return true;
    }

    // If the path contains special chars, it is likely a url.
    if (["%", "?", "#"].some(c => path.includes(c))) {
      return true;
    }

    // The above looksLikeOrigin call told us the prePath looks like an origin,
    // now we go into details checking some common origins.
    let hostPort = atIndex != -1 ? prePath.slice(atIndex + 1) : prePath;
    if (this.REGEXP_HOSTPORT_IPV4.test(hostPort)) {
      return true;
    }
    // IPv6 is very complex to support; just check for a few chars.
    if (
      this.REGEXP_HOSTPORT_IPV6.test(hostPort) &&
      ["[", "]", ":"].some(c => hostPort.includes(c))
    ) {
      return true;
    }
    if (Services.uriFixup.isDomainWhitelisted(hostPort, -1)) {
      return true;
    }
    return false;
  },

  /**
   * Returns whether the passed in token looks like an origin.
   * This is based on guessing and heuristics, which means that if this
   * function returns false, it's surely not an origin; if it returns true,
   * the result must still be verified through URIFixup.
   *
   * @param {string} token
   *        The string token to verify
   * @returns {boolean} whether the token looks like an origin.
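   *
   * Examples (illustrative, not exhaustive):
   *   looksLikeOrigin("localhost")         // true, single word host
   *   looksLikeOrigin("127.0.0.1")         // true, complete IPv4
   *   looksLikeOrigin("user@example.com")  // false, handled as an email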
   */
  looksLikeOrigin(token) {
    if (!token.length) {
      return false;
    }
    let atIndex = token.indexOf("@");
    if (atIndex != -1 && this.REGEXP_COMMON_EMAIL.test(token)) {
      // We prefer handling it as an email rather than an origin with userinfo.
      return false;
    }
    let userinfo = atIndex != -1 ? token.slice(0, atIndex) : "";
    let hostPort = atIndex != -1 ? token.slice(atIndex + 1) : token;
    logger.debug("userinfo", userinfo);
    logger.debug("hostPort", hostPort);
    if (
      this.REGEXP_HOSTPORT_IPV4.test(hostPort) ||
      this.REGEXP_HOSTPORT_IPV6.test(hostPort)
    ) {
      return true;
    }

    // Check for invalid chars.
    return (
      !this.REGEXP_LIKE_PROTOCOL.test(hostPort) &&
      !this.REGEXP_USERINFO_INVALID_CHARS.test(userinfo) &&
      !this.REGEXP_HOSTPORT_INVALID_CHARS.test(hostPort) &&
      (this.REGEXP_SINGLE_WORD_HOST.test(hostPort) ||
        !this.REGEXP_HOSTPORT_IP_LIKE.test(hostPort) ||
        !this.REGEXP_HOSTPORT_INVALID_IP.test(hostPort))
    );
  },

  /**
   * Tokenizes the searchString from a UrlbarQueryContext.
   *
   * @param {UrlbarQueryContext} queryContext
   *        The query context object to tokenize
   * @returns {UrlbarQueryContext} the same query context object with a new
   *          tokens property.
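   *
   * Example (illustrative): for a searchString of "mozilla firefox", tokens
   * will contain two POSSIBLE_ORIGIN token objects, one per word.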
   */
  tokenize(queryContext) {
    logger.info("Tokenizing", queryContext);
    let searchString = queryContext.searchString;
    if (!searchString.trim()) {
      queryContext.tokens = [];
      return queryContext;
    }
    let unfiltered = splitString(searchString);
    let tokens = filterTokens(unfiltered);
    queryContext.tokens = tokens;
    return queryContext;
  },

  /**
   * Given a token object, tells whether it's a restriction token.
   *
   * @param {object} token
   *        The token object to check.
   * @returns {boolean} Whether the token is a restriction token.
   */
  isRestrictionToken(token) {
    return (
      token.type >= this.TYPE.RESTRICT_HISTORY &&
      token.type <= this.TYPE.RESTRICT_URL
    );
  },
};
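
// Maps a restriction character to its restriction token type; for example,
// "^" maps to UrlbarTokenizer.TYPE.RESTRICT_HISTORY and "*" to
// UrlbarTokenizer.TYPE.RESTRICT_BOOKMARK.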
const CHAR_TO_TYPE_MAP = new Map(
  Object.entries(UrlbarTokenizer.RESTRICT).map(([type, char]) => [
    char,
    UrlbarTokenizer.TYPE[`RESTRICT_${type}`],
  ])
);

/**
 * Given a search string, splits it into string tokens.
 *
 * @param {string} searchString
 *        The search string to split
 * @returns {array} An array of string tokens.
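 *
 * Example (illustrative): splitString("?mozilla") returns ["?", "mozilla"],
 * splitting off the leading restriction char into its own token.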
*/
function splitString(searchString) {
  // The first step is splitting on unicode whitespaces.
  let tokens = searchString.trim().split(UrlbarTokenizer.REGEXP_SPACES);
  let accumulator = [];
  let hasRestrictionToken = tokens.some(t => CHAR_TO_TYPE_MAP.has(t));
  let chars = Array.from(CHAR_TO_TYPE_MAP.keys()).join("");
  logger.debug("Restriction chars", chars);
  for (let i = 0; i < tokens.length; ++i) {
    // If there is no separate restriction token, we may have to split a
    // token: the first one if it includes a leading restriction char, or the
    // last one if it includes a trailing restriction char.
    // This avoids requiring the user to add artificial whitespace to enforce
    // restrictions; for example, typing "questions?" would restrict to
    // search results.
    let token = tokens[i];
    if (!hasRestrictionToken && token.length > 1) {
      // Check for an unambiguous restriction char at the beginning of the
      // first token, or at the end of the last token.
      if (
        i == 0 &&
        chars.includes(token[0]) &&
        !UrlbarTokenizer.REGEXP_PERCENT_ENCODED_START.test(token)
      ) {
        hasRestrictionToken = true;
        accumulator.push(token[0]);
        accumulator.push(token.slice(1));
        continue;
      } else if (
        i == tokens.length - 1 &&
        chars.includes(token[token.length - 1]) &&
        !UrlbarTokenizer.looksLikeUrl(token, { requirePath: true })
      ) {
        hasRestrictionToken = true;
        accumulator.push(token.slice(0, token.length - 1));
        accumulator.push(token[token.length - 1]);
        continue;
      }
    }
    accumulator.push(token);
  }
  logger.info("Found tokens", accumulator);
  return accumulator;
}

/**
 * Given an array of unfiltered tokens, this function filters them and converts
 * them to token objects with a type.
 *
 * @param {array} tokens
 *        An array of strings, representing search tokens.
 * @returns {array} An array of token objects.
 * @note Restriction characters are only considered if they appear at the start
 *       or at the end of the tokens list. If restriction characters conflict,
 *       the outermost ones win, and leading ones win over trailing ones.
 *       Discarded restriction characters are considered text.
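 *
 * Example (illustrative): for the tokens ["*", "firefox"], "*" becomes a
 * RESTRICT_BOOKMARK token and "firefox" becomes a POSSIBLE_ORIGIN token,
 * since a single word may be a host.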
*/
function filterTokens(tokens) {
  let filtered = [];
  let restrictions = [];
  for (let i = 0; i < tokens.length; ++i) {
    let token = tokens[i];
    let tokenObj = {
      value: token,
      lowerCaseValue: token.toLocaleLowerCase(),
      type: UrlbarTokenizer.TYPE.TEXT,
    };
    let restrictionType = CHAR_TO_TYPE_MAP.get(token);
    if (restrictionType) {
      restrictions.push({ index: i, type: restrictionType });
    } else if (UrlbarTokenizer.looksLikeOrigin(token)) {
      tokenObj.type = UrlbarTokenizer.TYPE.POSSIBLE_ORIGIN;
    } else if (UrlbarTokenizer.looksLikeUrl(token, { requirePath: true })) {
      tokenObj.type = UrlbarTokenizer.TYPE.POSSIBLE_URL;
    }
    filtered.push(tokenObj);
  }
  // Handle restriction characters.
  if (restrictions.length) {
    // We can apply two kinds of restrictions: type (bookmark, search, ...)
    // and matching (url, title). These can be combined, but we can only have
    // one restriction per kind.
    let matchingRestrictionFound = false;
    let typeRestrictionFound = false;
    function assignRestriction(r) {
      if (r && !(matchingRestrictionFound && typeRestrictionFound)) {
        if (
          [
            UrlbarTokenizer.TYPE.RESTRICT_TITLE,
            UrlbarTokenizer.TYPE.RESTRICT_URL,
          ].includes(r.type)
        ) {
          if (!matchingRestrictionFound) {
            matchingRestrictionFound = true;
            filtered[r.index].type = r.type;
            return true;
          }
        } else if (!typeRestrictionFound) {
          typeRestrictionFound = true;
          filtered[r.index].type = r.type;
          return true;
        }
      }
      return false;
    }

    // Look at the first token.
    let found = assignRestriction(restrictions.find(r => r.index == 0));
    if (found) {
      // If the first token was assigned, look at the next one.
      assignRestriction(restrictions.find(r => r.index == 1));
    }
    // Then look at the last token.
    let lastIndex = tokens.length - 1;
    found = assignRestriction(restrictions.find(r => r.index == lastIndex));
    if (found) {
      // If the last token was assigned, look at the previous one.
      assignRestriction(restrictions.find(r => r.index == lastIndex - 1));
    }
  }
  logger.info("Filtered Tokens", filtered);
  return filtered;
}