/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

// https://drafts.csswg.org/css-syntax/#tokenization

use self::Token::*;
use crate::cow_rc_str::CowRcStr;
use crate::parser::ParserState;
use matches::matches;
use std::char;
use std::i32;
use std::ops::Range;

/// One of the pieces the CSS input is broken into.
///
/// Some components use `Cow` in order to borrow from the original input string
/// and avoid allocating/copying when possible.
#[derive(PartialEq, Debug, Clone)]
pub enum Token<'a> {
    /// A [`<ident-token>`](https://drafts.csswg.org/css-syntax/#ident-token-diagram)
    Ident(CowRcStr<'a>),

    /// A [`<at-keyword-token>`](https://drafts.csswg.org/css-syntax/#at-keyword-token-diagram)
    ///
    /// The value does not include the `@` marker.
    AtKeyword(CowRcStr<'a>),

    /// A [`<hash-token>`](https://drafts.csswg.org/css-syntax/#hash-token-diagram) with the type flag set to "unrestricted"
    ///
    /// The value does not include the `#` marker.
    Hash(CowRcStr<'a>),

    /// A [`<hash-token>`](https://drafts.csswg.org/css-syntax/#hash-token-diagram) with the type flag set to "id"
    ///
    /// The value does not include the `#` marker.
    IDHash(CowRcStr<'a>), // Hash that is a valid ID selector.

    /// A [`<string-token>`](https://drafts.csswg.org/css-syntax/#string-token-diagram)
    ///
    /// The value does not include the quotes.
    QuotedString(CowRcStr<'a>),

    /// A [`<url-token>`](https://drafts.csswg.org/css-syntax/#url-token-diagram)
    ///
    /// The value does not include the `url(` `)` markers. Note that `url( <string-token> )` is represented by a
    /// `Function` token.
    UnquotedUrl(CowRcStr<'a>),

    /// A `<delim-token>`
    Delim(char),

    /// A [`<number-token>`](https://drafts.csswg.org/css-syntax/#number-token-diagram)
    Number {
        /// Whether the number had a `+` or `-` sign.
        ///
        /// This is used in some cases like the <An+B> micro syntax. (See the `parse_nth` function.)
        has_sign: bool,

        /// The value as a float
        value: f32,

        /// If the original source did not include a fractional part, the value as an integer.
        int_value: Option<i32>,
    },

    /// A [`<percentage-token>`](https://drafts.csswg.org/css-syntax/#percentage-token-diagram)
    Percentage {
        /// Whether the number had a `+` or `-` sign.
        has_sign: bool,

        /// The value as a float, divided by 100 so that the nominal range is 0.0 to 1.0.
        unit_value: f32,

        /// If the original source did not include a fractional part, the value as an integer.
        /// It is **not** divided by 100.
        int_value: Option<i32>,
    },

    /// A [`<dimension-token>`](https://drafts.csswg.org/css-syntax/#dimension-token-diagram)
    Dimension {
        /// Whether the number had a `+` or `-` sign.
        ///
        /// This is used in some cases like the <An+B> micro syntax. (See the `parse_nth` function.)
        has_sign: bool,

        /// The value as a float
        value: f32,

        /// If the original source did not include a fractional part, the value as an integer.
        int_value: Option<i32>,

        /// The unit, e.g. "px" in `12px`
        unit: CowRcStr<'a>,
    },

    /// A [`<whitespace-token>`](https://drafts.csswg.org/css-syntax/#whitespace-token-diagram)
    WhiteSpace(&'a str),

    /// A comment.
    ///
    /// The CSS Syntax spec does not generate tokens for comments,
    /// but we do, because we can (borrowed &str makes it cheap).
    ///
    /// The value does not include the `/*` `*/` markers.
    Comment(&'a str),

    /// A `:` `<colon-token>`
    Colon, // :

    /// A `;` `<semicolon-token>`
    Semicolon, // ;

    /// A `,` `<comma-token>`
    Comma, // ,

    /// A `~=` [`<include-match-token>`](https://drafts.csswg.org/css-syntax/#include-match-token-diagram)
    IncludeMatch,

    /// A `|=` [`<dash-match-token>`](https://drafts.csswg.org/css-syntax/#dash-match-token-diagram)
    DashMatch,

    /// A `^=` [`<prefix-match-token>`](https://drafts.csswg.org/css-syntax/#prefix-match-token-diagram)
    PrefixMatch,

    /// A `$=` [`<suffix-match-token>`](https://drafts.csswg.org/css-syntax/#suffix-match-token-diagram)
    SuffixMatch,

    /// A `*=` [`<substring-match-token>`](https://drafts.csswg.org/css-syntax/#substring-match-token-diagram)
    SubstringMatch,

    /// A `<!--` [`<CDO-token>`](https://drafts.csswg.org/css-syntax/#CDO-token-diagram)
    CDO,

    /// A `-->` [`<CDC-token>`](https://drafts.csswg.org/css-syntax/#CDC-token-diagram)
    CDC,

    /// A [`<function-token>`](https://drafts.csswg.org/css-syntax/#function-token-diagram)
    ///
    /// The value (name) does not include the `(` marker.
    Function(CowRcStr<'a>),

    /// A `<(-token>`
    ParenthesisBlock,

    /// A `<[-token>`
    SquareBracketBlock,

    /// A `<{-token>`
    CurlyBracketBlock,

    /// A `<bad-url-token>`
    ///
    /// This token always indicates a parse error.
    BadUrl(CowRcStr<'a>),

    /// A `<bad-string-token>`
    ///
    /// This token always indicates a parse error.
    BadString(CowRcStr<'a>),

    /// A `<)-token>`
    ///
    /// When obtained from one of the `Parser::next*` methods,
    /// this token is always unmatched and indicates a parse error.
    CloseParenthesis,

    /// A `<]-token>`
    ///
    /// When obtained from one of the `Parser::next*` methods,
    /// this token is always unmatched and indicates a parse error.
    CloseSquareBracket,

    /// A `<}-token>`
    ///
    /// When obtained from one of the `Parser::next*` methods,
    /// this token is always unmatched and indicates a parse error.
    CloseCurlyBracket,
}

impl<'a> Token<'a> {
    /// Return whether this token represents a parse error.
    ///
    /// `BadUrl` and `BadString` are tokenizer-level parse errors.
    ///
    /// `CloseParenthesis`, `CloseSquareBracket`, and `CloseCurlyBracket` are *unmatched*
    /// and therefore parse errors when returned by one of the `Parser::next*` methods.
    pub fn is_parse_error(&self) -> bool {
        matches!(
            *self,
            BadUrl(_) | BadString(_) | CloseParenthesis | CloseSquareBracket | CloseCurlyBracket
        )
    }
}
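
// Added example (not in the upstream file): a minimal sketch of how code in
// this crate can drive the tokenizer and match on `Token` values. `Tokenizer`
// is crate-internal, so external users go through `Parser` instead; `next()`
// returns `Err(())` once the input is exhausted, which ends the loop below.
//
//     let mut tokenizer = Tokenizer::new("12px /* note */ foo");
//     while let Ok(token) = tokenizer.next() {
//         match token {
//             Dimension { value, ref unit, .. } => { /* 12.0 and "px" */ }
//             Comment(text) => { /* " note " */ }
//             Ident(ref name) => { /* "foo" */ }
//             _ => {}
//         }
//     }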

#[derive(Clone)]
pub struct Tokenizer<'a> {
    input: &'a str,
    /// Counted in bytes, not code points. From 0.
    position: usize,
    /// The position at the start of the current line; but adjusted to
    /// ensure that computing the column will give the result in units
    /// of UTF-16 characters.
    current_line_start_position: usize,
    current_line_number: u32,
    var_or_env_functions: SeenStatus,
    source_map_url: Option<&'a str>,
    source_url: Option<&'a str>,
}

#[derive(Copy, Clone, PartialEq, Eq)]
enum SeenStatus {
    DontCare,
    LookingForThem,
    SeenAtLeastOne,
}

impl<'a> Tokenizer<'a> {
    #[inline]
    pub fn new(input: &str) -> Tokenizer {
        Tokenizer::with_first_line_number(input, 0)
    }

    #[inline]
    pub fn with_first_line_number(input: &str, first_line_number: u32) -> Tokenizer {
        Tokenizer {
            input: input,
            position: 0,
            current_line_start_position: 0,
            current_line_number: first_line_number,
            var_or_env_functions: SeenStatus::DontCare,
            source_map_url: None,
            source_url: None,
        }
    }

    #[inline]
    pub fn look_for_var_or_env_functions(&mut self) {
        self.var_or_env_functions = SeenStatus::LookingForThem;
    }

    #[inline]
    pub fn seen_var_or_env_functions(&mut self) -> bool {
        let seen = self.var_or_env_functions == SeenStatus::SeenAtLeastOne;
        self.var_or_env_functions = SeenStatus::DontCare;
        seen
    }

    #[inline]
    pub fn see_function(&mut self, name: &str) {
        if self.var_or_env_functions == SeenStatus::LookingForThem {
            if name.eq_ignore_ascii_case("var") || name.eq_ignore_ascii_case("env") {
                self.var_or_env_functions = SeenStatus::SeenAtLeastOne;
            }
        }
    }

    #[inline]
    pub fn next(&mut self) -> Result<Token<'a>, ()> {
        next_token(self)
    }

    #[inline]
    pub fn position(&self) -> SourcePosition {
        SourcePosition(self.position)
    }

    #[inline]
    pub fn current_source_location(&self) -> SourceLocation {
        SourceLocation {
            line: self.current_line_number,
            column: (self.position - self.current_line_start_position + 1) as u32,
        }
    }

    #[inline]
    pub fn current_source_map_url(&self) -> Option<&'a str> {
        self.source_map_url
    }

    #[inline]
    pub fn current_source_url(&self) -> Option<&'a str> {
        self.source_url
    }

    #[inline]
    pub fn state(&self) -> ParserState {
        ParserState {
            position: self.position,
            current_line_start_position: self.current_line_start_position,
            current_line_number: self.current_line_number,
            at_start_of: None,
        }
    }

    #[inline]
    pub fn reset(&mut self, state: &ParserState) {
        self.position = state.position;
        self.current_line_start_position = state.current_line_start_position;
        self.current_line_number = state.current_line_number;
    }

    #[inline]
    pub fn slice_from(&self, start_pos: SourcePosition) -> &'a str {
        &self.input[start_pos.0..self.position]
    }

    #[inline]
    pub fn slice(&self, range: Range<SourcePosition>) -> &'a str {
        &self.input[range.start.0..range.end.0]
    }

    pub fn current_source_line(&self) -> &'a str {
        let current = self.position;
        let start = self.input[0..current]
            .rfind(|c| matches!(c, '\r' | '\n' | '\x0C'))
            .map_or(0, |start| start + 1);
        let end = self.input[current..]
            .find(|c| matches!(c, '\r' | '\n' | '\x0C'))
            .map_or(self.input.len(), |end| current + end);
        &self.input[start..end]
    }

    #[inline]
    pub fn next_byte(&self) -> Option<u8> {
        if self.is_eof() {
            None
        } else {
            Some(self.input.as_bytes()[self.position])
        }
    }

    // If false, `tokenizer.next_char()` will not panic.
    #[inline]
    fn is_eof(&self) -> bool {
        !self.has_at_least(0)
    }

    // If true, the input has at least `n` bytes left *after* the current one.
    // That is, `tokenizer.byte_at(n)` will not panic.
    #[inline]
    fn has_at_least(&self, n: usize) -> bool {
        self.position + n < self.input.len()
    }

    // Advance over N bytes in the input. This function can advance
    // over ASCII bytes (excluding newlines), or UTF-8 sequence
    // leaders (excluding leaders for 4-byte sequences).
    #[inline]
    pub fn advance(&mut self, n: usize) {
        if cfg!(debug_assertions) {
            // Each byte must either be an ASCII byte or a sequence
            // leader, but not a 4-byte leader; also newlines are
            // rejected.
            for i in 0..n {
                let b = self.byte_at(i);
                debug_assert!(b.is_ascii() || (b & 0xF0 != 0xF0 && b & 0xC0 != 0x80));
                debug_assert!(b != b'\r' && b != b'\n' && b != b'\x0C');
            }
        }
        self.position += n
    }

    // Assumes non-EOF
    #[inline]
    fn next_byte_unchecked(&self) -> u8 {
        self.byte_at(0)
    }

    #[inline]
    fn byte_at(&self, offset: usize) -> u8 {
        self.input.as_bytes()[self.position + offset]
    }

    // Advance over a single byte; the byte must be a UTF-8 sequence
    // leader for a 4-byte sequence.
    #[inline]
    fn consume_4byte_intro(&mut self) {
        debug_assert!(self.next_byte_unchecked() & 0xF0 == 0xF0);
        // This takes two UTF-16 characters to represent, so we
        // actually have an undercount.
        self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
        self.position += 1;
    }

    // Advance over a single byte; the byte must be a UTF-8
    // continuation byte.
    #[inline]
    fn consume_continuation_byte(&mut self) {
        debug_assert!(self.next_byte_unchecked() & 0xC0 == 0x80);
        // Continuation bytes contribute to column overcount. Note
        // that due to the special case for the 4-byte sequence intro,
        // we must use wrapping add here.
        self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
        self.position += 1;
    }

    // Advance over any kind of byte, excluding newlines.
    #[inline(never)]
    fn consume_known_byte(&mut self, byte: u8) {
        debug_assert!(byte != b'\r' && byte != b'\n' && byte != b'\x0C');
        self.position += 1;
        // Continuation bytes contribute to column overcount.
        if byte & 0xF0 == 0xF0 {
            // This takes two UTF-16 characters to represent, so we
            // actually have an undercount.
            self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
        } else if byte & 0xC0 == 0x80 {
            // Note that due to the special case for the 4-byte
            // sequence intro, we must use wrapping add here.
            self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
        }
    }

    #[inline]
    fn next_char(&self) -> char {
        self.input[self.position..].chars().next().unwrap()
    }

    // Given that a newline has been seen, advance over the newline
    // and update the state.
    #[inline]
    fn consume_newline(&mut self) {
        let byte = self.next_byte_unchecked();
        debug_assert!(byte == b'\r' || byte == b'\n' || byte == b'\x0C');
        self.position += 1;
        if byte == b'\r' && self.next_byte() == Some(b'\n') {
            self.position += 1;
        }
        self.current_line_start_position = self.position;
        self.current_line_number += 1;
    }

    #[inline]
    fn has_newline_at(&self, offset: usize) -> bool {
        self.position + offset < self.input.len()
            && matches!(self.byte_at(offset), b'\n' | b'\r' | b'\x0C')
    }

    #[inline]
    fn consume_char(&mut self) -> char {
        let c = self.next_char();
        let len_utf8 = c.len_utf8();
        self.position += len_utf8;
        // Note that due to the special case for the 4-byte sequence
        // intro, we must use wrapping add here.
        self.current_line_start_position = self
            .current_line_start_position
            .wrapping_add(len_utf8 - c.len_utf16());
        c
    }

    #[inline]
    fn starts_with(&self, needle: &[u8]) -> bool {
        self.input.as_bytes()[self.position..].starts_with(needle)
    }

    pub fn skip_whitespace(&mut self) {
        while !self.is_eof() {
            match_byte! { self.next_byte_unchecked(),
                b' ' | b'\t' => {
                    self.advance(1)
                },
                b'\n' | b'\x0C' | b'\r' => {
                    self.consume_newline();
                },
                b'/' => {
                    if self.starts_with(b"/*") {
                        consume_comment(self);
                    } else {
                        return
                    }
                }
                _ => {
                    return
                }
            }
        }
    }

    pub fn skip_cdc_and_cdo(&mut self) {
        while !self.is_eof() {
            match_byte! { self.next_byte_unchecked(),
                b' ' | b'\t' => {
                    self.advance(1)
                },
                b'\n' | b'\x0C' | b'\r' => {
                    self.consume_newline();
                },
                b'/' => {
                    if self.starts_with(b"/*") {
                        consume_comment(self);
                    } else {
                        return
                    }
                }
                b'<' => {
                    if self.starts_with(b"<!--") {
                        self.advance(4)
                    } else {
                        return
                    }
                }
                b'-' => {
                    if self.starts_with(b"-->") {
                        self.advance(3)
                    } else {
                        return
                    }
                }
                _ => {
                    return
                }
            }
        }
    }
}
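
// Added note (not in the upstream file): `state()` / `reset()` make speculative
// tokenization cheap, since a `ParserState` is just the byte position plus the
// line bookkeeping fields. A sketch of the usual pattern:
//
//     let start = tokenizer.state();
//     match tokenizer.next() {
//         Ok(token) if looks_interesting(&token) => { /* keep going */ }
//         _ => tokenizer.reset(&start), // rewind as if nothing had been read
//     }
//
// `looks_interesting` is a made-up placeholder, not an API of this module.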

/// A position from the start of the input, counted in UTF-8 bytes.
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
pub struct SourcePosition(pub(crate) usize);

impl SourcePosition {
    /// Returns the current byte index in the original input.
    #[inline]
    pub fn byte_index(&self) -> usize {
        self.0
    }
}

/// The line and column number for a given position within the input.
#[derive(PartialEq, Eq, Debug, Clone, Copy)]
pub struct SourceLocation {
    /// The line number, starting at 0 for the first line, unless `with_first_line_number` was used.
    pub line: u32,

    /// The column number within a line, starting at 1 for the first character of the line.
    /// Column numbers are counted in UTF-16 code units.
    pub column: u32,
}
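
// Added note (not in the upstream file): columns are reported in UTF-16 code
// units, which is why the byte-oriented helpers above adjust
// `current_line_start_position` instead of counting characters. For example,
// "é" is two UTF-8 bytes but one UTF-16 unit, so `consume_continuation_byte`
// nudges the line-start position forward by one to cancel the extra byte,
// while a 4-byte sequence (two UTF-16 units) gets the opposite adjustment in
// `consume_4byte_intro`; the subtraction in `current_source_location` then
// yields a UTF-16 column without re-scanning the line.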

fn next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
    if tokenizer.is_eof() {
        return Err(());
    }
    let b = tokenizer.next_byte_unchecked();
    let token = match_byte! { b,
        b' ' | b'\t' => {
            consume_whitespace(tokenizer, false)
        },
        b'\n' | b'\x0C' | b'\r' => {
            consume_whitespace(tokenizer, true)
        },
        b'"' => { consume_string(tokenizer, false) },
        b'#' => {
            tokenizer.advance(1);
            if is_ident_start(tokenizer) { IDHash(consume_name(tokenizer)) }
            else if !tokenizer.is_eof() && match tokenizer.next_byte_unchecked() {
                // Any other valid case here already resulted in IDHash.
                b'0'..=b'9' | b'-' => true,
                _ => false,
            } { Hash(consume_name(tokenizer)) }
            else { Delim('#') }
        },
        b'$' => {
            if tokenizer.starts_with(b"$=") { tokenizer.advance(2); SuffixMatch }
            else { tokenizer.advance(1); Delim('$') }
        },
        b'\'' => { consume_string(tokenizer, true) },
        b'(' => { tokenizer.advance(1); ParenthesisBlock },
        b')' => { tokenizer.advance(1); CloseParenthesis },
        b'*' => {
            if tokenizer.starts_with(b"*=") { tokenizer.advance(2); SubstringMatch }
            else { tokenizer.advance(1); Delim('*') }
        },
        b'+' => {
            if (
                tokenizer.has_at_least(1)
                && matches!(tokenizer.byte_at(1), b'0'..=b'9')
            ) || (
                tokenizer.has_at_least(2)
                && tokenizer.byte_at(1) == b'.'
                && matches!(tokenizer.byte_at(2), b'0'..=b'9')
            ) {
                consume_numeric(tokenizer)
            } else {
                tokenizer.advance(1);
                Delim('+')
            }
        },
        b',' => { tokenizer.advance(1); Comma },
        b'-' => {
            if (
                tokenizer.has_at_least(1)
                && matches!(tokenizer.byte_at(1), b'0'..=b'9')
            ) || (
                tokenizer.has_at_least(2)
                && tokenizer.byte_at(1) == b'.'
                && matches!(tokenizer.byte_at(2), b'0'..=b'9')
            ) {
                consume_numeric(tokenizer)
            } else if tokenizer.starts_with(b"-->") {
                tokenizer.advance(3);
                CDC
            } else if is_ident_start(tokenizer) {
                consume_ident_like(tokenizer)
            } else {
                tokenizer.advance(1);
                Delim('-')
            }
        },
        b'.' => {
            if tokenizer.has_at_least(1)
                && matches!(tokenizer.byte_at(1), b'0'..=b'9')
            {
                consume_numeric(tokenizer)
            } else {
                tokenizer.advance(1);
                Delim('.')
            }
        }
        b'/' => {
            if tokenizer.starts_with(b"/*") {
                Comment(consume_comment(tokenizer))
            } else {
                tokenizer.advance(1);
                Delim('/')
            }
        }
        b'0'..=b'9' => { consume_numeric(tokenizer) },
        b':' => { tokenizer.advance(1); Colon },
        b';' => { tokenizer.advance(1); Semicolon },
        b'<' => {
            if tokenizer.starts_with(b"<!--") {
                tokenizer.advance(4);
                CDO
            } else {
                tokenizer.advance(1);
                Delim('<')
            }
        },
        b'@' => {
            tokenizer.advance(1);
            if is_ident_start(tokenizer) { AtKeyword(consume_name(tokenizer)) }
            else { Delim('@') }
        },
        b'a'..=b'z' | b'A'..=b'Z' | b'_' | b'\0' => { consume_ident_like(tokenizer) },
        b'[' => { tokenizer.advance(1); SquareBracketBlock },
        b'\\' => {
            if !tokenizer.has_newline_at(1) { consume_ident_like(tokenizer) }
            else { tokenizer.advance(1); Delim('\\') }
        },
        b']' => { tokenizer.advance(1); CloseSquareBracket },
        b'^' => {
            if tokenizer.starts_with(b"^=") { tokenizer.advance(2); PrefixMatch }
            else { tokenizer.advance(1); Delim('^') }
        },
        b'{' => { tokenizer.advance(1); CurlyBracketBlock },
        b'|' => {
            if tokenizer.starts_with(b"|=") { tokenizer.advance(2); DashMatch }
            else { tokenizer.advance(1); Delim('|') }
        },
        b'}' => { tokenizer.advance(1); CloseCurlyBracket },
        b'~' => {
            if tokenizer.starts_with(b"~=") { tokenizer.advance(2); IncludeMatch }
            else { tokenizer.advance(1); Delim('~') }
        },
        _ => {
            if !b.is_ascii() {
                consume_ident_like(tokenizer)
            } else {
                tokenizer.advance(1);
                Delim(b as char)
            }
        },
    };
    Ok(token)
}

fn consume_whitespace<'a>(tokenizer: &mut Tokenizer<'a>, newline: bool) -> Token<'a> {
    let start_position = tokenizer.position();
    if newline {
        tokenizer.consume_newline();
    } else {
        tokenizer.advance(1);
    }
    while !tokenizer.is_eof() {
        let b = tokenizer.next_byte_unchecked();
        match_byte! { b,
            b' ' | b'\t' => {
                tokenizer.advance(1);
            }
            b'\n' | b'\x0C' | b'\r' => {
                tokenizer.consume_newline();
            }
            _ => {
                break
            }
        }
    }
    WhiteSpace(tokenizer.slice_from(start_position))
}

// Check for sourceMappingURL or sourceURL comments and update the
// tokenizer appropriately.
fn check_for_source_map<'a>(tokenizer: &mut Tokenizer<'a>, contents: &'a str) {
    let directive = "# sourceMappingURL=";
    let directive_old = "@ sourceMappingURL=";

    // If there is a source map directive, extract the URL.
    if contents.starts_with(directive) || contents.starts_with(directive_old) {
        let contents = &contents[directive.len()..];
        tokenizer.source_map_url = contents
            .split(|c| c == ' ' || c == '\t' || c == '\x0C' || c == '\r' || c == '\n')
            .next()
    }

    let directive = "# sourceURL=";
    let directive_old = "@ sourceURL=";

    // If there is a source URL directive, extract the URL.
    if contents.starts_with(directive) || contents.starts_with(directive_old) {
        let contents = &contents[directive.len()..];
        tokenizer.source_url = contents
            .split(|c| c == ' ' || c == '\t' || c == '\x0C' || c == '\r' || c == '\n')
            .next()
    }
}
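
// Added example (not in the upstream file): the directives above are only
// noticed when a comment is consumed, since `consume_comment` calls
// `check_for_source_map` with the comment's contents.
#[cfg(test)]
#[test]
fn source_map_directive_sketch() {
    let mut tokenizer = Tokenizer::new("/*# sourceMappingURL=style.css.map */");
    let _ = tokenizer.next(); // consuming the comment records the URL
    assert_eq!(tokenizer.current_source_map_url(), Some("style.css.map"));
    assert_eq!(tokenizer.current_source_url(), None);
}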

fn consume_comment<'a>(tokenizer: &mut Tokenizer<'a>) -> &'a str {
    tokenizer.advance(2); // consume "/*"
    let start_position = tokenizer.position();
    while !tokenizer.is_eof() {
        match_byte! { tokenizer.next_byte_unchecked(),
            b'*' => {
                let end_position = tokenizer.position();
                tokenizer.advance(1);
                if tokenizer.next_byte() == Some(b'/') {
                    tokenizer.advance(1);
                    let contents = tokenizer.slice(start_position..end_position);
                    check_for_source_map(tokenizer, contents);
                    return contents
                }
            }
            b'\n' | b'\x0C' | b'\r' => {
                tokenizer.consume_newline();
            }
            b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
            b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
            _ => {
                // ASCII or other leading byte.
                tokenizer.advance(1);
            }
        }
    }
    let contents = tokenizer.slice_from(start_position);
    check_for_source_map(tokenizer, contents);
    contents
}

fn consume_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool) -> Token<'a> {
    match consume_quoted_string(tokenizer, single_quote) {
        Ok(value) => QuotedString(value),
        Err(value) => BadString(value),
    }
}

/// Return `Err(..)` on syntax error (i.e. an unescaped newline)
fn consume_quoted_string<'a>(
    tokenizer: &mut Tokenizer<'a>,
    single_quote: bool,
) -> Result<CowRcStr<'a>, CowRcStr<'a>> {
    tokenizer.advance(1); // Skip the initial quote
    // start_pos is at code point boundary, after " or '
    let start_pos = tokenizer.position();
    let mut string_bytes;
    loop {
        if tokenizer.is_eof() {
            return Ok(tokenizer.slice_from(start_pos).into());
        }
        match_byte! { tokenizer.next_byte_unchecked(),
            b'"' => {
                if !single_quote {
                    let value = tokenizer.slice_from(start_pos);
                    tokenizer.advance(1);
                    return Ok(value.into())
                }
                tokenizer.advance(1);
            }
            b'\'' => {
                if single_quote {
                    let value = tokenizer.slice_from(start_pos);
                    tokenizer.advance(1);
                    return Ok(value.into())
                }
                tokenizer.advance(1);
            }
            b'\\' | b'\0' => {
                // * The tokenizer’s input is UTF-8 since it’s `&str`.
                // * start_pos is at a code point boundary
                // * so is the current position (which is before '\\' or '\0')
                //
                // So `string_bytes` is well-formed UTF-8.
                string_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                break
            }
            b'\n' | b'\r' | b'\x0C' => {
                return Err(tokenizer.slice_from(start_pos).into())
            },
            b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
            b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
            _ => {
                // ASCII or other leading byte.
                tokenizer.advance(1);
            }
        }
    }

    while !tokenizer.is_eof() {
        let b = tokenizer.next_byte_unchecked();
        match_byte! { b,
            b'\n' | b'\r' | b'\x0C' => {
                return Err(
                    // string_bytes is well-formed UTF-8, see other comments.
                    unsafe {
                        from_utf8_release_unchecked(string_bytes)
                    }.into()
                );
            }
            b'"' => {
                tokenizer.advance(1);
                if !single_quote {
                    break;
                }
            }
            b'\'' => {
                tokenizer.advance(1);
                if single_quote {
                    break;
                }
            }
            b'\\' => {
                tokenizer.advance(1);
                if !tokenizer.is_eof() {
                    match tokenizer.next_byte_unchecked() {
                        // Escaped newline
                        b'\n' | b'\x0C' | b'\r' => {
                            tokenizer.consume_newline();
                        }
                        // This pushes one well-formed code point
                        _ => consume_escape_and_write(tokenizer, &mut string_bytes)
                    }
                }
                // else: escaped EOF, do nothing.
                continue;
            }
            b'\0' => {
                tokenizer.advance(1);
                string_bytes.extend("\u{FFFD}".as_bytes());
                continue;
            }
            b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
            b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
            _ => {
                // ASCII or other leading byte.
                tokenizer.advance(1);
            },
        }

        // If this byte is part of a multi-byte code point,
        // we’ll end up copying the whole code point before this loop does something else.
        string_bytes.push(b);
    }

    Ok(
        // string_bytes is well-formed UTF-8, see other comments.
        unsafe { from_utf8_release_unchecked(string_bytes) }.into(),
    )
}
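
// Added example (not in the upstream file): results of `consume_quoted_string`
// for a few inputs (the opening quote is part of the input):
//
//     "a\"b"        -> Ok(a"b)    the backslash escape is resolved
//     "abc          -> Ok(abc)    EOF terminates the string without an error
//     "ab<newline>  -> Err(ab)    an unescaped newline yields <bad-string-token>
//     'a"b'         -> Ok(a"b)    a double quote needs no escape inside '...'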

#[inline]
fn is_ident_start(tokenizer: &mut Tokenizer) -> bool {
    !tokenizer.is_eof()
        && match_byte! { tokenizer.next_byte_unchecked(),
            b'a'..=b'z' | b'A'..=b'Z' | b'_' | b'\0' => { true },
            b'-' => {
                tokenizer.has_at_least(1) && match_byte! { tokenizer.byte_at(1),
                    b'a'..=b'z' | b'A'..=b'Z' | b'-' | b'_' | b'\0' => {
                        true
                    }
                    b'\\' => { !tokenizer.has_newline_at(1) }
                    b => { !b.is_ascii() },
                }
            },
            b'\\' => { !tokenizer.has_newline_at(1) },
            b => { !b.is_ascii() },
        }
}
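
// Added example (not in the upstream file): what `is_ident_start` answers for
// the bytes at the current position:
//
//     "foo"   -> true    ASCII letter
//     "-foo"  -> true    '-' followed by a name-start byte
//     "--x"   -> true    '-' followed by another '-'
//     "-1"    -> false   '-' followed by a digit belongs to a number instead
//     "é..."  -> true    any non-ASCII leading byte counts as a name start
//     "\A"    -> true    a backslash not followed by a newline starts an escape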

fn consume_ident_like<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
    let value = consume_name(tokenizer);
    if !tokenizer.is_eof() && tokenizer.next_byte_unchecked() == b'(' {
        tokenizer.advance(1);
        if value.eq_ignore_ascii_case("url") {
            consume_unquoted_url(tokenizer).unwrap_or(Function(value))
        } else {
            tokenizer.see_function(&value);
            Function(value)
        }
    } else {
        Ident(value)
    }
}

fn consume_name<'a>(tokenizer: &mut Tokenizer<'a>) -> CowRcStr<'a> {
    // start_pos is the end of the previous token, therefore at a code point boundary
    let start_pos = tokenizer.position();
    let mut value_bytes;
    loop {
        if tokenizer.is_eof() {
            return tokenizer.slice_from(start_pos).into();
        }
        match_byte! { tokenizer.next_byte_unchecked(),
            b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'-' => { tokenizer.advance(1) },
            b'\\' | b'\0' => {
                // * The tokenizer’s input is UTF-8 since it’s `&str`.
                // * start_pos is at a code point boundary
                // * so is the current position (which is before '\\' or '\0')
                //
                // So `value_bytes` is well-formed UTF-8.
                value_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                break
            }
            b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
            b'\xC0'..=b'\xEF' => { tokenizer.advance(1); }
            b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
            _b => {
                return tokenizer.slice_from(start_pos).into();
            }
        }
    }

    while !tokenizer.is_eof() {
        let b = tokenizer.next_byte_unchecked();
        match_byte! { b,
            b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'-' => {
                tokenizer.advance(1);
                value_bytes.push(b) // ASCII
            }
            b'\\' => {
                if tokenizer.has_newline_at(1) { break }
                tokenizer.advance(1);
                // This pushes one well-formed code point
                consume_escape_and_write(tokenizer, &mut value_bytes)
            }
            b'\0' => {
                tokenizer.advance(1);
                value_bytes.extend("\u{FFFD}".as_bytes());
            },
            b'\x80'..=b'\xBF' => {
                // This byte *is* part of a multi-byte code point,
                // we’ll end up copying the whole code point before this loop does something else.
                tokenizer.consume_continuation_byte();
                value_bytes.push(b)
            }
            b'\xC0'..=b'\xEF' => {
                // This byte *is* part of a multi-byte code point,
                // we’ll end up copying the whole code point before this loop does something else.
                tokenizer.advance(1);
                value_bytes.push(b)
            }
            b'\xF0'..=b'\xFF' => {
                tokenizer.consume_4byte_intro();
                value_bytes.push(b)
            }
            _ => {
                // ASCII
                break;
            }
        }
    }
    // value_bytes is well-formed UTF-8, see other comments.
    unsafe { from_utf8_release_unchecked(value_bytes) }.into()
}

fn byte_to_hex_digit(b: u8) -> Option<u32> {
    Some(match_byte! { b,
        b'0' ..= b'9' => { b - b'0' },
        b'a' ..= b'f' => { b - b'a' + 10 },
        b'A' ..= b'F' => { b - b'A' + 10 },
        _ => {
            return None
        }
    } as u32)
}

fn byte_to_decimal_digit(b: u8) -> Option<u32> {
    if b >= b'0' && b <= b'9' {
        Some((b - b'0') as u32)
    } else {
        None
    }
}

fn consume_numeric<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
    // Parse [+-]?\d*(\.\d+)?([eE][+-]?\d+)?
    // But this is always called so that there is at least one digit in \d*(\.\d+)?

    // Do all the math in f64 so that large numbers overflow to +/-inf
    // and i32::{MIN, MAX} are within range.

    let (has_sign, sign) = match tokenizer.next_byte_unchecked() {
        b'-' => (true, -1.),
        b'+' => (true, 1.),
        _ => (false, 1.),
    };
    if has_sign {
        tokenizer.advance(1);
    }

    let mut integral_part: f64 = 0.;
    while let Some(digit) = byte_to_decimal_digit(tokenizer.next_byte_unchecked()) {
        integral_part = integral_part * 10. + digit as f64;
        tokenizer.advance(1);
        if tokenizer.is_eof() {
            break;
        }
    }

    let mut is_integer = true;

    let mut fractional_part: f64 = 0.;
    if tokenizer.has_at_least(1)
        && tokenizer.next_byte_unchecked() == b'.'
        && matches!(tokenizer.byte_at(1), b'0'..=b'9')
    {
        is_integer = false;
        tokenizer.advance(1); // Consume '.'
        let mut factor = 0.1;
        while let Some(digit) = byte_to_decimal_digit(tokenizer.next_byte_unchecked()) {
            fractional_part += digit as f64 * factor;
            factor *= 0.1;
            tokenizer.advance(1);
            if tokenizer.is_eof() {
                break;
            }
        }
    }

    let mut value = sign * (integral_part + fractional_part);

    if tokenizer.has_at_least(1) && matches!(tokenizer.next_byte_unchecked(), b'e' | b'E') {
        if matches!(tokenizer.byte_at(1), b'0'..=b'9')
            || (tokenizer.has_at_least(2)
                && matches!(tokenizer.byte_at(1), b'+' | b'-')
                && matches!(tokenizer.byte_at(2), b'0'..=b'9'))
        {
            is_integer = false;
            tokenizer.advance(1);
            let (has_sign, sign) = match tokenizer.next_byte_unchecked() {
                b'-' => (true, -1.),
                b'+' => (true, 1.),
                _ => (false, 1.),
            };
            if has_sign {
                tokenizer.advance(1);
            }
            let mut exponent: f64 = 0.;
            while let Some(digit) = byte_to_decimal_digit(tokenizer.next_byte_unchecked()) {
                exponent = exponent * 10. + digit as f64;
                tokenizer.advance(1);
                if tokenizer.is_eof() {
                    break;
                }
            }
            value *= f64::powf(10., sign * exponent);
        }
    }

    let int_value = if is_integer {
        Some(if value >= i32::MAX as f64 {
            i32::MAX
        } else if value <= i32::MIN as f64 {
            i32::MIN
        } else {
            value as i32
        })
    } else {
        None
    };

    if !tokenizer.is_eof() && tokenizer.next_byte_unchecked() == b'%' {
        tokenizer.advance(1);
        return Percentage {
            unit_value: (value / 100.) as f32,
            int_value: int_value,
            has_sign: has_sign,
        };
    }
    let value = value as f32;
    if is_ident_start(tokenizer) {
        let unit = consume_name(tokenizer);
        Dimension {
            value: value,
            int_value: int_value,
            has_sign: has_sign,
            unit: unit,
        }
    } else {
        Number {
            value: value,
            int_value: int_value,
            has_sign: has_sign,
        }
    }
}
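
// Added example (not in the upstream file): the shapes produced by
// `consume_numeric` for a few inputs, exercised through the crate-internal
// `Tokenizer` API.
#[cfg(test)]
#[test]
fn numeric_token_shapes_sketch() {
    match Tokenizer::new("12px").next() {
        Ok(Dimension { value, int_value, has_sign, .. }) => {
            assert_eq!(value, 12.0);
            assert_eq!(int_value, Some(12)); // no fractional part or exponent
            assert!(!has_sign);
        }
        other => panic!("unexpected token: {:?}", other),
    }
    match Tokenizer::new("50%").next() {
        Ok(Percentage { unit_value, int_value, .. }) => {
            assert_eq!(unit_value, 0.5); // divided by 100
            assert_eq!(int_value, Some(50)); // *not* divided by 100
        }
        other => panic!("unexpected token: {:?}", other),
    }
    match Tokenizer::new("+1.5e2").next() {
        Ok(Number { value, int_value, has_sign }) => {
            assert_eq!(value, 150.0);
            assert_eq!(int_value, None); // '.' and 'e' make it non-integer
            assert!(has_sign);
        }
        other => panic!("unexpected token: {:?}", other),
    }
}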

#[inline]
unsafe fn from_utf8_release_unchecked(string_bytes: Vec<u8>) -> String {
    if cfg!(debug_assertions) {
        String::from_utf8(string_bytes).unwrap()
    } else {
        String::from_utf8_unchecked(string_bytes)
    }
}

fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
    // This is only called after "url(", so the current position is a code point boundary.
    let start_position = tokenizer.position;
    let from_start = &tokenizer.input[tokenizer.position..];
    let mut newlines = 0;
    let mut last_newline = 0;
    let mut found_printable_char = false;
    let mut iter = from_start.bytes().enumerate();
    loop {
        let (offset, b) = match iter.next() {
            Some(item) => item,
            None => {
                tokenizer.position = tokenizer.input.len();
                break;
            }
        };
        match_byte! { b,
            b' ' | b'\t' => {},
            b'\n' | b'\x0C' => {
                newlines += 1;
                last_newline = offset;
            }
            b'\r' => {
                if from_start.as_bytes().get(offset + 1) != Some(&b'\n') {
                    newlines += 1;
                    last_newline = offset;
                }
            }
            b'"' | b'\'' => { return Err(()) }, // Do not advance
            b')' => {
                // Don't use advance, because we may be skipping
                // newlines here, and we want to avoid the assert.
                tokenizer.position += offset + 1;
                break
            }
            _ => {
                // Don't use advance, because we may be skipping
                // newlines here, and we want to avoid the assert.
                tokenizer.position += offset;
                found_printable_char = true;
                break
            }
        }
    }

    if newlines > 0 {
        tokenizer.current_line_number += newlines;
        // No need for wrapping_add here, because there's no possible
        // way to wrap.
        tokenizer.current_line_start_position = start_position + last_newline + 1;
    }

    if found_printable_char {
        // This function only consumed ASCII (whitespace) bytes,
        // so the current position is a code point boundary.
        return Ok(consume_unquoted_url_internal(tokenizer));
    } else {
        return Ok(UnquotedUrl("".into()));
    }

    fn consume_unquoted_url_internal<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
        // This function is only called with start_pos at a code point boundary.
        let start_pos = tokenizer.position();
        let mut string_bytes: Vec<u8>;
        loop {
            if tokenizer.is_eof() {
                return UnquotedUrl(tokenizer.slice_from(start_pos).into());
            }
            match_byte! { tokenizer.next_byte_unchecked(),
                b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
                    let value = tokenizer.slice_from(start_pos);
                    return consume_url_end(tokenizer, start_pos, value.into())
                }
                b')' => {
                    let value = tokenizer.slice_from(start_pos);
                    tokenizer.advance(1);
                    return UnquotedUrl(value.into())
                }
                b'\x01'..=b'\x08' | b'\x0B' | b'\x0E'..=b'\x1F' | b'\x7F' // non-printable
                    | b'"' | b'\'' | b'(' => {
                    tokenizer.advance(1);
                    return consume_bad_url(tokenizer, start_pos)
                },
                b'\\' | b'\0' => {
                    // * The tokenizer’s input is UTF-8 since it’s `&str`.
                    // * start_pos is at a code point boundary
                    // * so is the current position (which is before '\\' or '\0')
                    //
                    // So `string_bytes` is well-formed UTF-8.
                    string_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                    break
                }
                b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
                b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
                _ => {
                    // ASCII or other leading byte.
                    tokenizer.advance(1);
                }
            }
        }
        while !tokenizer.is_eof() {
            let b = tokenizer.next_byte_unchecked();
            match_byte! { b,
                b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
                    // string_bytes is well-formed UTF-8, see other comments.
                    let string = unsafe { from_utf8_release_unchecked(string_bytes) }.into();
                    return consume_url_end(tokenizer, start_pos, string)
                }
                b')' => {
                    tokenizer.advance(1);
                    break;
                }
                b'\x01'..=b'\x08' | b'\x0B' | b'\x0E'..=b'\x1F' | b'\x7F' // non-printable
                    | b'"' | b'\'' | b'(' => {
                    tokenizer.advance(1);
                    return consume_bad_url(tokenizer, start_pos);
                }
                b'\\' => {
                    tokenizer.advance(1);
                    if tokenizer.has_newline_at(0) {
                        return consume_bad_url(tokenizer, start_pos)
                    }

                    // This pushes one well-formed code point to string_bytes
                    consume_escape_and_write(tokenizer, &mut string_bytes)
                },
                b'\0' => {
                    tokenizer.advance(1);
                    string_bytes.extend("\u{FFFD}".as_bytes());
                }
                b'\x80'..=b'\xBF' => {
                    // We’ll end up copying the whole code point
                    // before this loop does something else.
                    tokenizer.consume_continuation_byte();
                    string_bytes.push(b);
                }
                b'\xF0'..=b'\xFF' => {
                    // We’ll end up copying the whole code point
                    // before this loop does something else.
                    tokenizer.consume_4byte_intro();
                    string_bytes.push(b);
                }
                // If this byte is part of a multi-byte code point,
                // we’ll end up copying the whole code point before this loop does something else.
                b => {
                    // ASCII or other leading byte.
                    tokenizer.advance(1);
                    string_bytes.push(b)
                }
            }
        }
        UnquotedUrl(
            // string_bytes is well-formed UTF-8, see other comments.
            unsafe { from_utf8_release_unchecked(string_bytes) }.into(),
        )
    }

    fn consume_url_end<'a>(
        tokenizer: &mut Tokenizer<'a>,
        start_pos: SourcePosition,
        string: CowRcStr<'a>,
    ) -> Token<'a> {
        while !tokenizer.is_eof() {
            match_byte! { tokenizer.next_byte_unchecked(),
                b')' => {
                    tokenizer.advance(1);
                    break
                }
                b' ' | b'\t' => { tokenizer.advance(1); }
                b'\n' | b'\x0C' | b'\r' => {
                    tokenizer.consume_newline();
                }
                b => {
                    tokenizer.consume_known_byte(b);
                    return consume_bad_url(tokenizer, start_pos);
                }
            }
        }
        UnquotedUrl(string)
    }

    fn consume_bad_url<'a>(tokenizer: &mut Tokenizer<'a>, start_pos: SourcePosition) -> Token<'a> {
        // Consume up to the closing )
        while !tokenizer.is_eof() {
            match_byte! { tokenizer.next_byte_unchecked(),
                b')' => {
                    let contents = tokenizer.slice_from(start_pos).into();
                    tokenizer.advance(1);
                    return BadUrl(contents)
                }
                b'\\' => {
                    tokenizer.advance(1);
                    if matches!(tokenizer.next_byte(), Some(b')') | Some(b'\\')) {
                        tokenizer.advance(1); // Skip an escaped ')' or '\'
                    }
                }
                b'\n' | b'\x0C' | b'\r' => {
                    tokenizer.consume_newline();
                }
                b => {
                    tokenizer.consume_known_byte(b);
                }
            }
        }
        BadUrl(tokenizer.slice_from(start_pos).into())
    }
}
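
// Added example (not in the upstream file): outcomes of the url(...) handling
// above, as seen from `consume_ident_like`, which calls `consume_unquoted_url`
// right after the `url(` prefix has been consumed:
//
//     url( images/a.png )  -> UnquotedUrl("images/a.png")  surrounding spaces skipped
//     url()                -> UnquotedUrl("")
//     url(a b)             -> BadUrl("a b")                 space inside an unquoted url
//     url("a.png")         -> Function("url")               quoted form: Err(()) here, the
//                                                           string token is parsed afterwards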

// (value, number of digits up to 6)
fn consume_hex_digits<'a>(tokenizer: &mut Tokenizer<'a>) -> (u32, u32) {
    let mut value = 0;
    let mut digits = 0;
    while digits < 6 && !tokenizer.is_eof() {
        match byte_to_hex_digit(tokenizer.next_byte_unchecked()) {
            Some(digit) => {
                value = value * 16 + digit;
                digits += 1;
                tokenizer.advance(1);
            }
            None => break,
        }
    }
    (value, digits)
}

// Same constraints as consume_escape except it writes into `bytes` the result
// instead of returning it.
fn consume_escape_and_write(tokenizer: &mut Tokenizer, bytes: &mut Vec<u8>) {
    bytes.extend(
        consume_escape(tokenizer)
            .encode_utf8(&mut [0; 4])
            .as_bytes(),
    )
}

// Assumes that the U+005C REVERSE SOLIDUS (\) has already been consumed
// and that the next input character has already been verified
// to not be a newline.
fn consume_escape(tokenizer: &mut Tokenizer) -> char {
    if tokenizer.is_eof() {
        return '\u{FFFD}';
    } // Escaped EOF
    match_byte! { tokenizer.next_byte_unchecked(),
        b'0'..=b'9' | b'A'..=b'F' | b'a'..=b'f' => {
            let (c, _) = consume_hex_digits(tokenizer);
            if !tokenizer.is_eof() {
                match_byte! { tokenizer.next_byte_unchecked(),
                    b' ' | b'\t' => {
                        tokenizer.advance(1)
                    }
                    b'\n' | b'\x0C' | b'\r' => {
                        tokenizer.consume_newline();
                    }
                    _ => {}
                }
            }
            static REPLACEMENT_CHAR: char = '\u{FFFD}';
            if c != 0 {
                let c = char::from_u32(c);
                c.unwrap_or(REPLACEMENT_CHAR)
            } else {
                REPLACEMENT_CHAR
            }
        },
        b'\0' => {
            tokenizer.advance(1);
            '\u{FFFD}'
        }
        _ => { tokenizer.consume_char() }
    }
}
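
// Added example (not in the upstream file): behavior of `consume_escape` for a
// few escape bodies (the backslash itself has already been consumed by the
// caller).
#[cfg(test)]
#[test]
fn consume_escape_sketch() {
    // Hex escape; the single whitespace character after it is consumed too.
    let mut t = Tokenizer::new("41 x");
    assert_eq!(consume_escape(&mut t), 'A');
    assert_eq!(t.next_byte(), Some(b'x'));

    // U+0000, surrogates and out-of-range values map to U+FFFD.
    assert_eq!(consume_escape(&mut Tokenizer::new("0 ")), '\u{FFFD}');
    assert_eq!(consume_escape(&mut Tokenizer::new("110000 ")), '\u{FFFD}');

    // Anything else is taken literally.
    assert_eq!(consume_escape(&mut Tokenizer::new("n")), 'n');
}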