forked from mirrors/gecko-dev
538 lines
16 KiB
Rust
538 lines
16 KiB
Rust
mod errors;
|
|
mod layout_table;
|
|
#[cfg(feature = "likelysubtags")]
|
|
pub mod likelysubtags;
|
|
#[doc(hidden)]
|
|
pub mod parser;
|
|
#[cfg(feature = "serde")]
|
|
mod serde;
|
|
pub mod subtags;
|
|
|
|
pub use crate::errors::LanguageIdentifierError;
|
|
use std::fmt::Write;
|
|
use std::iter::Peekable;
|
|
use std::str::FromStr;
|
|
|
|
/// Direction in which the characters of a writing system are laid out.
#[derive(Clone, Copy, Debug, PartialEq)]
pub enum CharacterDirection {
    /// Right-to-left.
    ///
    /// Used by languages such as Arabic, Hebrew, Fula, Kurdish etc.
    RTL,
    /// Left-to-right.
    ///
    /// Used by languages such as French, Spanish, English, German etc.
    LTR,
    /// Top-to-bottom.
    ///
    /// Used by Traditional Mongolian.
    TTB,
}
|
|
|
|
type PartsTuple = (
|
|
subtags::Language,
|
|
Option<subtags::Script>,
|
|
Option<subtags::Region>,
|
|
Vec<subtags::Variant>,
|
|
);
|
|
|
|
/// `LanguageIdentifier` is a core struct representing a Unicode Language Identifier.
|
|
///
|
|
/// # Examples
|
|
///
|
|
/// ```
|
|
/// use unic_langid_impl::LanguageIdentifier;
|
|
///
|
|
/// let li: LanguageIdentifier = "en-US".parse()
|
|
/// .expect("Failed to parse.");
|
|
///
|
|
/// assert_eq!(li.language, "en");
|
|
/// assert_eq!(li.script, None);
|
|
/// assert_eq!(li.region.as_ref().map(Into::into), Some("US"));
|
|
/// assert_eq!(li.variants().len(), 0);
|
|
/// ```
|
|
///
|
|
/// # Parsing
|
|
///
|
|
/// Unicode recognizes three levels of standard conformance for any language identifier:
|
|
///
|
|
/// * *well-formed* - syntactically correct
|
|
/// * *valid* - well-formed and only uses registered language subtags, extensions, keywords, types...
|
|
/// * *canonical* - valid and no deprecated codes or structure.
|
|
///
|
|
/// At the moment parsing normalizes a well-formed language identifier converting
|
|
/// `_` separators to `-` and adjusting casing to conform to the Unicode standard.
|
|
///
|
|
/// Any bogus subtags will cause the parsing to fail with an error.
|
|
/// No subtag validation is performed.
|
|
///
|
|
/// # Examples:
|
|
///
|
|
/// ```
|
|
/// use unic_langid_impl::LanguageIdentifier;
|
|
///
|
|
/// let li: LanguageIdentifier = "eN_latn_Us-Valencia".parse()
|
|
/// .expect("Failed to parse.");
|
|
///
|
|
/// assert_eq!(li.language, "en");
|
|
/// assert_eq!(li.script.as_ref().map(Into::into), Some("Latn"));
|
|
/// assert_eq!(li.region.as_ref().map(Into::into), Some("US"));
|
|
/// assert_eq!(li.variants().map(|v| v.as_str()).collect::<Vec<_>>(), &["valencia"]);
|
|
/// ```
|
|
#[derive(Default, Debug, PartialEq, Eq, Clone, Hash, PartialOrd, Ord)]
|
|
pub struct LanguageIdentifier {
|
|
pub language: subtags::Language,
|
|
pub script: Option<subtags::Script>,
|
|
pub region: Option<subtags::Region>,
|
|
variants: Option<Box<[subtags::Variant]>>,
|
|
}
|
|
|
|
impl LanguageIdentifier {
|
|
/// A constructor which takes a utf8 slice, parses it and
|
|
/// produces a well-formed `LanguageIdentifier`.
|
|
///
|
|
/// # Examples
|
|
///
|
|
/// ```
|
|
/// use unic_langid_impl::LanguageIdentifier;
|
|
///
|
|
/// let li = LanguageIdentifier::from_bytes("en-US".as_bytes())
|
|
/// .expect("Parsing failed.");
|
|
///
|
|
/// assert_eq!(li.to_string(), "en-US");
|
|
/// ```
|
|
pub fn from_bytes(v: &[u8]) -> Result<Self, LanguageIdentifierError> {
|
|
Ok(parser::parse_language_identifier(v)?)
|
|
}
|
|
|
|
/// A constructor which takes optional subtags as `AsRef<[u8]>`, parses them and
|
|
/// produces a well-formed `LanguageIdentifier`.
|
|
///
|
|
/// # Examples
|
|
///
|
|
/// ```
|
|
/// use unic_langid_impl::LanguageIdentifier;
|
|
///
|
|
/// let li = LanguageIdentifier::from_parts(
|
|
/// "fr".parse().expect("Parsing failed."),
|
|
/// None,
|
|
/// Some("CA".parse().expect("Parsing failed.")),
|
|
/// &[]
|
|
/// );
|
|
///
|
|
/// assert_eq!(li.to_string(), "fr-CA");
|
|
/// ```
|
|
pub fn from_parts(
|
|
language: subtags::Language,
|
|
script: Option<subtags::Script>,
|
|
region: Option<subtags::Region>,
|
|
variants: &[subtags::Variant],
|
|
) -> Self {
|
|
let variants = if !variants.is_empty() {
|
|
let mut v = variants.to_vec();
|
|
v.sort_unstable();
|
|
v.dedup();
|
|
Some(v.into_boxed_slice())
|
|
} else {
|
|
None
|
|
};
|
|
|
|
Self {
|
|
language,
|
|
script,
|
|
region,
|
|
variants,
|
|
}
|
|
}
|
|
|
|
/// # Unchecked
|
|
///
|
|
/// This function accepts subtags expecting variants
|
|
/// to be deduplicated and ordered.
|
|
pub const fn from_raw_parts_unchecked(
|
|
language: subtags::Language,
|
|
script: Option<subtags::Script>,
|
|
region: Option<subtags::Region>,
|
|
variants: Option<Box<[subtags::Variant]>>,
|
|
) -> Self {
|
|
Self {
|
|
language,
|
|
script,
|
|
region,
|
|
variants,
|
|
}
|
|
}
|
|
|
|
#[doc(hidden)]
|
|
/// This method is used by `unic-locale` to handle partial
|
|
/// subtag iterator.
|
|
///
|
|
/// Not stable.
|
|
pub fn try_from_iter<'a>(
|
|
iter: &mut Peekable<impl Iterator<Item = &'a [u8]>>,
|
|
allow_extension: bool,
|
|
) -> Result<LanguageIdentifier, LanguageIdentifierError> {
|
|
Ok(parser::parse_language_identifier_from_iter(
|
|
iter,
|
|
allow_extension,
|
|
)?)
|
|
}
|
|
|
|
/// Consumes `LanguageIdentifier` and produces raw internal representations
|
|
/// of all subtags in form of `u64`/`u32`.
|
|
///
|
|
/// Primarily used for storing internal representation and restoring via
|
|
/// `from_raw_parts_unchecked`.
|
|
///
|
|
/// # Examples
|
|
///
|
|
/// ```
|
|
/// use unic_langid_impl::LanguageIdentifier;
|
|
/// use tinystr::{TinyStr8, TinyStr4};
|
|
///
|
|
/// let li: LanguageIdentifier = "en-US".parse()
|
|
/// .expect("Parsing failed.");
|
|
///
|
|
/// let (lang, script, region, variants) = li.into_parts();
|
|
///
|
|
/// // let li2 = LanguageIdentifier::from_raw_parts_unchecked(
|
|
/// // lang.map(|l| unsafe { TinyStr8::new_unchecked(l) }),
|
|
/// // script.map(|s| unsafe { TinyStr4::new_unchecked(s) }),
|
|
/// // region.map(|r| unsafe { TinyStr4::new_unchecked(r) }),
|
|
/// // variants.map(|v| v.into_iter().map(|v| unsafe { TinyStr8::new_unchecked(*v) }).collect()),
|
|
/// //);
|
|
///
|
|
/// //assert_eq!(li2.to_string(), "en-US");
|
|
/// ```
|
|
pub fn into_parts(self) -> PartsTuple {
|
|
(
|
|
self.language,
|
|
self.script,
|
|
self.region,
|
|
self.variants.map_or_else(Vec::new, |v| v.to_vec()),
|
|
)
|
|
}
|
|
|
|
/// Compares a `LanguageIdentifier` to another `AsRef<LanguageIdentifier`
|
|
/// allowing for either side to use the missing fields as wildcards.
|
|
///
|
|
/// This allows for matching between `en` (treated as `en-*-*-*`) and `en-US`.
|
|
///
|
|
/// # Examples
|
|
///
|
|
/// ```
|
|
/// use unic_langid_impl::LanguageIdentifier;
|
|
///
|
|
/// let li1: LanguageIdentifier = "en".parse()
|
|
/// .expect("Parsing failed.");
|
|
///
|
|
/// let li2: LanguageIdentifier = "en-US".parse()
|
|
/// .expect("Parsing failed.");
|
|
///
|
|
/// assert_ne!(li1, li2); // "en" != "en-US"
|
|
/// assert_ne!(li1.to_string(), li2.to_string()); // "en" != "en-US"
|
|
///
|
|
/// assert_eq!(li1.matches(&li2, false, false), false); // "en" != "en-US"
|
|
/// assert_eq!(li1.matches(&li2, true, false), true); // "en-*-*-*" == "en-US"
|
|
/// assert_eq!(li1.matches(&li2, false, true), false); // "en" != "en-*-US-*"
|
|
/// assert_eq!(li1.matches(&li2, true, true), true); // "en-*-*-*" == "en-*-US-*"
|
|
/// ```
|
|
pub fn matches<O: AsRef<Self>>(
|
|
&self,
|
|
other: &O,
|
|
self_as_range: bool,
|
|
other_as_range: bool,
|
|
) -> bool {
|
|
let other = other.as_ref();
|
|
self.language
|
|
.matches(other.language, self_as_range, other_as_range)
|
|
&& subtag_matches(&self.script, &other.script, self_as_range, other_as_range)
|
|
&& subtag_matches(&self.region, &other.region, self_as_range, other_as_range)
|
|
&& subtags_match(
|
|
&self.variants,
|
|
&other.variants,
|
|
self_as_range,
|
|
other_as_range,
|
|
)
|
|
}
|
|
|
|
/// Returns a vector of variants subtags of the `LanguageIdentifier`.
|
|
///
|
|
/// # Examples
|
|
///
|
|
/// ```
|
|
/// use unic_langid_impl::LanguageIdentifier;
|
|
///
|
|
/// let li1: LanguageIdentifier = "ca-ES-valencia".parse()
|
|
/// .expect("Parsing failed.");
|
|
///
|
|
/// assert_eq!(li1.variants().map(|v| v.as_str()).collect::<Vec<_>>(), &["valencia"]);
|
|
///
|
|
/// let li2: LanguageIdentifier = "de".parse()
|
|
/// .expect("Parsing failed.");
|
|
///
|
|
/// assert_eq!(li2.variants().len(), 0);
|
|
/// ```
|
|
pub fn variants(&self) -> impl ExactSizeIterator<Item = &subtags::Variant> {
|
|
let variants: &[_] = match self.variants {
|
|
Some(ref v) => v,
|
|
None => &[],
|
|
};
|
|
|
|
variants.iter()
|
|
}
|
|
|
|
/// Sets variant subtags of the `LanguageIdentifier`.
|
|
///
|
|
/// # Examples
|
|
///
|
|
/// ```
|
|
/// use unic_langid_impl::LanguageIdentifier;
|
|
///
|
|
/// let mut li: LanguageIdentifier = "ca-ES".parse()
|
|
/// .expect("Parsing failed.");
|
|
///
|
|
/// li.set_variants(&["valencia".parse().expect("Parsing failed.")]);
|
|
///
|
|
/// assert_eq!(li.to_string(), "ca-ES-valencia");
|
|
/// ```
|
|
pub fn set_variants(&mut self, variants: &[subtags::Variant]) {
|
|
let mut v = variants.to_vec();
|
|
|
|
if v.is_empty() {
|
|
self.variants = None;
|
|
} else {
|
|
v.sort_unstable();
|
|
v.dedup();
|
|
self.variants = Some(v.into_boxed_slice());
|
|
}
|
|
}
|
|
|
|
/// Tests if a variant subtag is present in the `LanguageIdentifier`.
|
|
///
|
|
/// # Examples
|
|
///
|
|
/// ```
|
|
/// use unic_langid_impl::LanguageIdentifier;
|
|
///
|
|
/// let mut li: LanguageIdentifier = "ca-ES-macos".parse()
|
|
/// .expect("Parsing failed.");
|
|
///
|
|
/// assert_eq!(li.has_variant("valencia".parse().unwrap()), false);
|
|
/// assert_eq!(li.has_variant("macos".parse().unwrap()), true);
|
|
/// ```
|
|
pub fn has_variant(&self, variant: subtags::Variant) -> bool {
|
|
if let Some(variants) = &self.variants {
|
|
variants.contains(&variant)
|
|
} else {
|
|
false
|
|
}
|
|
}
|
|
|
|
/// Clears variant subtags of the `LanguageIdentifier`.
|
|
///
|
|
/// # Examples
|
|
///
|
|
/// ```
|
|
/// use unic_langid_impl::LanguageIdentifier;
|
|
///
|
|
/// let mut li: LanguageIdentifier = "ca-ES-valencia".parse()
|
|
/// .expect("Parsing failed.");
|
|
///
|
|
/// li.clear_variants();
|
|
///
|
|
/// assert_eq!(li.to_string(), "ca-ES");
|
|
/// ```
|
|
pub fn clear_variants(&mut self) {
|
|
self.variants = None;
|
|
}
|
|
|
|
/// Extends the `LanguageIdentifier` adding likely subtags based
|
|
/// on tables provided by CLDR.
|
|
///
|
|
/// # Examples
|
|
///
|
|
/// ```
|
|
/// use unic_langid_impl::LanguageIdentifier;
|
|
///
|
|
/// let mut li: LanguageIdentifier = "en-US".parse()
|
|
/// .expect("Parsing failed.");
|
|
///
|
|
/// assert_eq!(li.maximize(), true);
|
|
/// assert_eq!(li.to_string(), "en-Latn-US");
|
|
/// ```
|
|
#[cfg(feature = "likelysubtags")]
|
|
pub fn maximize(&mut self) -> bool {
|
|
if let Some(new_li) = likelysubtags::maximize(self.language, self.script, self.region) {
|
|
self.language = new_li.0;
|
|
self.script = new_li.1;
|
|
self.region = new_li.2;
|
|
true
|
|
} else {
|
|
false
|
|
}
|
|
}
|
|
|
|
/// Extends the `LanguageIdentifier` removing likely subtags based
|
|
/// on tables provided by CLDR.
|
|
///
|
|
/// # Examples
|
|
///
|
|
/// ```
|
|
/// use unic_langid_impl::LanguageIdentifier;
|
|
///
|
|
/// let mut li: LanguageIdentifier = "en-Latn-US".parse()
|
|
/// .expect("Parsing failed.");
|
|
///
|
|
/// assert_eq!(li.minimize(), true);
|
|
/// assert_eq!(li.to_string(), "en");
|
|
/// ```
|
|
#[cfg(feature = "likelysubtags")]
|
|
pub fn minimize(&mut self) -> bool {
|
|
if let Some(new_li) = likelysubtags::minimize(self.language, self.script, self.region) {
|
|
self.language = new_li.0;
|
|
self.script = new_li.1;
|
|
self.region = new_li.2;
|
|
true
|
|
} else {
|
|
false
|
|
}
|
|
}
|
|
|
|
/// Returns character direction of the `LanguageIdentifier`.
|
|
///
|
|
/// # Examples
|
|
///
|
|
/// ```
|
|
/// use unic_langid_impl::{LanguageIdentifier, CharacterDirection};
|
|
///
|
|
/// let li1: LanguageIdentifier = "es-AR".parse()
|
|
/// .expect("Parsing failed.");
|
|
/// let li2: LanguageIdentifier = "fa".parse()
|
|
/// .expect("Parsing failed.");
|
|
///
|
|
/// assert_eq!(li1.character_direction(), CharacterDirection::LTR);
|
|
/// assert_eq!(li2.character_direction(), CharacterDirection::RTL);
|
|
/// ```
|
|
pub fn character_direction(&self) -> CharacterDirection {
|
|
match (self.language.into(), self.script) {
|
|
(_, Some(script))
|
|
if layout_table::SCRIPTS_CHARACTER_DIRECTION_LTR.contains(&script.into()) =>
|
|
{
|
|
CharacterDirection::LTR
|
|
}
|
|
(_, Some(script))
|
|
if layout_table::SCRIPTS_CHARACTER_DIRECTION_RTL.contains(&script.into()) =>
|
|
{
|
|
CharacterDirection::RTL
|
|
}
|
|
(_, Some(script))
|
|
if layout_table::SCRIPTS_CHARACTER_DIRECTION_TTB.contains(&script.into()) =>
|
|
{
|
|
CharacterDirection::TTB
|
|
}
|
|
(Some(lang), _) if layout_table::LANGS_CHARACTER_DIRECTION_RTL.contains(&lang) => {
|
|
#[cfg(feature = "likelysubtags")]
|
|
if let Some((_, Some(script), _)) =
|
|
likelysubtags::maximize(self.language, None, self.region)
|
|
{
|
|
if layout_table::SCRIPTS_CHARACTER_DIRECTION_LTR.contains(&script.into()) {
|
|
return CharacterDirection::LTR;
|
|
}
|
|
}
|
|
CharacterDirection::RTL
|
|
}
|
|
_ => CharacterDirection::LTR,
|
|
}
|
|
}
|
|
}
|
|
|
|
impl FromStr for LanguageIdentifier {
|
|
type Err = LanguageIdentifierError;
|
|
|
|
fn from_str(source: &str) -> Result<Self, Self::Err> {
|
|
Self::from_bytes(source.as_bytes())
|
|
}
|
|
}
|
|
|
|
/// Identity conversion, so that APIs generic over
/// `AsRef<LanguageIdentifier>` (such as `LanguageIdentifier::matches`)
/// accept a `LanguageIdentifier` directly.
impl AsRef<LanguageIdentifier> for LanguageIdentifier {
    #[inline(always)]
    fn as_ref(&self) -> &Self {
        self
    }
}
|
|
|
|
impl std::fmt::Display for LanguageIdentifier {
|
|
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
|
self.language.fmt(f)?;
|
|
if let Some(ref script) = self.script {
|
|
f.write_char('-')?;
|
|
script.fmt(f)?;
|
|
}
|
|
if let Some(ref region) = self.region {
|
|
f.write_char('-')?;
|
|
region.fmt(f)?;
|
|
}
|
|
if let Some(variants) = &self.variants {
|
|
for variant in variants.iter() {
|
|
f.write_char('-')?;
|
|
variant.fmt(f)?;
|
|
}
|
|
}
|
|
Ok(())
|
|
}
|
|
}
|
|
|
|
impl PartialEq<&str> for LanguageIdentifier {
|
|
fn eq(&self, other: &&str) -> bool {
|
|
self.to_string().as_str() == *other
|
|
}
|
|
}
|
|
|
|
/// Compares two optional subtags, optionally treating a missing one as a
/// wildcard.
///
/// Returns `true` when `subtag1` is `None` and `as_range1` is set, when
/// `subtag2` is `None` and `as_range2` is set, or when both options are equal.
fn subtag_matches<P: PartialEq>(
    subtag1: &Option<P>,
    subtag2: &Option<P>,
    as_range1: bool,
    as_range2: bool,
) -> bool {
    if as_range1 && subtag1.is_none() {
        return true;
    }
    if as_range2 && subtag2.is_none() {
        return true;
    }
    subtag1 == subtag2
}
|
|
|
|
/// Returns `true` when the optional list is either `None` or `Some` of an
/// empty slice.
fn is_option_empty<P: PartialEq>(subtag: &Option<Box<[P]>>) -> bool {
    match subtag {
        Some(items) => items.is_empty(),
        None => true,
    }
}
|
|
|
|
fn subtags_match<P: PartialEq>(
|
|
subtag1: &Option<Box<[P]>>,
|
|
subtag2: &Option<Box<[P]>>,
|
|
as_range1: bool,
|
|
as_range2: bool,
|
|
) -> bool {
|
|
// or is some and is empty!
|
|
(as_range1 && is_option_empty(subtag1))
|
|
|| (as_range2 && is_option_empty(subtag2))
|
|
|| subtag1 == subtag2
|
|
}
|
|
|
|
/// This is a best-effort operation that performs all available levels of canonicalization.
|
|
///
|
|
/// At the moment the operation will normalize casing and the separator, but in the future
|
|
/// it may also validate and update from deprecated subtags to canonical ones.
|
|
///
|
|
/// # Examples
|
|
///
|
|
/// ```
|
|
/// use unic_langid_impl::canonicalize;
|
|
///
|
|
/// assert_eq!(canonicalize("pL_latn_pl"), Ok("pl-Latn-PL".to_string()));
|
|
/// ```
|
|
pub fn canonicalize<S: AsRef<[u8]>>(input: S) -> Result<String, LanguageIdentifierError> {
|
|
let lang_id = LanguageIdentifier::from_bytes(input.as_ref())?;
|
|
Ok(lang_id.to_string())
|
|
}
|
|
|
|
// A subtag made of non-ASCII characters must be rejected by the parser.
#[test]
fn invalid_subtag() {
    let parsed = LanguageIdentifier::from_bytes("en-ÁÁÁÁ".as_bytes());
    assert!(parsed.is_err());
}
|