Bug 1895888 - Vendor fully-implemented Rust relevancy component r=bdk

Differential Revision: https://phabricator.services.mozilla.com/D209964
This commit is contained in:
Nan Jiang 2024-05-09 19:56:58 +00:00
parent 2f9ab8fab6
commit eb54ba3346
27 changed files with 945 additions and 326 deletions

View file

@ -60,9 +60,9 @@ git = "https://github.com/mozilla-spidermonkey/jsparagus"
rev = "61f399c53a641ebd3077c1f39f054f6d396a633c"
replace-with = "vendored-sources"
[source."git+https://github.com/mozilla/application-services?rev=bf37a5174a1dcb7a890dc35386d58f9b77f82434"]
[source."git+https://github.com/mozilla/application-services?rev=e0563d725f852f617878ecc13a03cdf50c85cd5a"]
git = "https://github.com/mozilla/application-services"
rev = "bf37a5174a1dcb7a890dc35386d58f9b77f82434"
rev = "e0563d725f852f617878ecc13a03cdf50c85cd5a"
replace-with = "vendored-sources"
[source."git+https://github.com/mozilla/audioipc?rev=409e11f8de6288e9ddfe269654523735302e59e6"]

31
Cargo.lock generated
View file

@ -1680,7 +1680,7 @@ dependencies = [
[[package]]
name = "error-support"
version = "0.1.0"
source = "git+https://github.com/mozilla/application-services?rev=bf37a5174a1dcb7a890dc35386d58f9b77f82434#bf37a5174a1dcb7a890dc35386d58f9b77f82434"
source = "git+https://github.com/mozilla/application-services?rev=e0563d725f852f617878ecc13a03cdf50c85cd5a#e0563d725f852f617878ecc13a03cdf50c85cd5a"
dependencies = [
"error-support-macros",
"lazy_static",
@ -1692,7 +1692,7 @@ dependencies = [
[[package]]
name = "error-support-macros"
version = "0.1.0"
source = "git+https://github.com/mozilla/application-services?rev=bf37a5174a1dcb7a890dc35386d58f9b77f82434#bf37a5174a1dcb7a890dc35386d58f9b77f82434"
source = "git+https://github.com/mozilla/application-services?rev=e0563d725f852f617878ecc13a03cdf50c85cd5a#e0563d725f852f617878ecc13a03cdf50c85cd5a"
dependencies = [
"proc-macro2",
"quote",
@ -2965,7 +2965,7 @@ dependencies = [
[[package]]
name = "interrupt-support"
version = "0.1.0"
source = "git+https://github.com/mozilla/application-services?rev=bf37a5174a1dcb7a890dc35386d58f9b77f82434#bf37a5174a1dcb7a890dc35386d58f9b77f82434"
source = "git+https://github.com/mozilla/application-services?rev=e0563d725f852f617878ecc13a03cdf50c85cd5a#e0563d725f852f617878ecc13a03cdf50c85cd5a"
dependencies = [
"lazy_static",
"parking_lot",
@ -4173,7 +4173,7 @@ dependencies = [
[[package]]
name = "nss_build_common"
version = "0.1.0"
source = "git+https://github.com/mozilla/application-services?rev=bf37a5174a1dcb7a890dc35386d58f9b77f82434#bf37a5174a1dcb7a890dc35386d58f9b77f82434"
source = "git+https://github.com/mozilla/application-services?rev=e0563d725f852f617878ecc13a03cdf50c85cd5a#e0563d725f852f617878ecc13a03cdf50c85cd5a"
[[package]]
name = "nsstring"
@ -4827,14 +4827,19 @@ checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da"
[[package]]
name = "relevancy"
version = "0.1.0"
source = "git+https://github.com/mozilla/application-services?rev=bf37a5174a1dcb7a890dc35386d58f9b77f82434#bf37a5174a1dcb7a890dc35386d58f9b77f82434"
source = "git+https://github.com/mozilla/application-services?rev=e0563d725f852f617878ecc13a03cdf50c85cd5a#e0563d725f852f617878ecc13a03cdf50c85cd5a"
dependencies = [
"anyhow",
"base64 0.21.3",
"error-support",
"interrupt-support",
"log",
"md-5",
"parking_lot",
"remote_settings",
"rusqlite",
"serde",
"serde_json",
"sql-support",
"thiserror",
"uniffi",
@ -4844,7 +4849,7 @@ dependencies = [
[[package]]
name = "remote_settings"
version = "0.1.0"
source = "git+https://github.com/mozilla/application-services?rev=bf37a5174a1dcb7a890dc35386d58f9b77f82434#bf37a5174a1dcb7a890dc35386d58f9b77f82434"
source = "git+https://github.com/mozilla/application-services?rev=e0563d725f852f617878ecc13a03cdf50c85cd5a#e0563d725f852f617878ecc13a03cdf50c85cd5a"
dependencies = [
"parking_lot",
"serde",
@ -5372,7 +5377,7 @@ dependencies = [
[[package]]
name = "sql-support"
version = "0.1.0"
source = "git+https://github.com/mozilla/application-services?rev=bf37a5174a1dcb7a890dc35386d58f9b77f82434#bf37a5174a1dcb7a890dc35386d58f9b77f82434"
source = "git+https://github.com/mozilla/application-services?rev=e0563d725f852f617878ecc13a03cdf50c85cd5a#e0563d725f852f617878ecc13a03cdf50c85cd5a"
dependencies = [
"ffi-support",
"interrupt-support",
@ -5554,7 +5559,7 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc"
[[package]]
name = "suggest"
version = "0.1.0"
source = "git+https://github.com/mozilla/application-services?rev=bf37a5174a1dcb7a890dc35386d58f9b77f82434#bf37a5174a1dcb7a890dc35386d58f9b77f82434"
source = "git+https://github.com/mozilla/application-services?rev=e0563d725f852f617878ecc13a03cdf50c85cd5a#e0563d725f852f617878ecc13a03cdf50c85cd5a"
dependencies = [
"anyhow",
"chrono",
@ -5603,7 +5608,7 @@ dependencies = [
[[package]]
name = "sync-guid"
version = "0.1.0"
source = "git+https://github.com/mozilla/application-services?rev=bf37a5174a1dcb7a890dc35386d58f9b77f82434#bf37a5174a1dcb7a890dc35386d58f9b77f82434"
source = "git+https://github.com/mozilla/application-services?rev=e0563d725f852f617878ecc13a03cdf50c85cd5a#e0563d725f852f617878ecc13a03cdf50c85cd5a"
dependencies = [
"base64 0.21.3",
"rand",
@ -5614,7 +5619,7 @@ dependencies = [
[[package]]
name = "sync15"
version = "0.1.0"
source = "git+https://github.com/mozilla/application-services?rev=bf37a5174a1dcb7a890dc35386d58f9b77f82434#bf37a5174a1dcb7a890dc35386d58f9b77f82434"
source = "git+https://github.com/mozilla/application-services?rev=e0563d725f852f617878ecc13a03cdf50c85cd5a#e0563d725f852f617878ecc13a03cdf50c85cd5a"
dependencies = [
"anyhow",
"error-support",
@ -5646,7 +5651,7 @@ dependencies = [
[[package]]
name = "tabs"
version = "0.1.0"
source = "git+https://github.com/mozilla/application-services?rev=bf37a5174a1dcb7a890dc35386d58f9b77f82434#bf37a5174a1dcb7a890dc35386d58f9b77f82434"
source = "git+https://github.com/mozilla/application-services?rev=e0563d725f852f617878ecc13a03cdf50c85cd5a#e0563d725f852f617878ecc13a03cdf50c85cd5a"
dependencies = [
"anyhow",
"error-support",
@ -6319,7 +6324,7 @@ checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
[[package]]
name = "viaduct"
version = "0.1.0"
source = "git+https://github.com/mozilla/application-services?rev=bf37a5174a1dcb7a890dc35386d58f9b77f82434#bf37a5174a1dcb7a890dc35386d58f9b77f82434"
source = "git+https://github.com/mozilla/application-services?rev=e0563d725f852f617878ecc13a03cdf50c85cd5a#e0563d725f852f617878ecc13a03cdf50c85cd5a"
dependencies = [
"ffi-support",
"log",
@ -6467,7 +6472,7 @@ dependencies = [
[[package]]
name = "webext-storage"
version = "0.1.0"
source = "git+https://github.com/mozilla/application-services?rev=bf37a5174a1dcb7a890dc35386d58f9b77f82434#bf37a5174a1dcb7a890dc35386d58f9b77f82434"
source = "git+https://github.com/mozilla/application-services?rev=e0563d725f852f617878ecc13a03cdf50c85cd5a#e0563d725f852f617878ecc13a03cdf50c85cd5a"
dependencies = [
"anyhow",
"error-support",

View file

@ -223,14 +223,14 @@ warp = { git = "https://github.com/seanmonstar/warp", rev = "9d081461ae1167eb321
malloc_size_of_derive = { path = "xpcom/rust/malloc_size_of_derive" }
# application-services overrides to make updating them all simpler.
interrupt-support = { git = "https://github.com/mozilla/application-services", rev = "bf37a5174a1dcb7a890dc35386d58f9b77f82434" }
relevancy = { git = "https://github.com/mozilla/application-services", rev = "bf37a5174a1dcb7a890dc35386d58f9b77f82434" }
sql-support = { git = "https://github.com/mozilla/application-services", rev = "bf37a5174a1dcb7a890dc35386d58f9b77f82434" }
suggest = { git = "https://github.com/mozilla/application-services", rev = "bf37a5174a1dcb7a890dc35386d58f9b77f82434" }
sync15 = { git = "https://github.com/mozilla/application-services", rev = "bf37a5174a1dcb7a890dc35386d58f9b77f82434" }
tabs = { git = "https://github.com/mozilla/application-services", rev = "bf37a5174a1dcb7a890dc35386d58f9b77f82434" }
viaduct = { git = "https://github.com/mozilla/application-services", rev = "bf37a5174a1dcb7a890dc35386d58f9b77f82434" }
webext-storage = { git = "https://github.com/mozilla/application-services", rev = "bf37a5174a1dcb7a890dc35386d58f9b77f82434" }
interrupt-support = { git = "https://github.com/mozilla/application-services", rev = "e0563d725f852f617878ecc13a03cdf50c85cd5a" }
relevancy = { git = "https://github.com/mozilla/application-services", rev = "e0563d725f852f617878ecc13a03cdf50c85cd5a" }
sql-support = { git = "https://github.com/mozilla/application-services", rev = "e0563d725f852f617878ecc13a03cdf50c85cd5a" }
suggest = { git = "https://github.com/mozilla/application-services", rev = "e0563d725f852f617878ecc13a03cdf50c85cd5a" }
sync15 = { git = "https://github.com/mozilla/application-services", rev = "e0563d725f852f617878ecc13a03cdf50c85cd5a" }
tabs = { git = "https://github.com/mozilla/application-services", rev = "e0563d725f852f617878ecc13a03cdf50c85cd5a" }
viaduct = { git = "https://github.com/mozilla/application-services", rev = "e0563d725f852f617878ecc13a03cdf50c85cd5a" }
webext-storage = { git = "https://github.com/mozilla/application-services", rev = "e0563d725f852f617878ecc13a03cdf50c85cd5a" }
# Patch mio 0.8.8 to use windows-sys 0.52 (backport https://github.com/tokio-rs/mio/commit/eea9e3e0c469480e5c59c01e6c3c7e5fd88f0848)
mio_0_8 = { package = "mio", git = "https://github.com/glandium/mio", rev = "9a2ef335c366044ffe73b1c4acabe50a1daefe05" }

View file

@ -3047,3 +3047,5 @@ pref("startup.homepage_override_nimbus_maxVersion", "");
// Pref to enable the content relevancy feature.
pref("toolkit.contentRelevancy.enabled", false);
// Pref to enable the ingestion through the Rust component.
pref("toolkit.contentRelevancy.ingestEnabled", false);

View file

@ -1 +1 @@
{"files":{"Cargo.toml":"8b8d81c6af8ab402f8febf103e10917a55886cacb01d0448c4426a2b54d244d0","build.rs":"a562bfe527d21c4e8a1a44b892defa83cdff141ec5dd51ed6f3862330e50ddd7","src/bin/generate-test-data.rs":"7f1c9dc445418c7627f89d1f2aa8e550d0f85b3d1f05edb7c378ab9441714f1f","src/db.rs":"9470c4566fc6296571b35d493f752d8a1f3c1fd8f7f420007ee3fa3e762af92b","src/error.rs":"00a10d47c9cfd25c4104174ec07eca6a08103564cb1b2c4961739f17f2892fb2","src/interest.rs":"e28b51c9692905ca87e6ab23decf1c1b3897bf29cb3d0d61f71213553b561dcc","src/lib.rs":"a42ffd826fd38a5f9436d9de00fd7b548e233a39063fbc030cae10052e0b4253","src/populate_interests.rs":"96c825796c6cfb7b1bb3a11c6d1b9c3639107943f5d35a259e195fec15aeef4e","src/relevancy.udl":"3de62ea53b4f34c11ff94c782b8389d58525ca40bb292b4b81370025813def5e","src/schema.rs":"f782c712f10c4f1af2f9e1424d6b52f59a2bacfcc452a8feb763f36478f5dd5d","src/url_hash.rs":"5619a249d471e7b642d889bad09e93212559c8b947010d49492c1423da2b310e","test-data":"1ef2cd092d59e7e126cd4a514af983d449ed9f9c98708702fd237464a76c2b5e"},"package":null}
{"files":{"Cargo.toml":"2b7bf33e20b6aa768dd18619845e9d5d22235d86f770e94b250ed0052662ce2d","build.rs":"a562bfe527d21c4e8a1a44b892defa83cdff141ec5dd51ed6f3862330e50ddd7","src/bin/generate-test-data.rs":"7f1c9dc445418c7627f89d1f2aa8e550d0f85b3d1f05edb7c378ab9441714f1f","src/db.rs":"7ca5688c42d44ad6e5320208257d131c5c744be47a1cfe3e1380147abf2aadc3","src/error.rs":"0fe48e211dffb2010f732672c38e1c79b1995df3e70b06398ed8ac43d326c1b1","src/ingest.rs":"d3f528c1d62b4b6af404bb14cb0d431f8d523911ada09e4e1db5836b6cf44e04","src/interest.rs":"adbaa1e0324c7bb32b023f105b45499390a1a83973d1a8c7d727a661a25cc259","src/lib.rs":"29ce35211c9d94d561d62d7e8ef57fc56cc90a9ba42b88b54c2f4c9236a8cd4d","src/relevancy.udl":"b551e7476f30dccdc74cbf2f38fc3b87a3a7d0ec5dfa6c2ea4417b18fbc7475c","src/rs.rs":"b98091d0adca809d8fef38eb5394f885e04d4d382b7c8abd7bd0fe53f64e7bd6","src/schema.rs":"f782c712f10c4f1af2f9e1424d6b52f59a2bacfcc452a8feb763f36478f5dd5d","src/url_hash.rs":"2e908316fb70923644d1990dbf470d69ce2f5e99b0c5c3d95ec691590be8ffa5","test-data":"1ef2cd092d59e7e126cd4a514af983d449ed9f9c98708702fd237464a76c2b5e"},"package":null}

View file

@ -25,9 +25,12 @@ license = "MPL-2.0"
name = "generate-test-data"
[dependencies]
anyhow = "1.0"
base64 = "0.21.2"
log = "0.4"
md-5 = "0.10"
parking_lot = ">=0.11,<=0.12"
serde_json = "1"
thiserror = "1.0"
uniffi = "0.27.1"
url = "2.5"
@ -38,10 +41,17 @@ path = "../support/error"
[dependencies.interrupt-support]
path = "../support/interrupt"
[dependencies.remote_settings]
path = "../remote_settings"
[dependencies.rusqlite]
version = "0.30.0"
features = ["bundled"]
[dependencies.serde]
version = "1"
features = ["derive"]
[dependencies.sql-support]
path = "../support/sql"

View file

@ -20,7 +20,7 @@ pub struct RelevancyDb {
}
impl RelevancyDb {
pub fn new(path: impl AsRef<Path>) -> Result<Self> {
pub fn new(path: impl AsRef<Path>) -> Self {
// Note: use `SQLITE_OPEN_READ_WRITE` for both read and write connections.
// Even if we're opening a read connection, we may need to do a write as part of the
// initialization process.
@ -31,10 +31,10 @@ impl RelevancyDb {
| OpenFlags::SQLITE_OPEN_NO_MUTEX
| OpenFlags::SQLITE_OPEN_CREATE
| OpenFlags::SQLITE_OPEN_READ_WRITE;
Ok(Self {
Self {
reader: LazyDb::new(path.as_ref(), db_open_flags, RelevancyConnectionInitializer),
writer: LazyDb::new(path.as_ref(), db_open_flags, RelevancyConnectionInitializer),
})
}
}
pub fn close(&self) {
@ -52,7 +52,7 @@ impl RelevancyDb {
use std::sync::atomic::{AtomicU32, Ordering};
static COUNTER: AtomicU32 = AtomicU32::new(0);
let count = COUNTER.fetch_add(1, Ordering::Relaxed);
Self::new(format!("file:test{count}.sqlite?mode=memory&cache=shared")).unwrap()
Self::new(format!("file:test{count}.sqlite?mode=memory&cache=shared"))
}
/// Accesses the Relevancy database in a transaction for reading.
@ -118,7 +118,7 @@ impl<'a> RelevancyDao<'a> {
",
)?;
let interests = stmt.query_and_then((hash,), |row| -> Result<Interest> {
Ok(row.get::<_, u32>(0)?.into())
row.get::<_, u32>(0)?.try_into()
})?;
let mut interest_vec = InterestVector::default();

View file

@ -26,6 +26,18 @@ pub enum Error {
#[error("Interrupted")]
Interrupted(#[from] interrupt_support::Interrupted),
#[error("Invalid interest code: {0}")]
InvalidInterestCode(u32),
#[error("Remote Setting Error: {0}")]
RemoteSettingsError(#[from] remote_settings::RemoteSettingsError),
#[error("Serde Json Error: {0}")]
SerdeJsonError(#[from] serde_json::Error),
#[error("Base64 Decode Error: {0}")]
Base64DecodeError(String),
}
/// Result enum for the public API

394
third_party/rust/relevancy/src/ingest.rs vendored Normal file
View file

@ -0,0 +1,394 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
use crate::db::RelevancyDao;
use crate::rs::{
RelevancyAttachmentData, RelevancyRecord, RelevancyRemoteSettingsClient,
REMOTE_SETTINGS_COLLECTION,
};
use crate::url_hash::UrlHash;
use crate::{Error, Interest, RelevancyDb, Result};
use base64::{engine::general_purpose::STANDARD, Engine};
use remote_settings::{Client, RemoteSettingsConfig, RemoteSettingsRecord, RemoteSettingsServer};
// Number of rows to write when inserting interest data before checking for interruption
const WRITE_CHUNK_SIZE: usize = 100;
pub fn ensure_interest_data_populated(db: &RelevancyDb) -> Result<()> {
if !db.read(|dao| dao.need_to_load_url_interests())? {
return Ok(());
}
match fetch_interest_data() {
Ok(data) => {
db.read_write(move |dao| insert_interest_data(data, dao))?;
}
Err(e) => {
log::warn!("error fetching interest data: {e}");
return Err(Error::FetchInterestDataError);
}
}
Ok(())
}
/// Creates a Remote Settings client for the relevancy collection (using the `Prod` server
/// setting) and fetches the interest data through it.
fn fetch_interest_data() -> Result<Vec<(Interest, UrlHash)>> {
    let config = RemoteSettingsConfig {
        server: Some(RemoteSettingsServer::Prod),
        server_url: None,
        bucket_name: None,
        collection_name: REMOTE_SETTINGS_COLLECTION.to_string(),
    };
    let client = Client::new(config)?;
    fetch_interest_data_inner(client)
}
/// Fetch the interest data
fn fetch_interest_data_inner(
rs: impl RelevancyRemoteSettingsClient,
) -> Result<Vec<(Interest, UrlHash)>> {
let remote_settings_response = rs.get_records()?;
let mut result = vec![];
for record in remote_settings_response.records {
let attachment_data = match &record.attachment {
None => return Err(Error::FetchInterestDataError),
Some(a) => rs.get_attachment(&a.location)?,
};
let interest = get_interest(&record)?;
let urls = get_hash_urls(attachment_data)?;
result.extend(std::iter::repeat(interest).zip(urls));
}
Ok(result)
}
/// Parses an attachment payload (a JSON array of entries with a base64 `domain` field) into the
/// URL hashes it contains.
///
/// # Errors
///
/// Returns a JSON error if the payload does not parse, and `Error::Base64DecodeError` if an
/// entry is not valid base64 or does not convert into a `UrlHash`. Processing stops at the first
/// bad entry.
fn get_hash_urls(attachment_data: Vec<u8>) -> Result<Vec<UrlHash>> {
    let entries = serde_json::from_slice::<Vec<RelevancyAttachmentData>>(&attachment_data)?;
    entries
        .into_iter()
        .map(|entry| -> Result<UrlHash> {
            let decoded = STANDARD
                .decode(entry.domain)
                .map_err(|_| Error::Base64DecodeError("Invalid base64 error".to_string()))?;
            decoded.try_into().map_err(|_| {
                Error::Base64DecodeError("Base64 string has wrong number of bytes".to_string())
            })
        })
        .collect()
}
/// Reads the interest category out of a Remote Settings record.
///
/// The record's extra fields are deserialized as a `RelevancyRecord`, and the category code found
/// under `record_custom_details.category_to_domains` is converted into an `Interest`.
///
/// # Errors
///
/// Returns a JSON error if the fields do not match `RelevancyRecord`, or the error produced by
/// `Interest::try_from` if the code is not a known interest.
fn get_interest(record: &RemoteSettingsRecord) -> Result<Interest> {
    // `from_value` consumes its input, so the field map has to be cloned out of the borrow.
    let fields = serde_json::Value::Object(record.fields.clone());
    let parsed: RelevancyRecord = serde_json::from_value(fields)?;
    let code = parsed
        .record_custom_details
        .category_to_domains
        .category_code;
    // NOTE(review): `as u32` wraps silently if `category_code` is a signed or wider integer (its
    // type is declared outside this file), so an out-of-range value could map to an unrelated
    // code — confirm against the `RelevancyRecord` definition.
    Interest::try_from(code as u32)
}
/// Writes every `(interest, url hash)` pair to the database.
///
/// Interruption is checked before the first row and then again every `WRITE_CHUNK_SIZE` rows, so
/// a long insert can be cancelled part-way through.
fn insert_interest_data(data: Vec<(Interest, UrlHash)>, dao: &mut RelevancyDao) -> Result<()> {
    for (row, (interest, url_hash)) in data.iter().enumerate() {
        // The first row of each chunk triggers the interruption check.
        if row % WRITE_CHUNK_SIZE == 0 {
            dao.err_if_interrupted()?;
        }
        dao.add_url_interest(*url_hash, *interest)?;
    }
    Ok(())
}
#[cfg(test)]
mod test {
use std::{cell::RefCell, collections::HashMap};
use anyhow::Context;
use remote_settings::RemoteSettingsResponse;
use serde_json::json;
use super::*;
use crate::{rs::RelevancyRemoteSettingsClient, url_hash::hash_url, InterestVector};
/// Fake Remote Settings content (records plus attachment payloads) that the tests feed to the
/// ingestion code, so behavior can be checked in a data-driven way.
struct Snapshot {
    records: Vec<RemoteSettingsRecord>,
    attachments: HashMap<&'static str, Vec<u8>>,
}

impl Snapshot {
    /// Builds a snapshot from a JSON array of Relevancy Remote Settings records.
    ///
    /// The intended idiom is `Snapshot::with_records(json!(...))`, using the
    /// [`serde_json::json!`] macro instead of constructing the records by hand. The snapshot
    /// starts with no attachments.
    fn with_records(value: serde_json::Value) -> anyhow::Result<Self> {
        let records = serde_json::from_value(value)
            .context("Couldn't create snapshot with Remote Settings records")?;
        Ok(Self {
            records,
            attachments: HashMap::new(),
        })
    }

    /// Registers `value`, serialized to JSON bytes, as the attachment served for `location`.
    fn with_data(
        mut self,
        location: &'static str,
        value: serde_json::Value,
    ) -> anyhow::Result<Self> {
        let bytes =
            serde_json::to_vec(&value).context("Couldn't add data attachment to snapshot")?;
        self.attachments.insert(location, bytes);
        Ok(self)
    }
}
/// Test double for the Remote Settings client: serves records and attachments out of a
/// [`Snapshot`].
struct SnapshotSettingsClient {
    /// The snapshot currently being served. Tests can change it through
    /// [`RefCell::borrow_mut()`] to simulate remote updates.
    snapshot: RefCell<Snapshot>,
}

impl SnapshotSettingsClient {
    /// Wraps `snapshot` as the client's initial content.
    fn with_snapshot(snapshot: Snapshot) -> Self {
        let snapshot = RefCell::new(snapshot);
        Self { snapshot }
    }
}

impl RelevancyRemoteSettingsClient for SnapshotSettingsClient {
    /// Returns a copy of the snapshot's records; `last_modified` is the newest record timestamp,
    /// or 0 when there are no records.
    fn get_records(&self) -> Result<RemoteSettingsResponse> {
        let snapshot = self.snapshot.borrow();
        let last_modified = snapshot
            .records
            .iter()
            .map(|record| record.last_modified)
            .max()
            .unwrap_or(0);
        Ok(RemoteSettingsResponse {
            records: snapshot.records.clone(),
            last_modified,
        })
    }

    /// Returns a copy of the attachment registered for `location`.
    ///
    /// # Panics
    ///
    /// Panics if the snapshot has no attachment for `location`; a test asking for an unregistered
    /// attachment is treated as a bug in the test.
    fn get_attachment(&self, location: &str) -> Result<Vec<u8>> {
        let snapshot = self.snapshot.borrow();
        match snapshot.attachments.get(location) {
            Some(bytes) => Ok(bytes.clone()),
            None => unreachable!("Unexpected request for attachment `{}`", location),
        }
    }
}
/// Seeds a handful of URL-to-interest rows and checks that looking each URL up yields the
/// matching interest vector, including a URL that maps to two interests.
#[test]
fn test_interest_vectors() {
    let db = RelevancyDb::new_for_test();
    db.read_write(|dao| {
        // Rows to insert; `nascar.com` is deliberately listed under two interests.
        let seed = [
            ("https://espn.com", Interest::Sports),
            ("https://dogs.com", Interest::Animals),
            ("https://cars.com", Interest::Autos),
            ("https://www.vouge.com", Interest::Fashion),
            ("https://slashdot.org", Interest::Tech),
            ("https://www.nascar.com", Interest::Autos),
            ("https://www.nascar.com", Interest::Sports),
            ("https://unknown.url", Interest::Inconclusive),
        ];
        for (url, interest) in seed {
            dao.add_url_interest(hash_url(url).unwrap(), interest)?;
        }

        // Looked-up URL paired with the vector it should produce.
        let expectations = [
            (
                "https://espn.com/",
                InterestVector {
                    sports: 1,
                    ..InterestVector::default()
                },
            ),
            (
                "https://dogs.com/",
                InterestVector {
                    animals: 1,
                    ..InterestVector::default()
                },
            ),
            (
                "https://cars.com/",
                InterestVector {
                    autos: 1,
                    ..InterestVector::default()
                },
            ),
            (
                "https://www.vouge.com/",
                InterestVector {
                    fashion: 1,
                    ..InterestVector::default()
                },
            ),
            (
                "https://slashdot.org/",
                InterestVector {
                    tech: 1,
                    ..InterestVector::default()
                },
            ),
            (
                "https://www.nascar.com/",
                InterestVector {
                    autos: 1,
                    sports: 1,
                    ..InterestVector::default()
                },
            ),
            (
                "https://unknown.url/",
                InterestVector {
                    inconclusive: 1,
                    ..InterestVector::default()
                },
            ),
        ];
        for (url, expected) in expectations {
            assert_eq!(dao.get_url_interest_vector(url).unwrap(), expected);
        }
        Ok(())
    })
    .unwrap();
}
/// Checks which parts of a URL influence the interest lookup: path, query, scheme and extra
/// leading domain components are ignored, while a different domain name is not.
#[test]
fn test_variations_on_the_url() {
    let db = RelevancyDb::new_for_test();
    db.read_write(|dao| {
        let seed = [
            ("https://espn.com", Interest::Sports),
            ("https://nascar.com", Interest::Autos),
            ("https://nascar.com", Interest::Sports),
        ];
        for (url, interest) in seed {
            dao.add_url_interest(hash_url(url).unwrap(), interest)?;
        }

        let sports_only = InterestVector {
            sports: 1,
            ..InterestVector::default()
        };
        let autos_and_sports = InterestVector {
            autos: 1,
            sports: 1,
            ..InterestVector::default()
        };

        // Different paths/queries should work
        let with_path = dao.get_url_interest_vector("https://espn.com/foo/bar/?baz");
        assert_eq!(with_path.unwrap(), sports_only);

        // Different schemes should too
        let with_http = dao.get_url_interest_vector("http://espn.com/");
        assert_eq!(with_http.unwrap(), sports_only);

        // But changes to the domain shouldn't
        let other_domain = dao.get_url_interest_vector("http://espn2.com/");
        assert_eq!(other_domain.unwrap(), InterestVector::default());

        // However, extra components past the 2nd one in the domain are ignored
        let with_subdomain = dao.get_url_interest_vector("https://www.nascar.com/");
        assert_eq!(with_subdomain.unwrap(), autos_and_sports);

        Ok(())
    })
    .unwrap();
}
/// Feeds one "animals" record with a two-entry attachment through the fetch path and checks that
/// both hashed domains come back paired with `Interest::Animals`.
#[test]
fn test_parse_records() -> anyhow::Result<()> {
    let records = json!([{
        "id": "animals-0001",
        "last_modified": 15,
        "type": "category_to_domains",
        "attachment": {
            "filename": "data-1.json",
            "mimetype": "application/json",
            "location": "data-1.json",
            "hash": "",
            "size": 0
        },
        "record_custom_details": {
            "category_to_domains": {
                "category": "animals",
                "category_code": 1,
                "version": 1
            }
        }
    }]);
    // Base64-encoded URL hashes; the assertion below expects them to match dogs.com and cats.com.
    let attachment = json!([
        {"domain": "J2jtyjQtYQ/+/p//xhz43Q=="},
        {"domain": "Zd4awCwGZLkat59nIWje3g=="}]);
    let snapshot = Snapshot::with_records(records)?.with_data("data-1.json", attachment)?;
    let rs_client = SnapshotSettingsClient::with_snapshot(snapshot);

    let expected: Vec<_> = ["https://dogs.com", "https://cats.com"]
        .into_iter()
        .map(|url| (Interest::Animals, hash_url(url).unwrap()))
        .collect();
    assert_eq!(fetch_interest_data_inner(rs_client).unwrap(), expected);

    Ok(())
}
/// Feeds a well-formed record whose attachment holds undecodable `domain` strings through the
/// fetch path and checks that the failure is reported as a base64 decoding error.
///
/// Asserting the specific variant matters: accepting any error would let this test pass on an
/// unrelated failure (for example a record that no longer parses) without ever reaching the
/// base64 handling.
#[test]
fn test_parse_records_with_bad_domain_strings() -> anyhow::Result<()> {
    let snapshot = Snapshot::with_records(json!([{
        "id": "animals-0001",
        "last_modified": 15,
        "type": "category_to_domains",
        "attachment": {
            "filename": "data-1.json",
            "mimetype": "application/json",
            "location": "data-1.json",
            "hash": "",
            "size": 0
        },
        "record_custom_details": {
            "category_to_domains": {
                "category": "animals",
                "category_code": 1,
                "version": 1
            }
        }
    }]))?
    .with_data(
        "data-1.json",
        json!([
            {"domain": "badString"},
            {"domain": "notBase64"}]),
    )?;
    let rs_client = SnapshotSettingsClient::with_snapshot(snapshot);
    let result = fetch_interest_data_inner(rs_client);
    assert!(
        matches!(result, Err(Error::Base64DecodeError(_))),
        "expected a Base64DecodeError for undecodable domain strings, got {result:?}"
    );

    Ok(())
}
}

View file

@ -2,33 +2,37 @@
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
use crate::Error;
/// List of possible interests for a domain. Domains can be associated with one or multiple
/// interests. `Inconclusive` is used for domains in the user's top sites that we can't classify
/// because there's no corresponding entry in the interest database.
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
#[repr(u32)]
pub enum Interest {
Inconclusive,
Animals,
Arts,
Autos,
Business,
Career,
Education,
Fashion,
Finance,
Food,
Government,
// Note: if you change these codes, make sure to update the `TryFrom<u32>` implementation and
// the `test_interest_code_conversion` test.
Inconclusive = 0,
Animals = 1,
Arts = 2,
Autos = 3,
Business = 4,
Career = 5,
Education = 6,
Fashion = 7,
Finance = 8,
Food = 9,
Government = 10,
//Disable this per policy consultation
// Health,
Hobbies,
Home,
News,
RealEstate,
Society,
Sports,
Tech,
Travel,
// Health = 11,
Hobbies = 12,
Home = 13,
News = 14,
RealEstate = 15,
Society = 16,
Sports = 17,
Tech = 18,
Travel = 19,
}
impl From<Interest> for u32 {
@ -43,14 +47,35 @@ impl From<Interest> for usize {
}
}
impl From<u32> for Interest {
fn from(code: u32) -> Self {
if code as usize > Self::COUNT {
panic!("Invalid interest code: {code}")
impl TryFrom<u32> for Interest {
// On error, return the invalid code back
type Error = Error;
fn try_from(code: u32) -> Result<Self, Self::Error> {
match code {
0 => Ok(Self::Inconclusive),
1 => Ok(Self::Animals),
2 => Ok(Self::Arts),
3 => Ok(Self::Autos),
4 => Ok(Self::Business),
5 => Ok(Self::Career),
6 => Ok(Self::Education),
7 => Ok(Self::Fashion),
8 => Ok(Self::Finance),
9 => Ok(Self::Food),
10 => Ok(Self::Government),
//Disable this per policy consultation
// 11 => Ok(Self::Health),
12 => Ok(Self::Hobbies),
13 => Ok(Self::Home),
14 => Ok(Self::News),
15 => Ok(Self::RealEstate),
16 => Ok(Self::Society),
17 => Ok(Self::Sports),
18 => Ok(Self::Tech),
19 => Ok(Self::Travel),
n => Err(Error::InvalidInterestCode(n)),
}
// Safety: This is safe since Interest has a u32 representation and we've done a bounds
// check
unsafe { std::mem::transmute(code) }
}
}
@ -111,6 +136,34 @@ pub struct InterestVector {
pub travel: u32,
}
/// Field-wise addition: each interest count in the result is the sum of the two operands' counts
/// for that interest.
impl std::ops::Add for InterestVector {
    type Output = Self;

    fn add(mut self, other: Self) -> Self {
        // Accumulate into the left operand and hand it back as the result.
        self.inconclusive = self.inconclusive + other.inconclusive;
        self.animals = self.animals + other.animals;
        self.arts = self.arts + other.arts;
        self.autos = self.autos + other.autos;
        self.business = self.business + other.business;
        self.career = self.career + other.career;
        self.education = self.education + other.education;
        self.fashion = self.fashion + other.fashion;
        self.finance = self.finance + other.finance;
        self.food = self.food + other.food;
        self.government = self.government + other.government;
        self.hobbies = self.hobbies + other.hobbies;
        self.home = self.home + other.home;
        self.news = self.news + other.news;
        self.real_estate = self.real_estate + other.real_estate;
        self.society = self.society + other.society;
        self.sports = self.sports + other.sports;
        self.tech = self.tech + other.tech;
        self.travel = self.travel + other.travel;
        self
    }
}
impl std::ops::Index<Interest> for InterestVector {
type Output = u32;
@ -166,3 +219,29 @@ impl std::ops::IndexMut<Interest> for InterestVector {
}
}
}
#[cfg(test)]
mod test {
    use super::*;

    /// Checks the mapping between `Interest` variants and their numeric codes in both directions.
    #[test]
    fn test_interest_code_conversion() {
        // Every defined interest must survive a round trip through its code.
        for interest in Interest::all() {
            let code = u32::from(interest);
            assert_eq!(Interest::try_from(code).unwrap(), interest)
        }
        // try_from() must reject codes with no variant and hand the offending code back in the
        // error: 20 and 100 are out of bounds, and 11 (Health) is currently disabled.
        for code in [20u32, 100, 11] {
            assert!(matches!(
                Interest::try_from(code),
                Err(Error::InvalidInterestCode(rejected)) if rejected == code
            ));
        }
    }
}

View file

@ -11,8 +11,9 @@
mod db;
mod error;
mod ingest;
mod interest;
mod populate_interests;
mod rs;
mod schema;
pub mod url_hash;
@ -28,11 +29,10 @@ pub struct RelevancyStore {
/// Top-level API for the Relevancy component
impl RelevancyStore {
#[handle_error(Error)]
pub fn new(db_path: String) -> ApiResult<Self> {
Ok(Self {
db: RelevancyDb::new(db_path)?,
})
pub fn new(db_path: String) -> Self {
Self {
db: RelevancyDb::new(db_path),
}
}
pub fn close(&self) {
@ -55,9 +55,21 @@ impl RelevancyStore {
///
/// This method may execute for a long time and should only be called from a worker thread.
#[handle_error(Error)]
pub fn ingest(&self, _top_urls_by_frecency: Vec<String>) -> ApiResult<()> {
populate_interests::ensure_interest_data_populated(&self.db)?;
todo!()
pub fn ingest(&self, top_urls_by_frecency: Vec<String>) -> ApiResult<InterestVector> {
ingest::ensure_interest_data_populated(&self.db)?;
self.classify(top_urls_by_frecency)
}
/// Sums the interest vectors of all given URLs into a single vector.
///
/// Each URL is looked up in the database in its own read; the first lookup that fails aborts the
/// classification and its error is returned.
pub fn classify(&self, top_urls_by_frecency: Vec<String>) -> Result<InterestVector> {
    // For experimentation purposes we are going to return an interest vector.
    // Eventually we would want to store this data in the DB and incrementally update it.
    top_urls_by_frecency.into_iter().try_fold(
        InterestVector::default(),
        |total, url| -> Result<InterestVector> {
            let url_vector = self.db.read(|dao| dao.get_url_interest_vector(&url))?;
            Ok(total + url_vector)
        },
    )
}
/// Calculate metrics for the validation phase
@ -87,3 +99,45 @@ pub struct InterestMetrics {
}
uniffi::include_scaffolding!("relevancy");
#[cfg(test)]
mod test {
    use crate::url_hash::hash_url;

    use super::*;

    /// Seeds the store's database directly, then checks that `ingest` classifies the top URLs
    /// into the expected per-interest counts.
    #[test]
    fn test_ingest() {
        let relevancy_store =
            RelevancyStore::new("file:test_store_data?mode=memory&cache=shared".to_owned());

        // Two food sites, one animal site and one site explicitly marked inconclusive.
        let seed = [
            ("https://food.com", Interest::Food),
            ("https://hello.com", Interest::Inconclusive),
            ("https://pasta.com", Interest::Food),
            ("https://dog.com", Interest::Animals),
        ];
        relevancy_store
            .db
            .read_write(|dao| {
                for (url, interest) in seed {
                    dao.add_url_interest(hash_url(url).unwrap(), interest)?;
                }
                Ok(())
            })
            .expect("Insert should succeed");

        let top_urls: Vec<String> = [
            "https://food.com/",
            "https://hello.com",
            "https://pasta.com",
            "https://dog.com",
        ]
        .into_iter()
        .map(String::from)
        .collect();
        let expected = InterestVector {
            inconclusive: 1,
            animals: 1,
            food: 2,
            ..InterestVector::default()
        };
        assert_eq!(relevancy_store.ingest(top_urls).unwrap(), expected);
    }
}

View file

@ -1,164 +0,0 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
use crate::{url_hash::UrlHash, Error, Interest, RelevancyDb, Result};
use std::io::{Cursor, Read};
// Number of rows to write when inserting interest data before checking for interruption
const WRITE_CHUNK_SIZE: usize = 100;
pub fn ensure_interest_data_populated(db: &RelevancyDb) -> Result<()> {
if !db.read(|dao| dao.need_to_load_url_interests())? {
return Ok(());
}
let interest_data = match fetch_interest_data() {
Ok(data) => data,
Err(e) => {
log::warn!("error fetching interest data: {e}");
return Err(Error::FetchInterestDataError);
}
};
db.read_write(move |dao| {
for chunk in interest_data.chunks(WRITE_CHUNK_SIZE) {
for (url_hash, interest) in chunk {
dao.add_url_interest(*url_hash, *interest)?;
}
dao.err_if_interrupted()?;
}
Ok(())
})
}
/// Fetch the interest data.
///
/// The data is decoded from the bytes embedded at compile time from `../test-data`. For each
/// interest yielded by `Interest::all()`, in order, the stream holds a little-endian `u32` count
/// followed by that many 16-byte URL hashes.
///
/// # Errors
///
/// Returns an I/O error if the embedded data ends before all expected bytes have been read.
fn fetch_interest_data() -> std::io::Result<Vec<(UrlHash, Interest)>> {
    /// Read exactly `N` bytes from `reader` into a fixed-size array.
    fn read_array<R: Read, const N: usize>(reader: &mut R) -> std::io::Result<[u8; N]> {
        let mut buf = [0u8; N];
        reader.read_exact(&mut buf)?;
        Ok(buf)
    }

    // TODO: this hack should be replaced with something that fetches from remote settings.
    // It should ideally check for interruption while fetching the data.
    let embedded = include_bytes!("../test-data");
    let mut cursor = Cursor::new(&embedded);
    let mut pairs = vec![];
    // Walk every possible interest, in the order `Interest::all()` yields them.
    for interest in Interest::all() {
        // Each section starts with the number of hashes that follow.
        let count_bytes: [u8; 4] = read_array(&mut cursor)?;
        let hash_count = u32::from_le_bytes(count_bytes);
        for _ in 0..hash_count {
            let url_hash: UrlHash = read_array(&mut cursor)?;
            pairs.push((url_hash, interest));
        }
    }
    Ok(pairs)
}
#[cfg(test)]
mod test {
    use super::*;
    use crate::InterestVector;

    /// Create a fresh test database and load the interest data into it.
    fn populated_db() -> RelevancyDb {
        let db = RelevancyDb::new_for_test();
        ensure_interest_data_populated(&db).unwrap();
        db
    }

    #[test]
    fn test_interest_vectors() {
        populated_db()
            .read(|dao| {
                // Test that the interest data matches the values we started from in
                // `bin/generate-test-data.rs`
                let cases = [
                    (
                        "https://espn.com/",
                        InterestVector {
                            sports: 1,
                            ..InterestVector::default()
                        },
                    ),
                    (
                        "https://dogs.com/",
                        InterestVector {
                            animals: 1,
                            ..InterestVector::default()
                        },
                    ),
                    (
                        "https://cars.com/",
                        InterestVector {
                            autos: 1,
                            ..InterestVector::default()
                        },
                    ),
                    (
                        "https://www.vouge.com/",
                        InterestVector {
                            fashion: 1,
                            ..InterestVector::default()
                        },
                    ),
                    (
                        "https://slashdot.org/",
                        InterestVector {
                            tech: 1,
                            ..InterestVector::default()
                        },
                    ),
                    (
                        "https://www.nascar.com/",
                        InterestVector {
                            autos: 1,
                            sports: 1,
                            ..InterestVector::default()
                        },
                    ),
                    ("https://unknown.url/", InterestVector::default()),
                ];
                for (url, expected) in cases {
                    assert_eq!(dao.get_url_interest_vector(url).unwrap(), expected);
                }
                Ok(())
            })
            .unwrap();
    }

    #[test]
    fn test_variations_on_the_url() {
        populated_db()
            .read(|dao| {
                let cases = [
                    // Different paths/queries should work
                    (
                        "https://espn.com/foo/bar/?baz",
                        InterestVector {
                            sports: 1,
                            ..InterestVector::default()
                        },
                    ),
                    // Different schemes should too
                    (
                        "http://espn.com/",
                        InterestVector {
                            sports: 1,
                            ..InterestVector::default()
                        },
                    ),
                    // But changes to the domain shouldn't
                    ("http://www.espn.com/", InterestVector::default()),
                    // However, extra components past the 3rd one in the domain are ignored
                    (
                        "https://foo.www.nascar.com/",
                        InterestVector {
                            autos: 1,
                            sports: 1,
                            ..InterestVector::default()
                        },
                    ),
                ];
                for (url, expected) in cases {
                    assert_eq!(dao.get_url_interest_vector(url).unwrap(), expected);
                }
                Ok(())
            })
            .unwrap();
    }
}

View file

@ -10,7 +10,6 @@ interface RelevancyStore {
// Construct a new RelevancyStore
//
// This is non-blocking since databases and other resources are lazily opened.
[Throws=RelevancyApiError]
constructor(string dbpath);
// Close any open resources (for example databases)
@ -23,7 +22,7 @@ interface RelevancyStore {
// Ingest the top URLs by frequency to build up the user's interest vector
[Throws=RelevancyApiError]
void ingest(sequence<string> top_urls);
InterestVector ingest(sequence<string> top_urls);
// Calculate metrics for the user's interest vector in order to measure how strongly we're
// identifying interests. See the `InterestMetrics` struct for details.

60
third_party/rust/relevancy/src/rs.rs vendored Normal file
View file

@ -0,0 +1,60 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/
use crate::Result;
use remote_settings::RemoteSettingsResponse;
use serde::Deserialize;
/// The Remote Settings collection name.
pub(crate) const REMOTE_SETTINGS_COLLECTION: &str = "content-relevance";
/// A trait for a client that downloads records from Remote Settings.
///
/// This trait lets tests use a mock client.
pub(crate) trait RelevancyRemoteSettingsClient {
/// Fetches records from the Relevancy Remote Settings collection.
fn get_records(&self) -> Result<RemoteSettingsResponse>;
/// Fetches a record's attachment from the Relevancy Remote Settings
/// collection.
///
/// `location` identifies the attachment to download; the attachment's raw
/// bytes are returned.
fn get_attachment(&self, location: &str) -> Result<Vec<u8>>;
}
/// Implementation backed by `remote_settings::Client`.
///
/// Each method forwards to `remote_settings::Client`'s own method of the same name and
/// propagates any error with `?`.
impl RelevancyRemoteSettingsClient for remote_settings::Client {
    fn get_records(&self) -> Result<RemoteSettingsResponse> {
        let response = remote_settings::Client::get_records(self)?;
        Ok(response)
    }

    fn get_attachment(&self, location: &str) -> Result<Vec<u8>> {
        let attachment = remote_settings::Client::get_attachment(self, location)?;
        Ok(attachment)
    }
}
/// A record in the Relevancy Remote Settings collection.
#[derive(Clone, Debug, Deserialize)]
pub struct RelevancyRecord {
/// The record's type, deserialized from the `type` field (renamed because `type` is a
/// reserved word in Rust).
#[serde(rename = "type")]
pub record_type: String,
/// Custom details attached to the record; see [`RecordCustomDetails`].
pub record_custom_details: RecordCustomDetails,
}
/// Custom details related to the category of the record.
#[derive(Clone, Debug, Deserialize)]
pub struct RecordCustomDetails {
/// Category information for the record; see [`CategoryToDomains`].
pub category_to_domains: CategoryToDomains,
}
/// Category information related to the record.
#[derive(Clone, Debug, Deserialize)]
pub struct CategoryToDomains {
/// Version number carried by the record.
/// NOTE(review): presumably the revision of the category data — confirm.
pub version: i32,
/// Name of the category.
pub category: String,
/// Numeric code of the category.
/// NOTE(review): presumably maps onto an `Interest` value — verify against the caller.
pub category_code: i32,
}
/// A downloaded Remote Settings attachment that contains domain data.
#[derive(Clone, Debug, Deserialize)]
pub struct RelevancyAttachmentData {
/// The domain entry.
/// NOTE(review): the encoding (plain text vs. hashed/encoded) is not visible here — confirm.
pub domain: String,
}

View file

@ -8,11 +8,10 @@ use url::{Host, Url};
pub type UrlHash = [u8; 16];
/// Given a URL, extract the part of it that we want to use to identify it.
///
/// We currently use the final 3 components of the URL domain.
///
/// TODO: decide if this should be 3 or 3 components.
pub fn url_hash_source(url: &str) -> Option<String> {
// We currently use the final 2 components of the URL domain.
const URL_COMPONENTS_TO_USE: usize = 2;
let url = Url::parse(url).ok()?;
let domain = match url.host() {
Some(Host::Domain(d)) => d,
@ -20,7 +19,7 @@ pub fn url_hash_source(url: &str) -> Option<String> {
};
// This will store indexes of `.` chars as we search backwards.
let mut pos = domain.len();
for _ in 0..3 {
for _ in 0..URL_COMPONENTS_TO_USE {
match domain[0..pos].rfind('.') {
Some(p) => pos = p,
// The domain has fewer than `URL_COMPONENTS_TO_USE` dots, return it all
@ -47,12 +46,12 @@ mod test {
fn test_url_hash_source() {
let table = [
("http://example.com/some-path", Some("example.com")),
("http://foo.example.com/some-path", Some("foo.example.com")),
("http://foo.example.com/some-path", Some("example.com")),
(
"http://foo.bar.baz.example.com/some-path",
Some("baz.example.com"),
Some("example.com"),
),
("http://foo.com.uk/some-path", Some("foo.com.uk")),
("http://foo.com.uk/some-path", Some("com.uk")),
("http://amazon.com/some-path", Some("amazon.com")),
("http://192.168.0.1/some-path", None),
];

View file

@ -1 +1 @@
{"files":{"Cargo.toml":"05e4d7f7b3649a3e3fa441c4af53a633d18f20bb04fd761ed33fc9d461fd0dee","README.md":"fb72d0028586cab1421b853ef529d7ce78ad7316818b7733a4f3488b0fba67f7","benches/benchmark_all.rs":"c2343c9197b6d9ccb0798d7701b1b0d2569d494dd31a975d21d7ec6f26e32879","build.rs":"78780c5cccfe22c3ff4198624b9e188559c437c3e6fa1c8bb66548eee6aa66bf","src/benchmarks/README.md":"ee6d50df2c31cfd80a5bc047011b518dcf57f1ef928a811bb770f1a09f41b3de","src/benchmarks/client.rs":"4b2125031d740ca1ab468e76bbea777ac0bc4cc221b03b7bc2da773bed61dac5","src/benchmarks/ingest.rs":"1ffdc403fb945ea0b58353df9773ba45ab0e9082d61dd5330ad49fad8cbb5d9f","src/benchmarks/mod.rs":"fe1898ba4d783213525da10d92858ee84cebfd22749bad7aeb461d338fe5504a","src/bin/debug_ingestion_sizes.rs":"ce6e810be7b3fc19e826d75b622b82cfab5a1a99397a6d0833c2c4eebff2d364","src/config.rs":"206ae9dc768c755649cb0c88a7b1fc3c926c715441784f61e9dc06a8a02fc568","src/db.rs":"734f5fd9f36f03c07a508a9a353872b81107f5fe09f27294ba27d7e1249e3988","src/error.rs":"f563210a6c050d98ec85e0f6d9401e7373bfb816e865e8edabbabb23d848ba13","src/keyword.rs":"988d0ab021c0df19cfd3c519df7d37f606bf984cd14d0efca4e5a7aff88344dd","src/lib.rs":"18f988eb49626c6e186c8bc65a51b4a40d796f36d3de8905506f76c6e5e876cd","src/pocket.rs":"1316668840ec9b4ea886223921dc9d3b5a1731d1a5206c0b1089f2a6c45c1b7b","src/provider.rs":"fe76f19a223f5cac056c7d48525087ca2c26bf0629b0e11b1f8dc98d165c8bb2","src/rs.rs":"e3eabde58c859ebe1154bf8da56ca134ace135934e3f280acc8186b4204399b3","src/schema.rs":"88ff3ae6b652fa5a5cff4dc504d11a7fc33f1b2ee9716b970f646d9f9ca90ab7","src/store.rs":"5873438bfc2d2a3e112935bb196bcd1f9b46351d1b341113115f45f7117fc3bf","src/suggest.udl":"b49043c5ec0210aeccf92eadbc1acdce697fc588a2500a281e083b3d8c42ff73","src/suggestion.rs":"f31227779d13d1b03a622e08a417ceba4afb161885a01c2bc87a6a652b5e8be5","src/yelp.rs":"9c0dc02a994cc05df524aa4ef337d10f575d1891259193b6419fed6fe279cb54","uniffi.toml":"f26317442ddb5b3281245bef6e60ffcb78bb95d29fe4a351a56dbb88d4ec8aab"},"package":null}
{"files":{"Cargo.toml":"05e4d7f7b3649a3e3fa441c4af53a633d18f20bb04fd761ed33fc9d461fd0dee","README.md":"fb72d0028586cab1421b853ef529d7ce78ad7316818b7733a4f3488b0fba67f7","benches/benchmark_all.rs":"c2343c9197b6d9ccb0798d7701b1b0d2569d494dd31a975d21d7ec6f26e32879","build.rs":"78780c5cccfe22c3ff4198624b9e188559c437c3e6fa1c8bb66548eee6aa66bf","src/benchmarks/README.md":"ee6d50df2c31cfd80a5bc047011b518dcf57f1ef928a811bb770f1a09f41b3de","src/benchmarks/client.rs":"4b2125031d740ca1ab468e76bbea777ac0bc4cc221b03b7bc2da773bed61dac5","src/benchmarks/ingest.rs":"1ffdc403fb945ea0b58353df9773ba45ab0e9082d61dd5330ad49fad8cbb5d9f","src/benchmarks/mod.rs":"fe1898ba4d783213525da10d92858ee84cebfd22749bad7aeb461d338fe5504a","src/bin/debug_ingestion_sizes.rs":"ce6e810be7b3fc19e826d75b622b82cfab5a1a99397a6d0833c2c4eebff2d364","src/config.rs":"206ae9dc768c755649cb0c88a7b1fc3c926c715441784f61e9dc06a8a02fc568","src/db.rs":"a4e18b9f45e0473ea64b5ecdf6d1d67e0519f9629d495c157b0bd1b47c3e2f4f","src/error.rs":"f563210a6c050d98ec85e0f6d9401e7373bfb816e865e8edabbabb23d848ba13","src/keyword.rs":"988d0ab021c0df19cfd3c519df7d37f606bf984cd14d0efca4e5a7aff88344dd","src/lib.rs":"18f988eb49626c6e186c8bc65a51b4a40d796f36d3de8905506f76c6e5e876cd","src/pocket.rs":"1316668840ec9b4ea886223921dc9d3b5a1731d1a5206c0b1089f2a6c45c1b7b","src/provider.rs":"fe76f19a223f5cac056c7d48525087ca2c26bf0629b0e11b1f8dc98d165c8bb2","src/rs.rs":"e3eabde58c859ebe1154bf8da56ca134ace135934e3f280acc8186b4204399b3","src/schema.rs":"88ff3ae6b652fa5a5cff4dc504d11a7fc33f1b2ee9716b970f646d9f9ca90ab7","src/store.rs":"aad193774eecec739a7debd1c9e4fd46df384e7a524203e5e5f0354b93f73c1c","src/suggest.udl":"bfa653aa88c954860a9728a597daad8f4a7db8c81bc156725bf801f7cddf8459","src/suggestion.rs":"f31227779d13d1b03a622e08a417ceba4afb161885a01c2bc87a6a652b5e8be5","src/yelp.rs":"9c0dc02a994cc05df524aa4ef337d10f575d1891259193b6419fed6fe279cb54","uniffi.toml":"f26317442ddb5b3281245bef6e60ffcb78bb95d29fe4a351a56dbb88d4ec8aab"},"package":null}

View file

@ -188,6 +188,12 @@ impl<'a> SuggestDao<'a> {
//
// These methods implement CRUD operations
/// Returns `true` when the `suggestions` table contains no rows.
///
/// # Errors
///
/// Propagates any error raised while running the query.
pub fn suggestions_table_empty(&self) -> Result<bool> {
    let is_empty = self
        .conn
        .query_one::<bool>("SELECT NOT EXISTS (SELECT 1 FROM suggestions)")?;
    Ok(is_empty)
}
/// Fetches suggestions that match the given query from the database.
pub fn fetch_suggestions(&self, query: &SuggestionQuery) -> Result<Vec<Suggestion>> {
let unique_providers = query.providers.iter().collect::<HashSet<_>>();

View file

@ -275,6 +275,8 @@ pub struct SuggestIngestionConstraints {
/// soft limit, and the store might ingest more than requested.
pub max_suggestions: Option<u64>,
pub providers: Option<Vec<SuggestionProvider>>,
/// Only run ingestion if the table `suggestions` is empty
pub empty_only: bool,
}
/// The implementation of the store. This is generic over the Remote Settings
@ -357,6 +359,10 @@ where
pub fn ingest(&self, constraints: SuggestIngestionConstraints) -> Result<()> {
let writer = &self.dbs()?.writer;
if constraints.empty_only && !writer.read(|dao| dao.suggestions_table_empty())? {
return Ok(());
}
if let Some(unparsable_records) =
writer.read(|dao| dao.get_meta::<UnparsableRecords>(UNPARSABLE_RECORDS_META_KEY))?
{
@ -888,6 +894,12 @@ mod tests {
let store = unique_test_store(SnapshotSettingsClient::with_snapshot(snapshot));
// suggestions_table_empty returns true before the ingestion is complete
assert!(store
.dbs()?
.reader
.read(|dao| dao.suggestions_table_empty())?);
store.ingest(SuggestIngestionConstraints::default())?;
store.dbs()?.reader.read(|dao| {
@ -927,6 +939,153 @@ mod tests {
Ok(())
})?;
// suggestions_table_empty returns false after the ingestion is complete
assert!(!store
.dbs()?
.reader
.read(|dao| dao.suggestions_table_empty())?);
Ok(())
}
/// Tests that ingestion with `empty_only = true` runs while the `suggestions` table is empty
/// and is skipped once the table has been populated.
#[test]
fn ingest_empty_only() -> anyhow::Result<()> {
before_each();
// This ingestion should run, since the DB is empty
let snapshot = Snapshot::with_records(json!([{
"id": "1234",
"type": "data",
"last_modified": 15,
"attachment": {
"filename": "data-1.json",
"mimetype": "application/json",
"location": "data-1.json",
"hash": "",
"size": 0,
},
}]))?
.with_data(
"data-1.json",
json!([{
"id": 0,
"advertiser": "Los Pollos Hermanos",
"iab_category": "8 - Food & Drink",
"keywords": ["lo", "los", "los p", "los pollos", "los pollos h", "los pollos hermanos"],
"title": "Los Pollos Hermanos - Albuquerque",
"url": "https://www.lph-nm.biz",
"icon": "5678",
"impression_url": "https://example.com/impression_url",
"click_url": "https://example.com/click_url",
"score": 0.3
}]),
)?;
let mut store = unique_test_store(SnapshotSettingsClient::with_snapshot(snapshot));
store.ingest(SuggestIngestionConstraints {
empty_only: true,
..SuggestIngestionConstraints::default()
})?;
// The suggestion from "data-1.json" must now be present.
store.dbs()?.reader.read(|dao| {
expect![[r#"
[
Amp {
title: "Los Pollos Hermanos - Albuquerque",
url: "https://www.lph-nm.biz",
raw_url: "https://www.lph-nm.biz",
icon: None,
icon_mimetype: None,
full_keyword: "los",
block_id: 0,
advertiser: "Los Pollos Hermanos",
iab_category: "8 - Food & Drink",
impression_url: "https://example.com/impression_url",
click_url: "https://example.com/click_url",
raw_click_url: "https://example.com/click_url",
score: 0.3,
},
]
"#]]
.assert_debug_eq(&dao.fetch_suggestions(&SuggestionQuery {
keyword: "lo".into(),
providers: vec![SuggestionProvider::Amp],
limit: None,
})?);
Ok(())
})?;
// ingestion should NOT run with SuggestIngestionConstraints::empty_only = true, since the DB
// is no longer empty. The snapshot below adds a second record ("data-2.json") that must
// therefore not be ingested.
store.settings_client = SnapshotSettingsClient::with_snapshot(Snapshot::with_records(json!([{
"id": "1234",
"type": "data",
"last_modified": 15,
"attachment": {
"filename": "data-1.json",
"mimetype": "application/json",
"location": "data-1.json",
"hash": "",
"size": 0,
},
}, {
"id": "12345",
"type": "data",
"last_modified": 15,
"attachment": {
"filename": "data-2.json",
"mimetype": "application/json",
"location": "data-2.json",
"hash": "",
"size": 0,
},
}]))?
.with_data(
"data-1.json",
json!([{
"id": 0,
"advertiser": "Los Pollos Hermanos",
"iab_category": "8 - Food & Drink",
"keywords": ["lo", "los", "los p", "los pollos", "los pollos h", "los pollos hermanos"],
"title": "Los Pollos Hermanos - Albuquerque",
"url": "https://www.lph-nm.biz",
"icon": "5678",
"impression_url": "https://example.com/impression_url",
"click_url": "https://example.com/click_url",
"score": 0.3
}])
)?
.with_data("data-2.json", json!([{
"id": 1,
"advertiser": "Good Place Eats",
"iab_category": "8 - Food & Drink",
"keywords": ["la", "las", "lasa", "lasagna", "lasagna come out tomorrow"],
"title": "Lasagna Come Out Tomorrow",
"url": "https://www.lasagna.restaurant",
"icon": "2",
"impression_url": "https://example.com/impression_url",
"click_url": "https://example.com/click_url"
}]),
)?);
store.ingest(SuggestIngestionConstraints {
empty_only: true,
..SuggestIngestionConstraints::default()
})?;
// The "la" keyword only exists in "data-2.json", so an empty result shows that the second
// ingestion was skipped.
store.dbs()?.reader.read(|dao| {
expect![[r#"
[]
"#]]
.assert_debug_eq(&dao.fetch_suggestions(&SuggestionQuery {
keyword: "la".into(),
providers: vec![SuggestionProvider::Amp],
limit: None,
})?);
Ok(())
})?;
Ok(())
}
@ -2212,6 +2371,7 @@ mod tests {
store.ingest(SuggestIngestionConstraints {
max_suggestions: Some(max_suggestions),
providers: Some(vec![SuggestionProvider::Amp]),
..SuggestIngestionConstraints::default()
})?;
let actual_limit = store
.settings_client
@ -5201,6 +5361,7 @@ mod tests {
let constraints = SuggestIngestionConstraints {
max_suggestions: Some(100),
providers: Some(vec![SuggestionProvider::Amp, SuggestionProvider::Pocket]),
..SuggestIngestionConstraints::default()
};
store.ingest(constraints)?;

View file

@ -106,6 +106,14 @@ dictionary SuggestionQuery {
dictionary SuggestIngestionConstraints {
u64? max_suggestions = null;
sequence<SuggestionProvider>? providers = null;
// Only ingest if the table `suggestions` is empty.
//
// This is intended to handle periodic updates. Consumers can schedule an ingest with
// `empty_only=true` on startup and a regular ingest with `empty_only=false` to run on a long
// periodic schedule (maybe once a day). This allows ingestion to normally be run at a slow,
// periodic rate. However, if there is a schema upgrade that causes the database to be thrown
// away, then the `empty_only=true` ingestion that runs on startup will repopulate it.
boolean empty_only = false;
};
dictionary SuggestGlobalConfig {

View file

@ -11,6 +11,7 @@ ChromeUtils.defineESModuleGetters(lazy, {
"resource://gre/modules/contentrelevancy/private/InputUtils.sys.mjs",
NimbusFeatures: "resource://nimbus/ExperimentAPI.sys.mjs",
RelevancyStore: "resource://gre/modules/RustRelevancy.sys.mjs",
InterestVector: "resource://gre/modules/RustRelevancy.sys.mjs",
});
XPCOMUtils.defineLazyServiceGetter(
@ -40,6 +41,7 @@ const NIMBUS_VARIABLE_ENABLED = "enabled";
const NIMBUS_VARIABLE_MAX_INPUT_URLS = "maxInputUrls";
const NIMBUS_VARIABLE_MIN_INPUT_URLS = "minInputUrls";
const NIMBUS_VARIABLE_TIMER_INTERVAL = "timerInterval";
const NIMBUS_VARIABLE_INGEST_ENABLED = "ingestEnabled";
ChromeUtils.defineLazyGetter(lazy, "log", () => {
return console.createInstance({
@ -243,18 +245,21 @@ class RelevancyManager {
lazy.log.info("Starting interest classification");
timerId = Glean.relevancyClassify.duration.start();
await this.#doClassificationHelper(urls);
const interestVector = await this.#doClassificationHelper(urls);
const sortedVector = Object.entries(interestVector).sort(
([, a], [, b]) => b - a // descending
);
lazy.log.info(`Classification results: ${JSON.stringify(sortedVector)}`);
Glean.relevancyClassify.duration.stopAndAccumulate(timerId);
Glean.relevancyClassify.succeed.record({
input_size: urls.length,
// TODO(nanj): Fill out the actual counters once the classification is enabled.
input_classified_size: 0,
input_inconclusive_size: 0,
output_interest_size: 0,
interest_top_1_hits: 0,
interest_top_2_hits: 0,
interest_top_3_hits: 0,
input_classified_size: sortedVector.reduce((acc, [, v]) => acc + v, 0),
input_inconclusive_size: interestVector.inconclusive,
output_interest_size: sortedVector.filter(([, v]) => v != 0).length,
interest_top_1_hits: sortedVector[0][1],
interest_top_2_hits: sortedVector[1][1],
interest_top_3_hits: sortedVector[2][1],
});
} catch (error) {
let reason;
@ -290,28 +295,48 @@ class RelevancyManager {
*
* @param {Array} urls
* An array of URLs.
* @returns {InterestVector}
* An interest vector.
* @throws {StoreNotAvailableError}
* Thrown when the store became unavailable (i.e. set to null elsewhere).
* @throws {RelevancyAPIError}
* Thrown for other API errors on the store.
*/
async #doClassificationHelper(urls) {
// The following logs are unnecessary, only used to suppress the linting error.
// TODO(nanj): delete me once the following TODO is done.
if (!this.#store) {
lazy.log.error("#store became null, aborting interest classification");
}
lazy.log.info("Classification input: " + urls);
// TODO(nanj): uncomment the following once `ingest()` is implemented.
// await this.#store.ingest(urls);
}
let interestVector = new lazy.InterestVector({
animals: 0,
arts: 0,
autos: 0,
business: 0,
career: 0,
education: 0,
fashion: 0,
finance: 0,
food: 0,
government: 0,
hobbies: 0,
home: 0,
news: 0,
realEstate: 0,
society: 0,
sports: 0,
tech: 0,
travel: 0,
inconclusive: 0,
});
/**
* Exposed for testing.
*/
async _test_doClassificationHelper(urls) {
await this.#doClassificationHelper(urls);
if (
lazy.NimbusFeatures.contentRelevancy.getVariable(
NIMBUS_VARIABLE_INGEST_ENABLED
) ??
false
) {
interestVector = await this.#store.ingest(urls);
}
return interestVector;
}
/**

View file

@ -41,6 +41,7 @@ add_task(async function test_NimbusIntegration_enable() {
maxInputUrls: 3,
// Set the timer interval to 0 will trigger the timer right away.
timerInterval: 0,
ingestEnabled: false,
},
});
@ -73,6 +74,7 @@ add_task(async function test_NimbusIntegration_disable() {
maxInputUrls: 3,
// Set the timer interval to 0 will trigger the timer right away.
timerInterval: 0,
ingestEnabled: false,
},
});

View file

@ -100,24 +100,6 @@ add_task(async function test_call_disable_twice() {
Services.prefs.clearUserPref(PREF_CONTENT_RELEVANCY_ENABLED);
});
/**
 * Test the classification helper with the store available and after the store
 * has been reset by disabling the feature pref.
 */
add_task(async function test_doClassificationHelper() {
Services.prefs.setBoolPref(PREF_CONTENT_RELEVANCY_ENABLED, true);
// Wait until the manager reports that its store is ready before classifying.
await TestUtils.waitForCondition(() => ContentRelevancyManager._isStoreReady);
await ContentRelevancyManager._test_doClassificationHelper([]);
// Disable it to reset the store.
Services.prefs.setBoolPref(PREF_CONTENT_RELEVANCY_ENABLED, false);
await TestUtils.waitForTick();
// With the feature disabled, the helper is expected to reject.
await Assert.rejects(
ContentRelevancyManager._test_doClassificationHelper([]),
/Store is not available/,
"Should throw with an unset store"
);
// Restore the pref so later tasks start from the default state.
Services.prefs.clearUserPref(PREF_CONTENT_RELEVANCY_ENABLED);
});
/**
* Sets up the update timer manager for testing: makes it fire more often,
* removes all existing timers, and initializes it for testing. The body of this

View file

@ -7,10 +7,6 @@ const { ContentRelevancyManager } = ChromeUtils.importESModule(
"resource://gre/modules/ContentRelevancyManager.sys.mjs"
);
const { TestUtils } = ChromeUtils.importESModule(
"resource://testing-common/TestUtils.sys.mjs"
);
const PREF_CONTENT_RELEVANCY_ENABLED = "toolkit.contentRelevancy.enabled";
add_setup(async function setup() {
@ -83,39 +79,3 @@ add_task(async function test_classify_fail_case1() {
"Should not record the duration"
);
});
/**
 * Test classification metrics - fail - store-not-ready.
 *
 * Disables the feature via its pref, runs a classification, and checks that a
 * `fail` event with reason "store-not-ready" is recorded while no duration is
 * recorded.
 */
add_task(async function test_classify_fail_case2() {
// Start from a clean telemetry state.
Services.fog.testResetFOG();
// Toggle the pref to disable the manager and nullify the store.
Services.prefs.setBoolPref(PREF_CONTENT_RELEVANCY_ENABLED, false);
await TestUtils.waitForTick();
await TestUtils.waitForCondition(
() => !ContentRelevancyManager.shouldEnable,
"Should be disabled via pref"
);
// Nothing should have been recorded before the classification runs.
Assert.equal(null, Glean.relevancyClassify.fail.testGetValue());
Assert.equal(null, Glean.relevancyClassify.duration.testGetValue());
await ContentRelevancyManager._test_doClassification();
Assert.deepEqual(
{
reason: "store-not-ready",
},
Glean.relevancyClassify.fail.testGetValue()[0].extra,
"Should record the fail event"
);
Assert.equal(
null,
Glean.relevancyClassify.duration.testGetValue(),
"Should not record the duration"
);
// Re-enable the feature for subsequent tasks.
Services.prefs.setBoolPref(PREF_CONTENT_RELEVANCY_ENABLED, true);
});

View file

@ -2701,6 +2701,10 @@ contentRelevancy:
setPref:
branch: user
pref: toolkit.contentRelevancy.timerInterval
ingestEnabled:
description: Enable the ingestion through the Rust component
type: boolean
fallbackPref: toolkit.contentRelevancy.ingestEnabled
tabPreview:
description: Prefs to control Tab Previews

View file

@ -322,7 +322,7 @@ export class RelevancyStore {
*/
static init(dbpath) {
const liftResult = (result) => FfiConverterTypeRelevancyStore.lift(result);
const liftError = (data) => FfiConverterTypeRelevancyApiError.lift(data);
const liftError = null;
const functionCall = () => {
try {
FfiConverterString.checkType(dbpath)
@ -368,7 +368,7 @@ export class RelevancyStore {
}
ingest(topUrls) {
const liftResult = (result) => undefined;
const liftResult = (result) => FfiConverterTypeInterestVector.lift(result);
const liftError = (data) => FfiConverterTypeRelevancyApiError.lift(data);
const functionCall = () => {
try {

View file

@ -915,7 +915,7 @@ export class FfiConverterTypeSuggestGlobalConfig extends FfiConverterArrayBuffer
}
export class SuggestIngestionConstraints {
constructor({ maxSuggestions = null, providers = null } = {}) {
constructor({ maxSuggestions = null, providers = null, emptyOnly = false } = {}) {
try {
FfiConverterOptionalu64.checkType(maxSuggestions)
} catch (e) {
@ -932,13 +932,23 @@ export class SuggestIngestionConstraints {
}
throw e;
}
try {
FfiConverterBool.checkType(emptyOnly)
} catch (e) {
if (e instanceof UniFFITypeError) {
e.addItemDescriptionPart("emptyOnly");
}
throw e;
}
this.maxSuggestions = maxSuggestions;
this.providers = providers;
this.emptyOnly = emptyOnly;
}
equals(other) {
return (
this.maxSuggestions == other.maxSuggestions &&
this.providers == other.providers
this.providers == other.providers &&
this.emptyOnly == other.emptyOnly
)
}
}
@ -949,17 +959,20 @@ export class FfiConverterTypeSuggestIngestionConstraints extends FfiConverterArr
return new SuggestIngestionConstraints({
maxSuggestions: FfiConverterOptionalu64.read(dataStream),
providers: FfiConverterOptionalSequenceTypeSuggestionProvider.read(dataStream),
emptyOnly: FfiConverterBool.read(dataStream),
});
}
static write(dataStream, value) {
FfiConverterOptionalu64.write(dataStream, value.maxSuggestions);
FfiConverterOptionalSequenceTypeSuggestionProvider.write(dataStream, value.providers);
FfiConverterBool.write(dataStream, value.emptyOnly);
}
static computeSize(value) {
let totalSize = 0;
totalSize += FfiConverterOptionalu64.computeSize(value.maxSuggestions);
totalSize += FfiConverterOptionalSequenceTypeSuggestionProvider.computeSize(value.providers);
totalSize += FfiConverterBool.computeSize(value.emptyOnly);
return totalSize
}
@ -984,6 +997,14 @@ export class FfiConverterTypeSuggestIngestionConstraints extends FfiConverterArr
}
throw e;
}
try {
FfiConverterBool.checkType(value.emptyOnly);
} catch (e) {
if (e instanceof UniFFITypeError) {
e.addItemDescriptionPart(".emptyOnly");
}
throw e;
}
}
}

View file

@ -29,7 +29,7 @@ extern "C" {
void * uniffi_relevancy_fn_constructor_relevancystore_new(RustBuffer, RustCallStatus*);
RustBuffer uniffi_relevancy_fn_method_relevancystore_calculate_metrics(void *, RustCallStatus*);
void uniffi_relevancy_fn_method_relevancystore_close(void *, RustCallStatus*);
void uniffi_relevancy_fn_method_relevancystore_ingest(void *, RustBuffer, RustCallStatus*);
RustBuffer uniffi_relevancy_fn_method_relevancystore_ingest(void *, RustBuffer, RustCallStatus*);
void uniffi_relevancy_fn_method_relevancystore_interrupt(void *, RustCallStatus*);
RustBuffer uniffi_relevancy_fn_method_relevancystore_user_interest_vector(void *, RustCallStatus*);
void * uniffi_remote_settings_fn_clone_remotesettings(void *, RustCallStatus*);
@ -144,7 +144,7 @@ Maybe<already_AddRefed<Promise>> UniFFICallAsync(const GlobalObject& aGlobal, ui
return Some(CallHandler::CallAsync(uniffi_relevancy_fn_method_relevancystore_close, aGlobal, aArgs, "uniffi_relevancy_fn_method_relevancystore_close: "_ns, aError));
}
case 4: { // relevancy:uniffi_relevancy_fn_method_relevancystore_ingest
using CallHandler = ScaffoldingCallHandler<ScaffoldingConverter<void>, ScaffoldingObjectConverter<&kRelevancyRelevancyStorePointerType>, ScaffoldingConverter<RustBuffer>>;
using CallHandler = ScaffoldingCallHandler<ScaffoldingConverter<RustBuffer>, ScaffoldingObjectConverter<&kRelevancyRelevancyStorePointerType>, ScaffoldingConverter<RustBuffer>>;
return Some(CallHandler::CallAsync(uniffi_relevancy_fn_method_relevancystore_ingest, aGlobal, aArgs, "uniffi_relevancy_fn_method_relevancystore_ingest: "_ns, aError));
}
case 5: { // relevancy:uniffi_relevancy_fn_method_relevancystore_interrupt
@ -354,7 +354,7 @@ bool UniFFICallSync(const GlobalObject& aGlobal, uint64_t aId, const Sequence<Un
return true;
}
case 4: { // relevancy:uniffi_relevancy_fn_method_relevancystore_ingest
using CallHandler = ScaffoldingCallHandler<ScaffoldingConverter<void>, ScaffoldingObjectConverter<&kRelevancyRelevancyStorePointerType>, ScaffoldingConverter<RustBuffer>>;
using CallHandler = ScaffoldingCallHandler<ScaffoldingConverter<RustBuffer>, ScaffoldingObjectConverter<&kRelevancyRelevancyStorePointerType>, ScaffoldingConverter<RustBuffer>>;
CallHandler::CallSync(uniffi_relevancy_fn_method_relevancystore_ingest, aGlobal, aArgs, aReturnValue, "uniffi_relevancy_fn_method_relevancystore_ingest: "_ns, aError);
return true;
}