forked from mirrors/gecko-dev
		
	
		
			
				
	
	
		
			424 lines
		
	
	
	
		
			12 KiB
		
	
	
	
		
			Rust
		
	
	
	
	
	
			
		
		
	
	
			424 lines
		
	
	
	
		
			12 KiB
		
	
	
	
		
			Rust
		
	
	
	
	
	
/* This Source Code Form is subject to the terms of the Mozilla Public
 | 
						|
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 | 
						|
 * file, You can obtain one at https://mozilla.org/MPL/2.0/. */
 | 
						|
 | 
						|
//! Counting and non-counting Bloom filters tuned for use as ancestor filters
 | 
						|
//! for selector matching.
 | 
						|
 | 
						|
use std::fmt::{self, Debug};
 | 
						|
 | 
						|
// The top 8 bits of the 32-bit hash value are not used by the bloom filter.
 | 
						|
// Consumers may rely on this to pack hashes more efficiently.
 | 
						|
pub const BLOOM_HASH_MASK: u32 = 0x00ffffff;
 | 
						|
const KEY_SIZE: usize = 12;
 | 
						|
 | 
						|
const ARRAY_SIZE: usize = 1 << KEY_SIZE;
 | 
						|
const KEY_MASK: u32 = (1 << KEY_SIZE) - 1;
 | 
						|
 | 
						|
/// A counting Bloom filter with 8-bit counters.
 | 
						|
pub type BloomFilter = CountingBloomFilter<BloomStorageU8>;
 | 
						|
 | 
						|
/// A counting Bloom filter with parameterized storage to handle
 | 
						|
/// counters of different sizes.  For now we assume that having two hash
 | 
						|
/// functions is enough, but we may revisit that decision later.
 | 
						|
///
 | 
						|
/// The filter uses an array with 2**KeySize entries.
 | 
						|
///
 | 
						|
/// Assuming a well-distributed hash function, a Bloom filter with
 | 
						|
/// array size M containing N elements and
 | 
						|
/// using k hash function has expected false positive rate exactly
 | 
						|
///
 | 
						|
/// $  (1 - (1 - 1/M)^{kN})^k  $
 | 
						|
///
 | 
						|
/// because each array slot has a
 | 
						|
///
 | 
						|
/// $  (1 - 1/M)^{kN}  $
 | 
						|
///
 | 
						|
/// chance of being 0, and the expected false positive rate is the
 | 
						|
/// probability that all of the k hash functions will hit a nonzero
 | 
						|
/// slot.
 | 
						|
///
 | 
						|
/// For reasonable assumptions (M large, kN large, which should both
 | 
						|
/// hold if we're worried about false positives) about M and kN this
 | 
						|
/// becomes approximately
 | 
						|
///
 | 
						|
/// $$  (1 - \exp(-kN/M))^k   $$
 | 
						|
///
 | 
						|
/// For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$,
 | 
						|
/// or in other words
 | 
						|
///
 | 
						|
/// $$    N/M = -0.5 * \ln(1 - \sqrt(r))   $$
 | 
						|
///
 | 
						|
/// where r is the false positive rate.  This can be used to compute
 | 
						|
/// the desired KeySize for a given load N and false positive rate r.
 | 
						|
///
 | 
						|
/// If N/M is assumed small, then the false positive rate can
 | 
						|
/// further be approximated as 4*N^2/M^2.  So increasing KeySize by
 | 
						|
/// 1, which doubles M, reduces the false positive rate by about a
 | 
						|
/// factor of 4, and a false positive rate of 1% corresponds to
 | 
						|
/// about M/N == 20.
 | 
						|
///
 | 
						|
/// What this means in practice is that for a few hundred keys using a
 | 
						|
/// KeySize of 12 gives false positive rates on the order of 0.25-4%.
 | 
						|
///
 | 
						|
/// Similarly, using a KeySize of 10 would lead to a 4% false
 | 
						|
/// positive rate for N == 100 and to quite bad false positive
 | 
						|
/// rates for larger N.
 | 
						|
#[derive(Clone)]
 | 
						|
pub struct CountingBloomFilter<S>
 | 
						|
where
 | 
						|
    S: BloomStorage,
 | 
						|
{
 | 
						|
    storage: S,
 | 
						|
}
 | 
						|
 | 
						|
impl<S> CountingBloomFilter<S>
 | 
						|
where
 | 
						|
    S: BloomStorage,
 | 
						|
{
 | 
						|
    /// Creates a new bloom filter.
 | 
						|
    #[inline]
 | 
						|
    pub fn new() -> Self {
 | 
						|
        CountingBloomFilter {
 | 
						|
            storage: Default::default(),
 | 
						|
        }
 | 
						|
    }
 | 
						|
 | 
						|
    #[inline]
 | 
						|
    pub fn clear(&mut self) {
 | 
						|
        self.storage = Default::default();
 | 
						|
    }
 | 
						|
 | 
						|
    // Slow linear accessor to make sure the bloom filter is zeroed. This should
 | 
						|
    // never be used in release builds.
 | 
						|
    #[cfg(debug_assertions)]
 | 
						|
    pub fn is_zeroed(&self) -> bool {
 | 
						|
        self.storage.is_zeroed()
 | 
						|
    }
 | 
						|
 | 
						|
    #[cfg(not(debug_assertions))]
 | 
						|
    pub fn is_zeroed(&self) -> bool {
 | 
						|
        unreachable!()
 | 
						|
    }
 | 
						|
 | 
						|
    /// Inserts an item with a particular hash into the bloom filter.
 | 
						|
    #[inline]
 | 
						|
    pub fn insert_hash(&mut self, hash: u32) {
 | 
						|
        self.storage.adjust_first_slot(hash, true);
 | 
						|
        self.storage.adjust_second_slot(hash, true);
 | 
						|
    }
 | 
						|
 | 
						|
    /// Removes an item with a particular hash from the bloom filter.
 | 
						|
    #[inline]
 | 
						|
    pub fn remove_hash(&mut self, hash: u32) {
 | 
						|
        self.storage.adjust_first_slot(hash, false);
 | 
						|
        self.storage.adjust_second_slot(hash, false);
 | 
						|
    }
 | 
						|
 | 
						|
    /// Check whether the filter might contain an item with the given hash.
 | 
						|
    /// This can sometimes return true even if the item is not in the filter,
 | 
						|
    /// but will never return false for items that are actually in the filter.
 | 
						|
    #[inline]
 | 
						|
    pub fn might_contain_hash(&self, hash: u32) -> bool {
 | 
						|
        !self.storage.first_slot_is_empty(hash) && !self.storage.second_slot_is_empty(hash)
 | 
						|
    }
 | 
						|
}
 | 
						|
 | 
						|
impl<S> Debug for CountingBloomFilter<S>
 | 
						|
where
 | 
						|
    S: BloomStorage,
 | 
						|
{
 | 
						|
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
 | 
						|
        let mut slots_used = 0;
 | 
						|
        for i in 0..ARRAY_SIZE {
 | 
						|
            if !self.storage.slot_is_empty(i) {
 | 
						|
                slots_used += 1;
 | 
						|
            }
 | 
						|
        }
 | 
						|
        write!(f, "BloomFilter({}/{})", slots_used, ARRAY_SIZE)
 | 
						|
    }
 | 
						|
}
 | 
						|
 | 
						|
pub trait BloomStorage: Clone + Default {
 | 
						|
    fn slot_is_empty(&self, index: usize) -> bool;
 | 
						|
    fn adjust_slot(&mut self, index: usize, increment: bool);
 | 
						|
    fn is_zeroed(&self) -> bool;
 | 
						|
 | 
						|
    #[inline]
 | 
						|
    fn first_slot_is_empty(&self, hash: u32) -> bool {
 | 
						|
        self.slot_is_empty(Self::first_slot_index(hash))
 | 
						|
    }
 | 
						|
 | 
						|
    #[inline]
 | 
						|
    fn second_slot_is_empty(&self, hash: u32) -> bool {
 | 
						|
        self.slot_is_empty(Self::second_slot_index(hash))
 | 
						|
    }
 | 
						|
 | 
						|
    #[inline]
 | 
						|
    fn adjust_first_slot(&mut self, hash: u32, increment: bool) {
 | 
						|
        self.adjust_slot(Self::first_slot_index(hash), increment)
 | 
						|
    }
 | 
						|
 | 
						|
    #[inline]
 | 
						|
    fn adjust_second_slot(&mut self, hash: u32, increment: bool) {
 | 
						|
        self.adjust_slot(Self::second_slot_index(hash), increment)
 | 
						|
    }
 | 
						|
 | 
						|
    #[inline]
 | 
						|
    fn first_slot_index(hash: u32) -> usize {
 | 
						|
        hash1(hash) as usize
 | 
						|
    }
 | 
						|
 | 
						|
    #[inline]
 | 
						|
    fn second_slot_index(hash: u32) -> usize {
 | 
						|
        hash2(hash) as usize
 | 
						|
    }
 | 
						|
}
 | 
						|
 | 
						|
/// Storage class for a CountingBloomFilter that has 8-bit counters.
 | 
						|
pub struct BloomStorageU8 {
 | 
						|
    counters: [u8; ARRAY_SIZE],
 | 
						|
}
 | 
						|
 | 
						|
impl BloomStorage for BloomStorageU8 {
 | 
						|
    #[inline]
 | 
						|
    fn adjust_slot(&mut self, index: usize, increment: bool) {
 | 
						|
        let slot = &mut self.counters[index];
 | 
						|
        if *slot != 0xff {
 | 
						|
            // full
 | 
						|
            if increment {
 | 
						|
                *slot += 1;
 | 
						|
            } else {
 | 
						|
                *slot -= 1;
 | 
						|
            }
 | 
						|
        }
 | 
						|
    }
 | 
						|
 | 
						|
    #[inline]
 | 
						|
    fn slot_is_empty(&self, index: usize) -> bool {
 | 
						|
        self.counters[index] == 0
 | 
						|
    }
 | 
						|
 | 
						|
    #[inline]
 | 
						|
    fn is_zeroed(&self) -> bool {
 | 
						|
        self.counters.iter().all(|x| *x == 0)
 | 
						|
    }
 | 
						|
}
 | 
						|
 | 
						|
impl Default for BloomStorageU8 {
 | 
						|
    fn default() -> Self {
 | 
						|
        BloomStorageU8 {
 | 
						|
            counters: [0; ARRAY_SIZE],
 | 
						|
        }
 | 
						|
    }
 | 
						|
}
 | 
						|
 | 
						|
impl Clone for BloomStorageU8 {
 | 
						|
    fn clone(&self) -> Self {
 | 
						|
        BloomStorageU8 {
 | 
						|
            counters: self.counters,
 | 
						|
        }
 | 
						|
    }
 | 
						|
}
 | 
						|
 | 
						|
/// Storage class for a CountingBloomFilter that has 1-bit counters.
 | 
						|
pub struct BloomStorageBool {
 | 
						|
    counters: [u8; ARRAY_SIZE / 8],
 | 
						|
}
 | 
						|
 | 
						|
impl BloomStorage for BloomStorageBool {
 | 
						|
    #[inline]
 | 
						|
    fn adjust_slot(&mut self, index: usize, increment: bool) {
 | 
						|
        let bit = 1 << (index % 8);
 | 
						|
        let byte = &mut self.counters[index / 8];
 | 
						|
 | 
						|
        // Since we have only one bit for storage, decrementing it
 | 
						|
        // should never do anything.  Assert against an accidental
 | 
						|
        // decrementing of a bit that was never set.
 | 
						|
        assert!(
 | 
						|
            increment || (*byte & bit) != 0,
 | 
						|
            "should not decrement if slot is already false"
 | 
						|
        );
 | 
						|
 | 
						|
        if increment {
 | 
						|
            *byte |= bit;
 | 
						|
        }
 | 
						|
    }
 | 
						|
 | 
						|
    #[inline]
 | 
						|
    fn slot_is_empty(&self, index: usize) -> bool {
 | 
						|
        let bit = 1 << (index % 8);
 | 
						|
        (self.counters[index / 8] & bit) == 0
 | 
						|
    }
 | 
						|
 | 
						|
    #[inline]
 | 
						|
    fn is_zeroed(&self) -> bool {
 | 
						|
        self.counters.iter().all(|x| *x == 0)
 | 
						|
    }
 | 
						|
}
 | 
						|
 | 
						|
impl Default for BloomStorageBool {
 | 
						|
    fn default() -> Self {
 | 
						|
        BloomStorageBool {
 | 
						|
            counters: [0; ARRAY_SIZE / 8],
 | 
						|
        }
 | 
						|
    }
 | 
						|
}
 | 
						|
 | 
						|
impl Clone for BloomStorageBool {
 | 
						|
    fn clone(&self) -> Self {
 | 
						|
        BloomStorageBool {
 | 
						|
            counters: self.counters,
 | 
						|
        }
 | 
						|
    }
 | 
						|
}
 | 
						|
 | 
						|
#[inline]
 | 
						|
fn hash1(hash: u32) -> u32 {
 | 
						|
    hash & KEY_MASK
 | 
						|
}
 | 
						|
 | 
						|
#[inline]
 | 
						|
fn hash2(hash: u32) -> u32 {
 | 
						|
    (hash >> KEY_SIZE) & KEY_MASK
 | 
						|
}
 | 
						|
 | 
						|
#[test]
 | 
						|
fn create_and_insert_some_stuff() {
 | 
						|
    use fxhash::FxHasher;
 | 
						|
    use std::hash::{Hash, Hasher};
 | 
						|
    use std::mem::transmute;
 | 
						|
 | 
						|
    fn hash_as_str(i: usize) -> u32 {
 | 
						|
        let mut hasher = FxHasher::default();
 | 
						|
        let s = i.to_string();
 | 
						|
        s.hash(&mut hasher);
 | 
						|
        let hash: u64 = hasher.finish();
 | 
						|
        (hash >> 32) as u32 ^ (hash as u32)
 | 
						|
    }
 | 
						|
 | 
						|
    let mut bf = BloomFilter::new();
 | 
						|
 | 
						|
    // Statically assert that ARRAY_SIZE is a multiple of 8, which
 | 
						|
    // BloomStorageBool relies on.
 | 
						|
    unsafe {
 | 
						|
        transmute::<[u8; ARRAY_SIZE % 8], [u8; 0]>([]);
 | 
						|
    }
 | 
						|
 | 
						|
    for i in 0_usize..1000 {
 | 
						|
        bf.insert_hash(hash_as_str(i));
 | 
						|
    }
 | 
						|
 | 
						|
    for i in 0_usize..1000 {
 | 
						|
        assert!(bf.might_contain_hash(hash_as_str(i)));
 | 
						|
    }
 | 
						|
 | 
						|
    let false_positives = (1001_usize..2000)
 | 
						|
        .filter(|i| bf.might_contain_hash(hash_as_str(*i)))
 | 
						|
        .count();
 | 
						|
 | 
						|
    assert!(false_positives < 190, "{} is not < 190", false_positives); // 19%.
 | 
						|
 | 
						|
    for i in 0_usize..100 {
 | 
						|
        bf.remove_hash(hash_as_str(i));
 | 
						|
    }
 | 
						|
 | 
						|
    for i in 100_usize..1000 {
 | 
						|
        assert!(bf.might_contain_hash(hash_as_str(i)));
 | 
						|
    }
 | 
						|
 | 
						|
    let false_positives = (0_usize..100)
 | 
						|
        .filter(|i| bf.might_contain_hash(hash_as_str(*i)))
 | 
						|
        .count();
 | 
						|
 | 
						|
    assert!(false_positives < 20, "{} is not < 20", false_positives); // 20%.
 | 
						|
 | 
						|
    bf.clear();
 | 
						|
 | 
						|
    for i in 0_usize..2000 {
 | 
						|
        assert!(!bf.might_contain_hash(hash_as_str(i)));
 | 
						|
    }
 | 
						|
}
 | 
						|
 | 
						|
#[cfg(feature = "bench")]
 | 
						|
#[cfg(test)]
 | 
						|
mod bench {
 | 
						|
    extern crate test;
 | 
						|
    use super::BloomFilter;
 | 
						|
 | 
						|
    #[derive(Default)]
 | 
						|
    struct HashGenerator(u32);
 | 
						|
 | 
						|
    impl HashGenerator {
 | 
						|
        fn next(&mut self) -> u32 {
 | 
						|
            // Each hash is split into two twelve-bit segments, which are used
 | 
						|
            // as an index into an array. We increment each by 64 so that we
 | 
						|
            // hit the next cache line, and then another 1 so that our wrapping
 | 
						|
            // behavior leads us to different entries each time.
 | 
						|
            //
 | 
						|
            // Trying to simulate cold caches is rather difficult with the cargo
 | 
						|
            // benchmarking setup, so it may all be moot depending on the number
 | 
						|
            // of iterations that end up being run. But we might as well.
 | 
						|
            self.0 += (65) + (65 << super::KEY_SIZE);
 | 
						|
            self.0
 | 
						|
        }
 | 
						|
    }
 | 
						|
 | 
						|
    #[bench]
 | 
						|
    fn create_insert_1000_remove_100_lookup_100(b: &mut test::Bencher) {
 | 
						|
        b.iter(|| {
 | 
						|
            let mut gen1 = HashGenerator::default();
 | 
						|
            let mut gen2 = HashGenerator::default();
 | 
						|
            let mut bf = BloomFilter::new();
 | 
						|
            for _ in 0_usize..1000 {
 | 
						|
                bf.insert_hash(gen1.next());
 | 
						|
            }
 | 
						|
            for _ in 0_usize..100 {
 | 
						|
                bf.remove_hash(gen2.next());
 | 
						|
            }
 | 
						|
            for _ in 100_usize..200 {
 | 
						|
                test::black_box(bf.might_contain_hash(gen2.next()));
 | 
						|
            }
 | 
						|
        });
 | 
						|
    }
 | 
						|
 | 
						|
    #[bench]
 | 
						|
    fn might_contain_10(b: &mut test::Bencher) {
 | 
						|
        let bf = BloomFilter::new();
 | 
						|
        let mut gen = HashGenerator::default();
 | 
						|
        b.iter(|| {
 | 
						|
            for _ in 0..10 {
 | 
						|
                test::black_box(bf.might_contain_hash(gen.next()));
 | 
						|
            }
 | 
						|
        });
 | 
						|
    }
 | 
						|
 | 
						|
    #[bench]
 | 
						|
    fn clear(b: &mut test::Bencher) {
 | 
						|
        let mut bf = Box::new(BloomFilter::new());
 | 
						|
        b.iter(|| test::black_box(&mut bf).clear());
 | 
						|
    }
 | 
						|
 | 
						|
    #[bench]
 | 
						|
    fn insert_10(b: &mut test::Bencher) {
 | 
						|
        let mut bf = BloomFilter::new();
 | 
						|
        let mut gen = HashGenerator::default();
 | 
						|
        b.iter(|| {
 | 
						|
            for _ in 0..10 {
 | 
						|
                test::black_box(bf.insert_hash(gen.next()));
 | 
						|
            }
 | 
						|
        });
 | 
						|
    }
 | 
						|
 | 
						|
    #[bench]
 | 
						|
    fn remove_10(b: &mut test::Bencher) {
 | 
						|
        let mut bf = BloomFilter::new();
 | 
						|
        let mut gen = HashGenerator::default();
 | 
						|
        // Note: this will underflow, and that's ok.
 | 
						|
        b.iter(|| {
 | 
						|
            for _ in 0..10 {
 | 
						|
                bf.remove_hash(gen.next())
 | 
						|
            }
 | 
						|
        });
 | 
						|
    }
 | 
						|
}
 |