fs: lockless mntns rbtree lookup
Currently we use a read-write lock, but for the simple search case we can make this lockless. Creating a new mount namespace is a rather rare event compared with querying mounts in a foreign mount namespace. Once this is picked up by, e.g., systemd to list mounts in another mount namespace for its isolated services or in containers, this will be used a lot, so it seems worthwhile doing.

Link: https://lore.kernel.org/r/20241213-work-mount-rbtree-lockless-v3-3-6e3cdaf9b280@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
This commit is contained in:
parent 144acef333
commit 5dcbd85d35

2 changed files with 74 additions and 47 deletions
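The retry logic at the core of this change can be sketched in isolation. What follows is a minimal userspace C sketch of the same pattern, not kernel code: the names (tree_seq, slots, find_slot, lookup, insert) are invented for illustration, a flat array stands in for the rbtree, and seq_cst atomics are used throughout for simplicity where the kernel's seqcount_rwlock_t uses finer-grained barriers. The point it demonstrates: the lockless search can only return a false negative, never a false positive, so a hit is trusted immediately and only a miss is retried when the sequence counter shows a writer ran concurrently.

/*
 * Minimal userspace sketch of the seqcount retry pattern this commit
 * introduces for mnt_ns_tree. All names here are hypothetical; a flat
 * array stands in for the rbtree. seq_cst atomics keep the sketch
 * simple; the kernel's seqcount_rwlock_t uses finer-grained barriers.
 *
 * Build: cc -std=c11 -pthread sketch.c -o sketch
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NSLOTS 8

static pthread_mutex_t write_lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_uint tree_seq;                /* even: stable, odd: write in flight */
static _Atomic unsigned long slots[NSLOTS]; /* 0 means empty */

static unsigned int read_seq_begin(void)
{
        unsigned int seq;

        /* Spin until no writer is mid-update (counter is even). */
        while ((seq = atomic_load(&tree_seq)) & 1)
                ;
        return seq;
}

static bool read_seq_retry(unsigned int seq)
{
        return atomic_load(&tree_seq) != seq;
}

/*
 * Lockless search: it may miss an entry a concurrent writer is moving
 * (false negative) but can never report an id that was never inserted
 * (false positive).
 */
static bool find_slot(unsigned long id)
{
        for (int i = 0; i < NSLOTS; i++)
                if (atomic_load(&slots[i]) == id)
                        return true;
        return false;
}

static bool lookup(unsigned long id)
{
        unsigned int seq;
        bool found;

        /* Same shape as lookup_mnt_ns(): trust a hit immediately,
         * retry a miss only if the counter moved underneath us. */
        do {
                seq = read_seq_begin();
                found = find_slot(id);
                if (found)
                        break;
        } while (read_seq_retry(seq));

        return found;
}

static void insert(unsigned long id, int slot)
{
        pthread_mutex_lock(&write_lock);
        atomic_fetch_add(&tree_seq, 1); /* begin write: counter goes odd */
        atomic_store(&slots[slot], id);
        atomic_fetch_add(&tree_seq, 1); /* end write: counter goes even */
        pthread_mutex_unlock(&write_lock);
}

int main(void)
{
        insert(42, 3);
        printf("lookup(42): %d\n", lookup(42)); /* 1: found */
        printf("lookup(7):  %d\n", lookup(7));  /* 0: definitive miss */
        return 0;
}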
fs/mount.h (5 changed lines)

@@ -12,7 +12,10 @@ struct mnt_namespace {
 	struct user_namespace *user_ns;
 	struct ucounts *ucounts;
 	u64 seq; /* Sequence number to prevent loops */
-	wait_queue_head_t poll;
+	union {
+		wait_queue_head_t poll;
+		struct rcu_head mnt_ns_rcu;
+	};
 	u64 event;
 	unsigned int nr_mounts; /* # of mounts in the namespace */
 	unsigned int pending_mounts;
fs/namespace.c (116 changed lines)
@@ -79,6 +79,8 @@ static DECLARE_RWSEM(namespace_sem);
 static HLIST_HEAD(unmounted);	/* protected by namespace_sem */
 static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
 static DEFINE_RWLOCK(mnt_ns_tree_lock);
+static seqcount_rwlock_t mnt_ns_tree_seqcount = SEQCNT_RWLOCK_ZERO(mnt_ns_tree_seqcount, &mnt_ns_tree_lock);
+
 static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */

 struct mount_kattr {
@@ -105,17 +107,6 @@ EXPORT_SYMBOL_GPL(fs_kobj);
  */
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);

-static int mnt_ns_cmp(u64 seq, const struct mnt_namespace *ns)
-{
-	u64 seq_b = ns->seq;
-
-	if (seq < seq_b)
-		return -1;
-	if (seq > seq_b)
-		return 1;
-	return 0;
-}
-
 static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
 {
 	if (!node)
@@ -123,19 +114,41 @@ static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
 	return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node);
 }

-static bool mnt_ns_less(struct rb_node *a, const struct rb_node *b)
+static int mnt_ns_cmp(struct rb_node *a, const struct rb_node *b)
 {
 	struct mnt_namespace *ns_a = node_to_mnt_ns(a);
 	struct mnt_namespace *ns_b = node_to_mnt_ns(b);
 	u64 seq_a = ns_a->seq;
+	u64 seq_b = ns_b->seq;

-	return mnt_ns_cmp(seq_a, ns_b) < 0;
+	if (seq_a < seq_b)
+		return -1;
+	if (seq_a > seq_b)
+		return 1;
+	return 0;
+}
+
+static inline void mnt_ns_tree_write_lock(void)
+{
+	write_lock(&mnt_ns_tree_lock);
+	write_seqcount_begin(&mnt_ns_tree_seqcount);
+}
+
+static inline void mnt_ns_tree_write_unlock(void)
+{
+	write_seqcount_end(&mnt_ns_tree_seqcount);
+	write_unlock(&mnt_ns_tree_lock);
 }

 static void mnt_ns_tree_add(struct mnt_namespace *ns)
 {
-	guard(write_lock)(&mnt_ns_tree_lock);
-	rb_add(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_less);
+	struct rb_node *node;
+
+	mnt_ns_tree_write_lock();
+	node = rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp);
+	mnt_ns_tree_write_unlock();
+
+	WARN_ON_ONCE(node);
 }

 static void mnt_ns_release(struct mnt_namespace *ns)
@@ -150,41 +163,33 @@ static void mnt_ns_release(struct mnt_namespace *ns)
 }
 DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))

+static void mnt_ns_release_rcu(struct rcu_head *rcu)
+{
+	mnt_ns_release(container_of(rcu, struct mnt_namespace, mnt_ns_rcu));
+}
+
 static void mnt_ns_tree_remove(struct mnt_namespace *ns)
 {
 	/* remove from global mount namespace list */
 	if (!is_anon_ns(ns)) {
-		guard(write_lock)(&mnt_ns_tree_lock);
+		mnt_ns_tree_write_lock();
 		rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
+		mnt_ns_tree_write_unlock();
 	}

-	mnt_ns_release(ns);
+	call_rcu(&ns->mnt_ns_rcu, mnt_ns_release_rcu);
 }

-/*
- * Returns the mount namespace which either has the specified id, or has the
- * next smallest id after the specified one.
- */
-static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
+static int mnt_ns_find(const void *key, const struct rb_node *node)
 {
-	struct rb_node *node = mnt_ns_tree.rb_node;
-	struct mnt_namespace *ret = NULL;
-
-	lockdep_assert_held(&mnt_ns_tree_lock);
-
-	while (node) {
-		struct mnt_namespace *n = node_to_mnt_ns(node);
-
-		if (mnt_ns_id <= n->seq) {
-			ret = node_to_mnt_ns(node);
-			if (mnt_ns_id == n->seq)
-				break;
-			node = node->rb_left;
-		} else {
-			node = node->rb_right;
-		}
-	}
-	return ret;
+	const u64 mnt_ns_id = *(u64 *)key;
+	const struct mnt_namespace *ns = node_to_mnt_ns(node);
+
+	if (mnt_ns_id < ns->seq)
+		return -1;
+	if (mnt_ns_id > ns->seq)
+		return 1;
+	return 0;
 }

 /*
@@ -194,18 +199,37 @@
  * namespace the @namespace_sem must first be acquired. If the namespace has
  * already shut down before acquiring @namespace_sem, {list,stat}mount() will
  * see that the mount rbtree of the namespace is empty.
+ *
+ * Note the lookup is lockless protected by a sequence counter. We only
+ * need to guard against false negatives as false positives aren't
+ * possible. So if we didn't find a mount namespace and the sequence
+ * counter has changed we need to retry. If the sequence counter is
+ * still the same we know the search actually failed.
  */
 static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
 {
 	struct mnt_namespace *ns;
+	struct rb_node *node;
+	unsigned int seq;

-	guard(read_lock)(&mnt_ns_tree_lock);
-	ns = mnt_ns_find_id_at(mnt_ns_id);
-	if (!ns || ns->seq != mnt_ns_id)
-		return NULL;
+	guard(rcu)();
+	do {
+		seq = read_seqcount_begin(&mnt_ns_tree_seqcount);
+		node = rb_find_rcu(&mnt_ns_id, &mnt_ns_tree, mnt_ns_find);
+		if (node)
+			break;
+	} while (read_seqcount_retry(&mnt_ns_tree_seqcount, seq));

-	refcount_inc(&ns->passive);
-	return ns;
+	if (!node)
+		return NULL;
+
+	/*
+	 * The last reference count is put with RCU delay so we can
+	 * unconditionally acquire a reference here.
+	 */
+	ns = node_to_mnt_ns(node);
+	refcount_inc(&ns->passive);
+	return ns;
 }

 static inline void lock_mount_hash(void)
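Taken together, the commit pairs three mechanisms: writers still serialize on mnt_ns_tree_lock but now also bump mnt_ns_tree_seqcount across every rbtree update; readers walk the tree under RCU with rb_find_rcu() and, per the comment above lookup_mnt_ns(), retry only a failed search, since a found node is always a true hit; and because the final mnt_ns_release() is deferred through call_rcu(), a reader that finds a namespace inside its RCU read-side critical section can take the passive reference unconditionally, knowing the object cannot be freed out from under it.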