	futex: Use RCU-based per-CPU reference counting instead of rcuref_t
The use of rcuref_t for reference counting introduces a performance
bottleneck when accessed concurrently by multiple threads during futex
operations.

Replace rcuref_t with specially crafted per-CPU reference counters. The
lifetime logic remains the same.

The newly allocated private hash starts in FR_PERCPU state. In this state,
each futex operation that requires the private hash uses a per-CPU counter
(an unsigned int) for incrementing or decrementing the reference count.

When the private hash is about to be replaced, the per-CPU counters are
migrated to an atomic_long_t counter, mm_struct::futex_atomic. The migration
process:

- Wait for one RCU grace period to ensure all users observe the current
  private hash. This can be skipped if a grace period has already elapsed
  since the private hash was assigned.

- futex_private_hash::state is set to FR_ATOMIC, forcing all users to use
  mm_struct::futex_atomic for reference counting.

- After an RCU grace period, all users are guaranteed to be using the atomic
  counter. The per-CPU counters can now be summed up and added to the atomic
  counter. If the resulting count is zero, the hash can be safely replaced.
  Otherwise, active users still hold a valid reference.

- Once the atomic reference count drops to zero, the next futex operation
  will switch to the new private hash.

call_rcu_hurry() is used to speed up the transition, which would otherwise be
delayed with RCU_LAZY. There is nothing wrong with using call_rcu(); the side
effects would be that on auto scaling the new hash is used later and the
SET_SLOTS prctl() blocks longer.

[bigeasy: commit description + mm get/put_async]

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20250710110011.384614-3-bigeasy@linutronix.de
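To make the counting scheme concrete, here is a minimal userspace sketch of the dual-mode counter (not kernel code and not part of this patch): C11 atomics stand in for atomic_long_t, a plain array stands in for the per-CPU counters, and the RCU grace periods between the steps are elided. The names fph_ref, ref_get, ref_put and ref_migrate are made up for illustration; the real implementations are futex_ref_get(), futex_ref_put() and __futex_ref_atomic_end() in the diff below.

#include <limits.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 4
enum { FR_PERCPU, FR_ATOMIC };

struct fph_ref {
	int state;			/* FR_PERCPU or FR_ATOMIC */
	unsigned int percpu[NR_CPUS];	/* stand-in for the per-CPU counters */
	atomic_long atomic;		/* stand-in for mm_struct::futex_atomic */
};

static bool ref_get(struct fph_ref *r, int cpu)
{
	if (r->state == FR_PERCPU) {
		r->percpu[cpu]++;	/* no shared cacheline touched */
		return true;
	}
	/* atomic mode: increment only if not already zero (inc_not_zero) */
	long v = atomic_load(&r->atomic);
	while (v != 0)
		if (atomic_compare_exchange_weak(&r->atomic, &v, v + 1))
			return true;
	return false;
}

static bool ref_put(struct fph_ref *r, int cpu)
{
	if (r->state == FR_PERCPU) {
		r->percpu[cpu]--;
		return false;		/* never reports "last put" in this mode */
	}
	return atomic_fetch_sub(&r->atomic, 1) == 1;	/* dec_and_test */
}

/*
 * Migration: bias the atomic so it cannot reach zero, switch the mode, then
 * (after what would be an RCU grace period) fold the per-CPU sum in and
 * subtract the bias plus the initial reference.
 */
static long ref_migrate(struct fph_ref *r)
{
	atomic_store(&r->atomic, LONG_MAX);	/* bias */
	r->state = FR_ATOMIC;			/* kernel: smp_store_release() */
	/* ... RCU grace period: every user now sees FR_ATOMIC ... */
	long sum = 0;
	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		sum += r->percpu[cpu];
		r->percpu[cpu] = 0;
	}
	long delta = sum - LONG_MAX - 1;	/* actual count - bias - initial ref */
	return atomic_fetch_add(&r->atomic, delta) + delta;
}

int main(void)
{
	struct fph_ref r = { .state = FR_PERCPU };

	r.percpu[0] = 1;		/* initial reference, as in futex_mm_init() */
	ref_get(&r, 1);			/* one futex operation holds a reference */
	long left = ref_migrate(&r);	/* a replacement hash was requested */
	printf("references left after migration: %ld\n", left);	/* 1 */
	if (ref_put(&r, 1))		/* the last put now reports "dead" */
		printf("hash can be replaced\n");
	return 0;
}

The bias (LONG_MAX) guarantees the atomic counter cannot read zero before the per-CPU sum has been folded in, which is why a zero reading can safely be interpreted as "no users left".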
This commit is contained in:

	parent	a255b78d14
	commit	56180dd20c

6 changed files with 243 additions and 35 deletions
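For reference, the SET_SLOTS prctl() mentioned in the commit message is driven from userspace roughly as follows. This is a hedged sketch: PR_FUTEX_HASH and PR_FUTEX_HASH_SET_SLOTS are assumed to be provided by recent <linux/prctl.h> headers, and the request only succeeds on kernels built with CONFIG_FUTEX_PRIVATE_HASH.

#include <stdio.h>
#include <sys/prctl.h>
#include <linux/prctl.h>

int main(void)
{
#ifdef PR_FUTEX_HASH
	/*
	 * Request a private futex hash with 16 slots. This is the operation
	 * that may block while references on the old hash drain, as described
	 * in the commit message above.
	 */
	if (prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_SET_SLOTS, 16, 0) != 0)
		perror("PR_FUTEX_HASH_SET_SLOTS");
	else
		printf("private futex hash resized to 16 slots\n");
#else
	printf("PR_FUTEX_HASH is not known to these headers\n");
#endif
	return 0;
}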
				
			
diff --git a/include/linux/futex.h b/include/linux/futex.h
@@ -85,18 +85,12 @@ int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4)
 #ifdef CONFIG_FUTEX_PRIVATE_HASH
 int futex_hash_allocate_default(void);
 void futex_hash_free(struct mm_struct *mm);
-
-static inline void futex_mm_init(struct mm_struct *mm)
-{
-	RCU_INIT_POINTER(mm->futex_phash, NULL);
-	mm->futex_phash_new = NULL;
-	mutex_init(&mm->futex_hash_lock);
-}
+int futex_mm_init(struct mm_struct *mm);
 
 #else /* !CONFIG_FUTEX_PRIVATE_HASH */
 static inline int futex_hash_allocate_default(void) { return 0; }
-static inline void futex_hash_free(struct mm_struct *mm) { }
-static inline void futex_mm_init(struct mm_struct *mm) { }
+static inline int futex_hash_free(struct mm_struct *mm) { return 0; }
+static inline int futex_mm_init(struct mm_struct *mm) { return 0; }
 #endif /* CONFIG_FUTEX_PRIVATE_HASH */
 
 #else /* !CONFIG_FUTEX */
@@ -118,8 +112,8 @@ static inline int futex_hash_allocate_default(void)
 {
 	return 0;
 }
-static inline void futex_hash_free(struct mm_struct *mm) { }
-static inline void futex_mm_init(struct mm_struct *mm) { }
+static inline int futex_hash_free(struct mm_struct *mm) { return 0; }
+static inline int futex_mm_init(struct mm_struct *mm) { return 0; }
 
 #endif
 
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
@@ -1070,6 +1070,11 @@ struct mm_struct {
 		struct mutex			futex_hash_lock;
 		struct futex_private_hash	__rcu *futex_phash;
 		struct futex_private_hash	*futex_phash_new;
+		/* futex-ref */
+		unsigned long			futex_batches;
+		struct rcu_head			futex_rcu;
+		atomic_long_t			futex_atomic;
+		unsigned int			__percpu *futex_ref;
 #endif
 
 		unsigned long hiwater_rss; /* High-watermark of RSS usage */
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
@@ -140,7 +140,7 @@ static inline bool mmget_not_zero(struct mm_struct *mm)
 
 /* mmput gets rid of the mappings and all user-space */
 extern void mmput(struct mm_struct *);
-#ifdef CONFIG_MMU
+#if defined(CONFIG_MMU) || defined(CONFIG_FUTEX_PRIVATE_HASH)
 /* same as above but performs the slow path from the async context. Can
  * be called from the atomic context as well
  */
diff --git a/init/Kconfig b/init/Kconfig
@@ -1716,13 +1716,9 @@ config FUTEX_PI
 	depends on FUTEX && RT_MUTEXES
 	default y
 
-#
-# marked broken for performance reasons; gives us one more cycle to sort things out.
-#
 config FUTEX_PRIVATE_HASH
 	bool
 	depends on FUTEX && !BASE_SMALL && MMU
-	depends on BROKEN
 	default y
 
 config FUTEX_MPOL
diff --git a/kernel/fork.c b/kernel/fork.c
@@ -1046,7 +1046,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	RCU_INIT_POINTER(mm->exe_file, NULL);
 	mmu_notifier_subscriptions_init(mm);
 	init_tlb_flush_pending(mm);
-	futex_mm_init(mm);
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS)
 	mm->pmd_huge_pte = NULL;
 #endif
@@ -1061,6 +1060,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 		mm->def_flags = 0;
 	}
 
+	if (futex_mm_init(mm))
+		goto fail_mm_init;
+
 	if (mm_alloc_pgd(mm))
 		goto fail_nopgd;
 
@@ -1090,6 +1092,8 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 fail_noid:
 	mm_free_pgd(mm);
 fail_nopgd:
+	futex_hash_free(mm);
+fail_mm_init:
 	free_mm(mm);
 	return NULL;
 }
@@ -1145,7 +1149,7 @@ void mmput(struct mm_struct *mm)
 }
 EXPORT_SYMBOL_GPL(mmput);
 
-#ifdef CONFIG_MMU
+#if defined(CONFIG_MMU) || defined(CONFIG_FUTEX_PRIVATE_HASH)
 static void mmput_async_fn(struct work_struct *work)
 {
 	struct mm_struct *mm = container_of(work, struct mm_struct,
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
@@ -42,7 +42,6 @@
 #include <linux/fault-inject.h>
 #include <linux/slab.h>
 #include <linux/prctl.h>
-#include <linux/rcuref.h>
 #include <linux/mempolicy.h>
 #include <linux/mmap_lock.h>
 
@@ -65,7 +64,7 @@ static struct {
 #define futex_queues	(__futex_data.queues)
 
 struct futex_private_hash {
-	rcuref_t	users;
+	int		state;
 	unsigned int	hash_mask;
 	struct rcu_head	rcu;
 	void		*mm;
@@ -129,6 +128,12 @@ static struct futex_hash_bucket *
 __futex_hash(union futex_key *key, struct futex_private_hash *fph);
 
 #ifdef CONFIG_FUTEX_PRIVATE_HASH
+static bool futex_ref_get(struct futex_private_hash *fph);
+static bool futex_ref_put(struct futex_private_hash *fph);
+static bool futex_ref_is_dead(struct futex_private_hash *fph);
+
+enum { FR_PERCPU = 0, FR_ATOMIC };
+
 static inline bool futex_key_is_private(union futex_key *key)
 {
 	/*
@@ -142,15 +147,14 @@ bool futex_private_hash_get(struct futex_private_hash *fph)
 {
 	if (fph->immutable)
 		return true;
-	return rcuref_get(&fph->users);
+	return futex_ref_get(fph);
 }
 
 void futex_private_hash_put(struct futex_private_hash *fph)
 {
-	/* Ignore return value, last put is verified via rcuref_is_dead() */
 	if (fph->immutable)
 		return;
-	if (rcuref_put(&fph->users))
+	if (futex_ref_put(fph))
 		wake_up_var(fph->mm);
 }
 
@@ -243,14 +247,18 @@ static bool __futex_pivot_hash(struct mm_struct *mm,
 	fph = rcu_dereference_protected(mm->futex_phash,
 					lockdep_is_held(&mm->futex_hash_lock));
 	if (fph) {
-		if (!rcuref_is_dead(&fph->users)) {
+		if (!futex_ref_is_dead(fph)) {
 			mm->futex_phash_new = new;
 			return false;
 		}
 
 		futex_rehash_private(fph, new);
 	}
-	rcu_assign_pointer(mm->futex_phash, new);
+	new->state = FR_PERCPU;
+	scoped_guard(rcu) {
+		mm->futex_batches = get_state_synchronize_rcu();
+		rcu_assign_pointer(mm->futex_phash, new);
+	}
 	kvfree_rcu(fph, rcu);
 	return true;
 }
@@ -289,9 +297,7 @@ struct futex_private_hash *futex_private_hash(void)
 		if (!fph)
 			return NULL;
 
-		if (fph->immutable)
-			return fph;
-		if (rcuref_get(&fph->users))
+		if (futex_private_hash_get(fph))
 			return fph;
 	}
 	futex_pivot_hash(mm);
@@ -1527,17 +1533,220 @@ static void futex_hash_bucket_init(struct futex_hash_bucket *fhb,
 #define FH_IMMUTABLE	0x02
 
 #ifdef CONFIG_FUTEX_PRIVATE_HASH
+
+/*
+ * futex-ref
+ *
+ * Heavily inspired by percpu-rwsem/percpu-refcount; not reusing any of that
+ * code because it just doesn't fit right.
+ *
+ * Dual counter, per-cpu / atomic approach like percpu-refcount, except it
+ * re-initializes the state automatically, such that the fph swizzle is also a
+ * transition back to per-cpu.
+ */
+
+static void futex_ref_rcu(struct rcu_head *head);
+
+static void __futex_ref_atomic_begin(struct futex_private_hash *fph)
+{
+	struct mm_struct *mm = fph->mm;
+
+	/*
+	 * The counter we're about to switch to must have fully switched;
+	 * otherwise it would be impossible for it to have reported success
+	 * from futex_ref_is_dead().
+	 */
+	WARN_ON_ONCE(atomic_long_read(&mm->futex_atomic) != 0);
+
+	/*
+	 * Set the atomic to the bias value such that futex_ref_{get,put}()
+	 * will never observe 0. Will be fixed up in __futex_ref_atomic_end()
+	 * when folding in the percpu count.
+	 */
+	atomic_long_set(&mm->futex_atomic, LONG_MAX);
+	smp_store_release(&fph->state, FR_ATOMIC);
+
+	call_rcu_hurry(&mm->futex_rcu, futex_ref_rcu);
+}
+
+static void __futex_ref_atomic_end(struct futex_private_hash *fph)
+{
+	struct mm_struct *mm = fph->mm;
+	unsigned int count = 0;
+	long ret;
+	int cpu;
+
+	/*
+	 * Per __futex_ref_atomic_begin() the state of the fph must be ATOMIC
+	 * and per this RCU callback, everybody must now observe this state and
+	 * use the atomic variable.
+	 */
+	WARN_ON_ONCE(fph->state != FR_ATOMIC);
+
+	/*
+	 * Therefore the per-cpu counter is now stable, sum and reset.
+	 */
+	for_each_possible_cpu(cpu) {
+		unsigned int *ptr = per_cpu_ptr(mm->futex_ref, cpu);
+		count += *ptr;
+		*ptr = 0;
+	}
+
+	/*
+	 * Re-init for the next cycle.
+	 */
+	this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */
+
+	/*
+	 * Add actual count, subtract bias and initial refcount.
+	 *
+	 * The moment this atomic operation happens, futex_ref_is_dead() can
+	 * become true.
+	 */
+	ret = atomic_long_add_return(count - LONG_MAX - 1, &mm->futex_atomic);
+	if (!ret)
+		wake_up_var(mm);
+
+	WARN_ON_ONCE(ret < 0);
+	mmput_async(mm);
+}
+
+static void futex_ref_rcu(struct rcu_head *head)
+{
+	struct mm_struct *mm = container_of(head, struct mm_struct, futex_rcu);
+	struct futex_private_hash *fph = rcu_dereference_raw(mm->futex_phash);
+
+	if (fph->state == FR_PERCPU) {
+		/*
+		 * Per this extra grace-period, everybody must now observe
+		 * fph as the current fph and no previously observed fph's
+		 * are in-flight.
+		 *
+		 * Notably, nobody will now rely on the atomic
+		 * futex_ref_is_dead() state anymore so we can begin the
+		 * migration of the per-cpu counter into the atomic.
+		 */
+		__futex_ref_atomic_begin(fph);
+		return;
+	}
+
+	__futex_ref_atomic_end(fph);
+}
+
+/*
+ * Drop the initial refcount and transition to atomics.
+ */
+static void futex_ref_drop(struct futex_private_hash *fph)
+{
+	struct mm_struct *mm = fph->mm;
+
+	/*
+	 * Can only transition the current fph;
+	 */
+	WARN_ON_ONCE(rcu_dereference_raw(mm->futex_phash) != fph);
+	/*
+	 * We enqueue at least one RCU callback. Ensure mm stays if the task
+	 * exits before the transition is completed.
+	 */
+	mmget(mm);
+
+	/*
+	 * In order to avoid the following scenario:
+	 *
+	 * futex_hash()			__futex_pivot_hash()
+	 *   guard(rcu);		  guard(mm->futex_hash_lock);
+	 *   fph = mm->futex_phash;
+	 *				  rcu_assign_pointer(&mm->futex_phash, new);
+	 *				futex_hash_allocate()
+	 *				  futex_ref_drop()
+	 *				    fph->state = FR_ATOMIC;
+	 *				    atomic_set(, BIAS);
+	 *
+	 *   futex_private_hash_get(fph); // OOPS
+	 *
+	 * Where an old fph (which is FR_ATOMIC) and should fail on
+	 * inc_not_zero, will succeed because a new transition is started and
+	 * the atomic is bias'ed away from 0.
+	 *
+	 * There must be at least one full grace-period between publishing a
+	 * new fph and trying to replace it.
+	 */
+	if (poll_state_synchronize_rcu(mm->futex_batches)) {
+		/*
+		 * There was a grace-period, we can begin now.
+		 */
+		__futex_ref_atomic_begin(fph);
+		return;
+	}
+
+	call_rcu_hurry(&mm->futex_rcu, futex_ref_rcu);
+}
+
+static bool futex_ref_get(struct futex_private_hash *fph)
+{
+	struct mm_struct *mm = fph->mm;
+
+	guard(rcu)();
+
+	if (smp_load_acquire(&fph->state) == FR_PERCPU) {
+		this_cpu_inc(*mm->futex_ref);
+		return true;
+	}
+
+	return atomic_long_inc_not_zero(&mm->futex_atomic);
+}
+
+static bool futex_ref_put(struct futex_private_hash *fph)
+{
+	struct mm_struct *mm = fph->mm;
+
+	guard(rcu)();
+
+	if (smp_load_acquire(&fph->state) == FR_PERCPU) {
+		this_cpu_dec(*mm->futex_ref);
+		return false;
+	}
+
+	return atomic_long_dec_and_test(&mm->futex_atomic);
+}
+
+static bool futex_ref_is_dead(struct futex_private_hash *fph)
+{
+	struct mm_struct *mm = fph->mm;
+
+	guard(rcu)();
+
+	if (smp_load_acquire(&fph->state) == FR_PERCPU)
+		return false;
+
+	return atomic_long_read(&mm->futex_atomic) == 0;
+}
+
+int futex_mm_init(struct mm_struct *mm)
+{
+	mutex_init(&mm->futex_hash_lock);
+	RCU_INIT_POINTER(mm->futex_phash, NULL);
+	mm->futex_phash_new = NULL;
+	/* futex-ref */
+	atomic_long_set(&mm->futex_atomic, 0);
+	mm->futex_batches = get_state_synchronize_rcu();
+	mm->futex_ref = alloc_percpu(unsigned int);
+	if (!mm->futex_ref)
+		return -ENOMEM;
+	this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */
+	return 0;
+}
+
 void futex_hash_free(struct mm_struct *mm)
 {
 	struct futex_private_hash *fph;
 
+	free_percpu(mm->futex_ref);
 	kvfree(mm->futex_phash_new);
 	fph = rcu_dereference_raw(mm->futex_phash);
-	if (fph) {
-		WARN_ON_ONCE(rcuref_read(&fph->users) > 1);
+	if (fph)
 		kvfree(fph);
-	}
 }
 
 static bool futex_pivot_pending(struct mm_struct *mm)
 {
@@ -1549,7 +1758,7 @@ static bool futex_pivot_pending(struct mm_struct *mm)
 		return true;
 
 	fph = rcu_dereference(mm->futex_phash);
-	return rcuref_is_dead(&fph->users);
+	return futex_ref_is_dead(fph);
 }
 
 static bool futex_hash_less(struct futex_private_hash *a,
@@ -1598,11 +1807,11 @@ static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags)
 		}
 	}
 
-	fph = kvzalloc(struct_size(fph, queues, hash_slots), GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
+	fph = kvzalloc(struct_size(fph, queues, hash_slots),
+		       GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
 	if (!fph)
 		return -ENOMEM;
 
-	rcuref_init(&fph->users, 1);
 	fph->hash_mask = hash_slots ? hash_slots - 1 : 0;
 	fph->custom = custom;
 	fph->immutable = !!(flags & FH_IMMUTABLE);
@@ -1645,7 +1854,7 @@ static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags)
 				 * allocated a replacement hash, drop the initial
 				 * reference on the existing hash.
 				 */
-				futex_private_hash_put(cur);
+				futex_ref_drop(cur);
 			}
 
 			if (new) {