forked from mirrors/linux
		
	Use atomic_try_cmpxchg() instead of atomic_cmpxchg(*ptr, old, new) == old in rcuref_put_slowpath(). On x86 the CMPXCHG instruction returns success in the ZF flag, so this change saves a compare after CMPXCHG. Additionally, the compiler reorders some code blocks to follow likely/unlikely annotations in the atomic_try_cmpxchg() macro, improving the code from: 9a: f0 0f b1 0b lock cmpxchg %ecx,(%rbx) 9e: 83 f8 ff cmp $0xffffffff,%eax a1: 74 04 je a7 <rcuref_put_slowpath+0x27> a3: 31 c0 xor %eax,%eax to: 9a: f0 0f b1 0b lock cmpxchg %ecx,(%rbx) 9e: 75 4c jne ec <rcuref_put_slowpath+0x6c> a0: b0 01 mov $0x1,%al No functional change intended. Signed-off-by: Uros Bizjak <ubizjak@gmail.com> Signed-off-by: Ingo Molnar <mingo@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Paul E. McKenney <paulmck@kernel.org> Link: https://lore.kernel.org/r/20230509150255.3691-1-ubizjak@gmail.com
		
			
				
	
	
		
			281 lines
		
	
	
	
		
			9.5 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			281 lines
		
	
	
	
		
			9.5 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
// SPDX-License-Identifier: GPL-2.0-only
 | 
						|
 | 
						|
/*
 | 
						|
 * rcuref - A scalable reference count implementation for RCU managed objects
 | 
						|
 *
 | 
						|
 * rcuref is provided to replace open coded reference count implementations
 | 
						|
 * based on atomic_t. It protects explicitly RCU managed objects which can
 | 
						|
 * be visible even after the last reference has been dropped and the object
 | 
						|
 * is heading towards destruction.
 | 
						|
 *
 | 
						|
 * A common usage pattern is:
 | 
						|
 *
 | 
						|
 * get()
 | 
						|
 *	rcu_read_lock();
 | 
						|
 *	p = get_ptr();
 | 
						|
 *	if (p && !atomic_inc_not_zero(&p->refcnt))
 | 
						|
 *		p = NULL;
 | 
						|
 *	rcu_read_unlock();
 | 
						|
 *	return p;
 | 
						|
 *
 | 
						|
 * put()
 | 
						|
 *	if (!atomic_dec_return(&p->refcnt)) {
 | 
						|
 *		remove_ptr(p);
 | 
						|
 *		kfree_rcu(p, rcu);
 | 
						|
 *	}
 | 
						|
 *
 | 
						|
 * atomic_inc_not_zero() is implemented with a try_cmpxchg() loop which has
 | 
						|
 * O(N^2) behaviour under contention with N concurrent operations.
 | 
						|
 *
 | 
						|
 * rcuref uses atomic_add_negative_relaxed() for the fast path, which scales
 | 
						|
 * better under contention.
 | 
						|
 *
 | 
						|
 * Why not refcount?
 | 
						|
 * =================
 | 
						|
 *
 | 
						|
 * In principle it should be possible to make refcount use the rcuref
 | 
						|
 * scheme, but the destruction race described below cannot be prevented
 | 
						|
 * unless the protected object is RCU managed.
 | 
						|
 *
 | 
						|
 * Theory of operation
 | 
						|
 * ===================
 | 
						|
 *
 | 
						|
 * rcuref uses an unsigned integer reference counter. As long as the
 | 
						|
 * counter value is greater than or equal to RCUREF_ONEREF and not larger
 | 
						|
 * than RCUREF_MAXREF the reference is alive:
 | 
						|
 *
 | 
						|
 * ONEREF   MAXREF                SATURATED             RELEASED      DEAD    NOREF
 * 0        0x7FFFFFFF 0x80000000 0xA0000000 0xBFFFFFFF 0xC0000000 0xE0000000 0xFFFFFFFF
 * <---valid --------> <-------saturation zone--------> <-----dead zone----->
 | 
						|
 *
 | 
						|
 * The get() and put() operations do unconditional increments and
 | 
						|
 * decrements. The result is checked after the operation. This optimizes
 | 
						|
 * for the fast path.
 | 
						|
 *
 | 
						|
 * If the reference count is saturated or dead, then the increments and
 | 
						|
 * decrements are not harmful as the reference count still stays in the
 | 
						|
 * respective zones and is always set back to SATURATED resp. DEAD. The
 | 
						|
 * zones have room for 2^28 racing operations in each direction, which
 | 
						|
 * makes it practically impossible to escape the zones.
 | 
						|
 *
 | 
						|
 * Once the last reference is dropped the reference count becomes
 | 
						|
 * RCUREF_NOREF which forces rcuref_put() into the slowpath operation. The
 | 
						|
 * slowpath then tries to set the reference count from RCUREF_NOREF to
 | 
						|
 * RCUREF_DEAD via a cmpxchg(). This opens a small window where a
 | 
						|
 * concurrent rcuref_get() can acquire the reference count and bring it
 | 
						|
 * back to RCUREF_ONEREF or even drop the reference again and mark it DEAD.
 | 
						|
 *
 | 
						|
 * If the cmpxchg() succeeds then a concurrent rcuref_get() will result in
 | 
						|
 * DEAD + 1, which is inside the dead zone. If that happens the reference
 | 
						|
 * count is put back to DEAD.
 | 
						|
 *
 | 
						|
 * The actual race is possible due to the unconditional increment and
 | 
						|
 * decrements in rcuref_get() and rcuref_put():
 | 
						|
 *
 | 
						|
 *	T1				T2
 | 
						|
 *	get()				put()
 | 
						|
 *					if (atomic_add_negative(-1, &ref->refcnt))
 | 
						|
 *		succeeds->			atomic_cmpxchg(&ref->refcnt, NOREF, DEAD);
 | 
						|
 *
 | 
						|
 *	atomic_add_negative(1, &ref->refcnt);	<- Elevates refcount to DEAD + 1
 | 
						|
 *
 | 
						|
 * As the result of T1's add is negative, the get() goes into the slow path
 | 
						|
 * and observes refcnt being in the dead zone which makes the operation fail.
 | 
						|
 *
 | 
						|
 * Possible critical states:
 | 
						|
 *
 | 
						|
 *	Context Counter	References	Operation
 | 
						|
 *	T1	0	1		init()
 | 
						|
 *	T2	1	2		get()
 | 
						|
 *	T1	0	1		put()
 | 
						|
 *	T2     -1	0		put() tries to mark dead
 | 
						|
 *	T1	0	1		get()
 | 
						|
 *	T2	0	1		put() mark dead fails
 | 
						|
 *	T1     -1	0		put() tries to mark dead
 | 
						|
 *	T1    DEAD	0		put() mark dead succeeds
 | 
						|
 *	T2    DEAD+1	0		get() fails and puts it back to DEAD
 | 
						|
 *
 | 
						|
 * Of course there are more complex scenarios, but the above illustrates
 | 
						|
 * the working principle. The rest is left to the imagination of the
 | 
						|
 * reader.
 | 
						|
 *
 | 
						|
 * Deconstruction race
 | 
						|
 * ===================
 | 
						|
 *
 | 
						|
 * The release operation must be protected by prohibiting a grace period in
 | 
						|
 * order to prevent a possible use after free:
 | 
						|
 *
 | 
						|
 *	T1				T2
 | 
						|
 *	put()				get()
 | 
						|
 *	// ref->refcnt = ONEREF
 | 
						|
 *	if (!atomic_add_negative(-1, &ref->refcnt))
 | 
						|
 *		return false;				<- Not taken
 | 
						|
 *
 | 
						|
 *	// ref->refcnt == NOREF
 | 
						|
 *	--> preemption
 | 
						|
 *					// Elevates ref->refcnt to ONEREF
 | 
						|
 *					if (!atomic_add_negative(1, &ref->refcnt))
 | 
						|
 *						return true;			<- taken
 | 
						|
 *
 | 
						|
 *					if (put(&p->ref)) { <-- Succeeds
 | 
						|
 *						remove_pointer(p);
 | 
						|
 *						kfree_rcu(p, rcu);
 | 
						|
 *					}
 | 
						|
 *
 | 
						|
 *		RCU grace period ends, object is freed
 | 
						|
 *
 | 
						|
 *	atomic_cmpxchg(&ref->refcnt, NOREF, DEAD);	<- UAF
 | 
						|
 *
 | 
						|
 * This is prevented by disabling preemption around the put() operation as
 | 
						|
 * that's in most kernel configurations cheaper than a rcu_read_lock() /
 | 
						|
 * rcu_read_unlock() pair and in many cases even a NOOP. In any case it
 | 
						|
 * prevents the grace period which keeps the object alive until all put()
 | 
						|
 * operations complete.
 | 
						|
 *
 | 
						|
 * Saturation protection
 | 
						|
 * =====================
 | 
						|
 *
 | 
						|
 * The reference count has a saturation limit RCUREF_MAXREF (INT_MAX).
 | 
						|
 * Once this is exceeded the reference count becomes stale by setting it
 | 
						|
 * to RCUREF_SATURATED, which will cause a memory leak, but it prevents
 | 
						|
 * wrap arounds which obviously cause worse problems than a memory
 | 
						|
 * leak. When saturation is reached a warning is emitted.
 | 
						|
 *
 | 
						|
 * Race conditions
 | 
						|
 * ===============
 | 
						|
 *
 | 
						|
 * All reference count increment/decrement operations are unconditional and
 | 
						|
 * only verified after the fact. This optimizes for the good case and takes
 | 
						|
 * the occasional race vs. a dead or already saturated refcount into
 | 
						|
 * account. The saturation and dead zones are large enough to accommodate
 | 
						|
 * for that.
 | 
						|
 *
 | 
						|
 * Memory ordering
 | 
						|
 * ===============
 | 
						|
 *
 | 
						|
 * Memory ordering rules are slightly relaxed wrt regular atomic_t functions
 | 
						|
 * and provide only what is strictly required for refcounts.
 | 
						|
 *
 | 
						|
 * The increments are fully relaxed; these will not provide ordering. The
 | 
						|
 * rationale is that whatever is used to obtain the object to increase the
 | 
						|
 * reference count on will provide the ordering. For locked data
 | 
						|
 * structures, it's the lock acquire, for RCU/lockless data structures it's
 | 
						|
 * the dependent load.
 | 
						|
 *
 | 
						|
 * rcuref_get() provides a control dependency ordering future stores which
 | 
						|
 * ensures that the object is not modified when acquiring a reference
 | 
						|
 * fails.
 | 
						|
 *
 | 
						|
 * rcuref_put() provides release order, i.e. all prior loads and stores
 | 
						|
 * will be issued before. It also provides a control dependency ordering
 | 
						|
 * against the subsequent destruction of the object.
 | 
						|
 *
 | 
						|
 * If rcuref_put() successfully dropped the last reference and marked the
 | 
						|
 * object DEAD it also provides acquire ordering.
 | 
						|
 */
 | 
						|
 | 
						|
#include <linux/export.h>
 | 
						|
#include <linux/rcuref.h>
 | 
						|
 | 
						|
/**
 | 
						|
 * rcuref_get_slowpath - Slowpath of rcuref_get()
 | 
						|
 * @ref:	Pointer to the reference count
 | 
						|
 *
 | 
						|
 * Invoked when the reference count is outside of the valid zone.
 | 
						|
 *
 | 
						|
 * Return:
 | 
						|
 *	False if the reference count was already marked dead
 | 
						|
 *
 | 
						|
 *	True if the reference count is saturated, which prevents the
 | 
						|
 *	object from being deconstructed ever.
 | 
						|
 */
 | 
						|
bool rcuref_get_slowpath(rcuref_t *ref)
 | 
						|
{
 | 
						|
	unsigned int cnt = atomic_read(&ref->refcnt);
 | 
						|
 | 
						|
	/*
 | 
						|
	 * If the reference count was already marked dead, undo the
 | 
						|
	 * increment so it stays in the middle of the dead zone and return
 | 
						|
	 * fail.
 | 
						|
	 */
 | 
						|
	if (cnt >= RCUREF_RELEASED) {
 | 
						|
		atomic_set(&ref->refcnt, RCUREF_DEAD);
 | 
						|
		return false;
 | 
						|
	}
 | 
						|
 | 
						|
	/*
 | 
						|
	 * If it was saturated, warn and mark it so. In case the increment
 | 
						|
	 * was already on a saturated value restore the saturation
 | 
						|
	 * marker. This keeps it in the middle of the saturation zone and
 | 
						|
	 * prevents the reference count from overflowing. This leaks the
 | 
						|
	 * object memory, but prevents the obvious reference count overflow
 | 
						|
	 * damage.
 | 
						|
	 */
 | 
						|
	if (WARN_ONCE(cnt > RCUREF_MAXREF, "rcuref saturated - leaking memory"))
 | 
						|
		atomic_set(&ref->refcnt, RCUREF_SATURATED);
 | 
						|
	return true;
 | 
						|
}
 | 
						|
EXPORT_SYMBOL_GPL(rcuref_get_slowpath);
 | 
						|
 | 
						|
/**
 | 
						|
 * rcuref_put_slowpath - Slowpath of __rcuref_put()
 | 
						|
 * @ref:	Pointer to the reference count
 | 
						|
 *
 | 
						|
 * Invoked when the reference count is outside of the valid zone.
 | 
						|
 *
 | 
						|
 * Return:
 | 
						|
 *	True if this was the last reference with no future references
 | 
						|
 *	possible. This signals the caller that it can safely schedule the
 | 
						|
 *	object, which is protected by the reference counter, for
 | 
						|
 *	deconstruction.
 | 
						|
 *
 | 
						|
 *	False if there are still active references or the put() raced
 | 
						|
 *	with a concurrent get()/put() pair. Caller is not allowed to
 | 
						|
 *	deconstruct the protected object.
 | 
						|
 */
 | 
						|
bool rcuref_put_slowpath(rcuref_t *ref)
 | 
						|
{
 | 
						|
	unsigned int cnt = atomic_read(&ref->refcnt);
 | 
						|
 | 
						|
	/* Did this drop the last reference? */
 | 
						|
	if (likely(cnt == RCUREF_NOREF)) {
 | 
						|
		/*
 | 
						|
		 * Carefully try to set the reference count to RCUREF_DEAD.
 | 
						|
		 *
 | 
						|
		 * This can fail if a concurrent get() operation has
 | 
						|
		 * elevated it again or the corresponding put() even marked
 | 
						|
		 * it dead already. Both are valid situations and do not
 | 
						|
		 * require a retry. If this fails the caller is not
 | 
						|
		 * allowed to deconstruct the object.
 | 
						|
		 */
 | 
						|
		if (!atomic_try_cmpxchg_release(&ref->refcnt, &cnt, RCUREF_DEAD))
 | 
						|
			return false;
 | 
						|
 | 
						|
		/*
 | 
						|
		 * The caller can safely schedule the object for
 | 
						|
		 * deconstruction. Provide acquire ordering.
 | 
						|
		 */
 | 
						|
		smp_acquire__after_ctrl_dep();
 | 
						|
		return true;
 | 
						|
	}
 | 
						|
 | 
						|
	/*
 | 
						|
	 * If the reference count was already in the dead zone, then this
 | 
						|
	 * put() operation is imbalanced. Warn, put the reference count back to
 | 
						|
	 * DEAD and tell the caller to not deconstruct the object.
 | 
						|
	 */
 | 
						|
	if (WARN_ONCE(cnt >= RCUREF_RELEASED, "rcuref - imbalanced put()")) {
 | 
						|
		atomic_set(&ref->refcnt, RCUREF_DEAD);
 | 
						|
		return false;
 | 
						|
	}
 | 
						|
 | 
						|
	/*
 | 
						|
	 * This is a put() operation on a saturated refcount. Restore the
 | 
						|
	 * mean saturation value and tell the caller to not deconstruct the
 | 
						|
	 * object.
 | 
						|
	 */
 | 
						|
	if (cnt > RCUREF_MAXREF)
 | 
						|
		atomic_set(&ref->refcnt, RCUREF_SATURATED);
 | 
						|
	return false;
 | 
						|
}
 | 
						|
EXPORT_SYMBOL_GPL(rcuref_put_slowpath);
 |