mirror of
				https://github.com/torvalds/linux.git
				synced 2025-11-03 18:20:25 +02:00 
			
		
		
		
	The ticket-lock implementation of arch_spin_value_unlocked() causes the
compiler to generate inefficient assembly code on the RISC-V architecture
because of an unnecessary memory access to the contended value.
Before the patch:
	void lockref_get(struct lockref *lockref)
	{
	  78:   fd010113                add     sp,sp,-48
	  7c:   02813023                sd      s0,32(sp)
	  80:   02113423                sd      ra,40(sp)
	  84:   03010413                add     s0,sp,48
	0000000000000088 <.LBB296>:
		CMPXCHG_LOOP(
	  88:   00053783                ld      a5,0(a0)
After the patch:
	void lockref_get(struct lockref *lockref)
	{
		CMPXCHG_LOOP(
	  78:   00053783                ld      a5,0(a0)
After the patch, lockref_get() enters its fast path immediately instead of
first executing the function's prologue. This is because the ticket lock's
complex logic limits compiler optimization of the spinlock fast path, whereas
qspinlock's does not.
Callers of arch_spin_value_unlocked() benefit from this change. Currently,
the only caller is lockref.
Signed-off-by: Guo Ren <guoren@kernel.org>
Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Waiman Long <longman@redhat.com>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20230908154339.3250567-1-guoren@kernel.org
		
	
			
		
			
				
	
	
		
			94 lines
		
	
	
	
		
			2.7 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			94 lines
		
	
	
	
		
			2.7 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
/* SPDX-License-Identifier: GPL-2.0 */
 | 
						|
 | 
						|
/*
 | 
						|
 * 'Generic' ticket-lock implementation.
 | 
						|
 *
 | 
						|
 * It relies on atomic_fetch_add() having well defined forward progress
 | 
						|
 * guarantees under contention. If your architecture cannot provide this, stick
 | 
						|
 * to a test-and-set lock.
 | 
						|
 *
 | 
						|
 * It also relies on atomic_fetch_add() being safe vs smp_store_release() on a
 | 
						|
 * sub-word of the value. This is generally true for anything LL/SC although
 | 
						|
 * you'd be hard pressed to find anything useful in architecture specifications
 | 
						|
 * about this. If your architecture cannot do this you might be better off with
 | 
						|
 * a test-and-set.
 | 
						|
 *
 | 
						|
 * It further assumes atomic_*_release() + atomic_*_acquire() is RCpc and hence
 | 
						|
 * uses atomic_fetch_add() which is RCsc to create an RCsc hot path, along with
 | 
						|
 * a full fence after the spin to upgrade the otherwise-RCpc
 | 
						|
 * atomic_cond_read_acquire().
 | 
						|
 *
 | 
						|
 * The implementation uses smp_cond_load_acquire() to spin, so if the
 | 
						|
 * architecture has WFE like instructions to sleep instead of poll for word
 | 
						|
 * modifications be sure to implement that (see ARM64 for example).
 | 
						|
 *
 | 
						|
 */
 | 
						|
 | 
						|
#ifndef __ASM_GENERIC_SPINLOCK_H
 | 
						|
#define __ASM_GENERIC_SPINLOCK_H
 | 
						|
 | 
						|
#include <linux/atomic.h>
 | 
						|
#include <asm-generic/spinlock_types.h>
 | 
						|
 | 
						|
/*
 * Acquire the ticket lock: take a ticket from the upper 16 bits of the lock
 * word, then wait until the lower 16 bits show that same ticket.
 */
static __always_inline void arch_spin_lock(arch_spinlock_t *lock)
{
	/* Bump the upper half by one; the word we get back holds our ticket. */
	u32 old = atomic_fetch_add(1<<16, lock);
	u16 mine = old >> 16;

	if (mine != (u16)old) {
		/*
		 * Slow path: somebody else is being served, so spin until the
		 * lower half reaches our ticket.
		 *
		 * atomic_cond_read_acquire() only provides RCpc ordering.
		 * Instead of introducing a dedicated cond_read_rcsc(), simply
		 * follow it with a full fence.  Strictly, only the "prior
		 * reads before subsequent writes" ordering of smp_mb() is
		 * required, but because the spin issues nothing but reads and
		 * the atomic_fetch_add() above left no writes outstanding, the
		 * additional orderings come for free.
		 */
		atomic_cond_read_acquire(lock, mine == (u16)VAL);
		smp_mb();
	}
}
 | 
						|
 | 
						|
/*
 * Attempt to take the lock without spinning.
 *
 * Returns false straight away when the two 16-bit halves of the lock word
 * differ; otherwise returns the result of the compare-and-exchange that tries
 * to advance the upper half by one.
 */
static __always_inline bool arch_spin_trylock(arch_spinlock_t *lock)
{
	u32 cur = atomic_read(lock);
	u32 upper = cur >> 16;
	u32 lower = cur & 0xffff;

	if (upper != lower)
		return false;

	/* SC, for RCsc */
	return atomic_try_cmpxchg(lock, &cur, cur + (1<<16));
}
 | 
						|
 | 
						|
/*
 * Release the lock: advance the low 16 bits of the lock word by one with a
 * release store that touches only that half-word.
 */
static __always_inline void arch_spin_unlock(arch_spinlock_t *lock)
{
	u32 cur = atomic_read(lock);
	u16 *low_half = (u16 *)lock;

	/* The low half-word is the second u16 on big-endian, the first otherwise. */
	low_half += IS_ENABLED(CONFIG_CPU_BIG_ENDIAN);

	smp_store_release(low_half, (u16)cur + 1);
}
 | 
						|
 | 
						|
/*
 * Check a lock value that was passed by copy: it counts as unlocked when the
 * upper and lower 16-bit halves of the counter are equal.
 *
 * Returns non-zero if unlocked, zero otherwise.
 */
static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
{
	u32 word = lock.counter;
	u32 upper = word >> 16;
	u32 lower = word & 0xffff;

	return upper == lower;
}
 | 
						|
 | 
						|
/*
 * Report whether the lock is currently held, judged from a single
 * READ_ONCE() snapshot of the lock word.
 *
 * Returns 1 if locked, 0 if unlocked.
 */
static __always_inline int arch_spin_is_locked(arch_spinlock_t *lock)
{
	arch_spinlock_t snapshot = READ_ONCE(*lock);

	if (arch_spin_value_unlocked(snapshot))
		return 0;

	return 1;
}
 | 
						|
 | 
						|
/*
 * Report whether the lock is contended: the signed 16-bit distance from the
 * lower half of the lock word to the upper half is greater than one.
 *
 * Returns non-zero if contended, zero otherwise.
 */
static __always_inline int arch_spin_is_contended(arch_spinlock_t *lock)
{
	u32 cur = atomic_read(lock);
	s16 outstanding = (s16)((cur >> 16) - (cur & 0xffff));

	return outstanding > 1;
}
 | 
						|
 | 
						|
#include <asm/qrwlock.h>
 | 
						|
 | 
						|
#endif /* __ASM_GENERIC_SPINLOCK_H */
 |