forked from mirrors/linux
		
	 ee1ee6db07
			
		
	
	
		ee1ee6db07
		
	
	
	
	
		
			
			atomic_t based reference counting, including refcount_t, uses
atomic_inc_not_zero() for acquiring a reference. atomic_inc_not_zero() is
implemented with an atomic_try_cmpxchg() loop. High contention on the
reference count leads to retry loops and scales badly. There is nothing to
improve on this implementation as the semantics have to be preserved.
Provide rcuref as a scalable alternative solution which is suitable for RCU
managed objects. Similar to refcount_t it comes with overflow and underflow
detection and mitigation.
rcuref treats the underlying atomic_t as an unsigned integer and partitions
this space into zones:
  0x00000000 - 0x7FFFFFFF	valid zone (1 .. (INT_MAX + 1) references)
  0x80000000 - 0xBFFFFFFF	saturation zone
  0xC0000000 - 0xFFFFFFFE	dead zone
  0xFFFFFFFF   			no reference
rcuref_get() unconditionally increments the reference count with
atomic_add_negative_relaxed(). rcuref_put() unconditionally decrements the
reference count with atomic_add_negative_release().
This unconditional increment avoids the inc_not_zero() problem, but
requires a more complex implementation on the put() side when the count
drops from 0 to -1.
When this transition is detected then it is attempted to mark the reference
count dead, by setting it to the midpoint of the dead zone with a single
atomic_cmpxchg_release() operation. This operation can fail due to a
concurrent rcuref_get() elevating the reference count from -1 to 0 again.
If the unconditional increment in rcuref_get() hits a reference count which
is marked dead (or saturated) it will detect it after the fact and bring
back the reference count to the midpoint of the respective zone. The zones
provide enough tolerance which makes it practically impossible to escape
from a zone.
The racy implementation of rcuref_put() requires that rcuref_put() is
protected against a grace period ending in order to prevent a subtle
use-after-free. As RCU is the only mechanism which can protect against that, it
is not possible to fully replace the atomic_inc_not_zero() based
implementation of refcount_t with this scheme.
The final drop is slightly more expensive than the atomic_dec_return()
counterpart, but that's not the case which this is optimized for. The
optimization is on the high-frequency get()/put() pairs and their
scalability.
The performance of an uncontended rcuref_get()/put() pair where the put()
is not dropping the last reference is still on par with the plain atomic
operations, while at the same time providing overflow and underflow
detection and mitigation.
The performance of rcuref compared to plain atomic_inc_not_zero() and
atomic_dec_return() based reference counting under contention:
 -  Micro benchmark: All CPUs running an increment/decrement loop on an
    elevated reference count, which means the 0 to -1 transition never
    happens.
    The performance gain depends on microarchitecture and the number of
    CPUs and has been observed in the range of 1.3X to 4.7X
 - Conversion of dst_entry::__refcnt to rcuref and testing with the
    localhost memtier/memcached benchmark. That benchmark shows the
    reference count contention prominently.
    The performance gain depends on microarchitecture and the number of
    CPUs and has been observed in the range of 1.1X to 2.6X over the
    previous fix for the false sharing issue vs. struct
    dst_entry::__refcnt.
    When memtier is run over a real 1Gb network connection, there is a
    small gain on top of the false sharing fix. The two changes combined
    result in a 2%-5% total gain for that networked test.
Reported-by: Wangyang Guo <wangyang.guo@intel.com>
Reported-by: Arjan Van De Ven <arjan.van.de.ven@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20230323102800.158429195@linutronix.de
		
	
			
		
			
				
	
	
		
			242 lines
		
	
	
	
		
			5.8 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			242 lines
		
	
	
	
		
			5.8 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /* SPDX-License-Identifier: GPL-2.0 */
 | |
| #ifndef _LINUX_TYPES_H
 | |
| #define _LINUX_TYPES_H
 | |
| 
 | |
| #define __EXPORTED_HEADERS__
 | |
| #include <uapi/linux/types.h>
 | |
| 
 | |
| #ifndef __ASSEMBLY__
 | |
| 
 | |
| #define DECLARE_BITMAP(name,bits) \
 | |
| 	unsigned long name[BITS_TO_LONGS(bits)]
 | |
| 
 | |
| typedef u32 __kernel_dev_t;
 | |
| 
 | |
| typedef __kernel_fd_set		fd_set;
 | |
| typedef __kernel_dev_t		dev_t;
 | |
| typedef __kernel_ulong_t	ino_t;
 | |
| typedef __kernel_mode_t		mode_t;
 | |
| typedef unsigned short		umode_t;
 | |
| typedef u32			nlink_t;
 | |
| typedef __kernel_off_t		off_t;
 | |
| typedef __kernel_pid_t		pid_t;
 | |
| typedef __kernel_daddr_t	daddr_t;
 | |
| typedef __kernel_key_t		key_t;
 | |
| typedef __kernel_suseconds_t	suseconds_t;
 | |
| typedef __kernel_timer_t	timer_t;
 | |
| typedef __kernel_clockid_t	clockid_t;
 | |
| typedef __kernel_mqd_t		mqd_t;
 | |
| 
 | |
| typedef _Bool			bool;
 | |
| 
 | |
| typedef __kernel_uid32_t	uid_t;
 | |
| typedef __kernel_gid32_t	gid_t;
 | |
| typedef __kernel_uid16_t        uid16_t;
 | |
| typedef __kernel_gid16_t        gid16_t;
 | |
| 
 | |
| typedef unsigned long		uintptr_t;
 | |
| 
 | |
| #ifdef CONFIG_HAVE_UID16
 | |
| /* This is defined by include/asm-{arch}/posix_types.h */
 | |
| typedef __kernel_old_uid_t	old_uid_t;
 | |
| typedef __kernel_old_gid_t	old_gid_t;
 | |
| #endif /* CONFIG_HAVE_UID16 */
 | |
| 
 | |
| #if defined(__GNUC__)
 | |
| typedef __kernel_loff_t		loff_t;
 | |
| #endif
 | |
| 
 | |
| /*
 | |
|  * The following typedefs are also protected by individual ifdefs for
 | |
|  * historical reasons:
 | |
|  */
 | |
| #ifndef _SIZE_T
 | |
| #define _SIZE_T
 | |
| typedef __kernel_size_t		size_t;
 | |
| #endif
 | |
| 
 | |
| #ifndef _SSIZE_T
 | |
| #define _SSIZE_T
 | |
| typedef __kernel_ssize_t	ssize_t;
 | |
| #endif
 | |
| 
 | |
| #ifndef _PTRDIFF_T
 | |
| #define _PTRDIFF_T
 | |
| typedef __kernel_ptrdiff_t	ptrdiff_t;
 | |
| #endif
 | |
| 
 | |
| #ifndef _CLOCK_T
 | |
| #define _CLOCK_T
 | |
| typedef __kernel_clock_t	clock_t;
 | |
| #endif
 | |
| 
 | |
| #ifndef _CADDR_T
 | |
| #define _CADDR_T
 | |
| typedef __kernel_caddr_t	caddr_t;
 | |
| #endif
 | |
| 
 | |
| /* bsd */
 | |
| typedef unsigned char		u_char;
 | |
| typedef unsigned short		u_short;
 | |
| typedef unsigned int		u_int;
 | |
| typedef unsigned long		u_long;
 | |
| 
 | |
| /* sysv */
 | |
| typedef unsigned char		unchar;
 | |
| typedef unsigned short		ushort;
 | |
| typedef unsigned int		uint;
 | |
| typedef unsigned long		ulong;
 | |
| 
 | |
| #ifndef __BIT_TYPES_DEFINED__
 | |
| #define __BIT_TYPES_DEFINED__
 | |
| 
 | |
| typedef u8			u_int8_t;
 | |
| typedef s8			int8_t;
 | |
| typedef u16			u_int16_t;
 | |
| typedef s16			int16_t;
 | |
| typedef u32			u_int32_t;
 | |
| typedef s32			int32_t;
 | |
| 
 | |
| #endif /* !(__BIT_TYPES_DEFINED__) */
 | |
| 
 | |
| typedef u8			uint8_t;
 | |
| typedef u16			uint16_t;
 | |
| typedef u32			uint32_t;
 | |
| 
 | |
| #if defined(__GNUC__)
 | |
| typedef u64			uint64_t;
 | |
| typedef u64			u_int64_t;
 | |
| typedef s64			int64_t;
 | |
| #endif
 | |
| 
 | |
| /* this is a special 64bit data type that is 8-byte aligned */
 | |
| #define aligned_u64		__aligned_u64
 | |
| #define aligned_be64		__aligned_be64
 | |
| #define aligned_le64		__aligned_le64
 | |
| 
 | |
| /**
 | |
|  * sector_t is the type used for indexing onto a disc or disc partition.
 | |
|  *
 | |
|  * Linux always considers sectors to be 512 bytes long, independently
 | |
|  * of the device's real block size.
 | |
|  *
 | |
|  * blkcnt_t is the type of the inode's block count.
 | |
|  */
 | |
| typedef u64 sector_t;
 | |
| typedef u64 blkcnt_t;
 | |
| 
 | |
| /*
 | |
|  * The type of an index into the pagecache.
 | |
|  */
 | |
| #define pgoff_t unsigned long
 | |
| 
 | |
| /*
 | |
|  * A dma_addr_t can hold any valid DMA address, i.e., any address returned
 | |
|  * by the DMA API.
 | |
|  *
 | |
|  * If the DMA API only uses 32-bit addresses, dma_addr_t need only be 32
 | |
|  * bits wide.  Bus addresses, e.g., PCI BARs, may be wider than 32 bits,
 | |
|  * but drivers do memory-mapped I/O to ioremapped kernel virtual addresses,
 | |
|  * so they don't care about the size of the actual bus addresses.
 | |
|  */
 | |
| #ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
 | |
| typedef u64 dma_addr_t;
 | |
| #else
 | |
| typedef u32 dma_addr_t;
 | |
| #endif
 | |
| 
 | |
| typedef unsigned int __bitwise gfp_t;
 | |
| typedef unsigned int __bitwise slab_flags_t;
 | |
| typedef unsigned int __bitwise fmode_t;
 | |
| 
 | |
| #ifdef CONFIG_PHYS_ADDR_T_64BIT
 | |
| typedef u64 phys_addr_t;
 | |
| #else
 | |
| typedef u32 phys_addr_t;
 | |
| #endif
 | |
| 
 | |
| typedef phys_addr_t resource_size_t;
 | |
| 
 | |
| /*
 | |
|  * This type is the placeholder for a hardware interrupt number. It has to be
 | |
|  * big enough to enclose whatever representation is used by a given platform.
 | |
|  */
 | |
| typedef unsigned long irq_hw_number_t;
 | |
| 
 | |
/*
 * atomic_t - integer counter wrapped in a struct.
 *
 * The struct wrapper means the value cannot be used directly in plain
 * integer expressions; the counter has to be reached through the member.
 */
typedef struct {
	int counter;
} atomic_t;

/* Static initializer for an atomic_t holding the value @i. */
#define ATOMIC_INIT(i) { (i) }

#ifdef CONFIG_64BIT
/* 64-bit counterpart of atomic_t; only provided when CONFIG_64BIT is set. */
typedef struct {
	s64 counter;
} atomic64_t;
#endif

/*
 * rcuref_t - reference count built on top of an atomic_t.
 *
 * The stored value is biased by one: a count of N references is kept as
 * N - 1, so a single reference is stored as 0 and "no reference" as -1
 * (0xFFFFFFFF when viewed as unsigned).
 */
typedef struct {
	atomic_t refcnt;
} rcuref_t;

/*
 * Static initializer for an rcuref_t holding @i references.
 *
 * The argument is parenthesized before the bias is applied so that
 * expressions with lower precedence than binary minus (e.g. "a & b" or
 * "c ? x : y") are evaluated as a whole before 1 is subtracted.
 */
#define RCUREF_INIT(i)	{ .refcnt = ATOMIC_INIT((i) - 1) }
 | |
| 
 | |
| struct list_head { /* list node: links to the next and previous list_head */
 | |
| 	struct list_head *next, *prev;
 | |
| };
 | |
| 
 | |
| struct hlist_head { /* list head holding only a pointer to the first hlist_node */
 | |
| 	struct hlist_node *first;
 | |
| };
 | |
| 
 | |
| struct hlist_node { /* node for a list anchored by struct hlist_head */
 | |
| 	struct hlist_node *next, **pprev; /* NOTE(review): pprev presumably points at the previous node's next (or the head's first) - confirm against the list helpers */
 | |
| };
 | |
| 
 | |
| struct ustat {
 | |
| 	__kernel_daddr_t	f_tfree;
 | |
| #ifdef CONFIG_ARCH_32BIT_USTAT_F_TINODE
 | |
| 	unsigned int		f_tinode;
 | |
| #else
 | |
| 	unsigned long		f_tinode;
 | |
| #endif
 | |
| 	char			f_fname[6];
 | |
| 	char			f_fpack[6];
 | |
| };
 | |
| 
 | |
| /**
 | |
|  * struct callback_head - callback structure for use with RCU and task_work
 | |
|  * @next: next update request in a list
 | |
|  * @func: actual update function to call after the grace period.
 | |
|  *
 | |
|  * The struct is aligned to the size of a pointer. On most architectures this
 | |
|  * happens naturally due to ABI requirements, but some architectures (like
 | |
|  * CRIS) have a weird ABI and we need to request it explicitly.
 | |
|  *
 | |
|  * The alignment is required to guarantee that bit 0 of @next will be
 | |
|  * clear under normal conditions -- as long as we use call_rcu() or
 | |
|  * call_srcu() to queue the callback.
 | |
|  *
 | |
|  * This guarantee is important for a few reasons:
 | |
|  *  - future call_rcu_lazy() will make use of lower bits in the pointer;
 | |
|  *  - the structure shares storage space in struct page with @compound_head,
 | |
|  *    which encodes PageTail() in bit 0. The guarantee is needed to avoid
 | |
|  *    false-positive PageTail().
 | |
|  */
 | |
| struct callback_head {
 | |
| 	struct callback_head *next;
 | |
| 	void (*func)(struct callback_head *head);
 | |
| } __attribute__((aligned(sizeof(void *))));
 | |
| #define rcu_head callback_head
 | |
| 
 | |
| typedef void (*rcu_callback_t)(struct rcu_head *head);
 | |
| typedef void (*call_rcu_func_t)(struct rcu_head *head, rcu_callback_t func);
 | |
| 
 | |
| typedef void (*swap_r_func_t)(void *a, void *b, int size, const void *priv);
 | |
| typedef void (*swap_func_t)(void *a, void *b, int size);
 | |
| 
 | |
| typedef int (*cmp_r_func_t)(const void *a, const void *b, const void *priv);
 | |
| typedef int (*cmp_func_t)(const void *a, const void *b);
 | |
| 
 | |
| #endif /*  __ASSEMBLY__ */
 | |
| #endif /* _LINUX_TYPES_H */
 |