mirror of
				https://github.com/torvalds/linux.git
				synced 2025-10-31 08:38:45 +02:00 
			
		
		
		
	 3a6358c0db
			
		
	
	
		3a6358c0db
		
	
	
	
	
		
			
			When running the UnixBench/Execl throughput case, false sharing is observed
due to frequent reads of base_addr and writes to free_bytes and chunk_md.
UnixBench/Execl represents a class of workload where bash scripts are
spawned frequently to do some short jobs.  It issues the execl system call
frequently, and execl calls mm_init to initialize the mm_struct of the
process.  mm_init calls __percpu_counter_init to initialize the
percpu_counters.  pcpu_alloc is then called and reads the base_addr of
pcpu_chunk for memory allocation.  Inside pcpu_alloc, pcpu_alloc_area is
called to allocate memory from a specified chunk.  This function
updates "free_bytes" and "chunk_md" to record the remaining free bytes and
other metadata for this chunk.  Correspondingly, pcpu_free_area also
updates these two members when freeing memory.
Call trace from perf is as below:
+   57.15%  0.01%  execl   [kernel.kallsyms] [k] __percpu_counter_init
+   57.13%  0.91%  execl   [kernel.kallsyms] [k] pcpu_alloc
-   55.27% 54.51%  execl   [kernel.kallsyms] [k] osq_lock
   - 53.54% 0x654278696e552f34
        main
        __execve
        entry_SYSCALL_64_after_hwframe
        do_syscall_64
        __x64_sys_execve
        do_execveat_common.isra.47
        alloc_bprm
        mm_init
        __percpu_counter_init
        pcpu_alloc
      - __mutex_lock.isra.17
In the current pcpu_chunk layout, `base_addr' is in the same cache line as
`free_bytes' and `chunk_md', and `base_addr' occupies the last 8 bytes.  This
patch moves `bound_map' up, ahead of `base_addr', so that `base_addr' lands
in a new cacheline.
With this change, on Intel Sapphire Rapids 112C/224T platform, based on
v6.4-rc4, the 160 parallel score improves by 24%.
The pcpu_chunk struct is a backing data structure per chunk, so the
additional memory should not be dramatic.  A chunk covers roughly
between 64KB and 512KB of memory, depending on configuration and boot-time
parameters, so I believe the additional memory used here is nominal at best.
Working the #s on my desktop:
Percpu:            58624 kB
28 cores -> ~2.1MB of percpu memory.
At say ~128KB per chunk -> 33 chunks, generously 40 chunks.
Adding alignment might bump the chunk size ~64 bytes, so in total ~2KB
of overhead?
I believe we can do a little better to avoid eating that full padding,
so likely less than that.
[dennis@kernel.org: changelog details]
Link: https://lkml.kernel.org/r/20230610030730.110074-1-yu.ma@intel.com
Signed-off-by: Yu Ma <yu.ma@intel.com>
Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com>
Acked-by: Dennis Zhou <dennis@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Shakeel Butt <shakeelb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
		
	
			
		
			
				
	
	
		
			266 lines
		
	
	
	
		
			7.3 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			266 lines
		
	
	
	
		
			7.3 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /* SPDX-License-Identifier: GPL-2.0 */
 | |
| #ifndef _MM_PERCPU_INTERNAL_H
 | |
| #define _MM_PERCPU_INTERNAL_H
 | |
| 
 | |
| #include <linux/types.h>
 | |
| #include <linux/percpu.h>
 | |
| #include <linux/memcontrol.h>
 | |
| 
 | |
/*
 * struct pcpu_block_md - metadata for one block of a chunk's bitmap.
 *
 * A chunk's bitmap is divided into a number of full blocks, and every
 * quantity stored here is expressed in bits.
 *
 * The scan hint is the largest contiguous area known to lie before the
 * contig hint; it is not necessarily the true largest contig hint.
 * Invariant: scan_hint_start > contig_hint_start iff
 * scan_hint == contig_hint.  The invariant is required because a forward
 * scan cannot tell whether a new contig hint would beat the current one.
 */
struct pcpu_block_md {
	/* scan hint for the block */
	int			scan_hint;
	/* block-relative starting position of the scan hint */
	int			scan_hint_start;
	/* contig hint for the block */
	int			contig_hint;
	/* block-relative starting position of the contig hint */
	int			contig_hint_start;
	/* size of the free space along the left side of the block */
	int			left_free;
	/* size of the free space along the right side of the block */
	int			right_free;
	/* block position of the first free bit */
	int			first_free;
	/* total number of bits this metadata is responsible for */
	int			nr_bits;
};
 | |
| 
 | |
| struct pcpu_chunk {
 | |
| #ifdef CONFIG_PERCPU_STATS
 | |
| 	int			nr_alloc;	/* # of allocations */
 | |
| 	size_t			max_alloc_size; /* largest allocation size */
 | |
| #endif
 | |
| 
 | |
| 	struct list_head	list;		/* linked to pcpu_slot lists */
 | |
| 	int			free_bytes;	/* free bytes in the chunk */
 | |
| 	struct pcpu_block_md	chunk_md;
 | |
| 	unsigned long		*bound_map;	/* boundary map */
 | |
| 
 | |
| 	/*
 | |
| 	 * base_addr is the base address of this chunk.
 | |
| 	 * To reduce false sharing, current layout is optimized to make sure
 | |
| 	 * base_addr locate in the different cacheline with free_bytes and
 | |
| 	 * chunk_md.
 | |
| 	 */
 | |
| 	void			*base_addr ____cacheline_aligned_in_smp;
 | |
| 
 | |
| 	unsigned long		*alloc_map;	/* allocation map */
 | |
| 	struct pcpu_block_md	*md_blocks;	/* metadata blocks */
 | |
| 
 | |
| 	void			*data;		/* chunk data */
 | |
| 	bool			immutable;	/* no [de]population allowed */
 | |
| 	bool			isolated;	/* isolated from active chunk
 | |
| 						   slots */
 | |
| 	int			start_offset;	/* the overlap with the previous
 | |
| 						   region to have a page aligned
 | |
| 						   base_addr */
 | |
| 	int			end_offset;	/* additional area required to
 | |
| 						   have the region end page
 | |
| 						   aligned */
 | |
| #ifdef CONFIG_MEMCG_KMEM
 | |
| 	struct obj_cgroup	**obj_cgroups;	/* vector of object cgroups */
 | |
| #endif
 | |
| 
 | |
| 	int			nr_pages;	/* # of pages served by this chunk */
 | |
| 	int			nr_populated;	/* # of populated pages */
 | |
| 	int                     nr_empty_pop_pages; /* # of empty populated pages */
 | |
| 	unsigned long		populated[];	/* populated bitmap */
 | |
| };
 | |
| 
 | |
| extern spinlock_t pcpu_lock;
 | |
| 
 | |
| extern struct list_head *pcpu_chunk_lists;
 | |
| extern int pcpu_nr_slots;
 | |
| extern int pcpu_sidelined_slot;
 | |
| extern int pcpu_to_depopulate_slot;
 | |
| extern int pcpu_nr_empty_pop_pages;
 | |
| 
 | |
| extern struct pcpu_chunk *pcpu_first_chunk;
 | |
| extern struct pcpu_chunk *pcpu_reserved_chunk;
 | |
| 
 | |
| /**
 | |
|  * pcpu_chunk_nr_blocks - converts nr_pages to # of md_blocks
 | |
|  * @chunk: chunk of interest
 | |
|  *
 | |
|  * This conversion is from the number of physical pages that the chunk
 | |
|  * serves to the number of bitmap blocks used.
 | |
|  */
 | |
| static inline int pcpu_chunk_nr_blocks(struct pcpu_chunk *chunk)
 | |
| {
 | |
| 	return chunk->nr_pages * PAGE_SIZE / PCPU_BITMAP_BLOCK_SIZE;
 | |
| }
 | |
| 
 | |
| /**
 | |
|  * pcpu_nr_pages_to_map_bits - converts the pages to size of bitmap
 | |
|  * @pages: number of physical pages
 | |
|  *
 | |
|  * This conversion is from physical pages to the number of bits
 | |
|  * required in the bitmap.
 | |
|  */
 | |
| static inline int pcpu_nr_pages_to_map_bits(int pages)
 | |
| {
 | |
| 	return pages * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
 | |
| }
 | |
| 
 | |
| /**
 | |
|  * pcpu_chunk_map_bits - helper to convert nr_pages to size of bitmap
 | |
|  * @chunk: chunk of interest
 | |
|  *
 | |
|  * This conversion is from the number of physical pages that the chunk
 | |
|  * serves to the number of bits in the bitmap.
 | |
|  */
 | |
| static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk)
 | |
| {
 | |
| 	return pcpu_nr_pages_to_map_bits(chunk->nr_pages);
 | |
| }
 | |
| 
 | |
/**
 * pcpu_obj_full_size - full size charged for one accounted object
 * @size: size of area to allocate in bytes
 *
 * Each accounted object needs @size bytes for every possible CPU.  When
 * kmemcg is not disabled, extra space is also used to store the
 * obj_cgroup membership, and that space is charged as well.
 */
static inline size_t pcpu_obj_full_size(size_t size)
{
	size_t total = size * num_possible_cpus();

#ifdef CONFIG_MEMCG_KMEM
	/* one obj_cgroup pointer per PCPU_MIN_ALLOC_SIZE unit of @size */
	if (!mem_cgroup_kmem_disabled())
		total += size / PCPU_MIN_ALLOC_SIZE * sizeof(struct obj_cgroup *);
#endif

	return total;
}
 | |
| 
 | |
| #ifdef CONFIG_PERCPU_STATS
 | |
| 
 | |
| #include <linux/spinlock.h>
 | |
| 
 | |
| struct percpu_stats {
 | |
| 	u64 nr_alloc;		/* lifetime # of allocations */
 | |
| 	u64 nr_dealloc;		/* lifetime # of deallocations */
 | |
| 	u64 nr_cur_alloc;	/* current # of allocations */
 | |
| 	u64 nr_max_alloc;	/* max # of live allocations */
 | |
| 	u32 nr_chunks;		/* current # of live chunks */
 | |
| 	u32 nr_max_chunks;	/* max # of live chunks */
 | |
| 	size_t min_alloc_size;	/* min allocation size */
 | |
| 	size_t max_alloc_size;	/* max allocation size */
 | |
| };
 | |
| 
 | |
/* global statistics instance updated by the pcpu_stats_*() helpers */
extern struct percpu_stats pcpu_stats;
/* copy of the allocation info saved by pcpu_stats_save_ai() */
extern struct pcpu_alloc_info pcpu_stats_ai;
 | |
| 
 | |
| /*
 | |
|  * For debug purposes. We don't care about the flexible array.
 | |
|  */
 | |
| static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai)
 | |
| {
 | |
| 	memcpy(&pcpu_stats_ai, ai, sizeof(struct pcpu_alloc_info));
 | |
| 
 | |
| 	/* initialize min_alloc_size to unit_size */
 | |
| 	pcpu_stats.min_alloc_size = pcpu_stats_ai.unit_size;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * pcpu_stats_area_alloc - increment area allocation stats
 | |
|  * @chunk: the location of the area being allocated
 | |
|  * @size: size of area to allocate in bytes
 | |
|  *
 | |
|  * CONTEXT:
 | |
|  * pcpu_lock.
 | |
|  */
 | |
| static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size)
 | |
| {
 | |
| 	lockdep_assert_held(&pcpu_lock);
 | |
| 
 | |
| 	pcpu_stats.nr_alloc++;
 | |
| 	pcpu_stats.nr_cur_alloc++;
 | |
| 	pcpu_stats.nr_max_alloc =
 | |
| 		max(pcpu_stats.nr_max_alloc, pcpu_stats.nr_cur_alloc);
 | |
| 	pcpu_stats.min_alloc_size =
 | |
| 		min(pcpu_stats.min_alloc_size, size);
 | |
| 	pcpu_stats.max_alloc_size =
 | |
| 		max(pcpu_stats.max_alloc_size, size);
 | |
| 
 | |
| 	chunk->nr_alloc++;
 | |
| 	chunk->max_alloc_size = max(chunk->max_alloc_size, size);
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * pcpu_stats_area_dealloc - decrement allocation stats
 | |
|  * @chunk: the location of the area being deallocated
 | |
|  *
 | |
|  * CONTEXT:
 | |
|  * pcpu_lock.
 | |
|  */
 | |
| static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk)
 | |
| {
 | |
| 	lockdep_assert_held(&pcpu_lock);
 | |
| 
 | |
| 	pcpu_stats.nr_dealloc++;
 | |
| 	pcpu_stats.nr_cur_alloc--;
 | |
| 
 | |
| 	chunk->nr_alloc--;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * pcpu_stats_chunk_alloc - increment chunk stats
 | |
|  */
 | |
| static inline void pcpu_stats_chunk_alloc(void)
 | |
| {
 | |
| 	unsigned long flags;
 | |
| 	spin_lock_irqsave(&pcpu_lock, flags);
 | |
| 
 | |
| 	pcpu_stats.nr_chunks++;
 | |
| 	pcpu_stats.nr_max_chunks =
 | |
| 		max(pcpu_stats.nr_max_chunks, pcpu_stats.nr_chunks);
 | |
| 
 | |
| 	spin_unlock_irqrestore(&pcpu_lock, flags);
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * pcpu_stats_chunk_dealloc - decrement chunk stats
 | |
|  */
 | |
| static inline void pcpu_stats_chunk_dealloc(void)
 | |
| {
 | |
| 	unsigned long flags;
 | |
| 	spin_lock_irqsave(&pcpu_lock, flags);
 | |
| 
 | |
| 	pcpu_stats.nr_chunks--;
 | |
| 
 | |
| 	spin_unlock_irqrestore(&pcpu_lock, flags);
 | |
| }
 | |
| 
 | |
| #else
 | |
| 
 | |
/*
 * CONFIG_PERCPU_STATS is not set: every stats hook is an empty inline
 * function, so call sites need no #ifdef of their own.
 */

/* no-op: allocation info is not saved */
static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai)
{
}

/* no-op: area allocations are not counted */
static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size)
{
}

/* no-op: area deallocations are not counted */
static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk)
{
}

/* no-op: chunk creation is not counted */
static inline void pcpu_stats_chunk_alloc(void)
{
}

/* no-op: chunk destruction is not counted */
static inline void pcpu_stats_chunk_dealloc(void)
{
}
 | |
| 
 | |
| #endif /* !CONFIG_PERCPU_STATS */
 | |
| 
 | |
| #endif
 |