mm: vmalloc: remove a global vmap_blocks xarray

A global vmap_blocks xarray can be contended under heavy usage of the
vm_map_ram()/vm_unmap_ram() APIs. The lock_stat shows that the
"vmap_blocks.xa_lock" is second in the list of top contended locks:

<snip>
----------------------------------------
class name              con-bounces    contentions ...
----------------------------------------
vmap_area_lock:             2554079        2554276 ...
  --------------
  vmap_area_lock            1297948  [<00000000dd41cbaa>] alloc_vmap_area+0x1c7/0x910
  vmap_area_lock            1256330  [<000000009d927bf3>] free_vmap_block+0x4a/0xe0
  vmap_area_lock                  1  [<00000000c95c05a7>] find_vm_area+0x16/0x70
  --------------
  vmap_area_lock            1738590  [<00000000dd41cbaa>] alloc_vmap_area+0x1c7/0x910
  vmap_area_lock             815688  [<000000009d927bf3>] free_vmap_block+0x4a/0xe0
  vmap_area_lock                  1  [<00000000c1d619d7>] __get_vm_area_node+0xd2/0x170

vmap_blocks.xa_lock:         862689         862698 ...
  -------------------
  vmap_blocks.xa_lock        378418  [<00000000625a5626>] vm_map_ram+0x359/0x4a0
  vmap_blocks.xa_lock        484280  [<00000000caa2ef03>] xa_erase+0xe/0x30
  -------------------
  vmap_blocks.xa_lock        576226  [<00000000caa2ef03>] xa_erase+0xe/0x30
  vmap_blocks.xa_lock        286472  [<00000000625a5626>] vm_map_ram+0x359/0x4a0
...
<snip>

that is a result of running vm_map_ram()/vm_unmap_ram() in a loop. The
test creates 64 threads (on a 64-CPU system) and each one maps/unmaps
one page.

After this change the "xa_lock" can be considered as noise under the
same test conditions:

<snip>
...
&xa->xa_lock#1:               10333          10394 ...
    --------------
    &xa->xa_lock#1             5349  [<00000000bbbc9751>] xa_erase+0xe/0x30
    &xa->xa_lock#1             5045  [<0000000018def45d>] vm_map_ram+0x3a4/0x4f0
    --------------
    &xa->xa_lock#1             7326  [<0000000018def45d>] vm_map_ram+0x3a4/0x4f0
    &xa->xa_lock#1             3068  [<00000000bbbc9751>] xa_erase+0xe/0x30
...
<snip>

Running "test_vmalloc.sh run_test_mask=1024 nr_threads=64 nr_pages=5"
shows around an 8 percent throughput improvement for the vm_map_ram()
and vm_unmap_ram() APIs.

This patch does not address the vmap_area_lock/free_vmap_area_lock and
purge_vmap_area_lock bottlenecks; that is a separate rework.

Link: https://lkml.kernel.org/r/20230330190639.431589-1-urezki@gmail.com
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Reviewed-by: Lorenzo Stoakes <lstoakes@gmail.com>
Reviewed-by: Baoquan He <bhe@redhat.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Oleksiy Avramchenko <oleksiy.avramchenko@sony.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
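For illustration, a minimal sketch of what each test thread does per
iteration; this is a hedged sketch, not the actual test (the real stress
test lives in lib/test_vmalloc.c), and the helper name
map_unmap_one_page() is made up:

/*
 * Sketch: one iteration of the per-thread map/unmap loop described
 * above. Allocate a page, map it with vm_map_ram(), then unmap it;
 * this exercises the vmap_blocks lookup and erase paths shown in the
 * lock_stat output.
 */
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/numa.h>
#include <linux/vmalloc.h>

static int map_unmap_one_page(void)
{
	struct page *page;
	void *va;

	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	/* Map a single page; NUMA_NO_NODE lets the core pick a node. */
	va = vm_map_ram(&page, 1, NUMA_NO_NODE);
	if (!va) {
		__free_page(page);
		return -ENOMEM;
	}

	/*
	 * Unmap looks up the owning vmap_block; a fully drained block
	 * is later erased from vmap_blocks - the xa_erase() seen above.
	 */
	vm_unmap_ram(va, 1);
	__free_page(page);
	return 0;
}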
parent 62f31bd4dc
commit 062eacf57a
 mm/vmalloc.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 63 insertions(+), 8 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1915,6 +1915,13 @@ static struct vmap_area *find_unlink_vmap_area(unsigned long addr)
 struct vmap_block_queue {
 	spinlock_t lock;
 	struct list_head free;
+
+	/*
+	 * An xarray requires an extra memory dynamically to
+	 * be allocated. If it is an issue, we can use rb-tree
+	 * instead.
+	 */
+	struct xarray vmap_blocks;
 };
 
 struct vmap_block {
@@ -1932,11 +1939,48 @@ struct vmap_block {
 static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
 
 /*
- * XArray of vmap blocks, indexed by address, to quickly find a vmap block
- * in the free path. Could get rid of this if we change the API to return a
- * "cookie" from alloc, to be passed to free. But no big deal yet.
+ * In order to fast access to any "vmap_block" associated with a
+ * specific address, we use a hash.
+ *
+ * A per-cpu vmap_block_queue is used in both ways, to serialize
+ * an access to free block chains among CPUs(alloc path) and it
+ * also acts as a vmap_block hash(alloc/free paths). It means we
+ * overload it, since we already have the per-cpu array which is
+ * used as a hash table. When used as a hash a 'cpu' passed to
+ * per_cpu() is not actually a CPU but rather a hash index.
+ *
+ * A hash function is addr_to_vb_xarray() which hashes any address
+ * to a specific index(in a hash) it belongs to. This then uses a
+ * per_cpu() macro to access an array with generated index.
+ *
+ * An example:
+ *
+ *  CPU_1  CPU_2  CPU_0
+ *    |      |      |
+ *    V      V      V
+ * 0     10     20     30     40     50     60
+ * |------|------|------|------|------|------|...<vmap address space>
+ *   CPU0   CPU1   CPU2   CPU0   CPU1   CPU2
+ *
+ * - CPU_1 invokes vm_unmap_ram(6), 6 belongs to CPU0 zone, thus
+ *   it access: CPU0/INDEX0 -> vmap_blocks -> xa_lock;
+ *
+ * - CPU_2 invokes vm_unmap_ram(11), 11 belongs to CPU1 zone, thus
+ *   it access: CPU1/INDEX1 -> vmap_blocks -> xa_lock;
+ *
+ * - CPU_0 invokes vm_unmap_ram(20), 20 belongs to CPU2 zone, thus
+ *   it access: CPU2/INDEX2 -> vmap_blocks -> xa_lock.
+ *
+ * This technique almost always avoids lock contention on insert/remove,
+ * however xarray spinlocks protect against any contention that remains.
  */
-static DEFINE_XARRAY(vmap_blocks);
+static struct xarray *
+addr_to_vb_xarray(unsigned long addr)
+{
+	int index = (addr / VMAP_BLOCK_SIZE) % num_possible_cpus();
+
+	return &per_cpu(vmap_block_queue, index).vmap_blocks;
+}
 
 /*
  * We should probably have a fallback mechanism to allocate virtual memory
@@ -1974,6 +2018,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
 	struct vmap_block_queue *vbq;
 	struct vmap_block *vb;
 	struct vmap_area *va;
+	struct xarray *xa;
 	unsigned long vb_idx;
 	int node, err;
 	void *vaddr;
@@ -2007,8 +2052,9 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
 	bitmap_set(vb->used_map, 0, (1UL << order));
 	INIT_LIST_HEAD(&vb->free_list);
 
+	xa = addr_to_vb_xarray(va->va_start);
 	vb_idx = addr_to_vb_idx(va->va_start);
-	err = xa_insert(&vmap_blocks, vb_idx, vb, gfp_mask);
+	err = xa_insert(xa, vb_idx, vb, gfp_mask);
 	if (err) {
 		kfree(vb);
 		free_vmap_area(va);
@@ -2026,8 +2072,10 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
 static void free_vmap_block(struct vmap_block *vb)
 {
 	struct vmap_block *tmp;
+	struct xarray *xa;
 
-	tmp = xa_erase(&vmap_blocks, addr_to_vb_idx(vb->va->va_start));
+	xa = addr_to_vb_xarray(vb->va->va_start);
+	tmp = xa_erase(xa, addr_to_vb_idx(vb->va->va_start));
 	BUG_ON(tmp != vb);
 
 	spin_lock(&vmap_area_lock);
@@ -2139,6 +2187,7 @@ static void vb_free(unsigned long addr, unsigned long size)
 	unsigned long offset;
 	unsigned int order;
 	struct vmap_block *vb;
+	struct xarray *xa;
 
 	BUG_ON(offset_in_page(size));
 	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
@@ -2147,7 +2196,10 @@ static void vb_free(unsigned long addr, unsigned long size)
 
 	order = get_order(size);
 	offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
-	vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr));
+
+	xa = addr_to_vb_xarray(addr);
+	vb = xa_load(xa, addr_to_vb_idx(addr));
+
 	spin_lock(&vb->lock);
 	bitmap_clear(vb->used_map, offset, (1UL << order));
 	spin_unlock(&vb->lock);
@@ -3525,6 +3577,7 @@ static size_t vmap_ram_vread_iter(struct iov_iter *iter, const char *addr,
 {
 	char *start;
 	struct vmap_block *vb;
+	struct xarray *xa;
 	unsigned long offset;
 	unsigned int rs, re;
 	size_t remains, n;
@@ -3543,7 +3596,8 @@ static size_t vmap_ram_vread_iter(struct iov_iter *iter, const char *addr,
 	 * Area is split into regions and tracked with vmap_block, read out
 	 * each region and zero fill the hole between regions.
 	 */
-	vb = xa_load(&vmap_blocks, addr_to_vb_idx((unsigned long)addr));
+	xa = addr_to_vb_xarray((unsigned long) addr);
+	vb = xa_load(xa, addr_to_vb_idx((unsigned long)addr));
 	if (!vb)
 		goto finished_zero;
 
@@ -4337,6 +4391,7 @@ void __init vmalloc_init(void)
 		p = &per_cpu(vfree_deferred, i);
 		init_llist_head(&p->list);
 		INIT_WORK(&p->wq, delayed_vfree_work);
+		xa_init(&vbq->vmap_blocks);
 	}
 
 	/* Import existing vmlist entries. */
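To make the hashing scheme the new comment describes concrete outside
the kernel, here is a minimal user-space sketch of the same idea; all
names are hypothetical, with NR_BUCKETS and BLOCK_SIZE standing in for
num_possible_cpus() and VMAP_BLOCK_SIZE, and struct bucket standing in
for the per-cpu vmap_block_queue with its embedded xarray:

#include <pthread.h>
#include <stdint.h>

#define NR_BUCKETS	64		/* stand-in for num_possible_cpus() */
#define BLOCK_SIZE	(64 * 4096UL)	/* stand-in for VMAP_BLOCK_SIZE */

/* Each bucket owns its own lock, like each per-cpu xarray owns xa_lock. */
struct bucket {
	pthread_mutex_t lock;
	/* a per-bucket lookup structure (tree, hash, ...) would live here */
};

static struct bucket buckets[NR_BUCKETS];

static void buckets_init(void)
{
	int i;

	for (i = 0; i < NR_BUCKETS; i++)
		pthread_mutex_init(&buckets[i].lock, NULL);
}

/* Same shape as addr_to_vb_xarray(): block number modulo bucket count. */
static struct bucket *addr_to_bucket(uintptr_t addr)
{
	return &buckets[(addr / BLOCK_SIZE) % NR_BUCKETS];
}

Two addresses that fall into different blocks usually map to different
buckets, so concurrent inserts and removals rarely take the same lock;
that is the effect behind the drop in xa_lock contention shown above.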