	mm/page_alloc: replace local_lock with normal spinlock
struct per_cpu_pages is no longer strictly local as PCP lists can be drained remotely using a lock for protection. While the use of local_lock works, it goes against the intent of local_lock which is for "pure CPU local concurrency control mechanisms and not suited for inter-CPU concurrency control" (Documentation/locking/locktypes.rst).

local_lock protects against migration between when the percpu pointer is accessed and the pcp->lock acquired. The lock acquisition is a preemption point so in the worst case, a task could migrate to another NUMA node and accidentally allocate remote memory. The main requirement is to pin the task to a CPU that is suitable for PREEMPT_RT and !PREEMPT_RT.

Replace local_lock with helpers that pin a task to a CPU, look up the per-cpu structure and acquire the embedded lock. It's similar to local_lock without breaking the intent behind the API. It is not a complete API as only the parts needed for PCP-alloc are implemented but, in theory, the generic helpers could be promoted to a general API if there was demand for an embedded lock within a per-cpu struct with a guarantee that the per-cpu structure locked matches the running CPU and cannot use get_cpu_var due to RT concerns. PCP requires these semantics to avoid accidentally allocating remote memory.

[mgorman@techsingularity.net: use pcp_spin_trylock_irqsave instead of pcpu_spin_trylock_irqsave]
  Link: https://lkml.kernel.org/r/20220627084645.GA27531@techsingularity.net
Link: https://lkml.kernel.org/r/20220624125423.6126-8-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Tested-by: Yu Zhao <yuzhao@google.com>
Reviewed-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
Tested-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Tested-by: Yu Zhao <yuzhao@google.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
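To make the new locking pattern concrete, below is a condensed caller-side sketch boiled down from the rmqueue_pcplist() hunk in the diff that follows. The function name pcp_alloc_sketch and the trimmed-down body are illustrative only (the UP-specific pcp_trylock_prepare()/pcp_trylock_finish() handling, free_factor adjustment and statistics are omitted); the pcp_spin_*() helpers it uses are the ones added by this patch.

/*
 * Illustrative sketch only (hypothetical pcp_alloc_sketch(), simplified from
 * rmqueue_pcplist() in the diff below). pcp_spin_trylock_irqsave() pins the
 * task (preempt_disable() on !PREEMPT_RT, migrate_disable() on PREEMPT_RT),
 * looks up this CPU's per_cpu_pages and takes the embedded spinlock, so the
 * structure that ends up locked is guaranteed to belong to the running CPU.
 */
static struct page *pcp_alloc_sketch(struct zone *zone, unsigned int order,
				     int migratetype, unsigned int alloc_flags)
{
	struct per_cpu_pages *pcp;
	struct list_head *list;
	struct page *page;
	unsigned long flags;

	pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags);
	if (!pcp)
		return NULL;	/* e.g. a parallel drain holds pcp->lock */

	list = &pcp->lists[order_to_pindex(migratetype, order)];
	page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list);

	/* Drops pcp->lock, restores IRQs and unpins the task. */
	pcp_spin_unlock_irqrestore(pcp, flags);
	return page;
}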
commit 01b44456a7
parent 443c2accd1

1 changed file with 95 additions and 45 deletions: mm/page_alloc.c
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -126,13 +126,6 @@ typedef int __bitwise fpi_t;
 static DEFINE_MUTEX(pcp_batch_high_lock);
 #define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
 
-struct pagesets {
-	local_lock_t lock;
-};
-static DEFINE_PER_CPU(struct pagesets, pagesets) = {
-	.lock = INIT_LOCAL_LOCK(lock),
-};
-
 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
 /*
  * On SMP, spin_trylock is sufficient protection.
@@ -147,6 +140,83 @@ static DEFINE_PER_CPU(struct pagesets, pagesets) = {
 #define pcp_trylock_finish(flags)	local_irq_restore(flags)
 #endif
 
+/*
+ * Locking a pcp requires a PCP lookup followed by a spinlock. To avoid
+ * a migration causing the wrong PCP to be locked and remote memory being
+ * potentially allocated, pin the task to the CPU for the lookup+lock.
+ * preempt_disable is used on !RT because it is faster than migrate_disable.
+ * migrate_disable is used on RT because otherwise RT spinlock usage is
+ * interfered with and a high priority task cannot preempt the allocator.
+ */
+#ifndef CONFIG_PREEMPT_RT
+#define pcpu_task_pin()		preempt_disable()
+#define pcpu_task_unpin()	preempt_enable()
+#else
+#define pcpu_task_pin()		migrate_disable()
+#define pcpu_task_unpin()	migrate_enable()
+#endif
+
+/*
+ * Generic helper to lookup and a per-cpu variable with an embedded spinlock.
+ * Return value should be used with equivalent unlock helper.
+ */
+#define pcpu_spin_lock(type, member, ptr)				\
+({									\
+	type *_ret;							\
+	pcpu_task_pin();						\
+	_ret = this_cpu_ptr(ptr);					\
+	spin_lock(&_ret->member);					\
+	_ret;								\
+})
+
+#define pcpu_spin_lock_irqsave(type, member, ptr, flags)		\
+({									\
+	type *_ret;							\
+	pcpu_task_pin();						\
+	_ret = this_cpu_ptr(ptr);					\
+	spin_lock_irqsave(&_ret->member, flags);			\
+	_ret;								\
+})
+
+#define pcpu_spin_trylock_irqsave(type, member, ptr, flags)		\
+({									\
+	type *_ret;							\
+	pcpu_task_pin();						\
+	_ret = this_cpu_ptr(ptr);					\
+	if (!spin_trylock_irqsave(&_ret->member, flags)) {		\
+		pcpu_task_unpin();					\
+		_ret = NULL;						\
+	}								\
+	_ret;								\
+})
+
+#define pcpu_spin_unlock(member, ptr)					\
+({									\
+	spin_unlock(&ptr->member);					\
+	pcpu_task_unpin();						\
+})
+
+#define pcpu_spin_unlock_irqrestore(member, ptr, flags)			\
+({									\
+	spin_unlock_irqrestore(&ptr->member, flags);			\
+	pcpu_task_unpin();						\
+})
+
+/* struct per_cpu_pages specific helpers. */
+#define pcp_spin_lock(ptr)						\
+	pcpu_spin_lock(struct per_cpu_pages, lock, ptr)
+
+#define pcp_spin_lock_irqsave(ptr, flags)				\
+	pcpu_spin_lock_irqsave(struct per_cpu_pages, lock, ptr, flags)
+
+#define pcp_spin_trylock_irqsave(ptr, flags)				\
+	pcpu_spin_trylock_irqsave(struct per_cpu_pages, lock, ptr, flags)
+
+#define pcp_spin_unlock(ptr)						\
+	pcpu_spin_unlock(lock, ptr)
+
+#define pcp_spin_unlock_irqrestore(ptr, flags)				\
+	pcpu_spin_unlock_irqrestore(lock, ptr, flags)
 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
 DEFINE_PER_CPU(int, numa_node);
 EXPORT_PER_CPU_SYMBOL(numa_node);
@@ -1485,10 +1555,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 	/* Ensure requested pindex is drained first. */
 	pindex = pindex - 1;
 
-	/*
-	 * local_lock_irq held so equivalent to spin_lock_irqsave for
-	 * both PREEMPT_RT and non-PREEMPT_RT configurations.
-	 */
+	/* Caller must hold IRQ-safe pcp->lock so IRQs are disabled. */
 	spin_lock(&zone->lock);
 	isolated_pageblocks = has_isolate_pageblock(zone);
 
@@ -3056,10 +3123,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 {
 	int i, allocated = 0;
 
-	/*
-	 * local_lock_irq held so equivalent to spin_lock_irqsave for
-	 * both PREEMPT_RT and non-PREEMPT_RT configurations.
-	 */
+	/* Caller must hold IRQ-safe pcp->lock so IRQs are disabled. */
 	spin_lock(&zone->lock);
 	for (i = 0; i < count; ++i) {
 		struct page *page = __rmqueue(zone, order, migratetype,
@@ -3431,18 +3495,16 @@ void free_unref_page(struct page *page, unsigned int order)
 		migratetype = MIGRATE_MOVABLE;
 	}
 
-	local_lock_irqsave(&pagesets.lock, flags);
 	zone = page_zone(page);
 	pcp_trylock_prepare(UP_flags);
-	pcp = this_cpu_ptr(zone->per_cpu_pageset);
-	if (spin_trylock(&pcp->lock)) {
+	pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags);
+	if (pcp) {
 		free_unref_page_commit(zone, pcp, page, migratetype, order);
-		spin_unlock(&pcp->lock);
+		pcp_spin_unlock_irqrestore(pcp, flags);
 	} else {
 		free_one_page(zone, page, pfn, order, migratetype, FPI_NONE);
 	}
 	pcp_trylock_finish(UP_flags);
-	local_unlock_irqrestore(&pagesets.lock, flags);
 }
 
 /*
@@ -3477,17 +3539,16 @@ void free_unref_page_list(struct list_head *list)
 		}
 	}
 
-	local_lock_irqsave(&pagesets.lock, flags);
 	list_for_each_entry_safe(page, next, list, lru) {
 		struct zone *zone = page_zone(page);
 
 		/* Different zone, different pcp lock. */
 		if (zone != locked_zone) {
 			if (pcp)
-				spin_unlock(&pcp->lock);
+				pcp_spin_unlock_irqrestore(pcp, flags);
+
 			locked_zone = zone;
-			pcp = this_cpu_ptr(zone->per_cpu_pageset);
-			spin_lock(&pcp->lock);
+			pcp = pcp_spin_lock_irqsave(locked_zone->per_cpu_pageset, flags);
 		}
 
 		/*
@@ -3506,18 +3567,14 @@ void free_unref_page_list(struct list_head *list)
 		 * a large list of pages to free.
 		 */
 		if (++batch_count == SWAP_CLUSTER_MAX) {
-			spin_unlock(&pcp->lock);
-			local_unlock_irqrestore(&pagesets.lock, flags);
+			pcp_spin_unlock_irqrestore(pcp, flags);
 			batch_count = 0;
-			local_lock_irqsave(&pagesets.lock, flags);
-			pcp = this_cpu_ptr(locked_zone->per_cpu_pageset);
-			spin_lock(&pcp->lock);
+			pcp = pcp_spin_lock_irqsave(locked_zone->per_cpu_pageset, flags);
 		}
 	}
 
 	if (pcp)
-		spin_unlock(&pcp->lock);
-	local_unlock_irqrestore(&pagesets.lock, flags);
+		pcp_spin_unlock_irqrestore(pcp, flags);
 }
 
 /*
@@ -3732,17 +3789,14 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
 	unsigned long flags;
 	unsigned long __maybe_unused UP_flags;
 
-	local_lock_irqsave(&pagesets.lock, flags);
-
 	/*
 	 * spin_trylock may fail due to a parallel drain. In the future, the
 	 * trylock will also protect against IRQ reentrancy.
 	 */
-	pcp = this_cpu_ptr(zone->per_cpu_pageset);
 	pcp_trylock_prepare(UP_flags);
-	if (!spin_trylock(&pcp->lock)) {
+	pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags);
+	if (!pcp) {
 		pcp_trylock_finish(UP_flags);
-		local_unlock_irqrestore(&pagesets.lock, flags);
 		return NULL;
 	}
 
@@ -3754,9 +3808,8 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
 	pcp->free_factor >>= 1;
 	list = &pcp->lists[order_to_pindex(migratetype, order)];
 	page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list);
-	spin_unlock(&pcp->lock);
+	pcp_spin_unlock_irqrestore(pcp, flags);
 	pcp_trylock_finish(UP_flags);
-	local_unlock_irqrestore(&pagesets.lock, flags);
 	if (page) {
 		__count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
 		zone_statistics(preferred_zone, zone, 1);
@@ -5358,10 +5411,9 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 		goto failed;
 
 	/* Is a parallel drain in progress? */
-	local_lock_irqsave(&pagesets.lock, flags);
 	pcp_trylock_prepare(UP_flags);
-	pcp = this_cpu_ptr(zone->per_cpu_pageset);
-	if (!spin_trylock(&pcp->lock))
+	pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags);
+	if (!pcp)
 		goto failed_irq;
 
 	/* Attempt the batch allocation */
@@ -5379,7 +5431,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 		if (unlikely(!page)) {
 			/* Try and allocate at least one page */
 			if (!nr_account) {
-				spin_unlock(&pcp->lock);
+				pcp_spin_unlock_irqrestore(pcp, flags);
 				goto failed_irq;
 			}
 			break;
@@ -5394,9 +5446,8 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 		nr_populated++;
 	}
 
-	spin_unlock(&pcp->lock);
+	pcp_spin_unlock_irqrestore(pcp, flags);
 	pcp_trylock_finish(UP_flags);
-	local_unlock_irqrestore(&pagesets.lock, flags);
 
 	__count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
 	zone_statistics(ac.preferred_zoneref->zone, zone, nr_account);
@@ -5406,7 +5457,6 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 
 failed_irq:
 	pcp_trylock_finish(UP_flags);
-	local_unlock_irqrestore(&pagesets.lock, flags);
 
 failed:
 	page = __alloc_pages(gfp, 0, preferred_nid, nodemask);