mm/page_alloc: replace local_lock with normal spinlock

struct per_cpu_pages is no longer strictly local as PCP lists can be
drained remotely using a lock for protection. While the use of local_lock
works, it goes against the intent of local_lock which is for "pure CPU
local concurrency control mechanisms and not suited for inter-CPU
concurrency control" (Documentation/locking/locktypes.rst).

local_lock protects against migration between when the percpu pointer is
accessed and the pcp->lock acquired. The lock acquisition is a preemption
point so in the worst case, a task could migrate to another NUMA node and
accidentally allocate remote memory. The main requirement is to pin the
task to a CPU that is suitable for PREEMPT_RT and !PREEMPT_RT.

Replace local_lock with helpers that pin a task to a CPU, lookup the
per-cpu structure and acquire the embedded lock. It's similar to
local_lock without breaking the intent behind the API. It is not a
complete API as only the parts needed for PCP-alloc are implemented but
in theory, the generic helpers could be promoted to a general API if
there was demand for an embedded lock within a per-cpu struct with a
guarantee that the per-cpu structure locked matches the running CPU and
cannot use get_cpu_var due to RT concerns. PCP requires these semantics
to avoid accidentally allocating remote memory.

[mgorman@techsingularity.net: use pcp_spin_trylock_irqsave instead of pcpu_spin_trylock_irqsave]
Link: https://lkml.kernel.org/r/20220627084645.GA27531@techsingularity.net
Link: https://lkml.kernel.org/r/20220624125423.6126-8-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Tested-by: Yu Zhao <yuzhao@google.com>
Reviewed-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
Tested-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Tested-by: Yu Zhao <yuzhao@google.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
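For readers unfamiliar with the pattern, the sketch below is a minimal
userspace analogy of the pin + lookup + embedded-lock sequence the commit
message describes. It is not kernel code and none of its names exist in
the kernel tree: pthread CPU affinity stands in for pcpu_task_pin() and
pcpu_task_unpin(), sched_getcpu() plus an array index stands in for
this_cpu_ptr(), and a pthread spinlock plays the role of pcp->lock. Build
with "cc demo.c -pthread".

/*
 * Userspace analogy only, not kernel code: every identifier is invented
 * for illustration. The point is the ordering: pin first, then look up
 * the per-CPU structure, then take its embedded lock, so the structure
 * that gets locked is guaranteed to belong to the CPU the task runs on.
 */
#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

struct bucket {
	pthread_spinlock_t lock;	/* embedded lock, like pcp->lock */
	long count;
};

static struct bucket buckets[CPU_SETSIZE];	/* one bucket per CPU */

/* Pin to one CPU, then lock that CPU's bucket. */
static struct bucket *bucket_lock(cpu_set_t *saved_mask)
{
	cpu_set_t pin;
	int cpu;

	/* Remember the old affinity so it can be restored on unlock. */
	pthread_getaffinity_np(pthread_self(), sizeof(*saved_mask), saved_mask);

	/*
	 * Pin before using the result of the lookup. Even if the thread
	 * migrated between sched_getcpu() and the setaffinity call, it
	 * now runs on 'cpu', so buckets[cpu] is the local bucket.
	 */
	cpu = sched_getcpu();
	CPU_ZERO(&pin);
	CPU_SET(cpu, &pin);
	pthread_setaffinity_np(pthread_self(), sizeof(pin), &pin);

	pthread_spin_lock(&buckets[cpu].lock);
	return &buckets[cpu];
}

static void bucket_unlock(struct bucket *b, const cpu_set_t *saved_mask)
{
	pthread_spin_unlock(&b->lock);
	/* Drop the pin only after the lock is released. */
	pthread_setaffinity_np(pthread_self(), sizeof(*saved_mask), saved_mask);
}

int main(void)
{
	cpu_set_t saved_mask;
	struct bucket *b;

	for (int i = 0; i < CPU_SETSIZE; i++)
		pthread_spin_init(&buckets[i].lock, PTHREAD_PROCESS_PRIVATE);

	b = bucket_lock(&saved_mask);
	b->count++;
	bucket_unlock(b, &saved_mask);

	printf("local bucket count: %ld\n", b->count);
	return 0;
}

In the patch below, the same ordering is implemented by the pcpu_spin_*()
macros, with preempt_disable() (or migrate_disable() on PREEMPT_RT) doing
the pinning instead of an affinity mask.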
parent 443c2accd1
commit 01b44456a7
1 changed file with 95 additions and 45 deletions

mm/page_alloc.c (140 lines changed)
@@ -126,13 +126,6 @@ typedef int __bitwise fpi_t;
 static DEFINE_MUTEX(pcp_batch_high_lock);
 #define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
 
-struct pagesets {
-	local_lock_t lock;
-};
-static DEFINE_PER_CPU(struct pagesets, pagesets) = {
-	.lock = INIT_LOCAL_LOCK(lock),
-};
-
 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
 /*
  * On SMP, spin_trylock is sufficient protection.
@@ -147,6 +140,83 @@ static DEFINE_PER_CPU(struct pagesets, pagesets) = {
 #define pcp_trylock_finish(flags)	local_irq_restore(flags)
 #endif
 
+/*
+ * Locking a pcp requires a PCP lookup followed by a spinlock. To avoid
+ * a migration causing the wrong PCP to be locked and remote memory being
+ * potentially allocated, pin the task to the CPU for the lookup+lock.
+ * preempt_disable is used on !RT because it is faster than migrate_disable.
+ * migrate_disable is used on RT because otherwise RT spinlock usage is
+ * interfered with and a high priority task cannot preempt the allocator.
+ */
+#ifndef CONFIG_PREEMPT_RT
+#define pcpu_task_pin()		preempt_disable()
+#define pcpu_task_unpin()	preempt_enable()
+#else
+#define pcpu_task_pin()		migrate_disable()
+#define pcpu_task_unpin()	migrate_enable()
+#endif
+
+/*
+ * Generic helper to lookup and a per-cpu variable with an embedded spinlock.
+ * Return value should be used with equivalent unlock helper.
+ */
+#define pcpu_spin_lock(type, member, ptr)				\
+({									\
+	type *_ret;							\
+	pcpu_task_pin();						\
+	_ret = this_cpu_ptr(ptr);					\
+	spin_lock(&_ret->member);					\
+	_ret;								\
+})
+
+#define pcpu_spin_lock_irqsave(type, member, ptr, flags)		\
+({									\
+	type *_ret;							\
+	pcpu_task_pin();						\
+	_ret = this_cpu_ptr(ptr);					\
+	spin_lock_irqsave(&_ret->member, flags);			\
+	_ret;								\
+})
+
+#define pcpu_spin_trylock_irqsave(type, member, ptr, flags)		\
+({									\
+	type *_ret;							\
+	pcpu_task_pin();						\
+	_ret = this_cpu_ptr(ptr);					\
+	if (!spin_trylock_irqsave(&_ret->member, flags)) {		\
+		pcpu_task_unpin();					\
+		_ret = NULL;						\
+	}								\
+	_ret;								\
+})
+
+#define pcpu_spin_unlock(member, ptr)					\
+({									\
+	spin_unlock(&ptr->member);					\
+	pcpu_task_unpin();						\
+})
+
+#define pcpu_spin_unlock_irqrestore(member, ptr, flags)			\
+({									\
+	spin_unlock_irqrestore(&ptr->member, flags);			\
+	pcpu_task_unpin();						\
+})
+
+/* struct per_cpu_pages specific helpers. */
+#define pcp_spin_lock(ptr)						\
+	pcpu_spin_lock(struct per_cpu_pages, lock, ptr)
+
+#define pcp_spin_lock_irqsave(ptr, flags)				\
+	pcpu_spin_lock_irqsave(struct per_cpu_pages, lock, ptr, flags)
+
+#define pcp_spin_trylock_irqsave(ptr, flags)				\
+	pcpu_spin_trylock_irqsave(struct per_cpu_pages, lock, ptr, flags)
+
+#define pcp_spin_unlock(ptr)						\
+	pcpu_spin_unlock(lock, ptr)
+
+#define pcp_spin_unlock_irqrestore(ptr, flags)				\
+	pcpu_spin_unlock_irqrestore(lock, ptr, flags)
 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
 DEFINE_PER_CPU(int, numa_node);
 EXPORT_PER_CPU_SYMBOL(numa_node);
@@ -1485,10 +1555,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 	/* Ensure requested pindex is drained first. */
 	pindex = pindex - 1;
 
-	/*
-	 * local_lock_irq held so equivalent to spin_lock_irqsave for
-	 * both PREEMPT_RT and non-PREEMPT_RT configurations.
-	 */
+	/* Caller must hold IRQ-safe pcp->lock so IRQs are disabled. */
 	spin_lock(&zone->lock);
 	isolated_pageblocks = has_isolate_pageblock(zone);
 
@@ -3056,10 +3123,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 {
 	int i, allocated = 0;
 
-	/*
-	 * local_lock_irq held so equivalent to spin_lock_irqsave for
-	 * both PREEMPT_RT and non-PREEMPT_RT configurations.
-	 */
+	/* Caller must hold IRQ-safe pcp->lock so IRQs are disabled. */
 	spin_lock(&zone->lock);
 	for (i = 0; i < count; ++i) {
 		struct page *page = __rmqueue(zone, order, migratetype,
@@ -3431,18 +3495,16 @@ void free_unref_page(struct page *page, unsigned int order)
 		migratetype = MIGRATE_MOVABLE;
 	}
 
-	local_lock_irqsave(&pagesets.lock, flags);
 	zone = page_zone(page);
 	pcp_trylock_prepare(UP_flags);
-	pcp = this_cpu_ptr(zone->per_cpu_pageset);
-	if (spin_trylock(&pcp->lock)) {
+	pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags);
+	if (pcp) {
 		free_unref_page_commit(zone, pcp, page, migratetype, order);
-		spin_unlock(&pcp->lock);
+		pcp_spin_unlock_irqrestore(pcp, flags);
 	} else {
 		free_one_page(zone, page, pfn, order, migratetype, FPI_NONE);
 	}
 	pcp_trylock_finish(UP_flags);
-	local_unlock_irqrestore(&pagesets.lock, flags);
 }
 
 /*
@@ -3477,17 +3539,16 @@ void free_unref_page_list(struct list_head *list)
 		}
 	}
 
-	local_lock_irqsave(&pagesets.lock, flags);
 	list_for_each_entry_safe(page, next, list, lru) {
 		struct zone *zone = page_zone(page);
 
 		/* Different zone, different pcp lock. */
 		if (zone != locked_zone) {
 			if (pcp)
-				spin_unlock(&pcp->lock);
+				pcp_spin_unlock_irqrestore(pcp, flags);
 
 			locked_zone = zone;
-			pcp = this_cpu_ptr(zone->per_cpu_pageset);
-			spin_lock(&pcp->lock);
+			pcp = pcp_spin_lock_irqsave(locked_zone->per_cpu_pageset, flags);
 		}
 
 		/*
@@ -3506,18 +3567,14 @@ void free_unref_page_list(struct list_head *list)
 		 * a large list of pages to free.
 		 */
 		if (++batch_count == SWAP_CLUSTER_MAX) {
-			spin_unlock(&pcp->lock);
-			local_unlock_irqrestore(&pagesets.lock, flags);
+			pcp_spin_unlock_irqrestore(pcp, flags);
 			batch_count = 0;
-			local_lock_irqsave(&pagesets.lock, flags);
-			pcp = this_cpu_ptr(locked_zone->per_cpu_pageset);
-			spin_lock(&pcp->lock);
+			pcp = pcp_spin_lock_irqsave(locked_zone->per_cpu_pageset, flags);
 		}
 	}
 
 	if (pcp)
-		spin_unlock(&pcp->lock);
-	local_unlock_irqrestore(&pagesets.lock, flags);
+		pcp_spin_unlock_irqrestore(pcp, flags);
 }
 
 /*
@@ -3732,17 +3789,14 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
 	unsigned long flags;
 	unsigned long __maybe_unused UP_flags;
 
-	local_lock_irqsave(&pagesets.lock, flags);
-
 	/*
 	 * spin_trylock may fail due to a parallel drain. In the future, the
 	 * trylock will also protect against IRQ reentrancy.
 	 */
-	pcp = this_cpu_ptr(zone->per_cpu_pageset);
 	pcp_trylock_prepare(UP_flags);
-	if (!spin_trylock(&pcp->lock)) {
+	pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags);
+	if (!pcp) {
 		pcp_trylock_finish(UP_flags);
-		local_unlock_irqrestore(&pagesets.lock, flags);
 		return NULL;
 	}
 
@@ -3754,9 +3808,8 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
 	pcp->free_factor >>= 1;
 	list = &pcp->lists[order_to_pindex(migratetype, order)];
 	page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list);
-	spin_unlock(&pcp->lock);
+	pcp_spin_unlock_irqrestore(pcp, flags);
 	pcp_trylock_finish(UP_flags);
-	local_unlock_irqrestore(&pagesets.lock, flags);
 	if (page) {
 		__count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
 		zone_statistics(preferred_zone, zone, 1);
@@ -5358,10 +5411,9 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 		goto failed;
 
 	/* Is a parallel drain in progress? */
-	local_lock_irqsave(&pagesets.lock, flags);
 	pcp_trylock_prepare(UP_flags);
-	pcp = this_cpu_ptr(zone->per_cpu_pageset);
-	if (!spin_trylock(&pcp->lock))
+	pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags);
+	if (!pcp)
 		goto failed_irq;
 
 	/* Attempt the batch allocation */
@@ -5379,7 +5431,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 		if (unlikely(!page)) {
 			/* Try and allocate at least one page */
 			if (!nr_account) {
-				spin_unlock(&pcp->lock);
+				pcp_spin_unlock_irqrestore(pcp, flags);
 				goto failed_irq;
 			}
 			break;
@@ -5394,9 +5446,8 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 		nr_populated++;
 	}
 
-	spin_unlock(&pcp->lock);
+	pcp_spin_unlock_irqrestore(pcp, flags);
 	pcp_trylock_finish(UP_flags);
-	local_unlock_irqrestore(&pagesets.lock, flags);
 
 	__count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
 	zone_statistics(ac.preferred_zoneref->zone, zone, nr_account);
@@ -5406,7 +5457,6 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 
 failed_irq:
 	pcp_trylock_finish(UP_flags);
-	local_unlock_irqrestore(&pagesets.lock, flags);
 
 failed:
 	page = __alloc_pages(gfp, 0, preferred_nid, nodemask);