mirror of
				https://github.com/torvalds/linux.git
				synced 2025-11-04 02:30:34 +02:00 
			
		
		
		
	sched/topology: Improve load balancing on AMD EPYC systems
SD_BALANCE_{FORK,EXEC} and SD_WAKE_AFFINE are stripped in sd_init()
for any sched domains with a NUMA distance greater than 2 hops
(RECLAIM_DISTANCE). The idea is that it's expensive to balance
across domains that far apart.
However, as is rather unfortunately explained in:
  commit 32e45ff43e ("mm: increase RECLAIM_DISTANCE to 30")
the value for RECLAIM_DISTANCE is based on node distance tables from
2011-era hardware.
Current AMD EPYC machines have the following NUMA node distances:
 node distances:
 node   0   1   2   3   4   5   6   7
   0:  10  16  16  16  32  32  32  32
   1:  16  10  16  16  32  32  32  32
   2:  16  16  10  16  32  32  32  32
   3:  16  16  16  10  32  32  32  32
   4:  32  32  32  32  10  16  16  16
   5:  32  32  32  32  16  10  16  16
   6:  32  32  32  32  16  16  10  16
   7:  32  32  32  32  16  16  16  10
where 2 hops is 32.
The result is that the scheduler fails to load balance properly across
NUMA nodes on different sockets -- 2 hops apart.
For example, pinning 16 busy threads to NUMA nodes 0 (CPUs 0-7) and 4
(CPUs 32-39) like so,
  $ numactl -C 0-7,32-39 ./spinner 16
causes all threads to fork and remain on node 0 until the active
balancer kicks in after a few seconds and forcibly moves some threads
to node 4.
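Introduce a node_reclaim_distance variable that defaults to
RECLAIM_DISTANCE, use it in place of the constant, and override it
to 32 for AMD Zen so that domains 2 hops apart keep these flags.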
Signed-off-by: Matt Fleming <matt@codeblueprint.co.uk>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@surriel.com>
Cc: Suravee.Suthikulpanit@amd.com
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thomas.Lendacky@amd.com
Cc: Tony Luck <tony.luck@intel.com>
Link: https://lkml.kernel.org/r/20190808195301.13222-3-matt@codeblueprint.co.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
			
			
This commit is contained in:
		
							parent
							
								
									a2cbfd4655
								
							
						
					
					
						commit
						a55c7454a8
					
				
					 5 changed files with 23 additions and 3 deletions
				
			
		| 
						 | 
					@ -8,6 +8,7 @@
 | 
				
			||||||
#include <linux/sched.h>
 | 
					#include <linux/sched.h>
 | 
				
			||||||
#include <linux/sched/clock.h>
 | 
					#include <linux/sched/clock.h>
 | 
				
			||||||
#include <linux/random.h>
 | 
					#include <linux/random.h>
 | 
				
			||||||
 | 
					#include <linux/topology.h>
 | 
				
			||||||
#include <asm/processor.h>
 | 
					#include <asm/processor.h>
 | 
				
			||||||
#include <asm/apic.h>
 | 
					#include <asm/apic.h>
 | 
				
			||||||
#include <asm/cacheinfo.h>
 | 
					#include <asm/cacheinfo.h>
 | 
				
			||||||
| 
						 | 
					@ -824,6 +825,10 @@ static void init_amd_zn(struct cpuinfo_x86 *c)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
	set_cpu_cap(c, X86_FEATURE_ZEN);
 | 
						set_cpu_cap(c, X86_FEATURE_ZEN);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#ifdef CONFIG_NUMA
 | 
				
			||||||
 | 
						node_reclaim_distance = 32;
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	/*
 | 
						/*
 | 
				
			||||||
	 * Fix erratum 1076: CPB feature bit not being set in CPUID.
 | 
						 * Fix erratum 1076: CPB feature bit not being set in CPUID.
 | 
				
			||||||
	 * Always set it, except when running under a hypervisor.
 | 
						 * Always set it, except when running under a hypervisor.
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -59,6 +59,20 @@ int arch_update_cpu_topology(void);
 | 
				
			||||||
 */
 | 
					 */
 | 
				
			||||||
#define RECLAIM_DISTANCE 30
 | 
					#define RECLAIM_DISTANCE 30
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/*
 | 
				
			||||||
 | 
					 * The following tunable allows platforms to override the default node
 | 
				
			||||||
 | 
					 * reclaim distance (RECLAIM_DISTANCE) if remote memory accesses are
 | 
				
			||||||
 | 
					 * sufficiently fast that the default value actually hurts
 | 
				
			||||||
 | 
					 * performance.
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * AMD EPYC machines use this because even though the 2-hop distance
 | 
				
			||||||
 | 
					 * is 32 (3.2x slower than a local memory access) performance actually
 | 
				
			||||||
 | 
					 * *improves* if allowed to reclaim memory and load balance tasks
 | 
				
			||||||
 | 
					 * between NUMA nodes 2-hops apart.
 | 
				
			||||||
 | 
					 */
 | 
				
			||||||
 | 
					extern int __read_mostly node_reclaim_distance;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#ifndef PENALTY_FOR_NODE_WITH_CPUS
 | 
					#ifndef PENALTY_FOR_NODE_WITH_CPUS
 | 
				
			||||||
#define PENALTY_FOR_NODE_WITH_CPUS	(1)
 | 
					#define PENALTY_FOR_NODE_WITH_CPUS	(1)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1284,6 +1284,7 @@ static int			sched_domains_curr_level;
 | 
				
			||||||
int				sched_max_numa_distance;
 | 
					int				sched_max_numa_distance;
 | 
				
			||||||
static int			*sched_domains_numa_distance;
 | 
					static int			*sched_domains_numa_distance;
 | 
				
			||||||
static struct cpumask		***sched_domains_numa_masks;
 | 
					static struct cpumask		***sched_domains_numa_masks;
 | 
				
			||||||
 | 
					int __read_mostly		node_reclaim_distance = RECLAIM_DISTANCE;
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/*
 | 
					/*
 | 
				
			||||||
| 
						 | 
					@ -1402,7 +1403,7 @@ sd_init(struct sched_domain_topology_level *tl,
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		sd->flags &= ~SD_PREFER_SIBLING;
 | 
							sd->flags &= ~SD_PREFER_SIBLING;
 | 
				
			||||||
		sd->flags |= SD_SERIALIZE;
 | 
							sd->flags |= SD_SERIALIZE;
 | 
				
			||||||
		if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
 | 
							if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) {
 | 
				
			||||||
			sd->flags &= ~(SD_BALANCE_EXEC |
 | 
								sd->flags &= ~(SD_BALANCE_EXEC |
 | 
				
			||||||
				       SD_BALANCE_FORK |
 | 
									       SD_BALANCE_FORK |
 | 
				
			||||||
				       SD_WAKE_AFFINE);
 | 
									       SD_WAKE_AFFINE);
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -710,7 +710,7 @@ static bool khugepaged_scan_abort(int nid)
 | 
				
			||||||
	for (i = 0; i < MAX_NUMNODES; i++) {
 | 
						for (i = 0; i < MAX_NUMNODES; i++) {
 | 
				
			||||||
		if (!khugepaged_node_load[i])
 | 
							if (!khugepaged_node_load[i])
 | 
				
			||||||
			continue;
 | 
								continue;
 | 
				
			||||||
		if (node_distance(nid, i) > RECLAIM_DISTANCE)
 | 
							if (node_distance(nid, i) > node_reclaim_distance)
 | 
				
			||||||
			return true;
 | 
								return true;
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
	return false;
 | 
						return false;
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -3522,7 +3522,7 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
 | 
				
			||||||
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
 | 
					static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
	return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
 | 
						return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
 | 
				
			||||||
				RECLAIM_DISTANCE;
 | 
									node_reclaim_distance;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
#else	/* CONFIG_NUMA */
 | 
					#else	/* CONFIG_NUMA */
 | 
				
			||||||
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
 | 
					static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in a new issue