The x86 implementation of range-to-target_node lookup (i.e. phys_to_target_node() and memory_add_physaddr_to_nid()) relies on numa_memblks.

Since numa_memblks are now part of the generic code, move these functions from x86 to mm/numa_memblks.c and select CONFIG_NUMA_KEEP_MEMINFO when CONFIG_NUMA_MEMBLKS=y for dax and cxl.

[rppt@kernel.org: fix build]
Link: https://lkml.kernel.org/r/ZtVfSt_zloPdDqVB@kernel.org
Link: https://lkml.kernel.org/r/20240807064110.1003856-26-rppt@kernel.org
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Tested-by: Zi Yan <ziy@nvidia.com> # for x86_64 and arm64
Tested-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> [arm64 + CXL via QEMU]
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiaxun Yang <jiaxun.yang@flygoat.com>
Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Rafael J. Wysocki <rafael@kernel.org>
Cc: Rob Herring (Arm) <robh@kernel.org>
Cc: Samuel Holland <samuel.holland@sifive.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
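For reference, a sketch of the two helpers as they read after the move into mm/numa_memblks.c, under CONFIG_NUMA_KEEP_MEMINFO (paraphrased from the pre-move x86 code this patch relocates; the exact upstream version may differ in detail):

    #ifdef CONFIG_NUMA_KEEP_MEMINFO
    static int meminfo_to_nid(struct numa_meminfo *mi, u64 start)
    {
    	int i;

    	/* Find the memblk whose [start, end) range covers the address. */
    	for (i = 0; i < mi->nr_blks; i++)
    		if (mi->blk[i].start <= start && mi->blk[i].end > start)
    			return mi->blk[i].nid;
    	return NUMA_NO_NODE;
    }

    int phys_to_target_node(u64 start)
    {
    	int nid = meminfo_to_nid(&numa_meminfo, start);

    	/*
    	 * Prefer online nodes, but if reserved memory might be
    	 * hot-added continue the search with reserved ranges.
    	 */
    	if (nid != NUMA_NO_NODE)
    		return nid;

    	return meminfo_to_nid(&numa_reserved_meminfo, start);
    }
    EXPORT_SYMBOL_GPL(phys_to_target_node);

    int memory_add_physaddr_to_nid(u64 start)
    {
    	int nid = meminfo_to_nid(&numa_meminfo, start);

    	if (nid == NUMA_NO_NODE)
    		nid = numa_meminfo.blk[0].nid;
    	return nid;
    }
    EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
    #endif /* CONFIG_NUMA_KEEP_MEMINFO */

On the Kconfig side, per the commit message, the dax and cxl entries carry something like "select NUMA_KEEP_MEMINFO if NUMA_MEMBLKS" so that the numa_meminfo tables these helpers consult are retained after boot.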
// SPDX-License-Identifier: GPL-2.0-only
/* Common code for 32 and 64-bit NUMA */
#include <linux/acpi.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/of.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/memblock.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
#include <linux/nodemask.h>
#include <linux/sched.h>
#include <linux/topology.h>
#include <linux/sort.h>
#include <linux/numa_memblks.h>

#include <asm/e820/api.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/amd_nb.h>

#include "numa_internal.h"

int numa_off;

static __init int numa_setup(char *opt)
{
	if (!opt)
		return -EINVAL;
	if (!strncmp(opt, "off", 3))
		numa_off = 1;
	if (!strncmp(opt, "fake=", 5))
		return numa_emu_cmdline(opt + 5);
	if (!strncmp(opt, "noacpi", 6))
		disable_srat();
	if (!strncmp(opt, "nohmat", 6))
		disable_hmat();
	return 0;
}
early_param("numa", numa_setup);
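/*
 * Illustrative command-line usage (a sketch; see
 * Documentation/admin-guide/kernel-parameters.txt for the authoritative
 * descriptions):
 *
 *   numa=off      - disable NUMA and fall back to a single node
 *   numa=fake=4   - emulate four NUMA nodes (CONFIG_NUMA_EMU)
 *   numa=noacpi   - ignore the ACPI SRAT table
 *   numa=nohmat   - ignore the ACPI HMAT table
 */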

/*
 * apicid, cpu, node mappings
 */
s16 __apicid_to_node[MAX_LOCAL_APIC] = {
	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};

int numa_cpu_node(int cpu)
{
	u32 apicid = early_per_cpu(x86_cpu_to_apicid, cpu);

	if (apicid != BAD_APICID)
		return __apicid_to_node[apicid];
	return NUMA_NO_NODE;
}

cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
EXPORT_SYMBOL(node_to_cpumask_map);

/*
 * Map cpu index to node index
 */
DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);

void numa_set_node(int cpu, int node)
{
	int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);

	/* early setting, no percpu area yet */
	if (cpu_to_node_map) {
		cpu_to_node_map[cpu] = node;
		return;
	}

#ifdef CONFIG_DEBUG_PER_CPU_MAPS
	if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
		printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
		dump_stack();
		return;
	}
#endif
	per_cpu(x86_cpu_to_node_map, cpu) = node;

	set_cpu_numa_node(cpu, node);
}

void numa_clear_node(int cpu)
{
	numa_set_node(cpu, NUMA_NO_NODE);
}

/*
 * Allocate node_to_cpumask_map based on number of available nodes
 * Requires node_possible_map to be valid.
 *
 * Note: cpumask_of_node() is not valid until after this is done.
 * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
 */
void __init setup_node_to_cpumask_map(void)
{
	unsigned int node;

	/* setup nr_node_ids if not done yet */
	if (nr_node_ids == MAX_NUMNODES)
		setup_nr_node_ids();

	/* allocate the map */
	for (node = 0; node < nr_node_ids; node++)
		alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);

	/* cpumask_of_node() will now work */
	pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids);
}

static int __init numa_register_nodes(void)
{
	int nid;

	if (!memblock_validate_numa_coverage(SZ_1M))
		return -EINVAL;

	/* Finally register nodes. */
	for_each_node_mask(nid, node_possible_map) {
		unsigned long start_pfn, end_pfn;

		/*
		 * Note, get_pfn_range_for_nid() depends on
		 * memblock_set_node() having already happened
		 */
		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
		if (start_pfn >= end_pfn)
			continue;

		alloc_node_data(nid);
		node_set_online(nid);
	}

	/* Dump memblock with node info and return. */
	memblock_dump_all();
	return 0;
}

/*
 * There are unfortunately some poorly designed mainboards around that
 * only connect memory to a single CPU. This breaks the 1:1 cpu->node
 * mapping. To avoid this fill in the mapping for all possible CPUs,
 * as the number of CPUs is not known yet. We round robin the existing
 * nodes.
 */
static void __init numa_init_array(void)
{
	int rr, i;

	rr = first_node(node_online_map);
	for (i = 0; i < nr_cpu_ids; i++) {
		if (early_cpu_to_node(i) != NUMA_NO_NODE)
			continue;
		numa_set_node(i, rr);
		rr = next_node_in(rr, node_online_map);
	}
}
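
/*
 * Worked example (illustrative only): with node_online_map = {0,1} and
 * CPUs 0..3 all still at NUMA_NO_NODE, numa_init_array() assigns
 * cpu0->node0, cpu1->node1, cpu2->node0, cpu3->node1; next_node_in()
 * wraps around the online map, so any number of CPUs is covered.
 */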

static int __init numa_init(int (*init_func)(void))
{
	int i;
	int ret;

	for (i = 0; i < MAX_LOCAL_APIC; i++)
		set_apicid_to_node(i, NUMA_NO_NODE);

	ret = numa_memblks_init(init_func, /* memblock_force_top_down */ true);
	if (ret < 0)
		return ret;

	ret = numa_register_nodes();
	if (ret < 0)
		return ret;

	for (i = 0; i < nr_cpu_ids; i++) {
		int nid = early_cpu_to_node(i);

		if (nid == NUMA_NO_NODE)
			continue;
		if (!node_online(nid))
			numa_clear_node(i);
	}
	numa_init_array();

	return 0;
}

/**
 * dummy_numa_init - Fallback dummy NUMA init
 *
 * Used if there's no underlying NUMA architecture, NUMA initialization
 * fails, or NUMA is disabled on the command line.
 *
 * Must online at least one node and add memory blocks that cover all
 * allowed memory.  This function must not fail.
 */
static int __init dummy_numa_init(void)
{
	printk(KERN_INFO "%s\n",
	       numa_off ? "NUMA turned off" : "No NUMA configuration found");
	printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n",
	       0LLU, PFN_PHYS(max_pfn) - 1);

	node_set(0, numa_nodes_parsed);
	numa_add_memblk(0, 0, PFN_PHYS(max_pfn));

	return 0;
}

/**
 * x86_numa_init - Initialize NUMA
 *
 * Try each configured NUMA initialization method until one succeeds.  The
 * last fallback is dummy single node config encompassing whole memory and
 * never fails.
 */
void __init x86_numa_init(void)
{
	if (!numa_off) {
#ifdef CONFIG_ACPI_NUMA
		if (!numa_init(x86_acpi_numa_init))
			return;
#endif
#ifdef CONFIG_AMD_NUMA
		if (!numa_init(amd_numa_init))
			return;
#endif
		if (acpi_disabled && !numa_init(of_numa_init))
			return;
	}

	numa_init(dummy_numa_init);
}


/*
 * A node may exist which has one or more Generic Initiators but no CPUs and no
 * memory.
 *
 * This function must be called after init_cpu_to_node(), to ensure that any
 * memoryless CPU nodes have already been brought online, and before the
 * node_data[nid] is needed for zone list setup in build_all_zonelists().
 *
 * When this function is called, any nodes containing either memory and/or CPUs
 * will already be online and there is no need to do anything extra, even if
 * they also contain one or more Generic Initiators.
 */
void __init init_gi_nodes(void)
{
	int nid;

	/*
	 * Exclude this node from
	 * bringup_nonboot_cpus
	 *  cpu_up
	 *   __try_online_node
	 *    register_one_node
	 * because node_subsys is not initialized yet.
	 * TODO remove dependency on node_online
	 */
	for_each_node_state(nid, N_GENERIC_INITIATOR)
		if (!node_online(nid))
			node_set_online(nid);
}

/*
 * Setup early cpu_to_node.
 *
 * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
 * and apicid_to_node[] tables have valid entries for a CPU.
 * This means we skip cpu_to_node[] initialisation for NUMA
 * emulation and faking node case (when running a kernel compiled
 * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
 * is already initialized in a round robin manner at numa_init_array,
 * prior to this call, and this initialization is good enough
 * for the fake NUMA cases.
 *
 * Called before the per_cpu areas are setup.
 */
void __init init_cpu_to_node(void)
{
	int cpu;
	u32 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);

	BUG_ON(cpu_to_apicid == NULL);

	for_each_possible_cpu(cpu) {
		int node = numa_cpu_node(cpu);

		if (node == NUMA_NO_NODE)
			continue;

		/*
		 * Exclude this node from
		 * bringup_nonboot_cpus
		 *  cpu_up
		 *   __try_online_node
		 *    register_one_node
		 * because node_subsys is not initialized yet.
		 * TODO remove dependency on node_online
		 */
		if (!node_online(node))
			node_set_online(node);

		numa_set_node(cpu, node);
	}
}

#ifndef CONFIG_DEBUG_PER_CPU_MAPS

# ifndef CONFIG_NUMA_EMU
void numa_add_cpu(unsigned int cpu)
{
	cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}

void numa_remove_cpu(unsigned int cpu)
{
	cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}
# endif	/* !CONFIG_NUMA_EMU */

#else	/* !CONFIG_DEBUG_PER_CPU_MAPS */

int __cpu_to_node(int cpu)
{
	if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
		printk(KERN_WARNING
			"cpu_to_node(%d): usage too early!\n", cpu);
		dump_stack();
		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
	}
	return per_cpu(x86_cpu_to_node_map, cpu);
}
EXPORT_SYMBOL(__cpu_to_node);

/*
 * Same function as cpu_to_node() but used if called before the
 * per_cpu areas are setup.
 */
int early_cpu_to_node(int cpu)
{
	if (early_per_cpu_ptr(x86_cpu_to_node_map))
		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];

	if (!cpu_possible(cpu)) {
		printk(KERN_WARNING
			"early_cpu_to_node(%d): no per_cpu area!\n", cpu);
		dump_stack();
		return NUMA_NO_NODE;
	}
	return per_cpu(x86_cpu_to_node_map, cpu);
}

void debug_cpumask_set_cpu(unsigned int cpu, int node, bool enable)
{
	struct cpumask *mask;

	if (node == NUMA_NO_NODE) {
		/* early_cpu_to_node() already emits a warning and trace */
		return;
	}
	mask = node_to_cpumask_map[node];
	if (!cpumask_available(mask)) {
		pr_err("node_to_cpumask_map[%i] NULL\n", node);
		dump_stack();
		return;
	}

	if (enable)
		cpumask_set_cpu(cpu, mask);
	else
		cpumask_clear_cpu(cpu, mask);

	printk(KERN_DEBUG "%s cpu %d node %d: mask now %*pbl\n",
		enable ? "numa_add_cpu" : "numa_remove_cpu",
		cpu, node, cpumask_pr_args(mask));
	return;
}

# ifndef CONFIG_NUMA_EMU
static void numa_set_cpumask(int cpu, bool enable)
{
	debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable);
}

void numa_add_cpu(unsigned int cpu)
{
	numa_set_cpumask(cpu, true);
}

void numa_remove_cpu(unsigned int cpu)
{
	numa_set_cpumask(cpu, false);
}
# endif	/* !CONFIG_NUMA_EMU */

/*
 * Returns a pointer to the bitmask of CPUs on Node 'node'.
 */
const struct cpumask *cpumask_of_node(int node)
{
	if ((unsigned)node >= nr_node_ids) {
		printk(KERN_WARNING
			"cpumask_of_node(%d): (unsigned)node >= nr_node_ids(%u)\n",
			node, nr_node_ids);
		dump_stack();
		return cpu_none_mask;
	}
	if (!cpumask_available(node_to_cpumask_map[node])) {
		printk(KERN_WARNING
			"cpumask_of_node(%d): no node_to_cpumask_map!\n",
			node);
		dump_stack();
		return cpu_online_mask;
	}
	return node_to_cpumask_map[node];
}
EXPORT_SYMBOL(cpumask_of_node);

#endif	/* !CONFIG_DEBUG_PER_CPU_MAPS */

#ifdef CONFIG_NUMA_EMU
void __init numa_emu_update_cpu_to_node(int *emu_nid_to_phys,
					unsigned int nr_emu_nids)
{
	int i, j;

	/*
	 * Transform __apicid_to_node table to use emulated nids by
	 * reverse-mapping phys_nid.  The maps should always exist but fall
	 * back to zero just in case.
	 */
	for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
		if (__apicid_to_node[i] == NUMA_NO_NODE)
			continue;
		for (j = 0; j < nr_emu_nids; j++)
			if (__apicid_to_node[i] == emu_nid_to_phys[j])
				break;
		__apicid_to_node[i] = j < nr_emu_nids ? j : 0;
	}
}
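
/*
 * Worked example (illustrative only): with emu_nid_to_phys = {0, 0, 1, 1},
 * an APIC id that SRAT placed on physical node 1 is remapped to emulated
 * node 2, the first emulated nid whose backing physical node matches.
 */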

u64 __init numa_emu_dma_end(void)
{
	return PFN_PHYS(MAX_DMA32_PFN);
}
#endif /* CONFIG_NUMA_EMU */