3
0
Fork 0
forked from mirrors/linux
kernel/include/linux/node.h
Huang Ying e39bb6be9f NUMA Balancing: add page promotion counter
Patch series "NUMA balancing: optimize memory placement for memory tiering system", v13

With the advent of various new memory types, some machines will have
multiple types of memory, e.g.  DRAM and PMEM (persistent memory).  The
memory subsystem of these machines can be called memory tiering system,
because the performance of the different types of memory are different.

After commit c221c0b030 ("device-dax: "Hotplug" persistent memory for
use like normal RAM"), the PMEM could be used as the cost-effective
volatile memory in separate NUMA nodes.  In a typical memory tiering
system, there are CPUs, DRAM and PMEM in each physical NUMA node.  The
CPUs and the DRAM will be put in one logical node, while the PMEM will
be put in another (faked) logical node.

To optimize the system overall performance, the hot pages should be
placed in DRAM node.  To do that, we need to identify the hot pages in
the PMEM node and migrate them to DRAM node via NUMA migration.

In the original NUMA balancing, there are already a set of existing
mechanisms to identify the pages recently accessed by the CPUs in a node
and migrate the pages to the node.  So we can reuse these mechanisms to
build the mechanisms to optimize the page placement in the memory
tiering system.  This is implemented in this patchset.

At the other hand, the cold pages should be placed in PMEM node.  So, we
also need to identify the cold pages in the DRAM node and migrate them
to PMEM node.

In commit 26aa2d199d ("mm/migrate: demote pages during reclaim"), a
mechanism to demote the cold DRAM pages to PMEM node under memory
pressure is implemented.  Based on that, the cold DRAM pages can be
demoted to PMEM node proactively to free some memory space on DRAM node
to accommodate the promoted hot PMEM pages.  This is implemented in this
patchset too.

We have tested the solution with the pmbench memory accessing benchmark
with the 80:20 read/write ratio and the Gauss access address
distribution on a 2 socket Intel server with Optane DC Persistent Memory
Model.  The test results shows that the pmbench score can improve up to
95.9%.

This patch (of 3):

In a system with multiple memory types, e.g.  DRAM and PMEM, the CPU
and DRAM in one socket will be put in one NUMA node as before, while
the PMEM will be put in another NUMA node as described in the
description of the commit c221c0b030 ("device-dax: "Hotplug"
persistent memory for use like normal RAM").  So, the NUMA balancing
mechanism will identify all PMEM accesses as remote access and try to
promote the PMEM pages to DRAM.

To distinguish the number of the inter-type promoted pages from that of
the inter-socket migrated pages.  A new vmstat count is added.  The
counter is per-node (count in the target node).  So this can be used to
identify promotion imbalance among the NUMA nodes.

Link: https://lkml.kernel.org/r/20220301085329.3210428-1-ying.huang@intel.com
Link: https://lkml.kernel.org/r/20220221084529.1052339-1-ying.huang@intel.com
Link: https://lkml.kernel.org/r/20220221084529.1052339-2-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Reviewed-by: Yang Shi <shy828301@gmail.com>
Tested-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Wei Xu <weixugc@google.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: zhongjiang-ali <zhongjiang-ali@linux.alibaba.com>
Cc: Feng Tang <feng.tang@intel.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2022-03-22 15:57:09 -07:00

189 lines
4.7 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
/*
* include/linux/node.h - generic node definition
*
* This is mainly for topological representation. We define the
* basic 'struct node' here, which can be embedded in per-arch
* definitions of processors.
*
* Basic handling of the devices is done in drivers/base/node.c
* and system devices are handled in drivers/base/sys.c.
*
* Nodes are exported via driverfs in the class/node/devices/
* directory.
*/
#ifndef _LINUX_NODE_H_
#define _LINUX_NODE_H_
#include <linux/device.h>
#include <linux/cpumask.h>
#include <linux/list.h>
#include <linux/workqueue.h>
/**
* struct node_hmem_attrs - heterogeneous memory performance attributes
*
* @read_bandwidth: Read bandwidth in MB/s
* @write_bandwidth: Write bandwidth in MB/s
* @read_latency: Read latency in nanoseconds
* @write_latency: Write latency in nanoseconds
*/
struct node_hmem_attrs {
unsigned int read_bandwidth;
unsigned int write_bandwidth;
unsigned int read_latency;
unsigned int write_latency;
};
enum cache_indexing {
NODE_CACHE_DIRECT_MAP,
NODE_CACHE_INDEXED,
NODE_CACHE_OTHER,
};
enum cache_write_policy {
NODE_CACHE_WRITE_BACK,
NODE_CACHE_WRITE_THROUGH,
NODE_CACHE_WRITE_OTHER,
};
/**
* struct node_cache_attrs - system memory caching attributes
*
* @indexing: The ways memory blocks may be placed in cache
* @write_policy: Write back or write through policy
* @size: Total size of cache in bytes
* @line_size: Number of bytes fetched on a cache miss
* @level: The cache hierarchy level
*/
struct node_cache_attrs {
enum cache_indexing indexing;
enum cache_write_policy write_policy;
u64 size;
u16 line_size;
u8 level;
};
#ifdef CONFIG_HMEM_REPORTING
void node_add_cache(unsigned int nid, struct node_cache_attrs *cache_attrs);
void node_set_perf_attrs(unsigned int nid, struct node_hmem_attrs *hmem_attrs,
unsigned access);
#else
static inline void node_add_cache(unsigned int nid,
struct node_cache_attrs *cache_attrs)
{
}
static inline void node_set_perf_attrs(unsigned int nid,
struct node_hmem_attrs *hmem_attrs,
unsigned access)
{
}
#endif
struct node {
struct device dev;
struct list_head access_list;
#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_HUGETLBFS)
struct work_struct node_work;
#endif
#ifdef CONFIG_HMEM_REPORTING
struct list_head cache_attrs;
struct device *cache_dev;
#endif
};
struct memory_block;
extern struct node *node_devices[];
typedef void (*node_registration_func_t)(struct node *);
#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_NUMA)
void link_mem_sections(int nid, unsigned long start_pfn,
unsigned long end_pfn,
enum meminit_context context);
#else
static inline void link_mem_sections(int nid, unsigned long start_pfn,
unsigned long end_pfn,
enum meminit_context context)
{
}
#endif
extern void unregister_node(struct node *node);
#ifdef CONFIG_NUMA
/* Core of the node registration - only memory hotplug should use this */
extern int __register_one_node(int nid);
/* Registers an online node */
static inline int register_one_node(int nid)
{
int error = 0;
if (node_online(nid)) {
struct pglist_data *pgdat = NODE_DATA(nid);
unsigned long start_pfn = pgdat->node_start_pfn;
unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
error = __register_one_node(nid);
if (error)
return error;
/* link memory sections under this node */
link_mem_sections(nid, start_pfn, end_pfn, MEMINIT_EARLY);
}
return error;
}
extern void unregister_one_node(int nid);
extern int register_cpu_under_node(unsigned int cpu, unsigned int nid);
extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid);
extern void unregister_memory_block_under_nodes(struct memory_block *mem_blk);
extern int register_memory_node_under_compute_node(unsigned int mem_nid,
unsigned int cpu_nid,
unsigned access);
#ifdef CONFIG_HUGETLBFS
extern void register_hugetlbfs_with_node(node_registration_func_t doregister,
node_registration_func_t unregister);
#endif
#else
static inline int __register_one_node(int nid)
{
return 0;
}
static inline int register_one_node(int nid)
{
return 0;
}
static inline int unregister_one_node(int nid)
{
return 0;
}
static inline int register_cpu_under_node(unsigned int cpu, unsigned int nid)
{
return 0;
}
static inline int unregister_cpu_under_node(unsigned int cpu, unsigned int nid)
{
return 0;
}
static inline void unregister_memory_block_under_nodes(struct memory_block *mem_blk)
{
}
static inline void register_hugetlbfs_with_node(node_registration_func_t reg,
node_registration_func_t unreg)
{
}
#endif
#define to_node(device) container_of(device, struct node, dev)
static inline bool node_is_toptier(int node)
{
return node_state(node, N_CPU);
}
#endif /* _LINUX_NODE_H_ */