mirror of
				https://github.com/torvalds/linux.git
				synced 2025-11-04 02:30:34 +02:00 
			
		
		
		
	mm/migrate: move node demotion code to near its user
Now, node_demotion and next_demotion_node() are placed between __unmap_and_move() and unmap_and_move(). This hurts code readability. So move them near their users in the file. There's no functionality change in this patch. Link: https://lkml.kernel.org/r/20211206031227.3323097-1-ying.huang@intel.com Signed-off-by: "Huang, Ying" <ying.huang@intel.com> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com> Reviewed-by: Yang Shi <shy828301@gmail.com> Reviewed-by: Wei Xu <weixugc@google.com> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Zi Yan <ziy@nvidia.com> Cc: Oscar Salvador <osalvador@suse.de> Cc: Michal Hocko <mhocko@suse.com> Cc: David Rientjes <rientjes@google.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Thelen <gthelen@google.com> Cc: Keith Busch <kbusch@kernel.org> Cc: Yang Shi <yang.shi@linux.alibaba.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
		
							parent
							
								
									7813a1b525
								
							
						
					
					
						commit
						dcee9bf5bf
					
				
					 1 changed files with 132 additions and 133 deletions
				
			
		
							
								
								
									
										265
									
								
								mm/migrate.c
									
									
									
									
									
								
							
							
						
						
									
										265
									
								
								mm/migrate.c
									
									
									
									
									
								
							| 
						 | 
				
			
			@ -1093,139 +1093,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
 | 
			
		|||
	return rc;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
/*
 | 
			
		||||
 * node_demotion[] example:
 | 
			
		||||
 *
 | 
			
		||||
 * Consider a system with two sockets.  Each socket has
 | 
			
		||||
 * three classes of memory attached: fast, medium and slow.
 | 
			
		||||
 * Each memory class is placed in its own NUMA node.  The
 | 
			
		||||
 * CPUs are placed in the node with the "fast" memory.  The
 | 
			
		||||
 * 6 NUMA nodes (0-5) might be split among the sockets like
 | 
			
		||||
 * this:
 | 
			
		||||
 *
 | 
			
		||||
 *	Socket A: 0, 1, 2
 | 
			
		||||
 *	Socket B: 3, 4, 5
 | 
			
		||||
 *
 | 
			
		||||
 * When Node 0 fills up, its memory should be migrated to
 | 
			
		||||
 * Node 1.  When Node 1 fills up, it should be migrated to
 | 
			
		||||
 * Node 2.  The migration path start on the nodes with the
 | 
			
		||||
 * processors (since allocations default to this node) and
 | 
			
		||||
 * fast memory, progress through medium and end with the
 | 
			
		||||
 * slow memory:
 | 
			
		||||
 *
 | 
			
		||||
 *	0 -> 1 -> 2 -> stop
 | 
			
		||||
 *	3 -> 4 -> 5 -> stop
 | 
			
		||||
 *
 | 
			
		||||
 * This is represented in the node_demotion[] like this:
 | 
			
		||||
 *
 | 
			
		||||
 *	{  nr=1, nodes[0]=1 }, // Node 0 migrates to 1
 | 
			
		||||
 *	{  nr=1, nodes[0]=2 }, // Node 1 migrates to 2
 | 
			
		||||
 *	{  nr=0, nodes[0]=-1 }, // Node 2 does not migrate
 | 
			
		||||
 *	{  nr=1, nodes[0]=4 }, // Node 3 migrates to 4
 | 
			
		||||
 *	{  nr=1, nodes[0]=5 }, // Node 4 migrates to 5
 | 
			
		||||
 *	{  nr=0, nodes[0]=-1 }, // Node 5 does not migrate
 | 
			
		||||
 *
 | 
			
		||||
 * Moreover some systems may have multiple slow memory nodes.
 | 
			
		||||
 * Suppose a system has one socket with 3 memory nodes, node 0
 | 
			
		||||
 * is fast memory type, and node 1/2 both are slow memory
 | 
			
		||||
 * type, and the distance between fast memory node and slow
 | 
			
		||||
 * memory node is the same. So the migration path should be:
 | 
			
		||||
 *
 | 
			
		||||
 *	0 -> 1/2 -> stop
 | 
			
		||||
 *
 | 
			
		||||
 * This is represented in the node_demotion[] like this:
 | 
			
		||||
 *	{ nr=2, {nodes[0]=1, nodes[1]=2} }, // Node 0 migrates to node 1 and node 2
 | 
			
		||||
 *	{ nr=0, nodes[0]=-1, }, // Node 1 does not migrate
 | 
			
		||||
 *	{ nr=0, nodes[0]=-1, }, // Node 2 does not migrate
 | 
			
		||||
 */
 | 
			
		||||
 | 
			
		||||
/*
 | 
			
		||||
 * Writes to this array occur without locking.  Cycles are
 | 
			
		||||
 * not allowed: Node X demotes to Y which demotes to X...
 | 
			
		||||
 *
 | 
			
		||||
 * If multiple reads are performed, a single rcu_read_lock()
 | 
			
		||||
 * must be held over all reads to ensure that no cycles are
 | 
			
		||||
 * observed.
 | 
			
		||||
 */
 | 
			
		||||
#define DEFAULT_DEMOTION_TARGET_NODES 15
 | 
			
		||||
 | 
			
		||||
#if MAX_NUMNODES < DEFAULT_DEMOTION_TARGET_NODES
 | 
			
		||||
#define DEMOTION_TARGET_NODES	(MAX_NUMNODES - 1)
 | 
			
		||||
#else
 | 
			
		||||
#define DEMOTION_TARGET_NODES	DEFAULT_DEMOTION_TARGET_NODES
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
struct demotion_nodes {
 | 
			
		||||
	unsigned short nr;
 | 
			
		||||
	short nodes[DEMOTION_TARGET_NODES];
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
static struct demotion_nodes *node_demotion __read_mostly;
 | 
			
		||||
 | 
			
		||||
/**
 | 
			
		||||
 * next_demotion_node() - Get the next node in the demotion path
 | 
			
		||||
 * @node: The starting node to lookup the next node
 | 
			
		||||
 *
 | 
			
		||||
 * Return: node id for next memory node in the demotion path hierarchy
 | 
			
		||||
 * from @node; NUMA_NO_NODE if @node is terminal.  This does not keep
 | 
			
		||||
 * @node online or guarantee that it *continues* to be the next demotion
 | 
			
		||||
 * target.
 | 
			
		||||
 */
 | 
			
		||||
int next_demotion_node(int node)
 | 
			
		||||
{
 | 
			
		||||
	struct demotion_nodes *nd;
 | 
			
		||||
	unsigned short target_nr, index;
 | 
			
		||||
	int target;
 | 
			
		||||
 | 
			
		||||
	if (!node_demotion)
 | 
			
		||||
		return NUMA_NO_NODE;
 | 
			
		||||
 | 
			
		||||
	nd = &node_demotion[node];
 | 
			
		||||
 | 
			
		||||
	/*
 | 
			
		||||
	 * node_demotion[] is updated without excluding this
 | 
			
		||||
	 * function from running.  RCU doesn't provide any
 | 
			
		||||
	 * compiler barriers, so the READ_ONCE() is required
 | 
			
		||||
	 * to avoid compiler reordering or read merging.
 | 
			
		||||
	 *
 | 
			
		||||
	 * Make sure to use RCU over entire code blocks if
 | 
			
		||||
	 * node_demotion[] reads need to be consistent.
 | 
			
		||||
	 */
 | 
			
		||||
	rcu_read_lock();
 | 
			
		||||
	target_nr = READ_ONCE(nd->nr);
 | 
			
		||||
 | 
			
		||||
	switch (target_nr) {
 | 
			
		||||
	case 0:
 | 
			
		||||
		target = NUMA_NO_NODE;
 | 
			
		||||
		goto out;
 | 
			
		||||
	case 1:
 | 
			
		||||
		index = 0;
 | 
			
		||||
		break;
 | 
			
		||||
	default:
 | 
			
		||||
		/*
 | 
			
		||||
		 * If there are multiple target nodes, just select one
 | 
			
		||||
		 * target node randomly.
 | 
			
		||||
		 *
 | 
			
		||||
		 * In addition, we can also use round-robin to select
 | 
			
		||||
		 * target node, but we should introduce another variable
 | 
			
		||||
		 * for node_demotion[] to record last selected target node,
 | 
			
		||||
		 * that may cause cache ping-pong due to the changing of
 | 
			
		||||
		 * last target node. Or introducing per-cpu data to avoid
 | 
			
		||||
		 * caching issue, which seems more complicated. So selecting
 | 
			
		||||
		 * target node randomly seems better until now.
 | 
			
		||||
		 */
 | 
			
		||||
		index = get_random_int() % target_nr;
 | 
			
		||||
		break;
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	target = READ_ONCE(nd->nodes[index]);
 | 
			
		||||
 | 
			
		||||
out:
 | 
			
		||||
	rcu_read_unlock();
 | 
			
		||||
	return target;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/*
 | 
			
		||||
 * Obtain the lock on page, remove all ptes and migrate the page
 | 
			
		||||
 * to the newly allocated page in newpage.
 | 
			
		||||
| 
						 | 
				
			
			@ -3059,6 +2926,138 @@ void migrate_vma_finalize(struct migrate_vma *migrate)
 | 
			
		|||
EXPORT_SYMBOL(migrate_vma_finalize);
 | 
			
		||||
#endif /* CONFIG_DEVICE_PRIVATE */
 | 
			
		||||
 | 
			
		||||
/*
 | 
			
		||||
 * node_demotion[] example:
 | 
			
		||||
 *
 | 
			
		||||
 * Consider a system with two sockets.  Each socket has
 | 
			
		||||
 * three classes of memory attached: fast, medium and slow.
 | 
			
		||||
 * Each memory class is placed in its own NUMA node.  The
 | 
			
		||||
 * CPUs are placed in the node with the "fast" memory.  The
 | 
			
		||||
 * 6 NUMA nodes (0-5) might be split among the sockets like
 | 
			
		||||
 * this:
 | 
			
		||||
 *
 | 
			
		||||
 *	Socket A: 0, 1, 2
 | 
			
		||||
 *	Socket B: 3, 4, 5
 | 
			
		||||
 *
 | 
			
		||||
 * When Node 0 fills up, its memory should be migrated to
 | 
			
		||||
 * Node 1.  When Node 1 fills up, it should be migrated to
 | 
			
		||||
 * Node 2.  The migration path start on the nodes with the
 | 
			
		||||
 * processors (since allocations default to this node) and
 | 
			
		||||
 * fast memory, progress through medium and end with the
 | 
			
		||||
 * slow memory:
 | 
			
		||||
 *
 | 
			
		||||
 *	0 -> 1 -> 2 -> stop
 | 
			
		||||
 *	3 -> 4 -> 5 -> stop
 | 
			
		||||
 *
 | 
			
		||||
 * This is represented in the node_demotion[] like this:
 | 
			
		||||
 *
 | 
			
		||||
 *	{  nr=1, nodes[0]=1 }, // Node 0 migrates to 1
 | 
			
		||||
 *	{  nr=1, nodes[0]=2 }, // Node 1 migrates to 2
 | 
			
		||||
 *	{  nr=0, nodes[0]=-1 }, // Node 2 does not migrate
 | 
			
		||||
 *	{  nr=1, nodes[0]=4 }, // Node 3 migrates to 4
 | 
			
		||||
 *	{  nr=1, nodes[0]=5 }, // Node 4 migrates to 5
 | 
			
		||||
 *	{  nr=0, nodes[0]=-1 }, // Node 5 does not migrate
 | 
			
		||||
 *
 | 
			
		||||
 * Moreover some systems may have multiple slow memory nodes.
 | 
			
		||||
 * Suppose a system has one socket with 3 memory nodes, node 0
 | 
			
		||||
 * is fast memory type, and node 1/2 both are slow memory
 | 
			
		||||
 * type, and the distance between fast memory node and slow
 | 
			
		||||
 * memory node is the same. So the migration path should be:
 | 
			
		||||
 *
 | 
			
		||||
 *	0 -> 1/2 -> stop
 | 
			
		||||
 *
 | 
			
		||||
 * This is represented in the node_demotion[] like this:
 | 
			
		||||
 *	{ nr=2, {nodes[0]=1, nodes[1]=2} }, // Node 0 migrates to node 1 and node 2
 | 
			
		||||
 *	{ nr=0, nodes[0]=-1, }, // Node 1 does not migrate
 | 
			
		||||
 *	{ nr=0, nodes[0]=-1, }, // Node 2 does not migrate
 | 
			
		||||
 */
 | 
			
		||||
 | 
			
		||||
/*
 | 
			
		||||
 * Writes to this array occur without locking.  Cycles are
 | 
			
		||||
 * not allowed: Node X demotes to Y which demotes to X...
 | 
			
		||||
 *
 | 
			
		||||
 * If multiple reads are performed, a single rcu_read_lock()
 | 
			
		||||
 * must be held over all reads to ensure that no cycles are
 | 
			
		||||
 * observed.
 | 
			
		||||
 */
 | 
			
		||||
#define DEFAULT_DEMOTION_TARGET_NODES 15
 | 
			
		||||
 | 
			
		||||
#if MAX_NUMNODES < DEFAULT_DEMOTION_TARGET_NODES
 | 
			
		||||
#define DEMOTION_TARGET_NODES	(MAX_NUMNODES - 1)
 | 
			
		||||
#else
 | 
			
		||||
#define DEMOTION_TARGET_NODES	DEFAULT_DEMOTION_TARGET_NODES
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
struct demotion_nodes {
 | 
			
		||||
	unsigned short nr;
 | 
			
		||||
	short nodes[DEMOTION_TARGET_NODES];
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
static struct demotion_nodes *node_demotion __read_mostly;
 | 
			
		||||
 | 
			
		||||
/**
 | 
			
		||||
 * next_demotion_node() - Get the next node in the demotion path
 | 
			
		||||
 * @node: The starting node to lookup the next node
 | 
			
		||||
 *
 | 
			
		||||
 * Return: node id for next memory node in the demotion path hierarchy
 | 
			
		||||
 * from @node; NUMA_NO_NODE if @node is terminal.  This does not keep
 | 
			
		||||
 * @node online or guarantee that it *continues* to be the next demotion
 | 
			
		||||
 * target.
 | 
			
		||||
 */
 | 
			
		||||
int next_demotion_node(int node)
 | 
			
		||||
{
 | 
			
		||||
	struct demotion_nodes *nd;
 | 
			
		||||
	unsigned short target_nr, index;
 | 
			
		||||
	int target;
 | 
			
		||||
 | 
			
		||||
	if (!node_demotion)
 | 
			
		||||
		return NUMA_NO_NODE;
 | 
			
		||||
 | 
			
		||||
	nd = &node_demotion[node];
 | 
			
		||||
 | 
			
		||||
	/*
 | 
			
		||||
	 * node_demotion[] is updated without excluding this
 | 
			
		||||
	 * function from running.  RCU doesn't provide any
 | 
			
		||||
	 * compiler barriers, so the READ_ONCE() is required
 | 
			
		||||
	 * to avoid compiler reordering or read merging.
 | 
			
		||||
	 *
 | 
			
		||||
	 * Make sure to use RCU over entire code blocks if
 | 
			
		||||
	 * node_demotion[] reads need to be consistent.
 | 
			
		||||
	 */
 | 
			
		||||
	rcu_read_lock();
 | 
			
		||||
	target_nr = READ_ONCE(nd->nr);
 | 
			
		||||
 | 
			
		||||
	switch (target_nr) {
 | 
			
		||||
	case 0:
 | 
			
		||||
		target = NUMA_NO_NODE;
 | 
			
		||||
		goto out;
 | 
			
		||||
	case 1:
 | 
			
		||||
		index = 0;
 | 
			
		||||
		break;
 | 
			
		||||
	default:
 | 
			
		||||
		/*
 | 
			
		||||
		 * If there are multiple target nodes, just select one
 | 
			
		||||
		 * target node randomly.
 | 
			
		||||
		 *
 | 
			
		||||
		 * In addition, we can also use round-robin to select
 | 
			
		||||
		 * target node, but we should introduce another variable
 | 
			
		||||
		 * for node_demotion[] to record last selected target node,
 | 
			
		||||
		 * that may cause cache ping-pong due to the changing of
 | 
			
		||||
		 * last target node. Or introducing per-cpu data to avoid
 | 
			
		||||
		 * caching issue, which seems more complicated. So selecting
 | 
			
		||||
		 * target node randomly seems better until now.
 | 
			
		||||
		 */
 | 
			
		||||
		index = get_random_int() % target_nr;
 | 
			
		||||
		break;
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	target = READ_ONCE(nd->nodes[index]);
 | 
			
		||||
 | 
			
		||||
out:
 | 
			
		||||
	rcu_read_unlock();
 | 
			
		||||
	return target;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#if defined(CONFIG_HOTPLUG_CPU)
 | 
			
		||||
/* Disable reclaim-based migration. */
 | 
			
		||||
static void __disable_all_migrate_targets(void)
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in a new issue