Mirror of https://github.com/torvalds/linux.git (synced 2025-10-31 08:38:45 +02:00)

Commit 8821f36333
The current implementation has a bug: if the current css doesn't
contain any pool that is a descendant of the "pool" (i.e. when
found_descendant == false), then "pool" will point to some unrelated
pool. If the current css has a child, parent_pool is then overwritten
with this unrelated pool on the next iteration.

Since whether a pool is related can be determined simply by checking
whether it refers to the same region, all the additional pool tracking
is unnecessary, so just switch to css_for_each_descendant_pre for
traversal.

Fixes: b168ed458d ("kernel/cgroup: Add "dmem" memory accounting cgroup")
Signed-off-by: Friedrich Vock <friedrich.vock@gmx.de>
Reviewed-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20250127152754.21325-1-friedrich.vock@gmx.de
Signed-off-by: Maarten Lankhorst <dev@lankhorst.se>
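
For context only (not part of the commit message): with this change, the traversal in dmem_cgroup_calculate_protection() in the file below boils down to the following pattern; declarations and the surrounding rcu_read_lock()/rcu_read_unlock() are omitted here. Pools are matched by region instead of being tracked through parent/descendant pointers:

	css_for_each_descendant_pre(css, &limit_pool->cs->css) {
		dmemcg_iter = container_of(css, struct dmemcg_state, css);
		found_pool = NULL;

		/* A css is related iff it has a pool for the same region. */
		list_for_each_entry_rcu(pool, &dmemcg_iter->pools, css_node) {
			if (pool->region == limit_pool->region) {
				found_pool = pool;
				break;
			}
		}
		if (!found_pool)
			continue;

		page_counter_calculate_protection(climit, &found_pool->cnt, true);
		if (found_pool == test_pool)
			break;
	}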
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright 2023-2024 Intel Corporation (Maarten Lankhorst <dev@lankhorst.se>)
 * Copyright 2024 Red Hat (Maxime Ripard <mripard@kernel.org>)
 * Partially based on the rdma and misc controllers, which bear the following copyrights:
 *
 * Copyright 2020 Google LLC
 * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
 */

#include <linux/cgroup.h>
#include <linux/cgroup_dmem.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/page_counter.h>
#include <linux/parser.h>
#include <linux/slab.h>

struct dmem_cgroup_region {
	/**
	 * @ref: References keeping the region alive.
	 * Keeps the region reference alive after a successful RCU lookup.
	 */
	struct kref ref;

	/** @rcu: RCU head for freeing */
	struct rcu_head rcu;

	/**
	 * @region_node: Linked into &dmem_cgroup_regions list.
	 * Protected by RCU and global spinlock.
	 */
	struct list_head region_node;

	/**
	 * @pools: List of pools linked to this region.
	 * Protected by global spinlock only
	 */
	struct list_head pools;

	/** @size: Size of region, in bytes */
	u64 size;

	/** @name: Name describing the node, set by dmem_cgroup_register_region */
	char *name;

	/**
	 * @unregistered: Whether the region is unregistered by its caller.
	 * No new pools should be added to the region afterwards.
	 */
	bool unregistered;
};

struct dmemcg_state {
	struct cgroup_subsys_state css;

	struct list_head pools;
};

struct dmem_cgroup_pool_state {
	struct dmem_cgroup_region *region;
	struct dmemcg_state *cs;

	/* css node, RCU protected against region teardown */
	struct list_head	css_node;

	/* dev node, no RCU protection required */
	struct list_head	region_node;

	struct rcu_head rcu;

	struct page_counter cnt;

	bool inited;
};

/*
 * 3 operations require locking protection:
 * - Registering and unregistering region to/from list, requires global lock.
 * - Adding a dmem_cgroup_pool_state to a CSS, removing when CSS is freed.
 * - Adding a dmem_cgroup_pool_state to a region list.
 *
 * Since for the most common operations RCU provides enough protection, I
 * do not think more granular locking makes sense. Most protection is offered
 * by RCU and the lockless operating page_counter.
 */
static DEFINE_SPINLOCK(dmemcg_lock);
static LIST_HEAD(dmem_cgroup_regions);

static inline struct dmemcg_state *
css_to_dmemcs(struct cgroup_subsys_state *css)
{
	return container_of(css, struct dmemcg_state, css);
}

static inline struct dmemcg_state *get_current_dmemcs(void)
{
	return css_to_dmemcs(task_get_css(current, dmem_cgrp_id));
}

static struct dmemcg_state *parent_dmemcs(struct dmemcg_state *cg)
{
	return cg->css.parent ? css_to_dmemcs(cg->css.parent) : NULL;
}

static void free_cg_pool(struct dmem_cgroup_pool_state *pool)
{
	list_del(&pool->region_node);
	kfree(pool);
}

static void
set_resource_min(struct dmem_cgroup_pool_state *pool, u64 val)
{
	page_counter_set_min(&pool->cnt, val);
}

static void
set_resource_low(struct dmem_cgroup_pool_state *pool, u64 val)
{
	page_counter_set_low(&pool->cnt, val);
}

static void
set_resource_max(struct dmem_cgroup_pool_state *pool, u64 val)
{
	page_counter_set_max(&pool->cnt, val);
}

static u64 get_resource_low(struct dmem_cgroup_pool_state *pool)
{
	return pool ? READ_ONCE(pool->cnt.low) : 0;
}

static u64 get_resource_min(struct dmem_cgroup_pool_state *pool)
{
	return pool ? READ_ONCE(pool->cnt.min) : 0;
}

static u64 get_resource_max(struct dmem_cgroup_pool_state *pool)
{
	return pool ? READ_ONCE(pool->cnt.max) : PAGE_COUNTER_MAX;
}

static u64 get_resource_current(struct dmem_cgroup_pool_state *pool)
{
	return pool ? page_counter_read(&pool->cnt) : 0;
}

static void reset_all_resource_limits(struct dmem_cgroup_pool_state *rpool)
{
	set_resource_min(rpool, 0);
	set_resource_low(rpool, 0);
	set_resource_max(rpool, PAGE_COUNTER_MAX);
}

static void dmemcs_offline(struct cgroup_subsys_state *css)
{
	struct dmemcg_state *dmemcs = css_to_dmemcs(css);
	struct dmem_cgroup_pool_state *pool;

	rcu_read_lock();
	list_for_each_entry_rcu(pool, &dmemcs->pools, css_node)
		reset_all_resource_limits(pool);
	rcu_read_unlock();
}

static void dmemcs_free(struct cgroup_subsys_state *css)
{
	struct dmemcg_state *dmemcs = css_to_dmemcs(css);
	struct dmem_cgroup_pool_state *pool, *next;

	spin_lock(&dmemcg_lock);
	list_for_each_entry_safe(pool, next, &dmemcs->pools, css_node) {
		/*
		 * The pool is dead and all references are 0,
		 * no need for RCU protection with list_del_rcu or freeing.
		 */
		list_del(&pool->css_node);
		free_cg_pool(pool);
	}
	spin_unlock(&dmemcg_lock);

	kfree(dmemcs);
}

static struct cgroup_subsys_state *
dmemcs_alloc(struct cgroup_subsys_state *parent_css)
{
	struct dmemcg_state *dmemcs = kzalloc(sizeof(*dmemcs), GFP_KERNEL);
	if (!dmemcs)
		return ERR_PTR(-ENOMEM);

	INIT_LIST_HEAD(&dmemcs->pools);
	return &dmemcs->css;
}

static struct dmem_cgroup_pool_state *
find_cg_pool_locked(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region)
{
	struct dmem_cgroup_pool_state *pool;

	list_for_each_entry_rcu(pool, &dmemcs->pools, css_node, spin_is_locked(&dmemcg_lock))
		if (pool->region == region)
			return pool;

	return NULL;
}

static struct dmem_cgroup_pool_state *pool_parent(struct dmem_cgroup_pool_state *pool)
{
	if (!pool->cnt.parent)
		return NULL;

	return container_of(pool->cnt.parent, typeof(*pool), cnt);
}

static void
dmem_cgroup_calculate_protection(struct dmem_cgroup_pool_state *limit_pool,
				 struct dmem_cgroup_pool_state *test_pool)
{
	struct page_counter *climit;
	struct cgroup_subsys_state *css;
	struct dmemcg_state *dmemcg_iter;
	struct dmem_cgroup_pool_state *pool, *found_pool;

	climit = &limit_pool->cnt;

	rcu_read_lock();

	css_for_each_descendant_pre(css, &limit_pool->cs->css) {
		dmemcg_iter = container_of(css, struct dmemcg_state, css);
		found_pool = NULL;

		list_for_each_entry_rcu(pool, &dmemcg_iter->pools, css_node) {
			if (pool->region == limit_pool->region) {
				found_pool = pool;
				break;
			}
		}
		if (!found_pool)
			continue;

		page_counter_calculate_protection(
			climit, &found_pool->cnt, true);

		if (found_pool == test_pool)
			break;
	}
	rcu_read_unlock();
}

/**
 * dmem_cgroup_state_evict_valuable() - Check if we should evict from test_pool
 * @limit_pool: The pool for which we hit limits
 * @test_pool: The pool for which to test
 * @ignore_low: Whether to ignore the low watermark.
 * @ret_hit_low: Pointer to whether it makes sense to consider low watermark.
 *
 * This function returns true if we can evict from @test_pool, false if not.
 * When returning false and @ignore_low is false, @ret_hit_low may
 * be set to true to indicate this function can be retried with @ignore_low
 * set to true.
 *
 * Return: bool
 */
bool dmem_cgroup_state_evict_valuable(struct dmem_cgroup_pool_state *limit_pool,
				      struct dmem_cgroup_pool_state *test_pool,
				      bool ignore_low, bool *ret_hit_low)
{
	struct dmem_cgroup_pool_state *pool = test_pool;
	struct page_counter *ctest;
	u64 used, min, low;

	/* Can always evict from current pool, despite limits */
	if (limit_pool == test_pool)
		return true;

	if (limit_pool) {
		if (!parent_dmemcs(limit_pool->cs))
			return true;

		for (pool = test_pool; pool && limit_pool != pool; pool = pool_parent(pool))
			{}

		if (!pool)
			return false;
	} else {
		/*
		 * If there is no cgroup limiting memory usage, use the root
		 * cgroup instead for limit calculations.
		 */
		for (limit_pool = test_pool; pool_parent(limit_pool); limit_pool = pool_parent(limit_pool))
			{}
	}

	ctest = &test_pool->cnt;

	dmem_cgroup_calculate_protection(limit_pool, test_pool);

	used = page_counter_read(ctest);
	min = READ_ONCE(ctest->emin);

	if (used <= min)
		return false;

	if (!ignore_low) {
		low = READ_ONCE(ctest->elow);
		if (used > low)
			return true;

		*ret_hit_low = true;
		return false;
	}
	return true;
}
EXPORT_SYMBOL_GPL(dmem_cgroup_state_evict_valuable);

static struct dmem_cgroup_pool_state *
alloc_pool_single(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region,
		  struct dmem_cgroup_pool_state **allocpool)
{
	struct dmemcg_state *parent = parent_dmemcs(dmemcs);
	struct dmem_cgroup_pool_state *pool, *ppool = NULL;

	if (!*allocpool) {
		pool = kzalloc(sizeof(*pool), GFP_NOWAIT);
		if (!pool)
			return ERR_PTR(-ENOMEM);
	} else {
		pool = *allocpool;
		*allocpool = NULL;
	}

	pool->region = region;
	pool->cs = dmemcs;

	if (parent)
		ppool = find_cg_pool_locked(parent, region);

	page_counter_init(&pool->cnt,
			  ppool ? &ppool->cnt : NULL, true);
	reset_all_resource_limits(pool);

	list_add_tail_rcu(&pool->css_node, &dmemcs->pools);
	list_add_tail(&pool->region_node, &region->pools);

	if (!parent)
		pool->inited = true;
	else
		pool->inited = ppool ? ppool->inited : false;
	return pool;
}

static struct dmem_cgroup_pool_state *
get_cg_pool_locked(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region,
		   struct dmem_cgroup_pool_state **allocpool)
{
	struct dmem_cgroup_pool_state *pool, *ppool, *retpool;
	struct dmemcg_state *p, *pp;

	/*
	 * Recursively create pool, we may not initialize yet on
	 * recursion, this is done as a separate step.
	 */
	for (p = dmemcs; p; p = parent_dmemcs(p)) {
		pool = find_cg_pool_locked(p, region);
		if (!pool)
			pool = alloc_pool_single(p, region, allocpool);

		if (IS_ERR(pool))
			return pool;

		if (p == dmemcs && pool->inited)
			return pool;

		if (pool->inited)
			break;
	}

	retpool = pool = find_cg_pool_locked(dmemcs, region);
	for (p = dmemcs, pp = parent_dmemcs(dmemcs); pp; p = pp, pp = parent_dmemcs(p)) {
		if (pool->inited)
			break;

		/* ppool was created if it didn't exist by above loop. */
		ppool = find_cg_pool_locked(pp, region);

		/* Fix up parent links, mark as inited. */
		pool->cnt.parent = &ppool->cnt;
		pool->inited = true;

		pool = ppool;
	}

	return retpool;
}

static void dmemcg_free_rcu(struct rcu_head *rcu)
{
	struct dmem_cgroup_region *region = container_of(rcu, typeof(*region), rcu);
	struct dmem_cgroup_pool_state *pool, *next;

	list_for_each_entry_safe(pool, next, &region->pools, region_node)
		free_cg_pool(pool);
	kfree(region->name);
	kfree(region);
}

static void dmemcg_free_region(struct kref *ref)
{
	struct dmem_cgroup_region *cgregion = container_of(ref, typeof(*cgregion), ref);

	call_rcu(&cgregion->rcu, dmemcg_free_rcu);
}

/**
 * dmem_cgroup_unregister_region() - Unregister a previously registered region.
 * @region: The region to unregister.
 *
 * This function undoes dmem_cgroup_register_region.
 */
void dmem_cgroup_unregister_region(struct dmem_cgroup_region *region)
{
	struct list_head *entry;

	if (!region)
		return;

	spin_lock(&dmemcg_lock);

	/* Remove from global region list */
	list_del_rcu(&region->region_node);

	list_for_each_rcu(entry, &region->pools) {
		struct dmem_cgroup_pool_state *pool =
			container_of(entry, typeof(*pool), region_node);

		list_del_rcu(&pool->css_node);
	}

	/*
	 * Ensure any RCU based lookups fail. Additionally,
	 * no new pools should be added to the dead region
	 * by get_cg_pool_unlocked.
	 */
	region->unregistered = true;
	spin_unlock(&dmemcg_lock);

	kref_put(&region->ref, dmemcg_free_region);
}
EXPORT_SYMBOL_GPL(dmem_cgroup_unregister_region);

/**
 * dmem_cgroup_register_region() - Register a region for the dmem cgroup.
 * @size: Size of region to register, in bytes.
 * @fmt: printf-style format used to construct the region name
 *
 * This function registers a node in the dmem cgroup with the
 * name given. After calling this function, the region can be
 * used for allocations.
 *
 * Return: NULL if @size is zero, a valid region pointer on success,
 * or an ERR_PTR() on failure.
 */
struct dmem_cgroup_region *dmem_cgroup_register_region(u64 size, const char *fmt, ...)
{
	struct dmem_cgroup_region *ret;
	char *region_name;
	va_list ap;

	if (!size)
		return NULL;

	va_start(ap, fmt);
	region_name = kvasprintf(GFP_KERNEL, fmt, ap);
	va_end(ap);
	if (!region_name)
		return ERR_PTR(-ENOMEM);

	ret = kzalloc(sizeof(*ret), GFP_KERNEL);
	if (!ret) {
		kfree(region_name);
		return ERR_PTR(-ENOMEM);
	}

	INIT_LIST_HEAD(&ret->pools);
	ret->name = region_name;
	ret->size = size;
	kref_init(&ret->ref);

	spin_lock(&dmemcg_lock);
	list_add_tail_rcu(&ret->region_node, &dmem_cgroup_regions);
	spin_unlock(&dmemcg_lock);

	return ret;
}
EXPORT_SYMBOL_GPL(dmem_cgroup_register_region);

static struct dmem_cgroup_region *dmemcg_get_region_by_name(const char *name)
{
	struct dmem_cgroup_region *region;

	list_for_each_entry_rcu(region, &dmem_cgroup_regions, region_node, spin_is_locked(&dmemcg_lock))
		if (!strcmp(name, region->name) &&
		    kref_get_unless_zero(&region->ref))
			return region;

	return NULL;
}

/**
 * dmem_cgroup_pool_state_put() - Drop a reference to a dmem_cgroup_pool_state
 * @pool: &dmem_cgroup_pool_state
 *
 * Called to drop a reference to the limiting pool returned by
 * dmem_cgroup_try_charge().
 */
void dmem_cgroup_pool_state_put(struct dmem_cgroup_pool_state *pool)
{
	if (pool)
		css_put(&pool->cs->css);
}
EXPORT_SYMBOL_GPL(dmem_cgroup_pool_state_put);

static struct dmem_cgroup_pool_state *
get_cg_pool_unlocked(struct dmemcg_state *cg, struct dmem_cgroup_region *region)
{
	struct dmem_cgroup_pool_state *pool, *allocpool = NULL;

	/* fastpath lookup? */
	rcu_read_lock();
	pool = find_cg_pool_locked(cg, region);
	if (pool && !READ_ONCE(pool->inited))
		pool = NULL;
	rcu_read_unlock();

	while (!pool) {
		spin_lock(&dmemcg_lock);
		if (!region->unregistered)
			pool = get_cg_pool_locked(cg, region, &allocpool);
		else
			pool = ERR_PTR(-ENODEV);
		spin_unlock(&dmemcg_lock);

		if (pool == ERR_PTR(-ENOMEM)) {
			pool = NULL;
			if (WARN_ON(allocpool))
				continue;

			allocpool = kzalloc(sizeof(*allocpool), GFP_KERNEL);
			if (allocpool) {
				pool = NULL;
				continue;
			}
		}
	}

	kfree(allocpool);
	return pool;
}

/**
 * dmem_cgroup_uncharge() - Uncharge a pool.
 * @pool: Pool to uncharge.
 * @size: Size to uncharge.
 *
 * Undoes the effects of dmem_cgroup_try_charge.
 * Must be called with the pool returned by dmem_cgroup_try_charge()
 * and the same @size.
 */
void dmem_cgroup_uncharge(struct dmem_cgroup_pool_state *pool, u64 size)
{
	if (!pool)
		return;

	page_counter_uncharge(&pool->cnt, size);
	css_put(&pool->cs->css);
}
EXPORT_SYMBOL_GPL(dmem_cgroup_uncharge);

/**
 * dmem_cgroup_try_charge() - Try charging a new allocation to a region.
 * @region: dmem region to charge
 * @size: Size (in bytes) to charge.
 * @ret_pool: On successful allocation, the pool that is charged.
 * @ret_limit_pool: On a failed allocation, the limiting pool.
 *
 * This function charges the @region region for a size of @size bytes.
 *
 * If the function succeeds, @ret_pool is set, which must be passed to
 * dmem_cgroup_uncharge() when undoing the allocation.
 *
 * When this function fails with -EAGAIN and @ret_limit_pool is non-null, it
 * will be set to the pool for which the limit is hit. This can be used for
 * eviction as argument to dmem_cgroup_state_evict_valuable(). This reference
 * must be freed with dmem_cgroup_pool_state_put().
 *
 * Return: 0 on success, -EAGAIN on hitting a limit, or a negative errno on failure.
 */
int dmem_cgroup_try_charge(struct dmem_cgroup_region *region, u64 size,
			  struct dmem_cgroup_pool_state **ret_pool,
			  struct dmem_cgroup_pool_state **ret_limit_pool)
{
	struct dmemcg_state *cg;
	struct dmem_cgroup_pool_state *pool;
	struct page_counter *fail;
	int ret;

	*ret_pool = NULL;
	if (ret_limit_pool)
		*ret_limit_pool = NULL;

	/*
	 * hold on to css, as cgroup can be removed but resource
	 * accounting happens on css.
	 */
	cg = get_current_dmemcs();

	pool = get_cg_pool_unlocked(cg, region);
	if (IS_ERR(pool)) {
		ret = PTR_ERR(pool);
		goto err;
	}

	if (!page_counter_try_charge(&pool->cnt, size, &fail)) {
		if (ret_limit_pool) {
			*ret_limit_pool = container_of(fail, struct dmem_cgroup_pool_state, cnt);
			css_get(&(*ret_limit_pool)->cs->css);
		}
		ret = -EAGAIN;
		goto err;
	}

	/* On success, reference from get_current_dmemcs is transferred to *ret_pool */
	*ret_pool = pool;
	return 0;

err:
	css_put(&cg->css);
	return ret;
}
EXPORT_SYMBOL_GPL(dmem_cgroup_try_charge);

static int dmem_cgroup_region_capacity_show(struct seq_file *sf, void *v)
{
	struct dmem_cgroup_region *region;

	rcu_read_lock();
	list_for_each_entry_rcu(region, &dmem_cgroup_regions, region_node) {
		seq_puts(sf, region->name);
		seq_printf(sf, " %llu\n", region->size);
	}
	rcu_read_unlock();
	return 0;
}

static int dmemcg_parse_limit(char *options, struct dmem_cgroup_region *region,
			      u64 *new_limit)
{
	char *end;

	if (!strcmp(options, "max")) {
		*new_limit = PAGE_COUNTER_MAX;
		return 0;
	}

	*new_limit = memparse(options, &end);
	if (*end != '\0')
		return -EINVAL;

	return 0;
}

static ssize_t dmemcg_limit_write(struct kernfs_open_file *of,
				 char *buf, size_t nbytes, loff_t off,
				 void (*apply)(struct dmem_cgroup_pool_state *, u64))
{
	struct dmemcg_state *dmemcs = css_to_dmemcs(of_css(of));
	int err = 0;

	while (buf && !err) {
		struct dmem_cgroup_pool_state *pool = NULL;
		char *options, *region_name;
		struct dmem_cgroup_region *region;
		u64 new_limit;

		options = buf;
		buf = strchr(buf, '\n');
		if (buf)
			*buf++ = '\0';

		options = strstrip(options);

		/* eat empty lines */
		if (!options[0])
			continue;

		region_name = strsep(&options, " \t");
		if (!region_name[0])
			continue;

		rcu_read_lock();
		region = dmemcg_get_region_by_name(region_name);
		rcu_read_unlock();

		if (!region)
			return -EINVAL;

		err = dmemcg_parse_limit(options, region, &new_limit);
		if (err < 0)
			goto out_put;

		pool = get_cg_pool_unlocked(dmemcs, region);
		if (IS_ERR(pool)) {
			err = PTR_ERR(pool);
			goto out_put;
		}

		/* And commit */
		apply(pool, new_limit);

out_put:
		kref_put(&region->ref, dmemcg_free_region);
	}

	return err ?: nbytes;
}

static int dmemcg_limit_show(struct seq_file *sf, void *v,
			    u64 (*fn)(struct dmem_cgroup_pool_state *))
{
	struct dmemcg_state *dmemcs = css_to_dmemcs(seq_css(sf));
	struct dmem_cgroup_region *region;

	rcu_read_lock();
	list_for_each_entry_rcu(region, &dmem_cgroup_regions, region_node) {
		struct dmem_cgroup_pool_state *pool = find_cg_pool_locked(dmemcs, region);
		u64 val;

		seq_puts(sf, region->name);

		val = fn(pool);
		if (val < PAGE_COUNTER_MAX)
			seq_printf(sf, " %lld\n", val);
		else
			seq_puts(sf, " max\n");
	}
	rcu_read_unlock();

	return 0;
}

static int dmem_cgroup_region_current_show(struct seq_file *sf, void *v)
{
	return dmemcg_limit_show(sf, v, get_resource_current);
}

static int dmem_cgroup_region_min_show(struct seq_file *sf, void *v)
{
	return dmemcg_limit_show(sf, v, get_resource_min);
}

static ssize_t dmem_cgroup_region_min_write(struct kernfs_open_file *of,
				      char *buf, size_t nbytes, loff_t off)
{
	return dmemcg_limit_write(of, buf, nbytes, off, set_resource_min);
}

static int dmem_cgroup_region_low_show(struct seq_file *sf, void *v)
{
	return dmemcg_limit_show(sf, v, get_resource_low);
}

static ssize_t dmem_cgroup_region_low_write(struct kernfs_open_file *of,
				      char *buf, size_t nbytes, loff_t off)
{
	return dmemcg_limit_write(of, buf, nbytes, off, set_resource_low);
}

static int dmem_cgroup_region_max_show(struct seq_file *sf, void *v)
{
	return dmemcg_limit_show(sf, v, get_resource_max);
}

static ssize_t dmem_cgroup_region_max_write(struct kernfs_open_file *of,
				      char *buf, size_t nbytes, loff_t off)
{
	return dmemcg_limit_write(of, buf, nbytes, off, set_resource_max);
}

static struct cftype files[] = {
	{
		.name = "capacity",
		.seq_show = dmem_cgroup_region_capacity_show,
		.flags = CFTYPE_ONLY_ON_ROOT,
	},
	{
		.name = "current",
		.seq_show = dmem_cgroup_region_current_show,
	},
	{
		.name = "min",
		.write = dmem_cgroup_region_min_write,
		.seq_show = dmem_cgroup_region_min_show,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		.name = "low",
		.write = dmem_cgroup_region_low_write,
		.seq_show = dmem_cgroup_region_low_show,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		.name = "max",
		.write = dmem_cgroup_region_max_write,
		.seq_show = dmem_cgroup_region_max_show,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{ } /* Zero entry terminates. */
};

struct cgroup_subsys dmem_cgrp_subsys = {
	.css_alloc	= dmemcs_alloc,
	.css_free	= dmemcs_free,
	.css_offline	= dmemcs_offline,
	.legacy_cftypes	= files,
	.dfl_cftypes	= files,
};
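
For orientation, here is a minimal, hypothetical driver-side sketch of the charging API exported by this file. The function calls (dmem_cgroup_try_charge, dmem_cgroup_pool_state_put, dmem_cgroup_uncharge) are the exports documented above; the helper name sketch_charge() and the error-handling policy are illustrative assumptions, not upstream code:

	#include <linux/cgroup_dmem.h>

	/* Illustrative only: charge @size bytes of a dmem region for the current task's cgroup. */
	static int sketch_charge(struct dmem_cgroup_region *region, u64 size)
	{
		struct dmem_cgroup_pool_state *pool, *limit_pool = NULL;
		int ret;

		ret = dmem_cgroup_try_charge(region, size, &pool, &limit_pool);
		if (ret == -EAGAIN && limit_pool) {
			/*
			 * A limit was hit. A driver could now test candidate cgroups with
			 * dmem_cgroup_state_evict_valuable(limit_pool, victim, false, &hit_low),
			 * evict, and retry the charge. The reference on limit_pool must
			 * always be dropped.
			 */
			dmem_cgroup_pool_state_put(limit_pool);
		}
		if (ret)
			return ret;

		/* ... allocation succeeded; when it is freed, undo the charge: */
		dmem_cgroup_uncharge(pool, size);
		return 0;
	}

A region itself would typically be created once with dmem_cgroup_register_region(size, "fmt", ...) when the device is probed and torn down with dmem_cgroup_unregister_region() on removal, as described in the kernel-doc above.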