mirror of
				https://github.com/torvalds/linux.git
				synced 2025-11-04 02:30:34 +02:00 
			
		
		
		
	A memleak was found as below:
unreferenced object 0xffff8881010d2a80 (size 32):
  comm "mkdir", pid 1559, jiffies 4294932666
  hex dump (first 32 bytes):
    00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
    40 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  @...............
  backtrace (crc 2e7ef6fa):
    [<ffffffff81372754>] __kmalloc_node_noprof+0x394/0x470
    [<ffffffff813024ab>] alloc_shrinker_info+0x7b/0x1a0
    [<ffffffff813b526a>] mem_cgroup_css_online+0x11a/0x3b0
    [<ffffffff81198dd9>] online_css+0x29/0xa0
    [<ffffffff811a243d>] cgroup_apply_control_enable+0x20d/0x360
    [<ffffffff811a5728>] cgroup_mkdir+0x168/0x5f0
    [<ffffffff8148543e>] kernfs_iop_mkdir+0x5e/0x90
    [<ffffffff813dbb24>] vfs_mkdir+0x144/0x220
    [<ffffffff813e1c97>] do_mkdirat+0x87/0x130
    [<ffffffff813e1de9>] __x64_sys_mkdir+0x49/0x70
    [<ffffffff81f8c928>] do_syscall_64+0x68/0x140
    [<ffffffff8200012f>] entry_SYSCALL_64_after_hwframe+0x76/0x7e
In alloc_shrinker_info(), when shrinker_unit_alloc() returns an error, the
info won't be freed.  Just fix it.
Link: https://lkml.kernel.org/r/20241025060942.1049263-1-chenridong@huaweicloud.com
Fixes: 307bececcd ("mm: shrinker: add a secondary array for shrinker_info::{map, nr_deferred}")
Signed-off-by: Chen Ridong <chenridong@huawei.com>
Acked-by: Qi Zheng <zhengqi.arch@bytedance.com>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Wang Weiyang <wangweiyang2@huawei.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
		
	
			
		
			
				
	
	
		
			811 lines
		
	
	
	
		
			21 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			811 lines
		
	
	
	
		
			21 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
// SPDX-License-Identifier: GPL-2.0
 | 
						|
#include <linux/memcontrol.h>
 | 
						|
#include <linux/rwsem.h>
 | 
						|
#include <linux/shrinker.h>
 | 
						|
#include <linux/rculist.h>
 | 
						|
#include <trace/events/vmscan.h>
 | 
						|
 | 
						|
#include "internal.h"
 | 
						|
 | 
						|
/* All registered shrinkers; walked under RCU by shrink_slab(). */
LIST_HEAD(shrinker_list);
/* Serializes shrinker (un)registration and shrinker_info expansion. */
DEFINE_MUTEX(shrinker_mutex);

#ifdef CONFIG_MEMCG
/* Current capacity, in bits, of every memcg's per-node shrinker_info map. */
static int shrinker_nr_max;
 | 
						|
 | 
						|
static inline int shrinker_unit_size(int nr_items)
 | 
						|
{
 | 
						|
	return (DIV_ROUND_UP(nr_items, SHRINKER_UNIT_BITS) * sizeof(struct shrinker_info_unit *));
 | 
						|
}
 | 
						|
 | 
						|
static inline void shrinker_unit_free(struct shrinker_info *info, int start)
 | 
						|
{
 | 
						|
	struct shrinker_info_unit **unit;
 | 
						|
	int nr, i;
 | 
						|
 | 
						|
	if (!info)
 | 
						|
		return;
 | 
						|
 | 
						|
	unit = info->unit;
 | 
						|
	nr = DIV_ROUND_UP(info->map_nr_max, SHRINKER_UNIT_BITS);
 | 
						|
 | 
						|
	for (i = start; i < nr; i++) {
 | 
						|
		if (!unit[i])
 | 
						|
			break;
 | 
						|
 | 
						|
		kfree(unit[i]);
 | 
						|
		unit[i] = NULL;
 | 
						|
	}
 | 
						|
}
 | 
						|
 | 
						|
static inline int shrinker_unit_alloc(struct shrinker_info *new,
 | 
						|
				       struct shrinker_info *old, int nid)
 | 
						|
{
 | 
						|
	struct shrinker_info_unit *unit;
 | 
						|
	int nr = DIV_ROUND_UP(new->map_nr_max, SHRINKER_UNIT_BITS);
 | 
						|
	int start = old ? DIV_ROUND_UP(old->map_nr_max, SHRINKER_UNIT_BITS) : 0;
 | 
						|
	int i;
 | 
						|
 | 
						|
	for (i = start; i < nr; i++) {
 | 
						|
		unit = kzalloc_node(sizeof(*unit), GFP_KERNEL, nid);
 | 
						|
		if (!unit) {
 | 
						|
			shrinker_unit_free(new, start);
 | 
						|
			return -ENOMEM;
 | 
						|
		}
 | 
						|
 | 
						|
		new->unit[i] = unit;
 | 
						|
	}
 | 
						|
 | 
						|
	return 0;
 | 
						|
}
 | 
						|
 | 
						|
/*
 * Tear down every per-node shrinker_info of @memcg, including all of its
 * shrinker_info_unit objects.  Callers must guarantee exclusive access to
 * the shrinker_info pointers (hence rcu_dereference_protected(..., true)).
 * A node whose info was never allocated is fine: shrinker_unit_free() and
 * kvfree() both tolerate NULL.
 */
void free_shrinker_info(struct mem_cgroup *memcg)
{
	struct mem_cgroup_per_node *pn;
	struct shrinker_info *info;
	int nid;

	for_each_node(nid) {
		pn = memcg->nodeinfo[nid];
		info = rcu_dereference_protected(pn->shrinker_info, true);
		shrinker_unit_free(info, 0);
		kvfree(info);
		rcu_assign_pointer(pn->shrinker_info, NULL);
	}
}
 | 
						|
 | 
						|
/*
 * Allocate a per-node shrinker_info (map + nr_deferred units) for @memcg,
 * sized for the current shrinker_nr_max.  shrinker_mutex prevents a
 * concurrent expand_shrinker_info() from changing shrinker_nr_max under us.
 * Returns 0 on success or -ENOMEM; on failure everything allocated so far
 * is released via free_shrinker_info().
 */
int alloc_shrinker_info(struct mem_cgroup *memcg)
{
	int nid, ret = 0;
	int array_size = 0;

	mutex_lock(&shrinker_mutex);
	array_size = shrinker_unit_size(shrinker_nr_max);
	for_each_node(nid) {
		struct shrinker_info *info = kvzalloc_node(sizeof(*info) + array_size,
							   GFP_KERNEL, nid);
		if (!info)
			goto err;
		info->map_nr_max = shrinker_nr_max;
		if (shrinker_unit_alloc(info, NULL, nid)) {
			/*
			 * info is not published yet, so free_shrinker_info()
			 * below cannot see it — free it here or it leaks.
			 */
			kvfree(info);
			goto err;
		}
		rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
	}
	mutex_unlock(&shrinker_mutex);

	return ret;

err:
	mutex_unlock(&shrinker_mutex);
	/* Unwind the infos already published on earlier nodes. */
	free_shrinker_info(memcg);
	return -ENOMEM;
}
 | 
						|
 | 
						|
/*
 * Dereference @memcg's per-node shrinker_info while holding
 * shrinker_mutex (checked by lockdep), i.e. without an RCU read lock.
 */
static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
						     int nid)
{
	return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
					 lockdep_is_held(&shrinker_mutex));
}
 | 
						|
 | 
						|
/*
 * Grow every per-node shrinker_info of @memcg to hold @new_nr_max bits.
 * The old unit pointers (@old_size bytes) are copied into the new info,
 * the extra slots are allocated, and the new info is published with
 * rcu_assign_pointer(); the old container is freed after a grace period
 * via kvfree_rcu() so lockless readers stay safe.  Caller holds
 * shrinker_mutex (see shrinker_info_protected()).  Returns 0 or -ENOMEM.
 */
static int expand_one_shrinker_info(struct mem_cgroup *memcg, int new_size,
				    int old_size, int new_nr_max)
{
	struct shrinker_info *new, *old;
	struct mem_cgroup_per_node *pn;
	int nid;

	for_each_node(nid) {
		pn = memcg->nodeinfo[nid];
		old = shrinker_info_protected(memcg, nid);
		/* Not yet online memcg */
		if (!old)
			return 0;

		/* Already expanded this shrinker_info */
		if (new_nr_max <= old->map_nr_max)
			continue;

		new = kvzalloc_node(sizeof(*new) + new_size, GFP_KERNEL, nid);
		if (!new)
			return -ENOMEM;

		new->map_nr_max = new_nr_max;

		/* Keep the old units; allocate only the newly added slots. */
		memcpy(new->unit, old->unit, old_size);
		if (shrinker_unit_alloc(new, old, nid)) {
			kvfree(new);
			return -ENOMEM;
		}

		rcu_assign_pointer(pn->shrinker_info, new);
		kvfree_rcu(old, rcu);
	}

	return 0;
}
 | 
						|
 | 
						|
/*
 * Make room for shrinker id @new_id in every memcg's shrinker_info,
 * rounding the capacity up to a multiple of SHRINKER_UNIT_BITS.  Called
 * with shrinker_mutex held (asserted below).  On full success (or when
 * no root memcg exists yet) shrinker_nr_max is raised to the new size.
 */
static int expand_shrinker_info(int new_id)
{
	int ret = 0;
	int new_nr_max = round_up(new_id + 1, SHRINKER_UNIT_BITS);
	int new_size, old_size = 0;
	struct mem_cgroup *memcg;

	if (!root_mem_cgroup)
		goto out;

	lockdep_assert_held(&shrinker_mutex);

	new_size = shrinker_unit_size(new_nr_max);
	old_size = shrinker_unit_size(shrinker_nr_max);

	/* Walk the whole memcg hierarchy, expanding each memcg in turn. */
	memcg = mem_cgroup_iter(NULL, NULL, NULL);
	do {
		ret = expand_one_shrinker_info(memcg, new_size, old_size,
					       new_nr_max);
		if (ret) {
			mem_cgroup_iter_break(NULL, memcg);
			goto out;
		}
	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
out:
	if (!ret)
		shrinker_nr_max = new_nr_max;

	return ret;
}
 | 
						|
 | 
						|
/* Which shrinker_info_unit holds @shrinker_id's bit. */
static inline int shrinker_id_to_index(int shrinker_id)
{
	return shrinker_id / SHRINKER_UNIT_BITS;
}
 | 
						|
 | 
						|
/* Bit offset of @shrinker_id within its shrinker_info_unit. */
static inline int shrinker_id_to_offset(int shrinker_id)
{
	return shrinker_id % SHRINKER_UNIT_BITS;
}
 | 
						|
 | 
						|
/* Inverse of the two helpers above: (unit index, bit offset) -> id. */
static inline int calc_shrinker_id(int index, int offset)
{
	return index * SHRINKER_UNIT_BITS + offset;
}
 | 
						|
 | 
						|
/*
 * Mark shrinker @shrinker_id as having work for (@memcg, @nid) so that
 * shrink_slab_memcg() will visit it.  No-op for the root memcg, invalid
 * ids, or a NULL memcg.  RCU protects the shrinker_info against a
 * concurrent expand_one_shrinker_info() replacing it.
 */
void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
{
	if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
		struct shrinker_info *info;
		struct shrinker_info_unit *unit;

		rcu_read_lock();
		info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
		unit = info->unit[shrinker_id_to_index(shrinker_id)];
		if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) {
			/* Pairs with smp mb in shrink_slab() */
			smp_mb__before_atomic();
			set_bit(shrinker_id_to_offset(shrinker_id), unit->map);
		}
		rcu_read_unlock();
	}
}
 | 
						|
 | 
						|
static DEFINE_IDR(shrinker_idr);
 | 
						|
 | 
						|
/*
 * Assign @shrinker an id in shrinker_idr and, if the id exceeds the
 * current map capacity, grow every memcg's shrinker_info to fit it.
 * Returns 0 on success, -ENOSYS when memcg is disabled (caller falls
 * back to a non-memcg-aware shrinker), or -ENOMEM.
 */
static int shrinker_memcg_alloc(struct shrinker *shrinker)
{
	int id, ret = -ENOMEM;

	if (mem_cgroup_disabled())
		return -ENOSYS;

	mutex_lock(&shrinker_mutex);
	id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
	if (id < 0)
		goto unlock;

	if (id >= shrinker_nr_max) {
		/* On expansion failure, give the id back. */
		if (expand_shrinker_info(id)) {
			idr_remove(&shrinker_idr, id);
			goto unlock;
		}
	}
	shrinker->id = id;
	ret = 0;
unlock:
	mutex_unlock(&shrinker_mutex);
	return ret;
}
 | 
						|
 | 
						|
/*
 * Release @shrinker's id.  Caller holds shrinker_mutex (asserted), so
 * this cannot race with shrinker_memcg_alloc() or map expansion.
 */
static void shrinker_memcg_remove(struct shrinker *shrinker)
{
	int id = shrinker->id;

	BUG_ON(id < 0);

	lockdep_assert_held(&shrinker_mutex);

	idr_remove(&shrinker_idr, id);
}
 | 
						|
 | 
						|
/*
 * Atomically fetch and zero the per-memcg deferred-scan count for
 * (@shrinker, @nid, @memcg).  RCU guards the shrinker_info against a
 * concurrent expansion swapping the container out from under us.
 */
static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
				   struct mem_cgroup *memcg)
{
	struct shrinker_info *info;
	struct shrinker_info_unit *unit;
	long nr_deferred;

	rcu_read_lock();
	info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
	unit = info->unit[shrinker_id_to_index(shrinker->id)];
	nr_deferred = atomic_long_xchg(&unit->nr_deferred[shrinker_id_to_offset(shrinker->id)], 0);
	rcu_read_unlock();

	return nr_deferred;
}
 | 
						|
 | 
						|
/*
 * Atomically add @nr to the per-memcg deferred-scan count for
 * (@shrinker, @nid, @memcg) and return the new total.  Locking mirrors
 * xchg_nr_deferred_memcg() above.
 */
static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
				  struct mem_cgroup *memcg)
{
	struct shrinker_info *info;
	struct shrinker_info_unit *unit;
	long nr_deferred;

	rcu_read_lock();
	info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
	unit = info->unit[shrinker_id_to_index(shrinker->id)];
	nr_deferred =
		atomic_long_add_return(nr, &unit->nr_deferred[shrinker_id_to_offset(shrinker->id)]);
	rcu_read_unlock();

	return nr_deferred;
}
 | 
						|
 | 
						|
/*
 * Fold @memcg's deferred-scan counts into its parent (or the root memcg
 * when it has no parent), typically on memcg offlining, so the deferred
 * work is not lost.  shrinker_mutex keeps both infos stable while the
 * counts are transferred.
 */
void reparent_shrinker_deferred(struct mem_cgroup *memcg)
{
	int nid, index, offset;
	long nr;
	struct mem_cgroup *parent;
	struct shrinker_info *child_info, *parent_info;
	struct shrinker_info_unit *child_unit, *parent_unit;

	parent = parent_mem_cgroup(memcg);
	if (!parent)
		parent = root_mem_cgroup;

	/* Prevent from concurrent shrinker_info expand */
	mutex_lock(&shrinker_mutex);
	for_each_node(nid) {
		child_info = shrinker_info_protected(memcg, nid);
		parent_info = shrinker_info_protected(parent, nid);
		for (index = 0; index < shrinker_id_to_index(child_info->map_nr_max); index++) {
			child_unit = child_info->unit[index];
			parent_unit = parent_info->unit[index];
			for (offset = 0; offset < SHRINKER_UNIT_BITS; offset++) {
				nr = atomic_long_read(&child_unit->nr_deferred[offset]);
				atomic_long_add(nr, &parent_unit->nr_deferred[offset]);
			}
		}
	}
	mutex_unlock(&shrinker_mutex);
}
 | 
						|
#else
/* !CONFIG_MEMCG stub: memcg-aware shrinkers are not supported. */
static int shrinker_memcg_alloc(struct shrinker *shrinker)
{
	return -ENOSYS;
}
 | 
						|
 | 
						|
/* !CONFIG_MEMCG stub: nothing to release. */
static void shrinker_memcg_remove(struct shrinker *shrinker)
{
}
 | 
						|
 | 
						|
/* !CONFIG_MEMCG stub: no per-memcg deferred counts exist. */
static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
				   struct mem_cgroup *memcg)
{
	return 0;
}
 | 
						|
 | 
						|
/* !CONFIG_MEMCG stub: no per-memcg deferred counts exist. */
static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
				  struct mem_cgroup *memcg)
{
	return 0;
}
#endif /* CONFIG_MEMCG */
 | 
						|
 | 
						|
static long xchg_nr_deferred(struct shrinker *shrinker,
 | 
						|
			     struct shrink_control *sc)
 | 
						|
{
 | 
						|
	int nid = sc->nid;
 | 
						|
 | 
						|
	if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
 | 
						|
		nid = 0;
 | 
						|
 | 
						|
	if (sc->memcg &&
 | 
						|
	    (shrinker->flags & SHRINKER_MEMCG_AWARE))
 | 
						|
		return xchg_nr_deferred_memcg(nid, shrinker,
 | 
						|
					      sc->memcg);
 | 
						|
 | 
						|
	return atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
static long add_nr_deferred(long nr, struct shrinker *shrinker,
 | 
						|
			    struct shrink_control *sc)
 | 
						|
{
 | 
						|
	int nid = sc->nid;
 | 
						|
 | 
						|
	if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
 | 
						|
		nid = 0;
 | 
						|
 | 
						|
	if (sc->memcg &&
 | 
						|
	    (shrinker->flags & SHRINKER_MEMCG_AWARE))
 | 
						|
		return add_nr_deferred_memcg(nr, nid, shrinker,
 | 
						|
					     sc->memcg);
 | 
						|
 | 
						|
	return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]);
 | 
						|
}
 | 
						|
 | 
						|
/* Default number of objects scanned per scan_objects() call. */
#define SHRINK_BATCH 128

/*
 * Run one shrinker against the target described by @shrinkctl: ask it how
 * many objects are freeable, compute a scan target scaled by @priority
 * plus previously deferred work, then call scan_objects() in batches.
 * Unfinished work is pushed back via add_nr_deferred().  Returns the
 * number of objects freed, or SHRINK_EMPTY if the shrinker had nothing.
 */
static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
				    struct shrinker *shrinker, int priority)
{
	unsigned long freed = 0;
	unsigned long long delta;
	long total_scan;
	long freeable;
	long nr;
	long new_nr;
	long batch_size = shrinker->batch ? shrinker->batch
					  : SHRINK_BATCH;
	long scanned = 0, next_deferred;

	freeable = shrinker->count_objects(shrinker, shrinkctl);
	if (freeable == 0 || freeable == SHRINK_EMPTY)
		return freeable;

	/*
	 * copy the current shrinker scan count into a local variable
	 * and zero it so that other concurrent shrinker invocations
	 * don't also do this scanning work.
	 */
	nr = xchg_nr_deferred(shrinker, shrinkctl);

	if (shrinker->seeks) {
		/* delta = 4 * (freeable >> priority) / seeks */
		delta = freeable >> priority;
		delta *= 4;
		do_div(delta, shrinker->seeks);
	} else {
		/*
		 * These objects don't require any IO to create. Trim
		 * them aggressively under memory pressure to keep
		 * them from causing refetches in the IO caches.
		 */
		delta = freeable / 2;
	}

	/* New work plus a priority-scaled share of the deferred backlog. */
	total_scan = nr >> priority;
	total_scan += delta;
	total_scan = min(total_scan, (2 * freeable));

	trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
				   freeable, delta, total_scan, priority);

	/*
	 * Normally, we should not scan less than batch_size objects in one
	 * pass to avoid too frequent shrinker calls, but if the slab has less
	 * than batch_size objects in total and we are really tight on memory,
	 * we will try to reclaim all available objects, otherwise we can end
	 * up failing allocations although there are plenty of reclaimable
	 * objects spread over several slabs with usage less than the
	 * batch_size.
	 *
	 * We detect the "tight on memory" situations by looking at the total
	 * number of objects we want to scan (total_scan). If it is greater
	 * than the total number of objects on slab (freeable), we must be
	 * scanning at high prio and therefore should try to reclaim as much as
	 * possible.
	 */
	while (total_scan >= batch_size ||
	       total_scan >= freeable) {
		unsigned long ret;
		unsigned long nr_to_scan = min(batch_size, total_scan);

		shrinkctl->nr_to_scan = nr_to_scan;
		shrinkctl->nr_scanned = nr_to_scan;
		ret = shrinker->scan_objects(shrinker, shrinkctl);
		if (ret == SHRINK_STOP)
			break;
		freed += ret;

		count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
		total_scan -= shrinkctl->nr_scanned;
		scanned += shrinkctl->nr_scanned;

		cond_resched();
	}

	/*
	 * The deferred work is increased by any new work (delta) that wasn't
	 * done, decreased by old deferred work that was done now.
	 *
	 * And it is capped to two times of the freeable items.
	 */
	next_deferred = max_t(long, (nr + delta - scanned), 0);
	next_deferred = min(next_deferred, (2 * freeable));

	/*
	 * move the unused scan count back into the shrinker in a
	 * manner that handles concurrent updates.
	 */
	new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl);

	trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan);
	return freed;
}
 | 
						|
 | 
						|
#ifdef CONFIG_MEMCG
/*
 * Shrink the slab caches of one (memcg, node) pair: walk the memcg's
 * shrinker bitmap unit by unit and run do_shrink_slab() for every set
 * bit.  Returns the total number of objects freed.
 *
 * Fix over the previous version: the "kmem disabled, not SHRINKER_NONSLAB"
 * skip path continued the loop after a successful shrinker_try_get()
 * without the matching shrinker_put(), leaking a reference and leaving
 * shrinker_free() waiting on ->done forever.  The reference is now
 * dropped before skipping.
 */
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
			struct mem_cgroup *memcg, int priority)
{
	struct shrinker_info *info;
	unsigned long ret, freed = 0;
	int offset, index = 0;

	if (!mem_cgroup_online(memcg))
		return 0;

	/*
	 * lockless algorithm of memcg shrink.
	 *
	 * The shrinker_info may be freed asynchronously via RCU in the
	 * expand_one_shrinker_info(), so the rcu_read_lock() needs to be used
	 * to ensure the existence of the shrinker_info.
	 *
	 * The shrinker_info_unit is never freed unless its corresponding memcg
	 * is destroyed. Here we already hold the refcount of memcg, so the
	 * memcg will not be destroyed, and of course shrinker_info_unit will
	 * not be freed.
	 *
	 * So in the memcg shrink:
	 *  step 1: use rcu_read_lock() to guarantee existence of the
	 *          shrinker_info.
	 *  step 2: after getting shrinker_info_unit we can safely release the
	 *          RCU lock.
	 *  step 3: traverse the bitmap and calculate shrinker_id
	 *  step 4: use rcu_read_lock() to guarantee existence of the shrinker.
	 *  step 5: use shrinker_id to find the shrinker, then use
	 *          shrinker_try_get() to guarantee existence of the shrinker,
	 *          then we can release the RCU lock to do do_shrink_slab() that
	 *          may sleep.
	 *  step 6: do shrinker_put() paired with step 5 to put the refcount,
	 *          if the refcount reaches 0, then wake up the waiter in
	 *          shrinker_free() by calling complete().
	 *          Note: here is different from the global shrink, we don't
	 *                need to acquire the RCU lock to guarantee existence of
	 *                the shrinker, because we don't need to use this
	 *                shrinker to traverse the next shrinker in the bitmap.
	 *  step 7: we have already exited the read-side of rcu critical section
	 *          before calling do_shrink_slab(), the shrinker_info may be
	 *          released in expand_one_shrinker_info(), so go back to step 1
	 *          to reacquire the shrinker_info.
	 */
again:
	rcu_read_lock();
	info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
	if (unlikely(!info))
		goto unlock;

	if (index < shrinker_id_to_index(info->map_nr_max)) {
		struct shrinker_info_unit *unit;

		unit = info->unit[index];

		rcu_read_unlock();

		for_each_set_bit(offset, unit->map, SHRINKER_UNIT_BITS) {
			struct shrink_control sc = {
				.gfp_mask = gfp_mask,
				.nid = nid,
				.memcg = memcg,
			};
			struct shrinker *shrinker;
			int shrinker_id = calc_shrinker_id(index, offset);

			rcu_read_lock();
			shrinker = idr_find(&shrinker_idr, shrinker_id);
			if (unlikely(!shrinker || !shrinker_try_get(shrinker))) {
				clear_bit(offset, unit->map);
				rcu_read_unlock();
				continue;
			}
			rcu_read_unlock();

			/* Call non-slab shrinkers even though kmem is disabled */
			if (!memcg_kmem_online() &&
			    !(shrinker->flags & SHRINKER_NONSLAB)) {
				/* Pair the step-5 reference before skipping. */
				shrinker_put(shrinker);
				continue;
			}

			ret = do_shrink_slab(&sc, shrinker, priority);
			if (ret == SHRINK_EMPTY) {
				clear_bit(offset, unit->map);
				/*
				 * After the shrinker reported that it had no objects to
				 * free, but before we cleared the corresponding bit in
				 * the memcg shrinker map, a new object might have been
				 * added. To make sure, we have the bit set in this
				 * case, we invoke the shrinker one more time and reset
				 * the bit if it reports that it is not empty anymore.
				 * The memory barrier here pairs with the barrier in
				 * set_shrinker_bit():
				 *
				 * list_lru_add()     shrink_slab_memcg()
				 *   list_add_tail()    clear_bit()
				 *   <MB>               <MB>
				 *   set_bit()          do_shrink_slab()
				 */
				smp_mb__after_atomic();
				ret = do_shrink_slab(&sc, shrinker, priority);
				if (ret == SHRINK_EMPTY)
					ret = 0;
				else
					set_shrinker_bit(memcg, nid, shrinker_id);
			}
			freed += ret;
			shrinker_put(shrinker);
		}

		index++;
		goto again;
	}
unlock:
	rcu_read_unlock();
	return freed;
}
 | 
						|
#else /* !CONFIG_MEMCG */
/* !CONFIG_MEMCG stub: there is no per-memcg slab shrinking. */
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
			struct mem_cgroup *memcg, int priority)
{
	return 0;
}
#endif /* CONFIG_MEMCG */
 | 
						|
 | 
						|
/**
 * shrink_slab - shrink slab caches
 * @gfp_mask: allocation context
 * @nid: node whose slab caches to target
 * @memcg: memory cgroup whose slab caches to target
 * @priority: the reclaim priority
 *
 * Call the shrink functions to age shrinkable caches.
 *
 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
 * unaware shrinkers will receive a node id of 0 instead.
 *
 * @memcg specifies the memory cgroup to target. Unaware shrinkers
 * are called only if it is the root cgroup.
 *
 * @priority is sc->priority, we take the number of objects and >> by priority
 * in order to get the scan target.
 *
 * Returns the number of reclaimed slab objects.
 */
unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg,
			  int priority)
{
	unsigned long ret, freed = 0;
	struct shrinker *shrinker;

	/*
	 * The root memcg might be allocated even though memcg is disabled
	 * via "cgroup_disable=memory" boot parameter.  This could make
	 * mem_cgroup_is_root() return false, then just run memcg slab
	 * shrink, but skip global shrink.  This may result in premature
	 * oom.
	 */
	if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
		return shrink_slab_memcg(gfp_mask, nid, memcg, priority);

	/*
	 * lockless algorithm of global shrink.
	 *
	 * In the unregistration setp, the shrinker will be freed asynchronously
	 * via RCU after its refcount reaches 0. So both rcu_read_lock() and
	 * shrinker_try_get() can be used to ensure the existence of the shrinker.
	 *
	 * So in the global shrink:
	 *  step 1: use rcu_read_lock() to guarantee existence of the shrinker
	 *          and the validity of the shrinker_list walk.
	 *  step 2: use shrinker_try_get() to try get the refcount, if successful,
	 *          then the existence of the shrinker can also be guaranteed,
	 *          so we can release the RCU lock to do do_shrink_slab() that
	 *          may sleep.
	 *  step 3: *MUST* to reacquire the RCU lock before calling shrinker_put(),
	 *          which ensures that neither this shrinker nor the next shrinker
	 *          will be freed in the next traversal operation.
	 *  step 4: do shrinker_put() paired with step 2 to put the refcount,
	 *          if the refcount reaches 0, then wake up the waiter in
	 *          shrinker_free() by calling complete().
	 */
	rcu_read_lock();
	list_for_each_entry_rcu(shrinker, &shrinker_list, list) {
		struct shrink_control sc = {
			.gfp_mask = gfp_mask,
			.nid = nid,
			.memcg = memcg,
		};

		/* Skip shrinkers that are being torn down (refcount hit 0). */
		if (!shrinker_try_get(shrinker))
			continue;

		rcu_read_unlock();

		ret = do_shrink_slab(&sc, shrinker, priority);
		if (ret == SHRINK_EMPTY)
			ret = 0;
		freed += ret;

		/* Step 3 above: relock before dropping our reference. */
		rcu_read_lock();
		shrinker_put(shrinker);
	}

	rcu_read_unlock();
	cond_resched();
	return freed;
}
 | 
						|
 | 
						|
/*
 * Allocate and initialize a struct shrinker.  @flags selects behavior
 * (SHRINKER_MEMCG_AWARE, SHRINKER_NUMA_AWARE, ...); @fmt/... name it for
 * debugfs.  Memcg-aware shrinkers get an id via shrinker_memcg_alloc();
 * when that reports -ENOSYS (memcg disabled) the shrinker silently falls
 * back to non-memcg-aware mode.  Returns the shrinker, or NULL on any
 * allocation failure.  Callers finish setup and call shrinker_register().
 */
struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...)
{
	struct shrinker *shrinker;
	unsigned int size;
	va_list ap;
	int err;

	shrinker = kzalloc(sizeof(struct shrinker), GFP_KERNEL);
	if (!shrinker)
		return NULL;

	va_start(ap, fmt);
	err = shrinker_debugfs_name_alloc(shrinker, fmt, ap);
	va_end(ap);
	if (err)
		goto err_name;

	shrinker->flags = flags | SHRINKER_ALLOCATED;
	shrinker->seeks = DEFAULT_SEEKS;

	if (flags & SHRINKER_MEMCG_AWARE) {
		err = shrinker_memcg_alloc(shrinker);
		if (err == -ENOSYS) {
			/* Memcg is not supported, fallback to non-memcg-aware shrinker. */
			shrinker->flags &= ~SHRINKER_MEMCG_AWARE;
			goto non_memcg;
		}

		if (err)
			goto err_flags;

		/* Memcg-aware: nr_deferred lives in each memcg's info. */
		return shrinker;
	}

non_memcg:
	/*
	 * The nr_deferred is available on per memcg level for memcg aware
	 * shrinkers, so only allocate nr_deferred in the following cases:
	 *  - non-memcg-aware shrinkers
	 *  - !CONFIG_MEMCG
	 *  - memcg is disabled by kernel command line
	 */
	size = sizeof(*shrinker->nr_deferred);
	if (flags & SHRINKER_NUMA_AWARE)
		size *= nr_node_ids;

	shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
	if (!shrinker->nr_deferred)
		goto err_flags;

	return shrinker;

err_flags:
	shrinker_debugfs_name_free(shrinker);
err_name:
	kfree(shrinker);
	return NULL;
}
EXPORT_SYMBOL_GPL(shrinker_alloc);
 | 
						|
 | 
						|
/*
 * Publish a shrinker obtained from shrinker_alloc(): add it to the global
 * shrinker_list and debugfs, then arm its refcount so lookups may use it.
 * Rejects shrinkers not created via shrinker_alloc() (SHRINKER_ALLOCATED
 * unset).
 */
void shrinker_register(struct shrinker *shrinker)
{
	if (unlikely(!(shrinker->flags & SHRINKER_ALLOCATED))) {
		pr_warn("Must use shrinker_alloc() to dynamically allocate the shrinker");
		return;
	}

	mutex_lock(&shrinker_mutex);
	list_add_tail_rcu(&shrinker->list, &shrinker_list);
	shrinker->flags |= SHRINKER_REGISTERED;
	shrinker_debugfs_add(shrinker);
	mutex_unlock(&shrinker_mutex);

	/* Completed by shrinker_put() when the refcount drops to zero. */
	init_completion(&shrinker->done);
	/*
	 * Now the shrinker is fully set up, take the first reference to it to
	 * indicate that lookup operations are now allowed to use it via
	 * shrinker_try_get().
	 */
	refcount_set(&shrinker->refcount, 1);
}
EXPORT_SYMBOL_GPL(shrinker_register);
 | 
						|
 | 
						|
static void shrinker_free_rcu_cb(struct rcu_head *head)
 | 
						|
{
 | 
						|
	struct shrinker *shrinker = container_of(head, struct shrinker, rcu);
 | 
						|
 | 
						|
	kfree(shrinker->nr_deferred);
 | 
						|
	kfree(shrinker);
 | 
						|
}
 | 
						|
 | 
						|
/*
 * Unregister and free a shrinker.  For a registered shrinker, drop the
 * initial reference taken in shrinker_register() and wait on ->done until
 * every in-flight lookup has put its reference; only then is it unlinked
 * and handed to RCU for the actual free, so concurrent shrink_slab()
 * walkers are never left with a dangling pointer.  NULL is a no-op.
 */
void shrinker_free(struct shrinker *shrinker)
{
	struct dentry *debugfs_entry = NULL;
	int debugfs_id;

	if (!shrinker)
		return;

	if (shrinker->flags & SHRINKER_REGISTERED) {
		/* drop the initial refcount */
		shrinker_put(shrinker);
		/*
		 * Wait for all lookups of the shrinker to complete, after that,
		 * no shrinker is running or will run again, then we can safely
		 * free it asynchronously via RCU and safely free the structure
		 * where the shrinker is located, such as super_block etc.
		 */
		wait_for_completion(&shrinker->done);
	}

	mutex_lock(&shrinker_mutex);
	if (shrinker->flags & SHRINKER_REGISTERED) {
		/*
		 * Now we can safely remove it from the shrinker_list and then
		 * free it.
		 */
		list_del_rcu(&shrinker->list);
		debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id);
		shrinker->flags &= ~SHRINKER_REGISTERED;
	}

	shrinker_debugfs_name_free(shrinker);

	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
		shrinker_memcg_remove(shrinker);
	mutex_unlock(&shrinker_mutex);

	/* debugfs removal must happen outside shrinker_mutex. */
	if (debugfs_entry)
		shrinker_debugfs_remove(debugfs_entry, debugfs_id);

	call_rcu(&shrinker->rcu, shrinker_free_rcu_cb);
}
EXPORT_SYMBOL_GPL(shrinker_free);
 |