mirror of
				https://github.com/torvalds/linux.git
				synced 2025-11-02 17:49:03 +02:00 
			
		
		
		
	92117d8443 ("bpf: fix refcnt overflow") turned refcounting of bpf_map into
potentially failing operation, when refcount reaches BPF_MAX_REFCNT limit
(32k). Due to using 32-bit counter, it's possible in practice to overflow
refcounter and make it wrap around to 0, causing erroneous map free, while
there are still references to it, causing use-after-free problems.
But having failing refcounting operations is problematic in some cases. One
example is mmap() interface. After establishing initial memory-mapping, user
is allowed to arbitrarily map/remap/unmap parts of mapped memory, arbitrarily
splitting it into multiple non-contiguous regions. All of this happens without
any control from the users of mmap subsystem. Rather mmap subsystem sends
notifications to original creator of memory mapping through open/close
callbacks, which are optionally specified during initial memory mapping
creation. These callbacks are used to maintain accurate refcount for bpf_map
(see next patch in this series). The problem is that open() callback is not
supposed to fail, because memory-mapped resource is set up and properly
referenced. This is posing a problem for using memory-mapping with BPF maps.
One solution to this is to maintain separate refcount for just memory-mappings
and do single bpf_map_inc/bpf_map_put when it goes from/to zero, respectively.
There are similar use cases in current work on tcp-bpf, necessitating extra
counter as well. This seems like a rather unfortunate and ugly solution that
doesn't scale well to various new use cases.
Another approach to solve this is to use non-failing refcount_t type, which
uses 32-bit counter internally, but, once reaching overflow state at UINT_MAX,
stays there. This ultimately causes memory leak, but prevents use after free.
But given refcounting is not the most performance-critical operation with BPF
maps (it's not used from running BPF program code), we can also just switch to
64-bit counter that can't overflow in practice, potentially disadvantaging
32-bit platforms a tiny bit. This simplifies semantics and allows above
described scenarios to not worry about failing refcount increment operation.
In terms of struct bpf_map size, we are still good and use the same amount of
space:
BEFORE (3 cache lines, 8 bytes of padding at the end):
struct bpf_map {
	const struct bpf_map_ops  * ops __attribute__((__aligned__(64))); /*     0     8 */
	struct bpf_map *           inner_map_meta;       /*     8     8 */
	void *                     security;             /*    16     8 */
	enum bpf_map_type  map_type;                     /*    24     4 */
	u32                        key_size;             /*    28     4 */
	u32                        value_size;           /*    32     4 */
	u32                        max_entries;          /*    36     4 */
	u32                        map_flags;            /*    40     4 */
	int                        spin_lock_off;        /*    44     4 */
	u32                        id;                   /*    48     4 */
	int                        numa_node;            /*    52     4 */
	u32                        btf_key_type_id;      /*    56     4 */
	u32                        btf_value_type_id;    /*    60     4 */
	/* --- cacheline 1 boundary (64 bytes) --- */
	struct btf *               btf;                  /*    64     8 */
	struct bpf_map_memory memory;                    /*    72    16 */
	bool                       unpriv_array;         /*    88     1 */
	bool                       frozen;               /*    89     1 */
	/* XXX 38 bytes hole, try to pack */
	/* --- cacheline 2 boundary (128 bytes) --- */
	atomic_t                   refcnt __attribute__((__aligned__(64))); /*   128     4 */
	atomic_t                   usercnt;              /*   132     4 */
	struct work_struct work;                         /*   136    32 */
	char                       name[16];             /*   168    16 */
	/* size: 192, cachelines: 3, members: 21 */
	/* sum members: 146, holes: 1, sum holes: 38 */
	/* padding: 8 */
	/* forced alignments: 2, forced holes: 1, sum forced holes: 38 */
} __attribute__((__aligned__(64)));
AFTER (same 3 cache lines, no extra padding now):
struct bpf_map {
	const struct bpf_map_ops  * ops __attribute__((__aligned__(64))); /*     0     8 */
	struct bpf_map *           inner_map_meta;       /*     8     8 */
	void *                     security;             /*    16     8 */
	enum bpf_map_type  map_type;                     /*    24     4 */
	u32                        key_size;             /*    28     4 */
	u32                        value_size;           /*    32     4 */
	u32                        max_entries;          /*    36     4 */
	u32                        map_flags;            /*    40     4 */
	int                        spin_lock_off;        /*    44     4 */
	u32                        id;                   /*    48     4 */
	int                        numa_node;            /*    52     4 */
	u32                        btf_key_type_id;      /*    56     4 */
	u32                        btf_value_type_id;    /*    60     4 */
	/* --- cacheline 1 boundary (64 bytes) --- */
	struct btf *               btf;                  /*    64     8 */
	struct bpf_map_memory memory;                    /*    72    16 */
	bool                       unpriv_array;         /*    88     1 */
	bool                       frozen;               /*    89     1 */
	/* XXX 38 bytes hole, try to pack */
	/* --- cacheline 2 boundary (128 bytes) --- */
	atomic64_t                 refcnt __attribute__((__aligned__(64))); /*   128     8 */
	atomic64_t                 usercnt;              /*   136     8 */
	struct work_struct work;                         /*   144    32 */
	char                       name[16];             /*   176    16 */
	/* size: 192, cachelines: 3, members: 21 */
	/* sum members: 154, holes: 1, sum holes: 38 */
	/* forced alignments: 2, forced holes: 1, sum forced holes: 38 */
} __attribute__((__aligned__(64)));
This patch, while modifying all users of bpf_map_inc, also cleans up its
interface to match bpf_map_put with separate operations for bpf_map_inc and
bpf_map_inc_with_uref (to match bpf_map_put and bpf_map_put_with_uref,
respectively). Also, given there are no users of bpf_map_inc_not_zero
specifying uref=true, remove uref flag and default to uref=false internally.
Signed-off-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Song Liu <songliubraving@fb.com>
Link: https://lore.kernel.org/bpf/20191117172806.2195367-2-andriin@fb.com
		
	
			
		
			
				
	
	
		
			277 lines
		
	
	
	
		
			6.4 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			277 lines
		
	
	
	
		
			6.4 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
// SPDX-License-Identifier: GPL-2.0
 | 
						|
/* XSKMAP used for AF_XDP sockets
 | 
						|
 * Copyright(c) 2018 Intel Corporation.
 | 
						|
 */
 | 
						|
 | 
						|
#include <linux/bpf.h>
 | 
						|
#include <linux/capability.h>
 | 
						|
#include <net/xdp_sock.h>
 | 
						|
#include <linux/slab.h>
 | 
						|
#include <linux/sched.h>
 | 
						|
 | 
						|
int xsk_map_inc(struct xsk_map *map)
 | 
						|
{
 | 
						|
	bpf_map_inc(&map->map);
 | 
						|
	return 0;
 | 
						|
}
 | 
						|
 | 
						|
void xsk_map_put(struct xsk_map *map)
 | 
						|
{
 | 
						|
	bpf_map_put(&map->map);
 | 
						|
}
 | 
						|
 | 
						|
static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map,
 | 
						|
					       struct xdp_sock **map_entry)
 | 
						|
{
 | 
						|
	struct xsk_map_node *node;
 | 
						|
	int err;
 | 
						|
 | 
						|
	node = kzalloc(sizeof(*node), GFP_ATOMIC | __GFP_NOWARN);
 | 
						|
	if (!node)
 | 
						|
		return ERR_PTR(-ENOMEM);
 | 
						|
 | 
						|
	err = xsk_map_inc(map);
 | 
						|
	if (err) {
 | 
						|
		kfree(node);
 | 
						|
		return ERR_PTR(err);
 | 
						|
	}
 | 
						|
 | 
						|
	node->map = map;
 | 
						|
	node->map_entry = map_entry;
 | 
						|
	return node;
 | 
						|
}
 | 
						|
 | 
						|
static void xsk_map_node_free(struct xsk_map_node *node)
 | 
						|
{
 | 
						|
	xsk_map_put(node->map);
 | 
						|
	kfree(node);
 | 
						|
}
 | 
						|
 | 
						|
static void xsk_map_sock_add(struct xdp_sock *xs, struct xsk_map_node *node)
 | 
						|
{
 | 
						|
	spin_lock_bh(&xs->map_list_lock);
 | 
						|
	list_add_tail(&node->node, &xs->map_list);
 | 
						|
	spin_unlock_bh(&xs->map_list_lock);
 | 
						|
}
 | 
						|
 | 
						|
static void xsk_map_sock_delete(struct xdp_sock *xs,
 | 
						|
				struct xdp_sock **map_entry)
 | 
						|
{
 | 
						|
	struct xsk_map_node *n, *tmp;
 | 
						|
 | 
						|
	spin_lock_bh(&xs->map_list_lock);
 | 
						|
	list_for_each_entry_safe(n, tmp, &xs->map_list, node) {
 | 
						|
		if (map_entry == n->map_entry) {
 | 
						|
			list_del(&n->node);
 | 
						|
			xsk_map_node_free(n);
 | 
						|
		}
 | 
						|
	}
 | 
						|
	spin_unlock_bh(&xs->map_list_lock);
 | 
						|
}
 | 
						|
 | 
						|
static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
 | 
						|
{
 | 
						|
	struct bpf_map_memory mem;
 | 
						|
	int cpu, err, numa_node;
 | 
						|
	struct xsk_map *m;
 | 
						|
	u64 cost, size;
 | 
						|
 | 
						|
	if (!capable(CAP_NET_ADMIN))
 | 
						|
		return ERR_PTR(-EPERM);
 | 
						|
 | 
						|
	if (attr->max_entries == 0 || attr->key_size != 4 ||
 | 
						|
	    attr->value_size != 4 ||
 | 
						|
	    attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY))
 | 
						|
		return ERR_PTR(-EINVAL);
 | 
						|
 | 
						|
	numa_node = bpf_map_attr_numa_node(attr);
 | 
						|
	size = struct_size(m, xsk_map, attr->max_entries);
 | 
						|
	cost = size + array_size(sizeof(*m->flush_list), num_possible_cpus());
 | 
						|
 | 
						|
	err = bpf_map_charge_init(&mem, cost);
 | 
						|
	if (err < 0)
 | 
						|
		return ERR_PTR(err);
 | 
						|
 | 
						|
	m = bpf_map_area_alloc(size, numa_node);
 | 
						|
	if (!m) {
 | 
						|
		bpf_map_charge_finish(&mem);
 | 
						|
		return ERR_PTR(-ENOMEM);
 | 
						|
	}
 | 
						|
 | 
						|
	bpf_map_init_from_attr(&m->map, attr);
 | 
						|
	bpf_map_charge_move(&m->map.memory, &mem);
 | 
						|
	spin_lock_init(&m->lock);
 | 
						|
 | 
						|
	m->flush_list = alloc_percpu(struct list_head);
 | 
						|
	if (!m->flush_list) {
 | 
						|
		bpf_map_charge_finish(&m->map.memory);
 | 
						|
		bpf_map_area_free(m);
 | 
						|
		return ERR_PTR(-ENOMEM);
 | 
						|
	}
 | 
						|
 | 
						|
	for_each_possible_cpu(cpu)
 | 
						|
		INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu));
 | 
						|
 | 
						|
	return &m->map;
 | 
						|
}
 | 
						|
 | 
						|
static void xsk_map_free(struct bpf_map *map)
 | 
						|
{
 | 
						|
	struct xsk_map *m = container_of(map, struct xsk_map, map);
 | 
						|
 | 
						|
	bpf_clear_redirect_map(map);
 | 
						|
	synchronize_net();
 | 
						|
	free_percpu(m->flush_list);
 | 
						|
	bpf_map_area_free(m);
 | 
						|
}
 | 
						|
 | 
						|
static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 | 
						|
{
 | 
						|
	struct xsk_map *m = container_of(map, struct xsk_map, map);
 | 
						|
	u32 index = key ? *(u32 *)key : U32_MAX;
 | 
						|
	u32 *next = next_key;
 | 
						|
 | 
						|
	if (index >= m->map.max_entries) {
 | 
						|
		*next = 0;
 | 
						|
		return 0;
 | 
						|
	}
 | 
						|
 | 
						|
	if (index == m->map.max_entries - 1)
 | 
						|
		return -ENOENT;
 | 
						|
	*next = index + 1;
 | 
						|
	return 0;
 | 
						|
}
 | 
						|
 | 
						|
static u32 xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
 | 
						|
{
 | 
						|
	const int ret = BPF_REG_0, mp = BPF_REG_1, index = BPF_REG_2;
 | 
						|
	struct bpf_insn *insn = insn_buf;
 | 
						|
 | 
						|
	*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
 | 
						|
	*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
 | 
						|
	*insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(sizeof(struct xsk_sock *)));
 | 
						|
	*insn++ = BPF_ALU64_IMM(BPF_ADD, mp, offsetof(struct xsk_map, xsk_map));
 | 
						|
	*insn++ = BPF_ALU64_REG(BPF_ADD, ret, mp);
 | 
						|
	*insn++ = BPF_LDX_MEM(BPF_SIZEOF(struct xsk_sock *), ret, ret, 0);
 | 
						|
	*insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
 | 
						|
	*insn++ = BPF_MOV64_IMM(ret, 0);
 | 
						|
	return insn - insn_buf;
 | 
						|
}
 | 
						|
 | 
						|
static void *xsk_map_lookup_elem(struct bpf_map *map, void *key)
 | 
						|
{
 | 
						|
	WARN_ON_ONCE(!rcu_read_lock_held());
 | 
						|
	return __xsk_map_lookup_elem(map, *(u32 *)key);
 | 
						|
}
 | 
						|
 | 
						|
static void *xsk_map_lookup_elem_sys_only(struct bpf_map *map, void *key)
 | 
						|
{
 | 
						|
	return ERR_PTR(-EOPNOTSUPP);
 | 
						|
}
 | 
						|
 | 
						|
static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
 | 
						|
			       u64 map_flags)
 | 
						|
{
 | 
						|
	struct xsk_map *m = container_of(map, struct xsk_map, map);
 | 
						|
	struct xdp_sock *xs, *old_xs, **map_entry;
 | 
						|
	u32 i = *(u32 *)key, fd = *(u32 *)value;
 | 
						|
	struct xsk_map_node *node;
 | 
						|
	struct socket *sock;
 | 
						|
	int err;
 | 
						|
 | 
						|
	if (unlikely(map_flags > BPF_EXIST))
 | 
						|
		return -EINVAL;
 | 
						|
	if (unlikely(i >= m->map.max_entries))
 | 
						|
		return -E2BIG;
 | 
						|
 | 
						|
	sock = sockfd_lookup(fd, &err);
 | 
						|
	if (!sock)
 | 
						|
		return err;
 | 
						|
 | 
						|
	if (sock->sk->sk_family != PF_XDP) {
 | 
						|
		sockfd_put(sock);
 | 
						|
		return -EOPNOTSUPP;
 | 
						|
	}
 | 
						|
 | 
						|
	xs = (struct xdp_sock *)sock->sk;
 | 
						|
 | 
						|
	if (!xsk_is_setup_for_bpf_map(xs)) {
 | 
						|
		sockfd_put(sock);
 | 
						|
		return -EOPNOTSUPP;
 | 
						|
	}
 | 
						|
 | 
						|
	map_entry = &m->xsk_map[i];
 | 
						|
	node = xsk_map_node_alloc(m, map_entry);
 | 
						|
	if (IS_ERR(node)) {
 | 
						|
		sockfd_put(sock);
 | 
						|
		return PTR_ERR(node);
 | 
						|
	}
 | 
						|
 | 
						|
	spin_lock_bh(&m->lock);
 | 
						|
	old_xs = READ_ONCE(*map_entry);
 | 
						|
	if (old_xs == xs) {
 | 
						|
		err = 0;
 | 
						|
		goto out;
 | 
						|
	} else if (old_xs && map_flags == BPF_NOEXIST) {
 | 
						|
		err = -EEXIST;
 | 
						|
		goto out;
 | 
						|
	} else if (!old_xs && map_flags == BPF_EXIST) {
 | 
						|
		err = -ENOENT;
 | 
						|
		goto out;
 | 
						|
	}
 | 
						|
	xsk_map_sock_add(xs, node);
 | 
						|
	WRITE_ONCE(*map_entry, xs);
 | 
						|
	if (old_xs)
 | 
						|
		xsk_map_sock_delete(old_xs, map_entry);
 | 
						|
	spin_unlock_bh(&m->lock);
 | 
						|
	sockfd_put(sock);
 | 
						|
	return 0;
 | 
						|
 | 
						|
out:
 | 
						|
	spin_unlock_bh(&m->lock);
 | 
						|
	sockfd_put(sock);
 | 
						|
	xsk_map_node_free(node);
 | 
						|
	return err;
 | 
						|
}
 | 
						|
 | 
						|
static int xsk_map_delete_elem(struct bpf_map *map, void *key)
 | 
						|
{
 | 
						|
	struct xsk_map *m = container_of(map, struct xsk_map, map);
 | 
						|
	struct xdp_sock *old_xs, **map_entry;
 | 
						|
	int k = *(u32 *)key;
 | 
						|
 | 
						|
	if (k >= map->max_entries)
 | 
						|
		return -EINVAL;
 | 
						|
 | 
						|
	spin_lock_bh(&m->lock);
 | 
						|
	map_entry = &m->xsk_map[k];
 | 
						|
	old_xs = xchg(map_entry, NULL);
 | 
						|
	if (old_xs)
 | 
						|
		xsk_map_sock_delete(old_xs, map_entry);
 | 
						|
	spin_unlock_bh(&m->lock);
 | 
						|
 | 
						|
	return 0;
 | 
						|
}
 | 
						|
 | 
						|
void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
 | 
						|
			     struct xdp_sock **map_entry)
 | 
						|
{
 | 
						|
	spin_lock_bh(&map->lock);
 | 
						|
	if (READ_ONCE(*map_entry) == xs) {
 | 
						|
		WRITE_ONCE(*map_entry, NULL);
 | 
						|
		xsk_map_sock_delete(xs, map_entry);
 | 
						|
	}
 | 
						|
	spin_unlock_bh(&map->lock);
 | 
						|
}
 | 
						|
 | 
						|
const struct bpf_map_ops xsk_map_ops = {
 | 
						|
	.map_alloc = xsk_map_alloc,
 | 
						|
	.map_free = xsk_map_free,
 | 
						|
	.map_get_next_key = xsk_map_get_next_key,
 | 
						|
	.map_lookup_elem = xsk_map_lookup_elem,
 | 
						|
	.map_gen_lookup = xsk_map_gen_lookup,
 | 
						|
	.map_lookup_elem_sys_only = xsk_map_lookup_elem_sys_only,
 | 
						|
	.map_update_elem = xsk_map_update_elem,
 | 
						|
	.map_delete_elem = xsk_map_delete_elem,
 | 
						|
	.map_check_btf = map_check_no_btf,
 | 
						|
};
 |