Mirror of https://github.com/torvalds/linux.git (synced 2025-11-04 10:40:15 +02:00)

	bpf: add lookup/update support for per-cpu hash and array maps
The functions bpf_map_lookup_elem(map, key, value) and
bpf_map_update_elem(map, key, value, flags) need to get/set
values for all CPUs at once in per-cpu hash and array maps,
so that user space can aggregate/update them as necessary.
Example of single counter aggregation in user space:
  unsigned int nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
  long values[nr_cpus];
  long value = 0;
  int i;

  bpf_lookup_elem(fd, key, values);
  for (i = 0; i < nr_cpus; i++)
    value += values[i];
User space must provide an array of round_up(value_size, 8) *
nr_cpus bytes to get/set values, since the kernel copies per-cpu
values 'long' at a time to try to copy each counter atomically.
This is best-effort only, since bpf programs and user space race
to access the same memory.
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
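
For illustration (not part of the commit), the aggregation snippet above
expands to roughly the following, assuming a 64-bit system, a map with
value_size == sizeof(long), and the libbpf-style wrapper
bpf_map_lookup_elem(fd, key, value); read_percpu_counter() is a
hypothetical helper name:

  #include <stdlib.h>
  #include <unistd.h>
  #include <bpf/bpf.h>    /* bpf_map_lookup_elem() wrapper */

  /* Sum one per-cpu counter across all CPUs; returns -1 on failure. */
  static long read_percpu_counter(int map_fd, unsigned int key)
  {
      unsigned int nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
      /* one round_up(value_size, 8) slot per CPU; 8 bytes on 64-bit */
      long *values = calloc(nr_cpus, sizeof(long));
      long sum = 0;
      unsigned int i;

      if (!values)
          return -1;
      if (bpf_map_lookup_elem(map_fd, &key, values) == 0)
          for (i = 0; i < nr_cpus; i++)
              sum += values[i];
      free(values);
      return sum;
  }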
			
			
This commit is contained in:
		
							parent
							
								
									a10423b87a
								
							
						
					
					
						commit
						15a07b3381
					
				
					 4 changed files with 201 additions and 26 deletions
				
			
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
@@ -183,6 +183,29 @@ int bpf_prog_new_fd(struct bpf_prog *prog);
 int bpf_obj_pin_user(u32 ufd, const char __user *pathname);
 int bpf_obj_get_user(const char __user *pathname);
 
+int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
+int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
+int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
+			   u64 flags);
+int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
+			    u64 flags);
+
+/* memcpy that is used with 8-byte aligned pointers, power-of-8 size and
+ * forced to use 'long' read/writes to try to atomically copy long counters.
+ * Best-effort only.  No barriers here, since it _will_ race with concurrent
+ * updates from BPF programs. Called from bpf syscall and mostly used with
+ * size 8 or 16 bytes, so ask compiler to inline it.
+ */
+static inline void bpf_long_memcpy(void *dst, const void *src, u32 size)
+{
+	const long *lsrc = src;
+	long *ldst = dst;
+
+	size /= sizeof(long);
+	while (size--)
+		*ldst++ = *lsrc++;
+}
+
 /* verify correctness of eBPF program */
 int bpf_check(struct bpf_prog **fp, union bpf_attr *attr);
 #else
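
The contract the hunk above establishes: the buffer exchanged with user
space holds one round_up(value_size, 8)-byte slot per possible CPU, laid
out back to back, and bpf_long_memcpy() walks it with exactly that stride.
A minimal sketch of how user space can address individual slots;
percpu_slot() is a hypothetical helper, not a kernel or libbpf API:

  #include <stddef.h>

  /* Return a pointer to CPU 'cpu's slot inside the flat values buffer. */
  static inline void *percpu_slot(void *values, size_t value_size, int cpu)
  {
      size_t stride = (value_size + 7) & ~(size_t)7; /* round_up(.., 8) */

      return (char *)values + (size_t)cpu * stride;
  }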
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
@@ -130,6 +130,32 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
 	return this_cpu_ptr(array->pptrs[index]);
 }
 
+int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	u32 index = *(u32 *)key;
+	void __percpu *pptr;
+	int cpu, off = 0;
+	u32 size;
+
+	if (unlikely(index >= array->map.max_entries))
+		return -ENOENT;
+
+	/* per_cpu areas are zero-filled and bpf programs can only
+	 * access 'value_size' of them, so copying rounded areas
+	 * will not leak any kernel data
+	 */
+	size = round_up(map->value_size, 8);
+	rcu_read_lock();
+	pptr = array->pptrs[index];
+	for_each_possible_cpu(cpu) {
+		bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size);
+		off += size;
+	}
+	rcu_read_unlock();
+	return 0;
+}
+
 /* Called from syscall */
 static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 {
@@ -177,6 +203,44 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
 	return 0;
 }
 
+int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
+			    u64 map_flags)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	u32 index = *(u32 *)key;
+	void __percpu *pptr;
+	int cpu, off = 0;
+	u32 size;
+
+	if (unlikely(map_flags > BPF_EXIST))
+		/* unknown flags */
+		return -EINVAL;
+
+	if (unlikely(index >= array->map.max_entries))
+		/* all elements were pre-allocated, cannot insert a new one */
+		return -E2BIG;
+
+	if (unlikely(map_flags == BPF_NOEXIST))
+		/* all elements already exist */
+		return -EEXIST;
+
+	/* the user space will provide round_up(value_size, 8) bytes that
+	 * will be copied into per-cpu area. bpf programs can only access
+	 * value_size of it. During lookup the same extra bytes will be
+	 * returned or zeros which were zero-filled by percpu_alloc,
+	 * so no kernel data leaks possible
+	 */
+	size = round_up(map->value_size, 8);
+	rcu_read_lock();
+	pptr = array->pptrs[index];
+	for_each_possible_cpu(cpu) {
+		bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size);
+		off += size;
+	}
+	rcu_read_unlock();
+	return 0;
+}
+
 /* Called from syscall or from eBPF program */
 static int array_map_delete_elem(struct bpf_map *map, void *key)
 {
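
With bpf_percpu_array_update() above, user space can write all CPUs'
copies of one element in a single syscall. A minimal sketch, assuming the
libbpf-style wrapper bpf_map_update_elem(fd, key, value, flags) and
value_size == sizeof(long); reset_percpu_slot() is a hypothetical helper
name:

  #include <stdlib.h>
  #include <bpf/bpf.h>

  /* Zero one per-cpu array element on every CPU. */
  static int reset_percpu_slot(int map_fd, unsigned int key,
                               unsigned int nr_cpus)
  {
      long *values = calloc(nr_cpus, sizeof(long)); /* zeroed slots */
      int err;

      if (!values)
          return -1;
      /* BPF_ANY: array elements are pre-allocated; the hunk above shows
       * BPF_NOEXIST would fail with -EEXIST */
      err = bpf_map_update_elem(map_fd, &key, values, BPF_ANY);
      free(values);
      return err;
  }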
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
@@ -290,7 +290,7 @@ static void free_htab_elem(struct htab_elem *l, bool percpu, u32 key_size)
 
 static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 					 void *value, u32 key_size, u32 hash,
-					 bool percpu)
+					 bool percpu, bool onallcpus)
 {
 	u32 size = htab->map.value_size;
 	struct htab_elem *l_new;
@@ -312,8 +312,18 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 			return NULL;
 		}
 
-		/* copy true value_size bytes */
-		memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
+		if (!onallcpus) {
+			/* copy true value_size bytes */
+			memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
+		} else {
+			int off = 0, cpu;
+
+			for_each_possible_cpu(cpu) {
+				bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
+						value + off, size);
+				off += size;
+			}
+		}
 		htab_elem_set_ptr(l_new, key_size, pptr);
 	} else {
 		memcpy(l_new->key + round_up(key_size, 8), value, size);
@@ -368,7 +378,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	/* allocate new element outside of the lock, since
 	 * we're most likley going to insert it
 	 */
-	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false);
+	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false);
 	if (!l_new)
 		return -ENOMEM;
 
@@ -402,8 +412,9 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	return ret;
 }
 
-static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
-				       void *value, u64 map_flags)
+static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
+					 void *value, u64 map_flags,
+					 bool onallcpus)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct htab_elem *l_new = NULL, *l_old;
@@ -436,12 +447,25 @@ static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
 		goto err;
 
 	if (l_old) {
+		void __percpu *pptr = htab_elem_get_ptr(l_old, key_size);
+		u32 size = htab->map.value_size;
+
 		/* per-cpu hash map can update value in-place */
-		memcpy(this_cpu_ptr(htab_elem_get_ptr(l_old, key_size)),
-		       value, htab->map.value_size);
+		if (!onallcpus) {
+			memcpy(this_cpu_ptr(pptr), value, size);
+		} else {
+			int off = 0, cpu;
+
+			size = round_up(size, 8);
+			for_each_possible_cpu(cpu) {
+				bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
+						value + off, size);
+				off += size;
+			}
+		}
 	} else {
 		l_new = alloc_htab_elem(htab, key, value, key_size,
-					hash, true);
+					hash, true, onallcpus);
 		if (!l_new) {
 			ret = -ENOMEM;
 			goto err;
@@ -455,6 +479,12 @@ static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
 	return ret;
 }
 
+static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
+				       void *value, u64 map_flags)
+{
+	return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
+}
+
 /* Called from syscall or from eBPF program */
 static int htab_map_delete_elem(struct bpf_map *map, void *key)
 {
@@ -557,6 +587,41 @@ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
 		return NULL;
 }
 
+int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
+{
+	struct htab_elem *l;
+	void __percpu *pptr;
+	int ret = -ENOENT;
+	int cpu, off = 0;
+	u32 size;
+
+	/* per_cpu areas are zero-filled and bpf programs can only
+	 * access 'value_size' of them, so copying rounded areas
+	 * will not leak any kernel data
+	 */
+	size = round_up(map->value_size, 8);
+	rcu_read_lock();
+	l = __htab_map_lookup_elem(map, key);
+	if (!l)
+		goto out;
+	pptr = htab_elem_get_ptr(l, map->key_size);
+	for_each_possible_cpu(cpu) {
+		bpf_long_memcpy(value + off,
+				per_cpu_ptr(pptr, cpu), size);
+		off += size;
+	}
+	ret = 0;
+out:
+	rcu_read_unlock();
+	return ret;
+}
+
+int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
+			   u64 map_flags)
+{
+	return __htab_percpu_map_update_elem(map, key, value, map_flags, true);
+}
+
 static const struct bpf_map_ops htab_percpu_ops = {
 	.map_alloc = htab_map_alloc,
 	.map_free = htab_map_free,
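
The onallcpus flag threaded through these hunks separates the two callers:
BPF programs keep the old behaviour and touch only the executing CPU's
copy, while the syscall path writes every copy. A sketch of the program
side, written with modern libbpf conventions (BTF map definitions, SEC()
macros) that postdate this commit:

  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>

  struct {
      __uint(type, BPF_MAP_TYPE_PERCPU_HASH);
      __uint(max_entries, 256);
      __type(key, __u32);
      __type(value, long);
  } counters SEC(".maps");

  SEC("tracepoint/syscalls/sys_enter_write")
  int count_writes(void *ctx)
  {
      __u32 key = 0;
      long init = 0, *val;

      val = bpf_map_lookup_elem(&counters, &key);
      if (val)
          (*val)++;   /* this CPU's copy only; no atomics needed */
      else
          /* from BPF context this initialises this CPU's copy */
          bpf_map_update_elem(&counters, &key, &init, BPF_NOEXIST);
      return 0;
  }

  char _license[] SEC("license") = "GPL";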
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
@@ -239,6 +239,7 @@ static int map_lookup_elem(union bpf_attr *attr)
 	int ufd = attr->map_fd;
 	struct bpf_map *map;
 	void *key, *value, *ptr;
+	u32 value_size;
 	struct fd f;
 	int err;
 
@@ -259,23 +260,35 @@ static int map_lookup_elem(union bpf_attr *attr)
 	if (copy_from_user(key, ukey, map->key_size) != 0)
 		goto free_key;
 
+	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
+		value_size = round_up(map->value_size, 8) * num_possible_cpus();
+	else
+		value_size = map->value_size;
+
 	err = -ENOMEM;
-	value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN);
+	value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
 	if (!value)
 		goto free_key;
 
-	rcu_read_lock();
-	ptr = map->ops->map_lookup_elem(map, key);
-	if (ptr)
-		memcpy(value, ptr, map->value_size);
-	rcu_read_unlock();
+	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) {
+		err = bpf_percpu_hash_copy(map, key, value);
+	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
+		err = bpf_percpu_array_copy(map, key, value);
+	} else {
+		rcu_read_lock();
+		ptr = map->ops->map_lookup_elem(map, key);
+		if (ptr)
+			memcpy(value, ptr, value_size);
+		rcu_read_unlock();
+		err = ptr ? 0 : -ENOENT;
+	}
 
-	err = -ENOENT;
-	if (!ptr)
+	if (err)
 		goto free_value;
 
 	err = -EFAULT;
-	if (copy_to_user(uvalue, value, map->value_size) != 0)
+	if (copy_to_user(uvalue, value, value_size) != 0)
 		goto free_value;
 
 	err = 0;
@@ -298,6 +311,7 @@ static int map_update_elem(union bpf_attr *attr)
 	int ufd = attr->map_fd;
 	struct bpf_map *map;
 	void *key, *value;
+	u32 value_size;
 	struct fd f;
 	int err;
 
@@ -318,21 +332,33 @@ static int map_update_elem(union bpf_attr *attr)
 	if (copy_from_user(key, ukey, map->key_size) != 0)
 		goto free_key;
 
+	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
+		value_size = round_up(map->value_size, 8) * num_possible_cpus();
+	else
+		value_size = map->value_size;
+
 	err = -ENOMEM;
-	value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN);
+	value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
 	if (!value)
 		goto free_key;
 
 	err = -EFAULT;
-	if (copy_from_user(value, uvalue, map->value_size) != 0)
+	if (copy_from_user(value, uvalue, value_size) != 0)
 		goto free_value;
 
 	/* eBPF program that use maps are running under rcu_read_lock(),
 	 * therefore all map accessors rely on this fact, so do the same here
 	 */
-	rcu_read_lock();
-	err = map->ops->map_update_elem(map, key, value, attr->flags);
-	rcu_read_unlock();
+	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) {
+		err = bpf_percpu_hash_update(map, key, value, attr->flags);
+	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
+		err = bpf_percpu_array_update(map, key, value, attr->flags);
+	} else {
+		rcu_read_lock();
+		err = map->ops->map_update_elem(map, key, value, attr->flags);
+		rcu_read_unlock();
+	}
 
 free_value:
 	kfree(value);
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in a new issue