	bpf: introduce bpf_spin_lock
Introduce 'struct bpf_spin_lock' and the bpf_spin_lock()/bpf_spin_unlock() helpers
to let bpf programs serialize access to other variables.
Example:
struct hash_elem {
    int cnt;
    struct bpf_spin_lock lock;
};
struct hash_elem *val = bpf_map_lookup_elem(&hash_map, &key);
if (val) {
    bpf_spin_lock(&val->lock);
    val->cnt++;
    bpf_spin_unlock(&val->lock);
}
Restrictions and safety checks:
- bpf_spin_lock is only allowed inside HASH and ARRAY maps.
- BTF description of the map is mandatory for safety analysis.
- a bpf program can take only one bpf_spin_lock at a time, since taking two
  or more could cause deadlocks.
- only one 'struct bpf_spin_lock' is allowed per map element.
  This drastically simplifies the implementation yet still lets a bpf program
  use any number of bpf_spin_locks across map elements.
- while a bpf_spin_lock is held, calls (either bpf2bpf or helper calls) are not allowed.
- the bpf program must call bpf_spin_unlock() before returning
  (a hedged sketch of a conforming program follows this list).
- a bpf program can access 'struct bpf_spin_lock' only via the
  bpf_spin_lock()/bpf_spin_unlock() helpers.
- direct load/store into the 'struct bpf_spin_lock lock;' field is not allowed.
- to use the bpf_spin_lock() helper, the BTF description of the map value must be
  a struct with a 'struct bpf_spin_lock anyname;' field at the top level.
  A lock nested inside another struct is not allowed.
- the map_lookup syscall does not copy the bpf_spin_lock field to user space.
- the map_update syscall and program-side map updates do not touch the
  bpf_spin_lock field.
- bpf_spin_lock cannot live on the stack or inside a network packet;
  it can only be inside a HASH or ARRAY map value.
- bpf_spin_lock is available to root only and, in principle, to all program
  types (but see the exceptions below).
- bpf_spin_lock is not allowed in inner maps of map-in-map.
- ld_abs is not allowed inside a spin_lock-ed region.
- tracing progs and socket filter progs cannot use bpf_spin_lock due to
  insufficient preemption checks.
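To make the restrictions concrete, below is a minimal sketch of a conforming
program. It is illustrative only: it assumes the BTF-defined map syntax and
section names of current libbpf (SEC(".maps"), __uint/__type, bpf_helpers.h)
rather than the selftest conventions at the time of this commit, and the map
and program names are made up.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* map value: data plus exactly one top-level bpf_spin_lock protecting it */
struct hash_elem {
    int cnt;
    struct bpf_spin_lock lock;
};

struct {
    __uint(type, BPF_MAP_TYPE_HASH);
    __uint(max_entries, 1024);
    __type(key, __u32);
    __type(value, struct hash_elem);
} hash_map SEC(".maps");

SEC("tc")
int count_packets(struct __sk_buff *skb)
{
    __u32 key = 0;
    struct hash_elem *val;

    val = bpf_map_lookup_elem(&hash_map, &key);
    if (!val)
        return 0;

    /* critical section: plain loads/stores only, no helper or bpf2bpf
     * calls, and the lock is always released before returning
     */
    bpf_spin_lock(&val->lock);
    val->cnt++;
    bpf_spin_unlock(&val->lock);
    return 0;
}

char _license[] SEC("license") = "GPL";

Because the map carries BTF, the verifier can find the lock's offset inside
the value type and enforce the rules above.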
Implementation details:
- the cgroup-bpf class of programs can nest with xdp/tc programs,
  hence bpf_spin_lock is equivalent to spin_lock_irqsave.
  Other ways of avoiding nested bpf_spin_lock are possible, such as making
  sure that all networking progs run with softirq disabled, but
  spin_lock_irqsave is the simplest and adds no overhead to programs that
  don't use the lock.
- arch_spinlock_t is used when it is implemented as a queued_spin_lock.
- architectures can force use of their own arch_spinlock_t
  (via CONFIG_BPF_ARCH_SPINLOCK).
- on architectures where queued_spin_lock is not available and
  sizeof(arch_spinlock_t) != sizeof(__u32), a trivial test-and-set lock is used.
- the presence of bpf_spin_lock inside the map value could have been indicated
  via an extra flag during map_create, but specifying it via BTF is cleaner.
  It provides introspection for the map key/value and reduces user mistakes
  (a user-space sketch of creating such a map follows this list).
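For illustration, here is a hedged user-space sketch of creating such a map
with BTF attached. It assumes the libbpf API of this era
(bpf_create_map_xattr()); the btf_fd and the key/value type ids are
placeholders that would come from loading BTF for the types elsewhere
(e.g. via btf__load()/btf__fd()).

#include <linux/bpf.h>
#include <bpf/bpf.h>

struct hash_elem {
    int cnt;
    struct bpf_spin_lock lock;   /* uapi type added by this patch */
};

int create_locked_hash_map(int btf_fd, __u32 key_type_id, __u32 value_type_id)
{
    struct bpf_create_map_attr attr = {
        .name              = "hash_map",
        .map_type          = BPF_MAP_TYPE_HASH,
        .key_size          = sizeof(__u32),
        .value_size        = sizeof(struct hash_elem),
        .max_entries       = 1024,
        .btf_fd            = btf_fd,
        .btf_key_type_id   = key_type_id,
        .btf_value_type_id = value_type_id,
    };

    /* map_check_btf() will locate 'struct bpf_spin_lock' inside the value
     * type via btf_find_spin_lock() and record it in map->spin_lock_off
     */
    return bpf_create_map_xattr(&attr);
}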
Next steps:
- allow bpf_spin_lock in other map types (like cgroup local storage)
- introduce a BPF_F_LOCK flag for the bpf_map_update() syscall and helper
  to request that the kernel grab the bpf_spin_lock before rewriting the
  value. That will serialize access to map elements
  (a hypothetical usage sketch follows).
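Usage of the proposed flag might eventually look like the sketch below. This
is purely hypothetical at the time of this commit: neither BPF_F_LOCK nor the
locked-update behaviour exists yet, and the struct comes from the earlier
sketch.

int update_locked(int map_fd, __u32 key, struct hash_elem *value)
{
    /* hypothetical: BPF_F_LOCK would ask the kernel to take the element's
     * bpf_spin_lock around the copy, so the update doesn't race with
     * programs modifying the same element under the lock
     */
    return bpf_map_update_elem(map_fd, &key, value, BPF_F_LOCK);
}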
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
			
			
parent 1832f4ef58
commit d83525ca62

14 changed files with 386 additions and 26 deletions
		|  | @ -72,14 +72,15 @@ struct bpf_map { | |||
| 	u32 value_size; | ||||
| 	u32 max_entries; | ||||
| 	u32 map_flags; | ||||
| 	u32 pages; | ||||
| 	int spin_lock_off; /* >=0 valid offset, <0 error */ | ||||
| 	u32 id; | ||||
| 	int numa_node; | ||||
| 	u32 btf_key_type_id; | ||||
| 	u32 btf_value_type_id; | ||||
| 	struct btf *btf; | ||||
| 	u32 pages; | ||||
| 	bool unpriv_array; | ||||
| 	/* 55 bytes hole */ | ||||
| 	/* 51 bytes hole */ | ||||
| 
 | ||||
| 	/* The 3rd and 4th cacheline with misc members to avoid false sharing
 | ||||
| 	 * particularly with refcounting. | ||||
|  | @ -91,6 +92,34 @@ struct bpf_map { | |||
| 	char name[BPF_OBJ_NAME_LEN]; | ||||
| }; | ||||
| 
 | ||||
| static inline bool map_value_has_spin_lock(const struct bpf_map *map) | ||||
| { | ||||
| 	return map->spin_lock_off >= 0; | ||||
| } | ||||
| 
 | ||||
| static inline void check_and_init_map_lock(struct bpf_map *map, void *dst) | ||||
| { | ||||
| 	if (likely(!map_value_has_spin_lock(map))) | ||||
| 		return; | ||||
| 	*(struct bpf_spin_lock *)(dst + map->spin_lock_off) = | ||||
| 		(struct bpf_spin_lock){}; | ||||
| } | ||||
| 
 | ||||
| /* copy everything but bpf_spin_lock */ | ||||
| static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) | ||||
| { | ||||
| 	if (unlikely(map_value_has_spin_lock(map))) { | ||||
| 		u32 off = map->spin_lock_off; | ||||
| 
 | ||||
| 		memcpy(dst, src, off); | ||||
| 		memcpy(dst + off + sizeof(struct bpf_spin_lock), | ||||
| 		       src + off + sizeof(struct bpf_spin_lock), | ||||
| 		       map->value_size - off - sizeof(struct bpf_spin_lock)); | ||||
| 	} else { | ||||
| 		memcpy(dst, src, map->value_size); | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| struct bpf_offload_dev; | ||||
| struct bpf_offloaded_map; | ||||
| 
 | ||||
|  | @ -162,6 +191,7 @@ enum bpf_arg_type { | |||
| 	ARG_PTR_TO_CTX,		/* pointer to context */ | ||||
| 	ARG_ANYTHING,		/* any (initialized) argument is ok */ | ||||
| 	ARG_PTR_TO_SOCKET,	/* pointer to bpf_sock */ | ||||
| 	ARG_PTR_TO_SPIN_LOCK,	/* pointer to bpf_spin_lock */ | ||||
| }; | ||||
| 
 | ||||
| /* type of values returned from helper functions */ | ||||
|  | @ -879,7 +909,8 @@ extern const struct bpf_func_proto bpf_msg_redirect_hash_proto; | |||
| extern const struct bpf_func_proto bpf_msg_redirect_map_proto; | ||||
| extern const struct bpf_func_proto bpf_sk_redirect_hash_proto; | ||||
| extern const struct bpf_func_proto bpf_sk_redirect_map_proto; | ||||
| 
 | ||||
| extern const struct bpf_func_proto bpf_spin_lock_proto; | ||||
| extern const struct bpf_func_proto bpf_spin_unlock_proto; | ||||
| extern const struct bpf_func_proto bpf_get_local_storage_proto; | ||||
| 
 | ||||
| /* Shared helpers among cBPF and eBPF. */ | ||||
|  |  | |||
|  | @ -148,6 +148,7 @@ struct bpf_verifier_state { | |||
| 	/* call stack tracking */ | ||||
| 	struct bpf_func_state *frame[MAX_CALL_FRAMES]; | ||||
| 	u32 curframe; | ||||
| 	u32 active_spin_lock; | ||||
| 	bool speculative; | ||||
| }; | ||||
| 
 | ||||
|  |  | |||
|  | @ -50,6 +50,7 @@ u32 btf_id(const struct btf *btf); | |||
| bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, | ||||
| 			   const struct btf_member *m, | ||||
| 			   u32 expected_offset, u32 expected_size); | ||||
| int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t); | ||||
| 
 | ||||
| #ifdef CONFIG_BPF_SYSCALL | ||||
| const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); | ||||
|  |  | |||
|  | @ -2422,7 +2422,9 @@ union bpf_attr { | |||
| 	FN(map_peek_elem),		\ | ||||
| 	FN(msg_push_data),		\ | ||||
| 	FN(msg_pop_data),		\ | ||||
| 	FN(rc_pointer_rel), | ||||
| 	FN(rc_pointer_rel),		\ | ||||
| 	FN(spin_lock),			\ | ||||
| 	FN(spin_unlock), | ||||
| 
 | ||||
| /* integer value in 'imm' field of BPF_CALL instruction selects which helper
 | ||||
|  * function eBPF program intends to call | ||||
|  | @ -3056,4 +3058,7 @@ struct bpf_line_info { | |||
| 	__u32	line_col; | ||||
| }; | ||||
| 
 | ||||
| struct bpf_spin_lock { | ||||
| 	__u32	val; | ||||
| }; | ||||
| #endif /* _UAPI__LINUX_BPF_H__ */ | ||||
|  |  | |||
|  | @ -242,6 +242,9 @@ config QUEUED_SPINLOCKS | |||
| 	def_bool y if ARCH_USE_QUEUED_SPINLOCKS | ||||
| 	depends on SMP | ||||
| 
 | ||||
| config BPF_ARCH_SPINLOCK | ||||
| 	bool | ||||
| 
 | ||||
| config ARCH_USE_QUEUED_RWLOCKS | ||||
| 	bool | ||||
| 
 | ||||
|  |  | |||
|  | @ -270,9 +270,10 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, | |||
| 		memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]), | ||||
| 		       value, map->value_size); | ||||
| 	else | ||||
| 		memcpy(array->value + | ||||
| 		       array->elem_size * (index & array->index_mask), | ||||
| 		       value, map->value_size); | ||||
| 		copy_map_value(map, | ||||
| 			       array->value + | ||||
| 			       array->elem_size * (index & array->index_mask), | ||||
| 			       value); | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
|  |  | |||
|  | @ -355,6 +355,11 @@ static bool btf_type_is_struct(const struct btf_type *t) | |||
| 	return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION; | ||||
| } | ||||
| 
 | ||||
| static bool __btf_type_is_struct(const struct btf_type *t) | ||||
| { | ||||
| 	return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT; | ||||
| } | ||||
| 
 | ||||
| static bool btf_type_is_array(const struct btf_type *t) | ||||
| { | ||||
| 	return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY; | ||||
|  | @ -2045,6 +2050,43 @@ static void btf_struct_log(struct btf_verifier_env *env, | |||
| 	btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); | ||||
| } | ||||
| 
 | ||||
| /* find 'struct bpf_spin_lock' in map value.
 | ||||
|  * return >= 0 offset if found | ||||
|  * and < 0 in case of error | ||||
|  */ | ||||
| int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t) | ||||
| { | ||||
| 	const struct btf_member *member; | ||||
| 	u32 i, off = -ENOENT; | ||||
| 
 | ||||
| 	if (!__btf_type_is_struct(t)) | ||||
| 		return -EINVAL; | ||||
| 
 | ||||
| 	for_each_member(i, t, member) { | ||||
| 		const struct btf_type *member_type = btf_type_by_id(btf, | ||||
| 								    member->type); | ||||
| 		if (!__btf_type_is_struct(member_type)) | ||||
| 			continue; | ||||
| 		if (member_type->size != sizeof(struct bpf_spin_lock)) | ||||
| 			continue; | ||||
| 		if (strcmp(__btf_name_by_offset(btf, member_type->name_off), | ||||
| 			   "bpf_spin_lock")) | ||||
| 			continue; | ||||
| 		if (off != -ENOENT) | ||||
| 			/* only one 'struct bpf_spin_lock' is allowed */ | ||||
| 			return -E2BIG; | ||||
| 		off = btf_member_bit_offset(t, member); | ||||
| 		if (off % 8) | ||||
| 			/* valid C code cannot generate such BTF */ | ||||
| 			return -EINVAL; | ||||
| 		off /= 8; | ||||
| 		if (off % __alignof__(struct bpf_spin_lock)) | ||||
| 			/* valid struct bpf_spin_lock will be 4 byte aligned */ | ||||
| 			return -EINVAL; | ||||
| 	} | ||||
| 	return off; | ||||
| } | ||||
| 
 | ||||
| static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t, | ||||
| 				u32 type_id, void *data, u8 bits_offset, | ||||
| 				struct seq_file *m) | ||||
|  |  | |||
|  | @ -2002,6 +2002,8 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak; | |||
| const struct bpf_func_proto bpf_map_push_elem_proto __weak; | ||||
| const struct bpf_func_proto bpf_map_pop_elem_proto __weak; | ||||
| const struct bpf_func_proto bpf_map_peek_elem_proto __weak; | ||||
| const struct bpf_func_proto bpf_spin_lock_proto __weak; | ||||
| const struct bpf_func_proto bpf_spin_unlock_proto __weak; | ||||
| 
 | ||||
| const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; | ||||
| const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; | ||||
|  |  | |||
|  | @ -718,21 +718,12 @@ static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab) | |||
| 	       BITS_PER_LONG == 64; | ||||
| } | ||||
| 
 | ||||
| static u32 htab_size_value(const struct bpf_htab *htab, bool percpu) | ||||
| { | ||||
| 	u32 size = htab->map.value_size; | ||||
| 
 | ||||
| 	if (percpu || fd_htab_map_needs_adjust(htab)) | ||||
| 		size = round_up(size, 8); | ||||
| 	return size; | ||||
| } | ||||
| 
 | ||||
| static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, | ||||
| 					 void *value, u32 key_size, u32 hash, | ||||
| 					 bool percpu, bool onallcpus, | ||||
| 					 struct htab_elem *old_elem) | ||||
| { | ||||
| 	u32 size = htab_size_value(htab, percpu); | ||||
| 	u32 size = htab->map.value_size; | ||||
| 	bool prealloc = htab_is_prealloc(htab); | ||||
| 	struct htab_elem *l_new, **pl_new; | ||||
| 	void __percpu *pptr; | ||||
|  | @ -770,10 +761,13 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, | |||
| 			l_new = ERR_PTR(-ENOMEM); | ||||
| 			goto dec_count; | ||||
| 		} | ||||
| 		check_and_init_map_lock(&htab->map, | ||||
| 					l_new->key + round_up(key_size, 8)); | ||||
| 	} | ||||
| 
 | ||||
| 	memcpy(l_new->key, key, key_size); | ||||
| 	if (percpu) { | ||||
| 		size = round_up(size, 8); | ||||
| 		if (prealloc) { | ||||
| 			pptr = htab_elem_get_ptr(l_new, key_size); | ||||
| 		} else { | ||||
|  | @ -791,8 +785,13 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, | |||
| 
 | ||||
| 		if (!prealloc) | ||||
| 			htab_elem_set_ptr(l_new, key_size, pptr); | ||||
| 	} else { | ||||
| 	} else if (fd_htab_map_needs_adjust(htab)) { | ||||
| 		size = round_up(size, 8); | ||||
| 		memcpy(l_new->key + round_up(key_size, 8), value, size); | ||||
| 	} else { | ||||
| 		copy_map_value(&htab->map, | ||||
| 			       l_new->key + round_up(key_size, 8), | ||||
| 			       value); | ||||
| 	} | ||||
| 
 | ||||
| 	l_new->hash = hash; | ||||
|  |  | |||
|  | @ -221,6 +221,86 @@ const struct bpf_func_proto bpf_get_current_comm_proto = { | |||
| 	.arg2_type	= ARG_CONST_SIZE, | ||||
| }; | ||||
| 
 | ||||
| #if defined(CONFIG_QUEUED_SPINLOCKS) || defined(CONFIG_BPF_ARCH_SPINLOCK) | ||||
| 
 | ||||
| static inline void __bpf_spin_lock(struct bpf_spin_lock *lock) | ||||
| { | ||||
| 	arch_spinlock_t *l = (void *)lock; | ||||
| 	union { | ||||
| 		__u32 val; | ||||
| 		arch_spinlock_t lock; | ||||
| 	} u = { .lock = __ARCH_SPIN_LOCK_UNLOCKED }; | ||||
| 
 | ||||
| 	compiletime_assert(u.val == 0, "__ARCH_SPIN_LOCK_UNLOCKED not 0"); | ||||
| 	BUILD_BUG_ON(sizeof(*l) != sizeof(__u32)); | ||||
| 	BUILD_BUG_ON(sizeof(*lock) != sizeof(__u32)); | ||||
| 	arch_spin_lock(l); | ||||
| } | ||||
| 
 | ||||
| static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock) | ||||
| { | ||||
| 	arch_spinlock_t *l = (void *)lock; | ||||
| 
 | ||||
| 	arch_spin_unlock(l); | ||||
| } | ||||
| 
 | ||||
| #else | ||||
| 
 | ||||
| static inline void __bpf_spin_lock(struct bpf_spin_lock *lock) | ||||
| { | ||||
| 	atomic_t *l = (void *)lock; | ||||
| 
 | ||||
| 	BUILD_BUG_ON(sizeof(*l) != sizeof(*lock)); | ||||
| 	do { | ||||
| 		atomic_cond_read_relaxed(l, !VAL); | ||||
| 	} while (atomic_xchg(l, 1)); | ||||
| } | ||||
| 
 | ||||
| static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock) | ||||
| { | ||||
| 	atomic_t *l = (void *)lock; | ||||
| 
 | ||||
| 	atomic_set_release(l, 0); | ||||
| } | ||||
| 
 | ||||
| #endif | ||||
| 
 | ||||
| static DEFINE_PER_CPU(unsigned long, irqsave_flags); | ||||
| 
 | ||||
| notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock) | ||||
| { | ||||
| 	unsigned long flags; | ||||
| 
 | ||||
| 	local_irq_save(flags); | ||||
| 	__bpf_spin_lock(lock); | ||||
| 	__this_cpu_write(irqsave_flags, flags); | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| const struct bpf_func_proto bpf_spin_lock_proto = { | ||||
| 	.func		= bpf_spin_lock, | ||||
| 	.gpl_only	= false, | ||||
| 	.ret_type	= RET_VOID, | ||||
| 	.arg1_type	= ARG_PTR_TO_SPIN_LOCK, | ||||
| }; | ||||
| 
 | ||||
| notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock) | ||||
| { | ||||
| 	unsigned long flags; | ||||
| 
 | ||||
| 	flags = __this_cpu_read(irqsave_flags); | ||||
| 	__bpf_spin_unlock(lock); | ||||
| 	local_irq_restore(flags); | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| const struct bpf_func_proto bpf_spin_unlock_proto = { | ||||
| 	.func		= bpf_spin_unlock, | ||||
| 	.gpl_only	= false, | ||||
| 	.ret_type	= RET_VOID, | ||||
| 	.arg1_type	= ARG_PTR_TO_SPIN_LOCK, | ||||
| }; | ||||
| 
 | ||||
| #ifdef CONFIG_CGROUPS | ||||
| BPF_CALL_0(bpf_get_current_cgroup_id) | ||||
| { | ||||
|  |  | |||
|  | @ -37,6 +37,11 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) | |||
| 		return ERR_PTR(-EINVAL); | ||||
| 	} | ||||
| 
 | ||||
| 	if (map_value_has_spin_lock(inner_map)) { | ||||
| 		fdput(f); | ||||
| 		return ERR_PTR(-ENOTSUPP); | ||||
| 	} | ||||
| 
 | ||||
| 	inner_map_meta_size = sizeof(*inner_map_meta); | ||||
| 	/* In some cases verifier needs to access beyond just base map. */ | ||||
| 	if (inner_map->ops == &array_map_ops) | ||||
|  |  | |||
|  | @ -463,7 +463,7 @@ int map_check_no_btf(const struct bpf_map *map, | |||
| 	return -ENOTSUPP; | ||||
| } | ||||
| 
 | ||||
| static int map_check_btf(const struct bpf_map *map, const struct btf *btf, | ||||
| static int map_check_btf(struct bpf_map *map, const struct btf *btf, | ||||
| 			 u32 btf_key_id, u32 btf_value_id) | ||||
| { | ||||
| 	const struct btf_type *key_type, *value_type; | ||||
|  | @ -478,6 +478,21 @@ static int map_check_btf(const struct bpf_map *map, const struct btf *btf, | |||
| 	if (!value_type || value_size != map->value_size) | ||||
| 		return -EINVAL; | ||||
| 
 | ||||
| 	map->spin_lock_off = btf_find_spin_lock(btf, value_type); | ||||
| 
 | ||||
| 	if (map_value_has_spin_lock(map)) { | ||||
| 		if (map->map_type != BPF_MAP_TYPE_HASH && | ||||
| 		    map->map_type != BPF_MAP_TYPE_ARRAY) | ||||
| 			return -ENOTSUPP; | ||||
| 		if (map->spin_lock_off + sizeof(struct bpf_spin_lock) > | ||||
| 		    map->value_size) { | ||||
| 			WARN_ONCE(1, | ||||
| 				  "verifier bug spin_lock_off %d value_size %d\n", | ||||
| 				  map->spin_lock_off, map->value_size); | ||||
| 			return -EFAULT; | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	if (map->ops->map_check_btf) | ||||
| 		ret = map->ops->map_check_btf(map, btf, key_type, value_type); | ||||
| 
 | ||||
|  | @ -542,6 +557,8 @@ static int map_create(union bpf_attr *attr) | |||
| 		map->btf = btf; | ||||
| 		map->btf_key_type_id = attr->btf_key_type_id; | ||||
| 		map->btf_value_type_id = attr->btf_value_type_id; | ||||
| 	} else { | ||||
| 		map->spin_lock_off = -EINVAL; | ||||
| 	} | ||||
| 
 | ||||
| 	err = security_bpf_map_alloc(map); | ||||
|  | @ -740,7 +757,7 @@ static int map_lookup_elem(union bpf_attr *attr) | |||
| 			err = -ENOENT; | ||||
| 		} else { | ||||
| 			err = 0; | ||||
| 			memcpy(value, ptr, value_size); | ||||
| 			copy_map_value(map, value, ptr); | ||||
| 		} | ||||
| 		rcu_read_unlock(); | ||||
| 	} | ||||
|  |  | |||
|  | @ -213,6 +213,7 @@ struct bpf_call_arg_meta { | |||
| 	s64 msize_smax_value; | ||||
| 	u64 msize_umax_value; | ||||
| 	int ptr_id; | ||||
| 	int func_id; | ||||
| }; | ||||
| 
 | ||||
| static DEFINE_MUTEX(bpf_verifier_lock); | ||||
|  | @ -351,6 +352,12 @@ static bool reg_is_refcounted(const struct bpf_reg_state *reg) | |||
| 	return type_is_refcounted(reg->type); | ||||
| } | ||||
| 
 | ||||
| static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) | ||||
| { | ||||
| 	return reg->type == PTR_TO_MAP_VALUE && | ||||
| 		map_value_has_spin_lock(reg->map_ptr); | ||||
| } | ||||
| 
 | ||||
| static bool reg_is_refcounted_or_null(const struct bpf_reg_state *reg) | ||||
| { | ||||
| 	return type_is_refcounted_or_null(reg->type); | ||||
|  | @ -712,6 +719,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, | |||
| 	} | ||||
| 	dst_state->speculative = src->speculative; | ||||
| 	dst_state->curframe = src->curframe; | ||||
| 	dst_state->active_spin_lock = src->active_spin_lock; | ||||
| 	for (i = 0; i <= src->curframe; i++) { | ||||
| 		dst = dst_state->frame[i]; | ||||
| 		if (!dst) { | ||||
|  | @ -1483,6 +1491,21 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, | |||
| 	if (err) | ||||
| 		verbose(env, "R%d max value is outside of the array range\n", | ||||
| 			regno); | ||||
| 
 | ||||
| 	if (map_value_has_spin_lock(reg->map_ptr)) { | ||||
| 		u32 lock = reg->map_ptr->spin_lock_off; | ||||
| 
 | ||||
| 		/* if any part of struct bpf_spin_lock can be touched by
 | ||||
| 		 * load/store reject this program. | ||||
| 		 * To check that [x1, x2) overlaps with [y1, y2) | ||||
| 		 * it is sufficient to check x1 < y2 && y1 < x2. | ||||
| 		 */ | ||||
| 		if (reg->smin_value + off < lock + sizeof(struct bpf_spin_lock) && | ||||
| 		     lock < reg->umax_value + off + size) { | ||||
| 			verbose(env, "bpf_spin_lock cannot be accessed directly by load/store\n"); | ||||
| 			return -EACCES; | ||||
| 		} | ||||
| 	} | ||||
| 	return err; | ||||
| } | ||||
| 
 | ||||
|  | @ -2192,6 +2215,91 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, | |||
| 	} | ||||
| } | ||||
| 
 | ||||
| /* Implementation details:
 | ||||
|  * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL | ||||
|  * Two bpf_map_lookups (even with the same key) will have different reg->id. | ||||
|  * For traditional PTR_TO_MAP_VALUE the verifier clears reg->id after | ||||
|  * value_or_null->value transition, since the verifier only cares about | ||||
|  * the range of access to valid map value pointer and doesn't care about actual | ||||
|  * address of the map element. | ||||
|  * For maps with 'struct bpf_spin_lock' inside map value the verifier keeps | ||||
|  * reg->id > 0 after value_or_null->value transition. By doing so | ||||
|  * two bpf_map_lookups will be considered two different pointers that | ||||
|  * point to different bpf_spin_locks. | ||||
|  * The verifier allows taking only one bpf_spin_lock at a time to avoid | ||||
|  * dead-locks. | ||||
|  * Since only one bpf_spin_lock is allowed the checks are simpler than | ||||
|  * reg_is_refcounted() logic. The verifier needs to remember only | ||||
|  * one spin_lock instead of array of acquired_refs. | ||||
|  * cur_state->active_spin_lock remembers which map value element got locked | ||||
|  * and clears it after bpf_spin_unlock. | ||||
|  */ | ||||
| static int process_spin_lock(struct bpf_verifier_env *env, int regno, | ||||
| 			     bool is_lock) | ||||
| { | ||||
| 	struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno]; | ||||
| 	struct bpf_verifier_state *cur = env->cur_state; | ||||
| 	bool is_const = tnum_is_const(reg->var_off); | ||||
| 	struct bpf_map *map = reg->map_ptr; | ||||
| 	u64 val = reg->var_off.value; | ||||
| 
 | ||||
| 	if (reg->type != PTR_TO_MAP_VALUE) { | ||||
| 		verbose(env, "R%d is not a pointer to map_value\n", regno); | ||||
| 		return -EINVAL; | ||||
| 	} | ||||
| 	if (!is_const) { | ||||
| 		verbose(env, | ||||
| 			"R%d doesn't have constant offset. bpf_spin_lock has to be at the constant offset\n", | ||||
| 			regno); | ||||
| 		return -EINVAL; | ||||
| 	} | ||||
| 	if (!map->btf) { | ||||
| 		verbose(env, | ||||
| 			"map '%s' has to have BTF in order to use bpf_spin_lock\n", | ||||
| 			map->name); | ||||
| 		return -EINVAL; | ||||
| 	} | ||||
| 	if (!map_value_has_spin_lock(map)) { | ||||
| 		if (map->spin_lock_off == -E2BIG) | ||||
| 			verbose(env, | ||||
| 				"map '%s' has more than one 'struct bpf_spin_lock'\n", | ||||
| 				map->name); | ||||
| 		else if (map->spin_lock_off == -ENOENT) | ||||
| 			verbose(env, | ||||
| 				"map '%s' doesn't have 'struct bpf_spin_lock'\n", | ||||
| 				map->name); | ||||
| 		else | ||||
| 			verbose(env, | ||||
| 				"map '%s' is not a struct type or bpf_spin_lock is mangled\n", | ||||
| 				map->name); | ||||
| 		return -EINVAL; | ||||
| 	} | ||||
| 	if (map->spin_lock_off != val + reg->off) { | ||||
| 		verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock'\n", | ||||
| 			val + reg->off); | ||||
| 		return -EINVAL; | ||||
| 	} | ||||
| 	if (is_lock) { | ||||
| 		if (cur->active_spin_lock) { | ||||
| 			verbose(env, | ||||
| 				"Locking two bpf_spin_locks are not allowed\n"); | ||||
| 			return -EINVAL; | ||||
| 		} | ||||
| 		cur->active_spin_lock = reg->id; | ||||
| 	} else { | ||||
| 		if (!cur->active_spin_lock) { | ||||
| 			verbose(env, "bpf_spin_unlock without taking a lock\n"); | ||||
| 			return -EINVAL; | ||||
| 		} | ||||
| 		if (cur->active_spin_lock != reg->id) { | ||||
| 			verbose(env, "bpf_spin_unlock of different lock\n"); | ||||
| 			return -EINVAL; | ||||
| 		} | ||||
| 		cur->active_spin_lock = 0; | ||||
| 	} | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| static bool arg_type_is_mem_ptr(enum bpf_arg_type type) | ||||
| { | ||||
| 	return type == ARG_PTR_TO_MEM || | ||||
|  | @ -2268,6 +2376,17 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, | |||
| 			return -EFAULT; | ||||
| 		} | ||||
| 		meta->ptr_id = reg->id; | ||||
| 	} else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { | ||||
| 		if (meta->func_id == BPF_FUNC_spin_lock) { | ||||
| 			if (process_spin_lock(env, regno, true)) | ||||
| 				return -EACCES; | ||||
| 		} else if (meta->func_id == BPF_FUNC_spin_unlock) { | ||||
| 			if (process_spin_lock(env, regno, false)) | ||||
| 				return -EACCES; | ||||
| 		} else { | ||||
| 			verbose(env, "verifier internal error\n"); | ||||
| 			return -EFAULT; | ||||
| 		} | ||||
| 	} else if (arg_type_is_mem_ptr(arg_type)) { | ||||
| 		expected_type = PTR_TO_STACK; | ||||
| 		/* One exception here. In case function allows for NULL to be
 | ||||
|  | @ -2887,6 +3006,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn | |||
| 		return err; | ||||
| 	} | ||||
| 
 | ||||
| 	meta.func_id = func_id; | ||||
| 	/* check args */ | ||||
| 	err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &meta); | ||||
| 	if (err) | ||||
|  | @ -4473,7 +4593,8 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, | |||
| 		} else if (reg->type == PTR_TO_SOCKET_OR_NULL) { | ||||
| 			reg->type = PTR_TO_SOCKET; | ||||
| 		} | ||||
| 		if (is_null || !reg_is_refcounted(reg)) { | ||||
| 		if (is_null || !(reg_is_refcounted(reg) || | ||||
| 				 reg_may_point_to_spin_lock(reg))) { | ||||
| 			/* We don't need id from this point onwards anymore,
 | ||||
| 			 * thus we should better reset it, so that state | ||||
| 			 * pruning has chances to take effect. | ||||
|  | @ -4871,6 +4992,11 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) | |||
| 		return err; | ||||
| 	} | ||||
| 
 | ||||
| 	if (env->cur_state->active_spin_lock) { | ||||
| 		verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_spin_lock-ed region\n"); | ||||
| 		return -EINVAL; | ||||
| 	} | ||||
| 
 | ||||
| 	if (regs[BPF_REG_6].type != PTR_TO_CTX) { | ||||
| 		verbose(env, | ||||
| 			"at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); | ||||
|  | @ -5607,8 +5733,11 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, | |||
| 	case PTR_TO_MAP_VALUE: | ||||
| 		/* If the new min/max/var_off satisfy the old ones and
 | ||||
| 		 * everything else matches, we are OK. | ||||
| 		 * We don't care about the 'id' value, because nothing | ||||
| 		 * uses it for PTR_TO_MAP_VALUE (only for ..._OR_NULL) | ||||
| 		 * 'id' is not compared, since it's only used for maps with | ||||
| 		 * bpf_spin_lock inside map element and in such cases if | ||||
| 		 * the rest of the prog is valid for one map element then | ||||
| 		 * it's valid for all map elements regardless of the key | ||||
| 		 * used in bpf_map_lookup() | ||||
| 		 */ | ||||
| 		return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && | ||||
| 		       range_within(rold, rcur) && | ||||
|  | @ -5811,6 +5940,9 @@ static bool states_equal(struct bpf_verifier_env *env, | |||
| 	if (old->speculative && !cur->speculative) | ||||
| 		return false; | ||||
| 
 | ||||
| 	if (old->active_spin_lock != cur->active_spin_lock) | ||||
| 		return false; | ||||
| 
 | ||||
| 	/* for states to be equal callsites have to be the same
 | ||||
| 	 * and all frame states need to be equivalent | ||||
| 	 */ | ||||
|  | @ -6229,6 +6361,12 @@ static int do_check(struct bpf_verifier_env *env) | |||
| 					return -EINVAL; | ||||
| 				} | ||||
| 
 | ||||
| 				if (env->cur_state->active_spin_lock && | ||||
| 				    (insn->src_reg == BPF_PSEUDO_CALL || | ||||
| 				     insn->imm != BPF_FUNC_spin_unlock)) { | ||||
| 					verbose(env, "function calls are not allowed while holding a lock\n"); | ||||
| 					return -EINVAL; | ||||
| 				} | ||||
| 				if (insn->src_reg == BPF_PSEUDO_CALL) | ||||
| 					err = check_func_call(env, insn, &env->insn_idx); | ||||
| 				else | ||||
|  | @ -6259,6 +6397,11 @@ static int do_check(struct bpf_verifier_env *env) | |||
| 					return -EINVAL; | ||||
| 				} | ||||
| 
 | ||||
| 				if (env->cur_state->active_spin_lock) { | ||||
| 					verbose(env, "bpf_spin_unlock is missing\n"); | ||||
| 					return -EINVAL; | ||||
| 				} | ||||
| 
 | ||||
| 				if (state->curframe) { | ||||
| 					/* exit from nested function */ | ||||
| 					env->prev_insn_idx = env->insn_idx; | ||||
|  | @ -6356,6 +6499,19 @@ static int check_map_prealloc(struct bpf_map *map) | |||
| 		!(map->map_flags & BPF_F_NO_PREALLOC); | ||||
| } | ||||
| 
 | ||||
| static bool is_tracing_prog_type(enum bpf_prog_type type) | ||||
| { | ||||
| 	switch (type) { | ||||
| 	case BPF_PROG_TYPE_KPROBE: | ||||
| 	case BPF_PROG_TYPE_TRACEPOINT: | ||||
| 	case BPF_PROG_TYPE_PERF_EVENT: | ||||
| 	case BPF_PROG_TYPE_RAW_TRACEPOINT: | ||||
| 		return true; | ||||
| 	default: | ||||
| 		return false; | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| static int check_map_prog_compatibility(struct bpf_verifier_env *env, | ||||
| 					struct bpf_map *map, | ||||
| 					struct bpf_prog *prog) | ||||
|  | @ -6378,6 +6534,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, | |||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	if ((is_tracing_prog_type(prog->type) || | ||||
| 	     prog->type == BPF_PROG_TYPE_SOCKET_FILTER) && | ||||
| 	    map_value_has_spin_lock(map)) { | ||||
| 		verbose(env, "tracing progs cannot use bpf_spin_lock yet\n"); | ||||
| 		return -EINVAL; | ||||
| 	} | ||||
| 
 | ||||
| 	if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) && | ||||
| 	    !bpf_offload_prog_map_match(prog, map)) { | ||||
| 		verbose(env, "offload device mismatch between prog and map\n"); | ||||
|  |  | |||
|  | @ -5314,10 +5314,20 @@ bpf_base_func_proto(enum bpf_func_id func_id) | |||
| 		return &bpf_tail_call_proto; | ||||
| 	case BPF_FUNC_ktime_get_ns: | ||||
| 		return &bpf_ktime_get_ns_proto; | ||||
| 	default: | ||||
| 		break; | ||||
| 	} | ||||
| 
 | ||||
| 	if (!capable(CAP_SYS_ADMIN)) | ||||
| 		return NULL; | ||||
| 
 | ||||
| 	switch (func_id) { | ||||
| 	case BPF_FUNC_spin_lock: | ||||
| 		return &bpf_spin_lock_proto; | ||||
| 	case BPF_FUNC_spin_unlock: | ||||
| 		return &bpf_spin_unlock_proto; | ||||
| 	case BPF_FUNC_trace_printk: | ||||
| 		if (capable(CAP_SYS_ADMIN)) | ||||
| 			return bpf_get_trace_printk_proto(); | ||||
| 		/* else, fall through */ | ||||
| 		return bpf_get_trace_printk_proto(); | ||||
| 	default: | ||||
| 		return NULL; | ||||
| 	} | ||||
|  |  | |||