commit 079082c60a
During unregister_netdevice_many_notify(), the ordering of the function
calls we are concerned with is:

  unregister_netdevice_many_notify
    dev_shutdown
      qdisc_put
        clsact_destroy
    tcx_uninstall

The syzbot reproducer triggered a case where the qdisc refcnt is not
zero during dev_shutdown().

tcx_uninstall() then hits WARN_ON_ONCE(tcx_entry(entry)->miniq_active)
because the miniq is still active and the entry must not be freed. The
warning assumed that qdisc destruction always happens before tcx
teardown.

The fix is to have tcx_uninstall() skip tcx_entry_free() while the
miniq is still alive and let clsact_destroy() do the free later, so
that no specific ordering is assumed for either of them.

If the miniq is still active, tcx_uninstall() still clears the entry
when flushing out the prog/link. clsact_destroy() then notices
"!tcx_entry_is_active()" and eventually does the tcx_entry_free(). A
simplified sketch of this handshake follows the trailers below.

Fixes: e420bed025 ("bpf: Add fd-based tcx multi-prog infra with link support")
Reported-by: syzbot+376a289e86a0fd02b9ba@syzkaller.appspotmail.com
Reported-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Co-developed-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: syzbot+376a289e86a0fd02b9ba@syzkaller.appspotmail.com
Tested-by: Leon Romanovsky <leonro@nvidia.com>
Link: https://lore.kernel.org/r/222255fe07cb58f15ee662e7ee78328af5b438e4.1690549248.git.daniel@iogearbox.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
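
To make the teardown handshake concrete, here is a minimal, self-contained
sketch of the logic described above. It is an editorial illustration, not
the kernel code: the fields of struct tcx_entry, the helper bodies, and the
main() driver are simplified stand-ins, while the function names
(tcx_uninstall(), clsact_destroy(), tcx_entry_free(), tcx_entry_is_active())
mirror the ones in the commit message. The key property is that whichever
teardown path runs last performs the free:

	#include <stdbool.h>
	#include <stdio.h>
	#include <stdlib.h>

	/* Simplified stand-in for the real tcx entry state. */
	struct tcx_entry {
		int miniq_active;	/* set while clsact's miniq still references us */
		int num_progs;		/* programs/links attached via bpf_mprog */
	};

	static bool tcx_entry_is_active(const struct tcx_entry *entry)
	{
		return entry->num_progs > 0;
	}

	static void tcx_entry_free(struct tcx_entry *entry)
	{
		printf("freeing tcx_entry\n");
		free(entry);
	}

	/* Flush progs/links; only free once the miniq no longer needs the entry. */
	static void tcx_uninstall(struct tcx_entry *entry)
	{
		entry->num_progs = 0;		/* clear entry while flushing */
		if (!entry->miniq_active)	/* was: WARN_ON_ONCE(miniq_active) */
			tcx_entry_free(entry);	/* safe: qdisc already gone */
		/* otherwise clsact_destroy() does the free later */
	}

	/* Qdisc teardown path; may run before or after tcx_uninstall(). */
	static void clsact_destroy(struct tcx_entry *entry)
	{
		entry->miniq_active = 0;
		if (!tcx_entry_is_active(entry))	/* progs already flushed? */
			tcx_entry_free(entry);		/* we run last, do the free */
	}

	int main(void)
	{
		/* Ordering triggered by the syzbot reproducer: tcx_uninstall()
		 * runs while the qdisc (and thus the miniq) is still alive.
		 */
		struct tcx_entry *entry = calloc(1, sizeof(*entry));

		entry->miniq_active = 1;
		entry->num_progs = 1;
		tcx_uninstall(entry);	/* no free: miniq still active */
		clsact_destroy(entry);	/* sees !tcx_entry_is_active(), frees */
		return 0;
	}

Running the two teardown functions in either order frees the entry exactly
once, which is the ordering-independence the fix restores.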
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright (c) 2023 Isovalent */
#ifndef __BPF_MPROG_H
#define __BPF_MPROG_H

#include <linux/bpf.h>

/* bpf_mprog framework:
 *
 * bpf_mprog is a generic layer for multi-program attachment. In-kernel users
 * of the bpf_mprog don't need to care about the dependency resolution
 * internals, they can just consume it with few API calls. Currently available
 * dependency directives are BPF_F_{BEFORE,AFTER} which enable insertion of
 * a BPF program or BPF link relative to an existing BPF program or BPF link
 * inside the multi-program array as well as prepend and append behavior if
 * no relative object was specified, see corresponding selftests for concrete
 * examples (e.g. tc_links and tc_opts test cases of test_progs).
 *
 * Usage of bpf_mprog_{attach,detach,query}() core APIs with pseudo code:
 *
 *  Attach case:
 *
 *   struct bpf_mprog_entry *entry, *entry_new;
 *   int ret;
 *
 *   // bpf_mprog user-side lock
 *   // fetch active @entry from attach location
 *   [...]
 *   ret = bpf_mprog_attach(entry, &entry_new, [...]);
 *   if (!ret) {
 *       if (entry != entry_new) {
 *           // swap @entry to @entry_new at attach location
 *           // ensure there are no inflight users of @entry:
 *           synchronize_rcu();
 *       }
 *       bpf_mprog_commit(entry);
 *   } else {
 *       // error path, bail out, propagate @ret
 *   }
 *   // bpf_mprog user-side unlock
 *
 *  Detach case:
 *
 *   struct bpf_mprog_entry *entry, *entry_new;
 *   int ret;
 *
 *   // bpf_mprog user-side lock
 *   // fetch active @entry from attach location
 *   [...]
 *   ret = bpf_mprog_detach(entry, &entry_new, [...]);
 *   if (!ret) {
 *       // all (*) marked is optional and depends on the use-case
 *       // whether bpf_mprog_bundle should be freed or not
 *       if (!bpf_mprog_total(entry_new))     (*)
 *           entry_new = NULL                 (*)
 *       // swap @entry to @entry_new at attach location
 *       // ensure there are no inflight users of @entry:
 *       synchronize_rcu();
 *       bpf_mprog_commit(entry);
 *       if (!entry_new)                      (*)
 *           // free bpf_mprog_bundle         (*)
 *   } else {
 *       // error path, bail out, propagate @ret
 *   }
 *   // bpf_mprog user-side unlock
 *
 *  Query case:
 *
 *   struct bpf_mprog_entry *entry;
 *   int ret;
 *
 *   // bpf_mprog user-side lock
 *   // fetch active @entry from attach location
 *   [...]
 *   ret = bpf_mprog_query(attr, uattr, entry);
 *   // bpf_mprog user-side unlock
 *
 *  Data/fast path:
 *
 *   struct bpf_mprog_entry *entry;
 *   struct bpf_mprog_fp *fp;
 *   struct bpf_prog *prog;
 *   int ret = [...];
 *
 *   rcu_read_lock();
 *   // fetch active @entry from attach location
 *   [...]
 *   bpf_mprog_foreach_prog(entry, fp, prog) {
 *       ret = bpf_prog_run(prog, [...]);
 *       // process @ret from program
 *   }
 *   [...]
 *   rcu_read_unlock();
 *
 * bpf_mprog locking considerations:
 *
 * bpf_mprog_{attach,detach,query}() must be protected by an external lock
 * (like RTNL in case of tcx).
 *
 * bpf_mprog_entry pointer can be an __rcu annotated pointer (in case of tcx
 * the netdevice has tcx_ingress and tcx_egress __rcu pointer) which gets
 * updated via rcu_assign_pointer() pointing to the active bpf_mprog_entry of
 * the bpf_mprog_bundle.
 *
 * Fast path accesses the active bpf_mprog_entry within RCU critical section
 * (in case of tcx it runs in NAPI which provides RCU protection there,
 * other users might need explicit rcu_read_lock()). The bpf_mprog_commit()
 * assumes that for the old bpf_mprog_entry there are no inflight users
 * anymore.
 *
 * The READ_ONCE()/WRITE_ONCE() pairing for bpf_mprog_fp's prog access is for
 * the replacement case where we don't swap the bpf_mprog_entry.
 */

#define bpf_mprog_foreach_tuple(entry, fp, cp, t)			\
	for (fp = &entry->fp_items[0], cp = &entry->parent->cp_items[0];\
	     ({								\
		t.prog = READ_ONCE(fp->prog);				\
		t.link = cp->link;					\
		t.prog;							\
	      });							\
	     fp++, cp++)

#define bpf_mprog_foreach_prog(entry, fp, p)				\
	for (fp = &entry->fp_items[0];					\
	     (p = READ_ONCE(fp->prog));					\
	     fp++)

#define BPF_MPROG_MAX 64

struct bpf_mprog_fp {
	struct bpf_prog *prog;
};

struct bpf_mprog_cp {
	struct bpf_link *link;
};

struct bpf_mprog_entry {
	struct bpf_mprog_fp fp_items[BPF_MPROG_MAX];
	struct bpf_mprog_bundle *parent;
};

struct bpf_mprog_bundle {
	struct bpf_mprog_entry a;
	struct bpf_mprog_entry b;
	struct bpf_mprog_cp cp_items[BPF_MPROG_MAX];
	struct bpf_prog *ref;
	atomic64_t revision;
	u32 count;
};

struct bpf_tuple {
	struct bpf_prog *prog;
	struct bpf_link *link;
};

static inline struct bpf_mprog_entry *
bpf_mprog_peer(const struct bpf_mprog_entry *entry)
{
	if (entry == &entry->parent->a)
		return &entry->parent->b;
	else
		return &entry->parent->a;
}

static inline void bpf_mprog_bundle_init(struct bpf_mprog_bundle *bundle)
{
	BUILD_BUG_ON(sizeof(bundle->a.fp_items[0]) > sizeof(u64));
	BUILD_BUG_ON(ARRAY_SIZE(bundle->a.fp_items) !=
		     ARRAY_SIZE(bundle->cp_items));

	memset(bundle, 0, sizeof(*bundle));
	atomic64_set(&bundle->revision, 1);
	bundle->a.parent = bundle;
	bundle->b.parent = bundle;
}

static inline void bpf_mprog_inc(struct bpf_mprog_entry *entry)
{
	entry->parent->count++;
}

static inline void bpf_mprog_dec(struct bpf_mprog_entry *entry)
{
	entry->parent->count--;
}

static inline int bpf_mprog_max(void)
{
	return ARRAY_SIZE(((struct bpf_mprog_entry *)NULL)->fp_items) - 1;
}

static inline int bpf_mprog_total(struct bpf_mprog_entry *entry)
{
	int total = entry->parent->count;

	WARN_ON_ONCE(total > bpf_mprog_max());
	return total;
}

static inline bool bpf_mprog_exists(struct bpf_mprog_entry *entry,
				    struct bpf_prog *prog)
{
	const struct bpf_mprog_fp *fp;
	const struct bpf_prog *tmp;

	bpf_mprog_foreach_prog(entry, fp, tmp) {
		if (tmp == prog)
			return true;
	}
	return false;
}

static inline void bpf_mprog_mark_for_release(struct bpf_mprog_entry *entry,
					      struct bpf_tuple *tuple)
{
	WARN_ON_ONCE(entry->parent->ref);
	if (!tuple->link)
		entry->parent->ref = tuple->prog;
}

static inline void bpf_mprog_complete_release(struct bpf_mprog_entry *entry)
{
	/* In the non-link case prog deletions can only drop the reference
	 * to the prog after the bpf_mprog_entry got swapped and the
	 * bpf_mprog ensured that there are no inflight users anymore.
	 *
	 * Paired with bpf_mprog_mark_for_release().
	 */
	if (entry->parent->ref) {
		bpf_prog_put(entry->parent->ref);
		entry->parent->ref = NULL;
	}
}

static inline void bpf_mprog_revision_new(struct bpf_mprog_entry *entry)
{
	atomic64_inc(&entry->parent->revision);
}

static inline void bpf_mprog_commit(struct bpf_mprog_entry *entry)
{
	bpf_mprog_complete_release(entry);
	bpf_mprog_revision_new(entry);
}

static inline u64 bpf_mprog_revision(struct bpf_mprog_entry *entry)
{
	return atomic64_read(&entry->parent->revision);
}

static inline void bpf_mprog_entry_copy(struct bpf_mprog_entry *dst,
					struct bpf_mprog_entry *src)
{
	memcpy(dst->fp_items, src->fp_items, sizeof(src->fp_items));
}

static inline void bpf_mprog_entry_clear(struct bpf_mprog_entry *dst)
{
	memset(dst->fp_items, 0, sizeof(dst->fp_items));
}

static inline void bpf_mprog_clear_all(struct bpf_mprog_entry *entry,
				       struct bpf_mprog_entry **entry_new)
{
	struct bpf_mprog_entry *peer;

	peer = bpf_mprog_peer(entry);
	bpf_mprog_entry_clear(peer);
	peer->parent->count = 0;
	*entry_new = peer;
}

static inline void bpf_mprog_entry_grow(struct bpf_mprog_entry *entry, int idx)
{
	int total = bpf_mprog_total(entry);

	memmove(entry->fp_items + idx + 1,
		entry->fp_items + idx,
		(total - idx) * sizeof(struct bpf_mprog_fp));

	memmove(entry->parent->cp_items + idx + 1,
		entry->parent->cp_items + idx,
		(total - idx) * sizeof(struct bpf_mprog_cp));
}

static inline void bpf_mprog_entry_shrink(struct bpf_mprog_entry *entry, int idx)
{
	/* Total array size is needed in this case to ensure the NULL
	 * entry is copied at the end.
	 */
	int total = ARRAY_SIZE(entry->fp_items);

	memmove(entry->fp_items + idx,
		entry->fp_items + idx + 1,
		(total - idx - 1) * sizeof(struct bpf_mprog_fp));

	memmove(entry->parent->cp_items + idx,
		entry->parent->cp_items + idx + 1,
		(total - idx - 1) * sizeof(struct bpf_mprog_cp));
}

static inline void bpf_mprog_read(struct bpf_mprog_entry *entry, u32 idx,
				  struct bpf_mprog_fp **fp,
				  struct bpf_mprog_cp **cp)
{
	*fp = &entry->fp_items[idx];
	*cp = &entry->parent->cp_items[idx];
}

static inline void bpf_mprog_write(struct bpf_mprog_fp *fp,
				   struct bpf_mprog_cp *cp,
				   struct bpf_tuple *tuple)
{
	WRITE_ONCE(fp->prog, tuple->prog);
	cp->link = tuple->link;
}

int bpf_mprog_attach(struct bpf_mprog_entry *entry,
		     struct bpf_mprog_entry **entry_new,
		     struct bpf_prog *prog_new, struct bpf_link *link,
		     struct bpf_prog *prog_old,
		     u32 flags, u32 id_or_fd, u64 revision);

int bpf_mprog_detach(struct bpf_mprog_entry *entry,
		     struct bpf_mprog_entry **entry_new,
		     struct bpf_prog *prog, struct bpf_link *link,
		     u32 flags, u32 id_or_fd, u64 revision);

int bpf_mprog_query(const union bpf_attr *attr, union bpf_attr __user *uattr,
		    struct bpf_mprog_entry *entry);

static inline bool bpf_mprog_supported(enum bpf_prog_type type)
{
	switch (type) {
	case BPF_PROG_TYPE_SCHED_CLS:
		return true;
	default:
		return false;
	}
}
#endif /* __BPF_MPROG_H */
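
For readers who want the attach and fast-path pseudo code from the header
comment spelled out, below is a hedged sketch of a hypothetical in-kernel
bpf_mprog user. The device structure, its sketch_entry __rcu attach location,
the mutex standing in for RTNL, and the assumption that the bpf_mprog_bundle
(and hence the active entry) was allocated beforehand are all illustrative
inventions; only bpf_mprog_attach(), bpf_mprog_commit(),
bpf_mprog_foreach_prog(), bpf_prog_run() and the RCU primitives are real APIs:

	#include <linux/bpf.h>
	#include <linux/bpf_mprog.h>
	#include <linux/filter.h>
	#include <linux/mutex.h>
	#include <linux/rcupdate.h>
	#include <linux/skbuff.h>

	/* Hypothetical attach location; tcx keeps equivalent __rcu
	 * pointers (tcx_ingress/tcx_egress) on the netdevice.
	 */
	struct sketch_dev {
		struct bpf_mprog_entry __rcu *sketch_entry;
		struct mutex lock;	/* plays the role RTNL plays for tcx */
	};

	static int sketch_attach(struct sketch_dev *dev, struct bpf_prog *prog,
				 u32 flags, u32 id_or_fd, u64 revision)
	{
		struct bpf_mprog_entry *entry, *entry_new;
		int ret;

		mutex_lock(&dev->lock);		/* bpf_mprog user-side lock */
		/* Assumes the bundle was created earlier so the active
		 * entry is non-NULL, as the real tcx setup path ensures.
		 */
		entry = rcu_dereference_protected(dev->sketch_entry,
						  lockdep_is_held(&dev->lock));
		ret = bpf_mprog_attach(entry, &entry_new, prog, NULL, NULL,
				       flags, id_or_fd, revision);
		if (!ret) {
			if (entry != entry_new) {
				/* swap active entry, then wait out any
				 * inflight users of the old one
				 */
				rcu_assign_pointer(dev->sketch_entry, entry_new);
				synchronize_rcu();
			}
			bpf_mprog_commit(entry);
		}
		mutex_unlock(&dev->lock);	/* bpf_mprog user-side unlock */
		return ret;
	}

	/* Fast path: run all attached programs under RCU protection. */
	static int sketch_run(struct sketch_dev *dev, struct sk_buff *skb)
	{
		struct bpf_mprog_entry *entry;
		struct bpf_mprog_fp *fp;
		struct bpf_prog *prog;
		int ret = 0;

		rcu_read_lock();
		entry = rcu_dereference(dev->sketch_entry);
		if (entry) {
			bpf_mprog_foreach_prog(entry, fp, prog)
				ret = bpf_prog_run(prog, skb);
		}
		rcu_read_unlock();
		return ret;
	}

The sketch mirrors the header's attach pseudo code step for step: mutation
happens under the external lock, the entry swap is published with
rcu_assign_pointer() plus synchronize_rcu(), and readers only ever walk the
fp_items of whichever entry was active when they entered the RCU section.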