mirror of
				https://github.com/torvalds/linux.git
				synced 2025-10-31 08:38:45 +02:00 
			
		
		
		
	 1cb6f0bae5
			
		
	
	
		1cb6f0bae5
		
	
	
	
	
		
			
			Pedro Pinto and later independently also Hyunwoo Kim and Wongi Lee reported
an issue that the tcx_entry can be released too early leading to a use
after free (UAF) when an active old-style ingress or clsact qdisc with a
shared tc block is later replaced by another ingress or clsact instance.
Essentially, the sequence to trigger the UAF (one example) can be as follows:
  1. A network namespace is created
  2. An ingress qdisc is created. This allocates a tcx_entry, and
     &tcx_entry->miniq is stored in the qdisc's miniqp->p_miniq. At the
     same time, a tcf block with index 1 is created.
  3. chain0 is attached to the tcf block. chain0 must be connected to
     the block linked to the ingress qdisc to later reach the function
     tcf_chain0_head_change_cb_del() which triggers the UAF.
  4. Create and graft a clsact qdisc. This causes the ingress qdisc
     created in step 2 to be removed, thus freeing the previously linked
     tcx_entry:
     rtnetlink_rcv_msg()
       => tc_modify_qdisc()
         => qdisc_create()
           => clsact_init() [a]
         => qdisc_graft()
           => qdisc_destroy()
             => __qdisc_destroy()
               => ingress_destroy() [b]
                 => tcx_entry_free()
                   => kfree_rcu() // tcx_entry freed
  5. Finally, the network namespace is closed. This registers the
     cleanup_net worker, and during the process of releasing the
     remaining clsact qdisc, it accesses the tcx_entry that was
     already freed in step 4, causing the UAF to occur:
     cleanup_net()
       => ops_exit_list()
         => default_device_exit_batch()
           => unregister_netdevice_many()
             => unregister_netdevice_many_notify()
               => dev_shutdown()
                 => qdisc_put()
                   => clsact_destroy() [c]
                     => tcf_block_put_ext()
                       => tcf_chain0_head_change_cb_del()
                         => tcf_chain_head_change_item()
                           => clsact_chain_head_change()
                             => mini_qdisc_pair_swap() // UAF
There are also other variants; the gist is to add an ingress (or clsact)
qdisc with a specific shared block, then to replace that qdisc, wait
for the tcx_entry kfree_rcu() to be executed, and subsequently access
the currently active qdisc's miniq one way or another.
The correct fix is to turn the miniq_active boolean into a counter. With
this change, at step 2 above the counter transitions from 0->1, at
step [a] from 1->2 (in order for the miniq object to remain active during
the replacement), then in [b] from 2->1 and finally in [c] from 1->0 with
the eventual release. The reference counter in general ranges from [0,2]
and it does not need to be atomic since all access to the counter is
protected by the rtnl mutex. With this in place, there is no longer a UAF
happening and the tcx_entry is freed at the correct time.
Fixes: e420bed025 ("bpf: Add fd-based tcx multi-prog infra with link support")
Reported-by: Pedro Pinto <xten@osec.io>
Co-developed-by: Pedro Pinto <xten@osec.io>
Signed-off-by: Pedro Pinto <xten@osec.io>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Hyunwoo Kim <v4bel@theori.io>
Cc: Wongi Lee <qwerty@theori.io>
Cc: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20240708133130.11609-1-daniel@iogearbox.net
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
		
	
			
		
			
				
	
	
		
			207 lines
		
	
	
	
		
			4.4 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			207 lines
		
	
	
	
		
			4.4 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /* SPDX-License-Identifier: GPL-2.0 */
 | |
| /* Copyright (c) 2023 Isovalent */
 | |
| #ifndef __NET_TCX_H
 | |
| #define __NET_TCX_H
 | |
| 
 | |
| #include <linux/bpf.h>
 | |
| #include <linux/bpf_mprog.h>
 | |
| 
 | |
| #include <net/sch_generic.h>
 | |
| 
 | |
| struct mini_Qdisc;
 | |
| 
 | |
| struct tcx_entry {
 | |
| 	struct mini_Qdisc __rcu *miniq;
 | |
| 	struct bpf_mprog_bundle bundle;
 | |
| 	u32 miniq_active;
 | |
| 	struct rcu_head rcu;
 | |
| };
 | |
| 
 | |
| struct tcx_link {
 | |
| 	struct bpf_link link;
 | |
| 	struct net_device *dev;
 | |
| 	u32 location;
 | |
| };
 | |
| 
 | |
/* Store @ingress in skb->tc_at_ingress. Compiles to an empty function
 * when CONFIG_NET_XGRESS is not set.
 */
static inline void tcx_set_ingress(struct sk_buff *skb, bool ingress)
{
#ifdef CONFIG_NET_XGRESS
	skb->tc_at_ingress = ingress;
#endif /* CONFIG_NET_XGRESS */
}
 | |
| 
 | |
| #ifdef CONFIG_NET_XGRESS
 | |
| static inline struct tcx_entry *tcx_entry(struct bpf_mprog_entry *entry)
 | |
| {
 | |
| 	struct bpf_mprog_bundle *bundle = entry->parent;
 | |
| 
 | |
| 	return container_of(bundle, struct tcx_entry, bundle);
 | |
| }
 | |
| 
 | |
| static inline struct tcx_link *tcx_link(const struct bpf_link *link)
 | |
| {
 | |
| 	return container_of(link, struct tcx_link, link);
 | |
| }
 | |
| 
 | |
| void tcx_inc(void);
 | |
| void tcx_dec(void);
 | |
| 
 | |
/* Wait for an RCU grace period. */
static inline void tcx_entry_sync(void)
{
	/* The a/b halves of the bpf_mprog_entry were swapped, so make
	 * sure nobody is still in flight on the old one.
	 */
	synchronize_rcu();
}
 | |
| 
 | |
| static inline void
 | |
| tcx_entry_update(struct net_device *dev, struct bpf_mprog_entry *entry,
 | |
| 		 bool ingress)
 | |
| {
 | |
| 	ASSERT_RTNL();
 | |
| 	if (ingress)
 | |
| 		rcu_assign_pointer(dev->tcx_ingress, entry);
 | |
| 	else
 | |
| 		rcu_assign_pointer(dev->tcx_egress, entry);
 | |
| }
 | |
| 
 | |
| static inline struct bpf_mprog_entry *
 | |
| tcx_entry_fetch(struct net_device *dev, bool ingress)
 | |
| {
 | |
| 	ASSERT_RTNL();
 | |
| 	if (ingress)
 | |
| 		return rcu_dereference_rtnl(dev->tcx_ingress);
 | |
| 	else
 | |
| 		return rcu_dereference_rtnl(dev->tcx_egress);
 | |
| }
 | |
| 
 | |
| static inline struct bpf_mprog_entry *tcx_entry_create_noprof(void)
 | |
| {
 | |
| 	struct tcx_entry *tcx = kzalloc_noprof(sizeof(*tcx), GFP_KERNEL);
 | |
| 
 | |
| 	if (tcx) {
 | |
| 		bpf_mprog_bundle_init(&tcx->bundle);
 | |
| 		return &tcx->bundle.a;
 | |
| 	}
 | |
| 	return NULL;
 | |
| }
 | |
| #define tcx_entry_create(...)	alloc_hooks(tcx_entry_create_noprof(__VA_ARGS__))
 | |
| 
 | |
| static inline void tcx_entry_free(struct bpf_mprog_entry *entry)
 | |
| {
 | |
| 	kfree_rcu(tcx_entry(entry), rcu);
 | |
| }
 | |
| 
 | |
/* Return the entry currently installed on @dev for the given direction,
 * or a newly created one if none is installed.
 *
 * *@created is set to true only when a new entry was created; it is
 * false when an existing entry is returned or when creation failed.
 * Returns NULL if a new entry was needed but could not be created.
 */
static inline struct bpf_mprog_entry *
tcx_entry_fetch_or_create(struct net_device *dev, bool ingress, bool *created)
{
	struct bpf_mprog_entry *entry = tcx_entry_fetch(dev, ingress);

	*created = false;
	if (entry)
		return entry;

	entry = tcx_entry_create();
	if (entry)
		*created = true;

	return entry;
}
 | |
| 
 | |
/* Call tcx_inc(), then net_inc_ingress_queue() or net_inc_egress_queue()
 * depending on @ingress.
 */
static inline void tcx_skeys_inc(bool ingress)
{
	tcx_inc();

	if (!ingress) {
		net_inc_egress_queue();
		return;
	}

	net_inc_ingress_queue();
}
 | |
| 
 | |
/* Mirror of tcx_skeys_inc(): call net_dec_ingress_queue() or
 * net_dec_egress_queue() depending on @ingress, then tcx_dec().
 */
static inline void tcx_skeys_dec(bool ingress)
{
	if (!ingress)
		net_dec_egress_queue();
	else
		net_dec_ingress_queue();

	tcx_dec();
}
 | |
| 
 | |
| static inline void tcx_miniq_inc(struct bpf_mprog_entry *entry)
 | |
| {
 | |
| 	ASSERT_RTNL();
 | |
| 	tcx_entry(entry)->miniq_active++;
 | |
| }
 | |
| 
 | |
| static inline void tcx_miniq_dec(struct bpf_mprog_entry *entry)
 | |
| {
 | |
| 	ASSERT_RTNL();
 | |
| 	tcx_entry(entry)->miniq_active--;
 | |
| }
 | |
| 
 | |
| static inline bool tcx_entry_is_active(struct bpf_mprog_entry *entry)
 | |
| {
 | |
| 	ASSERT_RTNL();
 | |
| 	return bpf_mprog_total(entry) || tcx_entry(entry)->miniq_active;
 | |
| }
 | |
| 
 | |
| static inline enum tcx_action_base tcx_action_code(struct sk_buff *skb,
 | |
| 						   int code)
 | |
| {
 | |
| 	switch (code) {
 | |
| 	case TCX_PASS:
 | |
| 		skb->tc_index = qdisc_skb_cb(skb)->tc_classid;
 | |
| 		fallthrough;
 | |
| 	case TCX_DROP:
 | |
| 	case TCX_REDIRECT:
 | |
| 		return code;
 | |
| 	case TCX_NEXT:
 | |
| 	default:
 | |
| 		return TCX_NEXT;
 | |
| 	}
 | |
| }
 | |
| #endif /* CONFIG_NET_XGRESS */
 | |
| 
 | |
| #if defined(CONFIG_NET_XGRESS) && defined(CONFIG_BPF_SYSCALL)
 | |
| int tcx_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog);
 | |
| int tcx_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
 | |
| int tcx_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog);
 | |
| void tcx_uninstall(struct net_device *dev, bool ingress);
 | |
| 
 | |
| int tcx_prog_query(const union bpf_attr *attr,
 | |
| 		   union bpf_attr __user *uattr);
 | |
| 
 | |
/* Call tcx_uninstall() on @dev for ingress (true) first, then for
 * egress (false). Asserts that RTNL is held.
 */
static inline void dev_tcx_uninstall(struct net_device *dev)
{
	ASSERT_RTNL();

	tcx_uninstall(dev, true);	/* ingress */
	tcx_uninstall(dev, false);	/* egress */
}
 | |
| #else
 | |
/* Stub for builds lacking CONFIG_NET_XGRESS or CONFIG_BPF_SYSCALL:
 * always fails with -EINVAL.
 */
static inline int tcx_prog_attach(const union bpf_attr *attr,
				  struct bpf_prog *prog)
{
	return -EINVAL;
}
 | |
| 
 | |
/* Stub for builds lacking CONFIG_NET_XGRESS or CONFIG_BPF_SYSCALL:
 * always fails with -EINVAL.
 */
static inline int tcx_link_attach(const union bpf_attr *attr,
				  struct bpf_prog *prog)
{
	return -EINVAL;
}
 | |
| 
 | |
/* Stub for builds lacking CONFIG_NET_XGRESS or CONFIG_BPF_SYSCALL:
 * always fails with -EINVAL.
 */
static inline int tcx_prog_detach(const union bpf_attr *attr,
				  struct bpf_prog *prog)
{
	return -EINVAL;
}
 | |
| 
 | |
| static inline int tcx_prog_query(const union bpf_attr *attr,
 | |
| 				 union bpf_attr __user *uattr)
 | |
| {
 | |
| 	return -EINVAL;
 | |
| }
 | |
| 
 | |
/* Stub for builds lacking CONFIG_NET_XGRESS or CONFIG_BPF_SYSCALL:
 * does nothing.
 */
static inline void dev_tcx_uninstall(struct net_device *dev)
{
}
 | |
| #endif /* CONFIG_NET_XGRESS && CONFIG_BPF_SYSCALL */
 | |
| #endif /* __NET_TCX_H */
 |