mirror of
				https://github.com/torvalds/linux.git
				synced 2025-11-04 02:30:34 +02:00 
			
		
		
		
	cgroup: add support for eBPF programs
This patch adds two sets of eBPF program pointers to struct cgroup.
One set holds the programs that are directly pinned to a cgroup, and the
other holds the programs that are effective for it.
To illustrate the logic behind that, assume the following example
cgroup hierarchy.
  A - B - C
        \ D - E
If only B has a program attached, it will be effective for B, C, D
and E. If D then attaches a program itself, that will be effective for
both D and E, and the program in B will only affect B and C. Only one
program of a given type is effective for a cgroup.
Attaching and detaching programs will be done through the bpf(2)
syscall. For now, ingress and egress inet socket filtering are the
only supported use-cases.
Signed-off-by: Daniel Mack <daniel@zonque.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
			
			
This commit is contained in:
		
							parent
							
								
									0e33661de4
								
							
						
					
					
						commit
						3007098494
					
				
					 6 changed files with 281 additions and 0 deletions
				
			
		
							
								
								
									
										79
									
								
								include/linux/bpf-cgroup.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										79
									
								
								include/linux/bpf-cgroup.h
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,79 @@
 | 
			
		|||
#ifndef _BPF_CGROUP_H
 | 
			
		||||
#define _BPF_CGROUP_H
 | 
			
		||||
 | 
			
		||||
#include <linux/bpf.h>
 | 
			
		||||
#include <linux/jump_label.h>
 | 
			
		||||
#include <uapi/linux/bpf.h>
 | 
			
		||||
 | 
			
		||||
struct sock;
 | 
			
		||||
struct cgroup;
 | 
			
		||||
struct sk_buff;
 | 
			
		||||
 | 
			
		||||
#ifdef CONFIG_CGROUP_BPF
 | 
			
		||||
 | 
			
		||||
extern struct static_key_false cgroup_bpf_enabled_key;
 | 
			
		||||
#define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key)
 | 
			
		||||
 | 
			
		||||
struct cgroup_bpf {
 | 
			
		||||
	/*
 | 
			
		||||
	 * Store two sets of bpf_prog pointers, one for programs that are
 | 
			
		||||
	 * pinned directly to this cgroup, and one for those that are effective
 | 
			
		||||
	 * when this cgroup is accessed.
 | 
			
		||||
	 */
 | 
			
		||||
	struct bpf_prog *prog[MAX_BPF_ATTACH_TYPE];
 | 
			
		||||
	struct bpf_prog *effective[MAX_BPF_ATTACH_TYPE];
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
void cgroup_bpf_put(struct cgroup *cgrp);
 | 
			
		||||
void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent);
 | 
			
		||||
 | 
			
		||||
void __cgroup_bpf_update(struct cgroup *cgrp,
 | 
			
		||||
			 struct cgroup *parent,
 | 
			
		||||
			 struct bpf_prog *prog,
 | 
			
		||||
			 enum bpf_attach_type type);
 | 
			
		||||
 | 
			
		||||
/* Wrapper for __cgroup_bpf_update() protected by cgroup_mutex */
 | 
			
		||||
void cgroup_bpf_update(struct cgroup *cgrp,
 | 
			
		||||
		       struct bpf_prog *prog,
 | 
			
		||||
		       enum bpf_attach_type type);
 | 
			
		||||
 | 
			
		||||
int __cgroup_bpf_run_filter(struct sock *sk,
 | 
			
		||||
			    struct sk_buff *skb,
 | 
			
		||||
			    enum bpf_attach_type type);
 | 
			
		||||
 | 
			
		||||
/* Wrappers for __cgroup_bpf_run_filter() guarded by cgroup_bpf_enabled. */
 | 
			
		||||
#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb)			\
 | 
			
		||||
({									\
 | 
			
		||||
	int __ret = 0;							\
 | 
			
		||||
	if (cgroup_bpf_enabled)						\
 | 
			
		||||
		__ret = __cgroup_bpf_run_filter(sk, skb,		\
 | 
			
		||||
						BPF_CGROUP_INET_INGRESS); \
 | 
			
		||||
									\
 | 
			
		||||
	__ret;								\
 | 
			
		||||
})
 | 
			
		||||
 | 
			
		||||
#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb)				\
 | 
			
		||||
({									\
 | 
			
		||||
	int __ret = 0;							\
 | 
			
		||||
	if (cgroup_bpf_enabled && sk && sk == skb->sk) {		\
 | 
			
		||||
		typeof(sk) __sk = sk_to_full_sk(sk);			\
 | 
			
		||||
		if (sk_fullsock(__sk))					\
 | 
			
		||||
			__ret = __cgroup_bpf_run_filter(__sk, skb,	\
 | 
			
		||||
						BPF_CGROUP_INET_EGRESS); \
 | 
			
		||||
	}								\
 | 
			
		||||
	__ret;								\
 | 
			
		||||
})
 | 
			
		||||
 | 
			
		||||
#else
 | 
			
		||||
 | 
			
		||||
struct cgroup_bpf {};
 | 
			
		||||
static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
 | 
			
		||||
static inline void cgroup_bpf_inherit(struct cgroup *cgrp,
 | 
			
		||||
				      struct cgroup *parent) {}
 | 
			
		||||
 | 
			
		||||
#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
 | 
			
		||||
#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
 | 
			
		||||
 | 
			
		||||
#endif /* CONFIG_CGROUP_BPF */
 | 
			
		||||
 | 
			
		||||
#endif /* _BPF_CGROUP_H */
 | 
			
		||||
| 
						 | 
				
			
			@ -16,6 +16,7 @@
 | 
			
		|||
#include <linux/percpu-refcount.h>
 | 
			
		||||
#include <linux/percpu-rwsem.h>
 | 
			
		||||
#include <linux/workqueue.h>
 | 
			
		||||
#include <linux/bpf-cgroup.h>
 | 
			
		||||
 | 
			
		||||
#ifdef CONFIG_CGROUPS
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -300,6 +301,9 @@ struct cgroup {
 | 
			
		|||
	/* used to schedule release agent */
 | 
			
		||||
	struct work_struct release_agent_work;
 | 
			
		||||
 | 
			
		||||
	/* used to store eBPF programs */
 | 
			
		||||
	struct cgroup_bpf bpf;
 | 
			
		||||
 | 
			
		||||
	/* ids of the ancestors at each level including self */
 | 
			
		||||
	int ancestor_ids[];
 | 
			
		||||
};
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
							
								
								
									
										12
									
								
								init/Kconfig
									
									
									
									
									
								
							
							
						
						
									
										12
									
								
								init/Kconfig
									
									
									
									
									
								
							| 
						 | 
				
			
			@ -1154,6 +1154,18 @@ config CGROUP_PERF
 | 
			
		|||
 | 
			
		||||
	  Say N if unsure.
 | 
			
		||||
 | 
			
		||||
config CGROUP_BPF
 | 
			
		||||
	bool "Support for eBPF programs attached to cgroups"
 | 
			
		||||
	depends on BPF_SYSCALL && SOCK_CGROUP_DATA
 | 
			
		||||
	help
 | 
			
		||||
	  Allow attaching eBPF programs to a cgroup using the bpf(2)
 | 
			
		||||
	  syscall command BPF_PROG_ATTACH.
 | 
			
		||||
 | 
			
		||||
	  In which context these programs are accessed depends on the type
 | 
			
		||||
	  of attachment. For instance, programs that are attached using
 | 
			
		||||
	  BPF_CGROUP_INET_INGRESS will be executed on the ingress path of
 | 
			
		||||
	  inet sockets.
 | 
			
		||||
 | 
			
		||||
config CGROUP_DEBUG
 | 
			
		||||
	bool "Example controller"
 | 
			
		||||
	default n
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -5,3 +5,4 @@ obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list
 | 
			
		|||
ifeq ($(CONFIG_PERF_EVENTS),y)
 | 
			
		||||
obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
 | 
			
		||||
endif
 | 
			
		||||
obj-$(CONFIG_CGROUP_BPF) += cgroup.o
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
							
								
								
									
										167
									
								
								kernel/bpf/cgroup.c
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										167
									
								
								kernel/bpf/cgroup.c
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,167 @@
 | 
			
		|||
/*
 | 
			
		||||
 * Functions to manage eBPF programs attached to cgroups
 | 
			
		||||
 *
 | 
			
		||||
 * Copyright (c) 2016 Daniel Mack
 | 
			
		||||
 *
 | 
			
		||||
 * This file is subject to the terms and conditions of version 2 of the GNU
 | 
			
		||||
 * General Public License.  See the file COPYING in the main directory of the
 | 
			
		||||
 * Linux distribution for more details.
 | 
			
		||||
 */
 | 
			
		||||
 | 
			
		||||
#include <linux/kernel.h>
 | 
			
		||||
#include <linux/atomic.h>
 | 
			
		||||
#include <linux/cgroup.h>
 | 
			
		||||
#include <linux/slab.h>
 | 
			
		||||
#include <linux/bpf.h>
 | 
			
		||||
#include <linux/bpf-cgroup.h>
 | 
			
		||||
#include <net/sock.h>
 | 
			
		||||
 | 
			
		||||
DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key);
 | 
			
		||||
EXPORT_SYMBOL(cgroup_bpf_enabled_key);
 | 
			
		||||
 | 
			
		||||
/**
 | 
			
		||||
 * cgroup_bpf_put() - put references of all bpf programs
 | 
			
		||||
 * @cgrp: the cgroup to modify
 | 
			
		||||
 */
 | 
			
		||||
void cgroup_bpf_put(struct cgroup *cgrp)
 | 
			
		||||
{
 | 
			
		||||
	unsigned int type;
 | 
			
		||||
 | 
			
		||||
	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.prog); type++) {
 | 
			
		||||
		struct bpf_prog *prog = cgrp->bpf.prog[type];
 | 
			
		||||
 | 
			
		||||
		if (prog) {
 | 
			
		||||
			bpf_prog_put(prog);
 | 
			
		||||
			static_branch_dec(&cgroup_bpf_enabled_key);
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/**
 | 
			
		||||
 * cgroup_bpf_inherit() - inherit effective programs from parent
 | 
			
		||||
 * @cgrp: the cgroup to modify
 | 
			
		||||
 * @parent: the parent to inherit from
 | 
			
		||||
 */
 | 
			
		||||
void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent)
 | 
			
		||||
{
 | 
			
		||||
	unsigned int type;
 | 
			
		||||
 | 
			
		||||
	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.effective); type++) {
 | 
			
		||||
		struct bpf_prog *e;
 | 
			
		||||
 | 
			
		||||
		e = rcu_dereference_protected(parent->bpf.effective[type],
 | 
			
		||||
					      lockdep_is_held(&cgroup_mutex));
 | 
			
		||||
		rcu_assign_pointer(cgrp->bpf.effective[type], e);
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/**
 | 
			
		||||
 * __cgroup_bpf_update() - Update the pinned program of a cgroup, and
 | 
			
		||||
 *                         propagate the change to descendants
 | 
			
		||||
 * @cgrp: The cgroup whose descendants to traverse
 | 
			
		||||
 * @parent: The parent of @cgrp, or %NULL if @cgrp is the root
 | 
			
		||||
 * @prog: A new program to pin
 | 
			
		||||
 * @type: Type of pinning operation (ingress/egress)
 | 
			
		||||
 *
 | 
			
		||||
 * Each cgroup has a set of two pointers for bpf programs; one for eBPF
 | 
			
		||||
 * programs it owns, and one for the program which is effective for execution.
 | 
			
		||||
 *
 | 
			
		||||
 * If @prog is not %NULL, this function attaches a new program to the cgroup and
 | 
			
		||||
 * releases the one that is currently attached, if any. @prog is then made
 | 
			
		||||
 * the effective program of type @type in that cgroup.
 | 
			
		||||
 *
 | 
			
		||||
 * If @prog is %NULL, the currently attached program of type @type is released,
 | 
			
		||||
 * and the effective program of the parent cgroup (if any) is inherited to
 | 
			
		||||
 * @cgrp.
 | 
			
		||||
 *
 | 
			
		||||
 * Then, the descendants of @cgrp are walked and the effective program for
 | 
			
		||||
 * each of them is set to the effective program of @cgrp unless the
 | 
			
		||||
 * descendant has its own program attached, in which case the subbranch is
 | 
			
		||||
 * skipped. This ensures that delegated subcgroups with own programs are left
 | 
			
		||||
 * untouched.
 | 
			
		||||
 *
 | 
			
		||||
 * Must be called with cgroup_mutex held.
 | 
			
		||||
 */
 | 
			
		||||
void __cgroup_bpf_update(struct cgroup *cgrp,
 | 
			
		||||
			 struct cgroup *parent,
 | 
			
		||||
			 struct bpf_prog *prog,
 | 
			
		||||
			 enum bpf_attach_type type)
 | 
			
		||||
{
 | 
			
		||||
	struct bpf_prog *old_prog, *effective;
 | 
			
		||||
	struct cgroup_subsys_state *pos;
 | 
			
		||||
 | 
			
		||||
	old_prog = xchg(cgrp->bpf.prog + type, prog);
 | 
			
		||||
 | 
			
		||||
	effective = (!prog && parent) ?
 | 
			
		||||
		rcu_dereference_protected(parent->bpf.effective[type],
 | 
			
		||||
					  lockdep_is_held(&cgroup_mutex)) :
 | 
			
		||||
		prog;
 | 
			
		||||
 | 
			
		||||
	css_for_each_descendant_pre(pos, &cgrp->self) {
 | 
			
		||||
		struct cgroup *desc = container_of(pos, struct cgroup, self);
 | 
			
		||||
 | 
			
		||||
		/* skip the subtree if the descendant has its own program */
 | 
			
		||||
		if (desc->bpf.prog[type] && desc != cgrp)
 | 
			
		||||
			pos = css_rightmost_descendant(pos);
 | 
			
		||||
		else
 | 
			
		||||
			rcu_assign_pointer(desc->bpf.effective[type],
 | 
			
		||||
					   effective);
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	if (prog)
 | 
			
		||||
		static_branch_inc(&cgroup_bpf_enabled_key);
 | 
			
		||||
 | 
			
		||||
	if (old_prog) {
 | 
			
		||||
		bpf_prog_put(old_prog);
 | 
			
		||||
		static_branch_dec(&cgroup_bpf_enabled_key);
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/**
 | 
			
		||||
 * __cgroup_bpf_run_filter() - Run a program for packet filtering
 | 
			
		||||
 * @sk: The socket sending or receiving traffic
 | 
			
		||||
 * @skb: The skb that is being sent or received
 | 
			
		||||
 * @type: The type of program to be executed
 | 
			
		||||
 *
 | 
			
		||||
 * If no socket is passed, or the socket is not of type INET or INET6,
 | 
			
		||||
 * this function does nothing and returns 0.
 | 
			
		||||
 *
 | 
			
		||||
 * The program type passed in via @type must be suitable for network
 | 
			
		||||
 * filtering. No further check is performed to assert that.
 | 
			
		||||
 *
 | 
			
		||||
 * This function will return %-EPERM if an attached program was found
 | 
			
		||||
 * and if it returned != 1 during execution. In all other cases, 0 is returned.
 | 
			
		||||
 */
 | 
			
		||||
int __cgroup_bpf_run_filter(struct sock *sk,
 | 
			
		||||
			    struct sk_buff *skb,
 | 
			
		||||
			    enum bpf_attach_type type)
 | 
			
		||||
{
 | 
			
		||||
	struct bpf_prog *prog;
 | 
			
		||||
	struct cgroup *cgrp;
 | 
			
		||||
	int ret = 0;
 | 
			
		||||
 | 
			
		||||
	if (!sk || !sk_fullsock(sk))
 | 
			
		||||
		return 0;
 | 
			
		||||
 | 
			
		||||
	if (sk->sk_family != AF_INET &&
 | 
			
		||||
	    sk->sk_family != AF_INET6)
 | 
			
		||||
		return 0;
 | 
			
		||||
 | 
			
		||||
	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
 | 
			
		||||
 | 
			
		||||
	rcu_read_lock();
 | 
			
		||||
 | 
			
		||||
	prog = rcu_dereference(cgrp->bpf.effective[type]);
 | 
			
		||||
	if (prog) {
 | 
			
		||||
		unsigned int offset = skb->data - skb_network_header(skb);
 | 
			
		||||
 | 
			
		||||
		__skb_push(skb, offset);
 | 
			
		||||
		ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 0 : -EPERM;
 | 
			
		||||
		__skb_pull(skb, offset);
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	rcu_read_unlock();
 | 
			
		||||
 | 
			
		||||
	return ret;
 | 
			
		||||
}
 | 
			
		||||
EXPORT_SYMBOL(__cgroup_bpf_run_filter);
 | 
			
		||||
| 
						 | 
				
			
			@ -5074,6 +5074,8 @@ static void css_release_work_fn(struct work_struct *work)
 | 
			
		|||
		if (cgrp->kn)
 | 
			
		||||
			RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
 | 
			
		||||
					 NULL);
 | 
			
		||||
 | 
			
		||||
		cgroup_bpf_put(cgrp);
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	mutex_unlock(&cgroup_mutex);
 | 
			
		||||
| 
						 | 
				
			
			@ -5281,6 +5283,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
 | 
			
		|||
	if (!cgroup_on_dfl(cgrp))
 | 
			
		||||
		cgrp->subtree_control = cgroup_control(cgrp);
 | 
			
		||||
 | 
			
		||||
	if (parent)
 | 
			
		||||
		cgroup_bpf_inherit(cgrp, parent);
 | 
			
		||||
 | 
			
		||||
	cgroup_propagate_control(cgrp);
 | 
			
		||||
 | 
			
		||||
	/* @cgrp doesn't have dir yet so the following will only create csses */
 | 
			
		||||
| 
						 | 
				
			
			@ -6495,6 +6500,19 @@ static __init int cgroup_namespaces_init(void)
 | 
			
		|||
}
 | 
			
		||||
subsys_initcall(cgroup_namespaces_init);
 | 
			
		||||
 | 
			
		||||
#ifdef CONFIG_CGROUP_BPF
 | 
			
		||||
void cgroup_bpf_update(struct cgroup *cgrp,
 | 
			
		||||
		       struct bpf_prog *prog,
 | 
			
		||||
		       enum bpf_attach_type type)
 | 
			
		||||
{
 | 
			
		||||
	struct cgroup *parent = cgroup_parent(cgrp);
 | 
			
		||||
 | 
			
		||||
	mutex_lock(&cgroup_mutex);
 | 
			
		||||
	__cgroup_bpf_update(cgrp, parent, prog, type);
 | 
			
		||||
	mutex_unlock(&cgroup_mutex);
 | 
			
		||||
}
 | 
			
		||||
#endif /* CONFIG_CGROUP_BPF */
 | 
			
		||||
 | 
			
		||||
#ifdef CONFIG_CGROUP_DEBUG
 | 
			
		||||
static struct cgroup_subsys_state *
 | 
			
		||||
debug_css_alloc(struct cgroup_subsys_state *parent_css)
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in a new issue