forked from mirrors/linux
		
	cgroup: add support for eBPF programs
This patch adds two sets of eBPF program pointers to struct cgroup.
One set holds the programs that are directly pinned to a cgroup, and the
other holds those that are effective for it.
To illustrate the logic behind that, assume the following example
cgroup hierarchy.
  A - B - C
        \ D - E
If only B has a program attached, it will be effective for B, C, D
and E. If D then attaches a program itself, that will be effective for
both D and E, and the program in B will only affect B and C. Only one
program of a given type is effective for a cgroup.
Attaching and detaching programs will be done through the bpf(2)
syscall. For now, ingress and egress inet socket filtering are the
only supported use-cases.
Signed-off-by: Daniel Mack <daniel@zonque.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
			
			
This commit is contained in:
		
							parent
							
								
									0e33661de4
								
							
						
					
					
						commit
						3007098494
					
				
					 6 changed files with 281 additions and 0 deletions
				
			
		
							
								
								
									
										79
									
								
								include/linux/bpf-cgroup.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										79
									
								
								include/linux/bpf-cgroup.h
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,79 @@ | |||
| #ifndef _BPF_CGROUP_H | ||||
| #define _BPF_CGROUP_H | ||||
| 
 | ||||
| #include <linux/bpf.h> | ||||
| #include <linux/jump_label.h> | ||||
| #include <uapi/linux/bpf.h> | ||||
| 
 | ||||
| struct sock; | ||||
| struct cgroup; | ||||
| struct sk_buff; | ||||
| 
 | ||||
| #ifdef CONFIG_CGROUP_BPF | ||||
| 
 | ||||
| extern struct static_key_false cgroup_bpf_enabled_key; | ||||
| #define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key) | ||||
| 
 | ||||
| struct cgroup_bpf { | ||||
| 	/*
 | ||||
| 	 * Store two sets of bpf_prog pointers, one for programs that are | ||||
| 	 * pinned directly to this cgroup, and one for those that are effective | ||||
| 	 * when this cgroup is accessed. | ||||
| 	 */ | ||||
| 	struct bpf_prog *prog[MAX_BPF_ATTACH_TYPE]; | ||||
| 	struct bpf_prog *effective[MAX_BPF_ATTACH_TYPE]; | ||||
| }; | ||||
| 
 | ||||
| void cgroup_bpf_put(struct cgroup *cgrp); | ||||
| void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent); | ||||
| 
 | ||||
| void __cgroup_bpf_update(struct cgroup *cgrp, | ||||
| 			 struct cgroup *parent, | ||||
| 			 struct bpf_prog *prog, | ||||
| 			 enum bpf_attach_type type); | ||||
| 
 | ||||
| /* Wrapper for __cgroup_bpf_update() protected by cgroup_mutex */ | ||||
| void cgroup_bpf_update(struct cgroup *cgrp, | ||||
| 		       struct bpf_prog *prog, | ||||
| 		       enum bpf_attach_type type); | ||||
| 
 | ||||
| int __cgroup_bpf_run_filter(struct sock *sk, | ||||
| 			    struct sk_buff *skb, | ||||
| 			    enum bpf_attach_type type); | ||||
| 
 | ||||
| /* Wrappers for __cgroup_bpf_run_filter() guarded by cgroup_bpf_enabled. */ | ||||
| #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb)			\ | ||||
| ({									\ | ||||
| 	int __ret = 0;							\ | ||||
| 	if (cgroup_bpf_enabled)						\ | ||||
| 		__ret = __cgroup_bpf_run_filter(sk, skb,		\ | ||||
| 						BPF_CGROUP_INET_INGRESS); \ | ||||
| 									\ | ||||
| 	__ret;								\ | ||||
| }) | ||||
| 
 | ||||
| #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb)				\ | ||||
| ({									\ | ||||
| 	int __ret = 0;							\ | ||||
| 	if (cgroup_bpf_enabled && sk && sk == skb->sk) {		\ | ||||
| 		typeof(sk) __sk = sk_to_full_sk(sk);			\ | ||||
| 		if (sk_fullsock(__sk))					\ | ||||
| 			__ret = __cgroup_bpf_run_filter(__sk, skb,	\ | ||||
| 						BPF_CGROUP_INET_EGRESS); \ | ||||
| 	}								\ | ||||
| 	__ret;								\ | ||||
| }) | ||||
| 
 | ||||
| #else | ||||
| 
 | ||||
| struct cgroup_bpf {}; | ||||
| static inline void cgroup_bpf_put(struct cgroup *cgrp) {} | ||||
| static inline void cgroup_bpf_inherit(struct cgroup *cgrp, | ||||
| 				      struct cgroup *parent) {} | ||||
| 
 | ||||
| #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) | ||||
| #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) | ||||
| 
 | ||||
| #endif /* CONFIG_CGROUP_BPF */ | ||||
| 
 | ||||
| #endif /* _BPF_CGROUP_H */ | ||||
|  | @ -16,6 +16,7 @@ | |||
| #include <linux/percpu-refcount.h> | ||||
| #include <linux/percpu-rwsem.h> | ||||
| #include <linux/workqueue.h> | ||||
| #include <linux/bpf-cgroup.h> | ||||
| 
 | ||||
| #ifdef CONFIG_CGROUPS | ||||
| 
 | ||||
|  | @ -300,6 +301,9 @@ struct cgroup { | |||
| 	/* used to schedule release agent */ | ||||
| 	struct work_struct release_agent_work; | ||||
| 
 | ||||
| 	/* used to store eBPF programs */ | ||||
| 	struct cgroup_bpf bpf; | ||||
| 
 | ||||
| 	/* ids of the ancestors at each level including self */ | ||||
| 	int ancestor_ids[]; | ||||
| }; | ||||
|  |  | |||
							
								
								
									
										12
									
								
								init/Kconfig
									
									
									
									
									
								
							
							
						
						
									
										12
									
								
								init/Kconfig
									
									
									
									
									
								
							|  | @ -1154,6 +1154,18 @@ config CGROUP_PERF | |||
| 
 | ||||
| 	  Say N if unsure. | ||||
| 
 | ||||
| config CGROUP_BPF | ||||
| 	bool "Support for eBPF programs attached to cgroups" | ||||
| 	depends on BPF_SYSCALL && SOCK_CGROUP_DATA | ||||
| 	help | ||||
| 	  Allow attaching eBPF programs to a cgroup using the bpf(2) | ||||
| 	  syscall command BPF_PROG_ATTACH. | ||||
| 
 | ||||
| 	  In which context these programs are accessed depends on the type | ||||
| 	  of attachment. For instance, programs that are attached using | ||||
| 	  BPF_CGROUP_INET_INGRESS will be executed on the ingress path of | ||||
| 	  inet sockets. | ||||
| 
 | ||||
| config CGROUP_DEBUG | ||||
| 	bool "Example controller" | ||||
| 	default n | ||||
|  |  | |||
|  | @ -5,3 +5,4 @@ obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list | |||
| ifeq ($(CONFIG_PERF_EVENTS),y) | ||||
| obj-$(CONFIG_BPF_SYSCALL) += stackmap.o | ||||
| endif | ||||
| obj-$(CONFIG_CGROUP_BPF) += cgroup.o | ||||
|  |  | |||
							
								
								
									
										167
									
								
								kernel/bpf/cgroup.c
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										167
									
								
								kernel/bpf/cgroup.c
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,167 @@ | |||
| /*
 | ||||
|  * Functions to manage eBPF programs attached to cgroups | ||||
|  * | ||||
|  * Copyright (c) 2016 Daniel Mack | ||||
|  * | ||||
|  * This file is subject to the terms and conditions of version 2 of the GNU | ||||
|  * General Public License.  See the file COPYING in the main directory of the | ||||
|  * Linux distribution for more details. | ||||
|  */ | ||||
| 
 | ||||
| #include <linux/kernel.h> | ||||
| #include <linux/atomic.h> | ||||
| #include <linux/cgroup.h> | ||||
| #include <linux/slab.h> | ||||
| #include <linux/bpf.h> | ||||
| #include <linux/bpf-cgroup.h> | ||||
| #include <net/sock.h> | ||||
| 
 | ||||
| DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key); | ||||
| EXPORT_SYMBOL(cgroup_bpf_enabled_key); | ||||
| 
 | ||||
| /**
 | ||||
|  * cgroup_bpf_put() - put references of all bpf programs | ||||
|  * @cgrp: the cgroup to modify | ||||
|  */ | ||||
| void cgroup_bpf_put(struct cgroup *cgrp) | ||||
| { | ||||
| 	unsigned int type; | ||||
| 
 | ||||
| 	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.prog); type++) { | ||||
| 		struct bpf_prog *prog = cgrp->bpf.prog[type]; | ||||
| 
 | ||||
| 		if (prog) { | ||||
| 			bpf_prog_put(prog); | ||||
| 			static_branch_dec(&cgroup_bpf_enabled_key); | ||||
| 		} | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|  * cgroup_bpf_inherit() - inherit effective programs from parent | ||||
|  * @cgrp: the cgroup to modify | ||||
|  * @parent: the parent to inherit from | ||||
|  */ | ||||
| void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent) | ||||
| { | ||||
| 	unsigned int type; | ||||
| 
 | ||||
| 	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.effective); type++) { | ||||
| 		struct bpf_prog *e; | ||||
| 
 | ||||
| 		e = rcu_dereference_protected(parent->bpf.effective[type], | ||||
| 					      lockdep_is_held(&cgroup_mutex)); | ||||
| 		rcu_assign_pointer(cgrp->bpf.effective[type], e); | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|  * __cgroup_bpf_update() - Update the pinned program of a cgroup, and | ||||
|  *                         propagate the change to descendants | ||||
|  * @cgrp: The cgroup whose descendants to traverse | ||||
|  * @parent: The parent of @cgrp, or %NULL if @cgrp is the root | ||||
|  * @prog: A new program to pin | ||||
|  * @type: Type of pinning operation (ingress/egress) | ||||
|  * | ||||
|  * Each cgroup has a set of two pointers for bpf programs; one for eBPF | ||||
|  * programs it owns, and one for those that are effective for execution. | ||||
|  * | ||||
|  * If @prog is not %NULL, this function attaches a new program to the cgroup and | ||||
|  * releases the one that is currently attached, if any. @prog is then made | ||||
|  * the effective program of type @type in that cgroup. | ||||
|  * | ||||
|  * If @prog is %NULL, the currently attached program of type @type is released, | ||||
|  * and the effective program of the parent cgroup (if any) is inherited to | ||||
|  * @cgrp. | ||||
|  * | ||||
|  * Then, the descendants of @cgrp are walked and the effective program for | ||||
|  * each of them is set to the effective program of @cgrp unless the | ||||
|  * descendant has its own program attached, in which case the subbranch is | ||||
|  * skipped. This ensures that delegated subcgroups with own programs are left | ||||
|  * untouched. | ||||
|  * | ||||
|  * Must be called with cgroup_mutex held. | ||||
|  */ | ||||
| void __cgroup_bpf_update(struct cgroup *cgrp, | ||||
| 			 struct cgroup *parent, | ||||
| 			 struct bpf_prog *prog, | ||||
| 			 enum bpf_attach_type type) | ||||
| { | ||||
| 	struct bpf_prog *old_prog, *effective; | ||||
| 	struct cgroup_subsys_state *pos; | ||||
| 
 | ||||
| 	old_prog = xchg(cgrp->bpf.prog + type, prog); | ||||
| 
 | ||||
| 	effective = (!prog && parent) ? | ||||
| 		rcu_dereference_protected(parent->bpf.effective[type], | ||||
| 					  lockdep_is_held(&cgroup_mutex)) : | ||||
| 		prog; | ||||
| 
 | ||||
| 	css_for_each_descendant_pre(pos, &cgrp->self) { | ||||
| 		struct cgroup *desc = container_of(pos, struct cgroup, self); | ||||
| 
 | ||||
| 		/* skip the subtree if the descendant has its own program */ | ||||
| 		if (desc->bpf.prog[type] && desc != cgrp) | ||||
| 			pos = css_rightmost_descendant(pos); | ||||
| 		else | ||||
| 			rcu_assign_pointer(desc->bpf.effective[type], | ||||
| 					   effective); | ||||
| 	} | ||||
| 
 | ||||
| 	if (prog) | ||||
| 		static_branch_inc(&cgroup_bpf_enabled_key); | ||||
| 
 | ||||
| 	if (old_prog) { | ||||
| 		bpf_prog_put(old_prog); | ||||
| 		static_branch_dec(&cgroup_bpf_enabled_key); | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|  * __cgroup_bpf_run_filter() - Run a program for packet filtering | ||||
|  * @sk: The socket sending or receiving traffic | ||||
|  * @skb: The skb that is being sent or received | ||||
|  * @type: The type of program to be executed | ||||
|  * | ||||
|  * If no socket is passed, or the socket is not of type INET or INET6, | ||||
|  * this function does nothing and returns 0. | ||||
|  * | ||||
|  * The program type passed in via @type must be suitable for network | ||||
|  * filtering. No further check is performed to assert that. | ||||
|  * | ||||
|  * This function will return %-EPERM if an attached program was found | ||||
|  * and if it returned != 1 during execution. In all other cases, 0 is returned. | ||||
|  */ | ||||
| int __cgroup_bpf_run_filter(struct sock *sk, | ||||
| 			    struct sk_buff *skb, | ||||
| 			    enum bpf_attach_type type) | ||||
| { | ||||
| 	struct bpf_prog *prog; | ||||
| 	struct cgroup *cgrp; | ||||
| 	int ret = 0; | ||||
| 
 | ||||
| 	if (!sk || !sk_fullsock(sk)) | ||||
| 		return 0; | ||||
| 
 | ||||
| 	if (sk->sk_family != AF_INET && | ||||
| 	    sk->sk_family != AF_INET6) | ||||
| 		return 0; | ||||
| 
 | ||||
| 	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); | ||||
| 
 | ||||
| 	rcu_read_lock(); | ||||
| 
 | ||||
| 	prog = rcu_dereference(cgrp->bpf.effective[type]); | ||||
| 	if (prog) { | ||||
| 		unsigned int offset = skb->data - skb_network_header(skb); | ||||
| 
 | ||||
| 		__skb_push(skb, offset); | ||||
| 		ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 0 : -EPERM; | ||||
| 		__skb_pull(skb, offset); | ||||
| 	} | ||||
| 
 | ||||
| 	rcu_read_unlock(); | ||||
| 
 | ||||
| 	return ret; | ||||
| } | ||||
| EXPORT_SYMBOL(__cgroup_bpf_run_filter); | ||||
|  | @ -5074,6 +5074,8 @@ static void css_release_work_fn(struct work_struct *work) | |||
| 		if (cgrp->kn) | ||||
| 			RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, | ||||
| 					 NULL); | ||||
| 
 | ||||
| 		cgroup_bpf_put(cgrp); | ||||
| 	} | ||||
| 
 | ||||
| 	mutex_unlock(&cgroup_mutex); | ||||
|  | @ -5281,6 +5283,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent) | |||
| 	if (!cgroup_on_dfl(cgrp)) | ||||
| 		cgrp->subtree_control = cgroup_control(cgrp); | ||||
| 
 | ||||
| 	if (parent) | ||||
| 		cgroup_bpf_inherit(cgrp, parent); | ||||
| 
 | ||||
| 	cgroup_propagate_control(cgrp); | ||||
| 
 | ||||
| 	/* @cgrp doesn't have dir yet so the following will only create csses */ | ||||
|  | @ -6495,6 +6500,19 @@ static __init int cgroup_namespaces_init(void) | |||
| } | ||||
| subsys_initcall(cgroup_namespaces_init); | ||||
| 
 | ||||
| #ifdef CONFIG_CGROUP_BPF | ||||
| void cgroup_bpf_update(struct cgroup *cgrp, | ||||
| 		       struct bpf_prog *prog, | ||||
| 		       enum bpf_attach_type type) | ||||
| { | ||||
| 	struct cgroup *parent = cgroup_parent(cgrp); | ||||
| 
 | ||||
| 	mutex_lock(&cgroup_mutex); | ||||
| 	__cgroup_bpf_update(cgrp, parent, prog, type); | ||||
| 	mutex_unlock(&cgroup_mutex); | ||||
| } | ||||
| #endif /* CONFIG_CGROUP_BPF */ | ||||
| 
 | ||||
| #ifdef CONFIG_CGROUP_DEBUG | ||||
| static struct cgroup_subsys_state * | ||||
| debug_css_alloc(struct cgroup_subsys_state *parent_css) | ||||
|  |  | |||
		Loading…
	
		Reference in a new issue
	
	 Daniel Mack
						Daniel Mack