forked from mirrors/linux
		
	bpf: Remove extra lock_sock for TCP_ZEROCOPY_RECEIVE
Add custom implementation of getsockopt hook for TCP_ZEROCOPY_RECEIVE.
We skip generic hooks for TCP_ZEROCOPY_RECEIVE and have a custom
call in do_tcp_getsockopt using the on-stack data. This removes
3% overhead for locking/unlocking the socket.
Without this patch:
     3.38%     0.07%  tcp_mmap  [kernel.kallsyms]  [k] __cgroup_bpf_run_filter_getsockopt
            |
             --3.30%--__cgroup_bpf_run_filter_getsockopt
                       |
                        --0.81%--__kmalloc
With the patch applied:
     0.52%     0.12%  tcp_mmap  [kernel.kallsyms]  [k] __cgroup_bpf_run_filter_getsockopt_kern
Note, exporting uapi/tcp.h requires removing netinet/tcp.h
from test_progs.h because those headers have conflicting
definitions.
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20210115163501.805133-2-sdf@google.com
			
			
This commit is contained in:
		
							parent
							
								
									13ca51d5eb
								
							
						
					
					
						commit
						9cacf81f81
					
				
					 16 changed files with 506 additions and 7 deletions
				
			
		|  | @ -147,6 +147,10 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, | ||||||
| 				       int __user *optlen, int max_optlen, | 				       int __user *optlen, int max_optlen, | ||||||
| 				       int retval); | 				       int retval); | ||||||
| 
 | 
 | ||||||
|  | int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, | ||||||
|  | 					    int optname, void *optval, | ||||||
|  | 					    int *optlen, int retval); | ||||||
|  | 
 | ||||||
| static inline enum bpf_cgroup_storage_type cgroup_storage_type( | static inline enum bpf_cgroup_storage_type cgroup_storage_type( | ||||||
| 	struct bpf_map *map) | 	struct bpf_map *map) | ||||||
| { | { | ||||||
|  | @ -364,10 +368,23 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, | ||||||
| ({									       \ | ({									       \ | ||||||
| 	int __ret = retval;						       \ | 	int __ret = retval;						       \ | ||||||
| 	if (cgroup_bpf_enabled)						       \ | 	if (cgroup_bpf_enabled)						       \ | ||||||
| 		__ret = __cgroup_bpf_run_filter_getsockopt(sock, level,	       \ | 		if (!(sock)->sk_prot->bpf_bypass_getsockopt ||		       \ | ||||||
| 							   optname, optval,    \ | 		    !INDIRECT_CALL_INET_1((sock)->sk_prot->bpf_bypass_getsockopt, \ | ||||||
| 							   optlen, max_optlen, \ | 					tcp_bpf_bypass_getsockopt,	       \ | ||||||
| 							   retval);	       \ | 					level, optname))		       \ | ||||||
|  | 			__ret = __cgroup_bpf_run_filter_getsockopt(	       \ | ||||||
|  | 				sock, level, optname, optval, optlen,	       \ | ||||||
|  | 				max_optlen, retval);			       \ | ||||||
|  | 	__ret;								       \ | ||||||
|  | }) | ||||||
|  | 
 | ||||||
|  | #define BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sock, level, optname, optval,      \ | ||||||
|  | 					    optlen, retval)		       \ | ||||||
|  | ({									       \ | ||||||
|  | 	int __ret = retval;						       \ | ||||||
|  | 	if (cgroup_bpf_enabled)						       \ | ||||||
|  | 		__ret = __cgroup_bpf_run_filter_getsockopt_kern(	       \ | ||||||
|  | 			sock, level, optname, optval, optlen, retval);	       \ | ||||||
| 	__ret;								       \ | 	__ret;								       \ | ||||||
| }) | }) | ||||||
| 
 | 
 | ||||||
|  | @ -452,6 +469,8 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, | ||||||
| #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; }) | #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; }) | ||||||
| #define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \ | #define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \ | ||||||
| 				       optlen, max_optlen, retval) ({ retval; }) | 				       optlen, max_optlen, retval) ({ retval; }) | ||||||
|  | #define BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sock, level, optname, optval, \ | ||||||
|  | 					    optlen, retval) ({ retval; }) | ||||||
| #define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \ | #define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \ | ||||||
| 				       kernel_optval) ({ 0; }) | 				       kernel_optval) ({ 0; }) | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -60,4 +60,10 @@ | ||||||
| #define INDIRECT_CALL_INET(f, f2, f1, ...) f(__VA_ARGS__) | #define INDIRECT_CALL_INET(f, f2, f1, ...) f(__VA_ARGS__) | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
|  | #if IS_ENABLED(CONFIG_INET) | ||||||
|  | #define INDIRECT_CALL_INET_1(f, f1, ...) INDIRECT_CALL_1(f, f1, __VA_ARGS__) | ||||||
|  | #else | ||||||
|  | #define INDIRECT_CALL_INET_1(f, f1, ...) f(__VA_ARGS__) | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  | @ -1174,6 +1174,8 @@ struct proto { | ||||||
| 
 | 
 | ||||||
| 	int			(*backlog_rcv) (struct sock *sk, | 	int			(*backlog_rcv) (struct sock *sk, | ||||||
| 						struct sk_buff *skb); | 						struct sk_buff *skb); | ||||||
|  | 	bool			(*bpf_bypass_getsockopt)(int level, | ||||||
|  | 							 int optname); | ||||||
| 
 | 
 | ||||||
| 	void		(*release_cb)(struct sock *sk); | 	void		(*release_cb)(struct sock *sk); | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -403,6 +403,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, | ||||||
| 		      struct poll_table_struct *wait); | 		      struct poll_table_struct *wait); | ||||||
| int tcp_getsockopt(struct sock *sk, int level, int optname, | int tcp_getsockopt(struct sock *sk, int level, int optname, | ||||||
| 		   char __user *optval, int __user *optlen); | 		   char __user *optval, int __user *optlen); | ||||||
|  | bool tcp_bpf_bypass_getsockopt(int level, int optname); | ||||||
| int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, | int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, | ||||||
| 		   unsigned int optlen); | 		   unsigned int optlen); | ||||||
| void tcp_set_keepalive(struct sock *sk, int val); | void tcp_set_keepalive(struct sock *sk, int val); | ||||||
|  |  | ||||||
|  | @ -1486,6 +1486,52 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, | ||||||
| 	sockopt_free_buf(&ctx); | 	sockopt_free_buf(&ctx); | ||||||
| 	return ret; | 	return ret; | ||||||
| } | } | ||||||
|  | 
 | ||||||
|  | int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, | ||||||
|  | 					    int optname, void *optval, | ||||||
|  | 					    int *optlen, int retval) | ||||||
|  | { | ||||||
|  | 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); | ||||||
|  | 	struct bpf_sockopt_kern ctx = { | ||||||
|  | 		.sk = sk, | ||||||
|  | 		.level = level, | ||||||
|  | 		.optname = optname, | ||||||
|  | 		.retval = retval, | ||||||
|  | 		.optlen = *optlen, | ||||||
|  | 		.optval = optval, | ||||||
|  | 		.optval_end = optval + *optlen, | ||||||
|  | 	}; | ||||||
|  | 	int ret; | ||||||
|  | 
 | ||||||
|  | 	/* Note that __cgroup_bpf_run_filter_getsockopt doesn't copy
 | ||||||
|  | 	 * user data back into BPF buffer when retval != 0. This is | ||||||
|  | 	 * done as an optimization to avoid extra copy, assuming | ||||||
|  | 	 * kernel won't populate the data in case of an error. | ||||||
|  | 	 * Here we always pass the data and memset() should | ||||||
|  | 	 * be called if that data shouldn't be "exported". | ||||||
|  | 	 */ | ||||||
|  | 
 | ||||||
|  | 	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], | ||||||
|  | 				 &ctx, BPF_PROG_RUN); | ||||||
|  | 	if (!ret) | ||||||
|  | 		return -EPERM; | ||||||
|  | 
 | ||||||
|  | 	if (ctx.optlen > *optlen) | ||||||
|  | 		return -EFAULT; | ||||||
|  | 
 | ||||||
|  | 	/* BPF programs only allowed to set retval to 0, not some
 | ||||||
|  | 	 * arbitrary value. | ||||||
|  | 	 */ | ||||||
|  | 	if (ctx.retval != 0 && ctx.retval != retval) | ||||||
|  | 		return -EFAULT; | ||||||
|  | 
 | ||||||
|  | 	/* BPF programs can shrink the buffer, export the modifications.
 | ||||||
|  | 	 */ | ||||||
|  | 	if (ctx.optlen != 0) | ||||||
|  | 		*optlen = ctx.optlen; | ||||||
|  | 
 | ||||||
|  | 	return ctx.retval; | ||||||
|  | } | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
| static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp, | static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp, | ||||||
|  |  | ||||||
|  | @ -4099,6 +4099,8 @@ static int do_tcp_getsockopt(struct sock *sk, int level, | ||||||
| 			return -EFAULT; | 			return -EFAULT; | ||||||
| 		lock_sock(sk); | 		lock_sock(sk); | ||||||
| 		err = tcp_zerocopy_receive(sk, &zc); | 		err = tcp_zerocopy_receive(sk, &zc); | ||||||
|  | 		err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname, | ||||||
|  | 							  &zc, &len, err); | ||||||
| 		release_sock(sk); | 		release_sock(sk); | ||||||
| 		if (len >= offsetofend(struct tcp_zerocopy_receive, err)) | 		if (len >= offsetofend(struct tcp_zerocopy_receive, err)) | ||||||
| 			goto zerocopy_rcv_sk_err; | 			goto zerocopy_rcv_sk_err; | ||||||
|  | @ -4133,6 +4135,18 @@ static int do_tcp_getsockopt(struct sock *sk, int level, | ||||||
| 	return 0; | 	return 0; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | bool tcp_bpf_bypass_getsockopt(int level, int optname) | ||||||
|  | { | ||||||
|  | 	/* TCP do_tcp_getsockopt has optimized getsockopt implementation
 | ||||||
|  | 	 * to avoid extra socket lock for TCP_ZEROCOPY_RECEIVE. | ||||||
|  | 	 */ | ||||||
|  | 	if (level == SOL_TCP && optname == TCP_ZEROCOPY_RECEIVE) | ||||||
|  | 		return true; | ||||||
|  | 
 | ||||||
|  | 	return false; | ||||||
|  | } | ||||||
|  | EXPORT_SYMBOL(tcp_bpf_bypass_getsockopt); | ||||||
|  | 
 | ||||||
| int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, | int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, | ||||||
| 		   int __user *optlen) | 		   int __user *optlen) | ||||||
| { | { | ||||||
|  |  | ||||||
|  | @ -2793,6 +2793,7 @@ struct proto tcp_prot = { | ||||||
| 	.shutdown		= tcp_shutdown, | 	.shutdown		= tcp_shutdown, | ||||||
| 	.setsockopt		= tcp_setsockopt, | 	.setsockopt		= tcp_setsockopt, | ||||||
| 	.getsockopt		= tcp_getsockopt, | 	.getsockopt		= tcp_getsockopt, | ||||||
|  | 	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt, | ||||||
| 	.keepalive		= tcp_set_keepalive, | 	.keepalive		= tcp_set_keepalive, | ||||||
| 	.recvmsg		= tcp_recvmsg, | 	.recvmsg		= tcp_recvmsg, | ||||||
| 	.sendmsg		= tcp_sendmsg, | 	.sendmsg		= tcp_sendmsg, | ||||||
|  |  | ||||||
|  | @ -2121,6 +2121,7 @@ struct proto tcpv6_prot = { | ||||||
| 	.shutdown		= tcp_shutdown, | 	.shutdown		= tcp_shutdown, | ||||||
| 	.setsockopt		= tcp_setsockopt, | 	.setsockopt		= tcp_setsockopt, | ||||||
| 	.getsockopt		= tcp_getsockopt, | 	.getsockopt		= tcp_getsockopt, | ||||||
|  | 	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt, | ||||||
| 	.keepalive		= tcp_set_keepalive, | 	.keepalive		= tcp_set_keepalive, | ||||||
| 	.recvmsg		= tcp_recvmsg, | 	.recvmsg		= tcp_recvmsg, | ||||||
| 	.sendmsg		= tcp_sendmsg, | 	.sendmsg		= tcp_sendmsg, | ||||||
|  |  | ||||||
|  | @ -2126,6 +2126,9 @@ SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname, | ||||||
| 	return __sys_setsockopt(fd, level, optname, optval, optlen); | 	return __sys_setsockopt(fd, level, optname, optval, optlen); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | INDIRECT_CALLABLE_DECLARE(bool tcp_bpf_bypass_getsockopt(int level, | ||||||
|  | 							 int optname)); | ||||||
|  | 
 | ||||||
| /*
 | /*
 | ||||||
|  *	Get a socket option. Because we don't know the option lengths we have |  *	Get a socket option. Because we don't know the option lengths we have | ||||||
|  *	to pass a user mode parameter for the protocols to sort out. |  *	to pass a user mode parameter for the protocols to sort out. | ||||||
|  |  | ||||||
							
								
								
									
										357
									
								
								tools/include/uapi/linux/tcp.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										357
									
								
								tools/include/uapi/linux/tcp.h
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,357 @@ | ||||||
|  | /* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ | ||||||
|  | /*
 | ||||||
|  |  * INET		An implementation of the TCP/IP protocol suite for the LINUX | ||||||
|  |  *		operating system.  INET is implemented using the  BSD Socket | ||||||
|  |  *		interface as the means of communication with the user level. | ||||||
|  |  * | ||||||
|  |  *		Definitions for the TCP protocol. | ||||||
|  |  * | ||||||
|  |  * Version:	@(#)tcp.h	1.0.2	04/28/93 | ||||||
|  |  * | ||||||
|  |  * Author:	Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> | ||||||
|  |  * | ||||||
|  |  *		This program is free software; you can redistribute it and/or | ||||||
|  |  *		modify it under the terms of the GNU General Public License | ||||||
|  |  *		as published by the Free Software Foundation; either version | ||||||
|  |  *		2 of the License, or (at your option) any later version. | ||||||
|  |  */ | ||||||
|  | #ifndef _UAPI_LINUX_TCP_H | ||||||
|  | #define _UAPI_LINUX_TCP_H | ||||||
|  | 
 | ||||||
|  | #include <linux/types.h> | ||||||
|  | #include <asm/byteorder.h> | ||||||
|  | #include <linux/socket.h> | ||||||
|  | 
 | ||||||
|  | struct tcphdr { | ||||||
|  | 	__be16	source; | ||||||
|  | 	__be16	dest; | ||||||
|  | 	__be32	seq; | ||||||
|  | 	__be32	ack_seq; | ||||||
|  | #if defined(__LITTLE_ENDIAN_BITFIELD) | ||||||
|  | 	__u16	res1:4, | ||||||
|  | 		doff:4, | ||||||
|  | 		fin:1, | ||||||
|  | 		syn:1, | ||||||
|  | 		rst:1, | ||||||
|  | 		psh:1, | ||||||
|  | 		ack:1, | ||||||
|  | 		urg:1, | ||||||
|  | 		ece:1, | ||||||
|  | 		cwr:1; | ||||||
|  | #elif defined(__BIG_ENDIAN_BITFIELD) | ||||||
|  | 	__u16	doff:4, | ||||||
|  | 		res1:4, | ||||||
|  | 		cwr:1, | ||||||
|  | 		ece:1, | ||||||
|  | 		urg:1, | ||||||
|  | 		ack:1, | ||||||
|  | 		psh:1, | ||||||
|  | 		rst:1, | ||||||
|  | 		syn:1, | ||||||
|  | 		fin:1; | ||||||
|  | #else | ||||||
|  | #error	"Adjust your <asm/byteorder.h> defines" | ||||||
|  | #endif	 | ||||||
|  | 	__be16	window; | ||||||
|  | 	__sum16	check; | ||||||
|  | 	__be16	urg_ptr; | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | /*
 | ||||||
|  |  *	The union cast uses a gcc extension to avoid aliasing problems | ||||||
|  |  *  (union is compatible to any of its members) | ||||||
|  |  *  This means this part of the code is -fstrict-aliasing safe now. | ||||||
|  |  */ | ||||||
|  | union tcp_word_hdr {  | ||||||
|  | 	struct tcphdr hdr; | ||||||
|  | 	__be32 		  words[5]; | ||||||
|  | };  | ||||||
|  | 
 | ||||||
|  | #define tcp_flag_word(tp) ( ((union tcp_word_hdr *)(tp))->words [3])  | ||||||
|  | 
 | ||||||
|  | enum {  | ||||||
|  | 	TCP_FLAG_CWR = __constant_cpu_to_be32(0x00800000), | ||||||
|  | 	TCP_FLAG_ECE = __constant_cpu_to_be32(0x00400000), | ||||||
|  | 	TCP_FLAG_URG = __constant_cpu_to_be32(0x00200000), | ||||||
|  | 	TCP_FLAG_ACK = __constant_cpu_to_be32(0x00100000), | ||||||
|  | 	TCP_FLAG_PSH = __constant_cpu_to_be32(0x00080000), | ||||||
|  | 	TCP_FLAG_RST = __constant_cpu_to_be32(0x00040000), | ||||||
|  | 	TCP_FLAG_SYN = __constant_cpu_to_be32(0x00020000), | ||||||
|  | 	TCP_FLAG_FIN = __constant_cpu_to_be32(0x00010000), | ||||||
|  | 	TCP_RESERVED_BITS = __constant_cpu_to_be32(0x0F000000), | ||||||
|  | 	TCP_DATA_OFFSET = __constant_cpu_to_be32(0xF0000000) | ||||||
|  | };  | ||||||
|  | 
 | ||||||
|  | /*
 | ||||||
|  |  * TCP general constants | ||||||
|  |  */ | ||||||
|  | #define TCP_MSS_DEFAULT		 536U	/* IPv4 (RFC1122, RFC2581) */ | ||||||
|  | #define TCP_MSS_DESIRED		1220U	/* IPv6 (tunneled), EDNS0 (RFC3226) */ | ||||||
|  | 
 | ||||||
|  | /* TCP socket options */ | ||||||
|  | #define TCP_NODELAY		1	/* Turn off Nagle's algorithm. */ | ||||||
|  | #define TCP_MAXSEG		2	/* Limit MSS */ | ||||||
|  | #define TCP_CORK		3	/* Never send partially complete segments */ | ||||||
|  | #define TCP_KEEPIDLE		4	/* Start keepalives after this period */ | ||||||
|  | #define TCP_KEEPINTVL		5	/* Interval between keepalives */ | ||||||
|  | #define TCP_KEEPCNT		6	/* Number of keepalives before death */ | ||||||
|  | #define TCP_SYNCNT		7	/* Number of SYN retransmits */ | ||||||
|  | #define TCP_LINGER2		8	/* Life time of orphaned FIN-WAIT-2 state */ | ||||||
|  | #define TCP_DEFER_ACCEPT	9	/* Wake up listener only when data arrive */ | ||||||
|  | #define TCP_WINDOW_CLAMP	10	/* Bound advertised window */ | ||||||
|  | #define TCP_INFO		11	/* Information about this connection. */ | ||||||
|  | #define TCP_QUICKACK		12	/* Block/reenable quick acks */ | ||||||
|  | #define TCP_CONGESTION		13	/* Congestion control algorithm */ | ||||||
|  | #define TCP_MD5SIG		14	/* TCP MD5 Signature (RFC2385) */ | ||||||
|  | #define TCP_THIN_LINEAR_TIMEOUTS 16      /* Use linear timeouts for thin streams*/ | ||||||
|  | #define TCP_THIN_DUPACK         17      /* Fast retrans. after 1 dupack */ | ||||||
|  | #define TCP_USER_TIMEOUT	18	/* How long for loss retry before timeout */ | ||||||
|  | #define TCP_REPAIR		19	/* TCP sock is under repair right now */ | ||||||
|  | #define TCP_REPAIR_QUEUE	20 | ||||||
|  | #define TCP_QUEUE_SEQ		21 | ||||||
|  | #define TCP_REPAIR_OPTIONS	22 | ||||||
|  | #define TCP_FASTOPEN		23	/* Enable FastOpen on listeners */ | ||||||
|  | #define TCP_TIMESTAMP		24 | ||||||
|  | #define TCP_NOTSENT_LOWAT	25	/* limit number of unsent bytes in write queue */ | ||||||
|  | #define TCP_CC_INFO		26	/* Get Congestion Control (optional) info */ | ||||||
|  | #define TCP_SAVE_SYN		27	/* Record SYN headers for new connections */ | ||||||
|  | #define TCP_SAVED_SYN		28	/* Get SYN headers recorded for connection */ | ||||||
|  | #define TCP_REPAIR_WINDOW	29	/* Get/set window parameters */ | ||||||
|  | #define TCP_FASTOPEN_CONNECT	30	/* Attempt FastOpen with connect */ | ||||||
|  | #define TCP_ULP			31	/* Attach a ULP to a TCP connection */ | ||||||
|  | #define TCP_MD5SIG_EXT		32	/* TCP MD5 Signature with extensions */ | ||||||
|  | #define TCP_FASTOPEN_KEY	33	/* Set the key for Fast Open (cookie) */ | ||||||
|  | #define TCP_FASTOPEN_NO_COOKIE	34	/* Enable TFO without a TFO cookie */ | ||||||
|  | #define TCP_ZEROCOPY_RECEIVE	35 | ||||||
|  | #define TCP_INQ			36	/* Notify bytes available to read as a cmsg on read */ | ||||||
|  | 
 | ||||||
|  | #define TCP_CM_INQ		TCP_INQ | ||||||
|  | 
 | ||||||
|  | #define TCP_TX_DELAY		37	/* delay outgoing packets by XX usec */ | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | #define TCP_REPAIR_ON		1 | ||||||
|  | #define TCP_REPAIR_OFF		0 | ||||||
|  | #define TCP_REPAIR_OFF_NO_WP	-1	/* Turn off without window probes */ | ||||||
|  | 
 | ||||||
|  | struct tcp_repair_opt { | ||||||
|  | 	__u32	opt_code; | ||||||
|  | 	__u32	opt_val; | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | struct tcp_repair_window { | ||||||
|  | 	__u32	snd_wl1; | ||||||
|  | 	__u32	snd_wnd; | ||||||
|  | 	__u32	max_window; | ||||||
|  | 
 | ||||||
|  | 	__u32	rcv_wnd; | ||||||
|  | 	__u32	rcv_wup; | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | enum { | ||||||
|  | 	TCP_NO_QUEUE, | ||||||
|  | 	TCP_RECV_QUEUE, | ||||||
|  | 	TCP_SEND_QUEUE, | ||||||
|  | 	TCP_QUEUES_NR, | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | /* why fastopen failed from client perspective */ | ||||||
|  | enum tcp_fastopen_client_fail { | ||||||
|  | 	TFO_STATUS_UNSPEC, /* catch-all */ | ||||||
|  | 	TFO_COOKIE_UNAVAILABLE, /* if not in TFO_CLIENT_NO_COOKIE mode */ | ||||||
|  | 	TFO_DATA_NOT_ACKED, /* SYN-ACK did not ack SYN data */ | ||||||
|  | 	TFO_SYN_RETRANSMITTED, /* SYN-ACK did not ack SYN data after timeout */ | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | /* for TCP_INFO socket option */ | ||||||
|  | #define TCPI_OPT_TIMESTAMPS	1 | ||||||
|  | #define TCPI_OPT_SACK		2 | ||||||
|  | #define TCPI_OPT_WSCALE		4 | ||||||
|  | #define TCPI_OPT_ECN		8 /* ECN was negotiated at TCP session init */ | ||||||
|  | #define TCPI_OPT_ECN_SEEN	16 /* we received at least one packet with ECT */ | ||||||
|  | #define TCPI_OPT_SYN_DATA	32 /* SYN-ACK acked data in SYN sent or rcvd */ | ||||||
|  | 
 | ||||||
|  | /*
 | ||||||
|  |  * Sender's congestion state indicating normal or abnormal situations | ||||||
|  |  * in the last round of packets sent. The state is driven by the ACK | ||||||
|  |  * information and timer events. | ||||||
|  |  */ | ||||||
|  | enum tcp_ca_state { | ||||||
|  | 	/*
 | ||||||
|  | 	 * Nothing bad has been observed recently. | ||||||
|  | 	 * No apparent reordering, packet loss, or ECN marks. | ||||||
|  | 	 */ | ||||||
|  | 	TCP_CA_Open = 0, | ||||||
|  | #define TCPF_CA_Open	(1<<TCP_CA_Open) | ||||||
|  | 	/*
 | ||||||
|  | 	 * The sender enters disordered state when it has received DUPACKs or | ||||||
|  | 	 * SACKs in the last round of packets sent. This could be due to packet | ||||||
|  | 	 * loss or reordering but needs further information to confirm packets | ||||||
|  | 	 * have been lost. | ||||||
|  | 	 */ | ||||||
|  | 	TCP_CA_Disorder = 1, | ||||||
|  | #define TCPF_CA_Disorder (1<<TCP_CA_Disorder) | ||||||
|  | 	/*
 | ||||||
|  | 	 * The sender enters Congestion Window Reduction (CWR) state when it | ||||||
|  | 	 * has received ACKs with ECN-ECE marks, or has experienced congestion | ||||||
|  | 	 * or packet discard on the sender host (e.g. qdisc). | ||||||
|  | 	 */ | ||||||
|  | 	TCP_CA_CWR = 2, | ||||||
|  | #define TCPF_CA_CWR	(1<<TCP_CA_CWR) | ||||||
|  | 	/*
 | ||||||
|  | 	 * The sender is in fast recovery and retransmitting lost packets, | ||||||
|  | 	 * typically triggered by ACK events. | ||||||
|  | 	 */ | ||||||
|  | 	TCP_CA_Recovery = 3, | ||||||
|  | #define TCPF_CA_Recovery (1<<TCP_CA_Recovery) | ||||||
|  | 	/*
 | ||||||
|  | 	 * The sender is in loss recovery triggered by retransmission timeout. | ||||||
|  | 	 */ | ||||||
|  | 	TCP_CA_Loss = 4 | ||||||
|  | #define TCPF_CA_Loss	(1<<TCP_CA_Loss) | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | struct tcp_info { | ||||||
|  | 	__u8	tcpi_state; | ||||||
|  | 	__u8	tcpi_ca_state; | ||||||
|  | 	__u8	tcpi_retransmits; | ||||||
|  | 	__u8	tcpi_probes; | ||||||
|  | 	__u8	tcpi_backoff; | ||||||
|  | 	__u8	tcpi_options; | ||||||
|  | 	__u8	tcpi_snd_wscale : 4, tcpi_rcv_wscale : 4; | ||||||
|  | 	__u8	tcpi_delivery_rate_app_limited:1, tcpi_fastopen_client_fail:2; | ||||||
|  | 
 | ||||||
|  | 	__u32	tcpi_rto; | ||||||
|  | 	__u32	tcpi_ato; | ||||||
|  | 	__u32	tcpi_snd_mss; | ||||||
|  | 	__u32	tcpi_rcv_mss; | ||||||
|  | 
 | ||||||
|  | 	__u32	tcpi_unacked; | ||||||
|  | 	__u32	tcpi_sacked; | ||||||
|  | 	__u32	tcpi_lost; | ||||||
|  | 	__u32	tcpi_retrans; | ||||||
|  | 	__u32	tcpi_fackets; | ||||||
|  | 
 | ||||||
|  | 	/* Times. */ | ||||||
|  | 	__u32	tcpi_last_data_sent; | ||||||
|  | 	__u32	tcpi_last_ack_sent;     /* Not remembered, sorry. */ | ||||||
|  | 	__u32	tcpi_last_data_recv; | ||||||
|  | 	__u32	tcpi_last_ack_recv; | ||||||
|  | 
 | ||||||
|  | 	/* Metrics. */ | ||||||
|  | 	__u32	tcpi_pmtu; | ||||||
|  | 	__u32	tcpi_rcv_ssthresh; | ||||||
|  | 	__u32	tcpi_rtt; | ||||||
|  | 	__u32	tcpi_rttvar; | ||||||
|  | 	__u32	tcpi_snd_ssthresh; | ||||||
|  | 	__u32	tcpi_snd_cwnd; | ||||||
|  | 	__u32	tcpi_advmss; | ||||||
|  | 	__u32	tcpi_reordering; | ||||||
|  | 
 | ||||||
|  | 	__u32	tcpi_rcv_rtt; | ||||||
|  | 	__u32	tcpi_rcv_space; | ||||||
|  | 
 | ||||||
|  | 	__u32	tcpi_total_retrans; | ||||||
|  | 
 | ||||||
|  | 	__u64	tcpi_pacing_rate; | ||||||
|  | 	__u64	tcpi_max_pacing_rate; | ||||||
|  | 	__u64	tcpi_bytes_acked;    /* RFC4898 tcpEStatsAppHCThruOctetsAcked */ | ||||||
|  | 	__u64	tcpi_bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived */ | ||||||
|  | 	__u32	tcpi_segs_out;	     /* RFC4898 tcpEStatsPerfSegsOut */ | ||||||
|  | 	__u32	tcpi_segs_in;	     /* RFC4898 tcpEStatsPerfSegsIn */ | ||||||
|  | 
 | ||||||
|  | 	__u32	tcpi_notsent_bytes; | ||||||
|  | 	__u32	tcpi_min_rtt; | ||||||
|  | 	__u32	tcpi_data_segs_in;	/* RFC4898 tcpEStatsDataSegsIn */ | ||||||
|  | 	__u32	tcpi_data_segs_out;	/* RFC4898 tcpEStatsDataSegsOut */ | ||||||
|  | 
 | ||||||
|  | 	__u64   tcpi_delivery_rate; | ||||||
|  | 
 | ||||||
|  | 	__u64	tcpi_busy_time;      /* Time (usec) busy sending data */ | ||||||
|  | 	__u64	tcpi_rwnd_limited;   /* Time (usec) limited by receive window */ | ||||||
|  | 	__u64	tcpi_sndbuf_limited; /* Time (usec) limited by send buffer */ | ||||||
|  | 
 | ||||||
|  | 	__u32	tcpi_delivered; | ||||||
|  | 	__u32	tcpi_delivered_ce; | ||||||
|  | 
 | ||||||
|  | 	__u64	tcpi_bytes_sent;     /* RFC4898 tcpEStatsPerfHCDataOctetsOut */ | ||||||
|  | 	__u64	tcpi_bytes_retrans;  /* RFC4898 tcpEStatsPerfOctetsRetrans */ | ||||||
|  | 	__u32	tcpi_dsack_dups;     /* RFC4898 tcpEStatsStackDSACKDups */ | ||||||
|  | 	__u32	tcpi_reord_seen;     /* reordering events seen */ | ||||||
|  | 
 | ||||||
|  | 	__u32	tcpi_rcv_ooopack;    /* Out-of-order packets received */ | ||||||
|  | 
 | ||||||
|  | 	__u32	tcpi_snd_wnd;	     /* peer's advertised receive window after
 | ||||||
|  | 				      * scaling (bytes) | ||||||
|  | 				      */ | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */ | ||||||
|  | enum { | ||||||
|  | 	TCP_NLA_PAD, | ||||||
|  | 	TCP_NLA_BUSY,		/* Time (usec) busy sending data */ | ||||||
|  | 	TCP_NLA_RWND_LIMITED,	/* Time (usec) limited by receive window */ | ||||||
|  | 	TCP_NLA_SNDBUF_LIMITED,	/* Time (usec) limited by send buffer */ | ||||||
|  | 	TCP_NLA_DATA_SEGS_OUT,	/* Data pkts sent including retransmission */ | ||||||
|  | 	TCP_NLA_TOTAL_RETRANS,	/* Data pkts retransmitted */ | ||||||
|  | 	TCP_NLA_PACING_RATE,    /* Pacing rate in bytes per second */ | ||||||
|  | 	TCP_NLA_DELIVERY_RATE,  /* Delivery rate in bytes per second */ | ||||||
|  | 	TCP_NLA_SND_CWND,       /* Sending congestion window */ | ||||||
|  | 	TCP_NLA_REORDERING,     /* Reordering metric */ | ||||||
|  | 	TCP_NLA_MIN_RTT,        /* minimum RTT */ | ||||||
|  | 	TCP_NLA_RECUR_RETRANS,  /* Recurring retransmits for the current pkt */ | ||||||
|  | 	TCP_NLA_DELIVERY_RATE_APP_LMT, /* delivery rate application limited ? */ | ||||||
|  | 	TCP_NLA_SNDQ_SIZE,	/* Data (bytes) pending in send queue */ | ||||||
|  | 	TCP_NLA_CA_STATE,	/* ca_state of socket */ | ||||||
|  | 	TCP_NLA_SND_SSTHRESH,	/* Slow start size threshold */ | ||||||
|  | 	TCP_NLA_DELIVERED,	/* Data pkts delivered incl. out-of-order */ | ||||||
|  | 	TCP_NLA_DELIVERED_CE,	/* Like above but only ones w/ CE marks */ | ||||||
|  | 	TCP_NLA_BYTES_SENT,	/* Data bytes sent including retransmission */ | ||||||
|  | 	TCP_NLA_BYTES_RETRANS,	/* Data bytes retransmitted */ | ||||||
|  | 	TCP_NLA_DSACK_DUPS,	/* DSACK blocks received */ | ||||||
|  | 	TCP_NLA_REORD_SEEN,	/* reordering events seen */ | ||||||
|  | 	TCP_NLA_SRTT,		/* smoothed RTT in usecs */ | ||||||
|  | 	TCP_NLA_TIMEOUT_REHASH, /* Timeout-triggered rehash attempts */ | ||||||
|  | 	TCP_NLA_BYTES_NOTSENT,	/* Bytes in write queue not yet sent */ | ||||||
|  | 	TCP_NLA_EDT,		/* Earliest departure time (CLOCK_MONOTONIC) */ | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | /* for TCP_MD5SIG socket option */ | ||||||
|  | #define TCP_MD5SIG_MAXKEYLEN	80 | ||||||
|  | 
 | ||||||
|  | /* tcp_md5sig extension flags for TCP_MD5SIG_EXT */ | ||||||
|  | #define TCP_MD5SIG_FLAG_PREFIX		0x1	/* address prefix length */ | ||||||
|  | #define TCP_MD5SIG_FLAG_IFINDEX		0x2	/* ifindex set */ | ||||||
|  | 
 | ||||||
|  | struct tcp_md5sig { | ||||||
|  | 	struct __kernel_sockaddr_storage tcpm_addr;	/* address associated */ | ||||||
|  | 	__u8	tcpm_flags;				/* extension flags */ | ||||||
|  | 	__u8	tcpm_prefixlen;				/* address prefix */ | ||||||
|  | 	__u16	tcpm_keylen;				/* key length */ | ||||||
|  | 	int	tcpm_ifindex;				/* device index for scope */ | ||||||
|  | 	__u8	tcpm_key[TCP_MD5SIG_MAXKEYLEN];		/* key (binary) */ | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | /* INET_DIAG_MD5SIG */ | ||||||
|  | struct tcp_diag_md5sig { | ||||||
|  | 	__u8	tcpm_family; | ||||||
|  | 	__u8	tcpm_prefixlen; | ||||||
|  | 	__u16	tcpm_keylen; | ||||||
|  | 	__be32	tcpm_addr[4]; | ||||||
|  | 	__u8	tcpm_key[TCP_MD5SIG_MAXKEYLEN]; | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | /* getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...) */ | ||||||
|  | 
 | ||||||
|  | #define TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT 0x1 | ||||||
|  | struct tcp_zerocopy_receive { | ||||||
|  | 	__u64 address;		/* in: address of mapping */ | ||||||
|  | 	__u32 length;		/* in/out: number of bytes to map/mapped */ | ||||||
|  | 	__u32 recv_skip_hint;	/* out: amount of bytes to skip */ | ||||||
|  | 	__u32 inq; /* out: amount of bytes in read queue */ | ||||||
|  | 	__s32 err; /* out: socket error */ | ||||||
|  | 	__u64 copybuf_address;	/* in: copybuf address (small reads) */ | ||||||
|  | 	__s32 copybuf_len; /* in/out: copybuf bytes avail/used or error */ | ||||||
|  | 	__u32 flags; /* in: flags */ | ||||||
|  | }; | ||||||
|  | #endif /* _UAPI_LINUX_TCP_H */ | ||||||
|  | @ -2,6 +2,7 @@ | ||||||
| /* Copyright (c) 2019 Facebook */ | /* Copyright (c) 2019 Facebook */ | ||||||
| 
 | 
 | ||||||
| #include <linux/err.h> | #include <linux/err.h> | ||||||
|  | #include <netinet/tcp.h> | ||||||
| #include <test_progs.h> | #include <test_progs.h> | ||||||
| #include "bpf_dctcp.skel.h" | #include "bpf_dctcp.skel.h" | ||||||
| #include "bpf_cubic.skel.h" | #include "bpf_cubic.skel.h" | ||||||
|  |  | ||||||
|  | @ -7,6 +7,7 @@ | ||||||
| #include <string.h> | #include <string.h> | ||||||
| 
 | 
 | ||||||
| #include <linux/pkt_cls.h> | #include <linux/pkt_cls.h> | ||||||
|  | #include <netinet/tcp.h> | ||||||
| 
 | 
 | ||||||
| #include <test_progs.h> | #include <test_progs.h> | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -1,6 +1,7 @@ | ||||||
| // SPDX-License-Identifier: GPL-2.0
 | // SPDX-License-Identifier: GPL-2.0
 | ||||||
| // Copyright (c) 2020 Cloudflare
 | // Copyright (c) 2020 Cloudflare
 | ||||||
| #include <error.h> | #include <error.h> | ||||||
|  | #include <netinet/tcp.h> | ||||||
| 
 | 
 | ||||||
| #include "test_progs.h" | #include "test_progs.h" | ||||||
| #include "test_skmsg_load_helpers.skel.h" | #include "test_skmsg_load_helpers.skel.h" | ||||||
|  |  | ||||||
|  | @ -2,6 +2,12 @@ | ||||||
| #include <test_progs.h> | #include <test_progs.h> | ||||||
| #include "cgroup_helpers.h" | #include "cgroup_helpers.h" | ||||||
| 
 | 
 | ||||||
|  | #include <linux/tcp.h> | ||||||
|  | 
 | ||||||
|  | #ifndef SOL_TCP | ||||||
|  | #define SOL_TCP IPPROTO_TCP | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
| #define SOL_CUSTOM			0xdeadbeef | #define SOL_CUSTOM			0xdeadbeef | ||||||
| 
 | 
 | ||||||
| static int getsetsockopt(void) | static int getsetsockopt(void) | ||||||
|  | @ -11,6 +17,7 @@ static int getsetsockopt(void) | ||||||
| 		char u8[4]; | 		char u8[4]; | ||||||
| 		__u32 u32; | 		__u32 u32; | ||||||
| 		char cc[16]; /* TCP_CA_NAME_MAX */ | 		char cc[16]; /* TCP_CA_NAME_MAX */ | ||||||
|  | 		struct tcp_zerocopy_receive zc; | ||||||
| 	} buf = {}; | 	} buf = {}; | ||||||
| 	socklen_t optlen; | 	socklen_t optlen; | ||||||
| 	char *big_buf = NULL; | 	char *big_buf = NULL; | ||||||
|  | @ -154,6 +161,27 @@ static int getsetsockopt(void) | ||||||
| 		goto err; | 		goto err; | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
|  | 	/* TCP_ZEROCOPY_RECEIVE triggers */ | ||||||
|  | 	memset(&buf, 0, sizeof(buf)); | ||||||
|  | 	optlen = sizeof(buf.zc); | ||||||
|  | 	err = getsockopt(fd, SOL_TCP, TCP_ZEROCOPY_RECEIVE, &buf, &optlen); | ||||||
|  | 	if (err) { | ||||||
|  | 		log_err("Unexpected getsockopt(TCP_ZEROCOPY_RECEIVE) err=%d errno=%d", | ||||||
|  | 			err, errno); | ||||||
|  | 		goto err; | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	memset(&buf, 0, sizeof(buf)); | ||||||
|  | 	buf.zc.address = 12345; /* rejected by BPF */ | ||||||
|  | 	optlen = sizeof(buf.zc); | ||||||
|  | 	errno = 0; | ||||||
|  | 	err = getsockopt(fd, SOL_TCP, TCP_ZEROCOPY_RECEIVE, &buf, &optlen); | ||||||
|  | 	if (errno != EPERM) { | ||||||
|  | 		log_err("Unexpected getsockopt(TCP_ZEROCOPY_RECEIVE) err=%d errno=%d", | ||||||
|  | 			err, errno); | ||||||
|  | 		goto err; | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
| 	free(big_buf); | 	free(big_buf); | ||||||
| 	close(fd); | 	close(fd); | ||||||
| 	return 0; | 	return 0; | ||||||
|  |  | ||||||
|  | @ -1,8 +1,8 @@ | ||||||
| // SPDX-License-Identifier: GPL-2.0
 | // SPDX-License-Identifier: GPL-2.0
 | ||||||
| #include <string.h> | #include <string.h> | ||||||
| #include <netinet/in.h> | #include <linux/tcp.h> | ||||||
| #include <netinet/tcp.h> |  | ||||||
| #include <linux/bpf.h> | #include <linux/bpf.h> | ||||||
|  | #include <netinet/in.h> | ||||||
| #include <bpf/bpf_helpers.h> | #include <bpf/bpf_helpers.h> | ||||||
| 
 | 
 | ||||||
| char _license[] SEC("license") = "GPL"; | char _license[] SEC("license") = "GPL"; | ||||||
|  | @ -12,6 +12,10 @@ __u32 _version SEC("version") = 1; | ||||||
| #define PAGE_SIZE 4096 | #define PAGE_SIZE 4096 | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
|  | #ifndef SOL_TCP | ||||||
|  | #define SOL_TCP IPPROTO_TCP | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
| #define SOL_CUSTOM			0xdeadbeef | #define SOL_CUSTOM			0xdeadbeef | ||||||
| 
 | 
 | ||||||
| struct sockopt_sk { | struct sockopt_sk { | ||||||
|  | @ -57,6 +61,21 @@ int _getsockopt(struct bpf_sockopt *ctx) | ||||||
| 		return 1; | 		return 1; | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
|  | 	if (ctx->level == SOL_TCP && ctx->optname == TCP_ZEROCOPY_RECEIVE) { | ||||||
|  | 		/* Verify that TCP_ZEROCOPY_RECEIVE triggers.
 | ||||||
|  | 		 * It has a custom implementation for performance | ||||||
|  | 		 * reasons. | ||||||
|  | 		 */ | ||||||
|  | 
 | ||||||
|  | 		if (optval + sizeof(struct tcp_zerocopy_receive) > optval_end) | ||||||
|  | 			return 0; /* EPERM, bounds check */ | ||||||
|  | 
 | ||||||
|  | 		if (((struct tcp_zerocopy_receive *)optval)->address != 0) | ||||||
|  | 			return 0; /* EPERM, unexpected data */ | ||||||
|  | 
 | ||||||
|  | 		return 1; | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
| 	if (ctx->level == SOL_IP && ctx->optname == IP_FREEBIND) { | 	if (ctx->level == SOL_IP && ctx->optname == IP_FREEBIND) { | ||||||
| 		if (optval + 1 > optval_end) | 		if (optval + 1 > optval_end) | ||||||
| 			return 0; /* EPERM, bounds check */ | 			return 0; /* EPERM, bounds check */ | ||||||
|  |  | ||||||
|  | @ -16,7 +16,6 @@ typedef __u16 __sum16; | ||||||
| #include <linux/if_packet.h> | #include <linux/if_packet.h> | ||||||
| #include <linux/ip.h> | #include <linux/ip.h> | ||||||
| #include <linux/ipv6.h> | #include <linux/ipv6.h> | ||||||
| #include <netinet/tcp.h> |  | ||||||
| #include <linux/filter.h> | #include <linux/filter.h> | ||||||
| #include <linux/perf_event.h> | #include <linux/perf_event.h> | ||||||
| #include <linux/socket.h> | #include <linux/socket.h> | ||||||
|  |  | ||||||
		Loading…
	
		Reference in a new issue
	
	 Stanislav Fomichev
						Stanislav Fomichev