forked from mirrors/linux
		
	bpf: Remove extra lock_sock for TCP_ZEROCOPY_RECEIVE
Add custom implementation of getsockopt hook for TCP_ZEROCOPY_RECEIVE.
We skip generic hooks for TCP_ZEROCOPY_RECEIVE and have a custom
call in do_tcp_getsockopt using the on-stack data. This removes
3% overhead for locking/unlocking the socket.
Without this patch:
     3.38%     0.07%  tcp_mmap  [kernel.kallsyms]  [k] __cgroup_bpf_run_filter_getsockopt
            |
             --3.30%--__cgroup_bpf_run_filter_getsockopt
                       |
                        --0.81%--__kmalloc
With the patch applied:
     0.52%     0.12%  tcp_mmap  [kernel.kallsyms]  [k] __cgroup_bpf_run_filter_getsockopt_kern
Note, exporting uapi/tcp.h requires removing netinet/tcp.h
from test_progs.h because those headers have confliciting
definitions.
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20210115163501.805133-2-sdf@google.com
			
			
This commit is contained in:
		
							parent
							
								
									13ca51d5eb
								
							
						
					
					
						commit
						9cacf81f81
					
				
					 16 changed files with 506 additions and 7 deletions
				
			
		|  | @ -147,6 +147,10 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, | |||
| 				       int __user *optlen, int max_optlen, | ||||
| 				       int retval); | ||||
| 
 | ||||
| int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, | ||||
| 					    int optname, void *optval, | ||||
| 					    int *optlen, int retval); | ||||
| 
 | ||||
| static inline enum bpf_cgroup_storage_type cgroup_storage_type( | ||||
| 	struct bpf_map *map) | ||||
| { | ||||
|  | @ -364,10 +368,23 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, | |||
| ({									       \ | ||||
| 	int __ret = retval;						       \ | ||||
| 	if (cgroup_bpf_enabled)						       \ | ||||
| 		__ret = __cgroup_bpf_run_filter_getsockopt(sock, level,	       \ | ||||
| 							   optname, optval,    \ | ||||
| 							   optlen, max_optlen, \ | ||||
| 							   retval);	       \ | ||||
| 		if (!(sock)->sk_prot->bpf_bypass_getsockopt ||		       \ | ||||
| 		    !INDIRECT_CALL_INET_1((sock)->sk_prot->bpf_bypass_getsockopt, \ | ||||
| 					tcp_bpf_bypass_getsockopt,	       \ | ||||
| 					level, optname))		       \ | ||||
| 			__ret = __cgroup_bpf_run_filter_getsockopt(	       \ | ||||
| 				sock, level, optname, optval, optlen,	       \ | ||||
| 				max_optlen, retval);			       \ | ||||
| 	__ret;								       \ | ||||
| }) | ||||
| 
 | ||||
| #define BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sock, level, optname, optval,      \ | ||||
| 					    optlen, retval)		       \ | ||||
| ({									       \ | ||||
| 	int __ret = retval;						       \ | ||||
| 	if (cgroup_bpf_enabled)						       \ | ||||
| 		__ret = __cgroup_bpf_run_filter_getsockopt_kern(	       \ | ||||
| 			sock, level, optname, optval, optlen, retval);	       \ | ||||
| 	__ret;								       \ | ||||
| }) | ||||
| 
 | ||||
|  | @ -452,6 +469,8 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, | |||
| #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; }) | ||||
| #define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \ | ||||
| 				       optlen, max_optlen, retval) ({ retval; }) | ||||
| #define BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sock, level, optname, optval, \ | ||||
| 					    optlen, retval) ({ retval; }) | ||||
| #define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \ | ||||
| 				       kernel_optval) ({ 0; }) | ||||
| 
 | ||||
|  |  | |||
|  | @ -60,4 +60,10 @@ | |||
| #define INDIRECT_CALL_INET(f, f2, f1, ...) f(__VA_ARGS__) | ||||
| #endif | ||||
| 
 | ||||
| #if IS_ENABLED(CONFIG_INET) | ||||
| #define INDIRECT_CALL_INET_1(f, f1, ...) INDIRECT_CALL_1(f, f1, __VA_ARGS__) | ||||
| #else | ||||
| #define INDIRECT_CALL_INET_1(f, f1, ...) f(__VA_ARGS__) | ||||
| #endif | ||||
| 
 | ||||
| #endif | ||||
|  |  | |||
|  | @ -1174,6 +1174,8 @@ struct proto { | |||
| 
 | ||||
| 	int			(*backlog_rcv) (struct sock *sk, | ||||
| 						struct sk_buff *skb); | ||||
| 	bool			(*bpf_bypass_getsockopt)(int level, | ||||
| 							 int optname); | ||||
| 
 | ||||
| 	void		(*release_cb)(struct sock *sk); | ||||
| 
 | ||||
|  |  | |||
|  | @ -403,6 +403,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, | |||
| 		      struct poll_table_struct *wait); | ||||
| int tcp_getsockopt(struct sock *sk, int level, int optname, | ||||
| 		   char __user *optval, int __user *optlen); | ||||
| bool tcp_bpf_bypass_getsockopt(int level, int optname); | ||||
| int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, | ||||
| 		   unsigned int optlen); | ||||
| void tcp_set_keepalive(struct sock *sk, int val); | ||||
|  |  | |||
|  | @ -1486,6 +1486,52 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, | |||
| 	sockopt_free_buf(&ctx); | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, | ||||
| 					    int optname, void *optval, | ||||
| 					    int *optlen, int retval) | ||||
| { | ||||
| 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); | ||||
| 	struct bpf_sockopt_kern ctx = { | ||||
| 		.sk = sk, | ||||
| 		.level = level, | ||||
| 		.optname = optname, | ||||
| 		.retval = retval, | ||||
| 		.optlen = *optlen, | ||||
| 		.optval = optval, | ||||
| 		.optval_end = optval + *optlen, | ||||
| 	}; | ||||
| 	int ret; | ||||
| 
 | ||||
| 	/* Note that __cgroup_bpf_run_filter_getsockopt doesn't copy
 | ||||
| 	 * user data back into BPF buffer when reval != 0. This is | ||||
| 	 * done as an optimization to avoid extra copy, assuming | ||||
| 	 * kernel won't populate the data in case of an error. | ||||
| 	 * Here we always pass the data and memset() should | ||||
| 	 * be called if that data shouldn't be "exported". | ||||
| 	 */ | ||||
| 
 | ||||
| 	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], | ||||
| 				 &ctx, BPF_PROG_RUN); | ||||
| 	if (!ret) | ||||
| 		return -EPERM; | ||||
| 
 | ||||
| 	if (ctx.optlen > *optlen) | ||||
| 		return -EFAULT; | ||||
| 
 | ||||
| 	/* BPF programs only allowed to set retval to 0, not some
 | ||||
| 	 * arbitrary value. | ||||
| 	 */ | ||||
| 	if (ctx.retval != 0 && ctx.retval != retval) | ||||
| 		return -EFAULT; | ||||
| 
 | ||||
| 	/* BPF programs can shrink the buffer, export the modifications.
 | ||||
| 	 */ | ||||
| 	if (ctx.optlen != 0) | ||||
| 		*optlen = ctx.optlen; | ||||
| 
 | ||||
| 	return ctx.retval; | ||||
| } | ||||
| #endif | ||||
| 
 | ||||
| static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp, | ||||
|  |  | |||
|  | @ -4099,6 +4099,8 @@ static int do_tcp_getsockopt(struct sock *sk, int level, | |||
| 			return -EFAULT; | ||||
| 		lock_sock(sk); | ||||
| 		err = tcp_zerocopy_receive(sk, &zc); | ||||
| 		err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname, | ||||
| 							  &zc, &len, err); | ||||
| 		release_sock(sk); | ||||
| 		if (len >= offsetofend(struct tcp_zerocopy_receive, err)) | ||||
| 			goto zerocopy_rcv_sk_err; | ||||
|  | @ -4133,6 +4135,18 @@ static int do_tcp_getsockopt(struct sock *sk, int level, | |||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| bool tcp_bpf_bypass_getsockopt(int level, int optname) | ||||
| { | ||||
| 	/* TCP do_tcp_getsockopt has optimized getsockopt implementation
 | ||||
| 	 * to avoid extra socket lock for TCP_ZEROCOPY_RECEIVE. | ||||
| 	 */ | ||||
| 	if (level == SOL_TCP && optname == TCP_ZEROCOPY_RECEIVE) | ||||
| 		return true; | ||||
| 
 | ||||
| 	return false; | ||||
| } | ||||
| EXPORT_SYMBOL(tcp_bpf_bypass_getsockopt); | ||||
| 
 | ||||
| int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, | ||||
| 		   int __user *optlen) | ||||
| { | ||||
|  |  | |||
|  | @ -2793,6 +2793,7 @@ struct proto tcp_prot = { | |||
| 	.shutdown		= tcp_shutdown, | ||||
| 	.setsockopt		= tcp_setsockopt, | ||||
| 	.getsockopt		= tcp_getsockopt, | ||||
| 	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt, | ||||
| 	.keepalive		= tcp_set_keepalive, | ||||
| 	.recvmsg		= tcp_recvmsg, | ||||
| 	.sendmsg		= tcp_sendmsg, | ||||
|  |  | |||
|  | @ -2121,6 +2121,7 @@ struct proto tcpv6_prot = { | |||
| 	.shutdown		= tcp_shutdown, | ||||
| 	.setsockopt		= tcp_setsockopt, | ||||
| 	.getsockopt		= tcp_getsockopt, | ||||
| 	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt, | ||||
| 	.keepalive		= tcp_set_keepalive, | ||||
| 	.recvmsg		= tcp_recvmsg, | ||||
| 	.sendmsg		= tcp_sendmsg, | ||||
|  |  | |||
|  | @ -2126,6 +2126,9 @@ SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname, | |||
| 	return __sys_setsockopt(fd, level, optname, optval, optlen); | ||||
| } | ||||
| 
 | ||||
| INDIRECT_CALLABLE_DECLARE(bool tcp_bpf_bypass_getsockopt(int level, | ||||
| 							 int optname)); | ||||
| 
 | ||||
| /*
 | ||||
|  *	Get a socket option. Because we don't know the option lengths we have | ||||
|  *	to pass a user mode parameter for the protocols to sort out. | ||||
|  |  | |||
							
								
								
									
										357
									
								
								tools/include/uapi/linux/tcp.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										357
									
								
								tools/include/uapi/linux/tcp.h
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,357 @@ | |||
| /* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ | ||||
| /*
 | ||||
|  * INET		An implementation of the TCP/IP protocol suite for the LINUX | ||||
|  *		operating system.  INET is implemented using the  BSD Socket | ||||
|  *		interface as the means of communication with the user level. | ||||
|  * | ||||
|  *		Definitions for the TCP protocol. | ||||
|  * | ||||
|  * Version:	@(#)tcp.h	1.0.2	04/28/93 | ||||
|  * | ||||
|  * Author:	Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> | ||||
|  * | ||||
|  *		This program is free software; you can redistribute it and/or | ||||
|  *		modify it under the terms of the GNU General Public License | ||||
|  *		as published by the Free Software Foundation; either version | ||||
|  *		2 of the License, or (at your option) any later version. | ||||
|  */ | ||||
| #ifndef _UAPI_LINUX_TCP_H | ||||
| #define _UAPI_LINUX_TCP_H | ||||
| 
 | ||||
| #include <linux/types.h> | ||||
| #include <asm/byteorder.h> | ||||
| #include <linux/socket.h> | ||||
| 
 | ||||
| struct tcphdr { | ||||
| 	__be16	source; | ||||
| 	__be16	dest; | ||||
| 	__be32	seq; | ||||
| 	__be32	ack_seq; | ||||
| #if defined(__LITTLE_ENDIAN_BITFIELD) | ||||
| 	__u16	res1:4, | ||||
| 		doff:4, | ||||
| 		fin:1, | ||||
| 		syn:1, | ||||
| 		rst:1, | ||||
| 		psh:1, | ||||
| 		ack:1, | ||||
| 		urg:1, | ||||
| 		ece:1, | ||||
| 		cwr:1; | ||||
| #elif defined(__BIG_ENDIAN_BITFIELD) | ||||
| 	__u16	doff:4, | ||||
| 		res1:4, | ||||
| 		cwr:1, | ||||
| 		ece:1, | ||||
| 		urg:1, | ||||
| 		ack:1, | ||||
| 		psh:1, | ||||
| 		rst:1, | ||||
| 		syn:1, | ||||
| 		fin:1; | ||||
| #else | ||||
| #error	"Adjust your <asm/byteorder.h> defines" | ||||
| #endif	 | ||||
| 	__be16	window; | ||||
| 	__sum16	check; | ||||
| 	__be16	urg_ptr; | ||||
| }; | ||||
| 
 | ||||
| /*
 | ||||
|  *	The union cast uses a gcc extension to avoid aliasing problems | ||||
|  *  (union is compatible to any of its members) | ||||
|  *  This means this part of the code is -fstrict-aliasing safe now. | ||||
|  */ | ||||
| union tcp_word_hdr {  | ||||
| 	struct tcphdr hdr; | ||||
| 	__be32 		  words[5]; | ||||
| };  | ||||
| 
 | ||||
| #define tcp_flag_word(tp) ( ((union tcp_word_hdr *)(tp))->words [3])  | ||||
| 
 | ||||
| enum {  | ||||
| 	TCP_FLAG_CWR = __constant_cpu_to_be32(0x00800000), | ||||
| 	TCP_FLAG_ECE = __constant_cpu_to_be32(0x00400000), | ||||
| 	TCP_FLAG_URG = __constant_cpu_to_be32(0x00200000), | ||||
| 	TCP_FLAG_ACK = __constant_cpu_to_be32(0x00100000), | ||||
| 	TCP_FLAG_PSH = __constant_cpu_to_be32(0x00080000), | ||||
| 	TCP_FLAG_RST = __constant_cpu_to_be32(0x00040000), | ||||
| 	TCP_FLAG_SYN = __constant_cpu_to_be32(0x00020000), | ||||
| 	TCP_FLAG_FIN = __constant_cpu_to_be32(0x00010000), | ||||
| 	TCP_RESERVED_BITS = __constant_cpu_to_be32(0x0F000000), | ||||
| 	TCP_DATA_OFFSET = __constant_cpu_to_be32(0xF0000000) | ||||
| };  | ||||
| 
 | ||||
| /*
 | ||||
|  * TCP general constants | ||||
|  */ | ||||
| #define TCP_MSS_DEFAULT		 536U	/* IPv4 (RFC1122, RFC2581) */ | ||||
| #define TCP_MSS_DESIRED		1220U	/* IPv6 (tunneled), EDNS0 (RFC3226) */ | ||||
| 
 | ||||
| /* TCP socket options */ | ||||
| #define TCP_NODELAY		1	/* Turn off Nagle's algorithm. */ | ||||
| #define TCP_MAXSEG		2	/* Limit MSS */ | ||||
| #define TCP_CORK		3	/* Never send partially complete segments */ | ||||
| #define TCP_KEEPIDLE		4	/* Start keeplives after this period */ | ||||
| #define TCP_KEEPINTVL		5	/* Interval between keepalives */ | ||||
| #define TCP_KEEPCNT		6	/* Number of keepalives before death */ | ||||
| #define TCP_SYNCNT		7	/* Number of SYN retransmits */ | ||||
| #define TCP_LINGER2		8	/* Life time of orphaned FIN-WAIT-2 state */ | ||||
| #define TCP_DEFER_ACCEPT	9	/* Wake up listener only when data arrive */ | ||||
| #define TCP_WINDOW_CLAMP	10	/* Bound advertised window */ | ||||
| #define TCP_INFO		11	/* Information about this connection. */ | ||||
| #define TCP_QUICKACK		12	/* Block/reenable quick acks */ | ||||
| #define TCP_CONGESTION		13	/* Congestion control algorithm */ | ||||
| #define TCP_MD5SIG		14	/* TCP MD5 Signature (RFC2385) */ | ||||
| #define TCP_THIN_LINEAR_TIMEOUTS 16      /* Use linear timeouts for thin streams*/ | ||||
| #define TCP_THIN_DUPACK         17      /* Fast retrans. after 1 dupack */ | ||||
| #define TCP_USER_TIMEOUT	18	/* How long for loss retry before timeout */ | ||||
| #define TCP_REPAIR		19	/* TCP sock is under repair right now */ | ||||
| #define TCP_REPAIR_QUEUE	20 | ||||
| #define TCP_QUEUE_SEQ		21 | ||||
| #define TCP_REPAIR_OPTIONS	22 | ||||
| #define TCP_FASTOPEN		23	/* Enable FastOpen on listeners */ | ||||
| #define TCP_TIMESTAMP		24 | ||||
| #define TCP_NOTSENT_LOWAT	25	/* limit number of unsent bytes in write queue */ | ||||
| #define TCP_CC_INFO		26	/* Get Congestion Control (optional) info */ | ||||
| #define TCP_SAVE_SYN		27	/* Record SYN headers for new connections */ | ||||
| #define TCP_SAVED_SYN		28	/* Get SYN headers recorded for connection */ | ||||
| #define TCP_REPAIR_WINDOW	29	/* Get/set window parameters */ | ||||
| #define TCP_FASTOPEN_CONNECT	30	/* Attempt FastOpen with connect */ | ||||
| #define TCP_ULP			31	/* Attach a ULP to a TCP connection */ | ||||
| #define TCP_MD5SIG_EXT		32	/* TCP MD5 Signature with extensions */ | ||||
| #define TCP_FASTOPEN_KEY	33	/* Set the key for Fast Open (cookie) */ | ||||
| #define TCP_FASTOPEN_NO_COOKIE	34	/* Enable TFO without a TFO cookie */ | ||||
| #define TCP_ZEROCOPY_RECEIVE	35 | ||||
| #define TCP_INQ			36	/* Notify bytes available to read as a cmsg on read */ | ||||
| 
 | ||||
| #define TCP_CM_INQ		TCP_INQ | ||||
| 
 | ||||
| #define TCP_TX_DELAY		37	/* delay outgoing packets by XX usec */ | ||||
| 
 | ||||
| 
 | ||||
| #define TCP_REPAIR_ON		1 | ||||
| #define TCP_REPAIR_OFF		0 | ||||
| #define TCP_REPAIR_OFF_NO_WP	-1	/* Turn off without window probes */ | ||||
| 
 | ||||
| struct tcp_repair_opt { | ||||
| 	__u32	opt_code; | ||||
| 	__u32	opt_val; | ||||
| }; | ||||
| 
 | ||||
| struct tcp_repair_window { | ||||
| 	__u32	snd_wl1; | ||||
| 	__u32	snd_wnd; | ||||
| 	__u32	max_window; | ||||
| 
 | ||||
| 	__u32	rcv_wnd; | ||||
| 	__u32	rcv_wup; | ||||
| }; | ||||
| 
 | ||||
| enum { | ||||
| 	TCP_NO_QUEUE, | ||||
| 	TCP_RECV_QUEUE, | ||||
| 	TCP_SEND_QUEUE, | ||||
| 	TCP_QUEUES_NR, | ||||
| }; | ||||
| 
 | ||||
| /* why fastopen failed from client perspective */ | ||||
| enum tcp_fastopen_client_fail { | ||||
| 	TFO_STATUS_UNSPEC, /* catch-all */ | ||||
| 	TFO_COOKIE_UNAVAILABLE, /* if not in TFO_CLIENT_NO_COOKIE mode */ | ||||
| 	TFO_DATA_NOT_ACKED, /* SYN-ACK did not ack SYN data */ | ||||
| 	TFO_SYN_RETRANSMITTED, /* SYN-ACK did not ack SYN data after timeout */ | ||||
| }; | ||||
| 
 | ||||
| /* for TCP_INFO socket option */ | ||||
| #define TCPI_OPT_TIMESTAMPS	1 | ||||
| #define TCPI_OPT_SACK		2 | ||||
| #define TCPI_OPT_WSCALE		4 | ||||
| #define TCPI_OPT_ECN		8 /* ECN was negociated at TCP session init */ | ||||
| #define TCPI_OPT_ECN_SEEN	16 /* we received at least one packet with ECT */ | ||||
| #define TCPI_OPT_SYN_DATA	32 /* SYN-ACK acked data in SYN sent or rcvd */ | ||||
| 
 | ||||
| /*
 | ||||
|  * Sender's congestion state indicating normal or abnormal situations | ||||
|  * in the last round of packets sent. The state is driven by the ACK | ||||
|  * information and timer events. | ||||
|  */ | ||||
| enum tcp_ca_state { | ||||
| 	/*
 | ||||
| 	 * Nothing bad has been observed recently. | ||||
| 	 * No apparent reordering, packet loss, or ECN marks. | ||||
| 	 */ | ||||
| 	TCP_CA_Open = 0, | ||||
| #define TCPF_CA_Open	(1<<TCP_CA_Open) | ||||
| 	/*
 | ||||
| 	 * The sender enters disordered state when it has received DUPACKs or | ||||
| 	 * SACKs in the last round of packets sent. This could be due to packet | ||||
| 	 * loss or reordering but needs further information to confirm packets | ||||
| 	 * have been lost. | ||||
| 	 */ | ||||
| 	TCP_CA_Disorder = 1, | ||||
| #define TCPF_CA_Disorder (1<<TCP_CA_Disorder) | ||||
| 	/*
 | ||||
| 	 * The sender enters Congestion Window Reduction (CWR) state when it | ||||
| 	 * has received ACKs with ECN-ECE marks, or has experienced congestion | ||||
| 	 * or packet discard on the sender host (e.g. qdisc). | ||||
| 	 */ | ||||
| 	TCP_CA_CWR = 2, | ||||
| #define TCPF_CA_CWR	(1<<TCP_CA_CWR) | ||||
| 	/*
 | ||||
| 	 * The sender is in fast recovery and retransmitting lost packets, | ||||
| 	 * typically triggered by ACK events. | ||||
| 	 */ | ||||
| 	TCP_CA_Recovery = 3, | ||||
| #define TCPF_CA_Recovery (1<<TCP_CA_Recovery) | ||||
| 	/*
 | ||||
| 	 * The sender is in loss recovery triggered by retransmission timeout. | ||||
| 	 */ | ||||
| 	TCP_CA_Loss = 4 | ||||
| #define TCPF_CA_Loss	(1<<TCP_CA_Loss) | ||||
| }; | ||||
| 
 | ||||
| struct tcp_info { | ||||
| 	__u8	tcpi_state; | ||||
| 	__u8	tcpi_ca_state; | ||||
| 	__u8	tcpi_retransmits; | ||||
| 	__u8	tcpi_probes; | ||||
| 	__u8	tcpi_backoff; | ||||
| 	__u8	tcpi_options; | ||||
| 	__u8	tcpi_snd_wscale : 4, tcpi_rcv_wscale : 4; | ||||
| 	__u8	tcpi_delivery_rate_app_limited:1, tcpi_fastopen_client_fail:2; | ||||
| 
 | ||||
| 	__u32	tcpi_rto; | ||||
| 	__u32	tcpi_ato; | ||||
| 	__u32	tcpi_snd_mss; | ||||
| 	__u32	tcpi_rcv_mss; | ||||
| 
 | ||||
| 	__u32	tcpi_unacked; | ||||
| 	__u32	tcpi_sacked; | ||||
| 	__u32	tcpi_lost; | ||||
| 	__u32	tcpi_retrans; | ||||
| 	__u32	tcpi_fackets; | ||||
| 
 | ||||
| 	/* Times. */ | ||||
| 	__u32	tcpi_last_data_sent; | ||||
| 	__u32	tcpi_last_ack_sent;     /* Not remembered, sorry. */ | ||||
| 	__u32	tcpi_last_data_recv; | ||||
| 	__u32	tcpi_last_ack_recv; | ||||
| 
 | ||||
| 	/* Metrics. */ | ||||
| 	__u32	tcpi_pmtu; | ||||
| 	__u32	tcpi_rcv_ssthresh; | ||||
| 	__u32	tcpi_rtt; | ||||
| 	__u32	tcpi_rttvar; | ||||
| 	__u32	tcpi_snd_ssthresh; | ||||
| 	__u32	tcpi_snd_cwnd; | ||||
| 	__u32	tcpi_advmss; | ||||
| 	__u32	tcpi_reordering; | ||||
| 
 | ||||
| 	__u32	tcpi_rcv_rtt; | ||||
| 	__u32	tcpi_rcv_space; | ||||
| 
 | ||||
| 	__u32	tcpi_total_retrans; | ||||
| 
 | ||||
| 	__u64	tcpi_pacing_rate; | ||||
| 	__u64	tcpi_max_pacing_rate; | ||||
| 	__u64	tcpi_bytes_acked;    /* RFC4898 tcpEStatsAppHCThruOctetsAcked */ | ||||
| 	__u64	tcpi_bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived */ | ||||
| 	__u32	tcpi_segs_out;	     /* RFC4898 tcpEStatsPerfSegsOut */ | ||||
| 	__u32	tcpi_segs_in;	     /* RFC4898 tcpEStatsPerfSegsIn */ | ||||
| 
 | ||||
| 	__u32	tcpi_notsent_bytes; | ||||
| 	__u32	tcpi_min_rtt; | ||||
| 	__u32	tcpi_data_segs_in;	/* RFC4898 tcpEStatsDataSegsIn */ | ||||
| 	__u32	tcpi_data_segs_out;	/* RFC4898 tcpEStatsDataSegsOut */ | ||||
| 
 | ||||
| 	__u64   tcpi_delivery_rate; | ||||
| 
 | ||||
| 	__u64	tcpi_busy_time;      /* Time (usec) busy sending data */ | ||||
| 	__u64	tcpi_rwnd_limited;   /* Time (usec) limited by receive window */ | ||||
| 	__u64	tcpi_sndbuf_limited; /* Time (usec) limited by send buffer */ | ||||
| 
 | ||||
| 	__u32	tcpi_delivered; | ||||
| 	__u32	tcpi_delivered_ce; | ||||
| 
 | ||||
| 	__u64	tcpi_bytes_sent;     /* RFC4898 tcpEStatsPerfHCDataOctetsOut */ | ||||
| 	__u64	tcpi_bytes_retrans;  /* RFC4898 tcpEStatsPerfOctetsRetrans */ | ||||
| 	__u32	tcpi_dsack_dups;     /* RFC4898 tcpEStatsStackDSACKDups */ | ||||
| 	__u32	tcpi_reord_seen;     /* reordering events seen */ | ||||
| 
 | ||||
| 	__u32	tcpi_rcv_ooopack;    /* Out-of-order packets received */ | ||||
| 
 | ||||
| 	__u32	tcpi_snd_wnd;	     /* peer's advertised receive window after
 | ||||
| 				      * scaling (bytes) | ||||
| 				      */ | ||||
| }; | ||||
| 
 | ||||
| /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */ | ||||
| enum { | ||||
| 	TCP_NLA_PAD, | ||||
| 	TCP_NLA_BUSY,		/* Time (usec) busy sending data */ | ||||
| 	TCP_NLA_RWND_LIMITED,	/* Time (usec) limited by receive window */ | ||||
| 	TCP_NLA_SNDBUF_LIMITED,	/* Time (usec) limited by send buffer */ | ||||
| 	TCP_NLA_DATA_SEGS_OUT,	/* Data pkts sent including retransmission */ | ||||
| 	TCP_NLA_TOTAL_RETRANS,	/* Data pkts retransmitted */ | ||||
| 	TCP_NLA_PACING_RATE,    /* Pacing rate in bytes per second */ | ||||
| 	TCP_NLA_DELIVERY_RATE,  /* Delivery rate in bytes per second */ | ||||
| 	TCP_NLA_SND_CWND,       /* Sending congestion window */ | ||||
| 	TCP_NLA_REORDERING,     /* Reordering metric */ | ||||
| 	TCP_NLA_MIN_RTT,        /* minimum RTT */ | ||||
| 	TCP_NLA_RECUR_RETRANS,  /* Recurring retransmits for the current pkt */ | ||||
| 	TCP_NLA_DELIVERY_RATE_APP_LMT, /* delivery rate application limited ? */ | ||||
| 	TCP_NLA_SNDQ_SIZE,	/* Data (bytes) pending in send queue */ | ||||
| 	TCP_NLA_CA_STATE,	/* ca_state of socket */ | ||||
| 	TCP_NLA_SND_SSTHRESH,	/* Slow start size threshold */ | ||||
| 	TCP_NLA_DELIVERED,	/* Data pkts delivered incl. out-of-order */ | ||||
| 	TCP_NLA_DELIVERED_CE,	/* Like above but only ones w/ CE marks */ | ||||
| 	TCP_NLA_BYTES_SENT,	/* Data bytes sent including retransmission */ | ||||
| 	TCP_NLA_BYTES_RETRANS,	/* Data bytes retransmitted */ | ||||
| 	TCP_NLA_DSACK_DUPS,	/* DSACK blocks received */ | ||||
| 	TCP_NLA_REORD_SEEN,	/* reordering events seen */ | ||||
| 	TCP_NLA_SRTT,		/* smoothed RTT in usecs */ | ||||
| 	TCP_NLA_TIMEOUT_REHASH, /* Timeout-triggered rehash attempts */ | ||||
| 	TCP_NLA_BYTES_NOTSENT,	/* Bytes in write queue not yet sent */ | ||||
| 	TCP_NLA_EDT,		/* Earliest departure time (CLOCK_MONOTONIC) */ | ||||
| }; | ||||
| 
 | ||||
| /* for TCP_MD5SIG socket option */ | ||||
| #define TCP_MD5SIG_MAXKEYLEN	80 | ||||
| 
 | ||||
| /* tcp_md5sig extension flags for TCP_MD5SIG_EXT */ | ||||
| #define TCP_MD5SIG_FLAG_PREFIX		0x1	/* address prefix length */ | ||||
| #define TCP_MD5SIG_FLAG_IFINDEX		0x2	/* ifindex set */ | ||||
| 
 | ||||
| struct tcp_md5sig { | ||||
| 	struct __kernel_sockaddr_storage tcpm_addr;	/* address associated */ | ||||
| 	__u8	tcpm_flags;				/* extension flags */ | ||||
| 	__u8	tcpm_prefixlen;				/* address prefix */ | ||||
| 	__u16	tcpm_keylen;				/* key length */ | ||||
| 	int	tcpm_ifindex;				/* device index for scope */ | ||||
| 	__u8	tcpm_key[TCP_MD5SIG_MAXKEYLEN];		/* key (binary) */ | ||||
| }; | ||||
| 
 | ||||
| /* INET_DIAG_MD5SIG */ | ||||
| struct tcp_diag_md5sig { | ||||
| 	__u8	tcpm_family; | ||||
| 	__u8	tcpm_prefixlen; | ||||
| 	__u16	tcpm_keylen; | ||||
| 	__be32	tcpm_addr[4]; | ||||
| 	__u8	tcpm_key[TCP_MD5SIG_MAXKEYLEN]; | ||||
| }; | ||||
| 
 | ||||
| /* setsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...) */ | ||||
| 
 | ||||
| #define TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT 0x1 | ||||
| struct tcp_zerocopy_receive { | ||||
| 	__u64 address;		/* in: address of mapping */ | ||||
| 	__u32 length;		/* in/out: number of bytes to map/mapped */ | ||||
| 	__u32 recv_skip_hint;	/* out: amount of bytes to skip */ | ||||
| 	__u32 inq; /* out: amount of bytes in read queue */ | ||||
| 	__s32 err; /* out: socket error */ | ||||
| 	__u64 copybuf_address;	/* in: copybuf address (small reads) */ | ||||
| 	__s32 copybuf_len; /* in/out: copybuf bytes avail/used or error */ | ||||
| 	__u32 flags; /* in: flags */ | ||||
| }; | ||||
| #endif /* _UAPI_LINUX_TCP_H */ | ||||
|  | @ -2,6 +2,7 @@ | |||
| /* Copyright (c) 2019 Facebook */ | ||||
| 
 | ||||
| #include <linux/err.h> | ||||
| #include <netinet/tcp.h> | ||||
| #include <test_progs.h> | ||||
| #include "bpf_dctcp.skel.h" | ||||
| #include "bpf_cubic.skel.h" | ||||
|  |  | |||
|  | @ -7,6 +7,7 @@ | |||
| #include <string.h> | ||||
| 
 | ||||
| #include <linux/pkt_cls.h> | ||||
| #include <netinet/tcp.h> | ||||
| 
 | ||||
| #include <test_progs.h> | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,6 +1,7 @@ | |||
| // SPDX-License-Identifier: GPL-2.0
 | ||||
| // Copyright (c) 2020 Cloudflare
 | ||||
| #include <error.h> | ||||
| #include <netinet/tcp.h> | ||||
| 
 | ||||
| #include "test_progs.h" | ||||
| #include "test_skmsg_load_helpers.skel.h" | ||||
|  |  | |||
|  | @ -2,6 +2,12 @@ | |||
| #include <test_progs.h> | ||||
| #include "cgroup_helpers.h" | ||||
| 
 | ||||
| #include <linux/tcp.h> | ||||
| 
 | ||||
| #ifndef SOL_TCP | ||||
| #define SOL_TCP IPPROTO_TCP | ||||
| #endif | ||||
| 
 | ||||
| #define SOL_CUSTOM			0xdeadbeef | ||||
| 
 | ||||
| static int getsetsockopt(void) | ||||
|  | @ -11,6 +17,7 @@ static int getsetsockopt(void) | |||
| 		char u8[4]; | ||||
| 		__u32 u32; | ||||
| 		char cc[16]; /* TCP_CA_NAME_MAX */ | ||||
| 		struct tcp_zerocopy_receive zc; | ||||
| 	} buf = {}; | ||||
| 	socklen_t optlen; | ||||
| 	char *big_buf = NULL; | ||||
|  | @ -154,6 +161,27 @@ static int getsetsockopt(void) | |||
| 		goto err; | ||||
| 	} | ||||
| 
 | ||||
| 	/* TCP_ZEROCOPY_RECEIVE triggers */ | ||||
| 	memset(&buf, 0, sizeof(buf)); | ||||
| 	optlen = sizeof(buf.zc); | ||||
| 	err = getsockopt(fd, SOL_TCP, TCP_ZEROCOPY_RECEIVE, &buf, &optlen); | ||||
| 	if (err) { | ||||
| 		log_err("Unexpected getsockopt(TCP_ZEROCOPY_RECEIVE) err=%d errno=%d", | ||||
| 			err, errno); | ||||
| 		goto err; | ||||
| 	} | ||||
| 
 | ||||
| 	memset(&buf, 0, sizeof(buf)); | ||||
| 	buf.zc.address = 12345; /* rejected by BPF */ | ||||
| 	optlen = sizeof(buf.zc); | ||||
| 	errno = 0; | ||||
| 	err = getsockopt(fd, SOL_TCP, TCP_ZEROCOPY_RECEIVE, &buf, &optlen); | ||||
| 	if (errno != EPERM) { | ||||
| 		log_err("Unexpected getsockopt(TCP_ZEROCOPY_RECEIVE) err=%d errno=%d", | ||||
| 			err, errno); | ||||
| 		goto err; | ||||
| 	} | ||||
| 
 | ||||
| 	free(big_buf); | ||||
| 	close(fd); | ||||
| 	return 0; | ||||
|  |  | |||
|  | @ -1,8 +1,8 @@ | |||
| // SPDX-License-Identifier: GPL-2.0
 | ||||
| #include <string.h> | ||||
| #include <netinet/in.h> | ||||
| #include <netinet/tcp.h> | ||||
| #include <linux/tcp.h> | ||||
| #include <linux/bpf.h> | ||||
| #include <netinet/in.h> | ||||
| #include <bpf/bpf_helpers.h> | ||||
| 
 | ||||
| char _license[] SEC("license") = "GPL"; | ||||
|  | @ -12,6 +12,10 @@ __u32 _version SEC("version") = 1; | |||
| #define PAGE_SIZE 4096 | ||||
| #endif | ||||
| 
 | ||||
| #ifndef SOL_TCP | ||||
| #define SOL_TCP IPPROTO_TCP | ||||
| #endif | ||||
| 
 | ||||
| #define SOL_CUSTOM			0xdeadbeef | ||||
| 
 | ||||
| struct sockopt_sk { | ||||
|  | @ -57,6 +61,21 @@ int _getsockopt(struct bpf_sockopt *ctx) | |||
| 		return 1; | ||||
| 	} | ||||
| 
 | ||||
| 	if (ctx->level == SOL_TCP && ctx->optname == TCP_ZEROCOPY_RECEIVE) { | ||||
| 		/* Verify that TCP_ZEROCOPY_RECEIVE triggers.
 | ||||
| 		 * It has a custom implementation for performance | ||||
| 		 * reasons. | ||||
| 		 */ | ||||
| 
 | ||||
| 		if (optval + sizeof(struct tcp_zerocopy_receive) > optval_end) | ||||
| 			return 0; /* EPERM, bounds check */ | ||||
| 
 | ||||
| 		if (((struct tcp_zerocopy_receive *)optval)->address != 0) | ||||
| 			return 0; /* EPERM, unexpected data */ | ||||
| 
 | ||||
| 		return 1; | ||||
| 	} | ||||
| 
 | ||||
| 	if (ctx->level == SOL_IP && ctx->optname == IP_FREEBIND) { | ||||
| 		if (optval + 1 > optval_end) | ||||
| 			return 0; /* EPERM, bounds check */ | ||||
|  |  | |||
|  | @ -16,7 +16,6 @@ typedef __u16 __sum16; | |||
| #include <linux/if_packet.h> | ||||
| #include <linux/ip.h> | ||||
| #include <linux/ipv6.h> | ||||
| #include <netinet/tcp.h> | ||||
| #include <linux/filter.h> | ||||
| #include <linux/perf_event.h> | ||||
| #include <linux/socket.h> | ||||
|  |  | |||
		Loading…
	
		Reference in a new issue
	
	 Stanislav Fomichev
						Stanislav Fomichev