mirror of
				https://github.com/torvalds/linux.git
				synced 2025-11-04 10:40:15 +02:00 
			
		
		
		
	bpf: Remove extra lock_sock for TCP_ZEROCOPY_RECEIVE
Add custom implementation of getsockopt hook for TCP_ZEROCOPY_RECEIVE.
We skip generic hooks for TCP_ZEROCOPY_RECEIVE and have a custom
call in do_tcp_getsockopt using the on-stack data. This removes
3% overhead for locking/unlocking the socket.
Without this patch:
     3.38%     0.07%  tcp_mmap  [kernel.kallsyms]  [k] __cgroup_bpf_run_filter_getsockopt
            |
             --3.30%--__cgroup_bpf_run_filter_getsockopt
                       |
                        --0.81%--__kmalloc
With the patch applied:
     0.52%     0.12%  tcp_mmap  [kernel.kallsyms]  [k] __cgroup_bpf_run_filter_getsockopt_kern
Note, exporting uapi/tcp.h requires removing netinet/tcp.h
from test_progs.h because those headers have conflicting
definitions.
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20210115163501.805133-2-sdf@google.com
			
			
This commit is contained in:
		
							parent
							
								
									13ca51d5eb
								
							
						
					
					
						commit
						9cacf81f81
					
				
					 16 changed files with 506 additions and 7 deletions
				
			
		| 
						 | 
					@ -147,6 +147,10 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
 | 
				
			||||||
				       int __user *optlen, int max_optlen,
 | 
									       int __user *optlen, int max_optlen,
 | 
				
			||||||
				       int retval);
 | 
									       int retval);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
 | 
				
			||||||
 | 
										    int optname, void *optval,
 | 
				
			||||||
 | 
										    int *optlen, int retval);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
static inline enum bpf_cgroup_storage_type cgroup_storage_type(
 | 
					static inline enum bpf_cgroup_storage_type cgroup_storage_type(
 | 
				
			||||||
	struct bpf_map *map)
 | 
						struct bpf_map *map)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
| 
						 | 
					@ -364,10 +368,23 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
 | 
				
			||||||
({									       \
 | 
					({									       \
 | 
				
			||||||
	int __ret = retval;						       \
 | 
						int __ret = retval;						       \
 | 
				
			||||||
	if (cgroup_bpf_enabled)						       \
 | 
						if (cgroup_bpf_enabled)						       \
 | 
				
			||||||
		__ret = __cgroup_bpf_run_filter_getsockopt(sock, level,	       \
 | 
							if (!(sock)->sk_prot->bpf_bypass_getsockopt ||		       \
 | 
				
			||||||
							   optname, optval,    \
 | 
							    !INDIRECT_CALL_INET_1((sock)->sk_prot->bpf_bypass_getsockopt, \
 | 
				
			||||||
							   optlen, max_optlen, \
 | 
										tcp_bpf_bypass_getsockopt,	       \
 | 
				
			||||||
							   retval);	       \
 | 
										level, optname))		       \
 | 
				
			||||||
 | 
								__ret = __cgroup_bpf_run_filter_getsockopt(	       \
 | 
				
			||||||
 | 
									sock, level, optname, optval, optlen,	       \
 | 
				
			||||||
 | 
									max_optlen, retval);			       \
 | 
				
			||||||
 | 
						__ret;								       \
 | 
				
			||||||
 | 
					})
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sock, level, optname, optval,      \
 | 
				
			||||||
 | 
										    optlen, retval)		       \
 | 
				
			||||||
 | 
					({									       \
 | 
				
			||||||
 | 
						int __ret = retval;						       \
 | 
				
			||||||
 | 
						if (cgroup_bpf_enabled)						       \
 | 
				
			||||||
 | 
							__ret = __cgroup_bpf_run_filter_getsockopt_kern(	       \
 | 
				
			||||||
 | 
								sock, level, optname, optval, optlen, retval);	       \
 | 
				
			||||||
	__ret;								       \
 | 
						__ret;								       \
 | 
				
			||||||
})
 | 
					})
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -452,6 +469,8 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
 | 
				
			||||||
#define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; })
 | 
					#define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; })
 | 
				
			||||||
#define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \
 | 
					#define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \
 | 
				
			||||||
				       optlen, max_optlen, retval) ({ retval; })
 | 
									       optlen, max_optlen, retval) ({ retval; })
 | 
				
			||||||
 | 
					#define BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sock, level, optname, optval, \
 | 
				
			||||||
 | 
										    optlen, retval) ({ retval; })
 | 
				
			||||||
#define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \
 | 
					#define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \
 | 
				
			||||||
				       kernel_optval) ({ 0; })
 | 
									       kernel_optval) ({ 0; })
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -60,4 +60,10 @@
 | 
				
			||||||
#define INDIRECT_CALL_INET(f, f2, f1, ...) f(__VA_ARGS__)
 | 
					#define INDIRECT_CALL_INET(f, f2, f1, ...) f(__VA_ARGS__)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#if IS_ENABLED(CONFIG_INET)
 | 
				
			||||||
 | 
					#define INDIRECT_CALL_INET_1(f, f1, ...) INDIRECT_CALL_1(f, f1, __VA_ARGS__)
 | 
				
			||||||
 | 
					#else
 | 
				
			||||||
 | 
					#define INDIRECT_CALL_INET_1(f, f1, ...) f(__VA_ARGS__)
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1174,6 +1174,8 @@ struct proto {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	int			(*backlog_rcv) (struct sock *sk,
 | 
						int			(*backlog_rcv) (struct sock *sk,
 | 
				
			||||||
						struct sk_buff *skb);
 | 
											struct sk_buff *skb);
 | 
				
			||||||
 | 
						bool			(*bpf_bypass_getsockopt)(int level,
 | 
				
			||||||
 | 
												 int optname);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	void		(*release_cb)(struct sock *sk);
 | 
						void		(*release_cb)(struct sock *sk);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -403,6 +403,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock,
 | 
				
			||||||
		      struct poll_table_struct *wait);
 | 
							      struct poll_table_struct *wait);
 | 
				
			||||||
int tcp_getsockopt(struct sock *sk, int level, int optname,
 | 
					int tcp_getsockopt(struct sock *sk, int level, int optname,
 | 
				
			||||||
		   char __user *optval, int __user *optlen);
 | 
							   char __user *optval, int __user *optlen);
 | 
				
			||||||
 | 
					bool tcp_bpf_bypass_getsockopt(int level, int optname);
 | 
				
			||||||
int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
 | 
					int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
 | 
				
			||||||
		   unsigned int optlen);
 | 
							   unsigned int optlen);
 | 
				
			||||||
void tcp_set_keepalive(struct sock *sk, int val);
 | 
					void tcp_set_keepalive(struct sock *sk, int val);
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1486,6 +1486,52 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
 | 
				
			||||||
	sockopt_free_buf(&ctx);
 | 
						sockopt_free_buf(&ctx);
 | 
				
			||||||
	return ret;
 | 
						return ret;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
 | 
				
			||||||
 | 
										    int optname, void *optval,
 | 
				
			||||||
 | 
										    int *optlen, int retval)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
						struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
 | 
				
			||||||
 | 
						struct bpf_sockopt_kern ctx = {
 | 
				
			||||||
 | 
							.sk = sk,
 | 
				
			||||||
 | 
							.level = level,
 | 
				
			||||||
 | 
							.optname = optname,
 | 
				
			||||||
 | 
							.retval = retval,
 | 
				
			||||||
 | 
							.optlen = *optlen,
 | 
				
			||||||
 | 
							.optval = optval,
 | 
				
			||||||
 | 
							.optval_end = optval + *optlen,
 | 
				
			||||||
 | 
						};
 | 
				
			||||||
 | 
						int ret;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						/* Note that __cgroup_bpf_run_filter_getsockopt doesn't copy
 | 
				
			||||||
 | 
						 * user data back into BPF buffer when retval != 0. This is
 | 
				
			||||||
 | 
						 * done as an optimization to avoid extra copy, assuming
 | 
				
			||||||
 | 
						 * kernel won't populate the data in case of an error.
 | 
				
			||||||
 | 
						 * Here we always pass the data and memset() should
 | 
				
			||||||
 | 
						 * be called if that data shouldn't be "exported".
 | 
				
			||||||
 | 
						 */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT],
 | 
				
			||||||
 | 
									 &ctx, BPF_PROG_RUN);
 | 
				
			||||||
 | 
						if (!ret)
 | 
				
			||||||
 | 
							return -EPERM;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						if (ctx.optlen > *optlen)
 | 
				
			||||||
 | 
							return -EFAULT;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						/* BPF programs only allowed to set retval to 0, not some
 | 
				
			||||||
 | 
						 * arbitrary value.
 | 
				
			||||||
 | 
						 */
 | 
				
			||||||
 | 
						if (ctx.retval != 0 && ctx.retval != retval)
 | 
				
			||||||
 | 
							return -EFAULT;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						/* BPF programs can shrink the buffer, export the modifications.
 | 
				
			||||||
 | 
						 */
 | 
				
			||||||
 | 
						if (ctx.optlen != 0)
 | 
				
			||||||
 | 
							*optlen = ctx.optlen;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						return ctx.retval;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
 | 
					static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -4099,6 +4099,8 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 | 
				
			||||||
			return -EFAULT;
 | 
								return -EFAULT;
 | 
				
			||||||
		lock_sock(sk);
 | 
							lock_sock(sk);
 | 
				
			||||||
		err = tcp_zerocopy_receive(sk, &zc);
 | 
							err = tcp_zerocopy_receive(sk, &zc);
 | 
				
			||||||
 | 
							err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname,
 | 
				
			||||||
 | 
												  &zc, &len, err);
 | 
				
			||||||
		release_sock(sk);
 | 
							release_sock(sk);
 | 
				
			||||||
		if (len >= offsetofend(struct tcp_zerocopy_receive, err))
 | 
							if (len >= offsetofend(struct tcp_zerocopy_receive, err))
 | 
				
			||||||
			goto zerocopy_rcv_sk_err;
 | 
								goto zerocopy_rcv_sk_err;
 | 
				
			||||||
| 
						 | 
					@ -4133,6 +4135,18 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 | 
				
			||||||
	return 0;
 | 
						return 0;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					bool tcp_bpf_bypass_getsockopt(int level, int optname)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
						/* TCP do_tcp_getsockopt has optimized getsockopt implementation
 | 
				
			||||||
 | 
						 * to avoid extra socket lock for TCP_ZEROCOPY_RECEIVE.
 | 
				
			||||||
 | 
						 */
 | 
				
			||||||
 | 
						if (level == SOL_TCP && optname == TCP_ZEROCOPY_RECEIVE)
 | 
				
			||||||
 | 
							return true;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						return false;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					EXPORT_SYMBOL(tcp_bpf_bypass_getsockopt);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
 | 
					int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
 | 
				
			||||||
		   int __user *optlen)
 | 
							   int __user *optlen)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -2793,6 +2793,7 @@ struct proto tcp_prot = {
 | 
				
			||||||
	.shutdown		= tcp_shutdown,
 | 
						.shutdown		= tcp_shutdown,
 | 
				
			||||||
	.setsockopt		= tcp_setsockopt,
 | 
						.setsockopt		= tcp_setsockopt,
 | 
				
			||||||
	.getsockopt		= tcp_getsockopt,
 | 
						.getsockopt		= tcp_getsockopt,
 | 
				
			||||||
 | 
						.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
 | 
				
			||||||
	.keepalive		= tcp_set_keepalive,
 | 
						.keepalive		= tcp_set_keepalive,
 | 
				
			||||||
	.recvmsg		= tcp_recvmsg,
 | 
						.recvmsg		= tcp_recvmsg,
 | 
				
			||||||
	.sendmsg		= tcp_sendmsg,
 | 
						.sendmsg		= tcp_sendmsg,
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -2121,6 +2121,7 @@ struct proto tcpv6_prot = {
 | 
				
			||||||
	.shutdown		= tcp_shutdown,
 | 
						.shutdown		= tcp_shutdown,
 | 
				
			||||||
	.setsockopt		= tcp_setsockopt,
 | 
						.setsockopt		= tcp_setsockopt,
 | 
				
			||||||
	.getsockopt		= tcp_getsockopt,
 | 
						.getsockopt		= tcp_getsockopt,
 | 
				
			||||||
 | 
						.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
 | 
				
			||||||
	.keepalive		= tcp_set_keepalive,
 | 
						.keepalive		= tcp_set_keepalive,
 | 
				
			||||||
	.recvmsg		= tcp_recvmsg,
 | 
						.recvmsg		= tcp_recvmsg,
 | 
				
			||||||
	.sendmsg		= tcp_sendmsg,
 | 
						.sendmsg		= tcp_sendmsg,
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -2126,6 +2126,9 @@ SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname,
 | 
				
			||||||
	return __sys_setsockopt(fd, level, optname, optval, optlen);
 | 
						return __sys_setsockopt(fd, level, optname, optval, optlen);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					INDIRECT_CALLABLE_DECLARE(bool tcp_bpf_bypass_getsockopt(int level,
 | 
				
			||||||
 | 
												 int optname));
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/*
 | 
					/*
 | 
				
			||||||
 *	Get a socket option. Because we don't know the option lengths we have
 | 
					 *	Get a socket option. Because we don't know the option lengths we have
 | 
				
			||||||
 *	to pass a user mode parameter for the protocols to sort out.
 | 
					 *	to pass a user mode parameter for the protocols to sort out.
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
							
								
								
									
										357
									
								
								tools/include/uapi/linux/tcp.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										357
									
								
								tools/include/uapi/linux/tcp.h
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,357 @@
 | 
				
			||||||
 | 
					/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
 | 
				
			||||||
 | 
					/*
 | 
				
			||||||
 | 
					 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 | 
				
			||||||
 | 
					 *		operating system.  INET is implemented using the  BSD Socket
 | 
				
			||||||
 | 
					 *		interface as the means of communication with the user level.
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 *		Definitions for the TCP protocol.
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * Version:	@(#)tcp.h	1.0.2	04/28/93
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * Author:	Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 *		This program is free software; you can redistribute it and/or
 | 
				
			||||||
 | 
					 *		modify it under the terms of the GNU General Public License
 | 
				
			||||||
 | 
					 *		as published by the Free Software Foundation; either version
 | 
				
			||||||
 | 
					 *		2 of the License, or (at your option) any later version.
 | 
				
			||||||
 | 
					 */
 | 
				
			||||||
 | 
					#ifndef _UAPI_LINUX_TCP_H
 | 
				
			||||||
 | 
					#define _UAPI_LINUX_TCP_H
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#include <linux/types.h>
 | 
				
			||||||
 | 
					#include <asm/byteorder.h>
 | 
				
			||||||
 | 
					#include <linux/socket.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					struct tcphdr {
 | 
				
			||||||
 | 
						__be16	source;
 | 
				
			||||||
 | 
						__be16	dest;
 | 
				
			||||||
 | 
						__be32	seq;
 | 
				
			||||||
 | 
						__be32	ack_seq;
 | 
				
			||||||
 | 
					#if defined(__LITTLE_ENDIAN_BITFIELD)
 | 
				
			||||||
 | 
						__u16	res1:4,
 | 
				
			||||||
 | 
							doff:4,
 | 
				
			||||||
 | 
							fin:1,
 | 
				
			||||||
 | 
							syn:1,
 | 
				
			||||||
 | 
							rst:1,
 | 
				
			||||||
 | 
							psh:1,
 | 
				
			||||||
 | 
							ack:1,
 | 
				
			||||||
 | 
							urg:1,
 | 
				
			||||||
 | 
							ece:1,
 | 
				
			||||||
 | 
							cwr:1;
 | 
				
			||||||
 | 
					#elif defined(__BIG_ENDIAN_BITFIELD)
 | 
				
			||||||
 | 
						__u16	doff:4,
 | 
				
			||||||
 | 
							res1:4,
 | 
				
			||||||
 | 
							cwr:1,
 | 
				
			||||||
 | 
							ece:1,
 | 
				
			||||||
 | 
							urg:1,
 | 
				
			||||||
 | 
							ack:1,
 | 
				
			||||||
 | 
							psh:1,
 | 
				
			||||||
 | 
							rst:1,
 | 
				
			||||||
 | 
							syn:1,
 | 
				
			||||||
 | 
							fin:1;
 | 
				
			||||||
 | 
					#else
 | 
				
			||||||
 | 
					#error	"Adjust your <asm/byteorder.h> defines"
 | 
				
			||||||
 | 
					#endif	
 | 
				
			||||||
 | 
						__be16	window;
 | 
				
			||||||
 | 
						__sum16	check;
 | 
				
			||||||
 | 
						__be16	urg_ptr;
 | 
				
			||||||
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/*
 | 
				
			||||||
 | 
					 *	The union cast uses a gcc extension to avoid aliasing problems
 | 
				
			||||||
 | 
					 *  (union is compatible to any of its members)
 | 
				
			||||||
 | 
					 *  This means this part of the code is -fstrict-aliasing safe now.
 | 
				
			||||||
 | 
					 */
 | 
				
			||||||
 | 
					union tcp_word_hdr { 
 | 
				
			||||||
 | 
						struct tcphdr hdr;
 | 
				
			||||||
 | 
						__be32 		  words[5];
 | 
				
			||||||
 | 
					}; 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define tcp_flag_word(tp) ( ((union tcp_word_hdr *)(tp))->words [3]) 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					enum { 
 | 
				
			||||||
 | 
						TCP_FLAG_CWR = __constant_cpu_to_be32(0x00800000),
 | 
				
			||||||
 | 
						TCP_FLAG_ECE = __constant_cpu_to_be32(0x00400000),
 | 
				
			||||||
 | 
						TCP_FLAG_URG = __constant_cpu_to_be32(0x00200000),
 | 
				
			||||||
 | 
						TCP_FLAG_ACK = __constant_cpu_to_be32(0x00100000),
 | 
				
			||||||
 | 
						TCP_FLAG_PSH = __constant_cpu_to_be32(0x00080000),
 | 
				
			||||||
 | 
						TCP_FLAG_RST = __constant_cpu_to_be32(0x00040000),
 | 
				
			||||||
 | 
						TCP_FLAG_SYN = __constant_cpu_to_be32(0x00020000),
 | 
				
			||||||
 | 
						TCP_FLAG_FIN = __constant_cpu_to_be32(0x00010000),
 | 
				
			||||||
 | 
						TCP_RESERVED_BITS = __constant_cpu_to_be32(0x0F000000),
 | 
				
			||||||
 | 
						TCP_DATA_OFFSET = __constant_cpu_to_be32(0xF0000000)
 | 
				
			||||||
 | 
					}; 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/*
 | 
				
			||||||
 | 
					 * TCP general constants
 | 
				
			||||||
 | 
					 */
 | 
				
			||||||
 | 
					#define TCP_MSS_DEFAULT		 536U	/* IPv4 (RFC1122, RFC2581) */
 | 
				
			||||||
 | 
					#define TCP_MSS_DESIRED		1220U	/* IPv6 (tunneled), EDNS0 (RFC3226) */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/* TCP socket options */
 | 
				
			||||||
 | 
					#define TCP_NODELAY		1	/* Turn off Nagle's algorithm. */
 | 
				
			||||||
 | 
					#define TCP_MAXSEG		2	/* Limit MSS */
 | 
				
			||||||
 | 
					#define TCP_CORK		3	/* Never send partially complete segments */
 | 
				
			||||||
 | 
					#define TCP_KEEPIDLE		4	/* Start keeplives after this period */
 | 
				
			||||||
 | 
					#define TCP_KEEPINTVL		5	/* Interval between keepalives */
 | 
				
			||||||
 | 
					#define TCP_KEEPCNT		6	/* Number of keepalives before death */
 | 
				
			||||||
 | 
					#define TCP_SYNCNT		7	/* Number of SYN retransmits */
 | 
				
			||||||
 | 
					#define TCP_LINGER2		8	/* Life time of orphaned FIN-WAIT-2 state */
 | 
				
			||||||
 | 
					#define TCP_DEFER_ACCEPT	9	/* Wake up listener only when data arrive */
 | 
				
			||||||
 | 
					#define TCP_WINDOW_CLAMP	10	/* Bound advertised window */
 | 
				
			||||||
 | 
					#define TCP_INFO		11	/* Information about this connection. */
 | 
				
			||||||
 | 
					#define TCP_QUICKACK		12	/* Block/reenable quick acks */
 | 
				
			||||||
 | 
					#define TCP_CONGESTION		13	/* Congestion control algorithm */
 | 
				
			||||||
 | 
					#define TCP_MD5SIG		14	/* TCP MD5 Signature (RFC2385) */
 | 
				
			||||||
 | 
					#define TCP_THIN_LINEAR_TIMEOUTS 16      /* Use linear timeouts for thin streams*/
 | 
				
			||||||
 | 
					#define TCP_THIN_DUPACK         17      /* Fast retrans. after 1 dupack */
 | 
				
			||||||
 | 
					#define TCP_USER_TIMEOUT	18	/* How long for loss retry before timeout */
 | 
				
			||||||
 | 
					#define TCP_REPAIR		19	/* TCP sock is under repair right now */
 | 
				
			||||||
 | 
					#define TCP_REPAIR_QUEUE	20
 | 
				
			||||||
 | 
					#define TCP_QUEUE_SEQ		21
 | 
				
			||||||
 | 
					#define TCP_REPAIR_OPTIONS	22
 | 
				
			||||||
 | 
					#define TCP_FASTOPEN		23	/* Enable FastOpen on listeners */
 | 
				
			||||||
 | 
					#define TCP_TIMESTAMP		24
 | 
				
			||||||
 | 
					#define TCP_NOTSENT_LOWAT	25	/* limit number of unsent bytes in write queue */
 | 
				
			||||||
 | 
					#define TCP_CC_INFO		26	/* Get Congestion Control (optional) info */
 | 
				
			||||||
 | 
					#define TCP_SAVE_SYN		27	/* Record SYN headers for new connections */
 | 
				
			||||||
 | 
					#define TCP_SAVED_SYN		28	/* Get SYN headers recorded for connection */
 | 
				
			||||||
 | 
					#define TCP_REPAIR_WINDOW	29	/* Get/set window parameters */
 | 
				
			||||||
 | 
					#define TCP_FASTOPEN_CONNECT	30	/* Attempt FastOpen with connect */
 | 
				
			||||||
 | 
					#define TCP_ULP			31	/* Attach a ULP to a TCP connection */
 | 
				
			||||||
 | 
					#define TCP_MD5SIG_EXT		32	/* TCP MD5 Signature with extensions */
 | 
				
			||||||
 | 
					#define TCP_FASTOPEN_KEY	33	/* Set the key for Fast Open (cookie) */
 | 
				
			||||||
 | 
					#define TCP_FASTOPEN_NO_COOKIE	34	/* Enable TFO without a TFO cookie */
 | 
				
			||||||
 | 
					#define TCP_ZEROCOPY_RECEIVE	35
 | 
				
			||||||
 | 
					#define TCP_INQ			36	/* Notify bytes available to read as a cmsg on read */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define TCP_CM_INQ		TCP_INQ
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define TCP_TX_DELAY		37	/* delay outgoing packets by XX usec */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define TCP_REPAIR_ON		1
 | 
				
			||||||
 | 
					#define TCP_REPAIR_OFF		0
 | 
				
			||||||
 | 
					#define TCP_REPAIR_OFF_NO_WP	-1	/* Turn off without window probes */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					struct tcp_repair_opt {
 | 
				
			||||||
 | 
						__u32	opt_code;
 | 
				
			||||||
 | 
						__u32	opt_val;
 | 
				
			||||||
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					struct tcp_repair_window {
 | 
				
			||||||
 | 
						__u32	snd_wl1;
 | 
				
			||||||
 | 
						__u32	snd_wnd;
 | 
				
			||||||
 | 
						__u32	max_window;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						__u32	rcv_wnd;
 | 
				
			||||||
 | 
						__u32	rcv_wup;
 | 
				
			||||||
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					enum {
 | 
				
			||||||
 | 
						TCP_NO_QUEUE,
 | 
				
			||||||
 | 
						TCP_RECV_QUEUE,
 | 
				
			||||||
 | 
						TCP_SEND_QUEUE,
 | 
				
			||||||
 | 
						TCP_QUEUES_NR,
 | 
				
			||||||
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/* why fastopen failed from client perspective */
 | 
				
			||||||
 | 
					enum tcp_fastopen_client_fail {
 | 
				
			||||||
 | 
						TFO_STATUS_UNSPEC, /* catch-all */
 | 
				
			||||||
 | 
						TFO_COOKIE_UNAVAILABLE, /* if not in TFO_CLIENT_NO_COOKIE mode */
 | 
				
			||||||
 | 
						TFO_DATA_NOT_ACKED, /* SYN-ACK did not ack SYN data */
 | 
				
			||||||
 | 
						TFO_SYN_RETRANSMITTED, /* SYN-ACK did not ack SYN data after timeout */
 | 
				
			||||||
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/* for TCP_INFO socket option */
 | 
				
			||||||
 | 
					#define TCPI_OPT_TIMESTAMPS	1
 | 
				
			||||||
 | 
					#define TCPI_OPT_SACK		2
 | 
				
			||||||
 | 
					#define TCPI_OPT_WSCALE		4
 | 
				
			||||||
 | 
					#define TCPI_OPT_ECN		8 /* ECN was negociated at TCP session init */
 | 
				
			||||||
 | 
					#define TCPI_OPT_ECN_SEEN	16 /* we received at least one packet with ECT */
 | 
				
			||||||
 | 
					#define TCPI_OPT_SYN_DATA	32 /* SYN-ACK acked data in SYN sent or rcvd */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/*
 | 
				
			||||||
 | 
					 * Sender's congestion state indicating normal or abnormal situations
 | 
				
			||||||
 | 
					 * in the last round of packets sent. The state is driven by the ACK
 | 
				
			||||||
 | 
					 * information and timer events.
 | 
				
			||||||
 | 
					 */
 | 
				
			||||||
 | 
					enum tcp_ca_state {
 | 
				
			||||||
 | 
						/*
 | 
				
			||||||
 | 
						 * Nothing bad has been observed recently.
 | 
				
			||||||
 | 
						 * No apparent reordering, packet loss, or ECN marks.
 | 
				
			||||||
 | 
						 */
 | 
				
			||||||
 | 
						TCP_CA_Open = 0,
 | 
				
			||||||
 | 
					#define TCPF_CA_Open	(1<<TCP_CA_Open)
 | 
				
			||||||
 | 
						/*
 | 
				
			||||||
 | 
						 * The sender enters disordered state when it has received DUPACKs or
 | 
				
			||||||
 | 
						 * SACKs in the last round of packets sent. This could be due to packet
 | 
				
			||||||
 | 
						 * loss or reordering but needs further information to confirm packets
 | 
				
			||||||
 | 
						 * have been lost.
 | 
				
			||||||
 | 
						 */
 | 
				
			||||||
 | 
						TCP_CA_Disorder = 1,
 | 
				
			||||||
 | 
					#define TCPF_CA_Disorder (1<<TCP_CA_Disorder)
 | 
				
			||||||
 | 
						/*
 | 
				
			||||||
 | 
						 * The sender enters Congestion Window Reduction (CWR) state when it
 | 
				
			||||||
 | 
						 * has received ACKs with ECN-ECE marks, or has experienced congestion
 | 
				
			||||||
 | 
						 * or packet discard on the sender host (e.g. qdisc).
 | 
				
			||||||
 | 
						 */
 | 
				
			||||||
 | 
						TCP_CA_CWR = 2,
 | 
				
			||||||
 | 
					#define TCPF_CA_CWR	(1<<TCP_CA_CWR)
 | 
				
			||||||
 | 
						/*
 | 
				
			||||||
 | 
						 * The sender is in fast recovery and retransmitting lost packets,
 | 
				
			||||||
 | 
						 * typically triggered by ACK events.
 | 
				
			||||||
 | 
						 */
 | 
				
			||||||
 | 
						TCP_CA_Recovery = 3,
 | 
				
			||||||
 | 
					#define TCPF_CA_Recovery (1<<TCP_CA_Recovery)
 | 
				
			||||||
 | 
						/*
 | 
				
			||||||
 | 
						 * The sender is in loss recovery triggered by retransmission timeout.
 | 
				
			||||||
 | 
						 */
 | 
				
			||||||
 | 
						TCP_CA_Loss = 4
 | 
				
			||||||
 | 
					#define TCPF_CA_Loss	(1<<TCP_CA_Loss)
 | 
				
			||||||
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					struct tcp_info {
 | 
				
			||||||
 | 
						__u8	tcpi_state;
 | 
				
			||||||
 | 
						__u8	tcpi_ca_state;
 | 
				
			||||||
 | 
						__u8	tcpi_retransmits;
 | 
				
			||||||
 | 
						__u8	tcpi_probes;
 | 
				
			||||||
 | 
						__u8	tcpi_backoff;
 | 
				
			||||||
 | 
						__u8	tcpi_options;
 | 
				
			||||||
 | 
						__u8	tcpi_snd_wscale : 4, tcpi_rcv_wscale : 4;
 | 
				
			||||||
 | 
						__u8	tcpi_delivery_rate_app_limited:1, tcpi_fastopen_client_fail:2;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						__u32	tcpi_rto;
 | 
				
			||||||
 | 
						__u32	tcpi_ato;
 | 
				
			||||||
 | 
						__u32	tcpi_snd_mss;
 | 
				
			||||||
 | 
						__u32	tcpi_rcv_mss;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						__u32	tcpi_unacked;
 | 
				
			||||||
 | 
						__u32	tcpi_sacked;
 | 
				
			||||||
 | 
						__u32	tcpi_lost;
 | 
				
			||||||
 | 
						__u32	tcpi_retrans;
 | 
				
			||||||
 | 
						__u32	tcpi_fackets;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						/* Times. */
 | 
				
			||||||
 | 
						__u32	tcpi_last_data_sent;
 | 
				
			||||||
 | 
						__u32	tcpi_last_ack_sent;     /* Not remembered, sorry. */
 | 
				
			||||||
 | 
						__u32	tcpi_last_data_recv;
 | 
				
			||||||
 | 
						__u32	tcpi_last_ack_recv;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						/* Metrics. */
 | 
				
			||||||
 | 
						__u32	tcpi_pmtu;
 | 
				
			||||||
 | 
						__u32	tcpi_rcv_ssthresh;
 | 
				
			||||||
 | 
						__u32	tcpi_rtt;
 | 
				
			||||||
 | 
						__u32	tcpi_rttvar;
 | 
				
			||||||
 | 
						__u32	tcpi_snd_ssthresh;
 | 
				
			||||||
 | 
						__u32	tcpi_snd_cwnd;
 | 
				
			||||||
 | 
						__u32	tcpi_advmss;
 | 
				
			||||||
 | 
						__u32	tcpi_reordering;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						__u32	tcpi_rcv_rtt;
 | 
				
			||||||
 | 
						__u32	tcpi_rcv_space;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						__u32	tcpi_total_retrans;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						__u64	tcpi_pacing_rate;
 | 
				
			||||||
 | 
						__u64	tcpi_max_pacing_rate;
 | 
				
			||||||
 | 
						__u64	tcpi_bytes_acked;    /* RFC4898 tcpEStatsAppHCThruOctetsAcked */
 | 
				
			||||||
 | 
						__u64	tcpi_bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived */
 | 
				
			||||||
 | 
						__u32	tcpi_segs_out;	     /* RFC4898 tcpEStatsPerfSegsOut */
 | 
				
			||||||
 | 
						__u32	tcpi_segs_in;	     /* RFC4898 tcpEStatsPerfSegsIn */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						__u32	tcpi_notsent_bytes;
 | 
				
			||||||
 | 
						__u32	tcpi_min_rtt;
 | 
				
			||||||
 | 
						__u32	tcpi_data_segs_in;	/* RFC4898 tcpEStatsDataSegsIn */
 | 
				
			||||||
 | 
						__u32	tcpi_data_segs_out;	/* RFC4898 tcpEStatsDataSegsOut */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						__u64   tcpi_delivery_rate;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						__u64	tcpi_busy_time;      /* Time (usec) busy sending data */
 | 
				
			||||||
 | 
						__u64	tcpi_rwnd_limited;   /* Time (usec) limited by receive window */
 | 
				
			||||||
 | 
						__u64	tcpi_sndbuf_limited; /* Time (usec) limited by send buffer */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						__u32	tcpi_delivered;
 | 
				
			||||||
 | 
						__u32	tcpi_delivered_ce;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						__u64	tcpi_bytes_sent;     /* RFC4898 tcpEStatsPerfHCDataOctetsOut */
 | 
				
			||||||
 | 
						__u64	tcpi_bytes_retrans;  /* RFC4898 tcpEStatsPerfOctetsRetrans */
 | 
				
			||||||
 | 
						__u32	tcpi_dsack_dups;     /* RFC4898 tcpEStatsStackDSACKDups */
 | 
				
			||||||
 | 
						__u32	tcpi_reord_seen;     /* reordering events seen */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						__u32	tcpi_rcv_ooopack;    /* Out-of-order packets received */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						__u32	tcpi_snd_wnd;	     /* peer's advertised receive window after
 | 
				
			||||||
 | 
									      * scaling (bytes)
 | 
				
			||||||
 | 
									      */
 | 
				
			||||||
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */
 | 
				
			||||||
 | 
					enum {
 | 
				
			||||||
 | 
						TCP_NLA_PAD,
 | 
				
			||||||
 | 
						TCP_NLA_BUSY,		/* Time (usec) busy sending data */
 | 
				
			||||||
 | 
						TCP_NLA_RWND_LIMITED,	/* Time (usec) limited by receive window */
 | 
				
			||||||
 | 
						TCP_NLA_SNDBUF_LIMITED,	/* Time (usec) limited by send buffer */
 | 
				
			||||||
 | 
						TCP_NLA_DATA_SEGS_OUT,	/* Data pkts sent including retransmission */
 | 
				
			||||||
 | 
						TCP_NLA_TOTAL_RETRANS,	/* Data pkts retransmitted */
 | 
				
			||||||
 | 
						TCP_NLA_PACING_RATE,    /* Pacing rate in bytes per second */
 | 
				
			||||||
 | 
						TCP_NLA_DELIVERY_RATE,  /* Delivery rate in bytes per second */
 | 
				
			||||||
 | 
						TCP_NLA_SND_CWND,       /* Sending congestion window */
 | 
				
			||||||
 | 
						TCP_NLA_REORDERING,     /* Reordering metric */
 | 
				
			||||||
 | 
						TCP_NLA_MIN_RTT,        /* minimum RTT */
 | 
				
			||||||
 | 
						TCP_NLA_RECUR_RETRANS,  /* Recurring retransmits for the current pkt */
 | 
				
			||||||
 | 
						TCP_NLA_DELIVERY_RATE_APP_LMT, /* delivery rate application limited ? */
 | 
				
			||||||
 | 
						TCP_NLA_SNDQ_SIZE,	/* Data (bytes) pending in send queue */
 | 
				
			||||||
 | 
						TCP_NLA_CA_STATE,	/* ca_state of socket */
 | 
				
			||||||
 | 
						TCP_NLA_SND_SSTHRESH,	/* Slow start size threshold */
 | 
				
			||||||
 | 
						TCP_NLA_DELIVERED,	/* Data pkts delivered incl. out-of-order */
 | 
				
			||||||
 | 
						TCP_NLA_DELIVERED_CE,	/* Like above but only ones w/ CE marks */
 | 
				
			||||||
 | 
						TCP_NLA_BYTES_SENT,	/* Data bytes sent including retransmission */
 | 
				
			||||||
 | 
						TCP_NLA_BYTES_RETRANS,	/* Data bytes retransmitted */
 | 
				
			||||||
 | 
						TCP_NLA_DSACK_DUPS,	/* DSACK blocks received */
 | 
				
			||||||
 | 
						TCP_NLA_REORD_SEEN,	/* reordering events seen */
 | 
				
			||||||
 | 
						TCP_NLA_SRTT,		/* smoothed RTT in usecs */
 | 
				
			||||||
 | 
						TCP_NLA_TIMEOUT_REHASH, /* Timeout-triggered rehash attempts */
 | 
				
			||||||
 | 
						TCP_NLA_BYTES_NOTSENT,	/* Bytes in write queue not yet sent */
 | 
				
			||||||
 | 
						TCP_NLA_EDT,		/* Earliest departure time (CLOCK_MONOTONIC) */
 | 
				
			||||||
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/* for TCP_MD5SIG socket option */
 | 
				
			||||||
 | 
					#define TCP_MD5SIG_MAXKEYLEN	80
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/* tcp_md5sig extension flags for TCP_MD5SIG_EXT */
 | 
				
			||||||
 | 
					#define TCP_MD5SIG_FLAG_PREFIX		0x1	/* address prefix length */
 | 
				
			||||||
 | 
					#define TCP_MD5SIG_FLAG_IFINDEX		0x2	/* ifindex set */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					struct tcp_md5sig {
 | 
				
			||||||
 | 
						struct __kernel_sockaddr_storage tcpm_addr;	/* address associated */
 | 
				
			||||||
 | 
						__u8	tcpm_flags;				/* extension flags */
 | 
				
			||||||
 | 
						__u8	tcpm_prefixlen;				/* address prefix */
 | 
				
			||||||
 | 
						__u16	tcpm_keylen;				/* key length */
 | 
				
			||||||
 | 
						int	tcpm_ifindex;				/* device index for scope */
 | 
				
			||||||
 | 
						__u8	tcpm_key[TCP_MD5SIG_MAXKEYLEN];		/* key (binary) */
 | 
				
			||||||
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/* INET_DIAG_MD5SIG */
 | 
				
			||||||
 | 
					struct tcp_diag_md5sig {
 | 
				
			||||||
 | 
						__u8	tcpm_family;
 | 
				
			||||||
 | 
						__u8	tcpm_prefixlen;
 | 
				
			||||||
 | 
						__u16	tcpm_keylen;
 | 
				
			||||||
 | 
						__be32	tcpm_addr[4];
 | 
				
			||||||
 | 
						__u8	tcpm_key[TCP_MD5SIG_MAXKEYLEN];
 | 
				
			||||||
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/* getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...) */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT 0x1
 | 
				
			||||||
 | 
					struct tcp_zerocopy_receive {
 | 
				
			||||||
 | 
						__u64 address;		/* in: address of mapping */
 | 
				
			||||||
 | 
						__u32 length;		/* in/out: number of bytes to map/mapped */
 | 
				
			||||||
 | 
						__u32 recv_skip_hint;	/* out: amount of bytes to skip */
 | 
				
			||||||
 | 
						__u32 inq; /* out: amount of bytes in read queue */
 | 
				
			||||||
 | 
						__s32 err; /* out: socket error */
 | 
				
			||||||
 | 
						__u64 copybuf_address;	/* in: copybuf address (small reads) */
 | 
				
			||||||
 | 
						__s32 copybuf_len; /* in/out: copybuf bytes avail/used or error */
 | 
				
			||||||
 | 
						__u32 flags; /* in: flags */
 | 
				
			||||||
 | 
					};
 | 
				
			||||||
 | 
					#endif /* _UAPI_LINUX_TCP_H */
 | 
				
			||||||
| 
						 | 
					@ -2,6 +2,7 @@
 | 
				
			||||||
/* Copyright (c) 2019 Facebook */
 | 
					/* Copyright (c) 2019 Facebook */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#include <linux/err.h>
 | 
					#include <linux/err.h>
 | 
				
			||||||
 | 
					#include <netinet/tcp.h>
 | 
				
			||||||
#include <test_progs.h>
 | 
					#include <test_progs.h>
 | 
				
			||||||
#include "bpf_dctcp.skel.h"
 | 
					#include "bpf_dctcp.skel.h"
 | 
				
			||||||
#include "bpf_cubic.skel.h"
 | 
					#include "bpf_cubic.skel.h"
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -7,6 +7,7 @@
 | 
				
			||||||
#include <string.h>
 | 
					#include <string.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#include <linux/pkt_cls.h>
 | 
					#include <linux/pkt_cls.h>
 | 
				
			||||||
 | 
					#include <netinet/tcp.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#include <test_progs.h>
 | 
					#include <test_progs.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,6 +1,7 @@
 | 
				
			||||||
// SPDX-License-Identifier: GPL-2.0
 | 
					// SPDX-License-Identifier: GPL-2.0
 | 
				
			||||||
// Copyright (c) 2020 Cloudflare
 | 
					// Copyright (c) 2020 Cloudflare
 | 
				
			||||||
#include <error.h>
 | 
					#include <error.h>
 | 
				
			||||||
 | 
					#include <netinet/tcp.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#include "test_progs.h"
 | 
					#include "test_progs.h"
 | 
				
			||||||
#include "test_skmsg_load_helpers.skel.h"
 | 
					#include "test_skmsg_load_helpers.skel.h"
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -2,6 +2,12 @@
 | 
				
			||||||
#include <test_progs.h>
 | 
					#include <test_progs.h>
 | 
				
			||||||
#include "cgroup_helpers.h"
 | 
					#include "cgroup_helpers.h"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#include <linux/tcp.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#ifndef SOL_TCP
 | 
				
			||||||
 | 
					#define SOL_TCP IPPROTO_TCP
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define SOL_CUSTOM			0xdeadbeef
 | 
					#define SOL_CUSTOM			0xdeadbeef
 | 
				
			||||||
 | 
					
 | 
				
			||||||
static int getsetsockopt(void)
 | 
					static int getsetsockopt(void)
 | 
				
			||||||
| 
						 | 
					@ -11,6 +17,7 @@ static int getsetsockopt(void)
 | 
				
			||||||
		char u8[4];
 | 
							char u8[4];
 | 
				
			||||||
		__u32 u32;
 | 
							__u32 u32;
 | 
				
			||||||
		char cc[16]; /* TCP_CA_NAME_MAX */
 | 
							char cc[16]; /* TCP_CA_NAME_MAX */
 | 
				
			||||||
 | 
							struct tcp_zerocopy_receive zc;
 | 
				
			||||||
	} buf = {};
 | 
						} buf = {};
 | 
				
			||||||
	socklen_t optlen;
 | 
						socklen_t optlen;
 | 
				
			||||||
	char *big_buf = NULL;
 | 
						char *big_buf = NULL;
 | 
				
			||||||
| 
						 | 
					@ -154,6 +161,27 @@ static int getsetsockopt(void)
 | 
				
			||||||
		goto err;
 | 
							goto err;
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						/* Verify that the BPF getsockopt hook triggers for TCP_ZEROCOPY_RECEIVE */
 | 
				
			||||||
 | 
						memset(&buf, 0, sizeof(buf));
 | 
				
			||||||
 | 
						optlen = sizeof(buf.zc);
 | 
				
			||||||
 | 
						err = getsockopt(fd, SOL_TCP, TCP_ZEROCOPY_RECEIVE, &buf, &optlen);
 | 
				
			||||||
 | 
						if (err) {
 | 
				
			||||||
 | 
							log_err("Unexpected getsockopt(TCP_ZEROCOPY_RECEIVE) err=%d errno=%d",
 | 
				
			||||||
 | 
								err, errno);
 | 
				
			||||||
 | 
							goto err;
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						memset(&buf, 0, sizeof(buf));
 | 
				
			||||||
 | 
						buf.zc.address = 12345; /* rejected by BPF */
 | 
				
			||||||
 | 
						optlen = sizeof(buf.zc);
 | 
				
			||||||
 | 
						errno = 0;
 | 
				
			||||||
 | 
						err = getsockopt(fd, SOL_TCP, TCP_ZEROCOPY_RECEIVE, &buf, &optlen);
 | 
				
			||||||
 | 
						if (errno != EPERM) {
 | 
				
			||||||
 | 
							log_err("Unexpected getsockopt(TCP_ZEROCOPY_RECEIVE) err=%d errno=%d",
 | 
				
			||||||
 | 
								err, errno);
 | 
				
			||||||
 | 
							goto err;
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	free(big_buf);
 | 
						free(big_buf);
 | 
				
			||||||
	close(fd);
 | 
						close(fd);
 | 
				
			||||||
	return 0;
 | 
						return 0;
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,8 +1,8 @@
 | 
				
			||||||
// SPDX-License-Identifier: GPL-2.0
 | 
					// SPDX-License-Identifier: GPL-2.0
 | 
				
			||||||
#include <string.h>
 | 
					#include <string.h>
 | 
				
			||||||
#include <netinet/in.h>
 | 
					#include <linux/tcp.h>
 | 
				
			||||||
#include <netinet/tcp.h>
 | 
					 | 
				
			||||||
#include <linux/bpf.h>
 | 
					#include <linux/bpf.h>
 | 
				
			||||||
 | 
					#include <netinet/in.h>
 | 
				
			||||||
#include <bpf/bpf_helpers.h>
 | 
					#include <bpf/bpf_helpers.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
char _license[] SEC("license") = "GPL";
 | 
					char _license[] SEC("license") = "GPL";
 | 
				
			||||||
| 
						 | 
					@ -12,6 +12,10 @@ __u32 _version SEC("version") = 1;
 | 
				
			||||||
#define PAGE_SIZE 4096
 | 
					#define PAGE_SIZE 4096
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#ifndef SOL_TCP
 | 
				
			||||||
 | 
					#define SOL_TCP IPPROTO_TCP
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define SOL_CUSTOM			0xdeadbeef
 | 
					#define SOL_CUSTOM			0xdeadbeef
 | 
				
			||||||
 | 
					
 | 
				
			||||||
struct sockopt_sk {
 | 
					struct sockopt_sk {
 | 
				
			||||||
| 
						 | 
					@ -57,6 +61,21 @@ int _getsockopt(struct bpf_sockopt *ctx)
 | 
				
			||||||
		return 1;
 | 
							return 1;
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						if (ctx->level == SOL_TCP && ctx->optname == TCP_ZEROCOPY_RECEIVE) {
 | 
				
			||||||
 | 
							/* Verify that TCP_ZEROCOPY_RECEIVE triggers.
 | 
				
			||||||
 | 
							 * It has a custom implementation for performance
 | 
				
			||||||
 | 
							 * reasons.
 | 
				
			||||||
 | 
							 */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							if (optval + sizeof(struct tcp_zerocopy_receive) > optval_end)
 | 
				
			||||||
 | 
								return 0; /* EPERM, bounds check */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							if (((struct tcp_zerocopy_receive *)optval)->address != 0)
 | 
				
			||||||
 | 
								return 0; /* EPERM, unexpected data */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							return 1;
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	if (ctx->level == SOL_IP && ctx->optname == IP_FREEBIND) {
 | 
						if (ctx->level == SOL_IP && ctx->optname == IP_FREEBIND) {
 | 
				
			||||||
		if (optval + 1 > optval_end)
 | 
							if (optval + 1 > optval_end)
 | 
				
			||||||
			return 0; /* EPERM, bounds check */
 | 
								return 0; /* EPERM, bounds check */
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -16,7 +16,6 @@ typedef __u16 __sum16;
 | 
				
			||||||
#include <linux/if_packet.h>
 | 
					#include <linux/if_packet.h>
 | 
				
			||||||
#include <linux/ip.h>
 | 
					#include <linux/ip.h>
 | 
				
			||||||
#include <linux/ipv6.h>
 | 
					#include <linux/ipv6.h>
 | 
				
			||||||
#include <netinet/tcp.h>
 | 
					 | 
				
			||||||
#include <linux/filter.h>
 | 
					#include <linux/filter.h>
 | 
				
			||||||
#include <linux/perf_event.h>
 | 
					#include <linux/perf_event.h>
 | 
				
			||||||
#include <linux/socket.h>
 | 
					#include <linux/socket.h>
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in a new issue