	net: poll/select low latency socket support
select/poll busy-poll support.

Split the sysctl value into two separate ones, one for read and one for poll.
Updated Documentation/sysctl/net.txt.

Add a new poll flag POLL_LL. When this flag is set, sock_poll will call
sk_poll_ll if possible. sock_poll sets this flag in its return value to
indicate to select/poll when a socket that can busy poll is found.

When poll/select have nothing to report, call the low-level sock_poll again
until we are out of time or we find something.

Once the system call finds something, it stops setting POLL_LL, so it can
return the result to the user ASAP.

Signed-off-by: Eliezer Tamir <eliezer.tamir@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent e4f2379db6
commit 2d48d67fa8

7 changed files with 91 additions and 22 deletions
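For orientation, here is a minimal userspace sketch of how an application would opt into the busy-poll path described above. It assumes a kernel built with CONFIG_NET_LL_RX_POLL and the SO_LL socket option added earlier in this series; the fallback option number below is an assumption taken from that series, not something defined by this commit.

/* Hedged usage sketch: per-socket busy polling with poll().
 * Assumes CONFIG_NET_LL_RX_POLL and the SO_LL socket option from this
 * patch series; the fallback value 46 is an assumption and may not
 * match your headers.
 */
#include <netinet/in.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef SO_LL
#define SO_LL 46
#endif

int main(void)
{
	int busy_usec = 50;	/* ~50us busy-read budget, per the docs below */
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	struct sockaddr_in addr;
	struct pollfd pfd;

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = htons(12345);
	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		perror("bind");

	/* Mark the socket busy-pollable: sock_poll() will then report
	 * POLL_LL, and do_poll()/do_select() will spin for up to
	 * net.core.low_latency_poll microseconds before sleeping.
	 */
	if (setsockopt(fd, SOL_SOCKET, SO_LL, &busy_usec, sizeof(busy_usec)) < 0)
		perror("setsockopt(SO_LL)");

	pfd.fd = fd;
	pfd.events = POLLIN;
	poll(&pfd, 1, 1000);	/* the kernel may busy poll inside this call */

	close(fd);
	return 0;
}

With this in place, the poll() call above may spin on the device queue instead of sleeping immediately, trading CPU (and power) for latency.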
				
			
Documentation/sysctl/net.txt

@@ -50,11 +50,25 @@ The maximum number of packets that kernel can handle on a NAPI interrupt,
 it's a Per-CPU variable.
 Default: 64
 
+low_latency_read
+----------------
+Low latency busy poll timeout for socket reads. (needs CONFIG_NET_LL_RX_POLL)
+Approximate time in us to spin waiting for packets on the device queue.
+This sets the default value of the SO_LL socket option.
+Can be set or overridden per socket by setting socket option SO_LL.
+Recommended value is 50. May increase power usage.
+Default: 0 (off)
+
 low_latency_poll
 ----------------
-Low latency busy poll timeout. (needs CONFIG_NET_LL_RX_POLL)
+Low latency busy poll timeout for poll and select. (needs CONFIG_NET_LL_RX_POLL)
 Approximate time in us to spin waiting for packets on the device queue.
-Recommended value is 50. May increase power usage.
+Recommended value depends on the number of sockets you poll on.
+For several sockets 50, for several hundreds 100.
+For more than that you probably want to use epoll.
+Note that only sockets with SO_LL set will be busy polled, so you want to either
+selectively set SO_LL on those sockets or set sysctl.net.low_latency_read globally.
+May increase power usage.
 Default: 0 (off)
 
 rmem_default
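As a companion to the documentation text above, a small sketch of setting the two sysctls from a program. The /proc/sys/net/core/... paths are an assumption inferred from the sysctl table changes later in this commit; adjust them if your kernel exposes the knobs elsewhere.

/* Hedged sketch: enable busy polling system-wide via the new sysctls.
 * The /proc/sys/net/core paths are assumed from the net_core_table
 * entries added in this commit.
 */
#include <stdio.h>

static int write_sysctl(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	fputs(val, f);
	fclose(f);
	return 0;
}

int main(void)
{
	/* default per-socket busy-read budget (what SO_LL would set) */
	write_sysctl("/proc/sys/net/core/low_latency_read", "50");
	/* budget for the poll()/select() busy loop itself */
	write_sysctl("/proc/sys/net/core/low_latency_poll", "100");
	return 0;
}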
							
								
								
									
fs/select.c (34 lines changed)

@@ -27,6 +27,7 @@
 #include <linux/rcupdate.h>
 #include <linux/hrtimer.h>
 #include <linux/sched/rt.h>
+#include <net/ll_poll.h>
 
 #include <asm/uaccess.h>
 
@@ -384,9 +385,10 @@ static int max_select_fd(unsigned long n, fd_set_bits *fds)
 #define POLLEX_SET (POLLPRI)
 
 static inline void wait_key_set(poll_table *wait, unsigned long in,
-				unsigned long out, unsigned long bit)
+				unsigned long out, unsigned long bit,
+				unsigned int ll_flag)
 {
-	wait->_key = POLLEX_SET;
+	wait->_key = POLLEX_SET | ll_flag;
 	if (in & bit)
 		wait->_key |= POLLIN_SET;
 	if (out & bit)
@@ -400,6 +402,8 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 	poll_table *wait;
 	int retval, i, timed_out = 0;
 	unsigned long slack = 0;
+	unsigned int ll_flag = POLL_LL;
+	u64 ll_time = ll_end_time();
 
 	rcu_read_lock();
 	retval = max_select_fd(n, fds);
@@ -422,6 +426,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 	retval = 0;
 	for (;;) {
 		unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
+		bool can_ll = false;
 
 		inp = fds->in; outp = fds->out; exp = fds->ex;
 		rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;
@@ -449,7 +454,8 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 					f_op = f.file->f_op;
 					mask = DEFAULT_POLLMASK;
 					if (f_op && f_op->poll) {
-						wait_key_set(wait, in, out, bit);
+						wait_key_set(wait, in, out,
+							     bit, ll_flag);
 						mask = (*f_op->poll)(f.file, wait);
 					}
 					fdput(f);
@@ -468,6 +474,11 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 						retval++;
 						wait->_qproc = NULL;
 					}
+					if (mask & POLL_LL)
+						can_ll = true;
+					/* got something, stop busy polling */
+					if (retval)
+						ll_flag = 0;
 				}
 			}
 			if (res_in)
@@ -486,6 +497,9 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 			break;
 		}
 
+		if (can_ll && can_poll_ll(ll_time))
+			continue;
+
 		/*
 		 * If this is the first loop and we have a timeout
 		 * given, then we convert to ktime_t and set the to
@@ -717,7 +731,8 @@ struct poll_list {
  * pwait poll_table will be used by the fd-provided poll handler for waiting,
  * if pwait->_qproc is non-NULL.
  */
-static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
+static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait,
+				     bool *can_ll, unsigned int ll_flag)
 {
 	unsigned int mask;
 	int fd;
@@ -731,7 +746,10 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
 			mask = DEFAULT_POLLMASK;
 			if (f.file->f_op && f.file->f_op->poll) {
 				pwait->_key = pollfd->events|POLLERR|POLLHUP;
+				pwait->_key |= ll_flag;
 				mask = f.file->f_op->poll(f.file, pwait);
+				if (mask & POLL_LL)
+					*can_ll = true;
 			}
 			/* Mask out unneeded events. */
 			mask &= pollfd->events | POLLERR | POLLHUP;
@@ -750,6 +768,8 @@ static int do_poll(unsigned int nfds,  struct poll_list *list,
 	ktime_t expire, *to = NULL;
 	int timed_out = 0, count = 0;
 	unsigned long slack = 0;
+	unsigned int ll_flag = POLL_LL;
+	u64 ll_time = ll_end_time();
 
 	/* Optimise the no-wait case */
 	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
@@ -762,6 +782,7 @@ static int do_poll(unsigned int nfds,  struct poll_list *list,
 
 	for (;;) {
 		struct poll_list *walk;
+		bool can_ll = false;
 
 		for (walk = list; walk != NULL; walk = walk->next) {
 			struct pollfd * pfd, * pfd_end;
@@ -776,9 +797,10 @@ static int do_poll(unsigned int nfds,  struct poll_list *list,
 				 * this. They'll get immediately deregistered
 				 * when we break out and return.
 				 */
-				if (do_pollfd(pfd, pt)) {
+				if (do_pollfd(pfd, pt, &can_ll, ll_flag)) {
 					count++;
 					pt->_qproc = NULL;
+					ll_flag = 0;
 				}
 			}
 		}
@@ -795,6 +817,8 @@ static int do_poll(unsigned int nfds,  struct poll_list *list,
 		if (count || timed_out)
 			break;
 
+		if (can_ll && can_poll_ll(ll_time))
+			continue;
 		/*
 		 * If this is the first loop and we have a timeout
 		 * given, then we convert to ktime_t and set the to

include/net/ll_poll.h

@@ -30,6 +30,7 @@
 #ifdef CONFIG_NET_LL_RX_POLL
 
 struct napi_struct;
+extern unsigned int sysctl_net_ll_read __read_mostly;
 extern unsigned int sysctl_net_ll_poll __read_mostly;
 
 /* return values from ndo_ll_poll */
@@ -38,17 +39,18 @@ extern unsigned int sysctl_net_ll_poll __read_mostly;
 
 /* we can use sched_clock() because we don't care much about precision
  * we only care that the average is bounded
+ * we don't mind a ~2.5% imprecision so <<10 instead of *1000
+ * sk->sk_ll_usec is a u_int so this can't overflow
  */
-static inline u64 ll_end_time(struct sock *sk)
+static inline u64 ll_sk_end_time(struct sock *sk)
 {
-	u64 end_time = ACCESS_ONCE(sk->sk_ll_usec);
+	return ((u64)ACCESS_ONCE(sk->sk_ll_usec) << 10) + sched_clock();
+}
 
-	/* we don't mind a ~2.5% imprecision
-	 * sk->sk_ll_usec is a u_int so this can't overflow
-	 */
-	end_time = (end_time << 10) + sched_clock();
-
-	return end_time;
+/* in poll/select we use the global sysctl_net_ll_poll value */
+static inline u64 ll_end_time(void)
+{
+	return ((u64)ACCESS_ONCE(sysctl_net_ll_poll) << 10) + sched_clock();
 }
 
 static inline bool sk_valid_ll(struct sock *sk)
@@ -62,10 +64,13 @@ static inline bool can_poll_ll(u64 end_time)
 	return !time_after64(sched_clock(), end_time);
 }
 
+/* when used in sock_poll() nonblock is known at compile time to be true
+ * so the loop and end_time will be optimized out
+ */
 static inline bool sk_poll_ll(struct sock *sk, int nonblock)
 {
+	u64 end_time = nonblock ? 0 : ll_sk_end_time(sk);
 	const struct net_device_ops *ops;
-	u64 end_time = ll_end_time(sk);
 	struct napi_struct *napi;
 	int rc = false;
 
@@ -84,7 +89,6 @@ static inline bool sk_poll_ll(struct sock *sk, int nonblock)
 		goto out;
 
 	do {
-
 		rc = ops->ndo_ll_poll(napi);
 
 		if (rc == LL_FLUSH_FAILED)
@@ -95,8 +99,8 @@ static inline bool sk_poll_ll(struct sock *sk, int nonblock)
 			NET_ADD_STATS_BH(sock_net(sk),
 					 LINUX_MIB_LOWLATENCYRXPACKETS, rc);
 
-	} while (skb_queue_empty(&sk->sk_receive_queue)
-			&& can_poll_ll(end_time) && !nonblock);
+	} while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) &&
+		 can_poll_ll(end_time));
 
 	rc = !skb_queue_empty(&sk->sk_receive_queue);
 out:
@@ -118,7 +122,12 @@ static inline void sk_mark_ll(struct sock *sk, struct sk_buff *skb)
 
 #else /* CONFIG_NET_LL_RX_POLL */
 
-static inline u64 ll_end_time(struct sock *sk)
+static inline u64 sk_ll_end_time(struct sock *sk)
+{
+	return 0;
+}
+
+static inline u64 ll_end_time(void)
 {
 	return 0;
 }

include/uapi/asm-generic/poll.h

@@ -30,6 +30,8 @@
 
 #define POLLFREE	0x4000	/* currently only for epoll */
 
+#define POLL_LL		0x8000
+
 struct pollfd {
 	int fd;
 	short events;

net/core/sock.c

@@ -2307,7 +2307,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 
 #ifdef CONFIG_NET_LL_RX_POLL
 	sk->sk_napi_id		=	0;
-	sk->sk_ll_usec		=	sysctl_net_ll_poll;
+	sk->sk_ll_usec		=	sysctl_net_ll_read;
 #endif
 
 	/*

net/core/sysctl_net_core.c

@@ -306,6 +306,14 @@ static struct ctl_table net_core_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "low_latency_read",
+		.data		= &sysctl_net_ll_read,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+#
 #endif
 #endif /* CONFIG_NET */
 	{
							
								
								
									
net/socket.c (14 lines changed)

@@ -107,6 +107,7 @@
 #include <net/ll_poll.h>
 
 #ifdef CONFIG_NET_LL_RX_POLL
+unsigned int sysctl_net_ll_read __read_mostly;
 unsigned int sysctl_net_ll_poll __read_mostly;
 #endif
 
@@ -1147,13 +1148,24 @@ EXPORT_SYMBOL(sock_create_lite);
 /* No kernel lock held - perfect */
 static unsigned int sock_poll(struct file *file, poll_table *wait)
 {
+	unsigned int ll_flag = 0;
 	struct socket *sock;
 
 	/*
 	 *      We can't return errors to poll, so it's either yes or no.
 	 */
 	sock = file->private_data;
-	return sock->ops->poll(file, sock, wait);
+
+	if (sk_valid_ll(sock->sk)) {
+		/* this socket can poll_ll so tell the system call */
+		ll_flag = POLL_LL;
+
+		/* once, only if requested by syscall */
+		if (wait && (wait->_key & POLL_LL))
+			sk_poll_ll(sock->sk, 1);
+	}
+
+	return ll_flag | sock->ops->poll(file, sock, wait);
 }
 
 static int sock_mmap(struct file *file, struct vm_area_struct *vma)