mirror of
				https://github.com/torvalds/linux.git
				synced 2025-11-04 02:30:34 +02:00 
			
		
		
		
	net: add low latency socket poll
Adds an ndo_ll_poll method and the code that supports it. This method can be used by low latency applications to busy-poll Ethernet device queues directly from the socket code. sysctl_net_ll_poll controls how many microseconds to poll. Default is zero (disabled). Individual protocol support will be added by subsequent patches. Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com> Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com> Signed-off-by: Eliezer Tamir <eliezer.tamir@linux.intel.com> Acked-by: Eric Dumazet <edumazet@google.com> Tested-by: Willem de Bruijn <willemb@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
		
							parent
							
								
									af12fa6e46
								
							
						
					
					
						commit
						0602129286
					
				
					 12 changed files with 208 additions and 2 deletions
				
			
		| 
						 | 
				
			
			@ -50,6 +50,13 @@ The maximum number of packets that kernel can handle on a NAPI interrupt,
 | 
			
		|||
it's a Per-CPU variable.
 | 
			
		||||
Default: 64
 | 
			
		||||
 | 
			
		||||
low_latency_poll
 | 
			
		||||
----------------
 | 
			
		||||
Low latency busy poll timeout. (needs CONFIG_NET_LL_RX_POLL)
 | 
			
		||||
Approximate time in microseconds to spin waiting for packets on the device queue.
 | 
			
		||||
Recommended value is 50. May increase power usage.
 | 
			
		||||
Default: 0 (off)
 | 
			
		||||
 | 
			
		||||
rmem_default
 | 
			
		||||
------------
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -971,6 +971,9 @@ struct net_device_ops {
 | 
			
		|||
						     struct netpoll_info *info,
 | 
			
		||||
						     gfp_t gfp);
 | 
			
		||||
	void			(*ndo_netpoll_cleanup)(struct net_device *dev);
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef CONFIG_NET_LL_RX_POLL
 | 
			
		||||
	int			(*ndo_ll_poll)(struct napi_struct *dev);
 | 
			
		||||
#endif
 | 
			
		||||
	int			(*ndo_set_vf_mac)(struct net_device *dev,
 | 
			
		||||
						  int queue, u8 *mac);
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -386,6 +386,7 @@ typedef unsigned char *sk_buff_data_t;
 | 
			
		|||
 *	@no_fcs:  Request NIC to treat last 4 bytes as Ethernet FCS
 | 
			
		||||
 *	@dma_cookie: a cookie to one of several possible DMA operations
 | 
			
		||||
 *		done by skb DMA functions
 | 
			
		||||
  *	@napi_id: id of the NAPI struct this skb came from
 | 
			
		||||
 *	@secmark: security marking
 | 
			
		||||
 *	@mark: Generic packet mark
 | 
			
		||||
 *	@dropcount: total number of sk_receive_queue overflows
 | 
			
		||||
| 
						 | 
				
			
			@ -500,8 +501,11 @@ struct sk_buff {
 | 
			
		|||
	/* 7/9 bit hole (depending on ndisc_nodetype presence) */
 | 
			
		||||
	kmemcheck_bitfield_end(flags2);
 | 
			
		||||
 | 
			
		||||
#ifdef CONFIG_NET_DMA
 | 
			
		||||
	dma_cookie_t		dma_cookie;
 | 
			
		||||
#if defined CONFIG_NET_DMA || defined CONFIG_NET_LL_RX_POLL
 | 
			
		||||
	union {
 | 
			
		||||
		unsigned int	napi_id;
 | 
			
		||||
		dma_cookie_t	dma_cookie;
 | 
			
		||||
	};
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef CONFIG_NETWORK_SECMARK
 | 
			
		||||
	__u32			secmark;
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
							
								
								
									
										148
									
								
								include/net/ll_poll.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										148
									
								
								include/net/ll_poll.h
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,148 @@
 | 
			
		|||
/*
 | 
			
		||||
 * Low Latency Sockets
 | 
			
		||||
 * Copyright(c) 2013 Intel Corporation.
 | 
			
		||||
 *
 | 
			
		||||
 * This program is free software; you can redistribute it and/or modify it
 | 
			
		||||
 * under the terms and conditions of the GNU General Public License,
 | 
			
		||||
 * version 2, as published by the Free Software Foundation.
 | 
			
		||||
 *
 | 
			
		||||
 * This program is distributed in the hope it will be useful, but WITHOUT
 | 
			
		||||
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 | 
			
		||||
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 | 
			
		||||
 * more details.
 | 
			
		||||
 *
 | 
			
		||||
 * You should have received a copy of the GNU General Public License along with
 | 
			
		||||
 * this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 *
 | 
			
		||||
 * Author: Eliezer Tamir
 | 
			
		||||
 *
 | 
			
		||||
 * Contact Information:
 | 
			
		||||
 * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
 | 
			
		||||
 */
 | 
			
		||||
 | 
			
		||||
/*
 | 
			
		||||
 * For now this depends on CONFIG_X86_TSC
 | 
			
		||||
 */
 | 
			
		||||
 | 
			
		||||
#ifndef _LINUX_NET_LL_POLL_H
 | 
			
		||||
#define _LINUX_NET_LL_POLL_H
 | 
			
		||||
 | 
			
		||||
#include <linux/netdevice.h>
 | 
			
		||||
#include <net/ip.h>
 | 
			
		||||
 | 
			
		||||
#ifdef CONFIG_NET_LL_RX_POLL
 | 
			
		||||
 | 
			
		||||
struct napi_struct;
 | 
			
		||||
extern unsigned long sysctl_net_ll_poll __read_mostly;
 | 
			
		||||
 | 
			
		||||
/* return values from ndo_ll_poll */
 | 
			
		||||
#define LL_FLUSH_FAILED		-1
 | 
			
		||||
#define LL_FLUSH_BUSY		-2
 | 
			
		||||
 | 
			
		||||
/* we don't mind a ~2.5% imprecision */
 | 
			
		||||
#define TSC_MHZ (tsc_khz >> 10)
 | 
			
		||||
 | 
			
		||||
static inline cycles_t ll_end_time(void)
 | 
			
		||||
{
 | 
			
		||||
	return TSC_MHZ * ACCESS_ONCE(sysctl_net_ll_poll) + get_cycles();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static inline bool sk_valid_ll(struct sock *sk)
 | 
			
		||||
{
 | 
			
		||||
	return sysctl_net_ll_poll && sk->sk_napi_id &&
 | 
			
		||||
	       !need_resched() && !signal_pending(current);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static inline bool can_poll_ll(cycles_t end_time)
 | 
			
		||||
{
 | 
			
		||||
	return !time_after((unsigned long)get_cycles(),
 | 
			
		||||
			    (unsigned long)end_time);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static inline bool sk_poll_ll(struct sock *sk, int nonblock)
 | 
			
		||||
{
 | 
			
		||||
	cycles_t end_time = ll_end_time();
 | 
			
		||||
	const struct net_device_ops *ops;
 | 
			
		||||
	struct napi_struct *napi;
 | 
			
		||||
	int rc = false;
 | 
			
		||||
 | 
			
		||||
	/*
 | 
			
		||||
	 * rcu read lock for napi hash
 | 
			
		||||
	 * bh so we don't race with net_rx_action
 | 
			
		||||
	 */
 | 
			
		||||
	rcu_read_lock_bh();
 | 
			
		||||
 | 
			
		||||
	napi = napi_by_id(sk->sk_napi_id);
 | 
			
		||||
	if (!napi)
 | 
			
		||||
		goto out;
 | 
			
		||||
 | 
			
		||||
	ops = napi->dev->netdev_ops;
 | 
			
		||||
	if (!ops->ndo_ll_poll)
 | 
			
		||||
		goto out;
 | 
			
		||||
 | 
			
		||||
	do {
 | 
			
		||||
 | 
			
		||||
		rc = ops->ndo_ll_poll(napi);
 | 
			
		||||
 | 
			
		||||
		if (rc == LL_FLUSH_FAILED)
 | 
			
		||||
			break; /* permanent failure */
 | 
			
		||||
 | 
			
		||||
		if (rc > 0)
 | 
			
		||||
			/* local bh are disabled so it is ok to use _BH */
 | 
			
		||||
			NET_ADD_STATS_BH(sock_net(sk),
 | 
			
		||||
					 LINUX_MIB_LOWLATENCYRXPACKETS, rc);
 | 
			
		||||
 | 
			
		||||
	} while (skb_queue_empty(&sk->sk_receive_queue)
 | 
			
		||||
			&& can_poll_ll(end_time) && !nonblock);
 | 
			
		||||
 | 
			
		||||
	rc = !skb_queue_empty(&sk->sk_receive_queue);
 | 
			
		||||
out:
 | 
			
		||||
	rcu_read_unlock_bh();
 | 
			
		||||
	return rc;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/* used in the NIC receive handler to mark the skb */
 | 
			
		||||
static inline void skb_mark_ll(struct sk_buff *skb, struct napi_struct *napi)
 | 
			
		||||
{
 | 
			
		||||
	skb->napi_id = napi->napi_id;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/* used in the protocol handler to propagate the napi_id to the socket */
 | 
			
		||||
static inline void sk_mark_ll(struct sock *sk, struct sk_buff *skb)
 | 
			
		||||
{
 | 
			
		||||
	sk->sk_napi_id = skb->napi_id;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#else /* CONFIG_NET_LL_RX_POLL */
 | 
			
		||||
 | 
			
		||||
static inline cycles_t ll_end_time(void)
 | 
			
		||||
{
 | 
			
		||||
	return 0;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static inline bool sk_valid_ll(struct sock *sk)
 | 
			
		||||
{
 | 
			
		||||
	return false;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static inline bool sk_poll_ll(struct sock *sk, int nonblock)
 | 
			
		||||
{
 | 
			
		||||
	return false;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static inline void skb_mark_ll(struct sk_buff *skb, struct napi_struct *napi)
 | 
			
		||||
{
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static inline void sk_mark_ll(struct sock *sk, struct sk_buff *skb)
 | 
			
		||||
{
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static inline bool can_poll_ll(cycles_t end_time)
 | 
			
		||||
{
 | 
			
		||||
	return false;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#endif /* CONFIG_NET_LL_RX_POLL */
 | 
			
		||||
#endif /* _LINUX_NET_LL_POLL_H */
 | 
			
		||||
| 
						 | 
				
			
			@ -229,6 +229,7 @@ struct cg_proto;
 | 
			
		|||
  *	@sk_omem_alloc: "o" is "option" or "other"
 | 
			
		||||
  *	@sk_wmem_queued: persistent queue size
 | 
			
		||||
  *	@sk_forward_alloc: space allocated forward
 | 
			
		||||
  *	@sk_napi_id: id of the last napi context to receive data for sk
 | 
			
		||||
  *	@sk_allocation: allocation mode
 | 
			
		||||
  *	@sk_sndbuf: size of send buffer in bytes
 | 
			
		||||
  *	@sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
 | 
			
		||||
| 
						 | 
				
			
			@ -324,6 +325,9 @@ struct sock {
 | 
			
		|||
	int			sk_forward_alloc;
 | 
			
		||||
#ifdef CONFIG_RPS
 | 
			
		||||
	__u32			sk_rxhash;
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef CONFIG_NET_LL_RX_POLL
 | 
			
		||||
	unsigned int		sk_napi_id;
 | 
			
		||||
#endif
 | 
			
		||||
	atomic_t		sk_drops;
 | 
			
		||||
	int			sk_rcvbuf;
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -253,6 +253,7 @@ enum
 | 
			
		|||
	LINUX_MIB_TCPFASTOPENLISTENOVERFLOW,	/* TCPFastOpenListenOverflow */
 | 
			
		||||
	LINUX_MIB_TCPFASTOPENCOOKIEREQD,	/* TCPFastOpenCookieReqd */
 | 
			
		||||
	LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES, /* TCPSpuriousRtxHostQueues */
 | 
			
		||||
	LINUX_MIB_LOWLATENCYRXPACKETS,		/* LowLatencyRxPackets */
 | 
			
		||||
	__LINUX_MIB_MAX
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
							
								
								
									
										12
									
								
								net/Kconfig
									
									
									
									
									
								
							
							
						
						
									
										12
									
								
								net/Kconfig
									
									
									
									
									
								
							| 
						 | 
				
			
			@ -243,6 +243,18 @@ config NETPRIO_CGROUP
 | 
			
		|||
	  Cgroup subsystem for use in assigning processes to network priorities on
 | 
			
		||||
	  a per-interface basis
 | 
			
		||||
 | 
			
		||||
config NET_LL_RX_POLL
 | 
			
		||||
	bool "Low Latency Receive Poll"
 | 
			
		||||
	depends on X86_TSC
 | 
			
		||||
	default n
 | 
			
		||||
	---help---
 | 
			
		||||
	  Support Low Latency Receive Queue Poll.
 | 
			
		||||
	  (For network card drivers which support this option.)
 | 
			
		||||
	  When waiting for data in read or poll, call directly into the device driver
 | 
			
		||||
	  to flush packets which may be pending on the device queues into the stack.
 | 
			
		||||
 | 
			
		||||
	  If unsure, say N.
 | 
			
		||||
 | 
			
		||||
config BQL
 | 
			
		||||
	boolean
 | 
			
		||||
	depends on SYSFS
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -733,6 +733,10 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 | 
			
		|||
	new->vlan_tci		= old->vlan_tci;
 | 
			
		||||
 | 
			
		||||
	skb_copy_secmark(new, old);
 | 
			
		||||
 | 
			
		||||
#ifdef CONFIG_NET_LL_RX_POLL
 | 
			
		||||
	new->napi_id	= old->napi_id;
 | 
			
		||||
#endif
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/*
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -139,6 +139,8 @@
 | 
			
		|||
#include <net/tcp.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#include <net/ll_poll.h>
 | 
			
		||||
 | 
			
		||||
static DEFINE_MUTEX(proto_list_mutex);
 | 
			
		||||
static LIST_HEAD(proto_list);
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -2284,6 +2286,10 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 | 
			
		|||
 | 
			
		||||
	sk->sk_stamp = ktime_set(-1L, 0);
 | 
			
		||||
 | 
			
		||||
#ifdef CONFIG_NET_LL_RX_POLL
 | 
			
		||||
	sk->sk_napi_id		=	0;
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
	/*
 | 
			
		||||
	 * Before updating sk_refcnt, we must commit prior changes to memory
 | 
			
		||||
	 * (Documentation/RCU/rculist_nulls.txt for details)
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -19,6 +19,7 @@
 | 
			
		|||
#include <net/ip.h>
 | 
			
		||||
#include <net/sock.h>
 | 
			
		||||
#include <net/net_ratelimit.h>
 | 
			
		||||
#include <net/ll_poll.h>
 | 
			
		||||
 | 
			
		||||
static int one = 1;
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -284,6 +285,15 @@ static struct ctl_table net_core_table[] = {
 | 
			
		|||
		.proc_handler	= flow_limit_table_len_sysctl
 | 
			
		||||
	},
 | 
			
		||||
#endif /* CONFIG_NET_FLOW_LIMIT */
 | 
			
		||||
#ifdef CONFIG_NET_LL_RX_POLL
 | 
			
		||||
	{
 | 
			
		||||
		.procname	= "low_latency_poll",
 | 
			
		||||
		.data		= &sysctl_net_ll_poll,
 | 
			
		||||
		.maxlen		= sizeof(unsigned long),
 | 
			
		||||
		.mode		= 0644,
 | 
			
		||||
		.proc_handler	= proc_doulongvec_minmax
 | 
			
		||||
	},
 | 
			
		||||
#endif
 | 
			
		||||
#endif /* CONFIG_NET */
 | 
			
		||||
	{
 | 
			
		||||
		.procname	= "netdev_budget",
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -273,6 +273,7 @@ static const struct snmp_mib snmp4_net_list[] = {
 | 
			
		|||
	SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW),
 | 
			
		||||
	SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD),
 | 
			
		||||
	SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES),
 | 
			
		||||
	SNMP_MIB_ITEM("LowLatencyRxPackets", LINUX_MIB_LOWLATENCYRXPACKETS),
 | 
			
		||||
	SNMP_MIB_SENTINEL
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -104,6 +104,12 @@
 | 
			
		|||
#include <linux/route.h>
 | 
			
		||||
#include <linux/sockios.h>
 | 
			
		||||
#include <linux/atalk.h>
 | 
			
		||||
#include <net/ll_poll.h>
 | 
			
		||||
 | 
			
		||||
#ifdef CONFIG_NET_LL_RX_POLL
 | 
			
		||||
unsigned long sysctl_net_ll_poll __read_mostly;
 | 
			
		||||
EXPORT_SYMBOL_GPL(sysctl_net_ll_poll);
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
static int sock_no_open(struct inode *irrelevant, struct file *dontcare);
 | 
			
		||||
static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in a new issue