forked from mirrors/linux
		
	ipv4: PKTINFO doesn't need dst reference
Le lundi 07 novembre 2011 à 15:33 +0100, Eric Dumazet a écrit :
> At least, in recent kernels we don't change dst->refcnt in the forwarding
> path (using NOREF skb->dst)
>
> One particular point is the atomic_inc(dst->refcnt) we have to perform
> when queuing a UDP packet if the socket asked for PKTINFO (for example, a
> typical DNS server has to set up this option)
>
> I have one patch somewhere that stores the information in skb->cb[] and
> avoids the atomic_{inc|dec}(dst->refcnt).
>
OK, I found it. I did some extra tests and believe it's ready.
[PATCH net-next] ipv4: IP_PKTINFO doesn't need dst reference
When a socket uses IP_PKTINFO notifications, we currently force a dst
reference for each received skb. Reader has to access dst to get needed
information (rt_iif & rt_spec_dst) and must release dst reference.
We also forced a dst reference if skb was put in socket backlog, even
without IP_PKTINFO handling. This happens under stress/load.
We can instead store the needed information in skb->cb[], so that only
the softirq handler really accesses dst, improving cache hit ratios.
This removes two atomic operations per packet, and false sharing as
well.
On a benchmark using a single-threaded receiver (doing only recvmsg()
calls), I can reach 720.000 pps instead of 570.000 pps.
IP_PKTINFO is typically used by DNS servers, and any multihomed aware
UDP application.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
			
			
This commit is contained in:
		
							parent
							
								
									acb32ba3de
								
							
						
					
					
						commit
						d826eb14ec
					
				
					 6 changed files with 28 additions and 22 deletions
				
			
		| 
						 | 
					@ -450,7 +450,7 @@ extern int ip_options_rcv_srr(struct sk_buff *skb);
 | 
				
			||||||
 *	Functions provided by ip_sockglue.c
 | 
					 *	Functions provided by ip_sockglue.c
 | 
				
			||||||
 */
 | 
					 */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
extern int	ip_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
 | 
					extern void	ipv4_pktinfo_prepare(struct sk_buff *skb);
 | 
				
			||||||
extern void	ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb);
 | 
					extern void	ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb);
 | 
				
			||||||
extern int	ip_cmsg_send(struct net *net,
 | 
					extern int	ip_cmsg_send(struct net *net,
 | 
				
			||||||
			     struct msghdr *msg, struct ipcm_cookie *ipc);
 | 
								     struct msghdr *msg, struct ipcm_cookie *ipc);
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -55,20 +55,13 @@
 | 
				
			||||||
/*
 | 
					/*
 | 
				
			||||||
 *	SOL_IP control messages.
 | 
					 *	SOL_IP control messages.
 | 
				
			||||||
 */
 | 
					 */
 | 
				
			||||||
 | 
					#define PKTINFO_SKB_CB(__skb) ((struct in_pktinfo *)((__skb)->cb))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb)
 | 
					static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
	struct in_pktinfo info;
 | 
						struct in_pktinfo info = *PKTINFO_SKB_CB(skb);
 | 
				
			||||||
	struct rtable *rt = skb_rtable(skb);
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
	info.ipi_addr.s_addr = ip_hdr(skb)->daddr;
 | 
						info.ipi_addr.s_addr = ip_hdr(skb)->daddr;
 | 
				
			||||||
	if (rt) {
 | 
					 | 
				
			||||||
		info.ipi_ifindex = rt->rt_iif;
 | 
					 | 
				
			||||||
		info.ipi_spec_dst.s_addr = rt->rt_spec_dst;
 | 
					 | 
				
			||||||
	} else {
 | 
					 | 
				
			||||||
		info.ipi_ifindex = 0;
 | 
					 | 
				
			||||||
		info.ipi_spec_dst.s_addr = 0;
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
	put_cmsg(msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
 | 
						put_cmsg(msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
| 
						 | 
					@ -992,20 +985,28 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/**
 | 
					/**
 | 
				
			||||||
 * ip_queue_rcv_skb - Queue an skb into sock receive queue
 | 
					 * ipv4_pktinfo_prepare - transfert some info from rtable to skb
 | 
				
			||||||
 * @sk: socket
 | 
					 * @sk: socket
 | 
				
			||||||
 * @skb: buffer
 | 
					 * @skb: buffer
 | 
				
			||||||
 *
 | 
					 *
 | 
				
			||||||
 * Queues an skb into socket receive queue. If IP_CMSG_PKTINFO option
 | 
					 * To support IP_CMSG_PKTINFO option, we store rt_iif and rt_spec_dst
 | 
				
			||||||
 * is not set, we drop skb dst entry now, while dst cache line is hot.
 | 
					 * in skb->cb[] before dst drop.
 | 
				
			||||||
 | 
					 * This way, receiver doesnt make cache line misses to read rtable.
 | 
				
			||||||
 */
 | 
					 */
 | 
				
			||||||
int ip_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 | 
					void ipv4_pktinfo_prepare(struct sk_buff *skb)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
	if (!(inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO))
 | 
						struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb);
 | 
				
			||||||
 | 
						const struct rtable *rt = skb_rtable(skb);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						if (rt) {
 | 
				
			||||||
 | 
							pktinfo->ipi_ifindex = rt->rt_iif;
 | 
				
			||||||
 | 
							pktinfo->ipi_spec_dst.s_addr = rt->rt_spec_dst;
 | 
				
			||||||
 | 
						} else {
 | 
				
			||||||
 | 
							pktinfo->ipi_ifindex = 0;
 | 
				
			||||||
 | 
							pktinfo->ipi_spec_dst.s_addr = 0;
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
	skb_dst_drop(skb);
 | 
						skb_dst_drop(skb);
 | 
				
			||||||
	return sock_queue_rcv_skb(sk, skb);
 | 
					 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
EXPORT_SYMBOL(ip_queue_rcv_skb);
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
int ip_setsockopt(struct sock *sk, int level,
 | 
					int ip_setsockopt(struct sock *sk, int level,
 | 
				
			||||||
		int optname, char __user *optval, unsigned int optlen)
 | 
							int optname, char __user *optval, unsigned int optlen)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -292,7 +292,8 @@ static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
	/* Charge it to the socket. */
 | 
						/* Charge it to the socket. */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	if (ip_queue_rcv_skb(sk, skb) < 0) {
 | 
						ipv4_pktinfo_prepare(skb);
 | 
				
			||||||
 | 
						if (sock_queue_rcv_skb(sk, skb) < 0) {
 | 
				
			||||||
		kfree_skb(skb);
 | 
							kfree_skb(skb);
 | 
				
			||||||
		return NET_RX_DROP;
 | 
							return NET_RX_DROP;
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1357,7 +1357,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 | 
				
			||||||
	if (inet_sk(sk)->inet_daddr)
 | 
						if (inet_sk(sk)->inet_daddr)
 | 
				
			||||||
		sock_rps_save_rxhash(sk, skb);
 | 
							sock_rps_save_rxhash(sk, skb);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	rc = ip_queue_rcv_skb(sk, skb);
 | 
						rc = sock_queue_rcv_skb(sk, skb);
 | 
				
			||||||
	if (rc < 0) {
 | 
						if (rc < 0) {
 | 
				
			||||||
		int is_udplite = IS_UDPLITE(sk);
 | 
							int is_udplite = IS_UDPLITE(sk);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1473,6 +1473,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	rc = 0;
 | 
						rc = 0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						ipv4_pktinfo_prepare(skb);
 | 
				
			||||||
	bh_lock_sock(sk);
 | 
						bh_lock_sock(sk);
 | 
				
			||||||
	if (!sock_owned_by_user(sk))
 | 
						if (!sock_owned_by_user(sk))
 | 
				
			||||||
		rc = __udp_queue_rcv_skb(sk, skb);
 | 
							rc = __udp_queue_rcv_skb(sk, skb);
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -383,7 +383,8 @@ static inline int rawv6_rcv_skb(struct sock *sk, struct sk_buff *skb)
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	/* Charge it to the socket. */
 | 
						/* Charge it to the socket. */
 | 
				
			||||||
	if (ip_queue_rcv_skb(sk, skb) < 0) {
 | 
						skb_dst_drop(skb);
 | 
				
			||||||
 | 
						if (sock_queue_rcv_skb(sk, skb) < 0) {
 | 
				
			||||||
		kfree_skb(skb);
 | 
							kfree_skb(skb);
 | 
				
			||||||
		return NET_RX_DROP;
 | 
							return NET_RX_DROP;
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -538,7 +538,9 @@ int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
 | 
				
			||||||
			goto drop;
 | 
								goto drop;
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	if ((rc = ip_queue_rcv_skb(sk, skb)) < 0) {
 | 
						skb_dst_drop(skb);
 | 
				
			||||||
 | 
						rc = sock_queue_rcv_skb(sk, skb);
 | 
				
			||||||
 | 
						if (rc < 0) {
 | 
				
			||||||
		/* Note that an ENOMEM error is charged twice */
 | 
							/* Note that an ENOMEM error is charged twice */
 | 
				
			||||||
		if (rc == -ENOMEM)
 | 
							if (rc == -ENOMEM)
 | 
				
			||||||
			UDP6_INC_STATS_BH(sock_net(sk),
 | 
								UDP6_INC_STATS_BH(sock_net(sk),
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in a new issue