mirror of
				https://github.com/torvalds/linux.git
				synced 2025-11-04 10:40:15 +02:00 
			
		
		
		
	tcp: remove dst refcount false sharing for prequeue mode
Alexander Duyck reported high false sharing on dst refcount in tcp stack
when prequeue is used. prequeue is the mechanism used when a thread is
blocked in recvmsg()/read() on a TCP socket, using a blocking model
rather than select()/poll()/epoll() non blocking one.
We already try to use RCU in input path as much as possible, but we were
forced to take a refcount on the dst when skb escaped RCU protected
region. When/if the user thread runs on different cpu, dst_release()
will then touch dst refcount again.
Commit 093162553c (tcp: force a dst refcount when prequeue packet)
was an example of a race fix.
It turns out the only remaining usage of skb->dst for a packet stored
in a TCP socket prequeue is IP early demux.
We can add a logic to detect when IP early demux is probably going
to use skb->dst. Because we do an optimistic check rather than duplicate
existing logic, we need to guard inet_sk_rx_dst_set() and
inet6_sk_rx_dst_set() from using a NULL dst.
Many thanks to Alexander for providing a nice bug report, git bisection,
and reproducer.
Tested using Alexander script on a 40Gb NIC, 8 RX queues.
Hosts have 24 cores, 48 hyper threads.
echo 0 >/proc/sys/net/ipv4/tcp_autocorking
for i in `seq 0 47`
do
  for j in `seq 0 2`
  do
     netperf -H $DEST -t TCP_STREAM -l 1000 \
             -c -C -T $i,$i -P 0 -- \
             -m 64 -s 64K -D &
  done
done
Before patch : ~6Mpps and ~95% cpu usage on receiver
After patch : ~9Mpps and ~35% cpu usage on receiver.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
			
			
This commit is contained in:
		
							parent
							
								
									32bc6d1a35
								
							
						
					
					
						commit
						ca777eff51
					
				
					 2 changed files with 25 additions and 10 deletions
				
			
		| 
						 | 
				
			
			@ -1559,7 +1559,17 @@ bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
 | 
			
		|||
	    skb_queue_len(&tp->ucopy.prequeue) == 0)
 | 
			
		||||
		return false;
 | 
			
		||||
 | 
			
		||||
	skb_dst_force(skb);
 | 
			
		||||
	/* Before escaping RCU protected region, we need to take care of skb
 | 
			
		||||
	 * dst. Prequeue is only enabled for established sockets.
 | 
			
		||||
	 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
 | 
			
		||||
	 * Instead of doing full sk_rx_dst validity here, let's perform
 | 
			
		||||
	 * an optimistic check.
 | 
			
		||||
	 */
 | 
			
		||||
	if (likely(sk->sk_rx_dst))
 | 
			
		||||
		skb_dst_drop(skb);
 | 
			
		||||
	else
 | 
			
		||||
		skb_dst_force(skb);
 | 
			
		||||
 | 
			
		||||
	__skb_queue_tail(&tp->ucopy.prequeue, skb);
 | 
			
		||||
	tp->ucopy.memory += skb->truesize;
 | 
			
		||||
	if (tp->ucopy.memory > sk->sk_rcvbuf) {
 | 
			
		||||
| 
						 | 
				
			
			@ -1765,9 +1775,11 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
 | 
			
		|||
{
 | 
			
		||||
	struct dst_entry *dst = skb_dst(skb);
 | 
			
		||||
 | 
			
		||||
	dst_hold(dst);
 | 
			
		||||
	sk->sk_rx_dst = dst;
 | 
			
		||||
	inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
 | 
			
		||||
	if (dst) {
 | 
			
		||||
		dst_hold(dst);
 | 
			
		||||
		sk->sk_rx_dst = dst;
 | 
			
		||||
		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
EXPORT_SYMBOL(inet_sk_rx_dst_set);
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -93,13 +93,16 @@ static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(struct sock *sk,
 | 
			
		|||
static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
 | 
			
		||||
{
 | 
			
		||||
	struct dst_entry *dst = skb_dst(skb);
 | 
			
		||||
	const struct rt6_info *rt = (const struct rt6_info *)dst;
 | 
			
		||||
 | 
			
		||||
	dst_hold(dst);
 | 
			
		||||
	sk->sk_rx_dst = dst;
 | 
			
		||||
	inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
 | 
			
		||||
	if (rt->rt6i_node)
 | 
			
		||||
		inet6_sk(sk)->rx_dst_cookie = rt->rt6i_node->fn_sernum;
 | 
			
		||||
	if (dst) {
 | 
			
		||||
		const struct rt6_info *rt = (const struct rt6_info *)dst;
 | 
			
		||||
 | 
			
		||||
		dst_hold(dst);
 | 
			
		||||
		sk->sk_rx_dst = dst;
 | 
			
		||||
		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
 | 
			
		||||
		if (rt->rt6i_node)
 | 
			
		||||
			inet6_sk(sk)->rx_dst_cookie = rt->rt6i_node->fn_sernum;
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static void tcp_v6_hash(struct sock *sk)
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in a new issue