mirror of
				https://github.com/torvalds/linux.git
				synced 2025-11-04 10:40:15 +02:00 
			
		
		
		
	net: No more expensive sock_hold()/sock_put() on each tx
One of the problems with sock memory accounting is that it uses a pair of sock_hold()/sock_put() for each transmitted packet. This slows down bidirectional flows because the receive path also needs to take a refcount on the socket and might use a different cpu than the transmit path or transmit completion path. So these two atomic operations also trigger cache line bounces. We can see this in tx or tx/rx workloads (media gateways for example), where sock_wfree() can be in the top five functions in profiles. We use this sock_hold()/sock_put() so that sock freeing is delayed until all tx packets are completed. As we also update sk_wmem_alloc, we could offset sk_wmem_alloc by one unit at init time, until sk_free() is called. Once sk_free() is called, we atomic_dec_and_test(sk_wmem_alloc) to decrement the initial offset and atomically check if any packets are in flight. skb_set_owner_w() doesn't call sock_hold() anymore. sock_wfree() doesn't call sock_put() anymore, but checks if sk_wmem_alloc reached 0 to perform the final freeing. The drawback is that an skb->truesize error could lead to unfreeable sockets, or even worse, prematurely calling __sk_free() on a live socket. Nice speedups on SMP. tbench for example, going from 2691 MB/s to 2711 MB/s on my 8 cpu dev machine, even if tbench was not really hitting the sk_refcnt contention point. 5 % speedup on a UDP transmit workload (depends on number of flows), lowering TX completion cpu usage. Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
		
							parent
							
								
									f2333a014c
								
							
						
					
					
						commit
						2b85a34e91
					
				
					 4 changed files with 30 additions and 7 deletions
				
			
		| 
						 | 
				
			
			@ -1217,9 +1217,13 @@ static inline int skb_copy_to_page(struct sock *sk, char __user *from,
 | 
			
		|||
 | 
			
		||||
static inline void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
 | 
			
		||||
{
 | 
			
		||||
	sock_hold(sk);
 | 
			
		||||
	skb->sk = sk;
 | 
			
		||||
	skb->destructor = sock_wfree;
 | 
			
		||||
	/*
 | 
			
		||||
	 * We used to take a refcount on sk, but following operation
 | 
			
		||||
	 * is enough to guarantee sk_free() won't free this sock until
 | 
			
		||||
	 * all in-flight packets are completed
 | 
			
		||||
	 */
 | 
			
		||||
	atomic_add(skb->truesize, &sk->sk_wmem_alloc);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1008,7 +1008,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
 | 
			
		|||
}
 | 
			
		||||
EXPORT_SYMBOL(sk_alloc);
 | 
			
		||||
 | 
			
		||||
void sk_free(struct sock *sk)
 | 
			
		||||
static void __sk_free(struct sock *sk)
 | 
			
		||||
{
 | 
			
		||||
	struct sk_filter *filter;
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -1031,6 +1031,17 @@ void sk_free(struct sock *sk)
 | 
			
		|||
	put_net(sock_net(sk));
 | 
			
		||||
	sk_prot_free(sk->sk_prot_creator, sk);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void sk_free(struct sock *sk)
 | 
			
		||||
{
 | 
			
		||||
	/*
 | 
			
		||||
	 * We subtract one from sk_wmem_alloc and can know if
 | 
			
		||||
	 * some packets are still in some tx queue.
 | 
			
		||||
	 * If not null, sock_wfree() will call __sk_free(sk) later
 | 
			
		||||
	 */
 | 
			
		||||
	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
 | 
			
		||||
		__sk_free(sk);
 | 
			
		||||
}
 | 
			
		||||
EXPORT_SYMBOL(sk_free);
 | 
			
		||||
 | 
			
		||||
/*
 | 
			
		||||
| 
						 | 
				
			
			@ -1071,7 +1082,10 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
 | 
			
		|||
		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
 | 
			
		||||
 | 
			
		||||
		atomic_set(&newsk->sk_rmem_alloc, 0);
 | 
			
		||||
		atomic_set(&newsk->sk_wmem_alloc, 0);
 | 
			
		||||
		/*
 | 
			
		||||
		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
 | 
			
		||||
		 */
 | 
			
		||||
		atomic_set(&newsk->sk_wmem_alloc, 1);
 | 
			
		||||
		atomic_set(&newsk->sk_omem_alloc, 0);
 | 
			
		||||
		skb_queue_head_init(&newsk->sk_receive_queue);
 | 
			
		||||
		skb_queue_head_init(&newsk->sk_write_queue);
 | 
			
		||||
| 
						 | 
				
			
			@ -1175,12 +1189,18 @@ void __init sk_init(void)
 | 
			
		|||
void sock_wfree(struct sk_buff *skb)
 | 
			
		||||
{
 | 
			
		||||
	struct sock *sk = skb->sk;
 | 
			
		||||
	int res;
 | 
			
		||||
 | 
			
		||||
	/* In case it might be waiting for more memory. */
 | 
			
		||||
	atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
 | 
			
		||||
	res = atomic_sub_return(skb->truesize, &sk->sk_wmem_alloc);
 | 
			
		||||
	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
 | 
			
		||||
		sk->sk_write_space(sk);
 | 
			
		||||
	sock_put(sk);
 | 
			
		||||
	/*
 | 
			
		||||
	 * if sk_wmem_alloc reached 0, we are last user and should
 | 
			
		||||
	 * free this sock, as sk_free() call could not do it.
 | 
			
		||||
	 */
 | 
			
		||||
	if (res == 0)
 | 
			
		||||
		__sk_free(sk);
 | 
			
		||||
}
 | 
			
		||||
EXPORT_SYMBOL(sock_wfree);
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -1819,6 +1839,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 | 
			
		|||
	sk->sk_stamp = ktime_set(-1L, 0);
 | 
			
		||||
 | 
			
		||||
	atomic_set(&sk->sk_refcnt, 1);
 | 
			
		||||
	atomic_set(&sk->sk_wmem_alloc, 1);
 | 
			
		||||
	atomic_set(&sk->sk_drops, 0);
 | 
			
		||||
}
 | 
			
		||||
EXPORT_SYMBOL(sock_init_data);
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -498,7 +498,6 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 | 
			
		|||
 | 
			
		||||
			BUG_ON(frag->sk);
 | 
			
		||||
			if (skb->sk) {
 | 
			
		||||
				sock_hold(skb->sk);
 | 
			
		||||
				frag->sk = skb->sk;
 | 
			
		||||
				frag->destructor = sock_wfree;
 | 
			
		||||
				truesizes += frag->truesize;
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -680,7 +680,6 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 | 
			
		|||
 | 
			
		||||
			BUG_ON(frag->sk);
 | 
			
		||||
			if (skb->sk) {
 | 
			
		||||
				sock_hold(skb->sk);
 | 
			
		||||
				frag->sk = skb->sk;
 | 
			
		||||
				frag->destructor = sock_wfree;
 | 
			
		||||
				truesizes += frag->truesize;
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in a new issue