mirror of
				https://github.com/torvalds/linux.git
				synced 2025-11-04 10:40:15 +02:00 
			
		
		
		
	tcp: fix sk_rcvbuf overshoot
Current autosizing in tcp_rcv_space_adjust() is too aggressive. Instead of betting on possible losses and overestimating the BDP, it is better to only account for slow start. The following patch then adds more precise tuning in the event of packet losses.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20250513193919.1089692-3-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
		
							parent
							
								
									c1269d3d12
								
							
						
					
					
						commit
						65c5287892
					
				
					 1 changed files with 25 additions and 34 deletions
				
			
		| 
						 | 
				
			
			@ -747,6 +747,29 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
 | 
			
		|||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static void tcp_rcvbuf_grow(struct sock *sk)
 | 
			
		||||
{
 | 
			
		||||
	const struct net *net = sock_net(sk);
 | 
			
		||||
	struct tcp_sock *tp = tcp_sk(sk);
 | 
			
		||||
	int rcvwin, rcvbuf, cap;
 | 
			
		||||
 | 
			
		||||
	if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) ||
 | 
			
		||||
	    (sk->sk_userlocks & SOCK_RCVBUF_LOCK))
 | 
			
		||||
		return;
 | 
			
		||||
 | 
			
		||||
	/* slow start: allow the sender to double its rate. */
 | 
			
		||||
	rcvwin = tp->rcvq_space.space << 1;
 | 
			
		||||
 | 
			
		||||
	cap = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
 | 
			
		||||
 | 
			
		||||
	rcvbuf = min_t(u32, tcp_space_from_win(sk, rcvwin), cap);
 | 
			
		||||
	if (rcvbuf > sk->sk_rcvbuf) {
 | 
			
		||||
		WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
 | 
			
		||||
		/* Make the window clamp follow along.  */
 | 
			
		||||
		WRITE_ONCE(tp->window_clamp,
 | 
			
		||||
			   tcp_win_from_space(sk, rcvbuf));
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
/*
 | 
			
		||||
 * This function should be called every time data is copied to user space.
 | 
			
		||||
 * It calculates the appropriate TCP receive buffer space.
 | 
			
		||||
| 
						 | 
				
			
			@ -771,42 +794,10 @@ void tcp_rcv_space_adjust(struct sock *sk)
 | 
			
		|||
 | 
			
		||||
	trace_tcp_rcvbuf_grow(sk, time);
 | 
			
		||||
 | 
			
		||||
	/* A bit of theory :
 | 
			
		||||
	 * copied = bytes received in previous RTT, our base window
 | 
			
		||||
	 * To cope with packet losses, we need a 2x factor
 | 
			
		||||
	 * To cope with slow start, and sender growing its cwin by 100 %
 | 
			
		||||
	 * every RTT, we need a 4x factor, because the ACK we are sending
 | 
			
		||||
	 * now is for the next RTT, not the current one :
 | 
			
		||||
	 * <prev RTT . ><current RTT .. ><next RTT .... >
 | 
			
		||||
	 */
 | 
			
		||||
 | 
			
		||||
	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
 | 
			
		||||
	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
 | 
			
		||||
		u64 rcvwin, grow;
 | 
			
		||||
		int rcvbuf;
 | 
			
		||||
 | 
			
		||||
		/* minimal window to cope with packet losses, assuming
 | 
			
		||||
		 * steady state. Add some cushion because of small variations.
 | 
			
		||||
		 */
 | 
			
		||||
		rcvwin = ((u64)copied << 1) + 16 * tp->advmss;
 | 
			
		||||
 | 
			
		||||
		/* Accommodate for sender rate increase (eg. slow start) */
 | 
			
		||||
		grow = rcvwin * (copied - tp->rcvq_space.space);
 | 
			
		||||
		do_div(grow, tp->rcvq_space.space);
 | 
			
		||||
		rcvwin += (grow << 1);
 | 
			
		||||
 | 
			
		||||
		rcvbuf = min_t(u64, tcp_space_from_win(sk, rcvwin),
 | 
			
		||||
			       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
 | 
			
		||||
		if (rcvbuf > sk->sk_rcvbuf) {
 | 
			
		||||
			WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
 | 
			
		||||
 | 
			
		||||
			/* Make the window clamp follow along.  */
 | 
			
		||||
			WRITE_ONCE(tp->window_clamp,
 | 
			
		||||
				   tcp_win_from_space(sk, rcvbuf));
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
	tp->rcvq_space.space = copied;
 | 
			
		||||
 | 
			
		||||
	tcp_rcvbuf_grow(sk);
 | 
			
		||||
 | 
			
		||||
new_measure:
 | 
			
		||||
	tp->rcvq_space.seq = tp->copied_seq;
 | 
			
		||||
	tp->rcvq_space.time = tp->tcp_mstamp;
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in a new issue