forked from mirrors/linux
		
	mptcp: fix rcv buffer auto-tuning
The MPTCP code relies on the assumption that the tcp_win_from_space() helper does not use any TCP-specific field, and thus works correctly when operating on an MPTCP socket. The commit dfa2f04833 ("tcp: get rid of sysctl_tcp_adv_win_scale") broke that assumption, and as a consequence most MPTCP connections stall on a zero-window event due to auto-tuning changing the rcv buffer size quite randomly. Address the issue by syncing the MPTCP auto-tuning code with the TCP one again. To achieve that, factor out the window size logic into socket-independent helpers, and reuse them in mptcp_rcv_space_adjust(). The MPTCP-level scaling_ratio is selected as the minimum one across all the subflows, as a worst-case estimate. Fixes: dfa2f04833 ("tcp: get rid of sysctl_tcp_adv_win_scale") Signed-off-by: Paolo Abeni <pabeni@redhat.com> Co-developed-by: Matthieu Baerts <matthieu.baerts@tessares.net> Signed-off-by: Matthieu Baerts <matthieu.baerts@tessares.net> Reviewed-by: Eric Dumazet <edumazet@google.com> Acked-by: Soheil Hassas Yeganeh <soheil@google.com> Link: https://lore.kernel.org/r/20230720-upstream-net-next-20230720-mptcp-fix-rcv-buffer-auto-tuning-v1-1-175ef12b8380@tessares.net Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
		
							parent
							
								
									004a04b97b
								
							
						
					
					
						commit
						b8dc6d6ce9
					
				
					 4 changed files with 30 additions and 15 deletions
				
			
		|  | @ -1430,22 +1430,32 @@ void tcp_select_initial_window(const struct sock *sk, int __space, | ||||||
| 			       __u32 *window_clamp, int wscale_ok, | 			       __u32 *window_clamp, int wscale_ok, | ||||||
| 			       __u8 *rcv_wscale, __u32 init_rcv_wnd); | 			       __u8 *rcv_wscale, __u32 init_rcv_wnd); | ||||||
| 
 | 
 | ||||||
| static inline int tcp_win_from_space(const struct sock *sk, int space) | static inline int __tcp_win_from_space(u8 scaling_ratio, int space) | ||||||
| { | { | ||||||
| 	s64 scaled_space = (s64)space * tcp_sk(sk)->scaling_ratio; | 	s64 scaled_space = (s64)space * scaling_ratio; | ||||||
| 
 | 
 | ||||||
| 	return scaled_space >> TCP_RMEM_TO_WIN_SCALE; | 	return scaled_space >> TCP_RMEM_TO_WIN_SCALE; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| /* inverse of tcp_win_from_space() */ | static inline int tcp_win_from_space(const struct sock *sk, int space) | ||||||
| static inline int tcp_space_from_win(const struct sock *sk, int win) | { | ||||||
|  | 	return __tcp_win_from_space(tcp_sk(sk)->scaling_ratio, space); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /* inverse of __tcp_win_from_space() */ | ||||||
|  | static inline int __tcp_space_from_win(u8 scaling_ratio, int win) | ||||||
| { | { | ||||||
| 	u64 val = (u64)win << TCP_RMEM_TO_WIN_SCALE; | 	u64 val = (u64)win << TCP_RMEM_TO_WIN_SCALE; | ||||||
| 
 | 
 | ||||||
| 	do_div(val, tcp_sk(sk)->scaling_ratio); | 	do_div(val, scaling_ratio); | ||||||
| 	return val; | 	return val; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | static inline int tcp_space_from_win(const struct sock *sk, int win) | ||||||
|  | { | ||||||
|  | 	return __tcp_space_from_win(tcp_sk(sk)->scaling_ratio, win); | ||||||
|  | } | ||||||
|  | 
 | ||||||
| static inline void tcp_scaling_ratio_init(struct sock *sk) | static inline void tcp_scaling_ratio_init(struct sock *sk) | ||||||
| { | { | ||||||
| 	/* Assume a conservative default of 1200 bytes of payload per 4K page.
 | 	/* Assume a conservative default of 1200 bytes of payload per 4K page.
 | ||||||
|  |  | ||||||
|  | @ -90,6 +90,7 @@ static int __mptcp_socket_create(struct mptcp_sock *msk) | ||||||
| 	if (err) | 	if (err) | ||||||
| 		return err; | 		return err; | ||||||
| 
 | 
 | ||||||
|  | 	msk->scaling_ratio = tcp_sk(ssock->sk)->scaling_ratio; | ||||||
| 	WRITE_ONCE(msk->first, ssock->sk); | 	WRITE_ONCE(msk->first, ssock->sk); | ||||||
| 	WRITE_ONCE(msk->subflow, ssock); | 	WRITE_ONCE(msk->subflow, ssock); | ||||||
| 	subflow = mptcp_subflow_ctx(ssock->sk); | 	subflow = mptcp_subflow_ctx(ssock->sk); | ||||||
|  | @ -1881,6 +1882,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) | ||||||
| { | { | ||||||
| 	struct mptcp_subflow_context *subflow; | 	struct mptcp_subflow_context *subflow; | ||||||
| 	struct sock *sk = (struct sock *)msk; | 	struct sock *sk = (struct sock *)msk; | ||||||
|  | 	u8 scaling_ratio = U8_MAX; | ||||||
| 	u32 time, advmss = 1; | 	u32 time, advmss = 1; | ||||||
| 	u64 rtt_us, mstamp; | 	u64 rtt_us, mstamp; | ||||||
| 
 | 
 | ||||||
|  | @ -1911,9 +1913,11 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) | ||||||
| 
 | 
 | ||||||
| 		rtt_us = max(sf_rtt_us, rtt_us); | 		rtt_us = max(sf_rtt_us, rtt_us); | ||||||
| 		advmss = max(sf_advmss, advmss); | 		advmss = max(sf_advmss, advmss); | ||||||
|  | 		scaling_ratio = min(tp->scaling_ratio, scaling_ratio); | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	msk->rcvq_space.rtt_us = rtt_us; | 	msk->rcvq_space.rtt_us = rtt_us; | ||||||
|  | 	msk->scaling_ratio = scaling_ratio; | ||||||
| 	if (time < (rtt_us >> 3) || rtt_us == 0) | 	if (time < (rtt_us >> 3) || rtt_us == 0) | ||||||
| 		return; | 		return; | ||||||
| 
 | 
 | ||||||
|  | @ -1922,8 +1926,8 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) | ||||||
| 
 | 
 | ||||||
| 	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) && | 	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) && | ||||||
| 	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { | 	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { | ||||||
| 		int rcvmem, rcvbuf; |  | ||||||
| 		u64 rcvwin, grow; | 		u64 rcvwin, grow; | ||||||
|  | 		int rcvbuf; | ||||||
| 
 | 
 | ||||||
| 		rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss; | 		rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss; | ||||||
| 
 | 
 | ||||||
|  | @ -1932,18 +1936,13 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) | ||||||
| 		do_div(grow, msk->rcvq_space.space); | 		do_div(grow, msk->rcvq_space.space); | ||||||
| 		rcvwin += (grow << 1); | 		rcvwin += (grow << 1); | ||||||
| 
 | 
 | ||||||
| 		rcvmem = SKB_TRUESIZE(advmss + MAX_TCP_HEADER); | 		rcvbuf = min_t(u64, __tcp_space_from_win(scaling_ratio, rcvwin), | ||||||
| 		while (tcp_win_from_space(sk, rcvmem) < advmss) |  | ||||||
| 			rcvmem += 128; |  | ||||||
| 
 |  | ||||||
| 		do_div(rcvwin, advmss); |  | ||||||
| 		rcvbuf = min_t(u64, rcvwin * rcvmem, |  | ||||||
| 			       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); | 			       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); | ||||||
| 
 | 
 | ||||||
| 		if (rcvbuf > sk->sk_rcvbuf) { | 		if (rcvbuf > sk->sk_rcvbuf) { | ||||||
| 			u32 window_clamp; | 			u32 window_clamp; | ||||||
| 
 | 
 | ||||||
| 			window_clamp = tcp_win_from_space(sk, rcvbuf); | 			window_clamp = __tcp_win_from_space(scaling_ratio, rcvbuf); | ||||||
| 			WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); | 			WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); | ||||||
| 
 | 
 | ||||||
| 			/* Make subflows follow along.  If we do not do this, we
 | 			/* Make subflows follow along.  If we do not do this, we
 | ||||||
|  |  | ||||||
|  | @ -321,6 +321,7 @@ struct mptcp_sock { | ||||||
| 		u64	time;	/* start time of measurement window */ | 		u64	time;	/* start time of measurement window */ | ||||||
| 		u64	rtt_us; /* last maximum rtt of subflows */ | 		u64	rtt_us; /* last maximum rtt of subflows */ | ||||||
| 	} rcvq_space; | 	} rcvq_space; | ||||||
|  | 	u8		scaling_ratio; | ||||||
| 
 | 
 | ||||||
| 	u32		subflow_id; | 	u32		subflow_id; | ||||||
| 	u32		setsockopt_seq; | 	u32		setsockopt_seq; | ||||||
|  | @ -351,9 +352,14 @@ static inline int __mptcp_rmem(const struct sock *sk) | ||||||
| 	return atomic_read(&sk->sk_rmem_alloc) - READ_ONCE(mptcp_sk(sk)->rmem_released); | 	return atomic_read(&sk->sk_rmem_alloc) - READ_ONCE(mptcp_sk(sk)->rmem_released); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | static inline int mptcp_win_from_space(const struct sock *sk, int space) | ||||||
|  | { | ||||||
|  | 	return __tcp_win_from_space(mptcp_sk(sk)->scaling_ratio, space); | ||||||
|  | } | ||||||
|  | 
 | ||||||
| static inline int __mptcp_space(const struct sock *sk) | static inline int __mptcp_space(const struct sock *sk) | ||||||
| { | { | ||||||
| 	return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - __mptcp_rmem(sk)); | 	return mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - __mptcp_rmem(sk)); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk) | static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk) | ||||||
|  |  | ||||||
|  | @ -1359,7 +1359,7 @@ void mptcp_space(const struct sock *ssk, int *space, int *full_space) | ||||||
| 	const struct sock *sk = subflow->conn; | 	const struct sock *sk = subflow->conn; | ||||||
| 
 | 
 | ||||||
| 	*space = __mptcp_space(sk); | 	*space = __mptcp_space(sk); | ||||||
| 	*full_space = tcp_full_space(sk); | 	*full_space = mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf)); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void __mptcp_error_report(struct sock *sk) | void __mptcp_error_report(struct sock *sk) | ||||||
|  |  | ||||||
		Loading…
	
		Reference in a new issue
	
	 Paolo Abeni
						Paolo Abeni