mptcp: rcvbuf auto-tuning improvement

Apply to the MPTCP auto-tuning the same improvements introduced for the
TCP protocol by the merge commit 2da35e4b4d ("Merge branch
'tcp-receive-side-improvements'").

The main difference is that the TCP subflows and the main MPTCP socket
need to account for OoO separately: MPTCP does not care about TCP-level
OoO and vice versa. As a consequence, do not reflect MPTCP-level rcvbuf
increases due to OoO packets at the subflow level.

This refactor additionally allows dropping the msk receive buffer update
at receive time, as the latter was only intended to cope with subflow
receive buffer increases due to OoO packets.

Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/487
Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/559
Reviewed-by: Geliang Tang <geliang@kernel.org>
Tested-by: Geliang Tang <geliang@kernel.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20250927-net-next-mptcp-rcv-path-imp-v1-3-5da266aa9c1a@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
Paolo Abeni 2025-09-27 11:40:39 +02:00 committed by Jakub Kicinski
parent a755677974
commit e118cdc34d
2 changed files with 49 additions and 52 deletions

View file

@ -179,6 +179,35 @@ static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to,
return mptcp_try_coalesce((struct sock *)msk, to, from); return mptcp_try_coalesce((struct sock *)msk, to, from);
} }
/* "inspired" by tcp_rcvbuf_grow(), main difference:
 * - mptcp does not maintain a msk-level window clamp
 * - returns true when the receive buffer is actually updated
 *
 * Try to enlarge the msk-level receive buffer (sk->sk_rcvbuf).
 * The buffer is only ever raised here, never lowered.
 *
 * Returns false when auto-tuning is disabled via the
 * tcp_moderate_rcvbuf sysctl, when SOCK_RCVBUF_LOCK is set in
 * sk->sk_userlocks, or when the computed size does not exceed the
 * current sk->sk_rcvbuf.
 */
static bool mptcp_rcvbuf_grow(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
const struct net *net = sock_net(sk);
int rcvwin, rcvbuf, cap;
/* Auto-tuning disabled, or the receive buffer size is locked. */
if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) ||
(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
return false;
/* Target window: twice the value stored in rcvq_space.space. */
rcvwin = msk->rcvq_space.space << 1;
/* With pending MPTCP-level OoO data, also cover the sequence span
 * between ack_seq and the end of the last OoO skb.
 */
if (!RB_EMPTY_ROOT(&msk->out_of_order_queue))
rcvwin += MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq - msk->ack_seq;
/* Upper bound: third entry of the tcp_rmem sysctl. */
cap = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
/* Convert the window to a buffer size and clamp it to the cap. */
rcvbuf = min_t(u32, mptcp_space_from_win(sk, rcvwin), cap);
/* Only grow; report whether sk_rcvbuf was actually changed. */
if (rcvbuf > sk->sk_rcvbuf) {
WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
return true;
}
return false;
}
/* "inspired" by tcp_data_queue_ofo(), main differences: /* "inspired" by tcp_data_queue_ofo(), main differences:
* - use mptcp seqs * - use mptcp seqs
* - don't cope with sacks * - don't cope with sacks
@ -292,6 +321,9 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
end: end:
skb_condense(skb); skb_condense(skb);
skb_set_owner_r(skb, sk); skb_set_owner_r(skb, sk);
/* do not grow rcvbuf for not-yet-accepted or orphaned sockets. */
if (sk->sk_socket)
mptcp_rcvbuf_grow(sk);
} }
static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
@ -784,18 +816,10 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
return moved; return moved;
} }
/* Raise the msk receive buffer (sk->sk_rcvbuf) to the subflow's value
 * when the subflow's receive buffer (ssk->sk_rcvbuf) is larger.
 * The msk value is never lowered here.
 */
static void __mptcp_rcvbuf_update(struct sock *sk, struct sock *ssk)
{
/* The subflow buffer exceeding the msk one is hinted as the rare case. */
if (unlikely(ssk->sk_rcvbuf > sk->sk_rcvbuf))
WRITE_ONCE(sk->sk_rcvbuf, ssk->sk_rcvbuf);
}
static void __mptcp_data_ready(struct sock *sk, struct sock *ssk) static void __mptcp_data_ready(struct sock *sk, struct sock *ssk)
{ {
struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_sock *msk = mptcp_sk(sk);
__mptcp_rcvbuf_update(sk, ssk);
/* Wake-up the reader only for in-sequence data */ /* Wake-up the reader only for in-sequence data */
if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk)) if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk))
sk->sk_data_ready(sk); sk->sk_data_ready(sk);
@ -2014,48 +2038,26 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
if (msk->rcvq_space.copied <= msk->rcvq_space.space) if (msk->rcvq_space.copied <= msk->rcvq_space.space)
goto new_measure; goto new_measure;
if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) && msk->rcvq_space.space = msk->rcvq_space.copied;
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { if (mptcp_rcvbuf_grow(sk)) {
u64 rcvwin, grow;
int rcvbuf;
rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss; /* Make subflows follow along. If we do not do this, we
* get drops at subflow level if skbs can't be moved to
* the mptcp rx queue fast enough (announced rcv_win can
* exceed ssk->sk_rcvbuf).
*/
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk;
bool slow;
grow = rcvwin * (msk->rcvq_space.copied - msk->rcvq_space.space); ssk = mptcp_subflow_tcp_sock(subflow);
slow = lock_sock_fast(ssk);
do_div(grow, msk->rcvq_space.space); tcp_sk(ssk)->rcvq_space.space = msk->rcvq_space.copied;
rcvwin += (grow << 1); tcp_rcvbuf_grow(ssk);
unlock_sock_fast(ssk, slow);
rcvbuf = min_t(u64, mptcp_space_from_win(sk, rcvwin),
READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
if (rcvbuf > sk->sk_rcvbuf) {
u32 window_clamp;
window_clamp = mptcp_win_from_space(sk, rcvbuf);
WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
/* Make subflows follow along. If we do not do this, we
* get drops at subflow level if skbs can't be moved to
* the mptcp rx queue fast enough (announced rcv_win can
* exceed ssk->sk_rcvbuf).
*/
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk;
bool slow;
ssk = mptcp_subflow_tcp_sock(subflow);
slow = lock_sock_fast(ssk);
WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf);
WRITE_ONCE(tcp_sk(ssk)->window_clamp, window_clamp);
if (tcp_can_send_ack(ssk))
tcp_cleanup_rbuf(ssk, 1);
unlock_sock_fast(ssk, slow);
}
} }
} }
msk->rcvq_space.space = msk->rcvq_space.copied;
new_measure: new_measure:
msk->rcvq_space.copied = 0; msk->rcvq_space.copied = 0;
msk->rcvq_space.time = mstamp; msk->rcvq_space.time = mstamp;
@ -2084,11 +2086,6 @@ static bool __mptcp_move_skbs(struct sock *sk)
if (list_empty(&msk->conn_list)) if (list_empty(&msk->conn_list))
return false; return false;
/* verify we can move any data from the subflow, eventually updating */
if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
mptcp_for_each_subflow(msk, subflow)
__mptcp_rcvbuf_update(sk, subflow->tcp_sock);
subflow = list_first_entry(&msk->conn_list, subflow = list_first_entry(&msk->conn_list,
struct mptcp_subflow_context, node); struct mptcp_subflow_context, node);
for (;;) { for (;;) {

View file

@ -341,8 +341,8 @@ struct mptcp_sock {
struct mptcp_pm_data pm; struct mptcp_pm_data pm;
struct mptcp_sched_ops *sched; struct mptcp_sched_ops *sched;
struct { struct {
u32 space; /* bytes copied in last measurement window */ int space; /* bytes copied in last measurement window */
u32 copied; /* bytes copied in this measurement window */ int copied; /* bytes copied in this measurement window */
u64 time; /* start time of measurement window */ u64 time; /* start time of measurement window */
u64 rtt_us; /* last maximum rtt of subflows */ u64 rtt_us; /* last maximum rtt of subflows */
} rcvq_space; } rcvq_space;