Merge branch 'tcp-receiver-changes'

Eric Dumazet says:

====================
tcp: receiver changes

Before accepting an incoming packet:

- Make sure to not accept a packet beyond the advertised RWIN.
  If such a packet is received, increment a new SNMP counter
  (LINUX_MIB_BEYOND_WINDOW).

- Out-of-order (ooo) packets should update rcv_mss and tp->scaling_ratio.

- Make sure to not accept a packet beyond the sk_rcvbuf limit.

This series includes three associated packetdrill tests.
====================

Link: https://patch.msgid.link/20250711114006.480026-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
Jakub Kicinski 2025-07-14 18:40:51 -07:00
commit 06baf9bfa6
9 changed files with 152 additions and 14 deletions

View file

@ -36,6 +36,7 @@ unsigned_long LINUX_MIB_TIMEWAITRECYCLED
unsigned_long LINUX_MIB_TIMEWAITKILLED
unsigned_long LINUX_MIB_PAWSACTIVEREJECTED
unsigned_long LINUX_MIB_PAWSESTABREJECTED
unsigned_long LINUX_MIB_BEYOND_WINDOW
unsigned_long LINUX_MIB_TSECR_REJECTED
unsigned_long LINUX_MIB_PAWS_OLD_ACK
unsigned_long LINUX_MIB_PAWS_TW_REJECTED

View file

@ -45,6 +45,7 @@
FN(TCP_LISTEN_OVERFLOW) \
FN(TCP_OLD_SEQUENCE) \
FN(TCP_INVALID_SEQUENCE) \
FN(TCP_INVALID_END_SEQUENCE) \
FN(TCP_INVALID_ACK_SEQUENCE) \
FN(TCP_RESET) \
FN(TCP_INVALID_SYN) \
@ -303,8 +304,14 @@ enum skb_drop_reason {
SKB_DROP_REASON_TCP_LISTEN_OVERFLOW,
/** @SKB_DROP_REASON_TCP_OLD_SEQUENCE: Old SEQ field (duplicate packet) */
SKB_DROP_REASON_TCP_OLD_SEQUENCE,
/** @SKB_DROP_REASON_TCP_INVALID_SEQUENCE: Not acceptable SEQ field */
/** @SKB_DROP_REASON_TCP_INVALID_SEQUENCE: Not acceptable SEQ field. */
SKB_DROP_REASON_TCP_INVALID_SEQUENCE,
/**
* @SKB_DROP_REASON_TCP_INVALID_END_SEQUENCE:
* Not acceptable END_SEQ field.
* Corresponds to LINUX_MIB_BEYOND_WINDOW.
*/
SKB_DROP_REASON_TCP_INVALID_END_SEQUENCE,
/**
* @SKB_DROP_REASON_TCP_INVALID_ACK_SEQUENCE: Not acceptable ACK SEQ
* field because ack sequence is not in the window between snd_una

View file

@ -1553,7 +1553,7 @@ __sk_rmem_schedule(struct sock *sk, int size, bool pfmemalloc)
}
static inline bool
sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size)
sk_rmem_schedule(struct sock *sk, const struct sk_buff *skb, int size)
{
return __sk_rmem_schedule(sk, size, skb_pfmemalloc(skb));
}

View file

@ -186,6 +186,7 @@ enum
LINUX_MIB_TIMEWAITKILLED, /* TimeWaitKilled */
LINUX_MIB_PAWSACTIVEREJECTED, /* PAWSActiveRejected */
LINUX_MIB_PAWSESTABREJECTED, /* PAWSEstabRejected */
LINUX_MIB_BEYOND_WINDOW, /* BeyondWindow */
LINUX_MIB_TSECRREJECTED, /* TSEcrRejected */
LINUX_MIB_PAWS_OLD_ACK, /* PAWSOldAck */
LINUX_MIB_PAWS_TW_REJECTED, /* PAWSTimewait */

View file

@ -189,6 +189,7 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TWKilled", LINUX_MIB_TIMEWAITKILLED),
SNMP_MIB_ITEM("PAWSActive", LINUX_MIB_PAWSACTIVEREJECTED),
SNMP_MIB_ITEM("PAWSEstab", LINUX_MIB_PAWSESTABREJECTED),
SNMP_MIB_ITEM("BeyondWindow", LINUX_MIB_BEYOND_WINDOW),
SNMP_MIB_ITEM("TSEcrRejected", LINUX_MIB_TSECRREJECTED),
SNMP_MIB_ITEM("PAWSOldAck", LINUX_MIB_PAWS_OLD_ACK),
SNMP_MIB_ITEM("PAWSTimewait", LINUX_MIB_PAWS_TW_REJECTED),

View file

@ -4391,14 +4391,22 @@ static enum skb_drop_reason tcp_disordered_ack_check(const struct sock *sk,
* (borrowed from freebsd)
*/
static enum skb_drop_reason tcp_sequence(const struct tcp_sock *tp,
static enum skb_drop_reason tcp_sequence(const struct sock *sk,
u32 seq, u32 end_seq)
{
const struct tcp_sock *tp = tcp_sk(sk);
if (before(end_seq, tp->rcv_wup))
return SKB_DROP_REASON_TCP_OLD_SEQUENCE;
if (after(seq, tp->rcv_nxt + tcp_receive_window(tp)))
return SKB_DROP_REASON_TCP_INVALID_SEQUENCE;
if (after(end_seq, tp->rcv_nxt + tcp_receive_window(tp))) {
if (after(seq, tp->rcv_nxt + tcp_receive_window(tp)))
return SKB_DROP_REASON_TCP_INVALID_SEQUENCE;
/* Only accept this packet if receive queue is empty. */
if (skb_queue_len(&sk->sk_receive_queue))
return SKB_DROP_REASON_TCP_INVALID_END_SEQUENCE;
}
return SKB_NOT_DROPPED_YET;
}
@ -4880,10 +4888,20 @@ static void tcp_ofo_queue(struct sock *sk)
static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb);
static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb);
static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
/* Tell whether @skb could be queued on @sk's receive queues without
 * pushing the socket's receive memory above sk->sk_rcvbuf.
 *
 * The incoming skb is accounted for up front: its truesize is added to
 * the current sk->sk_rmem_alloc and the sum is compared to the limit.
 *
 * Return: true if the sum does not exceed sk->sk_rcvbuf, false otherwise.
 */
static bool tcp_can_ingest(const struct sock *sk, const struct sk_buff *skb)
{
	unsigned int needed = skb->truesize;

	needed += atomic_read(&sk->sk_rmem_alloc);

	return needed <= sk->sk_rcvbuf;
}
static int tcp_try_rmem_schedule(struct sock *sk, const struct sk_buff *skb,
unsigned int size)
{
if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
if (!tcp_can_ingest(sk, skb) ||
!sk_rmem_schedule(sk, skb, size)) {
if (tcp_prune_queue(sk, skb) < 0)
@ -4915,6 +4933,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
return;
}
tcp_measure_rcv_mss(sk, skb);
/* Disable header prediction. */
tp->pred_flags = 0;
inet_csk_schedule_ack(sk);
@ -5498,7 +5517,7 @@ static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb)
tcp_drop_reason(sk, skb, SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE);
tp->ooo_last_skb = rb_to_skb(prev);
if (!prev || goal <= 0) {
if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
if (tcp_can_ingest(sk, skb) &&
!tcp_under_memory_pressure(sk))
break;
goal = sk->sk_rcvbuf >> 3;
@ -5532,12 +5551,12 @@ static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb)
NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED);
if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
if (!tcp_can_ingest(sk, in_skb))
tcp_clamp_window(sk);
else if (tcp_under_memory_pressure(sk))
tcp_adjust_rcv_ssthresh(sk);
if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
if (tcp_can_ingest(sk, in_skb))
return 0;
tcp_collapse_ofo_queue(sk);
@ -5547,7 +5566,7 @@ static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb)
NULL,
tp->copied_seq, tp->rcv_nxt);
if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
if (tcp_can_ingest(sk, in_skb))
return 0;
/* Collapsing did not help, destructive actions follow.
@ -5555,7 +5574,7 @@ static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb)
tcp_prune_ofo_queue(sk, in_skb);
if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
if (tcp_can_ingest(sk, in_skb))
return 0;
/* If we are really being abused, tell the caller to silently
@ -5881,7 +5900,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
step1:
/* Step 1: check sequence number */
reason = tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
reason = tcp_sequence(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
if (reason) {
/* RFC793, page 37: "In all states except SYN-SENT, all reset
* (RST) segments are validated by checking their SEQ-fields."
@ -5892,6 +5911,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
if (!th->rst) {
if (th->syn)
goto syn_challenge;
NET_INC_STATS(sock_net(sk), LINUX_MIB_BEYOND_WINDOW);
if (!tcp_oow_rate_limited(sock_net(sk), skb,
LINUX_MIB_TCPACKSKIPPEDSEQ,
&tp->last_oow_ack_time))
@ -6110,6 +6130,10 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
if (tcp_checksum_complete(skb))
goto csum_error;
if (after(TCP_SKB_CB(skb)->end_seq,
tp->rcv_nxt + tcp_receive_window(tp)))
goto validate;
if ((int)skb->truesize > sk->sk_forward_alloc)
goto step5;
@ -6165,7 +6189,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
/*
* Standard slow path.
*/
validate:
if (!tcp_validate_incoming(sk, skb, th, 1))
return;

View file

@ -0,0 +1,27 @@
// SPDX-License-Identifier: GPL-2.0
// Check that out-of-order packets update tcpi_rcv_mss.
--mss=1000
`./defaults.sh
sysctl -q net.ipv4.tcp_rmem="4096 131072 $((32*1024*1024))"`
// Establish a connection.
+0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0 bind(3, ..., ...) = 0
+0 listen(3, 1) = 0
+0 < S 0:0(0) win 65535 <mss 1000,nop,nop,sackOK,nop,wscale 7>
+0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 10>
+.1 < . 1:1(0) ack 1 win 257
+0 accept(3, ..., ...) = 4
// Out-of-order segment: it starts at 2001 while the peer has only been
// ACKed up to 1. Expect an ACK still at 1 carrying a SACK block for it.
+0 < . 2001:11001(9000) ack 1 win 257
+0 > . 1:1(0) ack 1 win 81 <nop,nop,sack 2001:11001>
// check that ooo packet properly updates tcpi_rcv_mss
+0 %{ assert tcpi_rcv_mss == 1000, tcpi_rcv_mss }%
// Second out-of-order segment, contiguous with the first one:
// the SACK block is expected to grow to 2001:21001.
+0 < . 11001:21001(10000) ack 1 win 257
+0 > . 1:1(0) ack 1 win 81 <nop,nop,sack 2001:21001>

View file

@ -0,0 +1,44 @@
// SPDX-License-Identifier: GPL-2.0
// Check that packets beyond the advertised receive window are not accepted
// while the receive queue holds data, that each such packet increments
// TcpExtBeyondWindow, and that a packet with a too large end_seq is accepted
// once the receive queue is empty.
--mss=1000
`./defaults.sh`
// nstat -n prints nothing and only updates nstat's history, so that the
// final nstat call reports counters relative to this point.
0 `nstat -n`
// Establish a connection.
+0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0 setsockopt(3, SOL_SOCKET, SO_RCVBUF, [10000], 4) = 0
+0 bind(3, ..., ...) = 0
+0 listen(3, 1) = 0
+0 < S 0:0(0) win 32792 <mss 1000,nop,wscale 7>
+0 > S. 0:0(0) ack 1 <mss 1460,nop,wscale 0>
+.1 < . 1:1(0) ack 1 win 257
+0 accept(3, ..., ...) = 4
// In-sequence data is accepted and ACKed; it stays in the receive queue
// until the read() below, and the advertised window is now 5000.
+0 < P. 1:4001(4000) ack 1 win 257
+0 > . 1:1(0) ack 4001 win 5000
// packet in sequence : SKB_DROP_REASON_TCP_INVALID_END_SEQUENCE / LINUX_MIB_BEYOND_WINDOW
+0 < P. 4001:54001(50000) ack 1 win 257
+0 > . 1:1(0) ack 4001 win 5000
// ooo packet : SKB_DROP_REASON_TCP_INVALID_END_SEQUENCE / LINUX_MIB_BEYOND_WINDOW
+1 < P. 5001:55001(50000) ack 1 win 257
+0 > . 1:1(0) ack 4001 win 5000
// SKB_DROP_REASON_TCP_INVALID_SEQUENCE / LINUX_MIB_BEYOND_WINDOW
+0 < P. 70001:80001(10000) ack 1 win 257
+0 > . 1:1(0) ack 4001 win 5000
// Drain the receive queue: the 4000 bytes accepted above are read.
+0 read(4, ..., 100000) = 4000
// If queue is empty, accept a packet even if its end_seq is above wup + rcv_wnd
+0 < P. 4001:54001(50000) ack 1 win 257
+.040 > . 1:1(0) ack 54001 win 0
// Check LINUX_MIB_BEYOND_WINDOW has been incremented 3 times.
+0 `nstat | grep TcpExtBeyondWindow | grep -q " 3 "`

View file

@ -0,0 +1,33 @@
// SPDX-License-Identifier: GPL-2.0
// Check that a too big packet is not accepted while the receive queue
// holds data, and is accepted once the receive queue has been drained.
--mss=1000
`./defaults.sh`
// nstat -n prints nothing and only updates nstat's history.
0 `nstat -n`
// Establish a connection.
+0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0 setsockopt(3, SOL_SOCKET, SO_RCVBUF, [20000], 4) = 0
+0 bind(3, ..., ...) = 0
+0 listen(3, 1) = 0
+0 < S 0:0(0) win 32792 <mss 1000,nop,wscale 7>
+0 > S. 0:0(0) ack 1 win 18980 <mss 1460,nop,wscale 0>
+.1 < . 1:1(0) ack 1 win 257
+0 accept(3, ..., ...) = 4
// 20000 bytes of in-sequence data are accepted and ACKed; they stay in the
// receive queue until the read() below.
+0 < P. 1:20001(20000) ack 1 win 257
+.04 > . 1:1(0) ack 20001 win 18000
// Shrink the receive buffer while the 20000 bytes are still queued.
+0 setsockopt(4, SOL_SOCKET, SO_RCVBUF, [12000], 4) = 0
// The 60000-byte packet is not accepted: the ACK stays at 20001.
+0 < P. 20001:80001(60000) ack 1 win 257
+0 > . 1:1(0) ack 20001 win 18000
// Drain the receive queue.
+0 read(4, ..., 20000) = 20000
// A too big packet is accepted if the receive queue is empty
+0 < P. 20001:80001(60000) ack 1 win 257
+0 > . 1:1(0) ack 80001 win 0