mirror of
https://github.com/torvalds/linux.git
synced 2025-11-03 01:59:51 +02:00
Kazuho Oku reported that setsockopt(SO_INCOMING_CPU) does not work
with setsockopt(SO_REUSEPORT) since v4.6.
With the combination of SO_REUSEPORT and SO_INCOMING_CPU, we could
build a highly efficient server application.
setsockopt(SO_INCOMING_CPU) associates a CPU with a TCP listener
or UDP socket, and then incoming packets processed on the CPU will
likely be distributed to the socket. Technically, a socket could
even receive packets handled on another CPU if no sockets in the
reuseport group have the same CPU receiving the flow.
The logic exists in compute_score() so that a socket will get a higher
score if it has the same CPU with the flow. However, the score gets
ignored after the blamed two commits, which introduced a faster socket
selection algorithm for SO_REUSEPORT.
This patch introduces a counter of sockets with SO_INCOMING_CPU in
a reuseport group to check if we should iterate all sockets to find
a proper one. We increment the counter when
* calling listen() if the socket has SO_INCOMING_CPU and SO_REUSEPORT
* enabling SO_INCOMING_CPU if the socket is in a reuseport group
Also, we decrement it when
* detaching a socket out of the group to apply SO_INCOMING_CPU to
migrated TCP requests
* disabling SO_INCOMING_CPU if the socket is in a reuseport group
When the counter reaches 0, we can get back to the O(1) selection
algorithm.
The overall changes are negligible for the non-SO_INCOMING_CPU case,
and the only notable thing is that we have to update sk_incomnig_cpu
under reuseport_lock. Otherwise, the race prevents transitioning to
the O(n) algorithm and results in the wrong socket selection.
cpu1 (setsockopt) cpu2 (listen)
+-----------------+ +-------------+
lock_sock(sk1) lock_sock(sk2)
reuseport_update_incoming_cpu(sk1, val)
.
| /* set CPU as 0 */
|- WRITE_ONCE(sk1->incoming_cpu, val)
|
| spin_lock_bh(&reuseport_lock)
| reuseport_grow(sk2, reuse)
| .
| |- more_socks_size = reuse->max_socks * 2U;
| |- if (more_socks_size > U16_MAX &&
| | reuse->num_closed_socks)
| | .
| | |- RCU_INIT_POINTER(sk1->sk_reuseport_cb, NULL);
| | `- __reuseport_detach_closed_sock(sk1, reuse)
| | .
| | `- reuseport_put_incoming_cpu(sk1, reuse)
| | .
| | | /* Read shutdown()ed sk1's sk_incoming_cpu
| | | * without lock_sock().
| | | */
| | `- if (sk1->sk_incoming_cpu >= 0)
| | .
| | | /* decrement not-yet-incremented
| | | * count, which is never incremented.
| | | */
| | `- __reuseport_put_incoming_cpu(reuse);
| |
| `- spin_lock_bh(&reuseport_lock)
|
|- spin_lock_bh(&reuseport_lock)
|
|- reuse = rcu_dereference_protected(sk1->sk_reuseport_cb, ...)
|- if (!reuse)
| .
| | /* Cannot increment reuse->incoming_cpu. */
| `- goto out;
|
`- spin_unlock_bh(&reuseport_lock)
Fixes: e32ea7e747 ("soreuseport: fast reuseport UDP socket selection")
Fixes: c125e80b88 ("soreuseport: fast reuseport TCP socket selection")
Reported-by: Kazuho Oku <kazuhooku@gmail.com>
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
64 lines
1.8 KiB
C
64 lines
1.8 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef _SOCK_REUSEPORT_H
|
|
#define _SOCK_REUSEPORT_H
|
|
|
|
#include <linux/filter.h>
|
|
#include <linux/skbuff.h>
|
|
#include <linux/types.h>
|
|
#include <linux/spinlock.h>
|
|
#include <net/sock.h>
|
|
|
|
extern spinlock_t reuseport_lock;
|
|
|
|
struct sock_reuseport {
|
|
struct rcu_head rcu;
|
|
|
|
u16 max_socks; /* length of socks */
|
|
u16 num_socks; /* elements in socks */
|
|
u16 num_closed_socks; /* closed elements in socks */
|
|
u16 incoming_cpu;
|
|
/* The last synq overflow event timestamp of this
|
|
* reuse->socks[] group.
|
|
*/
|
|
unsigned int synq_overflow_ts;
|
|
/* ID stays the same even after the size of socks[] grows. */
|
|
unsigned int reuseport_id;
|
|
unsigned int bind_inany:1;
|
|
unsigned int has_conns:1;
|
|
struct bpf_prog __rcu *prog; /* optional BPF sock selector */
|
|
struct sock *socks[]; /* array of sock pointers */
|
|
};
|
|
|
|
extern int reuseport_alloc(struct sock *sk, bool bind_inany);
|
|
extern int reuseport_add_sock(struct sock *sk, struct sock *sk2,
|
|
bool bind_inany);
|
|
extern void reuseport_detach_sock(struct sock *sk);
|
|
void reuseport_stop_listen_sock(struct sock *sk);
|
|
extern struct sock *reuseport_select_sock(struct sock *sk,
|
|
u32 hash,
|
|
struct sk_buff *skb,
|
|
int hdr_len);
|
|
struct sock *reuseport_migrate_sock(struct sock *sk,
|
|
struct sock *migrating_sk,
|
|
struct sk_buff *skb);
|
|
extern int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog);
|
|
extern int reuseport_detach_prog(struct sock *sk);
|
|
|
|
static inline bool reuseport_has_conns(struct sock *sk)
|
|
{
|
|
struct sock_reuseport *reuse;
|
|
bool ret = false;
|
|
|
|
rcu_read_lock();
|
|
reuse = rcu_dereference(sk->sk_reuseport_cb);
|
|
if (reuse && reuse->has_conns)
|
|
ret = true;
|
|
rcu_read_unlock();
|
|
|
|
return ret;
|
|
}
|
|
|
|
void reuseport_has_conns_set(struct sock *sk);
|
|
void reuseport_update_incoming_cpu(struct sock *sk, int val);
|
|
|
|
#endif /* _SOCK_REUSEPORT_H */
|