mirror of
				https://github.com/torvalds/linux.git
				synced 2025-11-04 02:30:34 +02:00 
			
		
		
		
	soreuseport: TCP/IPv4 implementation
Allow multiple listener sockets to bind to the same port. Motivation for soreuseport would be something like a web server binding to port 80 running with multiple threads, where each thread might have its own listener socket. This could be done as an alternative to other models: 1) have one listener thread which dispatches completed connections to workers. 2) accept on a single listener socket from multiple threads. In case #1 the listener thread can easily become the bottleneck with high connection turn-over rate. In case #2, the proportion of connections accepted per thread tends to be uneven under high connection load (assuming simple event loop: while (1) { accept(); process() }), wakeup does not promote fairness among the sockets. We have seen the disproportion to be as high as 3:1 ratio between thread accepting most connections and the one accepting the fewest. With SO_REUSEPORT the distribution is uniform. Signed-off-by: Tom Herbert <therbert@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
		
							parent
							
								
									055dc21a1d
								
							
						
					
					
						commit
						da5e36308d
					
				
					 5 changed files with 73 additions and 21 deletions
				
			
		| 
						 | 
				
			
			@ -81,7 +81,9 @@ struct inet_bind_bucket {
 | 
			
		|||
	struct net		*ib_net;
 | 
			
		||||
#endif
 | 
			
		||||
	unsigned short		port;
 | 
			
		||||
	signed short		fastreuse;
 | 
			
		||||
	signed char		fastreuse;
 | 
			
		||||
	signed char		fastreuseport;
 | 
			
		||||
	kuid_t			fastuid;
 | 
			
		||||
	int			num_owners;
 | 
			
		||||
	struct hlist_node	node;
 | 
			
		||||
	struct hlist_head	owners;
 | 
			
		||||
| 
						 | 
				
			
			@ -257,15 +259,19 @@ extern void inet_unhash(struct sock *sk);
 | 
			
		|||
 | 
			
		||||
extern struct sock *__inet_lookup_listener(struct net *net,
 | 
			
		||||
					   struct inet_hashinfo *hashinfo,
 | 
			
		||||
					   const __be32 saddr,
 | 
			
		||||
					   const __be16 sport,
 | 
			
		||||
					   const __be32 daddr,
 | 
			
		||||
					   const unsigned short hnum,
 | 
			
		||||
					   const int dif);
 | 
			
		||||
 | 
			
		||||
static inline struct sock *inet_lookup_listener(struct net *net,
 | 
			
		||||
		struct inet_hashinfo *hashinfo,
 | 
			
		||||
		__be32 saddr, __be16 sport,
 | 
			
		||||
		__be32 daddr, __be16 dport, int dif)
 | 
			
		||||
{
 | 
			
		||||
	return __inet_lookup_listener(net, hashinfo, daddr, ntohs(dport), dif);
 | 
			
		||||
	return __inet_lookup_listener(net, hashinfo, saddr, sport,
 | 
			
		||||
				      daddr, ntohs(dport), dif);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/* Socket demux engine toys. */
 | 
			
		||||
| 
						 | 
				
			
			@ -358,7 +364,8 @@ static inline struct sock *__inet_lookup(struct net *net,
 | 
			
		|||
	struct sock *sk = __inet_lookup_established(net, hashinfo,
 | 
			
		||||
				saddr, sport, daddr, hnum, dif);
 | 
			
		||||
 | 
			
		||||
	return sk ? : __inet_lookup_listener(net, hashinfo, daddr, hnum, dif);
 | 
			
		||||
	return sk ? : __inet_lookup_listener(net, hashinfo, saddr, sport,
 | 
			
		||||
					     daddr, hnum, dif);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static inline struct sock *inet_lookup(struct net *net,
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -82,6 +82,7 @@ nf_tproxy_get_sock_v4(struct net *net, const u8 protocol,
 | 
			
		|||
			break;
 | 
			
		||||
		case NFT_LOOKUP_LISTENER:
 | 
			
		||||
			sk = inet_lookup_listener(net, &tcp_hashinfo,
 | 
			
		||||
						    saddr, sport,
 | 
			
		||||
						    daddr, dport,
 | 
			
		||||
						    in->ifindex);
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -59,6 +59,8 @@ int inet_csk_bind_conflict(const struct sock *sk,
 | 
			
		|||
	struct sock *sk2;
 | 
			
		||||
	struct hlist_node *node;
 | 
			
		||||
	int reuse = sk->sk_reuse;
 | 
			
		||||
	int reuseport = sk->sk_reuseport;
 | 
			
		||||
	kuid_t uid = sock_i_uid((struct sock *)sk);
 | 
			
		||||
 | 
			
		||||
	/*
 | 
			
		||||
	 * Unlike other sk lookup places we do not check
 | 
			
		||||
| 
						 | 
				
			
			@ -73,8 +75,11 @@ int inet_csk_bind_conflict(const struct sock *sk,
 | 
			
		|||
		    (!sk->sk_bound_dev_if ||
 | 
			
		||||
		     !sk2->sk_bound_dev_if ||
 | 
			
		||||
		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
 | 
			
		||||
			if (!reuse || !sk2->sk_reuse ||
 | 
			
		||||
			    sk2->sk_state == TCP_LISTEN) {
 | 
			
		||||
			if ((!reuse || !sk2->sk_reuse ||
 | 
			
		||||
			    sk2->sk_state == TCP_LISTEN) &&
 | 
			
		||||
			    (!reuseport || !sk2->sk_reuseport ||
 | 
			
		||||
			    (sk2->sk_state != TCP_TIME_WAIT &&
 | 
			
		||||
			     !uid_eq(uid, sock_i_uid(sk2))))) {
 | 
			
		||||
				const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2);
 | 
			
		||||
				if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) ||
 | 
			
		||||
				    sk2_rcv_saddr == sk_rcv_saddr(sk))
 | 
			
		||||
| 
						 | 
				
			
			@ -106,6 +111,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
 | 
			
		|||
	int ret, attempts = 5;
 | 
			
		||||
	struct net *net = sock_net(sk);
 | 
			
		||||
	int smallest_size = -1, smallest_rover;
 | 
			
		||||
	kuid_t uid = sock_i_uid(sk);
 | 
			
		||||
 | 
			
		||||
	local_bh_disable();
 | 
			
		||||
	if (!snum) {
 | 
			
		||||
| 
						 | 
				
			
			@ -125,9 +131,12 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
 | 
			
		|||
			spin_lock(&head->lock);
 | 
			
		||||
			inet_bind_bucket_for_each(tb, node, &head->chain)
 | 
			
		||||
				if (net_eq(ib_net(tb), net) && tb->port == rover) {
 | 
			
		||||
					if (tb->fastreuse > 0 &&
 | 
			
		||||
					    sk->sk_reuse &&
 | 
			
		||||
					    sk->sk_state != TCP_LISTEN &&
 | 
			
		||||
					if (((tb->fastreuse > 0 &&
 | 
			
		||||
					      sk->sk_reuse &&
 | 
			
		||||
					      sk->sk_state != TCP_LISTEN) ||
 | 
			
		||||
					     (tb->fastreuseport > 0 &&
 | 
			
		||||
					      sk->sk_reuseport &&
 | 
			
		||||
					      uid_eq(tb->fastuid, uid))) &&
 | 
			
		||||
					    (tb->num_owners < smallest_size || smallest_size == -1)) {
 | 
			
		||||
						smallest_size = tb->num_owners;
 | 
			
		||||
						smallest_rover = rover;
 | 
			
		||||
| 
						 | 
				
			
			@ -185,14 +194,17 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
 | 
			
		|||
		if (sk->sk_reuse == SK_FORCE_REUSE)
 | 
			
		||||
			goto success;
 | 
			
		||||
 | 
			
		||||
		if (tb->fastreuse > 0 &&
 | 
			
		||||
		    sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
 | 
			
		||||
		if (((tb->fastreuse > 0 &&
 | 
			
		||||
		      sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
 | 
			
		||||
		     (tb->fastreuseport > 0 &&
 | 
			
		||||
		      sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
 | 
			
		||||
		    smallest_size == -1) {
 | 
			
		||||
			goto success;
 | 
			
		||||
		} else {
 | 
			
		||||
			ret = 1;
 | 
			
		||||
			if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
 | 
			
		||||
				if (sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
 | 
			
		||||
				if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
 | 
			
		||||
				     (sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
 | 
			
		||||
				    smallest_size != -1 && --attempts >= 0) {
 | 
			
		||||
					spin_unlock(&head->lock);
 | 
			
		||||
					goto again;
 | 
			
		||||
| 
						 | 
				
			
			@ -212,9 +224,23 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
 | 
			
		|||
			tb->fastreuse = 1;
 | 
			
		||||
		else
 | 
			
		||||
			tb->fastreuse = 0;
 | 
			
		||||
	} else if (tb->fastreuse &&
 | 
			
		||||
		   (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
 | 
			
		||||
		tb->fastreuse = 0;
 | 
			
		||||
		if (sk->sk_reuseport) {
 | 
			
		||||
			tb->fastreuseport = 1;
 | 
			
		||||
			tb->fastuid = uid;
 | 
			
		||||
		} else {
 | 
			
		||||
			tb->fastreuseport = 0;
 | 
			
		||||
			tb->fastuid = 0;
 | 
			
		||||
		}
 | 
			
		||||
	} else {
 | 
			
		||||
		if (tb->fastreuse &&
 | 
			
		||||
		    (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
 | 
			
		||||
			tb->fastreuse = 0;
 | 
			
		||||
		if (tb->fastreuseport &&
 | 
			
		||||
		    (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))) {
 | 
			
		||||
			tb->fastreuseport = 0;
 | 
			
		||||
			tb->fastuid = 0;
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
success:
 | 
			
		||||
	if (!inet_csk(sk)->icsk_bind_hash)
 | 
			
		||||
		inet_bind_hash(sk, tb, snum);
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -39,6 +39,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
 | 
			
		|||
		write_pnet(&tb->ib_net, hold_net(net));
 | 
			
		||||
		tb->port      = snum;
 | 
			
		||||
		tb->fastreuse = 0;
 | 
			
		||||
		tb->fastreuseport = 0;
 | 
			
		||||
		tb->num_owners = 0;
 | 
			
		||||
		INIT_HLIST_HEAD(&tb->owners);
 | 
			
		||||
		hlist_add_head(&tb->node, &head->chain);
 | 
			
		||||
| 
						 | 
				
			
			@ -151,16 +152,16 @@ static inline int compute_score(struct sock *sk, struct net *net,
 | 
			
		|||
	if (net_eq(sock_net(sk), net) && inet->inet_num == hnum &&
 | 
			
		||||
			!ipv6_only_sock(sk)) {
 | 
			
		||||
		__be32 rcv_saddr = inet->inet_rcv_saddr;
 | 
			
		||||
		score = sk->sk_family == PF_INET ? 1 : 0;
 | 
			
		||||
		score = sk->sk_family == PF_INET ? 2 : 1;
 | 
			
		||||
		if (rcv_saddr) {
 | 
			
		||||
			if (rcv_saddr != daddr)
 | 
			
		||||
				return -1;
 | 
			
		||||
			score += 2;
 | 
			
		||||
			score += 4;
 | 
			
		||||
		}
 | 
			
		||||
		if (sk->sk_bound_dev_if) {
 | 
			
		||||
			if (sk->sk_bound_dev_if != dif)
 | 
			
		||||
				return -1;
 | 
			
		||||
			score += 2;
 | 
			
		||||
			score += 4;
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
	return score;
 | 
			
		||||
| 
						 | 
				
			
			@ -176,6 +177,7 @@ static inline int compute_score(struct sock *sk, struct net *net,
 | 
			
		|||
 | 
			
		||||
struct sock *__inet_lookup_listener(struct net *net,
 | 
			
		||||
				    struct inet_hashinfo *hashinfo,
 | 
			
		||||
				    const __be32 saddr, __be16 sport,
 | 
			
		||||
				    const __be32 daddr, const unsigned short hnum,
 | 
			
		||||
				    const int dif)
 | 
			
		||||
{
 | 
			
		||||
| 
						 | 
				
			
			@ -183,17 +185,29 @@ struct sock *__inet_lookup_listener(struct net *net,
 | 
			
		|||
	struct hlist_nulls_node *node;
 | 
			
		||||
	unsigned int hash = inet_lhashfn(net, hnum);
 | 
			
		||||
	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
 | 
			
		||||
	int score, hiscore;
 | 
			
		||||
	int score, hiscore, matches = 0, reuseport = 0;
 | 
			
		||||
	u32 phash = 0;
 | 
			
		||||
 | 
			
		||||
	rcu_read_lock();
 | 
			
		||||
begin:
 | 
			
		||||
	result = NULL;
 | 
			
		||||
	hiscore = -1;
 | 
			
		||||
	hiscore = 0;
 | 
			
		||||
	sk_nulls_for_each_rcu(sk, node, &ilb->head) {
 | 
			
		||||
		score = compute_score(sk, net, hnum, daddr, dif);
 | 
			
		||||
		if (score > hiscore) {
 | 
			
		||||
			result = sk;
 | 
			
		||||
			hiscore = score;
 | 
			
		||||
			reuseport = sk->sk_reuseport;
 | 
			
		||||
			if (reuseport) {
 | 
			
		||||
				phash = inet_ehashfn(net, daddr, hnum,
 | 
			
		||||
						     saddr, sport);
 | 
			
		||||
				matches = 1;
 | 
			
		||||
			}
 | 
			
		||||
		} else if (score == hiscore && reuseport) {
 | 
			
		||||
			matches++;
 | 
			
		||||
			if (((u64)phash * matches) >> 32 == 0)
 | 
			
		||||
				result = sk;
 | 
			
		||||
			phash = next_pseudo_random32(phash);
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
	/*
 | 
			
		||||
| 
						 | 
				
			
			@ -501,7 +515,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 | 
			
		|||
			inet_bind_bucket_for_each(tb, node, &head->chain) {
 | 
			
		||||
				if (net_eq(ib_net(tb), net) &&
 | 
			
		||||
				    tb->port == port) {
 | 
			
		||||
					if (tb->fastreuse >= 0)
 | 
			
		||||
					if (tb->fastreuse >= 0 ||
 | 
			
		||||
					    tb->fastreuseport >= 0)
 | 
			
		||||
						goto next_port;
 | 
			
		||||
					WARN_ON(hlist_empty(&tb->owners));
 | 
			
		||||
					if (!check_established(death_row, sk,
 | 
			
		||||
| 
						 | 
				
			
			@ -518,6 +533,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 | 
			
		|||
				break;
 | 
			
		||||
			}
 | 
			
		||||
			tb->fastreuse = -1;
 | 
			
		||||
			tb->fastreuseport = -1;
 | 
			
		||||
			goto ok;
 | 
			
		||||
 | 
			
		||||
		next_port:
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -657,7 +657,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 | 
			
		|||
		 * no RST generated if md5 hash doesn't match.
 | 
			
		||||
		 */
 | 
			
		||||
		sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
 | 
			
		||||
					     &tcp_hashinfo, ip_hdr(skb)->daddr,
 | 
			
		||||
					     &tcp_hashinfo, ip_hdr(skb)->saddr,
 | 
			
		||||
					     th->source, ip_hdr(skb)->daddr,
 | 
			
		||||
					     ntohs(th->source), inet_iif(skb));
 | 
			
		||||
		/* don't send rst if it can't find key */
 | 
			
		||||
		if (!sk1)
 | 
			
		||||
| 
						 | 
				
			
			@ -2074,6 +2075,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
 | 
			
		|||
	case TCP_TW_SYN: {
 | 
			
		||||
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
 | 
			
		||||
							&tcp_hashinfo,
 | 
			
		||||
							iph->saddr, th->source,
 | 
			
		||||
							iph->daddr, th->dest,
 | 
			
		||||
							inet_iif(skb));
 | 
			
		||||
		if (sk2) {
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in a new issue