	[NET]: Size listen hash tables using backlog hint
We currently allocate a fixed-size hash table (TCP_SYNQ_HSIZE = 512 slots) for each LISTEN socket, regardless of various parameters (the listen backlog, for example). On x86_64, this means order-1 allocations (which might fail), even for 'small' sockets expecting few connections. Conversely, a huge server wanting a backlog of 50000 is slowed down a bit by this fixed limit.

This patch makes the size of the listen hash table a dynamic parameter, depending on:

- the net.core.somaxconn tunable (default: 128)
- the net.ipv4.tcp_max_syn_backlog tunable (default: 256, 1024 or 128)
- the backlog value given by the user application (2nd parameter of listen())

For large allocations (bigger than PAGE_SIZE), we use vmalloc() instead of kmalloc(). We still limit memory allocation with the two existing tunables (somaxconn & tcp_max_syn_backlog), so for standard setups this patch actually reduces RAM usage.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
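For reference, here is a minimal user-space sketch of the sizing arithmetic the patch introduces in reqsk_queue_alloc(). It is a model, not kernel code: synq_entries() and roundup_pow_of_two_u32() are illustrative stand-ins for the kernel's min_t()/max_t()/roundup_pow_of_two() helpers.

/* Hypothetical user-space model of the per-socket SYN-queue sizing. */
#include <stdio.h>
#include <stddef.h>

static unsigned int roundup_pow_of_two_u32(unsigned int n)
{
	unsigned int p = 1;

	while (p < n)
		p <<= 1;
	return p;
}

/* Clamp the listen() backlog by tcp_max_syn_backlog, raise it to at
 * least 8, then round up to a power of two; the +1 keeps the table
 * strictly larger than the backlog so the hash mask works out. */
static unsigned int synq_entries(unsigned int backlog,
				 unsigned int max_syn_backlog)
{
	if (backlog > max_syn_backlog)
		backlog = max_syn_backlog;
	if (backlog < 8)
		backlog = 8;
	return roundup_pow_of_two_u32(backlog + 1);
}

int main(void)
{
	unsigned int entries = synq_entries(128, 1024);

	/* On x86_64 a struct request_sock * is 8 bytes. */
	printf("entries=%u table=%zu bytes\n", entries,
	       entries * sizeof(void *));
	/* Prints entries=256 table=2048 bytes: under PAGE_SIZE, so the
	 * kernel would take the kzalloc() path; larger backlogs exceed
	 * PAGE_SIZE and take __vmalloc() instead. */
	return 0;
}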
This commit is contained in:

parent 3c62f75aac
commit 72a3effaf6

9 changed files with 38 additions and 24 deletions
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -28,8 +28,8 @@ struct proto;
 
 struct request_sock_ops {
 	int		family;
-	kmem_cache_t	*slab;
 	int		obj_size;
+	kmem_cache_t	*slab;
 	int		(*rtx_syn_ack)(struct sock *sk,
 				       struct request_sock *req,
 				       struct dst_entry *dst);
@@ -51,13 +51,13 @@ struct request_sock {
 	u32				rcv_wnd;	  /* rcv_wnd offered first time */
 	u32				ts_recent;
 	unsigned long			expires;
-	struct request_sock_ops		*rsk_ops;
+	const struct request_sock_ops	*rsk_ops;
 	struct sock			*sk;
 	u32				secid;
 	u32				peer_secid;
 };
 
-static inline struct request_sock *reqsk_alloc(struct request_sock_ops *ops)
+static inline struct request_sock *reqsk_alloc(const struct request_sock_ops *ops)
 {
 	struct request_sock *req = kmem_cache_alloc(ops->slab, SLAB_ATOMIC);
 
@@ -121,7 +121,7 @@ struct request_sock_queue {
 };
 
 extern int reqsk_queue_alloc(struct request_sock_queue *queue,
-			     const int nr_table_entries);
+			     unsigned int nr_table_entries);
 
 static inline struct listen_sock *reqsk_queue_yank_listen_sk(struct request_sock_queue *queue)
 {
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -138,7 +138,6 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
 #define MAX_TCP_SYNCNT		127
 
 #define TCP_SYNQ_INTERVAL	(HZ/5)	/* Period of SYNACK timer */
-#define TCP_SYNQ_HSIZE		512	/* Size of SYNACK hash table */
 
 #define TCP_PAWS_24DAYS	(60 * 60 * 24 * 24)
 #define TCP_PAWS_MSL	60		/* Per-host timestamps are invalidated
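With TCP_SYNQ_HSIZE removed, the hunks that follow switch every former user of the constant over to the backlog actually supplied by the application, and derive the per-socket table size in reqsk_queue_alloc() instead.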
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -15,6 +15,7 @@
 #include <linux/random.h>
 #include <linux/slab.h>
 #include <linux/string.h>
+#include <linux/vmalloc.h>
 
 #include <net/request_sock.h>
 
@@ -29,22 +30,31 @@
  * it is absolutely not enough even at 100conn/sec. 256 cures most
  * of problems. This value is adjusted to 128 for very small machines
  * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
- * Further increasing requires to change hash table size.
+ * Note : Dont forget somaxconn that may limit backlog too.
  */
 int sysctl_max_syn_backlog = 256;
 
 int reqsk_queue_alloc(struct request_sock_queue *queue,
-		      const int nr_table_entries)
+		      unsigned int nr_table_entries)
 {
-	const int lopt_size = sizeof(struct listen_sock) +
-			      nr_table_entries * sizeof(struct request_sock *);
-	struct listen_sock *lopt = kzalloc(lopt_size, GFP_KERNEL);
+	size_t lopt_size = sizeof(struct listen_sock);
+	struct listen_sock *lopt;
 
+	nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
+	nr_table_entries = max_t(u32, nr_table_entries, 8);
+	nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
+	lopt_size += nr_table_entries * sizeof(struct request_sock *);
+	if (lopt_size > PAGE_SIZE)
+		lopt = __vmalloc(lopt_size,
+			GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
+			PAGE_KERNEL);
+	else
+		lopt = kzalloc(lopt_size, GFP_KERNEL);
 	if (lopt == NULL)
 		return -ENOMEM;
 
-	for (lopt->max_qlen_log = 6;
-	     (1 << lopt->max_qlen_log) < sysctl_max_syn_backlog;
+	for (lopt->max_qlen_log = 3;
+	     (1 << lopt->max_qlen_log) < nr_table_entries;
 	     lopt->max_qlen_log++);
 
 	get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
@@ -65,9 +75,11 @@ void reqsk_queue_destroy(struct request_sock_queue *queue)
 {
 	/* make all the listen_opt local to us */
 	struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue);
+	size_t lopt_size = sizeof(struct listen_sock) +
+		lopt->nr_table_entries * sizeof(struct request_sock *);
 
 	if (lopt->qlen != 0) {
-		int i;
+		unsigned int i;
 
 		for (i = 0; i < lopt->nr_table_entries; i++) {
 			struct request_sock *req;
@@ -81,7 +93,10 @@ void reqsk_queue_destroy(struct request_sock_queue *queue)
 	}
 
 	BUG_TRAP(lopt->qlen == 0);
-	kfree(lopt);
+	if (lopt_size > PAGE_SIZE)
+		vfree(lopt);
+	else
+		kfree(lopt);
 }
 
 EXPORT_SYMBOL(reqsk_queue_destroy);
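A worked example of the allocation split above, assuming 8-byte pointers (x86_64) and a 4 KB PAGE_SIZE: listen(fd, 128) gives nr_table_entries = roundup_pow_of_two(129) = 256, so lopt_size is sizeof(struct listen_sock) plus 2048 bytes and the kzalloc() path is taken; a backlog clamped to the default tcp_max_syn_backlog of 1024 gives 2048 entries, a 16 KB table, and the __vmalloc() path. Note also that max_qlen_log is now derived from the per-socket table size (starting at 2^3 = 8) rather than from the global sysctl.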
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -1022,7 +1022,7 @@ static void dccp_v4_reqsk_destructor(struct request_sock *req)
 	kfree(inet_rsk(req)->opt);
 }
 
-static struct request_sock_ops dccp_request_sock_ops = {
+static struct request_sock_ops dccp_request_sock_ops __read_mostly = {
 	.family		= PF_INET,
 	.obj_size	= sizeof(struct dccp_request_sock),
 	.rtx_syn_ack	= dccp_v4_send_response,
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -262,12 +262,12 @@ int dccp_destroy_sock(struct sock *sk)
 
 EXPORT_SYMBOL_GPL(dccp_destroy_sock);
 
-static inline int dccp_listen_start(struct sock *sk)
+static inline int dccp_listen_start(struct sock *sk, int backlog)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
 
 	dp->dccps_role = DCCP_ROLE_LISTEN;
-	return inet_csk_listen_start(sk, TCP_SYNQ_HSIZE);
+	return inet_csk_listen_start(sk, backlog);
 }
 
 int dccp_disconnect(struct sock *sk, int flags)
@@ -788,7 +788,7 @@ int inet_dccp_listen(struct socket *sock, int backlog)
 		 * FIXME: here it probably should be sk->sk_prot->listen_start
 		 * see tcp_listen_start
 		 */
-		err = dccp_listen_start(sk);
+		err = dccp_listen_start(sk, backlog);
 		if (err)
 			goto out;
 	}
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -204,7 +204,7 @@ int inet_listen(struct socket *sock, int backlog)
 	 * we can only allow the backlog to be adjusted.
 	 */
 	if (old_state != TCP_LISTEN) {
-		err = inet_csk_listen_start(sk, TCP_SYNQ_HSIZE);
+		err = inet_csk_listen_start(sk, backlog);
 		if (err)
 			goto out;
 	}
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -343,7 +343,7 @@ struct dst_entry* inet_csk_route_req(struct sock *sk,
 EXPORT_SYMBOL_GPL(inet_csk_route_req);
 
 static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
-				 const u32 rnd, const u16 synq_hsize)
+				 const u32 rnd, const u32 synq_hsize)
 {
 	return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1);
 }
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -715,7 +715,7 @@ static struct ip_options *tcp_v4_save_options(struct sock *sk,
 	return dopt;
 }
 
-struct request_sock_ops tcp_request_sock_ops = {
+struct request_sock_ops tcp_request_sock_ops __read_mostly = {
 	.family		=	PF_INET,
 	.obj_size	=	sizeof(struct tcp_request_sock),
 	.rtx_syn_ack	=	tcp_v4_send_synack,
@@ -1385,7 +1385,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
 	if (st->state == TCP_SEQ_STATE_OPENREQ) {
 		struct request_sock *req = cur;
 
-	       	icsk = inet_csk(st->syn_wait_sk);
+		icsk = inet_csk(st->syn_wait_sk);
 		req = req->dl_next;
 		while (1) {
 			while (req) {
@@ -1395,7 +1395,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
 				}
 				req = req->dl_next;
 			}
-			if (++st->sbucket >= TCP_SYNQ_HSIZE)
+			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
 				break;
 get_req:
 			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -526,7 +526,7 @@ static void tcp_v6_reqsk_destructor(struct request_sock *req)
 		kfree_skb(inet6_rsk(req)->pktopts);
 }
 
-static struct request_sock_ops tcp6_request_sock_ops = {
+static struct request_sock_ops tcp6_request_sock_ops __read_mostly = {
 	.family		=	AF_INET6,
 	.obj_size	=	sizeof(struct tcp6_request_sock),
 	.rtx_syn_ack	=	tcp_v6_send_synack,