forked from mirrors/linux
		
	net: Allow accepted sockets to be bound to l3mdev domain
Allow accepted sockets to derive their sk_bound_dev_if setting from the l3mdev domain in which the packets originated. A sysctl setting is added to control the behavior, which is similar to sk_mark and sysctl_tcp_fwmark_accept. This effectively allows a process to have a "VRF-global" listen socket, with child sockets bound to the VRF device in which the packet originated. A similar behavior can be achieved using sk_mark, but a solution using marks is incomplete as it does not handle duplicate addresses in different L3 domains/VRFs. Allowing sockets to inherit the sk_bound_dev_if from the l3mdev domain provides a complete solution. Signed-off-by: David Ahern <dsa@cumulusnetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
		
							parent
							
								
									1a8524794f
								
							
						
					
					
						commit
						6dd9a14e92
					
				
					 8 changed files with 42 additions and 5 deletions
				
			
		|  | @ -335,6 +335,14 @@ tcp_keepalive_intvl - INTEGER | |||
| 	after probes started. Default value: 75sec i.e. connection | ||||
| 	will be aborted after ~11 minutes of retries. | ||||
| 
 | ||||
| tcp_l3mdev_accept - BOOLEAN | ||||
| 	Enables child sockets to inherit the L3 master device index. | ||||
| 	Enabling this option allows a "global" listen socket to work | ||||
| 	across L3 master domains (e.g., VRFs) with connected sockets | ||||
| 	derived from the listen socket to be bound to the L3 domain in | ||||
| 	which the packets originated. Only valid when the kernel was | ||||
| 	compiled with CONFIG_NET_L3_MASTER_DEV. | ||||
| 
 | ||||
| tcp_low_latency - BOOLEAN | ||||
| 	If set, the TCP stack makes decisions that prefer lower | ||||
| 	latency as opposed to higher throughput.  By default, this | ||||
|  |  | |||
|  | @ -28,6 +28,7 @@ | |||
| #include <net/request_sock.h> | ||||
| #include <net/netns/hash.h> | ||||
| #include <net/tcp_states.h> | ||||
| #include <net/l3mdev.h> | ||||
| 
 | ||||
| /** struct ip_options - IP Options
 | ||||
|  * | ||||
|  | @ -113,6 +114,19 @@ static inline u32 inet_request_mark(const struct sock *sk, struct sk_buff *skb) | |||
| 	return sk->sk_mark; | ||||
| } | ||||
| 
 | ||||
| static inline int inet_request_bound_dev_if(const struct sock *sk, | ||||
| 					    struct sk_buff *skb) | ||||
| { | ||||
| #ifdef CONFIG_NET_L3_MASTER_DEV | ||||
| 	struct net *net = sock_net(sk); | ||||
| 
 | ||||
| 	if (!sk->sk_bound_dev_if && net->ipv4.sysctl_tcp_l3mdev_accept) | ||||
| 		return l3mdev_master_ifindex_by_index(net, skb->skb_iif); | ||||
| #endif | ||||
| 
 | ||||
| 	return sk->sk_bound_dev_if; | ||||
| } | ||||
| 
 | ||||
| struct inet_cork { | ||||
| 	unsigned int		flags; | ||||
| 	__be32			addr; | ||||
|  |  | |||
|  | @ -86,6 +86,9 @@ struct netns_ipv4 { | |||
| 
 | ||||
| 	int sysctl_fwmark_reflect; | ||||
| 	int sysctl_tcp_fwmark_accept; | ||||
| #ifdef CONFIG_NET_L3_MASTER_DEV | ||||
| 	int sysctl_tcp_l3mdev_accept; | ||||
| #endif | ||||
| 	int sysctl_tcp_mtu_probing; | ||||
| 	int sysctl_tcp_base_mss; | ||||
| 	int sysctl_tcp_probe_threshold; | ||||
|  |  | |||
|  | @ -351,7 +351,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) | |||
| 	treq->snt_synack.v64	= 0; | ||||
| 	treq->tfo_listener	= false; | ||||
| 
 | ||||
| 	ireq->ir_iif = sk->sk_bound_dev_if; | ||||
| 	ireq->ir_iif = inet_request_bound_dev_if(sk, skb); | ||||
| 
 | ||||
| 	/* We threw the options of the initial SYN away, so we hope
 | ||||
| 	 * the ACK carries the same options again (see RFC1122 4.2.3.8) | ||||
|  | @ -371,7 +371,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) | |||
| 	 * hasn't changed since we received the original syn, but I see | ||||
| 	 * no easy way to do this. | ||||
| 	 */ | ||||
| 	flowi4_init_output(&fl4, sk->sk_bound_dev_if, ireq->ir_mark, | ||||
| 	flowi4_init_output(&fl4, ireq->ir_iif, ireq->ir_mark, | ||||
| 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP, | ||||
| 			   inet_sk_flowi_flags(sk), | ||||
| 			   opt->srr ? opt->faddr : ireq->ir_rmt_addr, | ||||
|  |  | |||
|  | @ -915,6 +915,17 @@ static struct ctl_table ipv4_net_table[] = { | |||
| 		.mode		= 0644, | ||||
| 		.proc_handler	= proc_dointvec, | ||||
| 	}, | ||||
| #ifdef CONFIG_NET_L3_MASTER_DEV | ||||
| 	{ | ||||
| 		.procname	= "tcp_l3mdev_accept", | ||||
| 		.data		= &init_net.ipv4.sysctl_tcp_l3mdev_accept, | ||||
| 		.maxlen		= sizeof(int), | ||||
| 		.mode		= 0644, | ||||
| 		.proc_handler	= proc_dointvec_minmax, | ||||
| 		.extra1		= &zero, | ||||
| 		.extra2		= &one, | ||||
| 	}, | ||||
| #endif | ||||
| 	{ | ||||
| 		.procname	= "tcp_mtu_probing", | ||||
| 		.data		= &init_net.ipv4.sysctl_tcp_mtu_probing, | ||||
|  |  | |||
|  | @ -6204,7 +6204,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, | |||
| 	tcp_openreq_init(req, &tmp_opt, skb, sk); | ||||
| 
 | ||||
| 	/* Note: tcp_v6_init_req() might override ir_iif for link locals */ | ||||
| 	inet_rsk(req)->ir_iif = sk->sk_bound_dev_if; | ||||
| 	inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb); | ||||
| 
 | ||||
| 	af_ops->init_req(req, sk, skb); | ||||
| 
 | ||||
|  |  | |||
|  | @ -1276,6 +1276,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, | |||
| 	ireq		      = inet_rsk(req); | ||||
| 	sk_daddr_set(newsk, ireq->ir_rmt_addr); | ||||
| 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); | ||||
| 	newsk->sk_bound_dev_if = ireq->ir_iif; | ||||
| 	newinet->inet_saddr	      = ireq->ir_loc_addr; | ||||
| 	inet_opt	      = ireq->opt; | ||||
| 	rcu_assign_pointer(newinet->inet_opt, inet_opt); | ||||
|  |  | |||
|  | @ -193,7 +193,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) | |||
| 		ireq->pktopts = skb; | ||||
| 	} | ||||
| 
 | ||||
| 	ireq->ir_iif = sk->sk_bound_dev_if; | ||||
| 	ireq->ir_iif = inet_request_bound_dev_if(sk, skb); | ||||
| 	/* So that link locals have meaning */ | ||||
| 	if (!sk->sk_bound_dev_if && | ||||
| 	    ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) | ||||
|  | @ -224,7 +224,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) | |||
| 		fl6.daddr = ireq->ir_v6_rmt_addr; | ||||
| 		final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final); | ||||
| 		fl6.saddr = ireq->ir_v6_loc_addr; | ||||
| 		fl6.flowi6_oif = sk->sk_bound_dev_if; | ||||
| 		fl6.flowi6_oif = ireq->ir_iif; | ||||
| 		fl6.flowi6_mark = ireq->ir_mark; | ||||
| 		fl6.fl6_dport = ireq->ir_rmt_port; | ||||
| 		fl6.fl6_sport = inet_sk(sk)->inet_sport; | ||||
|  |  | |||
		Loading…
	
		Reference in a new issue
	
	 David Ahern
						David Ahern