	ipv4: tcp: get rid of ugly unicast_sock
In commit be9f4a44e7 ("ipv4: tcp: remove per net tcp_sock") I tried to
address contention on a socket lock, but the solution I chose was horrible:

commit 3a7c384ffd ("ipv4: tcp: unicast_sock should not land outside of TCP stack")
addressed a selinux regression.

commit 0980e56e50 ("ipv4: tcp: set unicast_sock uc_ttl to -1") took care of
another regression.

commit b5ec8eeac4 ("ipv4: fix ip_send_skb()") fixed another regression.

commit 811230cd85 ("tcp: ipv4: initialize unicast_sock sk_pacing_rate") was
another shot in the dark.

Really, just use a proper socket per cpu, and remove the skb_orphan() call,
to re-enable flow control.

This solves a serious problem with FQ packet scheduler when used in hostile
environments, as we do not want to allocate a flow structure for every RST
packet sent in response to a spoofed packet.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 0d32ef8cef
commit bdbbb8527b

4 changed files with 40 additions and 36 deletions
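The change the message describes, real per-cpu kernel sockets instead of the fake static percpu inet_sock, stores one socket per possible CPU in net->ipv4.tcp_sk and sets them up once per network namespace. A condensed sketch of that pattern follows (the helper name tcp_sk_init_sketch and the trimmed error handling are mine for illustration; the actual tcp_sk_init/tcp_sk_exit pair is in the net/ipv4/tcp_ipv4.c hunk below):

/* Illustration only: one kernel-internal TCP control socket per possible
 * CPU, hung off the network namespace. The reply path then uses the
 * current CPU's socket, avoiding the shared-socket contention and the
 * refcount games of the old fake unicast_sock.
 */
static int tcp_sk_init_sketch(struct net *net)
{
	int res, cpu;

	net->ipv4.tcp_sk = alloc_percpu(struct sock *);	/* pointer per CPU */
	if (!net->ipv4.tcp_sk)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		/* kernel-internal control socket, not exposed to userspace */
		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, net);
		if (res)
			return res;	/* the real patch unwinds via tcp_sk_exit() */
		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
	}
	return 0;
}

On the send side, tcp_v4_send_reset() and tcp_v4_send_ack() pass *this_cpu_ptr(net->ipv4.tcp_sk) straight into ip_send_unicast_reply(), which is why the old get_cpu_var(unicast_sock)/put_cpu_var() dance and the skb_orphan() call can go, as the hunks below show.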
				
			
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -181,7 +181,7 @@ static inline __u8 ip_reply_arg_flowi_flags(const struct ip_reply_arg *arg)
 	return (arg->flags & IP_REPLY_ARG_NOSRCCHECK) ? FLOWI_FLAG_ANYSRC : 0;
 }
 
-void ip_send_unicast_reply(struct net *net, struct sk_buff *skb,
+void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
 			   const struct ip_options *sopt,
 			   __be32 daddr, __be32 saddr,
 			   const struct ip_reply_arg *arg,
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -52,6 +52,7 @@ struct netns_ipv4 {
 	struct inet_peer_base	*peers;
 	struct tcpm_hash_bucket	*tcp_metrics_hash;
 	unsigned int		tcp_metrics_hash_log;
+	struct sock  * __percpu	*tcp_sk;
 	struct netns_frags	frags;
 #ifdef CONFIG_NETFILTER
 	struct xt_table		*iptable_filter;
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1506,24 +1506,8 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset,
 /*
  *	Generic function to send a packet as reply to another packet.
  *	Used to send some TCP resets/acks so far.
- *
- *	Use a fake percpu inet socket to avoid false sharing and contention.
  */
-static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = {
-	.sk = {
-		.__sk_common = {
-			.skc_refcnt = ATOMIC_INIT(1),
-		},
-		.sk_wmem_alloc	= ATOMIC_INIT(1),
-		.sk_allocation	= GFP_ATOMIC,
-		.sk_flags	= (1UL << SOCK_USE_WRITE_QUEUE),
-		.sk_pacing_rate = ~0U,
-	},
-	.pmtudisc	= IP_PMTUDISC_WANT,
-	.uc_ttl		= -1,
-};
-
-void ip_send_unicast_reply(struct net *net, struct sk_buff *skb,
+void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
 			   const struct ip_options *sopt,
 			   __be32 daddr, __be32 saddr,
 			   const struct ip_reply_arg *arg,
@@ -1533,9 +1517,8 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb,
 	struct ipcm_cookie ipc;
 	struct flowi4 fl4;
 	struct rtable *rt = skb_rtable(skb);
+	struct net *net = sock_net(sk);
 	struct sk_buff *nskb;
-	struct sock *sk;
-	struct inet_sock *inet;
 	int err;
 
 	if (__ip_options_echo(&replyopts.opt.opt, skb, sopt))
@@ -1566,15 +1549,11 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb,
 	if (IS_ERR(rt))
 		return;
 
-	inet = &get_cpu_var(unicast_sock);
+	inet_sk(sk)->tos = arg->tos;
 
-	inet->tos = arg->tos;
-	sk = &inet->sk;
 	sk->sk_priority = skb->priority;
 	sk->sk_protocol = ip_hdr(skb)->protocol;
 	sk->sk_bound_dev_if = arg->bound_dev_if;
-	sock_net_set(sk, net);
-	__skb_queue_head_init(&sk->sk_write_queue);
 	sk->sk_sndbuf = sysctl_wmem_default;
 	err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base,
 			     len, 0, &ipc, &rt, MSG_DONTWAIT);
@@ -1590,13 +1569,10 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb,
 			  arg->csumoffset) = csum_fold(csum_add(nskb->csum,
 								arg->csum));
 		nskb->ip_summed = CHECKSUM_NONE;
-		skb_orphan(nskb);
 		skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb));
 		ip_push_pending_frames(sk, &fl4);
 	}
 out:
-	put_cpu_var(unicast_sock);
-
 	ip_rt_put(rt);
 }
 
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -683,7 +683,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 		arg.bound_dev_if = sk->sk_bound_dev_if;
 
 	arg.tos = ip_hdr(skb)->tos;
-	ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt,
+	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
+			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 			      &arg, arg.iov[0].iov_len);
 
@@ -767,7 +768,8 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
 	if (oif)
 		arg.bound_dev_if = oif;
 	arg.tos = tos;
-	ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt,
+	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
+			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 			      &arg, arg.iov[0].iov_len);
 
@@ -2428,14 +2430,39 @@ struct proto tcp_prot = {
 };
 EXPORT_SYMBOL(tcp_prot);
 
-static int __net_init tcp_sk_init(struct net *net)
-{
-	net->ipv4.sysctl_tcp_ecn = 2;
-	return 0;
-}
-
 static void __net_exit tcp_sk_exit(struct net *net)
 {
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
+	free_percpu(net->ipv4.tcp_sk);
+}
+
+static int __net_init tcp_sk_init(struct net *net)
+{
+	int res, cpu;
+
+	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
+	if (!net->ipv4.tcp_sk)
+		return -ENOMEM;
+
+	for_each_possible_cpu(cpu) {
+		struct sock *sk;
+
+		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
+					   IPPROTO_TCP, net);
+		if (res)
+			goto fail;
+		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
+	}
+	net->ipv4.sysctl_tcp_ecn = 2;
+	return 0;
+
+fail:
+	tcp_sk_exit(net);
+
+	return res;
 }
 
 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)