forked from mirrors/linux
		
	Merge branch 'udp-msg_zerocopy'
Willem de Bruijn says:

====================
udp msg_zerocopy

Enable MSG_ZEROCOPY for udp sockets

Patch 1/3 is the main patch, a rework of RFC patch
  http://patchwork.ozlabs.org/patch/899630/
  more details in the patch commit message

Patch 2/3 is an optimization to remove a branch from the UDP hot path
  and refcount_inc/refcount_dec_and_test pair when zerocopy is used.
  This used to be included in the first patch in v2.

Patch 3/3 runs the already existing udp zerocopy tests
  as part of kselftest

See also recent Linux Plumbers presentation
  https://linuxplumbersconf.org/event/2/contributions/106/attachments/104/128/willemdebruijn-lpc2018-udpgso-presentation-20181113.pdf

Changes:
  v1 -> v2
    - Fixup reverse christmas tree violation
  v2 -> v3
    - Split refcount avoidance optimization into separate patch
    - Fix refcount leak on error in fragmented case
      (thanks to Paolo Abeni for pointing this one out!)
    - Fix refcount inc on zero
  v3 -> v4
    - Move skb_zcopy_set below the only kfree_skb that might cause
      a premature uarg destroy before skb_zerocopy_put_abort
    - Move the entire skb_shinfo assignment block, to keep that
      cacheline access in one place
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
		
						commit
						6e360f7331
					
				
					 9 changed files with 90 additions and 27 deletions
				
			
		| 
						 | 
				
			
			@ -481,10 +481,11 @@ static inline void sock_zerocopy_get(struct ubuf_info *uarg)
 | 
			
		|||
}
 | 
			
		||||
 | 
			
		||||
void sock_zerocopy_put(struct ubuf_info *uarg);
 | 
			
		||||
void sock_zerocopy_put_abort(struct ubuf_info *uarg);
 | 
			
		||||
void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref);
 | 
			
		||||
 | 
			
		||||
void sock_zerocopy_callback(struct ubuf_info *uarg, bool success);
 | 
			
		||||
 | 
			
		||||
int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len);
 | 
			
		||||
int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
 | 
			
		||||
			     struct msghdr *msg, int len,
 | 
			
		||||
			     struct ubuf_info *uarg);
 | 
			
		||||
| 
						 | 
				
			
			@ -1325,9 +1326,13 @@ static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb)
 | 
			
		|||
	return is_zcopy ? skb_uarg(skb) : NULL;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg)
 | 
			
		||||
static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg,
 | 
			
		||||
				 bool *have_ref)
 | 
			
		||||
{
 | 
			
		||||
	if (skb && uarg && !skb_zcopy(skb)) {
 | 
			
		||||
		if (unlikely(have_ref && *have_ref))
 | 
			
		||||
			*have_ref = false;
 | 
			
		||||
		else
 | 
			
		||||
			sock_zerocopy_get(uarg);
 | 
			
		||||
		skb_shinfo(skb)->destructor_arg = uarg;
 | 
			
		||||
		skb_shinfo(skb)->tx_flags |= SKBTX_ZEROCOPY_FRAG;
 | 
			
		||||
| 
						 | 
				
			
			@ -1373,7 +1378,7 @@ static inline void skb_zcopy_abort(struct sk_buff *skb)
 | 
			
		|||
	struct ubuf_info *uarg = skb_zcopy(skb);
 | 
			
		||||
 | 
			
		||||
	if (uarg) {
 | 
			
		||||
		sock_zerocopy_put_abort(uarg);
 | 
			
		||||
		sock_zerocopy_put_abort(uarg, false);
 | 
			
		||||
		skb_shinfo(skb)->tx_flags &= ~SKBTX_ZEROCOPY_FRAG;
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1089,7 +1089,7 @@ void sock_zerocopy_put(struct ubuf_info *uarg)
 | 
			
		|||
}
 | 
			
		||||
EXPORT_SYMBOL_GPL(sock_zerocopy_put);
 | 
			
		||||
 | 
			
		||||
void sock_zerocopy_put_abort(struct ubuf_info *uarg)
 | 
			
		||||
void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
 | 
			
		||||
{
 | 
			
		||||
	if (uarg) {
 | 
			
		||||
		struct sock *sk = skb_from_uarg(uarg)->sk;
 | 
			
		||||
| 
						 | 
				
			
			@ -1097,6 +1097,7 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg)
 | 
			
		|||
		atomic_dec(&sk->sk_zckey);
 | 
			
		||||
		uarg->len--;
 | 
			
		||||
 | 
			
		||||
		if (have_uref)
 | 
			
		||||
			sock_zerocopy_put(uarg);
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
| 
						 | 
				
			
			@ -1105,6 +1106,12 @@ EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort);
 | 
			
		|||
extern int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
 | 
			
		||||
				   struct iov_iter *from, size_t length);
 | 
			
		||||
 | 
			
		||||
int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len)
 | 
			
		||||
{
 | 
			
		||||
	return __zerocopy_sg_from_iter(skb->sk, skb, &msg->msg_iter, len);
 | 
			
		||||
}
 | 
			
		||||
EXPORT_SYMBOL_GPL(skb_zerocopy_iter_dgram);
 | 
			
		||||
 | 
			
		||||
int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
 | 
			
		||||
			     struct msghdr *msg, int len,
 | 
			
		||||
			     struct ubuf_info *uarg)
 | 
			
		||||
| 
						 | 
				
			
			@ -1131,7 +1138,7 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
 | 
			
		|||
		return err;
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	skb_zcopy_set(skb, uarg);
 | 
			
		||||
	skb_zcopy_set(skb, uarg, NULL);
 | 
			
		||||
	return skb->len - orig_len;
 | 
			
		||||
}
 | 
			
		||||
EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream);
 | 
			
		||||
| 
						 | 
				
			
			@ -1151,7 +1158,7 @@ static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig,
 | 
			
		|||
			if (skb_copy_ubufs(nskb, GFP_ATOMIC))
 | 
			
		||||
				return -EIO;
 | 
			
		||||
		}
 | 
			
		||||
		skb_zcopy_set(nskb, skb_uarg(orig));
 | 
			
		||||
		skb_zcopy_set(nskb, skb_uarg(orig), NULL);
 | 
			
		||||
	}
 | 
			
		||||
	return 0;
 | 
			
		||||
}
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1018,7 +1018,10 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 | 
			
		|||
 | 
			
		||||
	case SO_ZEROCOPY:
 | 
			
		||||
		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
 | 
			
		||||
			if (sk->sk_protocol != IPPROTO_TCP)
 | 
			
		||||
			if (!((sk->sk_type == SOCK_STREAM &&
 | 
			
		||||
			       sk->sk_protocol == IPPROTO_TCP) ||
 | 
			
		||||
			      (sk->sk_type == SOCK_DGRAM &&
 | 
			
		||||
			       sk->sk_protocol == IPPROTO_UDP)))
 | 
			
		||||
				ret = -ENOTSUPP;
 | 
			
		||||
		} else if (sk->sk_family != PF_RDS) {
 | 
			
		||||
			ret = -ENOTSUPP;
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -867,6 +867,7 @@ static int __ip_append_data(struct sock *sk,
 | 
			
		|||
			    unsigned int flags)
 | 
			
		||||
{
 | 
			
		||||
	struct inet_sock *inet = inet_sk(sk);
 | 
			
		||||
	struct ubuf_info *uarg = NULL;
 | 
			
		||||
	struct sk_buff *skb;
 | 
			
		||||
 | 
			
		||||
	struct ip_options *opt = cork->opt;
 | 
			
		||||
| 
						 | 
				
			
			@ -880,8 +881,8 @@ static int __ip_append_data(struct sock *sk,
 | 
			
		|||
	int csummode = CHECKSUM_NONE;
 | 
			
		||||
	struct rtable *rt = (struct rtable *)cork->dst;
 | 
			
		||||
	unsigned int wmem_alloc_delta = 0;
 | 
			
		||||
	bool paged, extra_uref;
 | 
			
		||||
	u32 tskey = 0;
 | 
			
		||||
	bool paged;
 | 
			
		||||
 | 
			
		||||
	skb = skb_peek_tail(queue);
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -916,6 +917,20 @@ static int __ip_append_data(struct sock *sk,
 | 
			
		|||
	    (!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM)))
 | 
			
		||||
		csummode = CHECKSUM_PARTIAL;
 | 
			
		||||
 | 
			
		||||
	if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
 | 
			
		||||
		uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
 | 
			
		||||
		if (!uarg)
 | 
			
		||||
			return -ENOBUFS;
 | 
			
		||||
		extra_uref = true;
 | 
			
		||||
		if (rt->dst.dev->features & NETIF_F_SG &&
 | 
			
		||||
		    csummode == CHECKSUM_PARTIAL) {
 | 
			
		||||
			paged = true;
 | 
			
		||||
		} else {
 | 
			
		||||
			uarg->zerocopy = 0;
 | 
			
		||||
			skb_zcopy_set(skb, uarg, &extra_uref);
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	cork->length += length;
 | 
			
		||||
 | 
			
		||||
	/* So, what's going on in the loop below?
 | 
			
		||||
| 
						 | 
				
			
			@ -1001,12 +1016,6 @@ static int __ip_append_data(struct sock *sk,
 | 
			
		|||
			skb->csum = 0;
 | 
			
		||||
			skb_reserve(skb, hh_len);
 | 
			
		||||
 | 
			
		||||
			/* only the initial fragment is time stamped */
 | 
			
		||||
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
 | 
			
		||||
			cork->tx_flags = 0;
 | 
			
		||||
			skb_shinfo(skb)->tskey = tskey;
 | 
			
		||||
			tskey = 0;
 | 
			
		||||
 | 
			
		||||
			/*
 | 
			
		||||
			 *	Find where to start putting bytes.
 | 
			
		||||
			 */
 | 
			
		||||
| 
						 | 
				
			
			@ -1039,6 +1048,13 @@ static int __ip_append_data(struct sock *sk,
 | 
			
		|||
			exthdrlen = 0;
 | 
			
		||||
			csummode = CHECKSUM_NONE;
 | 
			
		||||
 | 
			
		||||
			/* only the initial fragment is time stamped */
 | 
			
		||||
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
 | 
			
		||||
			cork->tx_flags = 0;
 | 
			
		||||
			skb_shinfo(skb)->tskey = tskey;
 | 
			
		||||
			tskey = 0;
 | 
			
		||||
			skb_zcopy_set(skb, uarg, &extra_uref);
 | 
			
		||||
 | 
			
		||||
			if ((flags & MSG_CONFIRM) && !skb_prev)
 | 
			
		||||
				skb_set_dst_pending_confirm(skb, 1);
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -1068,7 +1084,7 @@ static int __ip_append_data(struct sock *sk,
 | 
			
		|||
				err = -EFAULT;
 | 
			
		||||
				goto error;
 | 
			
		||||
			}
 | 
			
		||||
		} else {
 | 
			
		||||
		} else if (!uarg || !uarg->zerocopy) {
 | 
			
		||||
			int i = skb_shinfo(skb)->nr_frags;
 | 
			
		||||
 | 
			
		||||
			err = -ENOMEM;
 | 
			
		||||
| 
						 | 
				
			
			@ -1098,6 +1114,10 @@ static int __ip_append_data(struct sock *sk,
 | 
			
		|||
			skb->data_len += copy;
 | 
			
		||||
			skb->truesize += copy;
 | 
			
		||||
			wmem_alloc_delta += copy;
 | 
			
		||||
		} else {
 | 
			
		||||
			err = skb_zerocopy_iter_dgram(skb, from, copy);
 | 
			
		||||
			if (err < 0)
 | 
			
		||||
				goto error;
 | 
			
		||||
		}
 | 
			
		||||
		offset += copy;
 | 
			
		||||
		length -= copy;
 | 
			
		||||
| 
						 | 
				
			
			@ -1110,6 +1130,7 @@ static int __ip_append_data(struct sock *sk,
 | 
			
		|||
error_efault:
 | 
			
		||||
	err = -EFAULT;
 | 
			
		||||
error:
 | 
			
		||||
	sock_zerocopy_put_abort(uarg, extra_uref);
 | 
			
		||||
	cork->length -= length;
 | 
			
		||||
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
 | 
			
		||||
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1423,7 +1423,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 | 
			
		|||
	if (copied + copied_syn)
 | 
			
		||||
		goto out;
 | 
			
		||||
out_err:
 | 
			
		||||
	sock_zerocopy_put_abort(uarg);
 | 
			
		||||
	sock_zerocopy_put_abort(uarg, true);
 | 
			
		||||
	err = sk_stream_error(sk, flags, err);
 | 
			
		||||
	/* make sure we wake any epoll edge trigger waiter */
 | 
			
		||||
	if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1245,6 +1245,7 @@ static int __ip6_append_data(struct sock *sk,
 | 
			
		|||
{
 | 
			
		||||
	struct sk_buff *skb, *skb_prev = NULL;
 | 
			
		||||
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
 | 
			
		||||
	struct ubuf_info *uarg = NULL;
 | 
			
		||||
	int exthdrlen = 0;
 | 
			
		||||
	int dst_exthdrlen = 0;
 | 
			
		||||
	int hh_len;
 | 
			
		||||
| 
						 | 
				
			
			@ -1257,7 +1258,7 @@ static int __ip6_append_data(struct sock *sk,
 | 
			
		|||
	int csummode = CHECKSUM_NONE;
 | 
			
		||||
	unsigned int maxnonfragsize, headersize;
 | 
			
		||||
	unsigned int wmem_alloc_delta = 0;
 | 
			
		||||
	bool paged;
 | 
			
		||||
	bool paged, extra_uref;
 | 
			
		||||
 | 
			
		||||
	skb = skb_peek_tail(queue);
 | 
			
		||||
	if (!skb) {
 | 
			
		||||
| 
						 | 
				
			
			@ -1322,6 +1323,20 @@ static int __ip6_append_data(struct sock *sk,
 | 
			
		|||
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
 | 
			
		||||
		csummode = CHECKSUM_PARTIAL;
 | 
			
		||||
 | 
			
		||||
	if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
 | 
			
		||||
		uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
 | 
			
		||||
		if (!uarg)
 | 
			
		||||
			return -ENOBUFS;
 | 
			
		||||
		extra_uref = true;
 | 
			
		||||
		if (rt->dst.dev->features & NETIF_F_SG &&
 | 
			
		||||
		    csummode == CHECKSUM_PARTIAL) {
 | 
			
		||||
			paged = true;
 | 
			
		||||
		} else {
 | 
			
		||||
			uarg->zerocopy = 0;
 | 
			
		||||
			skb_zcopy_set(skb, uarg, &extra_uref);
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	/*
 | 
			
		||||
	 * Let's try using as much space as possible.
 | 
			
		||||
	 * Use MTU if total length of the message fits into the MTU.
 | 
			
		||||
| 
						 | 
				
			
			@ -1440,12 +1455,6 @@ static int __ip6_append_data(struct sock *sk,
 | 
			
		|||
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
 | 
			
		||||
				    dst_exthdrlen);
 | 
			
		||||
 | 
			
		||||
			/* Only the initial fragment is time stamped */
 | 
			
		||||
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
 | 
			
		||||
			cork->tx_flags = 0;
 | 
			
		||||
			skb_shinfo(skb)->tskey = tskey;
 | 
			
		||||
			tskey = 0;
 | 
			
		||||
 | 
			
		||||
			/*
 | 
			
		||||
			 *	Find where to start putting bytes
 | 
			
		||||
			 */
 | 
			
		||||
| 
						 | 
				
			
			@ -1477,6 +1486,13 @@ static int __ip6_append_data(struct sock *sk,
 | 
			
		|||
			exthdrlen = 0;
 | 
			
		||||
			dst_exthdrlen = 0;
 | 
			
		||||
 | 
			
		||||
			/* Only the initial fragment is time stamped */
 | 
			
		||||
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
 | 
			
		||||
			cork->tx_flags = 0;
 | 
			
		||||
			skb_shinfo(skb)->tskey = tskey;
 | 
			
		||||
			tskey = 0;
 | 
			
		||||
			skb_zcopy_set(skb, uarg, &extra_uref);
 | 
			
		||||
 | 
			
		||||
			if ((flags & MSG_CONFIRM) && !skb_prev)
 | 
			
		||||
				skb_set_dst_pending_confirm(skb, 1);
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -1506,7 +1522,7 @@ static int __ip6_append_data(struct sock *sk,
 | 
			
		|||
				err = -EFAULT;
 | 
			
		||||
				goto error;
 | 
			
		||||
			}
 | 
			
		||||
		} else {
 | 
			
		||||
		} else if (!uarg || !uarg->zerocopy) {
 | 
			
		||||
			int i = skb_shinfo(skb)->nr_frags;
 | 
			
		||||
 | 
			
		||||
			err = -ENOMEM;
 | 
			
		||||
| 
						 | 
				
			
			@ -1536,6 +1552,10 @@ static int __ip6_append_data(struct sock *sk,
 | 
			
		|||
			skb->data_len += copy;
 | 
			
		||||
			skb->truesize += copy;
 | 
			
		||||
			wmem_alloc_delta += copy;
 | 
			
		||||
		} else {
 | 
			
		||||
			err = skb_zerocopy_iter_dgram(skb, from, copy);
 | 
			
		||||
			if (err < 0)
 | 
			
		||||
				goto error;
 | 
			
		||||
		}
 | 
			
		||||
		offset += copy;
 | 
			
		||||
		length -= copy;
 | 
			
		||||
| 
						 | 
				
			
			@ -1548,6 +1568,7 @@ static int __ip6_append_data(struct sock *sk,
 | 
			
		|||
error_efault:
 | 
			
		||||
	err = -EFAULT;
 | 
			
		||||
error:
 | 
			
		||||
	sock_zerocopy_put_abort(uarg, extra_uref);
 | 
			
		||||
	cork->length -= length;
 | 
			
		||||
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
 | 
			
		||||
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -651,12 +651,13 @@ static void do_flush_datagram(int fd, int type)
 | 
			
		|||
 | 
			
		||||
static void do_rx(int domain, int type, int protocol)
 | 
			
		||||
{
 | 
			
		||||
	const int cfg_receiver_wait_ms = 400;
 | 
			
		||||
	uint64_t tstop;
 | 
			
		||||
	int fd;
 | 
			
		||||
 | 
			
		||||
	fd = do_setup_rx(domain, type, protocol);
 | 
			
		||||
 | 
			
		||||
	tstop = gettimeofday_ms() + cfg_runtime_ms;
 | 
			
		||||
	tstop = gettimeofday_ms() + cfg_runtime_ms + cfg_receiver_wait_ms;
 | 
			
		||||
	do {
 | 
			
		||||
		if (type == SOCK_STREAM)
 | 
			
		||||
			do_flush_tcp(fd);
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -25,6 +25,8 @@ readonly path_sysctl_mem="net.core.optmem_max"
 | 
			
		|||
if [[ "$#" -eq "0" ]]; then
 | 
			
		||||
	$0 4 tcp -t 1
 | 
			
		||||
	$0 6 tcp -t 1
 | 
			
		||||
	$0 4 udp -t 1
 | 
			
		||||
	$0 6 udp -t 1
 | 
			
		||||
	echo "OK. All tests passed"
 | 
			
		||||
	exit 0
 | 
			
		||||
fi
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -35,6 +35,9 @@ run_udp() {
 | 
			
		|||
 | 
			
		||||
	echo "udp gso"
 | 
			
		||||
	run_in_netns ${args} -S 0
 | 
			
		||||
 | 
			
		||||
	echo "udp gso zerocopy"
 | 
			
		||||
	run_in_netns ${args} -S 0 -z
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
run_tcp() {
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in a new issue