	sock: MSG_ZEROCOPY notification coalescing
In the simple case, each sendmsg() call generates data and eventually a zerocopy ready notification N, where N indicates the Nth successful invocation of sendmsg() with the MSG_ZEROCOPY flag on this socket.

TCP and corked sockets can cause send() calls to append new data to an existing sk_buff and, thus, ubuf_info. In that case the notification must hold a range. Modify ubuf_info to store an inclusive range [N..N+m] and add sock_zerocopy_realloc() to optionally extend an existing range.

Also coalesce notifications in this common case: if a notification [1, 1] is about to be queued while [0, 0] is the queue tail, just modify that tail notification to read [0, 1].

Coalescing is limited to a few TSO frames worth of data to bound notification latency.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:

parent 1f8b977ab3
commit 4ab6c99d99

2 changed files with 106 additions and 10 deletions
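As context for the notification numbering in the commit message, here is a minimal send-side sketch (not part of this patch): after opting in with SO_ZEROCOPY, each successful send() with MSG_ZEROCOPY on a socket is numbered 0, 1, 2, ... and later completes through the socket error queue. With this patch, many small TCP sends like these may be acknowledged by a single coalesced [lo, hi] notification. The helper name send_zerocopy_chunks() and the fallback #defines are illustrative assumptions, and fd is assumed to be an already-connected TCP socket.

/* Minimal send-side sketch: opt in to zerocopy, then send numbered chunks. */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>

#ifndef SO_ZEROCOPY
#define SO_ZEROCOPY	60
#endif
#ifndef MSG_ZEROCOPY
#define MSG_ZEROCOPY	0x4000000
#endif

static int send_zerocopy_chunks(int fd, const char *buf, size_t chunk, int count)
{
	ssize_t ret;
	int one = 1;
	int i;

	/* MSG_ZEROCOPY is only honored after opting in with SO_ZEROCOPY */
	if (setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one)))
		return -errno;

	for (i = 0; i < count; i++) {
		/* the i-th successful call becomes notification id i */
		ret = send(fd, buf, chunk, MSG_ZEROCOPY);
		if (ret < 0)
			return -errno;
		if ((size_t)ret != chunk)
			return -EMSGSIZE;	/* partial send: keep the sketch simple */
	}
	return 0;
}

On a TCP socket the kernel may append several of these sends to one sk_buff (and hence one ubuf_info), so their completions arrive as one range rather than count individual notifications; the range is bounded to roughly a few TSO frames (1 << 19 bytes in this patch) to keep notification latency low.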
include/linux/skbuff.h

@@ -444,15 +444,26 @@ enum {
  */
 struct ubuf_info {
 	void (*callback)(struct ubuf_info *, bool zerocopy_success);
-	void *ctx;
-	unsigned long desc;
-	u16 zerocopy:1;
+	union {
+		struct {
+			unsigned long desc;
+			void *ctx;
+		};
+		struct {
+			u32 id;
+			u16 len;
+			u16 zerocopy:1;
+			u32 bytelen;
+		};
+	};
 	atomic_t refcnt;
 };
 
 #define skb_uarg(SKB)	((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg))
 
 struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size);
+struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
+					struct ubuf_info *uarg);
 
 static inline void sock_zerocopy_get(struct ubuf_info *uarg)
 {
net/core/skbuff.c

@@ -915,7 +915,9 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
 	uarg = (void *)skb->cb;
 
 	uarg->callback = sock_zerocopy_callback;
-	uarg->desc = atomic_inc_return(&sk->sk_zckey) - 1;
+	uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1;
+	uarg->len = 1;
+	uarg->bytelen = size;
 	uarg->zerocopy = 1;
 	atomic_set(&uarg->refcnt, 0);
 	sock_hold(sk);
@@ -929,26 +931,101 @@ static inline struct sk_buff *skb_from_uarg(struct ubuf_info *uarg)
 	return container_of((void *)uarg, struct sk_buff, cb);
 }
 
+struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
+					struct ubuf_info *uarg)
+{
+	if (uarg) {
+		const u32 byte_limit = 1 << 19;		/* limit to a few TSO */
+		u32 bytelen, next;
+
+		/* realloc only when socket is locked (TCP, UDP cork),
+		 * so uarg->len and sk_zckey access is serialized
+		 */
+		if (!sock_owned_by_user(sk)) {
+			WARN_ON_ONCE(1);
+			return NULL;
+		}
+
+		bytelen = uarg->bytelen + size;
+		if (uarg->len == USHRT_MAX - 1 || bytelen > byte_limit) {
+			/* TCP can create new skb to attach new uarg */
+			if (sk->sk_type == SOCK_STREAM)
+				goto new_alloc;
+			return NULL;
+		}
+
+		next = (u32)atomic_read(&sk->sk_zckey);
+		if ((u32)(uarg->id + uarg->len) == next) {
+			uarg->len++;
+			uarg->bytelen = bytelen;
+			atomic_set(&sk->sk_zckey, ++next);
+			return uarg;
+		}
+	}
+
+new_alloc:
+	return sock_zerocopy_alloc(sk, size);
+}
+EXPORT_SYMBOL_GPL(sock_zerocopy_realloc);
+
+static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len)
+{
+	struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
+	u32 old_lo, old_hi;
+	u64 sum_len;
+
+	old_lo = serr->ee.ee_info;
+	old_hi = serr->ee.ee_data;
+	sum_len = old_hi - old_lo + 1ULL + len;
+
+	if (sum_len >= (1ULL << 32))
+		return false;
+
+	if (lo != old_hi + 1)
+		return false;
+
+	serr->ee.ee_data += len;
+	return true;
+}
+
 void sock_zerocopy_callback(struct ubuf_info *uarg, bool success)
 {
-	struct sk_buff *skb = skb_from_uarg(uarg);
+	struct sk_buff *tail, *skb = skb_from_uarg(uarg);
 	struct sock_exterr_skb *serr;
 	struct sock *sk = skb->sk;
-	u16 id = uarg->desc;
+	struct sk_buff_head *q;
+	unsigned long flags;
+	u32 lo, hi;
+	u16 len;
 
-	if (sock_flag(sk, SOCK_DEAD))
+	/* if !len, there was only 1 call, and it was aborted
+	 * so do not queue a completion notification
+	 */
+	if (!uarg->len || sock_flag(sk, SOCK_DEAD))
 		goto release;
 
+	len = uarg->len;
+	lo = uarg->id;
+	hi = uarg->id + len - 1;
+
 	serr = SKB_EXT_ERR(skb);
 	memset(serr, 0, sizeof(*serr));
 	serr->ee.ee_errno = 0;
 	serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY;
-	serr->ee.ee_data = id;
+	serr->ee.ee_data = hi;
+	serr->ee.ee_info = lo;
 	if (!success)
 		serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED;
 
-	skb_queue_tail(&sk->sk_error_queue, skb);
-	skb = NULL;
+	q = &sk->sk_error_queue;
+	spin_lock_irqsave(&q->lock, flags);
+	tail = skb_peek_tail(q);
+	if (!tail || SKB_EXT_ERR(tail)->ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY ||
+	    !skb_zerocopy_notify_extend(tail, lo, len)) {
+		__skb_queue_tail(q, skb);
+		skb = NULL;
+	}
+	spin_unlock_irqrestore(&q->lock, flags);
 
 	sk->sk_error_report(sk);
 
@@ -975,6 +1052,7 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg)
 		struct sock *sk = skb_from_uarg(uarg)->sk;
 
 		atomic_dec(&sk->sk_zckey);
+		uarg->len--;
 
 		/* sock_zerocopy_put expects a ref. Most sockets take one per
 		 * skb, which is zero on abort. tcp_sendmsg holds one extra, to
@@ -995,9 +1073,16 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
 			     struct msghdr *msg, int len,
 			     struct ubuf_info *uarg)
 {
+	struct ubuf_info *orig_uarg = skb_zcopy(skb);
 	struct iov_iter orig_iter = msg->msg_iter;
 	int err, orig_len = skb->len;
 
+	/* An skb can only point to one uarg. This edge case happens when
+	 * TCP appends to an skb, but zerocopy_realloc triggered a new alloc.
+	 */
+	if (orig_uarg && uarg != orig_uarg)
+		return -EEXIST;
+
 	err = __zerocopy_sg_from_iter(sk, skb, &msg->msg_iter, len);
 	if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
 		/* Streams do not free skb on error. Reset to prev state. */
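For completeness, a minimal userspace sketch of draining one (possibly coalesced) completion from the error queue, matching what sock_zerocopy_callback() queues above: the inclusive range [ee_info, ee_data] covers every sendmsg(MSG_ZEROCOPY) call acknowledged by that notification. This assumes an IPv4 TCP socket fd; the helper name read_zerocopy_completion() is illustrative, not part of the patch.

/* Minimal sketch: read one coalesced zerocopy completion range from the errqueue. */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/errqueue.h>

static int read_zerocopy_completion(int fd)
{
	char control[128];
	struct sock_extended_err *serr;
	struct cmsghdr *cm;
	struct msghdr msg;
	uint32_t lo, hi;

	memset(&msg, 0, sizeof(msg));
	msg.msg_control = control;
	msg.msg_controllen = sizeof(control);

	if (recvmsg(fd, &msg, MSG_ERRQUEUE) == -1)
		return -errno;

	for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
		if (cm->cmsg_level != SOL_IP || cm->cmsg_type != IP_RECVERR)
			continue;

		serr = (struct sock_extended_err *)CMSG_DATA(cm);
		if (serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY)
			continue;

		lo = serr->ee_info;	/* first completed send id */
		hi = serr->ee_data;	/* last completed send id */
		if (serr->ee_code & SO_EE_CODE_ZEROCOPY_COPIED)
			printf("completed %u..%u (data was copied)\n", lo, hi);
		else
			printf("completed %u..%u (true zerocopy)\n", lo, hi);
	}
	return 0;
}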