forked from mirrors/linux
		
	af_unix: improve STREAM behavior with fragmented memory
unix_stream_sendmsg() currently uses order-2 allocations,
and we had numerous reports this can fail.
The __GFP_REPEAT flag present in sock_alloc_send_pskb() is
not helping.
This patch extends the work done in commit eb6a24816b
("af_unix: reduce high order page allocations") for
datagram sockets.
This opens the possibility of zero-copy I/O (splice() and
friends).
The trick is to no longer use skb_pull() in the recvmsg() path,
and instead add a @consumed field in UNIXCB() to track the amount
of already read payload in the skb.
There is a performance regression for large sends
because of extra page allocations; this will be addressed
in a follow-up patch, allowing sock_alloc_send_pskb()
to attempt high order page allocations.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
			
			
This commit is contained in:
		
							parent
							
								
									149479d019
								
							
						
					
					
						commit
						e370a72363
					
				
					 2 changed files with 31 additions and 35 deletions
				
			
		|  | @ -35,6 +35,7 @@ struct unix_skb_parms { | |||
| #ifdef CONFIG_SECURITY_NETWORK | ||||
| 	u32			secid;		/* Security ID		*/ | ||||
| #endif | ||||
| 	u32			consumed; | ||||
| }; | ||||
| 
 | ||||
| #define UNIXCB(skb) 	(*(struct unix_skb_parms *)&((skb)->cb)) | ||||
|  |  | |||
|  | @ -1596,6 +1596,10 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, | |||
| 	return err; | ||||
| } | ||||
| 
 | ||||
| /* We use paged skbs for stream sockets, and limit occupancy to 32768
 | ||||
|  * bytes, and a minimum of a full page. | ||||
|  */ | ||||
| #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768)) | ||||
| 
 | ||||
| static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, | ||||
| 			       struct msghdr *msg, size_t len) | ||||
|  | @ -1609,6 +1613,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, | |||
| 	struct scm_cookie tmp_scm; | ||||
| 	bool fds_sent = false; | ||||
| 	int max_level; | ||||
| 	int data_len; | ||||
| 
 | ||||
| 	if (NULL == siocb->scm) | ||||
| 		siocb->scm = &tmp_scm; | ||||
|  | @ -1635,40 +1640,21 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, | |||
| 		goto pipe_err; | ||||
| 
 | ||||
| 	while (sent < len) { | ||||
| 		/*
 | ||||
| 		 *	Optimisation for the fact that under 0.01% of X | ||||
| 		 *	messages typically need breaking up. | ||||
| 		 */ | ||||
| 
 | ||||
| 		size = len-sent; | ||||
| 		size = len - sent; | ||||
| 
 | ||||
| 		/* Keep two messages in the pipe so it schedules better */ | ||||
| 		if (size > ((sk->sk_sndbuf >> 1) - 64)) | ||||
| 			size = (sk->sk_sndbuf >> 1) - 64; | ||||
| 		size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64); | ||||
| 
 | ||||
| 		if (size > SKB_MAX_ALLOC) | ||||
| 			size = SKB_MAX_ALLOC; | ||||
| 		/* allow fallback to order-0 allocations */ | ||||
| 		size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ); | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 *	Grab a buffer | ||||
| 		 */ | ||||
| 		data_len = max_t(int, 0, size - SKB_MAX_HEAD(0)); | ||||
| 
 | ||||
| 		skb = sock_alloc_send_skb(sk, size, msg->msg_flags&MSG_DONTWAIT, | ||||
| 					  &err); | ||||
| 
 | ||||
| 		if (skb == NULL) | ||||
| 		skb = sock_alloc_send_pskb(sk, size - data_len, data_len, | ||||
| 					   msg->msg_flags & MSG_DONTWAIT, &err); | ||||
| 		if (!skb) | ||||
| 			goto out_err; | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 *	If you pass two values to the sock_alloc_send_skb | ||||
| 		 *	it tries to grab the large buffer with GFP_NOFS | ||||
| 		 *	(which can fail easily), and if it fails grab the | ||||
| 		 *	fallback size buffer which is under a page and will | ||||
| 		 *	succeed. [Alan] | ||||
| 		 */ | ||||
| 		size = min_t(int, size, skb_tailroom(skb)); | ||||
| 
 | ||||
| 
 | ||||
| 		/* Only send the fds in the first buffer */ | ||||
| 		err = unix_scm_to_skb(siocb->scm, skb, !fds_sent); | ||||
| 		if (err < 0) { | ||||
|  | @ -1678,7 +1664,10 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, | |||
| 		max_level = err + 1; | ||||
| 		fds_sent = true; | ||||
| 
 | ||||
| 		err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size); | ||||
| 		skb_put(skb, size - data_len); | ||||
| 		skb->data_len = data_len; | ||||
| 		skb->len = size; | ||||
| 		err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov, 0, size); | ||||
| 		if (err) { | ||||
| 			kfree_skb(skb); | ||||
| 			goto out_err; | ||||
|  | @ -1890,6 +1879,11 @@ static long unix_stream_data_wait(struct sock *sk, long timeo, | |||
| 	return timeo; | ||||
| } | ||||
| 
 | ||||
| /* Unread payload left in @skb: skb->len minus the UNIXCB(skb).consumed count of bytes already read. */ static unsigned int unix_skb_len(const struct sk_buff *skb) | ||||
| { | ||||
| 	return skb->len - UNIXCB(skb).consumed; | ||||
| } | ||||
| 
 | ||||
| static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, | ||||
| 			       struct msghdr *msg, size_t size, | ||||
| 			       int flags) | ||||
|  | @ -1977,8 +1971,8 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, | |||
| 		} | ||||
| 
 | ||||
| 		skip = sk_peek_offset(sk, flags); | ||||
| 		while (skip >= skb->len) { | ||||
| 			skip -= skb->len; | ||||
| 		while (skip >= unix_skb_len(skb)) { | ||||
| 			skip -= unix_skb_len(skb); | ||||
| 			last = skb; | ||||
| 			skb = skb_peek_next(skb, &sk->sk_receive_queue); | ||||
| 			if (!skb) | ||||
|  | @ -2005,8 +1999,9 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, | |||
| 			sunaddr = NULL; | ||||
| 		} | ||||
| 
 | ||||
| 		chunk = min_t(unsigned int, skb->len - skip, size); | ||||
| 		if (memcpy_toiovec(msg->msg_iov, skb->data + skip, chunk)) { | ||||
| 		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size); | ||||
| 		if (skb_copy_datagram_iovec(skb, UNIXCB(skb).consumed + skip, | ||||
| 					    msg->msg_iov, chunk)) { | ||||
| 			if (copied == 0) | ||||
| 				copied = -EFAULT; | ||||
| 			break; | ||||
|  | @ -2016,14 +2011,14 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, | |||
| 
 | ||||
| 		/* Mark read part of skb as used */ | ||||
| 		if (!(flags & MSG_PEEK)) { | ||||
| 			skb_pull(skb, chunk); | ||||
| 			UNIXCB(skb).consumed += chunk; | ||||
| 
 | ||||
| 			sk_peek_offset_bwd(sk, chunk); | ||||
| 
 | ||||
| 			if (UNIXCB(skb).fp) | ||||
| 				unix_detach_fds(siocb->scm, skb); | ||||
| 
 | ||||
| 			if (skb->len) | ||||
| 			if (unix_skb_len(skb)) | ||||
| 				break; | ||||
| 
 | ||||
| 			skb_unlink(skb, &sk->sk_receive_queue); | ||||
|  | @ -2107,7 +2102,7 @@ long unix_inq_len(struct sock *sk) | |||
| 	if (sk->sk_type == SOCK_STREAM || | ||||
| 	    sk->sk_type == SOCK_SEQPACKET) { | ||||
| 		skb_queue_walk(&sk->sk_receive_queue, skb) | ||||
| 			amount += skb->len; | ||||
| 			amount += unix_skb_len(skb); | ||||
| 	} else { | ||||
| 		skb = skb_peek(&sk->sk_receive_queue); | ||||
| 		if (skb) | ||||
|  |  | |||
		Loading…
	
		Reference in a new issue
	
	 Eric Dumazet
						Eric Dumazet