forked from mirrors/linux
		
	splice, net: Use sendmsg(MSG_SPLICE_PAGES) rather than ->sendpage()
Replace generic_splice_sendpage() + splice_from_pipe + pipe_to_sendpage() with a net-specific handler, splice_to_socket(), that calls sendmsg() with MSG_SPLICE_PAGES set instead of calling ->sendpage().

MSG_MORE is used to indicate if the sendmsg() is expected to be followed with more data.

This allows multiple pipe-buffer pages to be passed in a single call in a BVEC iterator, allowing the processing to be pushed down to a loop in the protocol driver. This helps pave the way for passing multipage folios down too.

Protocols that haven't been converted to handle MSG_SPLICE_PAGES yet should just ignore it and do a normal sendmsg() for now - although that may be a bit slower as it may copy everything.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
cc: Jens Axboe <axboe@kernel.dk>
cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
		
							parent
							
								
									81840b3b91
								
							
						
					
					
						commit
						2dc334f1a6
					
				
					 4 changed files with 131 additions and 57 deletions
				
			
		
							
								
								
									
										158
									
								
								fs/splice.c
									
									
									
									
									
								
							
							
						
						
									
										158
									
								
								fs/splice.c
									
									
									
									
									
								
							|  | @ -33,6 +33,7 @@ | ||||||
| #include <linux/fsnotify.h> | #include <linux/fsnotify.h> | ||||||
| #include <linux/security.h> | #include <linux/security.h> | ||||||
| #include <linux/gfp.h> | #include <linux/gfp.h> | ||||||
|  | #include <linux/net.h> | ||||||
| #include <linux/socket.h> | #include <linux/socket.h> | ||||||
| #include <linux/sched/signal.h> | #include <linux/sched/signal.h> | ||||||
| 
 | 
 | ||||||
|  | @ -448,30 +449,6 @@ const struct pipe_buf_operations nosteal_pipe_buf_ops = { | ||||||
| }; | }; | ||||||
| EXPORT_SYMBOL(nosteal_pipe_buf_ops); | EXPORT_SYMBOL(nosteal_pipe_buf_ops); | ||||||
| 
 | 
 | ||||||
| /*
 |  | ||||||
|  * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' |  | ||||||
|  * using sendpage(). Return the number of bytes sent. |  | ||||||
|  */ |  | ||||||
| static int pipe_to_sendpage(struct pipe_inode_info *pipe, |  | ||||||
| 			    struct pipe_buffer *buf, struct splice_desc *sd) |  | ||||||
| { |  | ||||||
| 	struct file *file = sd->u.file; |  | ||||||
| 	loff_t pos = sd->pos; |  | ||||||
| 	int more; |  | ||||||
| 
 |  | ||||||
| 	if (!likely(file->f_op->sendpage)) |  | ||||||
| 		return -EINVAL; |  | ||||||
| 
 |  | ||||||
| 	more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0; |  | ||||||
| 
 |  | ||||||
| 	if (sd->len < sd->total_len && |  | ||||||
| 	    pipe_occupancy(pipe->head, pipe->tail) > 1) |  | ||||||
| 		more |= MSG_SENDPAGE_NOTLAST; |  | ||||||
| 
 |  | ||||||
| 	return file->f_op->sendpage(file, buf->page, buf->offset, |  | ||||||
| 				    sd->len, &pos, more); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static void wakeup_pipe_writers(struct pipe_inode_info *pipe) | static void wakeup_pipe_writers(struct pipe_inode_info *pipe) | ||||||
| { | { | ||||||
| 	smp_mb(); | 	smp_mb(); | ||||||
|  | @ -652,7 +629,7 @@ static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_des | ||||||
|  * Description: |  * Description: | ||||||
|  *    This function does little more than loop over the pipe and call |  *    This function does little more than loop over the pipe and call | ||||||
|  *    @actor to do the actual moving of a single struct pipe_buffer to |  *    @actor to do the actual moving of a single struct pipe_buffer to | ||||||
|  *    the desired destination. See pipe_to_file, pipe_to_sendpage, or |  *    the desired destination. See pipe_to_file, pipe_to_sendmsg, or | ||||||
|  *    pipe_to_user. |  *    pipe_to_user. | ||||||
|  * |  * | ||||||
|  */ |  */ | ||||||
|  | @ -833,8 +810,9 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, | ||||||
| 
 | 
 | ||||||
| EXPORT_SYMBOL(iter_file_splice_write); | EXPORT_SYMBOL(iter_file_splice_write); | ||||||
| 
 | 
 | ||||||
|  | #ifdef CONFIG_NET | ||||||
| /**
 | /**
 | ||||||
|  * generic_splice_sendpage - splice data from a pipe to a socket |  * splice_to_socket - splice data from a pipe to a socket | ||||||
|  * @pipe:	pipe to splice from |  * @pipe:	pipe to splice from | ||||||
|  * @out:	socket to write to |  * @out:	socket to write to | ||||||
|  * @ppos:	position in @out |  * @ppos:	position in @out | ||||||
|  | @ -846,13 +824,131 @@ EXPORT_SYMBOL(iter_file_splice_write); | ||||||
|  *    is involved. |  *    is involved. | ||||||
|  * |  * | ||||||
|  */ |  */ | ||||||
| ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, | ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out, | ||||||
| 				loff_t *ppos, size_t len, unsigned int flags) | 			 loff_t *ppos, size_t len, unsigned int flags) | ||||||
| { | { | ||||||
| 	return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage); | 	struct socket *sock = sock_from_file(out); | ||||||
| } | 	struct bio_vec bvec[16]; | ||||||
|  | 	struct msghdr msg = {}; | ||||||
|  | 	ssize_t ret = 0; | ||||||
|  | 	size_t spliced = 0; | ||||||
|  | 	bool need_wakeup = false; | ||||||
| 
 | 
 | ||||||
| EXPORT_SYMBOL(generic_splice_sendpage); | 	pipe_lock(pipe); | ||||||
|  | 
 | ||||||
|  | 	while (len > 0) { | ||||||
|  | 		unsigned int head, tail, mask, bc = 0; | ||||||
|  | 		size_t remain = len; | ||||||
|  | 
 | ||||||
|  | 		/*
 | ||||||
|  | 		 * Check for signal early to make process killable when there | ||||||
|  | 		 * are always buffers available | ||||||
|  | 		 */ | ||||||
|  | 		ret = -ERESTARTSYS; | ||||||
|  | 		if (signal_pending(current)) | ||||||
|  | 			break; | ||||||
|  | 
 | ||||||
|  | 		while (pipe_empty(pipe->head, pipe->tail)) { | ||||||
|  | 			ret = 0; | ||||||
|  | 			if (!pipe->writers) | ||||||
|  | 				goto out; | ||||||
|  | 
 | ||||||
|  | 			if (spliced) | ||||||
|  | 				goto out; | ||||||
|  | 
 | ||||||
|  | 			ret = -EAGAIN; | ||||||
|  | 			if (flags & SPLICE_F_NONBLOCK) | ||||||
|  | 				goto out; | ||||||
|  | 
 | ||||||
|  | 			ret = -ERESTARTSYS; | ||||||
|  | 			if (signal_pending(current)) | ||||||
|  | 				goto out; | ||||||
|  | 
 | ||||||
|  | 			if (need_wakeup) { | ||||||
|  | 				wakeup_pipe_writers(pipe); | ||||||
|  | 				need_wakeup = false; | ||||||
|  | 			} | ||||||
|  | 
 | ||||||
|  | 			pipe_wait_readable(pipe); | ||||||
|  | 		} | ||||||
|  | 
 | ||||||
|  | 		head = pipe->head; | ||||||
|  | 		tail = pipe->tail; | ||||||
|  | 		mask = pipe->ring_size - 1; | ||||||
|  | 
 | ||||||
|  | 		while (!pipe_empty(head, tail)) { | ||||||
|  | 			struct pipe_buffer *buf = &pipe->bufs[tail & mask]; | ||||||
|  | 			size_t seg; | ||||||
|  | 
 | ||||||
|  | 			if (!buf->len) { | ||||||
|  | 				tail++; | ||||||
|  | 				continue; | ||||||
|  | 			} | ||||||
|  | 
 | ||||||
|  | 			seg = min_t(size_t, remain, buf->len); | ||||||
|  | 			seg = min_t(size_t, seg, PAGE_SIZE); | ||||||
|  | 
 | ||||||
|  | 			ret = pipe_buf_confirm(pipe, buf); | ||||||
|  | 			if (unlikely(ret)) { | ||||||
|  | 				if (ret == -ENODATA) | ||||||
|  | 					ret = 0; | ||||||
|  | 				break; | ||||||
|  | 			} | ||||||
|  | 
 | ||||||
|  | 			bvec_set_page(&bvec[bc++], buf->page, seg, buf->offset); | ||||||
|  | 			remain -= seg; | ||||||
|  | 			if (seg >= buf->len) | ||||||
|  | 				tail++; | ||||||
|  | 			if (bc >= ARRAY_SIZE(bvec)) | ||||||
|  | 				break; | ||||||
|  | 		} | ||||||
|  | 
 | ||||||
|  | 		if (!bc) | ||||||
|  | 			break; | ||||||
|  | 
 | ||||||
|  | 		msg.msg_flags = MSG_SPLICE_PAGES; | ||||||
|  | 		if (flags & SPLICE_F_MORE) | ||||||
|  | 			msg.msg_flags |= MSG_MORE; | ||||||
|  | 		if (remain && pipe_occupancy(pipe->head, tail) > 0) | ||||||
|  | 			msg.msg_flags |= MSG_MORE; | ||||||
|  | 
 | ||||||
|  | 		iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, bvec, bc, | ||||||
|  | 			      len - remain); | ||||||
|  | 		ret = sock_sendmsg(sock, &msg); | ||||||
|  | 		if (ret <= 0) | ||||||
|  | 			break; | ||||||
|  | 
 | ||||||
|  | 		spliced += ret; | ||||||
|  | 		len -= ret; | ||||||
|  | 		tail = pipe->tail; | ||||||
|  | 		while (ret > 0) { | ||||||
|  | 			struct pipe_buffer *buf = &pipe->bufs[tail & mask]; | ||||||
|  | 			size_t seg = min_t(size_t, ret, buf->len); | ||||||
|  | 
 | ||||||
|  | 			buf->offset += seg; | ||||||
|  | 			buf->len -= seg; | ||||||
|  | 			ret -= seg; | ||||||
|  | 
 | ||||||
|  | 			if (!buf->len) { | ||||||
|  | 				pipe_buf_release(pipe, buf); | ||||||
|  | 				tail++; | ||||||
|  | 			} | ||||||
|  | 		} | ||||||
|  | 
 | ||||||
|  | 		if (tail != pipe->tail) { | ||||||
|  | 			pipe->tail = tail; | ||||||
|  | 			if (pipe->files) | ||||||
|  | 				need_wakeup = true; | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | out: | ||||||
|  | 	pipe_unlock(pipe); | ||||||
|  | 	if (need_wakeup) | ||||||
|  | 		wakeup_pipe_writers(pipe); | ||||||
|  | 	return spliced ?: ret; | ||||||
|  | } | ||||||
|  | #endif | ||||||
| 
 | 
 | ||||||
| static int warn_unsupported(struct file *file, const char *op) | static int warn_unsupported(struct file *file, const char *op) | ||||||
| { | { | ||||||
|  |  | ||||||
|  | @ -2759,8 +2759,6 @@ extern ssize_t generic_file_splice_read(struct file *, loff_t *, | ||||||
| 		struct pipe_inode_info *, size_t, unsigned int); | 		struct pipe_inode_info *, size_t, unsigned int); | ||||||
| extern ssize_t iter_file_splice_write(struct pipe_inode_info *, | extern ssize_t iter_file_splice_write(struct pipe_inode_info *, | ||||||
| 		struct file *, loff_t *, size_t, unsigned int); | 		struct file *, loff_t *, size_t, unsigned int); | ||||||
| extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, |  | ||||||
| 		struct file *out, loff_t *, size_t len, unsigned int flags); |  | ||||||
| extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, | extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, | ||||||
| 		loff_t *opos, size_t len, unsigned int flags); | 		loff_t *opos, size_t len, unsigned int flags); | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -84,6 +84,8 @@ extern long do_splice(struct file *in, loff_t *off_in, | ||||||
| 
 | 
 | ||||||
| extern long do_tee(struct file *in, struct file *out, size_t len, | extern long do_tee(struct file *in, struct file *out, size_t len, | ||||||
| 		   unsigned int flags); | 		   unsigned int flags); | ||||||
|  | extern ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out, | ||||||
|  | 				loff_t *ppos, size_t len, unsigned int flags); | ||||||
| 
 | 
 | ||||||
| /*
 | /*
 | ||||||
|  * for dynamic pipe sizing |  * for dynamic pipe sizing | ||||||
|  |  | ||||||
							
								
								
									
										26
									
								
								net/socket.c
									
									
									
									
									
								
							
							
						
						
									
										26
									
								
								net/socket.c
									
									
									
									
									
								
							|  | @ -57,6 +57,7 @@ | ||||||
| #include <linux/mm.h> | #include <linux/mm.h> | ||||||
| #include <linux/socket.h> | #include <linux/socket.h> | ||||||
| #include <linux/file.h> | #include <linux/file.h> | ||||||
|  | #include <linux/splice.h> | ||||||
| #include <linux/net.h> | #include <linux/net.h> | ||||||
| #include <linux/interrupt.h> | #include <linux/interrupt.h> | ||||||
| #include <linux/thread_info.h> | #include <linux/thread_info.h> | ||||||
|  | @ -126,8 +127,6 @@ static long compat_sock_ioctl(struct file *file, | ||||||
| 			      unsigned int cmd, unsigned long arg); | 			      unsigned int cmd, unsigned long arg); | ||||||
| #endif | #endif | ||||||
| static int sock_fasync(int fd, struct file *filp, int on); | static int sock_fasync(int fd, struct file *filp, int on); | ||||||
| static ssize_t sock_sendpage(struct file *file, struct page *page, |  | ||||||
| 			     int offset, size_t size, loff_t *ppos, int more); |  | ||||||
| static ssize_t sock_splice_read(struct file *file, loff_t *ppos, | static ssize_t sock_splice_read(struct file *file, loff_t *ppos, | ||||||
| 				struct pipe_inode_info *pipe, size_t len, | 				struct pipe_inode_info *pipe, size_t len, | ||||||
| 				unsigned int flags); | 				unsigned int flags); | ||||||
|  | @ -162,8 +161,7 @@ static const struct file_operations socket_file_ops = { | ||||||
| 	.mmap =		sock_mmap, | 	.mmap =		sock_mmap, | ||||||
| 	.release =	sock_close, | 	.release =	sock_close, | ||||||
| 	.fasync =	sock_fasync, | 	.fasync =	sock_fasync, | ||||||
| 	.sendpage =	sock_sendpage, | 	.splice_write = splice_to_socket, | ||||||
| 	.splice_write = generic_splice_sendpage, |  | ||||||
| 	.splice_read =	sock_splice_read, | 	.splice_read =	sock_splice_read, | ||||||
| 	.show_fdinfo =	sock_show_fdinfo, | 	.show_fdinfo =	sock_show_fdinfo, | ||||||
| }; | }; | ||||||
|  | @ -1066,26 +1064,6 @@ int kernel_recvmsg(struct socket *sock, struct msghdr *msg, | ||||||
| } | } | ||||||
| EXPORT_SYMBOL(kernel_recvmsg); | EXPORT_SYMBOL(kernel_recvmsg); | ||||||
| 
 | 
 | ||||||
| static ssize_t sock_sendpage(struct file *file, struct page *page, |  | ||||||
| 			     int offset, size_t size, loff_t *ppos, int more) |  | ||||||
| { |  | ||||||
| 	struct socket *sock; |  | ||||||
| 	int flags; |  | ||||||
| 	int ret; |  | ||||||
| 
 |  | ||||||
| 	sock = file->private_data; |  | ||||||
| 
 |  | ||||||
| 	flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0; |  | ||||||
| 	/* more is a combination of MSG_MORE and MSG_SENDPAGE_NOTLAST */ |  | ||||||
| 	flags |= more; |  | ||||||
| 
 |  | ||||||
| 	ret = kernel_sendpage(sock, page, offset, size, flags); |  | ||||||
| 
 |  | ||||||
| 	if (trace_sock_send_length_enabled()) |  | ||||||
| 		call_trace_sock_send_length(sock->sk, ret, 0); |  | ||||||
| 	return ret; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static ssize_t sock_splice_read(struct file *file, loff_t *ppos, | static ssize_t sock_splice_read(struct file *file, loff_t *ppos, | ||||||
| 				struct pipe_inode_info *pipe, size_t len, | 				struct pipe_inode_info *pipe, size_t len, | ||||||
| 				unsigned int flags) | 				unsigned int flags) | ||||||
|  |  | ||||||
		Loading…
	
		Reference in a new issue
	
	 David Howells
						David Howells