mirror of
				https://github.com/torvalds/linux.git
				synced 2025-10-31 08:38:45 +02:00 
			
		
		
		
	btrfs: add io_uring command for encoded reads (ENCODED_READ ioctl)
Add an io_uring command for encoded reads, using the same interface as the existing BTRFS_IOC_ENCODED_READ ioctl.

btrfs_uring_encoded_read() is an io_uring version of btrfs_ioctl_encoded_read(), which validates the user input and calls btrfs_encoded_read() to read the appropriate metadata. If we determine that we need to read an extent from disk, we call btrfs_encoded_read_regular_fill_pages() through btrfs_uring_read_extent() to prepare the bio.

The existing btrfs_encoded_read_regular_fill_pages() is changed so that if it is passed a valid uring_ctx, rather than waking up any waiting threads it calls btrfs_uring_read_extent_endio(). This in turn copies the read data back to userspace, and calls io_uring_cmd_done() to complete the io_uring command.

Because we're potentially doing a non-blocking read, btrfs_uring_read_extent() doesn't clean up after itself if it returns -EIOCBQUEUED. Instead, it allocates a priv struct, populates the fields there that we will need to unlock the inode and free our allocations, and defers this to the btrfs_uring_read_finished() that gets called when the bio completes.

Signed-off-by: Mark Harmstone <maharmstone@fb.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
This commit is contained in:
		
							parent
							
								
									68d3b27e05
								
							
						
					
					
						commit
						34310c442e
					
				
					 6 changed files with 339 additions and 12 deletions
				
			
		|  | @ -613,7 +613,7 @@ int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, | |||
| 					     int compress_type); | ||||
| int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, | ||||
| 					  u64 disk_bytenr, u64 disk_io_size, | ||||
| 					  struct page **pages); | ||||
| 					  struct page **pages, void *uring_ctx); | ||||
| ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, | ||||
| 			   struct btrfs_ioctl_encoded_io_args *encoded, | ||||
| 			   struct extent_state **cached_state, | ||||
|  |  | |||
|  | @ -3710,6 +3710,7 @@ const struct file_operations btrfs_file_operations = { | |||
| 	.compat_ioctl	= btrfs_compat_ioctl, | ||||
| #endif | ||||
| 	.remap_file_range = btrfs_remap_file_range, | ||||
| 	.uring_cmd	= btrfs_uring_cmd, | ||||
| 	.fop_flags	= FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC, | ||||
| }; | ||||
| 
 | ||||
|  |  | |||
|  | @ -9056,6 +9056,7 @@ static ssize_t btrfs_encoded_read_inline( | |||
| 
 | ||||
| struct btrfs_encoded_read_private {	/* completion state shared by the bios of one encoded read */ | ||||
| 	wait_queue_head_t wait;	/* synchronous callers sleep here until pending reaches 0 */ | ||||
| 	void *uring_ctx;	/* if non-NULL, completion goes to btrfs_uring_read_extent_endio() instead of wake_up() */ | ||||
| 	atomic_t pending;	/* outstanding references; whoever drops it to 0 finishes the read */ | ||||
| 	blk_status_t status;	/* starts at 0; btrfs_encoded_read_endio() stores a bio's bi_status here */ | ||||
| }; | ||||
|  | @ -9075,14 +9076,22 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) | |||
| 		 */ | ||||
| 		WRITE_ONCE(priv->status, bbio->bio.bi_status); | ||||
| 	} | ||||
| 	if (!atomic_dec_return(&priv->pending)) | ||||
| 		wake_up(&priv->wait); | ||||
| 	if (atomic_dec_return(&priv->pending) == 0) { | ||||
| 		int err = blk_status_to_errno(READ_ONCE(priv->status)); | ||||
| 
 | ||||
| 		if (priv->uring_ctx) { | ||||
| 			btrfs_uring_read_extent_endio(priv->uring_ctx, err); | ||||
| 			kfree(priv); | ||||
| 		} else { | ||||
| 			wake_up(&priv->wait); | ||||
| 		} | ||||
| 	} | ||||
| 	bio_put(&bbio->bio); | ||||
| } | ||||
| 
 | ||||
| int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, | ||||
| 					  u64 disk_bytenr, u64 disk_io_size, | ||||
| 					  struct page **pages) | ||||
| 					  struct page **pages, void *uring_ctx) | ||||
| { | ||||
| 	struct btrfs_fs_info *fs_info = inode->root->fs_info; | ||||
| 	struct btrfs_encoded_read_private *priv; | ||||
|  | @ -9097,6 +9106,7 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, | |||
| 	init_waitqueue_head(&priv->wait); | ||||
| 	atomic_set(&priv->pending, 1); | ||||
| 	priv->status = 0; | ||||
| 	priv->uring_ctx = uring_ctx; | ||||
| 
 | ||||
| 	bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, | ||||
| 			       btrfs_encoded_read_endio, priv); | ||||
|  | @ -9125,12 +9135,23 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, | |||
| 	atomic_inc(&priv->pending); | ||||
| 	btrfs_submit_bbio(bbio, 0); | ||||
| 
 | ||||
| 	if (atomic_dec_return(&priv->pending)) | ||||
| 		io_wait_event(priv->wait, !atomic_read(&priv->pending)); | ||||
| 	/* See btrfs_encoded_read_endio() for ordering. */ | ||||
| 	ret = blk_status_to_errno(READ_ONCE(priv->status)); | ||||
| 	kfree(priv); | ||||
| 	return ret; | ||||
| 	if (uring_ctx) { | ||||
| 		if (atomic_dec_return(&priv->pending) == 0) { | ||||
| 			ret = blk_status_to_errno(READ_ONCE(priv->status)); | ||||
| 			btrfs_uring_read_extent_endio(uring_ctx, ret); | ||||
| 			kfree(priv); | ||||
| 			return ret; | ||||
| 		} | ||||
| 
 | ||||
| 		return -EIOCBQUEUED; | ||||
| 	} else { | ||||
| 		if (atomic_dec_return(&priv->pending) != 0) | ||||
| 			io_wait_event(priv->wait, !atomic_read(&priv->pending)); | ||||
| 		/* See btrfs_encoded_read_endio() for ordering. */ | ||||
| 		ret = blk_status_to_errno(READ_ONCE(priv->status)); | ||||
| 		kfree(priv); | ||||
| 		return ret; | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter, | ||||
|  | @ -9158,7 +9179,7 @@ ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter, | |||
| 		} | ||||
| 
 | ||||
| 	ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr, | ||||
| 						    disk_io_size, pages); | ||||
| 						    disk_io_size, pages, NULL); | ||||
| 	if (ret) | ||||
| 		goto out; | ||||
| 
 | ||||
|  |  | |||
							
								
								
									
										302
									
								
								fs/btrfs/ioctl.c
									
									
									
									
									
								
							
							
						
						
									
										302
									
								
								fs/btrfs/ioctl.c
									
									
									
									
									
								
							|  | @ -29,6 +29,7 @@ | |||
| #include <linux/fileattr.h> | ||||
| #include <linux/fsverity.h> | ||||
| #include <linux/sched/xacct.h> | ||||
| #include <linux/io_uring/cmd.h> | ||||
| #include "ctree.h" | ||||
| #include "disk-io.h" | ||||
| #include "export.h" | ||||
|  | @ -4719,6 +4720,307 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool | |||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Context for an encoded read issued through io_uring. It is allocated in | ||||
|  * btrfs_uring_read_extent(), and btrfs_uring_read_extent_endio() stores a | ||||
|  * pointer to it in cmd->pdu (the context itself does not live in the pdu). | ||||
|  * It carries the state that btrfs_uring_read_finished() needs to copy the | ||||
|  * data out, drop the locks and free the allocations. | ||||
|  */ | ||||
| struct btrfs_uring_priv { | ||||
| 	struct io_uring_cmd *cmd;	/* command this read belongs to */ | ||||
| 	struct page **pages;	/* pages the extent is read into; freed on completion */ | ||||
| 	unsigned long nr_pages;	/* number of entries in pages */ | ||||
| 	struct kiocb iocb;	/* copy of the submitter's kiocb (file and position) */ | ||||
| 	struct iovec *iov;	/* iovec array handed over by the submitter; kfree()d on completion */ | ||||
| 	struct iov_iter iter;	/* copy of the destination iterator */ | ||||
| 	struct extent_state *cached_state;	/* passed back to unlock_extent() */ | ||||
| 	u64 count;	/* number of bytes to copy to the iterator */ | ||||
| 	u64 start;	/* first byte of the locked extent range */ | ||||
| 	u64 lockend;	/* last byte of the locked extent range */ | ||||
| 	int err;	/* result reported by btrfs_uring_read_extent_endio() */ | ||||
| 	bool compressed;	/* if set, the copy starts at page 0, offset 0 */ | ||||
| }; | ||||
| 
 | ||||
| /*
 | ||||
|  * Completion callback for an encoded read, scheduled through | ||||
|  * io_uring_cmd_complete_in_task() by btrfs_uring_read_extent_endio(). | ||||
|  * | ||||
|  * Copies the data that was read into priv->pages out to the saved iterator, | ||||
|  * releases the extent range and the shared inode lock, completes the command | ||||
|  * with either the number of bytes copied or a negative errno, and finally | ||||
|  * frees the pages, the iovec array and the context itself. | ||||
|  */ | ||||
| static void btrfs_uring_read_finished(struct io_uring_cmd *cmd, unsigned int issue_flags) | ||||
| { | ||||
| 	struct btrfs_uring_priv *priv = *io_uring_cmd_to_pdu(cmd, struct btrfs_uring_priv *); | ||||
| 	struct btrfs_inode *inode = BTRFS_I(file_inode(priv->iocb.ki_filp)); | ||||
| 	struct extent_io_tree *io_tree = &inode->io_tree; | ||||
| 	unsigned long index; | ||||
| 	u64 cur; | ||||
| 	size_t page_offset; | ||||
| 	ssize_t ret; | ||||
| 
 | ||||
| 	/* The read itself failed: report that error and skip the copy. */ | ||||
| 	if (priv->err) { | ||||
| 		ret = priv->err; | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	/* | ||||
| 	 * Compressed data is returned from the start of the pages; otherwise | ||||
| 	 * skip ahead to where the requested position falls within the range. | ||||
| 	 */ | ||||
| 	if (priv->compressed) { | ||||
| 		index = 0; | ||||
| 		page_offset = 0; | ||||
| 	} else { | ||||
| 		index = (priv->iocb.ki_pos - priv->start) >> PAGE_SHIFT; | ||||
| 		page_offset = offset_in_page(priv->iocb.ki_pos - priv->start); | ||||
| 	} | ||||
| 	cur = 0; | ||||
| 	while (cur < priv->count) { | ||||
| 		size_t bytes = min_t(size_t, priv->count - cur, PAGE_SIZE - page_offset); | ||||
| 
 | ||||
| 		if (copy_page_to_iter(priv->pages[index], page_offset, bytes, | ||||
| 				      &priv->iter) != bytes) { | ||||
| 			ret = -EFAULT; | ||||
| 			goto out; | ||||
| 		} | ||||
| 
 | ||||
| 		index++; | ||||
| 		cur += bytes; | ||||
| 		/* Only the first page can start at a non-zero offset. */ | ||||
| 		page_offset = 0; | ||||
| 	} | ||||
| 	ret = priv->count; | ||||
| 
 | ||||
| out: | ||||
| 	unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state); | ||||
| 	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); | ||||
| 
 | ||||
| 	io_uring_cmd_done(cmd, ret, 0, issue_flags); | ||||
| 	/* | ||||
| 	 * Only account bytes that were actually read; a negative errno must | ||||
| 	 * not be fed into the read counter (btrfs_uring_encoded_read() applies | ||||
| 	 * the same ret > 0 check). | ||||
| 	 */ | ||||
| 	if (ret > 0) | ||||
| 		add_rchar(current, ret); | ||||
| 
 | ||||
| 	for (index = 0; index < priv->nr_pages; index++) | ||||
| 		__free_page(priv->pages[index]); | ||||
| 
 | ||||
| 	kfree(priv->pages); | ||||
| 	kfree(priv->iov); | ||||
| 	kfree(priv); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Report the outcome of an encoded read to io_uring. | ||||
|  * | ||||
|  * @ctx is the struct btrfs_uring_priv of the read and @err its result. The | ||||
|  * context pointer is parked in the command's pdu so that | ||||
|  * btrfs_uring_read_finished(), which is scheduled from here, can find it. | ||||
|  */ | ||||
| void btrfs_uring_read_extent_endio(void *ctx, int err) | ||||
| { | ||||
| 	struct btrfs_uring_priv *uring_priv = ctx; | ||||
| 	struct io_uring_cmd *cmd = uring_priv->cmd; | ||||
| 
 | ||||
| 	*io_uring_cmd_to_pdu(cmd, struct btrfs_uring_priv *) = uring_priv; | ||||
| 	uring_priv->err = err; | ||||
| 
 | ||||
| 	io_uring_cmd_complete_in_task(cmd, btrfs_uring_read_finished); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Start the on-disk read for an encoded read issued through io_uring. | ||||
|  * | ||||
|  * The caller passes in the locked extent range [@start, @lockend] together | ||||
|  * with the shared inode lock, and hands over @iov. | ||||
|  * | ||||
|  * Returns -EIOCBQUEUED once the read has been handed off; in that case the | ||||
|  * locks, the pages, @iov and the context are released later by | ||||
|  * btrfs_uring_read_finished(). On any other return value nothing was handed | ||||
|  * off, so the locks are dropped and every allocation (including @iov) is | ||||
|  * freed here, because the caller does no cleanup of its own after this call. | ||||
|  */ | ||||
| static int btrfs_uring_read_extent(struct kiocb *iocb, struct iov_iter *iter, | ||||
| 				   u64 start, u64 lockend, | ||||
| 				   struct extent_state *cached_state, | ||||
| 				   u64 disk_bytenr, u64 disk_io_size, | ||||
| 				   size_t count, bool compressed, | ||||
| 				   struct iovec *iov, struct io_uring_cmd *cmd) | ||||
| { | ||||
| 	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); | ||||
| 	struct extent_io_tree *io_tree = &inode->io_tree; | ||||
| 	struct page **pages; | ||||
| 	struct btrfs_uring_priv *priv = NULL; | ||||
| 	unsigned long nr_pages; | ||||
| 	unsigned long i; | ||||
| 	int ret; | ||||
| 
 | ||||
| 	nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE); | ||||
| 	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); | ||||
| 	if (!pages) { | ||||
| 		/* Must go through out_fail: the locks are still held. */ | ||||
| 		ret = -ENOMEM; | ||||
| 		goto out_fail; | ||||
| 	} | ||||
| 	ret = btrfs_alloc_page_array(nr_pages, pages, 0); | ||||
| 	if (ret) { | ||||
| 		ret = -ENOMEM; | ||||
| 		goto out_fail; | ||||
| 	} | ||||
| 
 | ||||
| 	priv = kmalloc(sizeof(*priv), GFP_NOFS); | ||||
| 	if (!priv) { | ||||
| 		ret = -ENOMEM; | ||||
| 		goto out_fail; | ||||
| 	} | ||||
| 
 | ||||
| 	priv->iocb = *iocb; | ||||
| 	priv->iov = iov; | ||||
| 	priv->iter = *iter; | ||||
| 	priv->count = count; | ||||
| 	priv->cmd = cmd; | ||||
| 	priv->cached_state = cached_state; | ||||
| 	priv->compressed = compressed; | ||||
| 	priv->nr_pages = nr_pages; | ||||
| 	priv->pages = pages; | ||||
| 	priv->start = start; | ||||
| 	priv->lockend = lockend; | ||||
| 	priv->err = 0; | ||||
| 
 | ||||
| 	/* | ||||
| 	 * NOTE(review): if btrfs_encoded_read_regular_fill_pages() can return | ||||
| 	 * an error after it has already called btrfs_uring_read_extent_endio(), | ||||
| 	 * the cleanup at out_fail would collide with the one done by | ||||
| 	 * btrfs_uring_read_finished() - confirm against the callee. | ||||
| 	 */ | ||||
| 	ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr, | ||||
| 						    disk_io_size, pages, priv); | ||||
| 	if (ret && ret != -EIOCBQUEUED) | ||||
| 		goto out_fail; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * If we return -EIOCBQUEUED, we're deferring the cleanup to | ||||
| 	 * btrfs_uring_read_finished(), which will handle unlocking the extent | ||||
| 	 * and inode and freeing the allocations. | ||||
| 	 */ | ||||
| 
 | ||||
| 	return -EIOCBQUEUED; | ||||
| 
 | ||||
| out_fail: | ||||
| 	unlock_extent(io_tree, start, lockend, &cached_state); | ||||
| 	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); | ||||
| 	kfree(priv); | ||||
| 	if (pages) { | ||||
| 		/* The array came from kcalloc(), so unused slots are NULL. */ | ||||
| 		for (i = 0; i < nr_pages; i++) { | ||||
| 			if (pages[i]) | ||||
| 				__free_page(pages[i]); | ||||
| 		} | ||||
| 		kfree(pages); | ||||
| 	} | ||||
| 	kfree(iov); | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * io_uring counterpart of the BTRFS_IOC_ENCODED_READ ioctl. | ||||
|  * | ||||
|  * Reads the encoded I/O arguments from the address in the SQE, imports the | ||||
|  * destination iovecs and calls btrfs_encoded_read(). If that returns | ||||
|  * -EIOCBQUEUED, the read is continued by btrfs_uring_read_extent(), which | ||||
|  * takes over the iovec array. | ||||
|  * | ||||
|  * Returns the btrfs_encoded_read() result when no disk read is needed, | ||||
|  * -EIOCBQUEUED when the read has been queued, or a negative errno. | ||||
|  */ | ||||
| static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue_flags) | ||||
| { | ||||
| 	size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags); | ||||
| 	size_t copy_end; | ||||
| 	struct btrfs_ioctl_encoded_io_args args = { 0 }; | ||||
| 	int ret; | ||||
| 	u64 disk_bytenr, disk_io_size; | ||||
| 	struct file *file; | ||||
| 	struct btrfs_inode *inode; | ||||
| 	struct btrfs_fs_info *fs_info; | ||||
| 	struct extent_io_tree *io_tree; | ||||
| 	struct iovec *iov = NULL; | ||||
| 	struct iov_iter iter; | ||||
| 	loff_t pos; | ||||
| 	struct kiocb kiocb; | ||||
| 	struct extent_state *cached_state = NULL; | ||||
| 	u64 start, lockend; | ||||
| 	void __user *sqe_addr; | ||||
| 
 | ||||
| 	if (!capable(CAP_SYS_ADMIN)) { | ||||
| 		ret = -EPERM; | ||||
| 		goto out_acct; | ||||
| 	} | ||||
| 	file = cmd->file; | ||||
| 	inode = BTRFS_I(file->f_inode); | ||||
| 	fs_info = inode->root->fs_info; | ||||
| 	io_tree = &inode->io_tree; | ||||
| 	sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)); | ||||
| 
 | ||||
| 	if (issue_flags & IO_URING_F_COMPAT) { | ||||
| #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) | ||||
| 		struct btrfs_ioctl_encoded_io_args_32 args32; | ||||
| 
 | ||||
| 		copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32, flags); | ||||
| 		if (copy_from_user(&args32, sqe_addr, copy_end)) { | ||||
| 			ret = -EFAULT; | ||||
| 			goto out_acct; | ||||
| 		} | ||||
| 		args.iov = compat_ptr(args32.iov); | ||||
| 		args.iovcnt = args32.iovcnt; | ||||
| 		args.offset = args32.offset; | ||||
| 		args.flags = args32.flags; | ||||
| #else | ||||
| 		return -ENOTTY; | ||||
| #endif | ||||
| 	} else { | ||||
| 		copy_end = copy_end_kernel; | ||||
| 		if (copy_from_user(&args, sqe_addr, copy_end)) { | ||||
| 			ret = -EFAULT; | ||||
| 			goto out_acct; | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	/* Reject unknown flags through out_acct, like the other input checks. */ | ||||
| 	if (args.flags != 0) { | ||||
| 		ret = -EINVAL; | ||||
| 		goto out_acct; | ||||
| 	} | ||||
| 
 | ||||
| 	/* | ||||
| 	 * Pass no on-stack array (fast_segs == 0): the iterator may be copied | ||||
| 	 * into a btrfs_uring_priv and used after this function has returned, | ||||
| 	 * so it must never point at an iovec array in this stack frame. | ||||
| 	 */ | ||||
| 	ret = import_iovec(ITER_DEST, args.iov, args.iovcnt, 0, &iov, &iter); | ||||
| 	if (ret < 0) | ||||
| 		goto out_acct; | ||||
| 
 | ||||
| 	if (iov_iter_count(&iter) == 0) { | ||||
| 		ret = 0; | ||||
| 		goto out_free; | ||||
| 	} | ||||
| 
 | ||||
| 	pos = args.offset; | ||||
| 	ret = rw_verify_area(READ, file, &pos, args.len); | ||||
| 	if (ret < 0) | ||||
| 		goto out_free; | ||||
| 
 | ||||
| 	init_sync_kiocb(&kiocb, file); | ||||
| 	kiocb.ki_pos = pos; | ||||
| 
 | ||||
| 	if (issue_flags & IO_URING_F_NONBLOCK) | ||||
| 		kiocb.ki_flags |= IOCB_NOWAIT; | ||||
| 
 | ||||
| 	start = ALIGN_DOWN(pos, fs_info->sectorsize); | ||||
| 	lockend = start + BTRFS_MAX_UNCOMPRESSED - 1; | ||||
| 
 | ||||
| 	ret = btrfs_encoded_read(&kiocb, &iter, &args, &cached_state, | ||||
| 				 &disk_bytenr, &disk_io_size); | ||||
| 	if (ret < 0 && ret != -EIOCBQUEUED) | ||||
| 		goto out_free; | ||||
| 
 | ||||
| 	file_accessed(file); | ||||
| 
 | ||||
| 	/* Hand the output fields (everything after 'flags') back to userspace. */ | ||||
| 	if (copy_to_user(sqe_addr + copy_end, (const char *)&args + copy_end_kernel, | ||||
| 			 sizeof(args) - copy_end_kernel)) { | ||||
| 		if (ret == -EIOCBQUEUED) { | ||||
| 			unlock_extent(io_tree, start, lockend, &cached_state); | ||||
| 			btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); | ||||
| 		} | ||||
| 		ret = -EFAULT; | ||||
| 		goto out_free; | ||||
| 	} | ||||
| 
 | ||||
| 	if (ret == -EIOCBQUEUED) { | ||||
| 		u64 count; | ||||
| 
 | ||||
| 		count = min_t(u64, iov_iter_count(&iter), disk_io_size); | ||||
| 
 | ||||
| 		/* Match ioctl by not returning past EOF if uncompressed. */ | ||||
| 		if (!args.compression) | ||||
| 			count = min_t(u64, count, args.len); | ||||
| 
 | ||||
| 		/* The iovec array is handed over here, so skip out_free. */ | ||||
| 		ret = btrfs_uring_read_extent(&kiocb, &iter, start, lockend, | ||||
| 					      cached_state, disk_bytenr, | ||||
| 					      disk_io_size, count, | ||||
| 					      args.compression, iov, cmd); | ||||
| 
 | ||||
| 		goto out_acct; | ||||
| 	} | ||||
| 
 | ||||
| out_free: | ||||
| 	kfree(iov); | ||||
| 
 | ||||
| out_acct: | ||||
| 	if (ret > 0) | ||||
| 		add_rchar(current, ret); | ||||
| 	inc_syscr(current); | ||||
| 
 | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * ->uring_cmd() entry point for btrfs files: dispatches on cmd->cmd_op. | ||||
|  * Encoded reads are the only operation handled; any other opcode yields | ||||
|  * -EINVAL. | ||||
|  */ | ||||
| int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) | ||||
| { | ||||
| 	int ret; | ||||
| 
 | ||||
| 	switch (cmd->cmd_op) { | ||||
| 	case BTRFS_IOC_ENCODED_READ: | ||||
| #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) | ||||
| 	case BTRFS_IOC_ENCODED_READ_32: | ||||
| #endif | ||||
| 		ret = btrfs_uring_encoded_read(cmd, issue_flags); | ||||
| 		break; | ||||
| 	default: | ||||
| 		ret = -EINVAL; | ||||
| 		break; | ||||
| 	} | ||||
| 
 | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| long btrfs_ioctl(struct file *file, unsigned int | ||||
| 		cmd, unsigned long arg) | ||||
| { | ||||
|  |  | |||
|  | @ -22,5 +22,7 @@ void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); | |||
| int __pure btrfs_is_empty_uuid(const u8 *uuid); | ||||
| void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, | ||||
| 				     struct btrfs_ioctl_balance_args *bargs); | ||||
| int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags); | ||||
| void btrfs_uring_read_extent_endio(void *ctx, int err); | ||||
| 
 | ||||
| #endif | ||||
|  |  | |||
|  | @ -5669,7 +5669,8 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, | |||
| 	ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode), | ||||
| 						    disk_bytenr, disk_num_bytes, | ||||
| 						    sctx->send_buf_pages + | ||||
| 						    (data_offset >> PAGE_SHIFT)); | ||||
| 						    (data_offset >> PAGE_SHIFT), | ||||
| 						    NULL); | ||||
| 	if (ret) | ||||
| 		goto out; | ||||
| 
 | ||||
|  |  | |||
		Loading…
	
		Reference in a new issue
	
	 Mark Harmstone
						Mark Harmstone