btrfs: enable experimental bs > ps support

With all the preparation patches, we're able to finally enable btrfs
block size (sector size) larger than page size support and give it a
full fstests run.

And obviously this new feature is hidden behind experimental flags, and
should not be considered as a core feature yet as btrfs' default block
size is still 4K.

But this is still a feature that will shine in the future where 16K
block sized device are widely adopted.

For now there are some features explicitly disabled:

- Direct IO
  This is the most complex part to support, the root reason is we can
  not control the pages of iov iter passed in.

  User space programs can only ensure the virtual addresses are
  contiguous, but have no control on their physical addresses.

  Our bs > ps support heavily relies on large folios, and direct IO
  memory can easily break it.

  So direct IO is disabled and will always fall back to buffered IO.

- RAID56
  In theory we can convert RAID56 to use large folios, but it will need
  to be converted back to page based if we want to support direct IO in
  the future.
  So just reject it for now.

- Encoded send
- Encoded read
  Both are utilizing btrfs_encoded_read_regular_fill_pages(), and send
  is utilizing vmallocated memory.
  Unfortunately for vmallocated memory we can not guarantee the minimal
  folio order.

  For send, it will just always fallback to regular writes, which reads
  from page cache and will follow the existing folio order requirement.

- Encoded write
  Encoded write itself is allocating pages by themselves, and we can
  easily change it to follow the minimal order.
  But since encoded read is already disabled, there is no need to only
  enable encoded write.

Finally just like what we did for bs < ps support in the past, add a
warning message for bs > ps mounts.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
This commit is contained in:
Qu Wenruo 2025-09-09 12:38:47 +09:30 committed by David Sterba
parent e9bed72e88
commit 98077f7f21
5 changed files with 58 additions and 15 deletions

View file

@ -786,6 +786,18 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
if (iov_iter_alignment(iter) & blocksize_mask)
return -EINVAL;
/*
* For bs > ps support, we heavily rely on large folios to make sure no
* block will cross large folio boundaries.
*
* But memory provided by direct IO is only virtually contiguous, not
* physically contiguous, and will break the btrfs' large folio requirement.
*
* So for bs > ps support, all direct IOs should fallback to buffered ones.
*/
if (fs_info->sectorsize > PAGE_SIZE)
return -EINVAL;
return 0;
}

View file

@ -3242,18 +3242,24 @@ int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount)
}
/*
* Subpage runtime limitation on v1 cache.
* Subpage/bs > ps runtime limitation on v1 cache.
*
* V1 space cache still has some hard coded PAGE_SIZE usage, while
* we're already defaulting to v2 cache, no need to bother v1 as it's
* going to be deprecated anyway.
*/
if (fs_info->sectorsize < PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) {
if (fs_info->sectorsize != PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) {
btrfs_warn(fs_info,
"v1 space cache is not supported for page size %lu with sectorsize %u",
PAGE_SIZE, fs_info->sectorsize);
return -EINVAL;
}
if (fs_info->sectorsize > PAGE_SIZE && btrfs_fs_incompat(fs_info, RAID56)) {
btrfs_err(fs_info,
"RAID56 is not supported for page size %lu with sectorsize %u",
PAGE_SIZE, fs_info->sectorsize);
return -EINVAL;
}
/* This can be called by remount, we need to protect the super block. */
spin_lock(&fs_info->super_lock);
@ -3388,6 +3394,10 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
fs_info->stripesize = stripesize;
fs_info->fs_devices->fs_info = fs_info;
if (fs_info->sectorsize > PAGE_SIZE)
btrfs_warn(fs_info,
"support for block size %u with page size %zu is experimental, some features may be missing",
fs_info->sectorsize, PAGE_SIZE);
/*
* Handle the space caching options appropriately now that we have the
* super block loaded and validated.

View file

@ -97,7 +97,6 @@ bool __attribute_const__ btrfs_supported_blocksize(u32 blocksize)
*/
if (IS_ENABLED(CONFIG_HIGHMEM) && blocksize > PAGE_SIZE)
return false;
if (blocksize <= PAGE_SIZE)
return true;
#endif
return false;

View file

@ -4418,6 +4418,10 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp,
goto out_acct;
}
if (fs_info->sectorsize > PAGE_SIZE) {
ret = -ENOTTY;
goto out_acct;
}
if (compat) {
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
struct btrfs_ioctl_encoded_io_args_32 args32;
@ -4509,6 +4513,7 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp,
static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool compat)
{
struct btrfs_fs_info *fs_info = inode_to_fs_info(file->f_inode);
struct btrfs_ioctl_encoded_io_args args;
struct iovec iovstack[UIO_FASTIOV];
struct iovec *iov = iovstack;
@ -4522,6 +4527,11 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool
goto out_acct;
}
if (fs_info->sectorsize > PAGE_SIZE) {
ret = -ENOTTY;
goto out_acct;
}
if (!(file->f_mode & FMODE_WRITE)) {
ret = -EBADF;
goto out_acct;
@ -4780,14 +4790,14 @@ static int btrfs_uring_read_extent(struct kiocb *iocb, struct iov_iter *iter,
static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
struct file *file = cmd->file;
struct btrfs_inode *inode = BTRFS_I(file->f_inode);
struct extent_io_tree *io_tree = &inode->io_tree;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags);
size_t copy_end;
int ret;
u64 disk_bytenr, disk_io_size;
struct file *file;
struct btrfs_inode *inode;
struct btrfs_fs_info *fs_info;
struct extent_io_tree *io_tree;
loff_t pos;
struct kiocb kiocb;
struct extent_state *cached_state = NULL;
@ -4803,10 +4813,11 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue
ret = -EPERM;
goto out_acct;
}
file = cmd->file;
inode = BTRFS_I(file->f_inode);
fs_info = inode->root->fs_info;
io_tree = &inode->io_tree;
if (fs_info->sectorsize > PAGE_SIZE) {
ret = -ENOTTY;
goto out_acct;
}
sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr));
if (issue_flags & IO_URING_F_COMPAT) {
@ -4933,9 +4944,10 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue
static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
struct file *file = cmd->file;
struct btrfs_fs_info *fs_info = inode_to_fs_info(file->f_inode);
loff_t pos;
struct kiocb kiocb;
struct file *file;
ssize_t ret;
void __user *sqe_addr;
struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd);
@ -4948,8 +4960,11 @@ static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issu
ret = -EPERM;
goto out_acct;
}
if (fs_info->sectorsize > PAGE_SIZE) {
ret = -ENOTTY;
goto out_acct;
}
file = cmd->file;
sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr));
if (!(file->f_mode & FMODE_WRITE)) {

View file

@ -5654,7 +5654,14 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path,
ei = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
if ((sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) &&
/*
* Do not go through encoded read for bs > ps cases.
*
* Encoded send is using vmallocated pages as buffer, which we can
* not ensure every folio is large enough to contain a block.
*/
if (sctx->send_root->fs_info->sectorsize <= PAGE_SIZE &&
(sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) &&
btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) {
bool is_inline = (btrfs_file_extent_type(leaf, ei) ==
BTRFS_FILE_EXTENT_INLINE);