From: Qu Wenruo <wqu@suse.com>
Date: Tue, 9 Sep 2025 03:08:47 +0000 (+0930)
Subject: btrfs: enable experimental bs > ps support
X-Git-Tag: v6.18-rc1~204^2~7
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=98077f7f2180fa996710452564ebe71adc66af59;p=thirdparty%2Flinux.git

btrfs: enable experimental bs > ps support

With all the preparation patches, we're able to finally enable btrfs
block size (sector size) larger than page size support and give it a
full fstests run.

And obviously this new feature is hidden behind experimental flags, and
should not be considered as a core feature yet as btrfs' default block
size is still 4K.

But this is still a feature that will shine in the future, when devices
with a 16K block size are widely adopted.

For now there are some features explicitly disabled:

- Direct IO
  This is the most complex part to support. The root reason is that we
  cannot control the pages of the iov_iter passed in.

  User space programs can only ensure the virtual addresses are
  contiguous, but have no control on their physical addresses.

  Our bs > ps support heavily relies on large folios, and direct IO
  memory can easily break it.

  So direct IO is disabled and will always fall back to buffered IO.

- RAID56
  In theory we can convert RAID56 to use large folios, but it will need
  to be converted back to page based if we want to support direct IO in
  the future.
  So just reject it for now.

- Encoded send
- Encoded read
  Both are utilizing btrfs_encoded_read_regular_fill_pages(), and send
  is utilizing vmallocated memory.
  Unfortunately for vmallocated memory we can not guarantee the minimal
  folio order.

  For send, it will just always fall back to regular writes, which read
  from page cache and will follow the existing folio order requirement.

- Encoded write
  Encoded write allocates its pages by itself, and we can easily change
  it to follow the minimal order.
  But since encoded read is already disabled, there is no point in
  enabling only encoded write.

Finally just like what we did for bs < ps support in the past, add a
warning message for bs > ps mounts.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---

diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c
index fe9a4bd7e6e68..802d4dbe5b381 100644
--- a/fs/btrfs/direct-io.c
+++ b/fs/btrfs/direct-io.c
@@ -786,6 +786,18 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
 	if (iov_iter_alignment(iter) & blocksize_mask)
 		return -EINVAL;
 
+	/*
+	 * For bs > ps support, we heavily rely on large folios to make sure no
+	 * block will cross large folio boundaries.
+	 *
+	 * But memory provided by direct IO is only virtually contiguous, not
+	 * physically contiguous, and will break the btrfs' large folio requirement.
+	 *
+	 * So for bs > ps support, all direct IOs should fallback to buffered ones.
+	 */
+	if (fs_info->sectorsize > PAGE_SIZE)
+		return -EINVAL;
+
 	return 0;
 }
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 5c57f523f4498..0aee3239518d7 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3242,18 +3242,24 @@ int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount)
 	}
 
 	/*
-	 * Subpage runtime limitation on v1 cache.
+	 * Subpage/bs > ps runtime limitation on v1 cache.
 	 *
 	 * V1 space cache still has some hard coded PAGE_SIZE usage, while
 	 * we're already defaulting to v2 cache, no need to bother v1 as it's
 	 * going to be deprecated anyway.
 	 */
-	if (fs_info->sectorsize < PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) {
+	if (fs_info->sectorsize != PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) {
 		btrfs_warn(fs_info,
 	"v1 space cache is not supported for page size %lu with sectorsize %u",
 			   PAGE_SIZE, fs_info->sectorsize);
 		return -EINVAL;
 	}
+	if (fs_info->sectorsize > PAGE_SIZE && btrfs_fs_incompat(fs_info, RAID56)) {
+		btrfs_err(fs_info,
+		"RAID56 is not supported for page size %lu with sectorsize %u",
+			  PAGE_SIZE, fs_info->sectorsize);
+		return -EINVAL;
+	}
 
 	/* This can be called by remount, we need to protect the super block. */
 	spin_lock(&fs_info->super_lock);
@@ -3388,6 +3394,10 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	fs_info->stripesize = stripesize;
 	fs_info->fs_devices->fs_info = fs_info;
 
+	if (fs_info->sectorsize > PAGE_SIZE)
+		btrfs_warn(fs_info,
+			   "support for block size %u with page size %zu is experimental, some features may be missing",
+			   fs_info->sectorsize, PAGE_SIZE);
 	/*
 	 * Handle the space caching options appropriately now that we have the
 	 * super block loaded and validated.
diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c
index 29ad1c8591944..feb0a2faa8379 100644
--- a/fs/btrfs/fs.c
+++ b/fs/btrfs/fs.c
@@ -97,8 +97,7 @@ bool __attribute_const__ btrfs_supported_blocksize(u32 blocksize)
 	 */
 	if (IS_ENABLED(CONFIG_HIGHMEM) && blocksize > PAGE_SIZE)
 		return false;
-	if (blocksize <= PAGE_SIZE)
-		return true;
+	return true;
 #endif
 	return false;
 }
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 063291519b363..0e9e2b9993923 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4418,6 +4418,10 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp,
 		goto out_acct;
 	}
 
+	if (fs_info->sectorsize > PAGE_SIZE) {
+		ret = -ENOTTY;
+		goto out_acct;
+	}
 	if (compat) {
 #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
 		struct btrfs_ioctl_encoded_io_args_32 args32;
@@ -4509,6 +4513,7 @@ out_acct:
 
 static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool compat)
 {
+	struct btrfs_fs_info *fs_info = inode_to_fs_info(file->f_inode);
 	struct btrfs_ioctl_encoded_io_args args;
 	struct iovec iovstack[UIO_FASTIOV];
 	struct iovec *iov = iovstack;
@@ -4522,6 +4527,11 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool
 		goto out_acct;
 	}
 
+	if (fs_info->sectorsize > PAGE_SIZE) {
+		ret = -ENOTTY;
+		goto out_acct;
+	}
+
 	if (!(file->f_mode & FMODE_WRITE)) {
 		ret = -EBADF;
 		goto out_acct;
@@ -4780,14 +4790,14 @@ out_fail:
 
 static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue_flags)
 {
+	struct file *file = cmd->file;
+	struct btrfs_inode *inode = BTRFS_I(file->f_inode);
+	struct extent_io_tree *io_tree = &inode->io_tree;
+	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 	size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags);
 	size_t copy_end;
 	int ret;
 	u64 disk_bytenr, disk_io_size;
-	struct file *file;
-	struct btrfs_inode *inode;
-	struct btrfs_fs_info *fs_info;
-	struct extent_io_tree *io_tree;
 	loff_t pos;
 	struct kiocb kiocb;
 	struct extent_state *cached_state = NULL;
@@ -4803,10 +4813,11 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue
 		ret = -EPERM;
 		goto out_acct;
 	}
-	file = cmd->file;
-	inode = BTRFS_I(file->f_inode);
-	fs_info = inode->root->fs_info;
-	io_tree = &inode->io_tree;
+	if (fs_info->sectorsize > PAGE_SIZE) {
+		ret = -ENOTTY;
+		goto out_acct;
+	}
+
 	sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr));
 
 	if (issue_flags & IO_URING_F_COMPAT) {
@@ -4933,9 +4944,10 @@ out_acct:
 
 static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issue_flags)
 {
+	struct file *file = cmd->file;
+	struct btrfs_fs_info *fs_info = inode_to_fs_info(file->f_inode);
 	loff_t pos;
 	struct kiocb kiocb;
-	struct file *file;
 	ssize_t ret;
 	void __user *sqe_addr;
 	struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd);
@@ -4948,8 +4960,11 @@ static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issu
 		ret = -EPERM;
 		goto out_acct;
 	}
+	if (fs_info->sectorsize > PAGE_SIZE) {
+		ret = -ENOTTY;
+		goto out_acct;
+	}
 
-	file = cmd->file;
 	sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr));
 
 	if (!(file->f_mode & FMODE_WRITE)) {
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 32653fc44a758..5e073502b9e8e 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -5654,7 +5654,14 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path,
 
 	ei = btrfs_item_ptr(leaf, path->slots[0],
 			    struct btrfs_file_extent_item);
-	if ((sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) &&
+	/*
+	 * Do not go through encoded read for bs > ps cases.
+	 *
+	 * Encoded send is using vmallocated pages as buffer, which we can
+	 * not ensure every folio is large enough to contain a block.
+	 */
+	if (sctx->send_root->fs_info->sectorsize <= PAGE_SIZE &&
+	    (sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) &&
 	    btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) {
 		bool is_inline = (btrfs_file_extent_type(leaf, ei) ==
 				  BTRFS_FILE_EXTENT_INLINE);