git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
fsverity: kick off hash readahead at data I/O submission time
authorChristoph Hellwig <hch@lst.de>
Mon, 2 Feb 2026 06:06:33 +0000 (07:06 +0100)
committerEric Biggers <ebiggers@kernel.org>
Tue, 3 Feb 2026 01:15:26 +0000 (17:15 -0800)
Currently all reads of the fsverity hashes are kicked off from the data
I/O completion handler, leading to needlessly dependent I/O.  This is
worked around a bit by performing readahead on the level 0 nodes, but
that is still fairly ineffective.

Switch to a model where the ->read_folio and ->readahead methods instead
kick off explicit readahead of the fsverity hashes so they are usually
available at I/O completion time.

For 64k sequential reads on my test VM this improves read performance
from 2.4GB/s - 2.6GB/s to 3.5GB/s - 3.9GB/s.  The improvements for
random reads are likely to be even bigger.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: David Sterba <dsterba@suse.com> # btrfs
Link: https://lore.kernel.org/r/20260202060754.270269-5-hch@lst.de
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
fs/btrfs/verity.c
fs/ext4/readpage.c
fs/ext4/verity.c
fs/f2fs/data.c
fs/f2fs/verity.c
fs/verity/pagecache.c
fs/verity/read_metadata.c
fs/verity/verify.c
include/linux/fsverity.h

index e7643c22a6bfd13c5f7883fe83a9df74881a24c4..c152bef71e8b1976d54d1c6110f0494d22573119 100644 (file)
@@ -697,7 +697,6 @@ int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size)
  *
  * @inode:         inode to read a merkle tree page for
  * @index:         page index relative to the start of the merkle tree
- * @num_ra_pages:  number of pages to readahead. Optional, we ignore it
  *
  * The Merkle tree is stored in the filesystem btree, but its pages are cached
  * with a logical position past EOF in the inode's mapping.
@@ -705,8 +704,7 @@ int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size)
  * Returns the page we read, or an ERR_PTR on error.
  */
 static struct page *btrfs_read_merkle_tree_page(struct inode *inode,
-                                               pgoff_t index,
-                                               unsigned long num_ra_pages)
+                                               pgoff_t index)
 {
        struct folio *folio;
        u64 off = (u64)index << PAGE_SHIFT;
index 5a7774f089e81a96274c89926de945d6fb171732..3049a66a05c86477010d4e739b0f7586eb8a943b 100644 (file)
@@ -397,18 +397,20 @@ next_page:
 
 int ext4_read_folio(struct file *file, struct folio *folio)
 {
-       int ret = -EAGAIN;
        struct inode *inode = folio->mapping->host;
+       int ret;
 
        trace_ext4_read_folio(inode, folio);
 
-       if (ext4_has_inline_data(inode))
+       if (ext4_has_inline_data(inode)) {
                ret = ext4_readpage_inline(inode, folio);
+               if (ret != -EAGAIN)
+                       return ret;
+       }
 
-       if (ret == -EAGAIN)
-               return ext4_mpage_readpages(inode, NULL, folio);
-
-       return ret;
+       if (ext4_need_verity(inode, folio->index))
+               fsverity_readahead(inode, folio->index, folio_nr_pages(folio));
+       return ext4_mpage_readpages(inode, NULL, folio);
 }
 
 void ext4_readahead(struct readahead_control *rac)
@@ -419,6 +421,9 @@ void ext4_readahead(struct readahead_control *rac)
        if (ext4_has_inline_data(inode))
                return;
 
+       if (ext4_need_verity(inode, readahead_index(rac)))
+               fsverity_readahead(inode, readahead_index(rac),
+                                  readahead_count(rac));
        ext4_mpage_readpages(inode, rac, NULL);
 }
 
index a071860ad36ae5a612c6c821733d62774831bac1..552cc5d81d9487be91864c76abbf55c9cb150397 100644 (file)
@@ -358,11 +358,17 @@ static int ext4_get_verity_descriptor(struct inode *inode, void *buf,
 }
 
 static struct page *ext4_read_merkle_tree_page(struct inode *inode,
-                                              pgoff_t index,
-                                              unsigned long num_ra_pages)
+                                              pgoff_t index)
 {
        index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT;
-       return generic_read_merkle_tree_page(inode, index, num_ra_pages);
+       return generic_read_merkle_tree_page(inode, index);
+}
+
+static void ext4_readahead_merkle_tree(struct inode *inode, pgoff_t index,
+                                      unsigned long nr_pages)
+{
+       index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT;
+       generic_readahead_merkle_tree(inode, index, nr_pages);
 }
 
 static int ext4_write_merkle_tree_block(struct file *file, const void *buf,
@@ -380,5 +386,6 @@ const struct fsverity_operations ext4_verityops = {
        .end_enable_verity      = ext4_end_enable_verity,
        .get_verity_descriptor  = ext4_get_verity_descriptor,
        .read_merkle_tree_page  = ext4_read_merkle_tree_page,
+       .readahead_merkle_tree  = ext4_readahead_merkle_tree,
        .write_merkle_tree_block = ext4_write_merkle_tree_block,
 };
index c30e69392a62361d746ae0fc2200b16af0124da2..58d8a311ef2c3b191bcfe89f23e3c5aff8428587 100644 (file)
@@ -2458,7 +2458,7 @@ next_page:
 static int f2fs_read_data_folio(struct file *file, struct folio *folio)
 {
        struct inode *inode = folio->mapping->host;
-       int ret = -EAGAIN;
+       int ret;
 
        trace_f2fs_readpage(folio, DATA);
 
@@ -2468,11 +2468,15 @@ static int f2fs_read_data_folio(struct file *file, struct folio *folio)
        }
 
        /* If the file has inline data, try to read it directly */
-       if (f2fs_has_inline_data(inode))
+       if (f2fs_has_inline_data(inode)) {
                ret = f2fs_read_inline_data(inode, folio);
-       if (ret == -EAGAIN)
-               ret = f2fs_mpage_readpages(inode, NULL, folio);
-       return ret;
+               if (ret != -EAGAIN)
+                       return ret;
+       }
+
+       if (f2fs_need_verity(inode, folio->index))
+               fsverity_readahead(inode, folio->index, folio_nr_pages(folio));
+       return f2fs_mpage_readpages(inode, NULL, folio);
 }
 
 static void f2fs_readahead(struct readahead_control *rac)
@@ -2488,6 +2492,9 @@ static void f2fs_readahead(struct readahead_control *rac)
        if (f2fs_has_inline_data(inode))
                return;
 
+       if (f2fs_need_verity(inode, readahead_index(rac)))
+               fsverity_readahead(inode, readahead_index(rac),
+                                  readahead_count(rac));
        f2fs_mpage_readpages(inode, rac, NULL);
 }
 
index d37e584423afe570a929685159a2e4365dff33c4..de2c876213199f2f87992c70108a6296e9203d47 100644 (file)
@@ -256,11 +256,17 @@ static int f2fs_get_verity_descriptor(struct inode *inode, void *buf,
 }
 
 static struct page *f2fs_read_merkle_tree_page(struct inode *inode,
-                                              pgoff_t index,
-                                              unsigned long num_ra_pages)
+                                              pgoff_t index)
 {
        index += f2fs_verity_metadata_pos(inode) >> PAGE_SHIFT;
-       return generic_read_merkle_tree_page(inode, index, num_ra_pages);
+       return generic_read_merkle_tree_page(inode, index);
+}
+
+static void f2fs_readahead_merkle_tree(struct inode *inode, pgoff_t index,
+                                      unsigned long nr_pages)
+{
+       index += f2fs_verity_metadata_pos(inode) >> PAGE_SHIFT;
+       generic_readahead_merkle_tree(inode, index, nr_pages);
 }
 
 static int f2fs_write_merkle_tree_block(struct file *file, const void *buf,
@@ -278,5 +284,6 @@ const struct fsverity_operations f2fs_verityops = {
        .end_enable_verity      = f2fs_end_enable_verity,
        .get_verity_descriptor  = f2fs_get_verity_descriptor,
        .read_merkle_tree_page  = f2fs_read_merkle_tree_page,
+       .readahead_merkle_tree  = f2fs_readahead_merkle_tree,
        .write_merkle_tree_block = f2fs_write_merkle_tree_block,
 };
index 8e0d6fde802f8305ba0210f1d745eca5c50dd3f1..1819314ecaa35e351d2a01b9e7b34151ac980135 100644 (file)
@@ -3,6 +3,7 @@
  * Copyright 2019 Google LLC
  */
 
+#include <linux/export.h>
 #include <linux/fsverity.h>
 #include <linux/pagemap.h>
 
  * generic_read_merkle_tree_page - generic ->read_merkle_tree_page helper
  * @inode:     inode containing the Merkle tree
  * @index:     0-based index of the Merkle tree page in the inode
- * @num_ra_pages: The number of Merkle tree pages that should be prefetched.
  *
  * The caller needs to adjust @index from the Merkle-tree relative index passed
  * to ->read_merkle_tree_page to the actual index where the Merkle tree is
  * stored in the page cache for @inode.
  */
-struct page *generic_read_merkle_tree_page(struct inode *inode, pgoff_t index,
-                                          unsigned long num_ra_pages)
+struct page *generic_read_merkle_tree_page(struct inode *inode, pgoff_t index)
 {
        struct folio *folio;
 
+       folio = read_mapping_folio(inode->i_mapping, index, NULL);
+       if (IS_ERR(folio))
+               return ERR_CAST(folio);
+       return folio_file_page(folio, index);
+}
+EXPORT_SYMBOL_GPL(generic_read_merkle_tree_page);
+
+/**
+ * generic_readahead_merkle_tree() - generic ->readahead_merkle_tree helper
+ * @inode:     inode containing the Merkle tree
+ * @index:     0-based index of the first Merkle tree page to read ahead in the
+ *             inode
+ * @nr_pages:  the number of Merkle tree pages that should be read ahead
+ *
+ * The caller needs to adjust @index from the Merkle-tree relative index passed
+ * to ->read_merkle_tree_page to the actual index where the Merkle tree is
+ * stored in the page cache for @inode.
+ */
+void generic_readahead_merkle_tree(struct inode *inode, pgoff_t index,
+                                  unsigned long nr_pages)
+{
+       struct folio *folio;
+
+       lockdep_assert_held(&inode->i_mapping->invalidate_lock);
+
        folio = __filemap_get_folio(inode->i_mapping, index, FGP_ACCESSED, 0);
        if (folio == ERR_PTR(-ENOENT) ||
            (!IS_ERR(folio) && !folio_test_uptodate(folio))) {
                DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index);
 
-               if (!IS_ERR(folio)) {
-                       folio_put(folio);
-               } else if (num_ra_pages > 1) {
-                       filemap_invalidate_lock_shared(inode->i_mapping);
-                       page_cache_ra_unbounded(&ractl, num_ra_pages, 0);
-                       filemap_invalidate_unlock_shared(inode->i_mapping);
-               }
-               folio = read_mapping_folio(inode->i_mapping, index, NULL);
+               page_cache_ra_unbounded(&ractl, nr_pages, 0);
        }
-       if (IS_ERR(folio))
-               return ERR_CAST(folio);
-       return folio_file_page(folio, index);
+       if (!IS_ERR(folio))
+               folio_put(folio);
 }
-EXPORT_SYMBOL_GPL(generic_read_merkle_tree_page);
+EXPORT_SYMBOL_GPL(generic_readahead_merkle_tree);
index cba5d6af4e04f4056f48e06483a30a838dff2f47..b4c0892430cde8f000f29e585af83d6c143ec18b 100644 (file)
@@ -28,24 +28,33 @@ static int fsverity_read_merkle_tree(struct inode *inode,
        if (offset >= end_offset)
                return 0;
        offs_in_page = offset_in_page(offset);
+       index = offset >> PAGE_SHIFT;
        last_index = (end_offset - 1) >> PAGE_SHIFT;
 
+       /*
+        * Kick off readahead for the range we are going to read to ensure a
+        * single large sequential read instead of lots of small ones.
+        */
+       if (inode->i_sb->s_vop->readahead_merkle_tree) {
+               filemap_invalidate_lock_shared(inode->i_mapping);
+               inode->i_sb->s_vop->readahead_merkle_tree(
+                       inode, index, last_index - index + 1);
+               filemap_invalidate_unlock_shared(inode->i_mapping);
+       }
+
        /*
         * Iterate through each Merkle tree page in the requested range and copy
         * the requested portion to userspace.  Note that the Merkle tree block
         * size isn't important here, as we are returning a byte stream; i.e.,
         * we can just work with pages even if the tree block size != PAGE_SIZE.
         */
-       for (index = offset >> PAGE_SHIFT; index <= last_index; index++) {
-               unsigned long num_ra_pages =
-                       min_t(unsigned long, last_index - index + 1,
-                             inode->i_sb->s_bdi->io_pages);
+       for (; index <= last_index; index++) {
                unsigned int bytes_to_copy = min_t(u64, end_offset - offset,
                                                   PAGE_SIZE - offs_in_page);
                struct page *page;
                const void *virt;
 
-               page = vops->read_merkle_tree_page(inode, index, num_ra_pages);
+               page = vops->read_merkle_tree_page(inode, index);
                if (IS_ERR(page)) {
                        err = PTR_ERR(page);
                        fsverity_err(inode,
index 86067c8b40cf3207d0cc274852f635b9cc90d61e..81e4c6012eb5e0d6472d41de55efab417524e440 100644 (file)
@@ -21,7 +21,6 @@ struct fsverity_pending_block {
 struct fsverity_verification_context {
        struct inode *inode;
        struct fsverity_info *vi;
-       unsigned long max_ra_pages;
 
        /*
         * This is the queue of data blocks that are pending verification.  When
@@ -37,6 +36,50 @@ struct fsverity_verification_context {
 
 static struct workqueue_struct *fsverity_read_workqueue;
 
+/**
+ * fsverity_readahead() - kick off readahead on fsverity hashes
+ * @inode:             inode that is being read
+ * @index:             first file data page index that is being read
+ * @nr_pages:          number of file data pages to be read
+ *
+ * Start readahead on the fsverity hashes that are needed to verify the file
+ * data in the range from @index to @index + @nr_pages (exclusive upper bound).
+ *
+ * To be called from the file systems' ->read_folio and ->readahead methods to
+ * ensure that the hashes are already cached on completion of the file data
+ * read if possible.
+ */
+void fsverity_readahead(struct inode *inode, pgoff_t index,
+                       unsigned long nr_pages)
+{
+       const struct fsverity_info *vi = *fsverity_info_addr(inode);
+       const struct merkle_tree_params *params = &vi->tree_params;
+       u64 start_hidx = (u64)index << params->log_blocks_per_page;
+       u64 end_hidx =
+               (((u64)index + nr_pages) << params->log_blocks_per_page) - 1;
+       int level;
+
+       if (!inode->i_sb->s_vop->readahead_merkle_tree)
+               return;
+
+       for (level = 0; level < params->num_levels; level++) {
+               unsigned long level_start = params->level_start[level];
+               unsigned long next_start_hidx = start_hidx >> params->log_arity;
+               unsigned long next_end_hidx = end_hidx >> params->log_arity;
+               pgoff_t start_idx = (level_start + next_start_hidx) >>
+                                   params->log_blocks_per_page;
+               pgoff_t end_idx = (level_start + next_end_hidx) >>
+                                 params->log_blocks_per_page;
+
+               inode->i_sb->s_vop->readahead_merkle_tree(
+                       inode, start_idx, end_idx - start_idx + 1);
+
+               start_hidx = next_start_hidx;
+               end_hidx = next_end_hidx;
+       }
+}
+EXPORT_SYMBOL_GPL(fsverity_readahead);
+
 /*
  * Returns true if the hash block with index @hblock_idx in the tree, located in
  * @hpage, has already been verified.
@@ -114,8 +157,7 @@ static bool is_hash_block_verified(struct fsverity_info *vi, struct page *hpage,
  * Return: %true if the data block is valid, else %false.
  */
 static bool verify_data_block(struct inode *inode, struct fsverity_info *vi,
-                             const struct fsverity_pending_block *dblock,
-                             unsigned long max_ra_pages)
+                             const struct fsverity_pending_block *dblock)
 {
        const u64 data_pos = dblock->pos;
        const struct merkle_tree_params *params = &vi->tree_params;
@@ -200,8 +242,7 @@ static bool verify_data_block(struct inode *inode, struct fsverity_info *vi,
                          (params->block_size - 1);
 
                hpage = inode->i_sb->s_vop->read_merkle_tree_page(inode,
-                               hpage_idx, level == 0 ? min(max_ra_pages,
-                                       params->tree_pages - hpage_idx) : 0);
+                                                                 hpage_idx);
                if (IS_ERR(hpage)) {
                        fsverity_err(inode,
                                     "Error %ld reading Merkle tree page %lu",
@@ -272,14 +313,12 @@ error:
 
 static void
 fsverity_init_verification_context(struct fsverity_verification_context *ctx,
-                                  struct inode *inode,
-                                  unsigned long max_ra_pages)
+                                  struct inode *inode)
 {
        struct fsverity_info *vi = *fsverity_info_addr(inode);
 
        ctx->inode = inode;
        ctx->vi = vi;
-       ctx->max_ra_pages = max_ra_pages;
        ctx->num_pending = 0;
        if (vi->tree_params.hash_alg->algo_id == HASH_ALGO_SHA256 &&
            sha256_finup_2x_is_optimized())
@@ -322,8 +361,7 @@ fsverity_verify_pending_blocks(struct fsverity_verification_context *ctx)
        }
 
        for (i = 0; i < ctx->num_pending; i++) {
-               if (!verify_data_block(ctx->inode, vi, &ctx->pending_blocks[i],
-                                      ctx->max_ra_pages))
+               if (!verify_data_block(ctx->inode, vi, &ctx->pending_blocks[i]))
                        return false;
        }
        fsverity_clear_pending_blocks(ctx);
@@ -373,7 +411,7 @@ bool fsverity_verify_blocks(struct folio *folio, size_t len, size_t offset)
 {
        struct fsverity_verification_context ctx;
 
-       fsverity_init_verification_context(&ctx, folio->mapping->host, 0);
+       fsverity_init_verification_context(&ctx, folio->mapping->host);
 
        if (fsverity_add_data_blocks(&ctx, folio, len, offset) &&
            fsverity_verify_pending_blocks(&ctx))
@@ -403,22 +441,8 @@ void fsverity_verify_bio(struct bio *bio)
        struct inode *inode = bio_first_folio_all(bio)->mapping->host;
        struct fsverity_verification_context ctx;
        struct folio_iter fi;
-       unsigned long max_ra_pages = 0;
-
-       if (bio->bi_opf & REQ_RAHEAD) {
-               /*
-                * If this bio is for data readahead, then we also do readahead
-                * of the first (largest) level of the Merkle tree.  Namely,
-                * when a Merkle tree page is read, we also try to piggy-back on
-                * some additional pages -- up to 1/4 the number of data pages.
-                *
-                * This improves sequential read performance, as it greatly
-                * reduces the number of I/O requests made to the Merkle tree.
-                */
-               max_ra_pages = bio->bi_iter.bi_size >> (PAGE_SHIFT + 2);
-       }
 
-       fsverity_init_verification_context(&ctx, inode, max_ra_pages);
+       fsverity_init_verification_context(&ctx, inode);
 
        bio_for_each_folio_all(fi, bio) {
                if (!fsverity_add_data_blocks(&ctx, fi.folio, fi.length,
index 8ddaa87fece3a29e4357725d3a4c2737985f65b1..722a42754a86c2855582b1939753abde973cdf06 100644 (file)
@@ -97,10 +97,6 @@ struct fsverity_operations {
         *
         * @inode: the inode
         * @index: 0-based index of the page within the Merkle tree
-        * @num_ra_pages: The number of Merkle tree pages that should be
-        *                prefetched starting at @index if the page at @index
-        *                isn't already cached.  Implementations may ignore this
-        *                argument; it's only a performance optimization.
         *
         * This can be called at any time on an open verity file.  It may be
         * called by multiple processes concurrently, even with the same page.
@@ -110,8 +106,23 @@ struct fsverity_operations {
         * Return: the page on success, ERR_PTR() on failure
         */
        struct page *(*read_merkle_tree_page)(struct inode *inode,
-                                             pgoff_t index,
-                                             unsigned long num_ra_pages);
+                                             pgoff_t index);
+
+       /**
+        * Perform readahead of a Merkle tree for the given inode.
+        *
+        * @inode: the inode
+        * @index: 0-based index of the first page within the Merkle tree
+        * @nr_pages: number of pages to be read ahead.
+        *
+        * This can be called at any time on an open verity file.  It may be
+        * called by multiple processes concurrently, even with the same range.
+        *
+        * Optional method so that ->read_merkle_tree_page preferably finds
+        * cached data instead of issuing dependent I/O.
+        */
+       void (*readahead_merkle_tree)(struct inode *inode, pgoff_t index,
+                                     unsigned long nr_pages);
 
        /**
         * Write a Merkle tree block to the given file.
@@ -308,8 +319,11 @@ static inline int fsverity_file_open(struct inode *inode, struct file *filp)
 }
 
 void fsverity_cleanup_inode(struct inode *inode);
+void fsverity_readahead(struct inode *inode, pgoff_t index,
+                       unsigned long nr_pages);
 
-struct page *generic_read_merkle_tree_page(struct inode *inode, pgoff_t index,
-                                          unsigned long num_ra_pages);
+struct page *generic_read_merkle_tree_page(struct inode *inode, pgoff_t index);
+void generic_readahead_merkle_tree(struct inode *inode, pgoff_t index,
+                                  unsigned long nr_pages);
 
 #endif /* _LINUX_FSVERITY_H */