The key idea is that the user can perform any file operations on /dev/vdc
and reclaim the space after use, while the space is still accounted to /data.
This does not require modifying the partition size or the filesystem format.
+
+Per-file Read-Only Large Folio Support
+--------------------------------------
+
+F2FS implements large folio support on the read path to leverage high-order
+page allocation for significant performance gains. To keep the code simple,
+this support currently excludes the write path, which would have to handle
+complex features such as compression and the various block allocation modes.
+
+This optional feature is triggered only when a file's immutable bit is set.
+Consequently, F2FS will return EOPNOTSUPP if a user attempts to open a cached
+file with write permissions, even immediately after clearing the bit. Write
+access is only restored once the cached inode is dropped. The usage flow is
+demonstrated below:
+
+.. code-block::
+
+ # f2fs_io setflags immutable /data/testfile_read_seq
+
+ /* flush and reload the inode to enable the large folio */
+ # sync && echo 3 > /proc/sys/vm/drop_caches
+
+ /* mmap(MAP_POPULATE) + mlock() */
+ # f2fs_io read 128 0 1024 mmap 1 0 /data/testfile_read_seq
+
+ /* mmap() + fadvise(POSIX_FADV_WILLNEED) + mlock() */
+ # f2fs_io read 128 0 1024 fadvise 1 0 /data/testfile_read_seq
+
+ /* mmap() + mlock2(MLOCK_ONFAULT) + madvise(MADV_POPULATE_READ) */
+ # f2fs_io read 128 0 1024 madvise 1 0 /data/testfile_read_seq
+
+ # f2fs_io clearflags immutable /data/testfile_read_seq
+
+ # f2fs_io write 1 0 1 zero buffered /data/testfile_read_seq
+ Failed to open /data/testfile_read_seq: Operation not supported
+
+ /* flush and reload the inode to disable the large folio */
+ # sync && echo 3 > /proc/sys/vm/drop_caches
+
+ # f2fs_io write 1 0 1 zero buffered /data/testfile_read_seq
+ Written 4096 bytes with pattern = zero, total_time = 29 us, max_latency = 28 us
+
+ # rm /data/testfile_read_seq
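+
+For applications that cannot rely on the f2fs_io tool, the immutable bit can
+also be toggled through the generic FS_IOC_GETFLAGS/FS_IOC_SETFLAGS ioctls.
+The sketch below is illustrative only: the helper name and file path are
+examples, setting FS_IMMUTABLE_FL requires CAP_LINUX_IMMUTABLE, and the inode
+still has to be flushed and reloaded as shown above before large folios take
+effect.
+
+.. code-block:: c
+
+ #include <fcntl.h>
+ #include <sys/ioctl.h>
+ #include <unistd.h>
+ #include <linux/fs.h>
+
+ /* Mark an already-written file immutable so that F2FS can serve
+  * subsequent reads of it through large folios. */
+ int make_file_immutable(const char *path)
+ {
+         int flags, ret, fd = open(path, O_RDONLY);
+
+         if (fd < 0)
+                 return -1;
+         ret = ioctl(fd, FS_IOC_GETFLAGS, &flags);
+         if (!ret) {
+                 flags |= FS_IMMUTABLE_FL;
+                 ret = ioctl(fd, FS_IOC_SETFLAGS, &flags);
+         }
+         close(fd);
+         return ret;
+ }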
static struct kmem_cache *bio_post_read_ctx_cache;
static struct kmem_cache *bio_entry_slab;
+static struct kmem_cache *ffs_entry_slab;
static mempool_t *bio_post_read_ctx_pool;
static struct bio_set f2fs_bioset;
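+/*
+ * Per-folio private state used on the large folio read path.
+ * read_pages_pending counts the pages of the folio that still have reads
+ * in flight; folio_end_read() is only called once it drops to zero.
+ */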
+struct f2fs_folio_state {
+ spinlock_t state_lock;
+ unsigned int read_pages_pending;
+};
+
#define F2FS_BIO_POOL_SIZE NR_CURSEG_TYPE
int __init f2fs_init_bioset(void)
{
struct folio_iter fi;
struct bio_post_read_ctx *ctx = bio->bi_private;
+ unsigned long flags;
bio_for_each_folio_all(fi, bio) {
struct folio *folio = fi.folio;
+ unsigned nr_pages = fi.length >> PAGE_SHIFT;
+ bool finished = true;
- if (f2fs_is_compressed_page(folio)) {
+ if (!folio_test_large(folio) &&
+ f2fs_is_compressed_page(folio)) {
if (ctx && !ctx->decompression_attempted)
f2fs_end_read_compressed_page(folio, true, 0,
in_task);
continue;
}
- dec_page_count(F2FS_F_SB(folio), __read_io_type(folio));
- folio_end_read(folio, bio->bi_status == BLK_STS_OK);
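+ /*
+ * A large folio may be filled by more than one bio; defer
+ * folio_end_read() until its last pending page has completed.
+ */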
+ if (folio_test_large(folio)) {
+ struct f2fs_folio_state *ffs = folio->private;
+
+ spin_lock_irqsave(&ffs->state_lock, flags);
+ ffs->read_pages_pending -= nr_pages;
+ finished = !ffs->read_pages_pending;
+ spin_unlock_irqrestore(&ffs->state_lock, flags);
+ }
+
+ while (nr_pages--)
+ dec_page_count(F2FS_F_SB(folio), __read_io_type(folio));
+
+ if (finished)
+ folio_end_read(folio, bio->bi_status == BLK_STS_OK);
}
if (ctx)
void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio,
enum page_type type)
{
+ if (!bio)
+ return;
+
WARN_ON_ONCE(!is_read_io(bio_op(bio)));
trace_f2fs_submit_read_bio(sbi->sb, type, bio);
struct dnode_of_data dn;
struct folio *folio;
int err;
-
+retry:
folio = f2fs_grab_cache_folio(mapping, index, for_write);
if (IS_ERR(folio))
return folio;
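+ /*
+ * This path expects an order-0 folio. If the page cache handed back
+ * a large folio, drop and invalidate it, then retry after a short
+ * backoff so an order-0 folio can be re-created.
+ */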
+ if (folio_test_large(folio)) {
+ pgoff_t folio_index = mapping_align_index(mapping, index);
+ unsigned int nr = folio_nr_pages(folio);
+
+ f2fs_folio_put(folio, true);
+ invalidate_inode_pages2_range(mapping, folio_index,
+ folio_index + nr - 1);
+ f2fs_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT);
+ goto retry;
+ }
+
if (f2fs_lookup_read_extent_cache_block(inode, index,
&dn.data_blkaddr)) {
if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr,
}
#endif
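+/*
+ * Return the f2fs_folio_state attached to @folio, allocating and attaching
+ * a new one if the folio does not carry one yet.
+ */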
+static struct f2fs_folio_state *ffs_find_or_alloc(struct folio *folio)
+{
+ struct f2fs_folio_state *ffs = folio->private;
+
+ if (ffs)
+ return ffs;
+
+ ffs = f2fs_kmem_cache_alloc(ffs_entry_slab, GFP_NOIO, true, NULL);
+
+ spin_lock_init(&ffs->state_lock);
+ ffs->read_pages_pending = 0;
+ folio_attach_private(folio, ffs);
+ return ffs;
+}
+
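+/*
+ * Detach the folio's private data and, for large folios, free the attached
+ * f2fs_folio_state.
+ */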
+static void ffs_detach_free(struct folio *folio)
+{
+ struct f2fs_folio_state *ffs;
+
+ if (!folio_test_large(folio)) {
+ folio_detach_private(folio);
+ return;
+ }
+
+ ffs = folio_detach_private(folio);
+ if (!ffs)
+ return;
+
+ WARN_ON_ONCE(ffs->read_pages_pending != 0);
+ kmem_cache_free(ffs_entry_slab, ffs);
+}
+
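+/*
+ * Read path used when the mapping supports large folios (immutable,
+ * non-compressed files): map the blocks backing each folio, build read bios
+ * and track per-folio completion via f2fs_folio_state.
+ */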
+static int f2fs_read_data_large_folio(struct inode *inode,
+ struct readahead_control *rac, struct folio *folio)
+{
+ struct bio *bio = NULL;
+ sector_t last_block_in_bio = 0;
+ struct f2fs_map_blocks map = {0, };
+ pgoff_t index, offset;
+ unsigned max_nr_pages = rac ? readahead_count(rac) :
+ folio_nr_pages(folio);
+ unsigned nrpages;
+ struct f2fs_folio_state *ffs;
+ int ret = 0;
+
+ if (!IS_IMMUTABLE(inode))
+ return -EOPNOTSUPP;
+
+ if (f2fs_compressed_file(inode))
+ return -EOPNOTSUPP;
+
+ map.m_seg_type = NO_CHECK_TYPE;
+
+ if (rac)
+ folio = readahead_folio(rac);
+next_folio:
+ if (!folio)
+ goto out;
+
+ index = folio->index;
+ offset = 0;
+ ffs = NULL;
+ nrpages = folio_nr_pages(folio);
+
+ for (; nrpages; nrpages--) {
+ sector_t block_nr;
+ /*
+ * Map blocks using the previous result first.
+ */
+ if ((map.m_flags & F2FS_MAP_MAPPED) &&
+ index > map.m_lblk &&
+ index < (map.m_lblk + map.m_len))
+ goto got_it;
+
+ /*
+ * Then do more f2fs_map_blocks() calls until we are
+ * done with this page.
+ */
+ memset(&map, 0, sizeof(map));
+ map.m_seg_type = NO_CHECK_TYPE;
+ map.m_lblk = index;
+ map.m_len = max_nr_pages;
+
+ ret = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DEFAULT);
+ if (ret)
+ goto err_out;
+got_it:
+ if ((map.m_flags & F2FS_MAP_MAPPED)) {
+ block_nr = map.m_pblk + index - map.m_lblk;
+ if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr,
+ DATA_GENERIC_ENHANCE_READ)) {
+ ret = -EFSCORRUPTED;
+ goto err_out;
+ }
+ } else {
+ folio_zero_range(folio, offset << PAGE_SHIFT, PAGE_SIZE);
+ if (f2fs_need_verity(inode, index) &&
+ !fsverity_verify_page(folio_file_page(folio,
+ index))) {
+ ret = -EIO;
+ goto err_out;
+ }
+ continue;
+ }
+
+ /*
+ * This page will go to BIO. Do we need to send this
+ * BIO off first?
+ */
+ if (bio && (!page_is_mergeable(F2FS_I_SB(inode), bio,
+ last_block_in_bio, block_nr) ||
+ !f2fs_crypt_mergeable_bio(bio, inode, index, NULL))) {
+submit_and_realloc:
+ f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
+ bio = NULL;
+ }
+ if (!bio)
+ bio = f2fs_grab_read_bio(inode, block_nr,
+ max_nr_pages,
+ f2fs_ra_op_flags(rac),
+ index, false);
+ if (IS_ERR(bio)) {
+ ret = PTR_ERR(bio);
+ bio = NULL;
+ goto err_out;
+ }
+
+ /*
+ * If the page is under writeback, we need to wait for
+ * its completion to see the correct decrypted data.
+ */
+ f2fs_wait_on_block_writeback(inode, block_nr);
+
+ if (!bio_add_folio(bio, folio, F2FS_BLKSIZE,
+ offset << PAGE_SHIFT))
+ goto submit_and_realloc;
+
+ if (folio_test_large(folio)) {
+ ffs = ffs_find_or_alloc(folio);
+
+ /* count this page so completion knows when the whole folio is done */
+ spin_lock_irq(&ffs->state_lock);
+ ffs->read_pages_pending++;
+ spin_unlock_irq(&ffs->state_lock);
+ }
+
+ inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA);
+ f2fs_update_iostat(F2FS_I_SB(inode), NULL, FS_DATA_READ_IO,
+ F2FS_BLKSIZE);
+ last_block_in_bio = block_nr;
+ index++;
+ offset++;
+ }
+ if (rac) {
+ folio = readahead_folio(rac);
+ goto next_folio;
+ }
+err_out:
+ /* Nothing was submitted. */
+ if (!bio) {
+ if (!ret)
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
+ return ret;
+ }
+
+ if (ret) {
+ f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
+ bio = NULL;
+
+ /* Wait for the in-flight bios and clear uptodate. */
+ folio_lock(folio);
+ folio_clear_uptodate(folio);
+ folio_unlock(folio);
+ }
+out:
+ f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
+ return ret;
+}
+
/*
* This function was originally taken from fs/mpage.c, and customized for f2fs.
* Major change was from block_size == page_size in f2fs by default.
pgoff_t index;
#endif
unsigned nr_pages = rac ? readahead_count(rac) : 1;
+ struct address_space *mapping = rac ? rac->mapping : folio->mapping;
unsigned max_nr_pages = nr_pages;
int ret = 0;
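+ /* Large-folio mappings are served by the dedicated read-only path. */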
+ if (mapping_large_folio_support(mapping))
+ return f2fs_read_data_large_folio(inode, rac, folio);
+
#ifdef CONFIG_F2FS_FS_COMPRESSION
if (f2fs_compressed_file(inode)) {
index = rac ? readahead_index(rac) : folio->index;
}
#endif
}
- if (bio)
- f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
+ f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
return ret;
}
f2fs_remove_dirty_inode(inode);
}
}
- folio_detach_private(folio);
+
+ if (offset || length != folio_size(folio))
+ return;
+
+ folio_cancel_dirty(folio);
+ ffs_detach_free(folio);
}
bool f2fs_release_folio(struct folio *folio, gfp_t wait)
if (folio_test_dirty(folio))
return false;
- folio_detach_private(folio);
+ ffs_detach_free(folio);
return true;
}
{
bio_entry_slab = f2fs_kmem_cache_create("f2fs_bio_entry_slab",
sizeof(struct bio_entry));
- return bio_entry_slab ? 0 : -ENOMEM;
+
+ if (!bio_entry_slab)
+ return -ENOMEM;
+
+ ffs_entry_slab = f2fs_kmem_cache_create("f2fs_ffs_slab",
+ sizeof(struct f2fs_folio_state));
+
+ if (!ffs_entry_slab) {
+ kmem_cache_destroy(bio_entry_slab);
+ return -ENOMEM;
+ }
+
+ return 0;
}
void f2fs_destroy_bio_entry_cache(void)
{
kmem_cache_destroy(bio_entry_slab);
+ kmem_cache_destroy(ffs_entry_slab);
}
static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,