The key idea is that the user can perform any file operations on /dev/vdc
and reclaim the space after use, while the space is still accounted to /data.
This does not require modifying the partition size or the filesystem format.
+
+Per-file Read-Only Large Folio Support
+--------------------------------------
+
+F2FS implements large folio support on the read path to leverage high-order
+page allocation for significant performance gains. To keep the code simple,
+this support currently excludes the write path, which would have to handle
+complex features such as compression and the various block allocation modes.
+
+This optional feature is triggered only when a file's immutable bit is set.
+Consequently, F2FS will return EOPNOTSUPP if a user attempts to open a cached
+file with write permissions, even immediately after clearing the bit. Write
+access is only restored once the cached inode is dropped. The usage flow is
+demonstrated below:
+
+.. code-block::
+
+ # f2fs_io setflags immutable /data/testfile_read_seq
+
+ /* flush and reload the inode to enable the large folio */
+ # sync && echo 3 > /proc/sys/vm/drop_caches
+
+ /* mmap(MAP_POPULATE) + mlock() */
+ # f2fs_io read 128 0 1024 mmap 1 0 /data/testfile_read_seq
+
+ /* mmap() + fadvise(POSIX_FADV_WILLNEED) + mlock() */
+ # f2fs_io read 128 0 1024 fadvise 1 0 /data/testfile_read_seq
+
+ /* mmap() + mlock2(MLOCK_ONFAULT) + madvise(MADV_POPULATE_READ) */
+ # f2fs_io read 128 0 1024 madvise 1 0 /data/testfile_read_seq
+
+ # f2fs_io clearflags immutable /data/testfile_read_seq
+
+ # f2fs_io write 1 0 1 zero buffered /data/testfile_read_seq
+ Failed to open /data/testfile_read_seq: Operation not supported
+
+ /* flush and reload the inode to disable the large folio */
+ # sync && echo 3 > /proc/sys/vm/drop_caches
+
+ # f2fs_io write 1 0 1 zero buffered /data/testfile_read_seq
+ Written 4096 bytes with pattern = zero, total_time = 29 us, max_latency = 28 us
+
+ # rm /data/testfile_read_seq
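+
+For applications that cannot rely on the f2fs_io tool, the immutable bit can
+also be toggled through the generic FS_IOC_GETFLAGS/FS_IOC_SETFLAGS ioctls.
+The sketch below is illustrative only: the helper name and file path are
+examples, setting FS_IMMUTABLE_FL requires CAP_LINUX_IMMUTABLE, and the inode
+still has to be flushed and reloaded as shown above before large folios take
+effect.
+
+.. code-block:: c
+
+ #include <fcntl.h>
+ #include <sys/ioctl.h>
+ #include <unistd.h>
+ #include <linux/fs.h>
+
+ /* Mark an already-written file immutable so that F2FS can serve
+  * subsequent reads of it through large folios. */
+ int make_file_immutable(const char *path)
+ {
+         int flags, ret, fd = open(path, O_RDONLY);
+
+         if (fd < 0)
+                 return -1;
+         ret = ioctl(fd, FS_IOC_GETFLAGS, &flags);
+         if (!ret) {
+                 flags |= FS_IMMUTABLE_FL;
+                 ret = ioctl(fd, FS_IOC_SETFLAGS, &flags);
+         }
+         close(fd);
+         return ret;
+ }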
static struct kmem_cache *bio_post_read_ctx_cache;
static struct kmem_cache *bio_entry_slab;
+static struct kmem_cache *ffs_entry_slab;
static mempool_t *bio_post_read_ctx_pool;
static struct bio_set f2fs_bioset;
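+/*
+ * Per-folio private state used on the large folio read path.
+ * read_pages_pending counts the pages of the folio that still have reads
+ * in flight; folio_end_read() is only called once it drops to zero.
+ */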
+struct f2fs_folio_state {
+ spinlock_t state_lock;
+ unsigned int read_pages_pending;
+};
+
#define F2FS_BIO_POOL_SIZE NR_CURSEG_TYPE
int __init f2fs_init_bioset(void)
{
struct folio_iter fi;
struct bio_post_read_ctx *ctx = bio->bi_private;
+ unsigned long flags;
bio_for_each_folio_all(fi, bio) {
struct folio *folio = fi.folio;
+ unsigned nr_pages = fi.length >> PAGE_SHIFT;
+ bool finished = true;
- if (f2fs_is_compressed_page(folio)) {
+ if (!folio_test_large(folio) &&
+ f2fs_is_compressed_page(folio)) {
if (ctx && !ctx->decompression_attempted)
f2fs_end_read_compressed_page(folio, true, 0,
in_task);
continue;
}
- dec_page_count(F2FS_F_SB(folio), __read_io_type(folio));
- folio_end_read(folio, bio->bi_status == BLK_STS_OK);
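+ /*
+ * A large folio may be filled by more than one bio; defer
+ * folio_end_read() until its last pending page has completed.
+ */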
+ if (folio_test_large(folio)) {
+ struct f2fs_folio_state *ffs = folio->private;
+
+ spin_lock_irqsave(&ffs->state_lock, flags);
+ ffs->read_pages_pending -= nr_pages;
+ finished = !ffs->read_pages_pending;
+ spin_unlock_irqrestore(&ffs->state_lock, flags);
+ }
+
+ while (nr_pages--)
+ dec_page_count(F2FS_F_SB(folio), __read_io_type(folio));
+
+ if (finished)
+ folio_end_read(folio, bio->bi_status == BLK_STS_OK);
}
if (ctx)
void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio,
enum page_type type)
{
+ if (!bio)
+ return;
+
WARN_ON_ONCE(!is_read_io(bio_op(bio)));
trace_f2fs_submit_read_bio(sbi->sb, type, bio);
struct dnode_of_data dn;
struct folio *folio;
int err;
-
+retry:
folio = f2fs_grab_cache_folio(mapping, index, for_write);
if (IS_ERR(folio))
return folio;
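+ /*
+ * This path expects an order-0 folio. If the page cache handed back
+ * a large folio, drop and invalidate it, then retry after a short
+ * backoff so an order-0 folio can be re-created.
+ */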
+ if (folio_test_large(folio)) {
+ pgoff_t folio_index = mapping_align_index(mapping, index);
+ unsigned int nr = folio_nr_pages(folio);
+
+ f2fs_folio_put(folio, true);
+ invalidate_inode_pages2_range(mapping, folio_index,
+ folio_index + nr - 1);
+ f2fs_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT);
+ goto retry;
+ }
+
if (f2fs_lookup_read_extent_cache_block(inode, index,
&dn.data_blkaddr)) {
if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr,
}
#endif
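+/*
+ * Return the f2fs_folio_state attached to @folio, allocating and attaching
+ * a new one if the folio does not carry one yet.
+ */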
+static struct f2fs_folio_state *ffs_find_or_alloc(struct folio *folio)
+{
+ struct f2fs_folio_state *ffs = folio->private;
+
+ if (ffs)
+ return ffs;
+
+ ffs = f2fs_kmem_cache_alloc(ffs_entry_slab, GFP_NOIO, true, NULL);
+
+ spin_lock_init(&ffs->state_lock);
+ ffs->read_pages_pending = 0;
+ folio_attach_private(folio, ffs);
+ return ffs;
+}
+
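+/*
+ * Detach the folio's private data and, for large folios, free the attached
+ * f2fs_folio_state.
+ */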
+static void ffs_detach_free(struct folio *folio)
+{
+ struct f2fs_folio_state *ffs;
+
+ if (!folio_test_large(folio)) {
+ folio_detach_private(folio);
+ return;
+ }
+
+ ffs = folio_detach_private(folio);
+ if (!ffs)
+ return;
+
+ WARN_ON_ONCE(ffs->read_pages_pending != 0);
+ kmem_cache_free(ffs_entry_slab, ffs);
+}
+
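+/*
+ * Read path used when the mapping supports large folios (immutable,
+ * non-compressed files): map the blocks backing each folio, build read bios
+ * and track per-folio completion via f2fs_folio_state.
+ */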
+static int f2fs_read_data_large_folio(struct inode *inode,
+ struct readahead_control *rac, struct folio *folio)
+{
+ struct bio *bio = NULL;
+ sector_t last_block_in_bio = 0;
+ struct f2fs_map_blocks map = {0, };
+ pgoff_t index, offset;
+ unsigned max_nr_pages = rac ? readahead_count(rac) :
+ folio_nr_pages(folio);
+ unsigned nrpages;
+ struct f2fs_folio_state *ffs;
+ int ret = 0;
+
+ if (!IS_IMMUTABLE(inode))
+ return -EOPNOTSUPP;
+
+ if (f2fs_compressed_file(inode))
+ return -EOPNOTSUPP;
+
+ map.m_seg_type = NO_CHECK_TYPE;
+
+ if (rac)
+ folio = readahead_folio(rac);
+next_folio:
+ if (!folio)
+ goto out;
+
+ index = folio->index;
+ offset = 0;
+ ffs = NULL;
+ nrpages = folio_nr_pages(folio);
+
+ for (; nrpages; nrpages--) {
+ sector_t block_nr;
+ /*
+ * Map blocks using the previous result first.
+ */
+ if ((map.m_flags & F2FS_MAP_MAPPED) &&
+ index > map.m_lblk &&
+ index < (map.m_lblk + map.m_len))
+ goto got_it;
+
+ /*
+ * Then do more f2fs_map_blocks() calls until we are
+ * done with this page.
+ */
+ memset(&map, 0, sizeof(map));
+ map.m_seg_type = NO_CHECK_TYPE;
+ map.m_lblk = index;
+ map.m_len = max_nr_pages;
+
+ ret = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DEFAULT);
+ if (ret)
+ goto err_out;
+got_it:
+ if ((map.m_flags & F2FS_MAP_MAPPED)) {
+ block_nr = map.m_pblk + index - map.m_lblk;
+ if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr,
+ DATA_GENERIC_ENHANCE_READ)) {
+ ret = -EFSCORRUPTED;
+ goto err_out;
+ }
+ } else {
+ folio_zero_range(folio, offset << PAGE_SHIFT, PAGE_SIZE);
+ if (f2fs_need_verity(inode, index) &&
+ !fsverity_verify_page(folio_file_page(folio,
+ index))) {
+ ret = -EIO;
+ goto err_out;
+ }
+ continue;
+ }
+
+ /*
+ * This page will go to BIO. Do we need to send this
+ * BIO off first?
+ */
+ if (bio && (!page_is_mergeable(F2FS_I_SB(inode), bio,
+ last_block_in_bio, block_nr) ||
+ !f2fs_crypt_mergeable_bio(bio, inode, index, NULL))) {
+submit_and_realloc:
+ f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
+ bio = NULL;
+ }
+ if (!bio)
+ bio = f2fs_grab_read_bio(inode, block_nr,
+ max_nr_pages,
+ f2fs_ra_op_flags(rac),
+ index, false);
+ if (IS_ERR(bio)) {
+ ret = PTR_ERR(bio);
+ bio = NULL;
+ goto err_out;
+ }
+
+ /*
+ * If the page is under writeback, we need to wait for
+ * its completion to see the correct decrypted data.
+ */
+ f2fs_wait_on_block_writeback(inode, block_nr);
+
+ if (!bio_add_folio(bio, folio, F2FS_BLKSIZE,
+ offset << PAGE_SHIFT))
+ goto submit_and_realloc;
+
+ if (folio_test_large(folio)) {
+ ffs = ffs_find_or_alloc(folio);
+
+ /* count this page so completion knows when the whole folio is done */
+ spin_lock_irq(&ffs->state_lock);
+ ffs->read_pages_pending++;
+ spin_unlock_irq(&ffs->state_lock);
+ }
+
+ inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA);
+ f2fs_update_iostat(F2FS_I_SB(inode), NULL, FS_DATA_READ_IO,
+ F2FS_BLKSIZE);
+ last_block_in_bio = block_nr;
+ index++;
+ offset++;
+ }
+ if (rac) {
+ folio = readahead_folio(rac);
+ goto next_folio;
+ }
+err_out:
+ /* Nothing was submitted. */
+ if (!bio) {
+ if (!ret)
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
+ return ret;
+ }
+
+ if (ret) {
+ f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
+ bio = NULL;
+
+ /* Wait for the in-flight bios and clear uptodate. */
+ folio_lock(folio);
+ folio_clear_uptodate(folio);
+ folio_unlock(folio);
+ }
+out:
+ f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
+ return ret;
+}
+
/*
* This function was originally taken from fs/mpage.c, and customized for f2fs.
* Major change was from block_size == page_size in f2fs by default.
pgoff_t index;
#endif
unsigned nr_pages = rac ? readahead_count(rac) : 1;
+ struct address_space *mapping = rac ? rac->mapping : folio->mapping;
unsigned max_nr_pages = nr_pages;
int ret = 0;
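+ /* Large-folio mappings are served by the dedicated read-only path. */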
+ if (mapping_large_folio_support(mapping))
+ return f2fs_read_data_large_folio(inode, rac, folio);
+
#ifdef CONFIG_F2FS_FS_COMPRESSION
if (f2fs_compressed_file(inode)) {
index = rac ? readahead_index(rac) : folio->index;
}
#endif
}
- if (bio)
- f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
+ f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
return ret;
}
f2fs_remove_dirty_inode(inode);
}
}
- folio_detach_private(folio);
+
+ if (offset || length != folio_size(folio))
+ return;
+
+ folio_cancel_dirty(folio);
+ ffs_detach_free(folio);
}
bool f2fs_release_folio(struct folio *folio, gfp_t wait)
if (folio_test_dirty(folio))
return false;
- folio_detach_private(folio);
+ ffs_detach_free(folio);
return true;
}
{
bio_entry_slab = f2fs_kmem_cache_create("f2fs_bio_entry_slab",
sizeof(struct bio_entry));
- return bio_entry_slab ? 0 : -ENOMEM;
+
+ if (!bio_entry_slab)
+ return -ENOMEM;
+
+ ffs_entry_slab = f2fs_kmem_cache_create("f2fs_ffs_slab",
+ sizeof(struct f2fs_folio_state));
+
+ if (!ffs_entry_slab) {
+ kmem_cache_destroy(bio_entry_slab);
+ return -ENOMEM;
+ }
+
+ return 0;
}
void f2fs_destroy_bio_entry_cache(void)
{
kmem_cache_destroy(bio_entry_slab);
+ kmem_cache_destroy(ffs_entry_slab);
}
static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,