--- /dev/null
+From 379ee0db89d417747760eeacdfe48e01b217ac08 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 5 Dec 2024 19:41:53 +0800
+Subject: dm array: fix cursor index when skipping across block boundaries
+
+From: Ming-Hung Tsai <mtsai@redhat.com>
+
+[ Upstream commit 0bb1968da2737ba68fd63857d1af2b301a18d3bf ]
+
+dm_array_cursor_skip() seeks to the target position by loading array
+blocks iteratively until the specified number of entries to skip is
+reached. When seeking across block boundaries, it uses
+dm_array_cursor_next() to step into the next block.
+dm_array_cursor_skip() must first move the cursor index to the end
+of the current block; otherwise, the cursor position could incorrectly
+remain in the same block, causing the actual number of skipped entries
+to be much smaller than expected.
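+
+In simplified form, the fixed loop looks like this (a sketch based on
+dm_array_cursor_skip(); the one-line diff below is the actual change,
+the rest is paraphrased context):
+
+	do {
+		uint32_t remaining = le32_to_cpu(c->ab->nr_entries) - c->index;
+
+		if (count < remaining) {
+			c->index += count;	/* target is inside this block */
+			r = 0;
+			break;
+		}
+
+		count -= remaining;
+		c->index += (remaining - 1);	/* move to the block's last entry */
+		r = dm_array_cursor_next(c);	/* ...so this enters the next block */
+	} while (!r);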
+
+This bug affects cache resizing in v2 metadata and could lead to data
+loss if the fast device is shrunk during the first-time resume. For
+example:
+
+1. create a cache metadata consisting of 32768 blocks, with a dirty block
+   assigned to the second bitmap block. cache_restore v1.0 is required.
+
+cat <<EOF >> cmeta.xml
+<superblock uuid="" block_size="64" nr_cache_blocks="32768" \
+policy="smq" hint_width="4">
+ <mappings>
+ <mapping cache_block="32767" origin_block="0" dirty="true"/>
+ </mappings>
+</superblock>
+EOF
+dmsetup create cmeta --table "0 8192 linear /dev/sdc 0"
+cache_restore -i cmeta.xml -o /dev/mapper/cmeta --metadata-version=2
+
+2. bring up the cache while attempting to discard all the blocks belonging
+   to the second bitmap block (blocks 32576 to 32767). The last command
+   is expected to fail, but it actually succeeds.
+
+dmsetup create cdata --table "0 2084864 linear /dev/sdc 8192"
+dmsetup create corig --table "0 65536 linear /dev/sdc 2105344"
+dmsetup create cache --table "0 65536 cache /dev/mapper/cmeta \
+/dev/mapper/cdata /dev/mapper/corig 64 2 metadata2 writeback smq \
+2 migration_threshold 0"
+
+In addition to the reproducer described above, this fix can be
+verified using the "array_cursor/skip" tests in dm-unit:
+ dm-unit run /pdata/array_cursor/skip/ --kernel-dir <KERNEL_DIR>
+
+Signed-off-by: Ming-Hung Tsai <mtsai@redhat.com>
+Fixes: 9b696229aa7d ("dm persistent data: add cursor skip functions to the cursor APIs")
+Reviewed-by: Joe Thornber <thornber@redhat.com>
+Signed-off-by: Mike Snitzer <snitzer@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/md/persistent-data/dm-array.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c
+index 0850dfdffc8c..8f8792e55806 100644
+--- a/drivers/md/persistent-data/dm-array.c
++++ b/drivers/md/persistent-data/dm-array.c
+@@ -1003,6 +1003,7 @@ int dm_array_cursor_skip(struct dm_array_cursor *c, uint32_t count)
+ }
+
+ count -= remaining;
++ c->index += (remaining - 1);
+ r = dm_array_cursor_next(c);
+
+ } while (!r);
+--
+2.39.5
+
--- /dev/null
+From d8517732262f9518a97c457c1a892eafaf80328f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 5 Dec 2024 19:41:51 +0800
+Subject: dm array: fix releasing a faulty array block twice in
+ dm_array_cursor_end
+
+From: Ming-Hung Tsai <mtsai@redhat.com>
+
+[ Upstream commit f2893c0804d86230ffb8f1c8703fdbb18648abc8 ]
+
+When dm_bm_read_lock() fails due to locking or checksum errors, it
+releases the faulty block implicitly while leaving an invalid output
+pointer behind. The caller of dm_bm_read_lock() should not operate on
+this invalid dm_block pointer, or it will lead to undefined results.
+For example, the dm_array_cursor incorrectly caches the invalid pointer
+on reading a faulty array block, causing a double release in
+dm_array_cursor_end(), then hitting the BUG_ON in dm-bufio cache_put().
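+
+Roughly, the failure sequence is (a sketch of the call flow, not
+verbatim code):
+
+	dm_array_cursor_next()
+	  load_ablock()
+	    get_ablock() -> dm_bm_read_lock() fails and releases the
+	                    block, but c->block keeps the stale pointer
+	dm_array_cursor_end()
+	  unlock_ablock(c->info, c->block) -> second release of the
+	                                      same buffer, BUG_ON in
+	                                      dm-bufio cache_put()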
+
+Steps to reproduce:
+
+1. initialize a cache device
+
+dmsetup create cmeta --table "0 8192 linear /dev/sdc 0"
+dmsetup create cdata --table "0 65536 linear /dev/sdc 8192"
+dmsetup create corig --table "0 524288 linear /dev/sdc 262144"
+dd if=/dev/zero of=/dev/mapper/cmeta bs=4k count=1
+dmsetup create cache --table "0 524288 cache /dev/mapper/cmeta \
+/dev/mapper/cdata /dev/mapper/corig 128 2 metadata2 writethrough smq 0"
+
+2. wipe the second array block offline
+
+dmsetup remove cache cmeta cdata corig
+mapping_root=$(dd if=/dev/sdc bs=1c count=8 skip=192 \
+2>/dev/null | hexdump -e '1/8 "%u\n"')
+ablock=$(dd if=/dev/sdc bs=1c count=8 skip=$((4096*mapping_root+2056)) \
+2>/dev/null | hexdump -e '1/8 "%u\n"')
+dd if=/dev/zero of=/dev/sdc bs=4k count=1 seek=$ablock
+
+3. try to reopen the cache device
+
+dmsetup create cmeta --table "0 8192 linear /dev/sdc 0"
+dmsetup create cdata --table "0 65536 linear /dev/sdc 8192"
+dmsetup create corig --table "0 524288 linear /dev/sdc 262144"
+dmsetup create cache --table "0 524288 cache /dev/mapper/cmeta \
+/dev/mapper/cdata /dev/mapper/corig 128 2 metadata2 writethrough smq 0"
+
+Kernel logs:
+
+(snip)
+device-mapper: array: array_block_check failed: blocknr 0 != wanted 10
+device-mapper: block manager: array validator check failed for block 10
+device-mapper: array: get_ablock failed
+device-mapper: cache metadata: dm_array_cursor_next for mapping failed
+------------[ cut here ]------------
+kernel BUG at drivers/md/dm-bufio.c:638!
+
+Fix by setting the cached block pointer to NULL on errors.
+
+In addition to the reproducer described above, this fix can be
+verified using the "array_cursor/damaged" test in dm-unit:
+ dm-unit run /pdata/array_cursor/damaged --kernel-dir <KERNEL_DIR>
+
+Signed-off-by: Ming-Hung Tsai <mtsai@redhat.com>
+Fixes: fdd1315aa5f0 ("dm array: introduce cursor api")
+Reviewed-by: Joe Thornber <thornber@redhat.com>
+Signed-off-by: Mike Snitzer <snitzer@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/md/persistent-data/dm-array.c | 12 ++++++++----
+ 1 file changed, 8 insertions(+), 4 deletions(-)
+
+diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c
+index 157c9bd2fed7..4866ff56125f 100644
+--- a/drivers/md/persistent-data/dm-array.c
++++ b/drivers/md/persistent-data/dm-array.c
+@@ -917,23 +917,27 @@ static int load_ablock(struct dm_array_cursor *c)
+ if (c->block)
+ unlock_ablock(c->info, c->block);
+
+- c->block = NULL;
+- c->ab = NULL;
+ c->index = 0;
+
+ r = dm_btree_cursor_get_value(&c->cursor, &key, &value_le);
+ if (r) {
+ DMERR("dm_btree_cursor_get_value failed");
+- dm_btree_cursor_end(&c->cursor);
++ goto out;
+
+ } else {
+ r = get_ablock(c->info, le64_to_cpu(value_le), &c->block, &c->ab);
+ if (r) {
+ DMERR("get_ablock failed");
+- dm_btree_cursor_end(&c->cursor);
++ goto out;
+ }
+ }
+
++ return 0;
++
++out:
++ dm_btree_cursor_end(&c->cursor);
++ c->block = NULL;
++ c->ab = NULL;
+ return r;
+ }
+
+--
+2.39.5
+
--- /dev/null
+From d657e02caac44de5b858bc66b6788c23d82d1f33 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 5 Dec 2024 19:41:52 +0800
+Subject: dm array: fix unreleased btree blocks on closing a faulty array
+ cursor
+
+From: Ming-Hung Tsai <mtsai@redhat.com>
+
+[ Upstream commit 626f128ee9c4133b1cfce4be2b34a1508949370e ]
+
+The cached block pointer in dm_array_cursor might be NULL if it reaches
+an unreadable array block, or the array is empty. Therefore,
+dm_array_cursor_end() should call dm_btree_cursor_end() unconditionally,
+to prevent leaving unreleased btree blocks.
+
+This fix can be verified using the "array_cursor/iterate/empty" test
+in dm-unit:
+ dm-unit run /pdata/array_cursor/iterate/empty --kernel-dir <KERNEL_DIR>
+
+Signed-off-by: Ming-Hung Tsai <mtsai@redhat.com>
+Fixes: fdd1315aa5f0 ("dm array: introduce cursor api")
+Reviewed-by: Joe Thornber <thornber@redhat.com>
+Signed-off-by: Mike Snitzer <snitzer@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/md/persistent-data/dm-array.c | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c
+index 4866ff56125f..0850dfdffc8c 100644
+--- a/drivers/md/persistent-data/dm-array.c
++++ b/drivers/md/persistent-data/dm-array.c
+@@ -960,10 +960,10 @@ EXPORT_SYMBOL_GPL(dm_array_cursor_begin);
+
+ void dm_array_cursor_end(struct dm_array_cursor *c)
+ {
+- if (c->block) {
++ if (c->block)
+ unlock_ablock(c->info, c->block);
+- dm_btree_cursor_end(&c->cursor);
+- }
++
++ dm_btree_cursor_end(&c->cursor);
+ }
+ EXPORT_SYMBOL_GPL(dm_array_cursor_end);
+
+--
+2.39.5
+
--- /dev/null
+From 6442221b5d8f00f1a08e5194fde67229095ea1f7 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 16 Dec 2024 13:39:42 +0800
+Subject: exfat: fix the infinite loop in __exfat_free_cluster()
+
+From: Yuezhang Mo <Yuezhang.Mo@sony.com>
+
+[ Upstream commit a5324b3a488d883aa2d42f72260054e87d0940a0 ]
+
+In __exfat_free_cluster(), the cluster chain is traversed until the
+EOF cluster. If the cluster chain includes a loop due to file system
+corruption, the EOF cluster is never reached, resulting in an
+infinite loop.
+
+This commit bounds the traversal by the total number of clusters: if
+more clusters have been freed than the volume can hold, the chain must
+contain a loop, so the walk is stopped to prevent this infinite loop.
+
+Reported-by: syzbot+1de5a37cb85a2d536330@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=1de5a37cb85a2d536330
+Tested-by: syzbot+1de5a37cb85a2d536330@syzkaller.appspotmail.com
+Fixes: 31023864e67a ("exfat: add fat entry operations")
+Signed-off-by: Yuezhang Mo <Yuezhang.Mo@sony.com>
+Reviewed-by: Sungjong Seo <sj1557.seo@samsung.com>
+Signed-off-by: Namjae Jeon <linkinjeon@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/exfat/fatent.c | 10 ++++++++++
+ 1 file changed, 10 insertions(+)
+
+diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c
+index 773c320d68f3..9e5492ac409b 100644
+--- a/fs/exfat/fatent.c
++++ b/fs/exfat/fatent.c
+@@ -216,6 +216,16 @@ static int __exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain
+
+ if (err)
+ goto dec_used_clus;
++
++ if (num_clusters >= sbi->num_clusters - EXFAT_FIRST_CLUSTER) {
++ /*
++ * The cluster chain includes a loop, scan the
++ * bitmap to get the number of used clusters.
++ */
++ exfat_count_used_clusters(sb, &sbi->used_clusters);
++
++ return 0;
++ }
+ } while (clu != EXFAT_EOF_CLUSTER);
+ }
+
+--
+2.39.5
+
--- /dev/null
+From dd98a1f4fe72eb4042682add6dd320753c8b48e8 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 13 Dec 2024 13:08:37 +0800
+Subject: exfat: fix the infinite loop in exfat_readdir()
+
+From: Yuezhang Mo <Yuezhang.Mo@sony.com>
+
+[ Upstream commit fee873761bd978d077d8c55334b4966ac4cb7b59 ]
+
+If the file system is corrupted so that a cluster is linked to
+itself in the cluster chain, and there is an unused directory
+entry in the cluster, 'dentry' will not be incremented, so the
+condition 'dentry < max_dentries' cannot prevent an infinite
+loop.
+
+This infinite loop prevents s_lock from being released, so other
+tasks, such as exfat_sync_fs(), will hang.
+
+This commit stops traversing the cluster chain when there is an
+unused directory entry in the cluster, avoiding this infinite loop.
+
+Reported-by: syzbot+205c2644abdff9d3f9fc@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=205c2644abdff9d3f9fc
+Tested-by: syzbot+205c2644abdff9d3f9fc@syzkaller.appspotmail.com
+Fixes: ca06197382bd ("exfat: add directory operations")
+Signed-off-by: Yuezhang Mo <Yuezhang.Mo@sony.com>
+Reviewed-by: Sungjong Seo <sj1557.seo@samsung.com>
+Signed-off-by: Namjae Jeon <linkinjeon@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/exfat/dir.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c
+index 7446bf09a04a..9d8848872fe8 100644
+--- a/fs/exfat/dir.c
++++ b/fs/exfat/dir.c
+@@ -125,7 +125,7 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent
+ type = exfat_get_entry_type(ep);
+ if (type == TYPE_UNUSED) {
+ brelse(bh);
+- break;
++ goto out;
+ }
+
+ if (type != TYPE_FILE && type != TYPE_DIR) {
+@@ -189,6 +189,7 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent
+ }
+ }
+
++out:
+ dir_entry->namebuf.lfn[0] = '\0';
+ *cpos = EXFAT_DEN_TO_B(dentry);
+ return 0;
+--
+2.39.5
+
--- /dev/null
+From 360e257e52eb050b6f78c12bbef02ff0cc89a7ea Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 12 Dec 2024 16:29:23 +0800
+Subject: exfat: fix the new buffer was not zeroed before writing
+
+From: Yuezhang Mo <Yuezhang.Mo@sony.com>
+
+[ Upstream commit 98e2fb26d1a9eafe79f46d15d54e68e014d81d8c ]
+
+Before writing, if a buffer_head is marked as new, its data must
+be zeroed; otherwise, uninitialized data in the page cache will
+be written out.
+
+So this commit uses folio_zero_new_buffers() to zero the new
+buffers before ->write_end().
+
+Fixes: 6630ea49103c ("exfat: move extend valid_size into ->page_mkwrite()")
+Reported-by: syzbot+91ae49e1c1a2634d20c0@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=91ae49e1c1a2634d20c0
+Tested-by: syzbot+91ae49e1c1a2634d20c0@syzkaller.appspotmail.com
+Signed-off-by: Yuezhang Mo <Yuezhang.Mo@sony.com>
+Reviewed-by: Sungjong Seo <sj1557.seo@samsung.com>
+Signed-off-by: Namjae Jeon <linkinjeon@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/exfat/file.c | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+diff --git a/fs/exfat/file.c b/fs/exfat/file.c
+index fb38769c3e39..05b51e721783 100644
+--- a/fs/exfat/file.c
++++ b/fs/exfat/file.c
+@@ -545,6 +545,7 @@ static int exfat_extend_valid_size(struct file *file, loff_t new_valid_size)
+ while (pos < new_valid_size) {
+ u32 len;
+ struct folio *folio;
++ unsigned long off;
+
+ len = PAGE_SIZE - (pos & (PAGE_SIZE - 1));
+ if (pos + len > new_valid_size)
+@@ -554,6 +555,9 @@ static int exfat_extend_valid_size(struct file *file, loff_t new_valid_size)
+ if (err)
+ goto out;
+
++ off = offset_in_folio(folio, pos);
++ folio_zero_new_buffers(folio, off, off + len);
++
+ err = ops->write_end(file, mapping, pos, len, len, folio, NULL);
+ if (err < 0)
+ goto out;
+@@ -563,6 +567,8 @@ static int exfat_extend_valid_size(struct file *file, loff_t new_valid_size)
+ cond_resched();
+ }
+
++ return 0;
++
+ out:
+ return err;
+ }
+--
+2.39.5
+
--- /dev/null
+From 7bf5bc73f000f77982d49184c607283b7bbfee9a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 26 Sep 2024 16:01:21 +0200
+Subject: fs/writeback: convert wbc_account_cgroup_owner to take a folio
+
+From: Pankaj Raghav <p.raghav@samsung.com>
+
+[ Upstream commit 30dac24e14b52e1787572d1d4e06eeabe8a63630 ]
+
+Most of the callers of wbc_account_cgroup_owner() convert a folio to a
+page before calling the function, and wbc_account_cgroup_owner() then
+converts the page back to a folio to call mem_cgroup_css_from_folio().
+
+Convert wbc_account_cgroup_owner() to take a folio instead of a page,
+and convert all callers to pass a folio directly except f2fs.
+
+Convert the page to a folio for all the callers from f2fs, as they were
+the only callers passing wbc_account_cgroup_owner() a page. As f2fs is
+already in the process of converting to folios, these call sites might
+soon be calling wbc_account_cgroup_owner() with a folio directly as
+well.
+
+No functional changes. Only compile tested.
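+
+Call sites change along these lines (illustrative excerpts from the
+conversions below):
+
+	/* before */
+	wbc_account_cgroup_owner(wbc, &folio->page, len);
+
+	/* after */
+	wbc_account_cgroup_owner(wbc, folio, len);
+
+	/* f2fs, which still has a struct page at hand */
+	wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page),
+				 PAGE_SIZE);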
+
+Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
+Link: https://lore.kernel.org/r/20240926140121.203821-1-kernel@pankajraghav.com
+Acked-by: David Sterba <dsterba@suse.com>
+Acked-by: Tejun Heo <tj@kernel.org>
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Stable-dep-of: 51d20d1dacbe ("iomap: fix zero padding data issue in concurrent append writes")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ Documentation/admin-guide/cgroup-v2.rst | 2 +-
+ fs/btrfs/extent_io.c | 7 +++----
+ fs/btrfs/inode.c | 2 +-
+ fs/buffer.c | 4 ++--
+ fs/ext4/page-io.c | 2 +-
+ fs/f2fs/data.c | 9 ++++++---
+ fs/fs-writeback.c | 8 +++-----
+ fs/iomap/buffered-io.c | 2 +-
+ fs/mpage.c | 2 +-
+ include/linux/writeback.h | 4 ++--
+ 10 files changed, 21 insertions(+), 21 deletions(-)
+
+diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
+index 6d02168d78be..2cb58daf3089 100644
+--- a/Documentation/admin-guide/cgroup-v2.rst
++++ b/Documentation/admin-guide/cgroup-v2.rst
+@@ -2954,7 +2954,7 @@ following two functions.
+ a queue (device) has been associated with the bio and
+ before submission.
+
+- wbc_account_cgroup_owner(@wbc, @page, @bytes)
++ wbc_account_cgroup_owner(@wbc, @folio, @bytes)
+ Should be called for each data segment being written out.
+ While this function doesn't care exactly when it's called
+ during the writeback session, it's the easiest and most
+diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
+index 872cca54cc6c..42c9899d9241 100644
+--- a/fs/btrfs/extent_io.c
++++ b/fs/btrfs/extent_io.c
+@@ -786,7 +786,7 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
+ }
+
+ if (bio_ctrl->wbc)
+- wbc_account_cgroup_owner(bio_ctrl->wbc, &folio->page,
++ wbc_account_cgroup_owner(bio_ctrl->wbc, folio,
+ len);
+
+ size -= len;
+@@ -1708,7 +1708,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
+ ret = bio_add_folio(&bbio->bio, folio, eb->len,
+ eb->start - folio_pos(folio));
+ ASSERT(ret);
+- wbc_account_cgroup_owner(wbc, folio_page(folio, 0), eb->len);
++ wbc_account_cgroup_owner(wbc, folio, eb->len);
+ folio_unlock(folio);
+ } else {
+ int num_folios = num_extent_folios(eb);
+@@ -1722,8 +1722,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
+ folio_start_writeback(folio);
+ ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0);
+ ASSERT(ret);
+- wbc_account_cgroup_owner(wbc, folio_page(folio, 0),
+- eb->folio_size);
++ wbc_account_cgroup_owner(wbc, folio, eb->folio_size);
+ wbc->nr_to_write -= folio_nr_pages(folio);
+ folio_unlock(folio);
+ }
+diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
+index b5cfb85af937..a3c861b2a6d2 100644
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -1729,7 +1729,7 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode,
+ * need full accuracy. Just account the whole thing
+ * against the first page.
+ */
+- wbc_account_cgroup_owner(wbc, &locked_folio->page,
++ wbc_account_cgroup_owner(wbc, locked_folio,
+ cur_end - start);
+ async_chunk[i].locked_folio = locked_folio;
+ locked_folio = NULL;
+diff --git a/fs/buffer.c b/fs/buffer.c
+index 1fc9a50def0b..32bd0f4c4223 100644
+--- a/fs/buffer.c
++++ b/fs/buffer.c
+@@ -2803,7 +2803,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
+ bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+ bio->bi_write_hint = write_hint;
+
+- __bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
++ bio_add_folio_nofail(bio, bh->b_folio, bh->b_size, bh_offset(bh));
+
+ bio->bi_end_io = end_bio_bh_io_sync;
+ bio->bi_private = bh;
+@@ -2813,7 +2813,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
+
+ if (wbc) {
+ wbc_init_bio(wbc, bio);
+- wbc_account_cgroup_owner(wbc, bh->b_page, bh->b_size);
++ wbc_account_cgroup_owner(wbc, bh->b_folio, bh->b_size);
+ }
+
+ submit_bio(bio);
+diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
+index ad5543866d21..b7b9261fec3b 100644
+--- a/fs/ext4/page-io.c
++++ b/fs/ext4/page-io.c
+@@ -421,7 +421,7 @@ static void io_submit_add_bh(struct ext4_io_submit *io,
+ io_submit_init_bio(io, bh);
+ if (!bio_add_folio(io->io_bio, io_folio, bh->b_size, bh_offset(bh)))
+ goto submit_and_retry;
+- wbc_account_cgroup_owner(io->io_wbc, &folio->page, bh->b_size);
++ wbc_account_cgroup_owner(io->io_wbc, folio, bh->b_size);
+ io->io_next_block++;
+ }
+
+diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
+index da0960d496ae..1b0050b8421d 100644
+--- a/fs/f2fs/data.c
++++ b/fs/f2fs/data.c
+@@ -711,7 +711,8 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
+ }
+
+ if (fio->io_wbc && !is_read_io(fio->op))
+- wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE);
++ wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page),
++ PAGE_SIZE);
+
+ inc_page_count(fio->sbi, is_read_io(fio->op) ?
+ __read_io_type(page) : WB_DATA_TYPE(fio->page, false));
+@@ -911,7 +912,8 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio)
+ }
+
+ if (fio->io_wbc)
+- wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE);
++ wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page),
++ PAGE_SIZE);
+
+ inc_page_count(fio->sbi, WB_DATA_TYPE(page, false));
+
+@@ -1011,7 +1013,8 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio)
+ }
+
+ if (fio->io_wbc)
+- wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE);
++ wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page),
++ PAGE_SIZE);
+
+ io->last_block_in_bio = fio->new_blkaddr;
+
+diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
+index d8bec3c1bb1f..2391b09f4ced 100644
+--- a/fs/fs-writeback.c
++++ b/fs/fs-writeback.c
+@@ -890,17 +890,16 @@ EXPORT_SYMBOL_GPL(wbc_detach_inode);
+ /**
+ * wbc_account_cgroup_owner - account writeback to update inode cgroup ownership
+ * @wbc: writeback_control of the writeback in progress
+- * @page: page being written out
++ * @folio: folio being written out
+ * @bytes: number of bytes being written out
+ *
+- * @bytes from @page are about to written out during the writeback
++ * @bytes from @folio are about to written out during the writeback
+ * controlled by @wbc. Keep the book for foreign inode detection. See
+ * wbc_detach_inode().
+ */
+-void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
++void wbc_account_cgroup_owner(struct writeback_control *wbc, struct folio *folio,
+ size_t bytes)
+ {
+- struct folio *folio;
+ struct cgroup_subsys_state *css;
+ int id;
+
+@@ -913,7 +912,6 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
+ if (!wbc->wb || wbc->no_cgroup_owner)
+ return;
+
+- folio = page_folio(page);
+ css = mem_cgroup_css_from_folio(folio);
+ /* dead cgroups shouldn't contribute to inode ownership arbitration */
+ if (!(css->flags & CSS_ONLINE))
+diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
+index ef0b68bccbb6..ce73d2a48c1e 100644
+--- a/fs/iomap/buffered-io.c
++++ b/fs/iomap/buffered-io.c
+@@ -1784,7 +1784,7 @@ static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
+ if (ifs)
+ atomic_add(len, &ifs->write_bytes_pending);
+ wpc->ioend->io_size += len;
+- wbc_account_cgroup_owner(wbc, &folio->page, len);
++ wbc_account_cgroup_owner(wbc, folio, len);
+ return 0;
+ }
+
+diff --git a/fs/mpage.c b/fs/mpage.c
+index b5b5ddf9d513..82aecf372743 100644
+--- a/fs/mpage.c
++++ b/fs/mpage.c
+@@ -606,7 +606,7 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc,
+ * the confused fail path above (OOM) will be very confused when
+ * it finds all bh marked clean (i.e. it will not write anything)
+ */
+- wbc_account_cgroup_owner(wbc, &folio->page, folio_size(folio));
++ wbc_account_cgroup_owner(wbc, folio, folio_size(folio));
+ length = first_unmapped << blkbits;
+ if (!bio_add_folio(bio, folio, length, 0)) {
+ bio = mpage_bio_submit_write(bio);
+diff --git a/include/linux/writeback.h b/include/linux/writeback.h
+index d6db822e4bb3..641a057e0413 100644
+--- a/include/linux/writeback.h
++++ b/include/linux/writeback.h
+@@ -217,7 +217,7 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
+ struct inode *inode)
+ __releases(&inode->i_lock);
+ void wbc_detach_inode(struct writeback_control *wbc);
+-void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
++void wbc_account_cgroup_owner(struct writeback_control *wbc, struct folio *folio,
+ size_t bytes);
+ int cgroup_writeback_by_id(u64 bdi_id, int memcg_id,
+ enum wb_reason reason, struct wb_completion *done);
+@@ -324,7 +324,7 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
+ }
+
+ static inline void wbc_account_cgroup_owner(struct writeback_control *wbc,
+- struct page *page, size_t bytes)
++ struct folio *folio, size_t bytes)
+ {
+ }
+
+--
+2.39.5
+
--- /dev/null
+From 8f41839c65f944923d5454abb246f67c871987e2 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 1 Jan 2025 14:00:37 +0100
+Subject: fuse: respect FOPEN_KEEP_CACHE on opendir
+
+From: Amir Goldstein <amir73il@gmail.com>
+
+[ Upstream commit 03f275adb8fbd7b4ebe96a1ad5044d8e602692dc ]
+
+The re-factoring of fuse_dir_open() missed the need to invalidate
+directory inode page cache with open flag FOPEN_KEEP_CACHE.
+
+Fixes: 7de64d521bf92 ("fuse: break up fuse_open_common()")
+Reported-by: Prince Kumar <princer@google.com>
+Closes: https://lore.kernel.org/linux-fsdevel/CAEW=TRr7CYb4LtsvQPLj-zx5Y+EYBmGfM24SuzwyDoGVNoKm7w@mail.gmail.com/
+Signed-off-by: Amir Goldstein <amir73il@gmail.com>
+Link: https://lore.kernel.org/r/20250101130037.96680-1-amir73il@gmail.com
+Reviewed-by: Bernd Schubert <bernd.schubert@fastmail.fm>
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/fuse/dir.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
+index 54104dd48af7..2e62e62c07f8 100644
+--- a/fs/fuse/dir.c
++++ b/fs/fuse/dir.c
+@@ -1680,6 +1680,8 @@ static int fuse_dir_open(struct inode *inode, struct file *file)
+ */
+ if (ff->open_flags & (FOPEN_STREAM | FOPEN_NONSEEKABLE))
+ nonseekable_open(inode, file);
++ if (!(ff->open_flags & FOPEN_KEEP_CACHE))
++ invalidate_inode_pages2(inode->i_mapping);
+ }
+
+ return err;
+--
+2.39.5
+
--- /dev/null
+From d9d2a9826311f03f7fc9a2144d20d27a241577a6 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 9 Dec 2024 19:42:40 +0800
+Subject: iomap: fix zero padding data issue in concurrent append writes
+
+From: Long Li <leo.lilong@huawei.com>
+
+[ Upstream commit 51d20d1dacbec589d459e11fc88fbca419f84a99 ]
+
+During concurrent append writes to an XFS filesystem, zero padding data
+may appear in the file after a power failure. This happens due to imprecise
+disk size updates when handling write completion.
+
+Consider this scenario with concurrent append writes to the same file:
+
+ Thread 1: Thread 2:
+ ------------ -----------
+ write [A, A+B]
+ update inode size to A+B
+ submit I/O [A, A+BS]
+ write [A+B, A+B+C]
+ update inode size to A+B+C
+ <I/O completes, updates disk size to min(A+B+C, A+BS)>
+ <power failure>
+
+After reboot:
+ 1) with A+B+C < A+BS, the file has zero padding in range [A+B, A+B+C]
+
+ |< Block Size (BS) >|
+ |DDDDDDDDDDDDDDDD0000000000000000|
+ ^ ^ ^
+ A A+B A+B+C
+ (EOF)
+
+ 2) with A+B+C > A+BS, the file has zero padding in range [A+B, A+BS]
+
+ |< Block Size (BS) >|< Block Size (BS) >|
+ |DDDDDDDDDDDDDDDD0000000000000000|00000000000000000000000000000000|
+ ^ ^ ^ ^
+ A A+B A+BS A+B+C
+ (EOF)
+
+ D = Valid Data
+ 0 = Zero Padding
+
+The issue stems from the disk size being set to min(io_offset + io_size,
+inode->i_size) at I/O completion. Since io_offset + io_size is block-size
+granular, it may exceed the actual valid file data size. In
+the case of concurrent append writes, inode->i_size may be larger
+than the actual range of valid file data written to disk, leading to
+inaccurate disk size updates.
+
+This patch modifies the meaning of io_size to represent the size of
+valid data within EOF in an ioend. If the ioend spans beyond i_size,
+io_size will be trimmed to provide the file with more accurate size
+information. This is particularly useful for on-disk size updates
+at completion time.
+
+After this change, ioends that span i_size will not grow or merge with
+other ioends in concurrent scenarios. However, the cases that need
+growing/merging rarely occur, and there appears to be no noticeable
+performance impact. Although rounding up io_size could enable ioend
+growth/merging in these scenarios, we decided to keep the code simple
+after discussion [1].
+
+Another benefit is that it makes the xfs_ioend_is_append() check more
+accurate, which can reduce unnecessary end bio callbacks of xfs_end_bio()
+in certain scenarios, such as repeated writes at the file tail without
+extending the file size.
+
+Link [1]: https://patchwork.kernel.org/project/xfs/patch/20241113091907.56937-1-leo.lilong@huawei.com
+
+Fixes: ae259a9c8593 ("fs: introduce iomap infrastructure") # goes further back than this
+Signed-off-by: Long Li <leo.lilong@huawei.com>
+Link: https://lore.kernel.org/r/20241209114241.3725722-3-leo.lilong@huawei.com
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/iomap/buffered-io.c | 45 ++++++++++++++++++++++++++++++++++++++++++
+ include/linux/iomap.h | 2 +-
+ 2 files changed, 46 insertions(+), 1 deletion(-)
+
+diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
+index 05e5cc3bf976..25d1ede6bb0e 100644
+--- a/fs/iomap/buffered-io.c
++++ b/fs/iomap/buffered-io.c
+@@ -1784,7 +1784,52 @@ static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
+
+ if (ifs)
+ atomic_add(len, &ifs->write_bytes_pending);
++
++ /*
++ * Clamp io_offset and io_size to the incore EOF so that ondisk
++ * file size updates in the ioend completion are byte-accurate.
++ * This avoids recovering files with zeroed tail regions when
++ * writeback races with appending writes:
++ *
++ * Thread 1: Thread 2:
++ * ------------ -----------
++ * write [A, A+B]
++ * update inode size to A+B
++ * submit I/O [A, A+BS]
++ * write [A+B, A+B+C]
++ * update inode size to A+B+C
++ * <I/O completes, updates disk size to min(A+B+C, A+BS)>
++ * <power failure>
++ *
++ * After reboot:
++ * 1) with A+B+C < A+BS, the file has zero padding in range
++ * [A+B, A+B+C]
++ *
++ * |< Block Size (BS) >|
++ * |DDDDDDDDDDDD0000000000000|
++ * ^ ^ ^
++ * A A+B A+B+C
++ * (EOF)
++ *
++ * 2) with A+B+C > A+BS, the file has zero padding in range
++ * [A+B, A+BS]
++ *
++ * |< Block Size (BS) >|< Block Size (BS) >|
++ * |DDDDDDDDDDDD0000000000000|00000000000000000000000000|
++ * ^ ^ ^ ^
++ * A A+B A+BS A+B+C
++ * (EOF)
++ *
++ * D = Valid Data
++ * 0 = Zero Padding
++ *
++ * Note that this defeats the ability to chain the ioends of
++ * appending writes.
++ */
+ wpc->ioend->io_size += len;
++ if (wpc->ioend->io_offset + wpc->ioend->io_size > end_pos)
++ wpc->ioend->io_size = end_pos - wpc->ioend->io_offset;
++
+ wbc_account_cgroup_owner(wbc, folio, len);
+ return 0;
+ }
+diff --git a/include/linux/iomap.h b/include/linux/iomap.h
+index f61407e3b121..d204dcd35063 100644
+--- a/include/linux/iomap.h
++++ b/include/linux/iomap.h
+@@ -330,7 +330,7 @@ struct iomap_ioend {
+ u16 io_type;
+ u16 io_flags; /* IOMAP_F_* */
+ struct inode *io_inode; /* file being written to */
+- size_t io_size; /* size of the extent */
++ size_t io_size; /* size of data within eof */
+ loff_t io_offset; /* offset in the file */
+ sector_t io_sector; /* start sector of ioend */
+ struct bio io_bio; /* MUST BE LAST! */
+--
+2.39.5
+
--- /dev/null
+From b3d04c479944b549209c7b8b67402f6fa76a5615 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 9 Dec 2024 19:42:39 +0800
+Subject: iomap: pass byte granular end position to iomap_add_to_ioend
+
+From: Long Li <leo.lilong@huawei.com>
+
+[ Upstream commit b44679c63e4d3ac820998b6bd59fba89a72ad3e7 ]
+
+This is a preparatory patch for fixing zero padding issues in concurrent
+append write scenarios. In the following patches, we need to obtain
+byte-granular writeback end position for io_size trimming after EOF
+handling.
+
+Due to concurrent writeback and truncate operations, inode size may
+shrink. Resampling inode size would force writeback code to handle the
+newly appeared post-EOF blocks, which is undesirable. As Dave
+explained in [1]:
+
+"Really, the issue is that writeback mappings have to be able to
+handle the range being mapped suddenly appear to be beyond EOF.
+This behaviour is a longstanding writeback constraint, and is what
+iomap_writepage_handle_eof() is attempting to handle.
+
+We handle this by only sampling i_size_read() whilst we have the
+folio locked and can determine the action we should take with that
+folio (i.e. nothing, partial zeroing, or skip altogether). Once
+we've made the decision that the folio is within EOF and taken
+action on it (i.e. moved the folio to writeback state), we cannot
+then resample the inode size because a truncate may have started
+and changed the inode size."
+
+To avoid resampling the inode size after EOF handling, we convert
+end_pos to a byte-granular writeback position and return it from the
+EOF handling function.
+
+Since iomap_set_range_dirty() can handle unaligned lengths, this
+conversion has no impact on it. However, iomap_find_dirty_range()
+requires an aligned start and end range to find dirty blocks within the
+given range, so the end position needs to be rounded up when passed
+to it.
+
+LINK [1]: https://lore.kernel.org/linux-xfs/Z1Gg0pAa54MoeYME@localhost.localdomain/
+
+Signed-off-by: Long Li <leo.lilong@huawei.com>
+Link: https://lore.kernel.org/r/20241209114241.3725722-2-leo.lilong@huawei.com
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Stable-dep-of: 51d20d1dacbe ("iomap: fix zero padding data issue in concurrent append writes")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/iomap/buffered-io.c | 21 ++++++++++++---------
+ 1 file changed, 12 insertions(+), 9 deletions(-)
+
+diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
+index ce73d2a48c1e..05e5cc3bf976 100644
+--- a/fs/iomap/buffered-io.c
++++ b/fs/iomap/buffered-io.c
+@@ -1764,7 +1764,8 @@ static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos)
+ */
+ static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
+ struct writeback_control *wbc, struct folio *folio,
+- struct inode *inode, loff_t pos, unsigned len)
++ struct inode *inode, loff_t pos, loff_t end_pos,
++ unsigned len)
+ {
+ struct iomap_folio_state *ifs = folio->private;
+ size_t poff = offset_in_folio(folio, pos);
+@@ -1790,8 +1791,8 @@ static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
+
+ static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc,
+ struct writeback_control *wbc, struct folio *folio,
+- struct inode *inode, u64 pos, unsigned dirty_len,
+- unsigned *count)
++ struct inode *inode, u64 pos, u64 end_pos,
++ unsigned dirty_len, unsigned *count)
+ {
+ int error;
+
+@@ -1816,7 +1817,7 @@ static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc,
+ break;
+ default:
+ error = iomap_add_to_ioend(wpc, wbc, folio, inode, pos,
+- map_len);
++ end_pos, map_len);
+ if (!error)
+ (*count)++;
+ break;
+@@ -1887,11 +1888,11 @@ static bool iomap_writepage_handle_eof(struct folio *folio, struct inode *inode,
+ * remaining memory is zeroed when mapped, and writes to that
+ * region are not written out to the file.
+ *
+- * Also adjust the writeback range to skip all blocks entirely
+- * beyond i_size.
++ * Also adjust the end_pos to the end of file and skip writeback
++ * for all blocks entirely beyond i_size.
+ */
+ folio_zero_segment(folio, poff, folio_size(folio));
+- *end_pos = round_up(isize, i_blocksize(inode));
++ *end_pos = isize;
+ }
+
+ return true;
+@@ -1904,6 +1905,7 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
+ struct inode *inode = folio->mapping->host;
+ u64 pos = folio_pos(folio);
+ u64 end_pos = pos + folio_size(folio);
++ u64 end_aligned = 0;
+ unsigned count = 0;
+ int error = 0;
+ u32 rlen;
+@@ -1945,9 +1947,10 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
+ /*
+ * Walk through the folio to find dirty areas to write back.
+ */
+- while ((rlen = iomap_find_dirty_range(folio, &pos, end_pos))) {
++ end_aligned = round_up(end_pos, i_blocksize(inode));
++ while ((rlen = iomap_find_dirty_range(folio, &pos, end_aligned))) {
+ error = iomap_writepage_map_blocks(wpc, wbc, folio, inode,
+- pos, rlen, &count);
++ pos, end_pos, rlen, &count);
+ if (error)
+ break;
+ pos += rlen;
+--
+2.39.5
+
--- /dev/null
+From 1bf143314daebca0b27cae65a7cf9e32dca65c0b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 3 Dec 2024 09:44:07 +0800
+Subject: jbd2: flush filesystem device before updating tail sequence
+
+From: Zhang Yi <yi.zhang@huawei.com>
+
+[ Upstream commit a0851ea9cd555c333795b85ddd908898b937c4e1 ]
+
+When committing transaction in jbd2_journal_commit_transaction(), the
+disk caches for the filesystem device should be flushed before updating
+the journal tail sequence. However, this step is missed if the journal
+is not located on the filesystem device. As a result, the filesystem may
+become inconsistent following a power failure or system crash. Fix it by
+ensuring that the filesystem device is flushed appropriately.
+
+Fixes: 3339578f0578 ("jbd2: cleanup journal tail after transaction commit")
+Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
+Link: https://lore.kernel.org/r/20241203014407.805916-3-yi.zhang@huaweicloud.com
+Reviewed-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/jbd2/commit.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
+index 4305a1ac808a..f95cf272a1b5 100644
+--- a/fs/jbd2/commit.c
++++ b/fs/jbd2/commit.c
+@@ -776,9 +776,9 @@ void jbd2_journal_commit_transaction(journal_t *journal)
+ /*
+ * If the journal is not located on the file system device,
+ * then we must flush the file system device before we issue
+- * the commit record
++ * the commit record and update the journal tail sequence.
+ */
+- if (commit_transaction->t_need_data_flush &&
++ if ((commit_transaction->t_need_data_flush || update_tail) &&
+ (journal->j_fs_dev != journal->j_dev) &&
+ (journal->j_flags & JBD2_BARRIER))
+ blkdev_issue_flush(journal->j_fs_dev);
+--
+2.39.5
+
--- /dev/null
+From 017a44717cbb8444df3a669b652b083704fed6ba Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 3 Dec 2024 09:44:06 +0800
+Subject: jbd2: increase IO priority for writing revoke records
+
+From: Zhang Yi <yi.zhang@huawei.com>
+
+[ Upstream commit ac1e21bd8c883aeac2f1835fc93b39c1e6838b35 ]
+
+Commit '6a3afb6ac6df ("jbd2: increase the journal IO's priority")'
+increases the priority of journal I/O by marking I/O with the
+JBD2_JOURNAL_REQ_FLAGS. However, that commit missed the revoke buffers,
+so this patch also addresses that kind of I/O.
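+
+For reference, JBD2_JOURNAL_REQ_FLAGS was introduced by that commit in
+include/linux/jbd2.h and expands to approximately:
+
+	#define JBD2_JOURNAL_REQ_FLAGS	(REQ_META | REQ_SYNC | REQ_IDLE)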
+
+Fixes: 6a3afb6ac6df ("jbd2: increase the journal IO's priority")
+Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
+Link: https://lore.kernel.org/r/20241203014407.805916-2-yi.zhang@huaweicloud.com
+Reviewed-by: Kemeng Shi <shikemeng@huaweicloud.com>
+Reviewed-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/jbd2/revoke.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
+index 4556e4689024..ce63d5fde9c3 100644
+--- a/fs/jbd2/revoke.c
++++ b/fs/jbd2/revoke.c
+@@ -654,7 +654,7 @@ static void flush_descriptor(journal_t *journal,
+ set_buffer_jwrite(descriptor);
+ BUFFER_TRACE(descriptor, "write");
+ set_buffer_dirty(descriptor);
+- write_dirty_buffer(descriptor, REQ_SYNC);
++ write_dirty_buffer(descriptor, JBD2_JOURNAL_REQ_FLAGS);
+ }
+ #endif
+
+--
+2.39.5
+
--- /dev/null
+From 5a499cd36c28f96ffffe9d6e5cb5b6ac734b5e8e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 13 Dec 2024 13:50:09 +0000
+Subject: netfs: Fix ceph copy to cache on write-begin
+
+From: David Howells <dhowells@redhat.com>
+
+[ Upstream commit 38cf8e945721ffe708fa675507465da7f4f2a9f7 ]
+
+At the end of netfs_unlock_read_folio(), in which folios are marked
+appropriately for copying to the cache (either by being marked dirty
+and having their private data set, or by having PG_private_2 set) and then
+unlocked, the folio_queue struct has the entry pointing to the folio
+cleared. This presents a problem for netfs_pgpriv2_write_to_the_cache(),
+which is used to write folios marked with PG_private_2 to the cache, as it
+expects to be able to trawl the folio_queue list thereafter to find the
+relevant folios, leading to a hang.
+
+Fix this by not clearing the folio_queue entry if we're going to do the
+deprecated copy-to-cache. The clearance will be done instead as the folios
+are written to the cache.
+
+This can be reproduced by starting cachefiles, mounting a ceph filesystem
+with "-o fsc" and writing to it.
+
+Fixes: 796a4049640b ("netfs: In readahead, put the folio refs as soon extracted")
+Reported-by: Max Kellermann <max.kellermann@ionos.com>
+Closes: https://lore.kernel.org/r/CAKPOu+_4m80thNy5_fvROoxBm689YtA0dZ-=gcmkzwYSY4syqw@mail.gmail.com/
+Signed-off-by: David Howells <dhowells@redhat.com>
+Link: https://lore.kernel.org/r/20241213135013.2964079-10-dhowells@redhat.com
+Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading")
+cc: Jeff Layton <jlayton@kernel.org>
+cc: Ilya Dryomov <idryomov@gmail.com>
+cc: Xiubo Li <xiubli@redhat.com>
+cc: netfs@lists.linux.dev
+cc: ceph-devel@vger.kernel.org
+cc: linux-fsdevel@vger.kernel.org
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/netfs/read_collect.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c
+index d86fa02f68fb..e70eb4ea21c0 100644
+--- a/fs/netfs/read_collect.c
++++ b/fs/netfs/read_collect.c
+@@ -62,10 +62,14 @@ static void netfs_unlock_read_folio(struct netfs_io_subrequest *subreq,
+ } else {
+ trace_netfs_folio(folio, netfs_folio_trace_read_done);
+ }
++
++ folioq_clear(folioq, slot);
+ } else {
+ // TODO: Use of PG_private_2 is deprecated.
+ if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags))
+ netfs_pgpriv2_mark_copy_to_cache(subreq, rreq, folioq, slot);
++ else
++ folioq_clear(folioq, slot);
+ }
+
+ if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
+@@ -77,8 +81,6 @@ static void netfs_unlock_read_folio(struct netfs_io_subrequest *subreq,
+ folio_unlock(folio);
+ }
+ }
+-
+- folioq_clear(folioq, slot);
+ }
+
+ /*
+--
+2.39.5
+
--- /dev/null
+From 79ca233017cd28e3e5016ea9037225491c45bb43 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 13 Dec 2024 13:50:03 +0000
+Subject: netfs: Fix enomem handling in buffered reads
+
+From: David Howells <dhowells@redhat.com>
+
+[ Upstream commit 105549d09a539a876b7c3330ab52d8aceedad358 ]
+
+If netfs_read_to_pagecache() gets an error from either ->prepare_read() or
+from netfs_prepare_read_iterator(), it needs to decrement ->nr_outstanding,
+cancel the subrequest and break out of the issuing loop. Currently, it
+only does this for two of the cases, but there are two more that aren't
+handled.
+
+Fix this by moving the handling to a common place and jumping to it from
+all four places. This is in preference to inserting a wrapper around
+netfs_prepare_read_iterator() as proposed by Dmitry Antipov[1].
+
+Link: https://lore.kernel.org/r/20241202093943.227786-1-dmantipov@yandex.ru/ [1]
+
+Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading")
+Reported-by: syzbot+404b4b745080b6210c6c@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=404b4b745080b6210c6c
+Signed-off-by: David Howells <dhowells@redhat.com>
+Link: https://lore.kernel.org/r/20241213135013.2964079-4-dhowells@redhat.com
+Tested-by: syzbot+404b4b745080b6210c6c@syzkaller.appspotmail.com
+cc: Dmitry Antipov <dmantipov@yandex.ru>
+cc: Jeff Layton <jlayton@kernel.org>
+cc: netfs@lists.linux.dev
+cc: linux-fsdevel@vger.kernel.org
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/netfs/buffered_read.c | 28 ++++++++++++++++------------
+ 1 file changed, 16 insertions(+), 12 deletions(-)
+
+diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
+index af46a598f4d7..2dd2260352db 100644
+--- a/fs/netfs/buffered_read.c
++++ b/fs/netfs/buffered_read.c
+@@ -275,22 +275,14 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq)
+ netfs_stat(&netfs_n_rh_download);
+ if (rreq->netfs_ops->prepare_read) {
+ ret = rreq->netfs_ops->prepare_read(subreq);
+- if (ret < 0) {
+- atomic_dec(&rreq->nr_outstanding);
+- netfs_put_subrequest(subreq, false,
+- netfs_sreq_trace_put_cancel);
+- break;
+- }
++ if (ret < 0)
++ goto prep_failed;
+ trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
+ }
+
+ slice = netfs_prepare_read_iterator(subreq);
+- if (slice < 0) {
+- atomic_dec(&rreq->nr_outstanding);
+- netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel);
+- ret = slice;
+- break;
+- }
++ if (slice < 0)
++ goto prep_iter_failed;
+
+ rreq->netfs_ops->issue_read(subreq);
+ goto done;
+@@ -302,6 +294,8 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq)
+ trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
+ netfs_stat(&netfs_n_rh_zero);
+ slice = netfs_prepare_read_iterator(subreq);
++ if (slice < 0)
++ goto prep_iter_failed;
+ __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+ netfs_read_subreq_terminated(subreq, 0, false);
+ goto done;
+@@ -310,6 +304,8 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq)
+ if (source == NETFS_READ_FROM_CACHE) {
+ trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
+ slice = netfs_prepare_read_iterator(subreq);
++ if (slice < 0)
++ goto prep_iter_failed;
+ netfs_read_cache_to_pagecache(rreq, subreq);
+ goto done;
+ }
+@@ -318,6 +314,14 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq)
+ WARN_ON_ONCE(1);
+ break;
+
++ prep_iter_failed:
++ ret = slice;
++ prep_failed:
++ subreq->error = ret;
++ atomic_dec(&rreq->nr_outstanding);
++ netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel);
++ break;
++
+ done:
+ size -= slice;
+ start += slice;
+--
+2.39.5
+
--- /dev/null
+From 14827a5c1fab0ce07807324f737144e9ff115f5f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 16 Dec 2024 20:34:45 +0000
+Subject: netfs: Fix is-caching check in read-retry
+
+From: David Howells <dhowells@redhat.com>
+
+[ Upstream commit d4e338de17cb6532bf805fae00db8b41e914009b ]
+
+The read-retry code checks the NETFS_RREQ_COPY_TO_CACHE flag to determine
+if there might be failed reads from the cache that need turning into reads
+from the server, with the intention of skipping the complicated part if it
+can. The code that set the flag, however, got lost during the read-side
+rewrite.
+
+Fix the check to see if the cache_resources are valid instead. The flag
+can then be removed.
+
+Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading")
+Signed-off-by: David Howells <dhowells@redhat.com>
+Link: https://lore.kernel.org/r/3752048.1734381285@warthog.procyon.org.uk
+cc: Jeff Layton <jlayton@kernel.org>
+cc: netfs@lists.linux.dev
+cc: linux-fsdevel@vger.kernel.org
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/netfs/read_retry.c | 2 +-
+ include/linux/netfs.h | 1 -
+ 2 files changed, 1 insertion(+), 2 deletions(-)
+
+diff --git a/fs/netfs/read_retry.c b/fs/netfs/read_retry.c
+index 0350592ea804..2701f7d45999 100644
+--- a/fs/netfs/read_retry.c
++++ b/fs/netfs/read_retry.c
+@@ -49,7 +49,7 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq)
+ * up to the first permanently failed one.
+ */
+ if (!rreq->netfs_ops->prepare_read &&
+- !test_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags)) {
++ !rreq->cache_resources.ops) {
+ struct netfs_io_subrequest *subreq;
+
+ list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
+diff --git a/include/linux/netfs.h b/include/linux/netfs.h
+index 5eaceef41e6c..474481ee8b7c 100644
+--- a/include/linux/netfs.h
++++ b/include/linux/netfs.h
+@@ -269,7 +269,6 @@ struct netfs_io_request {
+ size_t prev_donated; /* Fallback for subreq->prev_donated */
+ refcount_t ref;
+ unsigned long flags;
+-#define NETFS_RREQ_COPY_TO_CACHE 1 /* Need to write to the cache */
+ #define NETFS_RREQ_NO_UNLOCK_FOLIO 2 /* Don't unlock no_unlock_folio on completion */
+ #define NETFS_RREQ_DONT_UNLOCK_FOLIOS 3 /* Don't unlock the folios on completion */
+ #define NETFS_RREQ_FAILED 4 /* The request failed */
+--
+2.39.5
+
--- /dev/null
+From 616d54dc5174e16fb498a8dbb7864642636731fc Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 13 Dec 2024 13:50:07 +0000
+Subject: netfs: Fix missing barriers by using clear_and_wake_up_bit()
+
+From: David Howells <dhowells@redhat.com>
+
+[ Upstream commit aa3956418985bda1f68313eadde3267921847978 ]
+
+Use clear_and_wake_up_bit() rather than something like:
+
+ clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
+ wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS);
+
+as a barrier is needed between the two operations, and
+clear_and_wake_up_bit() provides one.
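+
+For reference, clear_and_wake_up_bit() in include/linux/wait_bit.h is
+approximately:
+
+	static inline void clear_and_wake_up_bit(int bit, void *word)
+	{
+		clear_bit_unlock(bit, word);
+		smp_mb__after_atomic();	/* the barrier the open-coded
+					 * variant was missing */
+		wake_up_bit(word, bit);
+	}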
+
+Fixes: 288ace2f57c9 ("netfs: New writeback implementation")
+Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading")
+Signed-off-by: David Howells <dhowells@redhat.com>
+Link: https://lore.kernel.org/r/20241213135013.2964079-8-dhowells@redhat.com
+Reviewed-by: Akira Yokosawa <akiyks@gmail.com>
+cc: Zilin Guan <zilin@seu.edu.cn>
+cc: Akira Yokosawa <akiyks@gmail.com>
+cc: Jeff Layton <jlayton@kernel.org>
+cc: netfs@lists.linux.dev
+cc: linux-fsdevel@vger.kernel.org
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/netfs/read_collect.c | 3 +--
+ fs/netfs/write_collect.c | 9 +++------
+ 2 files changed, 4 insertions(+), 8 deletions(-)
+
+diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c
+index 3cbb289535a8..d86fa02f68fb 100644
+--- a/fs/netfs/read_collect.c
++++ b/fs/netfs/read_collect.c
+@@ -378,8 +378,7 @@ static void netfs_rreq_assess(struct netfs_io_request *rreq)
+ task_io_account_read(rreq->transferred);
+
+ trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip);
+- clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
+- wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS);
++ clear_and_wake_up_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
+
+ trace_netfs_rreq(rreq, netfs_rreq_trace_done);
+ netfs_clear_subrequests(rreq, false);
+diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c
+index 1d438be2e1b4..82290c92ba7a 100644
+--- a/fs/netfs/write_collect.c
++++ b/fs/netfs/write_collect.c
+@@ -501,8 +501,7 @@ static void netfs_collect_write_results(struct netfs_io_request *wreq)
+ goto need_retry;
+ if ((notes & MADE_PROGRESS) && test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) {
+ trace_netfs_rreq(wreq, netfs_rreq_trace_unpause);
+- clear_bit_unlock(NETFS_RREQ_PAUSE, &wreq->flags);
+- wake_up_bit(&wreq->flags, NETFS_RREQ_PAUSE);
++ clear_and_wake_up_bit(NETFS_RREQ_PAUSE, &wreq->flags);
+ }
+
+ if (notes & NEED_REASSESS) {
+@@ -605,8 +604,7 @@ void netfs_write_collection_worker(struct work_struct *work)
+
+ _debug("finished");
+ trace_netfs_rreq(wreq, netfs_rreq_trace_wake_ip);
+- clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &wreq->flags);
+- wake_up_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS);
++ clear_and_wake_up_bit(NETFS_RREQ_IN_PROGRESS, &wreq->flags);
+
+ if (wreq->iocb) {
+ size_t written = min(wreq->transferred, wreq->len);
+@@ -714,8 +712,7 @@ void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error,
+
+ trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);
+
+- clear_bit_unlock(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
+- wake_up_bit(&subreq->flags, NETFS_SREQ_IN_PROGRESS);
++ clear_and_wake_up_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
+
+ /* If we are at the head of the queue, wake up the collector,
+ * transferring a ref to it if we were the ones to do so.
+--
+2.39.5
+
--- /dev/null
+From 1830a3cebd1a988697ffa193095aba899ceb58d0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 13 Dec 2024 13:50:10 +0000
+Subject: netfs: Fix the (non-)cancellation of copy when cache is temporarily
+ disabled
+
+From: David Howells <dhowells@redhat.com>
+
+[ Upstream commit d0327c824338cdccad058723a31d038ecd553409 ]
+
+When the caching for a cookie is temporarily disabled (e.g. due to a DIO
+write on that file), future copying to the cache for that file is disabled
+until all fds open on that file are closed. However, if netfslib is using
+the deprecated PG_private_2 method (such as is currently used by ceph), and
+decides it wants to copy to the cache, netfs_advance_write() will just bail
+at the first check seeing that the cache stream is unavailable, and
+indicate that it dealt with all the content.
+
+This means that we have no subrequests to provide notifications to drive
+the state machine or even to pin the request and the request just gets
+discarded, leaving the folios with PG_private_2 set.
+
+Fix this by jumping directly to cancel the request if the cache is not
+available. That way, we don't remove mark3 from the folio_queue list and
+netfs_pgpriv2_cancel() will clean up the folios.
+
+This was found by running the generic/013 xfstest against ceph with an
+active cache and the "-o fsc" option passed to ceph. That would usually
+hang.
+
+Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading")
+Reported-by: Max Kellermann <max.kellermann@ionos.com>
+Closes: https://lore.kernel.org/r/CAKPOu+_4m80thNy5_fvROoxBm689YtA0dZ-=gcmkzwYSY4syqw@mail.gmail.com/
+Signed-off-by: David Howells <dhowells@redhat.com>
+Link: https://lore.kernel.org/r/20241213135013.2964079-11-dhowells@redhat.com
+cc: Jeff Layton <jlayton@kernel.org>
+cc: Ilya Dryomov <idryomov@gmail.com>
+cc: Xiubo Li <xiubli@redhat.com>
+cc: netfs@lists.linux.dev
+cc: ceph-devel@vger.kernel.org
+cc: linux-fsdevel@vger.kernel.org
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/netfs/read_pgpriv2.c | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+diff --git a/fs/netfs/read_pgpriv2.c b/fs/netfs/read_pgpriv2.c
+index ba5af89d37fa..54d5004fec18 100644
+--- a/fs/netfs/read_pgpriv2.c
++++ b/fs/netfs/read_pgpriv2.c
+@@ -170,6 +170,10 @@ void netfs_pgpriv2_write_to_the_cache(struct netfs_io_request *rreq)
+
+ trace_netfs_write(wreq, netfs_write_trace_copy_to_cache);
+ netfs_stat(&netfs_n_wh_copy_to_cache);
++ if (!wreq->io_streams[1].avail) {
++ netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
++ goto couldnt_start;
++ }
+
+ for (;;) {
+ error = netfs_pgpriv2_copy_folio(wreq, folio);
+--
+2.39.5
+
--- /dev/null
+From 5717881a402aafc332fa6c0707f3cfce611a6f93 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 13 Dec 2024 13:50:04 +0000
+Subject: nfs: Fix oops in nfs_netfs_init_request() when copying to cache
+
+From: David Howells <dhowells@redhat.com>
+
+[ Upstream commit 86ad1a58f6a9453f49e06ef957a40a8dac00a13f ]
+
+When netfslib wants to copy some data that has just been read on behalf of
+nfs, it creates a new write request and calls nfs_netfs_init_request() to
+initialise it, but with a NULL file pointer. This causes
+nfs_file_open_context() to oops - however, we don't actually need the nfs
+context as we're only going to write to the cache.
+
+Fix this by just returning if we aren't given a file pointer and emitting
+a warning if the request was for something other than copy-to-cache.
+
+Further, fix nfs_netfs_free_request() so that it doesn't try to free the
+context if the pointer is NULL.
+
+Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading")
+Reported-by: Max Kellermann <max.kellermann@ionos.com>
+Closes: https://lore.kernel.org/r/CAKPOu+9DyMbKLhyJb7aMLDTb=Fh0T8Teb9sjuf_pze+XWT1VaQ@mail.gmail.com/
+Signed-off-by: David Howells <dhowells@redhat.com>
+Link: https://lore.kernel.org/r/20241213135013.2964079-5-dhowells@redhat.com
+cc: Trond Myklebust <trondmy@kernel.org>
+cc: Anna Schumaker <anna@kernel.org>
+cc: Dave Wysochanski <dwysocha@redhat.com>
+cc: Jeff Layton <jlayton@kernel.org>
+cc: linux-nfs@vger.kernel.org
+cc: netfs@lists.linux.dev
+cc: linux-fsdevel@vger.kernel.org
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/nfs/fscache.c | 9 ++++++++-
+ 1 file changed, 8 insertions(+), 1 deletion(-)
+
+diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
+index 810269ee0a50..d49e4ce27999 100644
+--- a/fs/nfs/fscache.c
++++ b/fs/nfs/fscache.c
+@@ -263,6 +263,12 @@ int nfs_netfs_readahead(struct readahead_control *ractl)
+ static atomic_t nfs_netfs_debug_id;
+ static int nfs_netfs_init_request(struct netfs_io_request *rreq, struct file *file)
+ {
++ if (!file) {
++ if (WARN_ON_ONCE(rreq->origin != NETFS_PGPRIV2_COPY_TO_CACHE))
++ return -EIO;
++ return 0;
++ }
++
+ rreq->netfs_priv = get_nfs_open_context(nfs_file_open_context(file));
+ rreq->debug_id = atomic_inc_return(&nfs_netfs_debug_id);
+ /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */
+@@ -274,7 +280,8 @@ static int nfs_netfs_init_request(struct netfs_io_request *rreq, struct file *fi
+
+ static void nfs_netfs_free_request(struct netfs_io_request *rreq)
+ {
+- put_nfs_open_context(rreq->netfs_priv);
++ if (rreq->netfs_priv)
++ put_nfs_open_context(rreq->netfs_priv);
+ }
+
+ static struct nfs_netfs_io_data *nfs_netfs_alloc(struct netfs_io_subrequest *sreq)
+--
+2.39.5
+
--- /dev/null
+From 1648fb8994b3d24f27ba4fdcf09efbaa36a626ef Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun, 5 Jan 2025 17:24:03 +0100
+Subject: ovl: pass realinode to ovl_encode_real_fh() instead of realdentry
+
+From: Amir Goldstein <amir73il@gmail.com>
+
+[ Upstream commit 07aeefae7ff44d80524375253980b1bdee2396b0 ]
+
+We want to be able to encode an fid from an inode with no alias.
+
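+As a minimal sketch (mirroring the copy_up.c hunk below), the encoding
+call switches from the dentry-based to the inode-based exportfs helper,
+so a dentry is no longer required:
+
+	/* before: needs a dentry */
+	fh_type = exportfs_encode_fh(real, (void *)fh->fb.fid, &dwords, 0);
+	/* after: works from the inode alone (parent = NULL, flags = 0) */
+	fh_type = exportfs_encode_inode_fh(realinode, (void *)fh->fb.fid,
+					   &dwords, NULL, 0);
+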
+Signed-off-by: Amir Goldstein <amir73il@gmail.com>
+Link: https://lore.kernel.org/r/20250105162404.357058-2-amir73il@gmail.com
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Stable-dep-of: c45beebfde34 ("ovl: support encoding fid from inode with no alias")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/overlayfs/copy_up.c | 11 ++++++-----
+ fs/overlayfs/export.c | 5 +++--
+ fs/overlayfs/namei.c | 4 ++--
+ fs/overlayfs/overlayfs.h | 2 +-
+ 4 files changed, 12 insertions(+), 10 deletions(-)
+
+diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
+index 2ed6ad641a20..a11a9d756a7b 100644
+--- a/fs/overlayfs/copy_up.c
++++ b/fs/overlayfs/copy_up.c
+@@ -416,13 +416,13 @@ int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upperdentry,
+ return err;
+ }
+
+-struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real,
++struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct inode *realinode,
+ bool is_upper)
+ {
+ struct ovl_fh *fh;
+ int fh_type, dwords;
+ int buflen = MAX_HANDLE_SZ;
+- uuid_t *uuid = &real->d_sb->s_uuid;
++ uuid_t *uuid = &realinode->i_sb->s_uuid;
+ int err;
+
+ /* Make sure the real fid stays 32bit aligned */
+@@ -439,7 +439,8 @@ struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real,
+ * the price or reconnecting the dentry.
+ */
+ dwords = buflen >> 2;
+- fh_type = exportfs_encode_fh(real, (void *)fh->fb.fid, &dwords, 0);
++ fh_type = exportfs_encode_inode_fh(realinode, (void *)fh->fb.fid,
++ &dwords, NULL, 0);
+ buflen = (dwords << 2);
+
+ err = -EIO;
+@@ -481,7 +482,7 @@ struct ovl_fh *ovl_get_origin_fh(struct ovl_fs *ofs, struct dentry *origin)
+ if (!ovl_can_decode_fh(origin->d_sb))
+ return NULL;
+
+- return ovl_encode_real_fh(ofs, origin, false);
++ return ovl_encode_real_fh(ofs, d_inode(origin), false);
+ }
+
+ int ovl_set_origin_fh(struct ovl_fs *ofs, const struct ovl_fh *fh,
+@@ -506,7 +507,7 @@ static int ovl_set_upper_fh(struct ovl_fs *ofs, struct dentry *upper,
+ const struct ovl_fh *fh;
+ int err;
+
+- fh = ovl_encode_real_fh(ofs, upper, true);
++ fh = ovl_encode_real_fh(ofs, d_inode(upper), true);
+ if (IS_ERR(fh))
+ return PTR_ERR(fh);
+
+diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
+index 5868cb222955..036c9f39a14d 100644
+--- a/fs/overlayfs/export.c
++++ b/fs/overlayfs/export.c
+@@ -223,6 +223,7 @@ static int ovl_check_encode_origin(struct dentry *dentry)
+ static int ovl_dentry_to_fid(struct ovl_fs *ofs, struct dentry *dentry,
+ u32 *fid, int buflen)
+ {
++ struct inode *inode = d_inode(dentry);
+ struct ovl_fh *fh = NULL;
+ int err, enc_lower;
+ int len;
+@@ -236,8 +237,8 @@ static int ovl_dentry_to_fid(struct ovl_fs *ofs, struct dentry *dentry,
+ goto fail;
+
+ /* Encode an upper or lower file handle */
+- fh = ovl_encode_real_fh(ofs, enc_lower ? ovl_dentry_lower(dentry) :
+- ovl_dentry_upper(dentry), !enc_lower);
++ fh = ovl_encode_real_fh(ofs, enc_lower ? ovl_inode_lower(inode) :
++ ovl_inode_upper(inode), !enc_lower);
+ if (IS_ERR(fh))
+ return PTR_ERR(fh);
+
+diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
+index 5764f91d283e..42b73ae5ba01 100644
+--- a/fs/overlayfs/namei.c
++++ b/fs/overlayfs/namei.c
+@@ -542,7 +542,7 @@ int ovl_verify_origin_xattr(struct ovl_fs *ofs, struct dentry *dentry,
+ struct ovl_fh *fh;
+ int err;
+
+- fh = ovl_encode_real_fh(ofs, real, is_upper);
++ fh = ovl_encode_real_fh(ofs, d_inode(real), is_upper);
+ err = PTR_ERR(fh);
+ if (IS_ERR(fh)) {
+ fh = NULL;
+@@ -738,7 +738,7 @@ int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin,
+ struct ovl_fh *fh;
+ int err;
+
+- fh = ovl_encode_real_fh(ofs, origin, false);
++ fh = ovl_encode_real_fh(ofs, d_inode(origin), false);
+ if (IS_ERR(fh))
+ return PTR_ERR(fh);
+
+diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
+index 0bfe35da4b7b..844874b4a91a 100644
+--- a/fs/overlayfs/overlayfs.h
++++ b/fs/overlayfs/overlayfs.h
+@@ -869,7 +869,7 @@ int ovl_copy_up_with_data(struct dentry *dentry);
+ int ovl_maybe_copy_up(struct dentry *dentry, int flags);
+ int ovl_copy_xattr(struct super_block *sb, const struct path *path, struct dentry *new);
+ int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upper, struct kstat *stat);
+-struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real,
++struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct inode *realinode,
+ bool is_upper);
+ struct ovl_fh *ovl_get_origin_fh(struct ovl_fs *ofs, struct dentry *origin);
+ int ovl_set_origin_fh(struct ovl_fs *ofs, const struct ovl_fh *fh,
+--
+2.39.5
+
--- /dev/null
+From 1f8eea074a5539d62be5677784cb2e61d22a67b3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun, 5 Jan 2025 17:24:04 +0100
+Subject: ovl: support encoding fid from inode with no alias
+
+From: Amir Goldstein <amir73il@gmail.com>
+
+[ Upstream commit c45beebfde34aa71afbc48b2c54cdda623515037 ]
+
+Dmitry Safonov reported that a WARN_ON() assertion can be triggered by
+userspace when calling inotify_show_fdinfo() for an overlayfs-watched
+inode whose dentry aliases were discarded with drop_caches.
+
+The WARN_ON() assertion in inotify_show_fdinfo() was removed, because
+it is possible for encoding a file handle to fail for other reasons,
+but the impact of failing to encode an overlayfs file handle goes
+beyond this assertion.
+
+As shown in the LTP test case mentioned in the link below, failure to
+encode an overlayfs file handle from a non-aliased inode also leads to
+failure to report an fid with FAN_DELETE_SELF fanotify events.
+
+As Dmitry notes in his analysis of the problem, ovl_encode_fh() fails
+if it cannot find an alias for the inode, but this failure can be fixed.
+ovl_encode_fh() seldom uses the alias, and in the case of non-decodable
+file handles (as is often the case with fanotify fid info) it never
+needs the alias to encode a file handle.
+
+Defer finding an alias until it is actually needed so ovl_encode_fh()
+will not fail in the common case of FAN_DELETE_SELF fanotify events.
+
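+As a rough sketch of the deferred lookup (condensed from the export.c
+hunks below; return value 1 means "encode a lower file handle"), only
+the decodable-directory path ever touches an alias:
+
+	static int check_encode_origin_sketch(struct inode *inode, bool decodable)
+	{
+		struct dentry *alias;
+		int err;
+
+		/* Non-decodable handles and non-directories need no alias. */
+		if (!decodable || !S_ISDIR(inode->i_mode))
+			return 1;
+
+		/* Only now look up an alias, to connect the origin layer. */
+		alias = d_find_any_alias(inode);
+		if (!alias)
+			return -ENOENT;
+		err = ovl_connect_layer(alias);
+		dput(alias);
+		return err < 0 ? err : 1;
+	}
+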
+Fixes: 16aac5ad1fa9 ("ovl: support encoding non-decodable file handles")
+Reported-by: Dmitry Safonov <dima@arista.com>
+Closes: https://lore.kernel.org/linux-fsdevel/CAOQ4uxiie81voLZZi2zXS1BziXZCM24nXqPAxbu8kxXCUWdwOg@mail.gmail.com/
+Signed-off-by: Amir Goldstein <amir73il@gmail.com>
+Link: https://lore.kernel.org/r/20250105162404.357058-3-amir73il@gmail.com
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/overlayfs/export.c | 46 +++++++++++++++++++++++--------------------
+ 1 file changed, 25 insertions(+), 21 deletions(-)
+
+diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
+index 036c9f39a14d..444aeeccb6da 100644
+--- a/fs/overlayfs/export.c
++++ b/fs/overlayfs/export.c
+@@ -176,35 +176,37 @@ static int ovl_connect_layer(struct dentry *dentry)
+ *
+ * Return 0 for upper file handle, > 0 for lower file handle or < 0 on error.
+ */
+-static int ovl_check_encode_origin(struct dentry *dentry)
++static int ovl_check_encode_origin(struct inode *inode)
+ {
+- struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
++ struct ovl_fs *ofs = OVL_FS(inode->i_sb);
+ bool decodable = ofs->config.nfs_export;
++ struct dentry *dentry;
++ int err;
+
+ /* No upper layer? */
+ if (!ovl_upper_mnt(ofs))
+ return 1;
+
+ /* Lower file handle for non-upper non-decodable */
+- if (!ovl_dentry_upper(dentry) && !decodable)
++ if (!ovl_inode_upper(inode) && !decodable)
+ return 1;
+
+ /* Upper file handle for pure upper */
+- if (!ovl_dentry_lower(dentry))
++ if (!ovl_inode_lower(inode))
+ return 0;
+
+ /*
+ * Root is never indexed, so if there's an upper layer, encode upper for
+ * root.
+ */
+- if (dentry == dentry->d_sb->s_root)
++ if (inode == d_inode(inode->i_sb->s_root))
+ return 0;
+
+ /*
+ * Upper decodable file handle for non-indexed upper.
+ */
+- if (ovl_dentry_upper(dentry) && decodable &&
+- !ovl_test_flag(OVL_INDEX, d_inode(dentry)))
++ if (ovl_inode_upper(inode) && decodable &&
++ !ovl_test_flag(OVL_INDEX, inode))
+ return 0;
+
+ /*
+@@ -213,17 +215,25 @@ static int ovl_check_encode_origin(struct dentry *dentry)
+ * ovl_connect_layer() will try to make origin's layer "connected" by
+ * copying up a "connectable" ancestor.
+ */
+- if (d_is_dir(dentry) && decodable)
+- return ovl_connect_layer(dentry);
++ if (!decodable || !S_ISDIR(inode->i_mode))
++ return 1;
++
++ dentry = d_find_any_alias(inode);
++ if (!dentry)
++ return -ENOENT;
++
++ err = ovl_connect_layer(dentry);
++ dput(dentry);
++ if (err < 0)
++ return err;
+
+ /* Lower file handle for indexed and non-upper dir/non-dir */
+ return 1;
+ }
+
+-static int ovl_dentry_to_fid(struct ovl_fs *ofs, struct dentry *dentry,
++static int ovl_dentry_to_fid(struct ovl_fs *ofs, struct inode *inode,
+ u32 *fid, int buflen)
+ {
+- struct inode *inode = d_inode(dentry);
+ struct ovl_fh *fh = NULL;
+ int err, enc_lower;
+ int len;
+@@ -232,7 +242,7 @@ static int ovl_dentry_to_fid(struct ovl_fs *ofs, struct dentry *dentry,
+ * Check if we should encode a lower or upper file handle and maybe
+ * copy up an ancestor to make lower file handle connectable.
+ */
+- err = enc_lower = ovl_check_encode_origin(dentry);
++ err = enc_lower = ovl_check_encode_origin(inode);
+ if (enc_lower < 0)
+ goto fail;
+
+@@ -252,8 +262,8 @@ static int ovl_dentry_to_fid(struct ovl_fs *ofs, struct dentry *dentry,
+ return err;
+
+ fail:
+- pr_warn_ratelimited("failed to encode file handle (%pd2, err=%i)\n",
+- dentry, err);
++ pr_warn_ratelimited("failed to encode file handle (ino=%lu, err=%i)\n",
++ inode->i_ino, err);
+ goto out;
+ }
+
+@@ -261,19 +271,13 @@ static int ovl_encode_fh(struct inode *inode, u32 *fid, int *max_len,
+ struct inode *parent)
+ {
+ struct ovl_fs *ofs = OVL_FS(inode->i_sb);
+- struct dentry *dentry;
+ int bytes, buflen = *max_len << 2;
+
+ /* TODO: encode connectable file handles */
+ if (parent)
+ return FILEID_INVALID;
+
+- dentry = d_find_any_alias(inode);
+- if (!dentry)
+- return FILEID_INVALID;
+-
+- bytes = ovl_dentry_to_fid(ofs, dentry, fid, buflen);
+- dput(dentry);
++ bytes = ovl_dentry_to_fid(ofs, inode, fid, buflen);
+ if (bytes <= 0)
+ return FILEID_INVALID;
+
+--
+2.39.5
+
--- /dev/null
+jbd2-increase-io-priority-for-writing-revoke-records.patch
+jbd2-flush-filesystem-device-before-updating-tail-se.patch
+fs-writeback-convert-wbc_account_cgroup_owner-to-tak.patch
+iomap-pass-byte-granular-end-position-to-iomap_add_t.patch
+iomap-fix-zero-padding-data-issue-in-concurrent-appe.patch
+dm-array-fix-releasing-a-faulty-array-block-twice-in.patch
+dm-array-fix-unreleased-btree-blocks-on-closing-a-fa.patch
+dm-array-fix-cursor-index-when-skipping-across-block.patch
+netfs-fix-enomem-handling-in-buffered-reads.patch
+nfs-fix-oops-in-nfs_netfs_init_request-when-copying-.patch
+netfs-fix-missing-barriers-by-using-clear_and_wake_u.patch
+netfs-fix-ceph-copy-to-cache-on-write-begin.patch
+netfs-fix-the-non-cancellation-of-copy-when-cache-is.patch
+netfs-fix-is-caching-check-in-read-retry.patch
+exfat-fix-the-infinite-loop-in-exfat_readdir.patch
+exfat-fix-the-new-buffer-was-not-zeroed-before-writi.patch
+exfat-fix-the-infinite-loop-in-__exfat_free_cluster.patch
+fuse-respect-fopen_keep_cache-on-opendir.patch
+ovl-pass-realinode-to-ovl_encode_real_fh-instead-of-.patch
+ovl-support-encoding-fid-from-inode-with-no-alias.patch