git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
Fixes for 6.12
author Sasha Levin <sashal@kernel.org>
Thu, 9 Jan 2025 13:52:23 +0000 (08:52 -0500)
committer Sasha Levin <sashal@kernel.org>
Thu, 9 Jan 2025 13:52:23 +0000 (08:52 -0500)
Signed-off-by: Sasha Levin <sashal@kernel.org>
21 files changed:
queue-6.12/dm-array-fix-cursor-index-when-skipping-across-block.patch [new file with mode: 0644]
queue-6.12/dm-array-fix-releasing-a-faulty-array-block-twice-in.patch [new file with mode: 0644]
queue-6.12/dm-array-fix-unreleased-btree-blocks-on-closing-a-fa.patch [new file with mode: 0644]
queue-6.12/exfat-fix-the-infinite-loop-in-__exfat_free_cluster.patch [new file with mode: 0644]
queue-6.12/exfat-fix-the-infinite-loop-in-exfat_readdir.patch [new file with mode: 0644]
queue-6.12/exfat-fix-the-new-buffer-was-not-zeroed-before-writi.patch [new file with mode: 0644]
queue-6.12/fs-writeback-convert-wbc_account_cgroup_owner-to-tak.patch [new file with mode: 0644]
queue-6.12/fuse-respect-fopen_keep_cache-on-opendir.patch [new file with mode: 0644]
queue-6.12/iomap-fix-zero-padding-data-issue-in-concurrent-appe.patch [new file with mode: 0644]
queue-6.12/iomap-pass-byte-granular-end-position-to-iomap_add_t.patch [new file with mode: 0644]
queue-6.12/jbd2-flush-filesystem-device-before-updating-tail-se.patch [new file with mode: 0644]
queue-6.12/jbd2-increase-io-priority-for-writing-revoke-records.patch [new file with mode: 0644]
queue-6.12/netfs-fix-ceph-copy-to-cache-on-write-begin.patch [new file with mode: 0644]
queue-6.12/netfs-fix-enomem-handling-in-buffered-reads.patch [new file with mode: 0644]
queue-6.12/netfs-fix-is-caching-check-in-read-retry.patch [new file with mode: 0644]
queue-6.12/netfs-fix-missing-barriers-by-using-clear_and_wake_u.patch [new file with mode: 0644]
queue-6.12/netfs-fix-the-non-cancellation-of-copy-when-cache-is.patch [new file with mode: 0644]
queue-6.12/nfs-fix-oops-in-nfs_netfs_init_request-when-copying-.patch [new file with mode: 0644]
queue-6.12/ovl-pass-realinode-to-ovl_encode_real_fh-instead-of-.patch [new file with mode: 0644]
queue-6.12/ovl-support-encoding-fid-from-inode-with-no-alias.patch [new file with mode: 0644]
queue-6.12/series [new file with mode: 0644]

diff --git a/queue-6.12/dm-array-fix-cursor-index-when-skipping-across-block.patch b/queue-6.12/dm-array-fix-cursor-index-when-skipping-across-block.patch
new file mode 100644 (file)
index 0000000..cabc10e
--- /dev/null
@@ -0,0 +1,74 @@
+From 379ee0db89d417747760eeacdfe48e01b217ac08 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 5 Dec 2024 19:41:53 +0800
+Subject: dm array: fix cursor index when skipping across block boundaries
+
+From: Ming-Hung Tsai <mtsai@redhat.com>
+
+[ Upstream commit 0bb1968da2737ba68fd63857d1af2b301a18d3bf ]
+
+dm_array_cursor_skip() seeks to the target position by loading array
+blocks iteratively until the specified number of entries to skip is
+reached. When seeking across block boundaries, it uses
+dm_array_cursor_next() to step into the next block.
+dm_array_cursor_skip() must first move the cursor index to the end
+of the current block; otherwise, the cursor position could incorrectly
+remain in the same block, causing the actual number of skipped entries
+to be much smaller than expected.
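+
+As a minimal user-space sketch of the skip logic (hypothetical types and
+helpers, not the real dm_array structures), the point is that the index
+must be moved to the last entry of the current block so that the next()
+step actually crosses the block boundary:
+
+  #include <stdio.h>
+  #include <stdint.h>
+
+  /* Hypothetical stand-ins for the cursor state: nr_entries entries per
+   * block, index is the position within the current block. */
+  struct cursor {
+          uint32_t block;
+          uint32_t index;
+          uint32_t nr_entries;
+          uint32_t nr_blocks;
+  };
+
+  /* Advances by one entry, crossing into the next block only when the
+   * current block is exhausted (mirrors dm_array_cursor_next()). */
+  static int cursor_next(struct cursor *c)
+  {
+          if (c->index + 1 < c->nr_entries) {
+                  c->index++;
+                  return 0;
+          }
+          if (c->block + 1 >= c->nr_blocks)
+                  return -1;
+          c->block++;
+          c->index = 0;
+          return 0;
+  }
+
+  static int cursor_skip(struct cursor *c, uint32_t count)
+  {
+          int r = 0;
+
+          do {
+                  uint32_t remaining = c->nr_entries - c->index;
+
+                  if (count < remaining) {
+                          c->index += count;
+                          return 0;
+                  }
+                  count -= remaining;
+                  /* Without this step, next() only advances by one entry
+                   * inside the same block and far too few entries get
+                   * skipped. */
+                  c->index += remaining - 1;
+                  r = cursor_next(c);
+          } while (!r);
+
+          return r;
+  }
+
+  int main(void)
+  {
+          struct cursor c = { .block = 0, .index = 0,
+                              .nr_entries = 8, .nr_blocks = 4 };
+
+          cursor_skip(&c, 10);
+          /* Prints "block=1 index=2": 10 entries past (0,0). */
+          printf("block=%u index=%u\n", c.block, c.index);
+          return 0;
+  }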
+
+This bug affects cache resizing in v2 metadata and could lead to data
+loss if the fast device is shrunk during the first-time resume. For
+example:
+
+1. create cache metadata consisting of 32768 blocks, with a dirty block
+   assigned to the second bitmap block. cache_restore v1.0 is required.
+
+cat <<EOF >> cmeta.xml
+<superblock uuid="" block_size="64" nr_cache_blocks="32768" \
+policy="smq" hint_width="4">
+  <mappings>
+    <mapping cache_block="32767" origin_block="0" dirty="true"/>
+  </mappings>
+</superblock>
+EOF
+dmsetup create cmeta --table "0 8192 linear /dev/sdc 0"
+cache_restore -i cmeta.xml -o /dev/mapper/cmeta --metadata-version=2
+
+2. bring up the cache while attempting to discard all the blocks belonging
+   to the second bitmap block (block# 32576 to 32767). The last command
+   is expected to fail, but it actually succeeds.
+
+dmsetup create cdata --table "0 2084864 linear /dev/sdc 8192"
+dmsetup create corig --table "0 65536 linear /dev/sdc 2105344"
+dmsetup create cache --table "0 65536 cache /dev/mapper/cmeta \
+/dev/mapper/cdata /dev/mapper/corig 64 2 metadata2 writeback smq \
+2 migration_threshold 0"
+
+In addition to the reproducer described above, this fix can be
+verified using the "array_cursor/skip" tests in dm-unit:
+  dm-unit run /pdata/array_cursor/skip/ --kernel-dir <KERNEL_DIR>
+
+Signed-off-by: Ming-Hung Tsai <mtsai@redhat.com>
+Fixes: 9b696229aa7d ("dm persistent data: add cursor skip functions to the cursor APIs")
+Reviewed-by: Joe Thornber <thornber@redhat.com>
+Signed-off-by: Mike Snitzer <snitzer@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/md/persistent-data/dm-array.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c
+index 0850dfdffc8c..8f8792e55806 100644
+--- a/drivers/md/persistent-data/dm-array.c
++++ b/drivers/md/persistent-data/dm-array.c
+@@ -1003,6 +1003,7 @@ int dm_array_cursor_skip(struct dm_array_cursor *c, uint32_t count)
+               }
+               count -= remaining;
++              c->index += (remaining - 1);
+               r = dm_array_cursor_next(c);
+       } while (!r);
+-- 
+2.39.5
+
diff --git a/queue-6.12/dm-array-fix-releasing-a-faulty-array-block-twice-in.patch b/queue-6.12/dm-array-fix-releasing-a-faulty-array-block-twice-in.patch
new file mode 100644 (file)
index 0000000..36b6d1e
--- /dev/null
@@ -0,0 +1,110 @@
+From d8517732262f9518a97c457c1a892eafaf80328f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 5 Dec 2024 19:41:51 +0800
+Subject: dm array: fix releasing a faulty array block twice in
+ dm_array_cursor_end
+
+From: Ming-Hung Tsai <mtsai@redhat.com>
+
+[ Upstream commit f2893c0804d86230ffb8f1c8703fdbb18648abc8 ]
+
+When dm_bm_read_lock() fails due to locking or checksum errors, it
+releases the faulty block implicitly while leaving an invalid output
+pointer behind. The caller of dm_bm_read_lock() should not operate on
+this invalid dm_block pointer, or it will lead to undefined results.
+For example, the dm_array_cursor incorrectly caches the invalid pointer
+on reading a faulty array block, causing a double release in
+dm_array_cursor_end(), then hitting the BUG_ON in dm-bufio cache_put().
+
+Reproduce steps:
+
+1. initialize a cache device
+
+dmsetup create cmeta --table "0 8192 linear /dev/sdc 0"
+dmsetup create cdata --table "0 65536 linear /dev/sdc 8192"
+dmsetup create corig --table "0 524288 linear /dev/sdc $262144"
+dd if=/dev/zero of=/dev/mapper/cmeta bs=4k count=1
+dmsetup create cache --table "0 524288 cache /dev/mapper/cmeta \
+/dev/mapper/cdata /dev/mapper/corig 128 2 metadata2 writethrough smq 0"
+
+2. wipe the second array block offline
+
+dmsetup remove cache cmeta cdata corig
+mapping_root=$(dd if=/dev/sdc bs=1c count=8 skip=192 \
+2>/dev/null | hexdump -e '1/8 "%u\n"')
+ablock=$(dd if=/dev/sdc bs=1c count=8 skip=$((4096*mapping_root+2056)) \
+2>/dev/null | hexdump -e '1/8 "%u\n"')
+dd if=/dev/zero of=/dev/sdc bs=4k count=1 seek=$ablock
+
+3. try to reopen the cache device
+
+dmsetup create cmeta --table "0 8192 linear /dev/sdc 0"
+dmsetup create cdata --table "0 65536 linear /dev/sdc 8192"
+dmsetup create corig --table "0 524288 linear /dev/sdc $262144"
+dmsetup create cache --table "0 524288 cache /dev/mapper/cmeta \
+/dev/mapper/cdata /dev/mapper/corig 128 2 metadata2 writethrough smq 0"
+
+Kernel logs:
+
+(snip)
+device-mapper: array: array_block_check failed: blocknr 0 != wanted 10
+device-mapper: block manager: array validator check failed for block 10
+device-mapper: array: get_ablock failed
+device-mapper: cache metadata: dm_array_cursor_next for mapping failed
+------------[ cut here ]------------
+kernel BUG at drivers/md/dm-bufio.c:638!
+
+Fix by setting the cached block pointer to NULL on errors.
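+
+A rough user-space sketch of this guard (made-up types, not the dm-bufio
+interfaces): once the loader has dropped the faulty block, the cursor has
+to forget its pointer, otherwise a later end() releases the same block a
+second time:
+
+  #include <stdlib.h>
+  #include <assert.h>
+
+  struct block { int holders; };
+
+  struct cursor { struct block *blk; };
+
+  static void put_block(struct block *b)
+  {
+          assert(b->holders > 0);        /* models the BUG_ON in cache_put() */
+          if (--b->holders == 0)
+                  free(b);
+  }
+
+  static int load_block(struct cursor *c, int fail)
+  {
+          struct block *b = malloc(sizeof(*b));
+
+          b->holders = 1;
+          c->blk = b;
+          if (fail) {
+                  put_block(b);          /* the callee drops the faulty block... */
+                  c->blk = NULL;         /* ...so forget it, or end() puts it twice */
+                  return -1;
+          }
+          return 0;
+  }
+
+  static void cursor_end(struct cursor *c)
+  {
+          if (c->blk)
+                  put_block(c->blk);
+  }
+
+  int main(void)
+  {
+          struct cursor c = { .blk = NULL };
+
+          load_block(&c, 1);     /* simulate an unreadable array block */
+          cursor_end(&c);        /* safe: the cached pointer was cleared */
+          return 0;
+  }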
+
+In addition to the reproducer described above, this fix can be
+verified using the "array_cursor/damaged" test in dm-unit:
+  dm-unit run /pdata/array_cursor/damaged --kernel-dir <KERNEL_DIR>
+
+Signed-off-by: Ming-Hung Tsai <mtsai@redhat.com>
+Fixes: fdd1315aa5f0 ("dm array: introduce cursor api")
+Reviewed-by: Joe Thornber <thornber@redhat.com>
+Signed-off-by: Mike Snitzer <snitzer@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/md/persistent-data/dm-array.c | 12 ++++++++----
+ 1 file changed, 8 insertions(+), 4 deletions(-)
+
+diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c
+index 157c9bd2fed7..4866ff56125f 100644
+--- a/drivers/md/persistent-data/dm-array.c
++++ b/drivers/md/persistent-data/dm-array.c
+@@ -917,23 +917,27 @@ static int load_ablock(struct dm_array_cursor *c)
+       if (c->block)
+               unlock_ablock(c->info, c->block);
+-      c->block = NULL;
+-      c->ab = NULL;
+       c->index = 0;
+       r = dm_btree_cursor_get_value(&c->cursor, &key, &value_le);
+       if (r) {
+               DMERR("dm_btree_cursor_get_value failed");
+-              dm_btree_cursor_end(&c->cursor);
++              goto out;
+       } else {
+               r = get_ablock(c->info, le64_to_cpu(value_le), &c->block, &c->ab);
+               if (r) {
+                       DMERR("get_ablock failed");
+-                      dm_btree_cursor_end(&c->cursor);
++                      goto out;
+               }
+       }
++      return 0;
++
++out:
++      dm_btree_cursor_end(&c->cursor);
++      c->block = NULL;
++      c->ab = NULL;
+       return r;
+ }
+-- 
+2.39.5
+
diff --git a/queue-6.12/dm-array-fix-unreleased-btree-blocks-on-closing-a-fa.patch b/queue-6.12/dm-array-fix-unreleased-btree-blocks-on-closing-a-fa.patch
new file mode 100644 (file)
index 0000000..122f74c
--- /dev/null
@@ -0,0 +1,49 @@
+From d657e02caac44de5b858bc66b6788c23d82d1f33 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 5 Dec 2024 19:41:52 +0800
+Subject: dm array: fix unreleased btree blocks on closing a faulty array
+ cursor
+
+From: Ming-Hung Tsai <mtsai@redhat.com>
+
+[ Upstream commit 626f128ee9c4133b1cfce4be2b34a1508949370e ]
+
+The cached block pointer in dm_array_cursor might be NULL if it reaches
+an unreadable array block, or the array is empty. Therefore,
+dm_array_cursor_end() should call dm_btree_cursor_end() unconditionally,
+to prevent leaving unreleased btree blocks.
+
+This fix can be verified using the "array_cursor/iterate/empty" test
+in dm-unit:
+  dm-unit run /pdata/array_cursor/iterate/empty --kernel-dir <KERNEL_DIR>
+
+Signed-off-by: Ming-Hung Tsai <mtsai@redhat.com>
+Fixes: fdd1315aa5f0 ("dm array: introduce cursor api")
+Reviewed-by: Joe Thornber <thornber@redhat.com>
+Signed-off-by: Mike Snitzer <snitzer@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/md/persistent-data/dm-array.c | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c
+index 4866ff56125f..0850dfdffc8c 100644
+--- a/drivers/md/persistent-data/dm-array.c
++++ b/drivers/md/persistent-data/dm-array.c
+@@ -960,10 +960,10 @@ EXPORT_SYMBOL_GPL(dm_array_cursor_begin);
+ void dm_array_cursor_end(struct dm_array_cursor *c)
+ {
+-      if (c->block) {
++      if (c->block)
+               unlock_ablock(c->info, c->block);
+-              dm_btree_cursor_end(&c->cursor);
+-      }
++
++      dm_btree_cursor_end(&c->cursor);
+ }
+ EXPORT_SYMBOL_GPL(dm_array_cursor_end);
+-- 
+2.39.5
+
diff --git a/queue-6.12/exfat-fix-the-infinite-loop-in-__exfat_free_cluster.patch b/queue-6.12/exfat-fix-the-infinite-loop-in-__exfat_free_cluster.patch
new file mode 100644 (file)
index 0000000..08f7912
--- /dev/null
@@ -0,0 +1,53 @@
+From 6442221b5d8f00f1a08e5194fde67229095ea1f7 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 16 Dec 2024 13:39:42 +0800
+Subject: exfat: fix the infinite loop in __exfat_free_cluster()
+
+From: Yuezhang Mo <Yuezhang.Mo@sony.com>
+
+[ Upstream commit a5324b3a488d883aa2d42f72260054e87d0940a0 ]
+
+In __exfat_free_cluster(), the cluster chain is traversed until the
+EOF cluster. If the cluster chain includes a loop due to file system
+corruption, the EOF cluster can never be reached, resulting in an
+infinite loop.
+
+This commit uses the total number of clusters to prevent this infinite
+loop.
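+
+A minimal user-space model of this kind of guard (the names below are
+made up, not the exfat code): bound the walk by the total number of
+clusters so a chain that loops back on itself cannot spin forever:
+
+  #include <stdio.h>
+  #include <stdint.h>
+
+  #define EOF_CLUSTER  0xFFFFFFFFu
+  #define NUM_CLUSTERS 8u
+
+  /* fat[i] holds the next cluster in the chain; index 2 points back to
+   * itself, modelling a corrupted, looping chain. */
+  static const uint32_t fat[NUM_CLUSTERS] = {
+          EOF_CLUSTER, EOF_CLUSTER, 2, EOF_CLUSTER,
+          EOF_CLUSTER, EOF_CLUSTER, EOF_CLUSTER, EOF_CLUSTER,
+  };
+
+  static int walk_chain(uint32_t clu)
+  {
+          uint32_t visited = 0;
+
+          while (clu != EOF_CLUSTER) {
+                  /* ...free the cluster here... */
+                  if (++visited >= NUM_CLUSTERS) {
+                          /* More steps than clusters exist: the chain
+                           * must contain a loop, so bail out instead of
+                           * spinning. */
+                          return -1;
+                  }
+                  clu = fat[clu];
+          }
+          return 0;
+  }
+
+  int main(void)
+  {
+          printf("healthy chain: %d\n", walk_chain(3));  /* 0 */
+          printf("looping chain: %d\n", walk_chain(2));  /* -1 */
+          return 0;
+  }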
+
+Reported-by: syzbot+1de5a37cb85a2d536330@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=1de5a37cb85a2d536330
+Tested-by: syzbot+1de5a37cb85a2d536330@syzkaller.appspotmail.com
+Fixes: 31023864e67a ("exfat: add fat entry operations")
+Signed-off-by: Yuezhang Mo <Yuezhang.Mo@sony.com>
+Reviewed-by: Sungjong Seo <sj1557.seo@samsung.com>
+Signed-off-by: Namjae Jeon <linkinjeon@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/exfat/fatent.c | 10 ++++++++++
+ 1 file changed, 10 insertions(+)
+
+diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c
+index 773c320d68f3..9e5492ac409b 100644
+--- a/fs/exfat/fatent.c
++++ b/fs/exfat/fatent.c
+@@ -216,6 +216,16 @@ static int __exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain
+                       if (err)
+                               goto dec_used_clus;
++
++                      if (num_clusters >= sbi->num_clusters - EXFAT_FIRST_CLUSTER) {
++                              /*
++                               * The cluster chain includes a loop, scan the
++                               * bitmap to get the number of used clusters.
++                               */
++                              exfat_count_used_clusters(sb, &sbi->used_clusters);
++
++                              return 0;
++                      }
+               } while (clu != EXFAT_EOF_CLUSTER);
+       }
+-- 
+2.39.5
+
diff --git a/queue-6.12/exfat-fix-the-infinite-loop-in-exfat_readdir.patch b/queue-6.12/exfat-fix-the-infinite-loop-in-exfat_readdir.patch
new file mode 100644 (file)
index 0000000..4257cd8
--- /dev/null
@@ -0,0 +1,57 @@
+From dd98a1f4fe72eb4042682add6dd320753c8b48e8 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 13 Dec 2024 13:08:37 +0800
+Subject: exfat: fix the infinite loop in exfat_readdir()
+
+From: Yuezhang Mo <Yuezhang.Mo@sony.com>
+
+[ Upstream commit fee873761bd978d077d8c55334b4966ac4cb7b59 ]
+
+If the file system is corrupted so that a cluster is linked to
+itself in the cluster chain, and there is an unused directory
+entry in the cluster, 'dentry' will not be incremented, causing
+condition 'dentry < max_dentries' unable to prevent an infinite
+loop.
+
+This infinite loop causes s_lock not to be released, and other
+tasks will hang, such as exfat_sync_fs().
+
+This commit stops traversing the cluster chain when there is an unused
+directory entry in the cluster, to avoid this infinite loop.
+
+Reported-by: syzbot+205c2644abdff9d3f9fc@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=205c2644abdff9d3f9fc
+Tested-by: syzbot+205c2644abdff9d3f9fc@syzkaller.appspotmail.com
+Fixes: ca06197382bd ("exfat: add directory operations")
+Signed-off-by: Yuezhang Mo <Yuezhang.Mo@sony.com>
+Reviewed-by: Sungjong Seo <sj1557.seo@samsung.com>
+Signed-off-by: Namjae Jeon <linkinjeon@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/exfat/dir.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c
+index 7446bf09a04a..9d8848872fe8 100644
+--- a/fs/exfat/dir.c
++++ b/fs/exfat/dir.c
+@@ -125,7 +125,7 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent
+                       type = exfat_get_entry_type(ep);
+                       if (type == TYPE_UNUSED) {
+                               brelse(bh);
+-                              break;
++                              goto out;
+                       }
+                       if (type != TYPE_FILE && type != TYPE_DIR) {
+@@ -189,6 +189,7 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent
+               }
+       }
++out:
+       dir_entry->namebuf.lfn[0] = '\0';
+       *cpos = EXFAT_DEN_TO_B(dentry);
+       return 0;
+-- 
+2.39.5
+
diff --git a/queue-6.12/exfat-fix-the-new-buffer-was-not-zeroed-before-writi.patch b/queue-6.12/exfat-fix-the-new-buffer-was-not-zeroed-before-writi.patch
new file mode 100644 (file)
index 0000000..bd94c8d
--- /dev/null
@@ -0,0 +1,62 @@
+From 360e257e52eb050b6f78c12bbef02ff0cc89a7ea Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 12 Dec 2024 16:29:23 +0800
+Subject: exfat: fix the new buffer was not zeroed before writing
+
+From: Yuezhang Mo <Yuezhang.Mo@sony.com>
+
+[ Upstream commit 98e2fb26d1a9eafe79f46d15d54e68e014d81d8c ]
+
+Before writing, if a buffer_head is marked as new, its data must
+be zeroed; otherwise, uninitialized data in the page cache will
+be written.
+
+So this commit uses folio_zero_new_buffers() to zero the new
+buffers before ->write_end().
+
+Fixes: 6630ea49103c ("exfat: move extend valid_size into ->page_mkwrite()")
+Reported-by: syzbot+91ae49e1c1a2634d20c0@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=91ae49e1c1a2634d20c0
+Tested-by: syzbot+91ae49e1c1a2634d20c0@syzkaller.appspotmail.com
+Signed-off-by: Yuezhang Mo <Yuezhang.Mo@sony.com>
+Reviewed-by: Sungjong Seo <sj1557.seo@samsung.com>
+Signed-off-by: Namjae Jeon <linkinjeon@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/exfat/file.c | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+diff --git a/fs/exfat/file.c b/fs/exfat/file.c
+index fb38769c3e39..05b51e721783 100644
+--- a/fs/exfat/file.c
++++ b/fs/exfat/file.c
+@@ -545,6 +545,7 @@ static int exfat_extend_valid_size(struct file *file, loff_t new_valid_size)
+       while (pos < new_valid_size) {
+               u32 len;
+               struct folio *folio;
++              unsigned long off;
+               len = PAGE_SIZE - (pos & (PAGE_SIZE - 1));
+               if (pos + len > new_valid_size)
+@@ -554,6 +555,9 @@ static int exfat_extend_valid_size(struct file *file, loff_t new_valid_size)
+               if (err)
+                       goto out;
++              off = offset_in_folio(folio, pos);
++              folio_zero_new_buffers(folio, off, off + len);
++
+               err = ops->write_end(file, mapping, pos, len, len, folio, NULL);
+               if (err < 0)
+                       goto out;
+@@ -563,6 +567,8 @@ static int exfat_extend_valid_size(struct file *file, loff_t new_valid_size)
+               cond_resched();
+       }
++      return 0;
++
+ out:
+       return err;
+ }
+-- 
+2.39.5
+
diff --git a/queue-6.12/fs-writeback-convert-wbc_account_cgroup_owner-to-tak.patch b/queue-6.12/fs-writeback-convert-wbc_account_cgroup_owner-to-tak.patch
new file mode 100644 (file)
index 0000000..c18c678
--- /dev/null
@@ -0,0 +1,255 @@
+From 7bf5bc73f000f77982d49184c607283b7bbfee9a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 26 Sep 2024 16:01:21 +0200
+Subject: fs/writeback: convert wbc_account_cgroup_owner to take a folio
+
+From: Pankaj Raghav <p.raghav@samsung.com>
+
+[ Upstream commit 30dac24e14b52e1787572d1d4e06eeabe8a63630 ]
+
+Most of the callers of wbc_account_cgroup_owner() are converting a folio
+to a page before calling the function. wbc_account_cgroup_owner() is
+converting the page back to a folio to call mem_cgroup_css_from_folio().
+
+Convert wbc_account_cgroup_owner() to take a folio instead of a page,
+and convert all callers to pass a folio directly except f2fs.
+
+Convert the page to folio for all the callers from f2fs as they were the
+only callers calling wbc_account_cgroup_owner() with a page. As f2fs is
+already in the process of converting to folios, these call sites might
+also soon be calling wbc_account_cgroup_owner() with a folio directly in
+the future.
+
+No functional changes. Only compile tested.
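+
+For illustration, the call-site change is mechanical (kernel-context
+fragments, not a standalone program): callers that already hold a folio
+drop the &folio->page round-trip, and the f2fs callers that still track
+a struct page wrap it with page_folio():
+
+  /* before: hand the function a page derived from the folio */
+  wbc_account_cgroup_owner(wbc, &folio->page, len);
+
+  /* after: pass the folio directly */
+  wbc_account_cgroup_owner(wbc, folio, len);
+
+  /* caller that still tracks a struct page (as in the f2fs hunks) */
+  wbc_account_cgroup_owner(wbc, page_folio(page), len);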
+
+Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
+Link: https://lore.kernel.org/r/20240926140121.203821-1-kernel@pankajraghav.com
+Acked-by: David Sterba <dsterba@suse.com>
+Acked-by: Tejun Heo <tj@kernel.org>
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Stable-dep-of: 51d20d1dacbe ("iomap: fix zero padding data issue in concurrent append writes")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ Documentation/admin-guide/cgroup-v2.rst | 2 +-
+ fs/btrfs/extent_io.c                    | 7 +++----
+ fs/btrfs/inode.c                        | 2 +-
+ fs/buffer.c                             | 4 ++--
+ fs/ext4/page-io.c                       | 2 +-
+ fs/f2fs/data.c                          | 9 ++++++---
+ fs/fs-writeback.c                       | 8 +++-----
+ fs/iomap/buffered-io.c                  | 2 +-
+ fs/mpage.c                              | 2 +-
+ include/linux/writeback.h               | 4 ++--
+ 10 files changed, 21 insertions(+), 21 deletions(-)
+
+diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
+index 6d02168d78be..2cb58daf3089 100644
+--- a/Documentation/admin-guide/cgroup-v2.rst
++++ b/Documentation/admin-guide/cgroup-v2.rst
+@@ -2954,7 +2954,7 @@ following two functions.
+       a queue (device) has been associated with the bio and
+       before submission.
+-  wbc_account_cgroup_owner(@wbc, @page, @bytes)
++  wbc_account_cgroup_owner(@wbc, @folio, @bytes)
+       Should be called for each data segment being written out.
+       While this function doesn't care exactly when it's called
+       during the writeback session, it's the easiest and most
+diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
+index 872cca54cc6c..42c9899d9241 100644
+--- a/fs/btrfs/extent_io.c
++++ b/fs/btrfs/extent_io.c
+@@ -786,7 +786,7 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
+               }
+               if (bio_ctrl->wbc)
+-                      wbc_account_cgroup_owner(bio_ctrl->wbc, &folio->page,
++                      wbc_account_cgroup_owner(bio_ctrl->wbc, folio,
+                                                len);
+               size -= len;
+@@ -1708,7 +1708,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
+               ret = bio_add_folio(&bbio->bio, folio, eb->len,
+                                   eb->start - folio_pos(folio));
+               ASSERT(ret);
+-              wbc_account_cgroup_owner(wbc, folio_page(folio, 0), eb->len);
++              wbc_account_cgroup_owner(wbc, folio, eb->len);
+               folio_unlock(folio);
+       } else {
+               int num_folios = num_extent_folios(eb);
+@@ -1722,8 +1722,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
+                       folio_start_writeback(folio);
+                       ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0);
+                       ASSERT(ret);
+-                      wbc_account_cgroup_owner(wbc, folio_page(folio, 0),
+-                                               eb->folio_size);
++                      wbc_account_cgroup_owner(wbc, folio, eb->folio_size);
+                       wbc->nr_to_write -= folio_nr_pages(folio);
+                       folio_unlock(folio);
+               }
+diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
+index b5cfb85af937..a3c861b2a6d2 100644
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -1729,7 +1729,7 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode,
+                        * need full accuracy.  Just account the whole thing
+                        * against the first page.
+                        */
+-                      wbc_account_cgroup_owner(wbc, &locked_folio->page,
++                      wbc_account_cgroup_owner(wbc, locked_folio,
+                                                cur_end - start);
+                       async_chunk[i].locked_folio = locked_folio;
+                       locked_folio = NULL;
+diff --git a/fs/buffer.c b/fs/buffer.c
+index 1fc9a50def0b..32bd0f4c4223 100644
+--- a/fs/buffer.c
++++ b/fs/buffer.c
+@@ -2803,7 +2803,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
+       bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+       bio->bi_write_hint = write_hint;
+-      __bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
++      bio_add_folio_nofail(bio, bh->b_folio, bh->b_size, bh_offset(bh));
+       bio->bi_end_io = end_bio_bh_io_sync;
+       bio->bi_private = bh;
+@@ -2813,7 +2813,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
+       if (wbc) {
+               wbc_init_bio(wbc, bio);
+-              wbc_account_cgroup_owner(wbc, bh->b_page, bh->b_size);
++              wbc_account_cgroup_owner(wbc, bh->b_folio, bh->b_size);
+       }
+       submit_bio(bio);
+diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
+index ad5543866d21..b7b9261fec3b 100644
+--- a/fs/ext4/page-io.c
++++ b/fs/ext4/page-io.c
+@@ -421,7 +421,7 @@ static void io_submit_add_bh(struct ext4_io_submit *io,
+               io_submit_init_bio(io, bh);
+       if (!bio_add_folio(io->io_bio, io_folio, bh->b_size, bh_offset(bh)))
+               goto submit_and_retry;
+-      wbc_account_cgroup_owner(io->io_wbc, &folio->page, bh->b_size);
++      wbc_account_cgroup_owner(io->io_wbc, folio, bh->b_size);
+       io->io_next_block++;
+ }
+diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
+index da0960d496ae..1b0050b8421d 100644
+--- a/fs/f2fs/data.c
++++ b/fs/f2fs/data.c
+@@ -711,7 +711,8 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
+       }
+       if (fio->io_wbc && !is_read_io(fio->op))
+-              wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE);
++              wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page),
++                                       PAGE_SIZE);
+       inc_page_count(fio->sbi, is_read_io(fio->op) ?
+                       __read_io_type(page) : WB_DATA_TYPE(fio->page, false));
+@@ -911,7 +912,8 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio)
+       }
+       if (fio->io_wbc)
+-              wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE);
++              wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page),
++                                       PAGE_SIZE);
+       inc_page_count(fio->sbi, WB_DATA_TYPE(page, false));
+@@ -1011,7 +1013,8 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio)
+       }
+       if (fio->io_wbc)
+-              wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE);
++              wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page),
++                                       PAGE_SIZE);
+       io->last_block_in_bio = fio->new_blkaddr;
+diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
+index d8bec3c1bb1f..2391b09f4ced 100644
+--- a/fs/fs-writeback.c
++++ b/fs/fs-writeback.c
+@@ -890,17 +890,16 @@ EXPORT_SYMBOL_GPL(wbc_detach_inode);
+ /**
+  * wbc_account_cgroup_owner - account writeback to update inode cgroup ownership
+  * @wbc: writeback_control of the writeback in progress
+- * @page: page being written out
++ * @folio: folio being written out
+  * @bytes: number of bytes being written out
+  *
+- * @bytes from @page are about to written out during the writeback
++ * @bytes from @folio are about to written out during the writeback
+  * controlled by @wbc.  Keep the book for foreign inode detection.  See
+  * wbc_detach_inode().
+  */
+-void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
++void wbc_account_cgroup_owner(struct writeback_control *wbc, struct folio *folio,
+                             size_t bytes)
+ {
+-      struct folio *folio;
+       struct cgroup_subsys_state *css;
+       int id;
+@@ -913,7 +912,6 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
+       if (!wbc->wb || wbc->no_cgroup_owner)
+               return;
+-      folio = page_folio(page);
+       css = mem_cgroup_css_from_folio(folio);
+       /* dead cgroups shouldn't contribute to inode ownership arbitration */
+       if (!(css->flags & CSS_ONLINE))
+diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
+index ef0b68bccbb6..ce73d2a48c1e 100644
+--- a/fs/iomap/buffered-io.c
++++ b/fs/iomap/buffered-io.c
+@@ -1784,7 +1784,7 @@ static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
+       if (ifs)
+               atomic_add(len, &ifs->write_bytes_pending);
+       wpc->ioend->io_size += len;
+-      wbc_account_cgroup_owner(wbc, &folio->page, len);
++      wbc_account_cgroup_owner(wbc, folio, len);
+       return 0;
+ }
+diff --git a/fs/mpage.c b/fs/mpage.c
+index b5b5ddf9d513..82aecf372743 100644
+--- a/fs/mpage.c
++++ b/fs/mpage.c
+@@ -606,7 +606,7 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc,
+        * the confused fail path above (OOM) will be very confused when
+        * it finds all bh marked clean (i.e. it will not write anything)
+        */
+-      wbc_account_cgroup_owner(wbc, &folio->page, folio_size(folio));
++      wbc_account_cgroup_owner(wbc, folio, folio_size(folio));
+       length = first_unmapped << blkbits;
+       if (!bio_add_folio(bio, folio, length, 0)) {
+               bio = mpage_bio_submit_write(bio);
+diff --git a/include/linux/writeback.h b/include/linux/writeback.h
+index d6db822e4bb3..641a057e0413 100644
+--- a/include/linux/writeback.h
++++ b/include/linux/writeback.h
+@@ -217,7 +217,7 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
+                                struct inode *inode)
+       __releases(&inode->i_lock);
+ void wbc_detach_inode(struct writeback_control *wbc);
+-void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
++void wbc_account_cgroup_owner(struct writeback_control *wbc, struct folio *folio,
+                             size_t bytes);
+ int cgroup_writeback_by_id(u64 bdi_id, int memcg_id,
+                          enum wb_reason reason, struct wb_completion *done);
+@@ -324,7 +324,7 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
+ }
+ static inline void wbc_account_cgroup_owner(struct writeback_control *wbc,
+-                                          struct page *page, size_t bytes)
++                                          struct folio *folio, size_t bytes)
+ {
+ }
+-- 
+2.39.5
+
diff --git a/queue-6.12/fuse-respect-fopen_keep_cache-on-opendir.patch b/queue-6.12/fuse-respect-fopen_keep_cache-on-opendir.patch
new file mode 100644 (file)
index 0000000..b3e8a3a
--- /dev/null
@@ -0,0 +1,40 @@
+From 8f41839c65f944923d5454abb246f67c871987e2 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 1 Jan 2025 14:00:37 +0100
+Subject: fuse: respect FOPEN_KEEP_CACHE on opendir
+
+From: Amir Goldstein <amir73il@gmail.com>
+
+[ Upstream commit 03f275adb8fbd7b4ebe96a1ad5044d8e602692dc ]
+
+The refactoring of fuse_dir_open() missed the need to invalidate the
+directory inode's page cache when the FOPEN_KEEP_CACHE open flag is not set.
+
+Fixes: 7de64d521bf92 ("fuse: break up fuse_open_common()")
+Reported-by: Prince Kumar <princer@google.com>
+Closes: https://lore.kernel.org/linux-fsdevel/CAEW=TRr7CYb4LtsvQPLj-zx5Y+EYBmGfM24SuzwyDoGVNoKm7w@mail.gmail.com/
+Signed-off-by: Amir Goldstein <amir73il@gmail.com>
+Link: https://lore.kernel.org/r/20250101130037.96680-1-amir73il@gmail.com
+Reviewed-by: Bernd Schubert <bernd.schubert@fastmail.fm>
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/fuse/dir.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
+index 54104dd48af7..2e62e62c07f8 100644
+--- a/fs/fuse/dir.c
++++ b/fs/fuse/dir.c
+@@ -1680,6 +1680,8 @@ static int fuse_dir_open(struct inode *inode, struct file *file)
+                */
+               if (ff->open_flags & (FOPEN_STREAM | FOPEN_NONSEEKABLE))
+                       nonseekable_open(inode, file);
++              if (!(ff->open_flags & FOPEN_KEEP_CACHE))
++                      invalidate_inode_pages2(inode->i_mapping);
+       }
+       return err;
+-- 
+2.39.5
+
diff --git a/queue-6.12/iomap-fix-zero-padding-data-issue-in-concurrent-appe.patch b/queue-6.12/iomap-fix-zero-padding-data-issue-in-concurrent-appe.patch
new file mode 100644 (file)
index 0000000..8e005f9
--- /dev/null
@@ -0,0 +1,156 @@
+From d9d2a9826311f03f7fc9a2144d20d27a241577a6 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 9 Dec 2024 19:42:40 +0800
+Subject: iomap: fix zero padding data issue in concurrent append writes
+
+From: Long Li <leo.lilong@huawei.com>
+
+[ Upstream commit 51d20d1dacbec589d459e11fc88fbca419f84a99 ]
+
+During concurrent append writes to XFS filesystem, zero padding data
+may appear in the file after power failure. This happens due to imprecise
+disk size updates when handling write completion.
+
+Consider this scenario with concurrent append writes same file:
+
+  Thread 1:                  Thread 2:
+  ------------               -----------
+  write [A, A+B]
+  update inode size to A+B
+  submit I/O [A, A+BS]
+                             write [A+B, A+B+C]
+                             update inode size to A+B+C
+  <I/O completes, updates disk size to min(A+B+C, A+BS)>
+  <power failure>
+
+After reboot:
+  1) with A+B+C < A+BS, the file has zero padding in range [A+B, A+B+C]
+
+  |<         Block Size (BS)      >|
+  |DDDDDDDDDDDDDDDD0000000000000000|
+  ^               ^        ^
+  A              A+B     A+B+C
+                         (EOF)
+
+  2) with A+B+C > A+BS, the file has zero padding in range [A+B, A+BS]
+
+  |<         Block Size (BS)      >|<           Block Size (BS)    >|
+  |DDDDDDDDDDDDDDDD0000000000000000|00000000000000000000000000000000|
+  ^               ^                ^               ^
+  A              A+B              A+BS           A+B+C
+                                  (EOF)
+
+  D = Valid Data
+  0 = Zero Padding
+
+The issue stems from disk size being set to min(io_offset + io_size,
+inode->i_size) at I/O completion. Since io_offset+io_size is block
+size granularity, it may exceed the actual valid file data size. In
+the case of concurrent append writes, inode->i_size may be larger
+than the actual range of valid file data written to disk, leading to
+inaccurate disk size updates.
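+
+As a concrete instance of case 1 above (numbers picked purely for
+illustration): take A = 0, B = 1000, C = 500 and BS = 4096. Thread 1's
+ioend covers [0, 4096), so io_offset + io_size = 4096 while i_size has
+already grown to 1500, and completion records an on-disk size of
+min(1500, 4096) = 1500 even though only bytes [0, 1000) were actually
+written out. After a power failure, [1000, 1500) reads back as zeroes.
+With io_size trimmed to the in-core EOF sampled for this ioend (1000
+here), the completion-time update becomes min(1000, 1500) = 1000 and no
+padding survives the crash.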
+
+This patch modifies the meaning of io_size to represent the size of
+valid data within EOF in an ioend. If the ioend spans beyond i_size,
+io_size will be trimmed to provide the file with more accurate size
+information. This is particularly useful for on-disk size updates
+at completion time.
+
+After this change, ioends that span i_size will not grow or merge with
+other ioends in concurrent scenarios. However, these cases that need
+growth/merging rarely occur, and there seems to be no noticeable performance impact.
+Although rounding up io_size could enable ioend growth/merging in these
+scenarios, we decided to keep the code simple after discussion [1].
+
+Another benefit is that it makes the xfs_ioend_is_append() check more
+accurate, which can reduce unnecessary end bio callbacks of xfs_end_bio()
+in certain scenarios, such as repeated writes at the file tail without
+extending the file size.
+
+Link [1]: https://patchwork.kernel.org/project/xfs/patch/20241113091907.56937-1-leo.lilong@huawei.com
+
+Fixes: ae259a9c8593 ("fs: introduce iomap infrastructure") # goes further back than this
+Signed-off-by: Long Li <leo.lilong@huawei.com>
+Link: https://lore.kernel.org/r/20241209114241.3725722-3-leo.lilong@huawei.com
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/iomap/buffered-io.c | 45 ++++++++++++++++++++++++++++++++++++++++++
+ include/linux/iomap.h  |  2 +-
+ 2 files changed, 46 insertions(+), 1 deletion(-)
+
+diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
+index 05e5cc3bf976..25d1ede6bb0e 100644
+--- a/fs/iomap/buffered-io.c
++++ b/fs/iomap/buffered-io.c
+@@ -1784,7 +1784,52 @@ static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
+       if (ifs)
+               atomic_add(len, &ifs->write_bytes_pending);
++
++      /*
++       * Clamp io_offset and io_size to the incore EOF so that ondisk
++       * file size updates in the ioend completion are byte-accurate.
++       * This avoids recovering files with zeroed tail regions when
++       * writeback races with appending writes:
++       *
++       *    Thread 1:                  Thread 2:
++       *    ------------               -----------
++       *    write [A, A+B]
++       *    update inode size to A+B
++       *    submit I/O [A, A+BS]
++       *                               write [A+B, A+B+C]
++       *                               update inode size to A+B+C
++       *    <I/O completes, updates disk size to min(A+B+C, A+BS)>
++       *    <power failure>
++       *
++       *  After reboot:
++       *    1) with A+B+C < A+BS, the file has zero padding in range
++       *       [A+B, A+B+C]
++       *
++       *    |<     Block Size (BS)   >|
++       *    |DDDDDDDDDDDD0000000000000|
++       *    ^           ^        ^
++       *    A          A+B     A+B+C
++       *                       (EOF)
++       *
++       *    2) with A+B+C > A+BS, the file has zero padding in range
++       *       [A+B, A+BS]
++       *
++       *    |<     Block Size (BS)   >|<     Block Size (BS)    >|
++       *    |DDDDDDDDDDDD0000000000000|00000000000000000000000000|
++       *    ^           ^             ^           ^
++       *    A          A+B           A+BS       A+B+C
++       *                             (EOF)
++       *
++       *    D = Valid Data
++       *    0 = Zero Padding
++       *
++       * Note that this defeats the ability to chain the ioends of
++       * appending writes.
++       */
+       wpc->ioend->io_size += len;
++      if (wpc->ioend->io_offset + wpc->ioend->io_size > end_pos)
++              wpc->ioend->io_size = end_pos - wpc->ioend->io_offset;
++
+       wbc_account_cgroup_owner(wbc, folio, len);
+       return 0;
+ }
+diff --git a/include/linux/iomap.h b/include/linux/iomap.h
+index f61407e3b121..d204dcd35063 100644
+--- a/include/linux/iomap.h
++++ b/include/linux/iomap.h
+@@ -330,7 +330,7 @@ struct iomap_ioend {
+       u16                     io_type;
+       u16                     io_flags;       /* IOMAP_F_* */
+       struct inode            *io_inode;      /* file being written to */
+-      size_t                  io_size;        /* size of the extent */
++      size_t                  io_size;        /* size of data within eof */
+       loff_t                  io_offset;      /* offset in the file */
+       sector_t                io_sector;      /* start sector of ioend */
+       struct bio              io_bio;         /* MUST BE LAST! */
+-- 
+2.39.5
+
diff --git a/queue-6.12/iomap-pass-byte-granular-end-position-to-iomap_add_t.patch b/queue-6.12/iomap-pass-byte-granular-end-position-to-iomap_add_t.patch
new file mode 100644 (file)
index 0000000..d4153ba
--- /dev/null
@@ -0,0 +1,127 @@
+From b3d04c479944b549209c7b8b67402f6fa76a5615 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 9 Dec 2024 19:42:39 +0800
+Subject: iomap: pass byte granular end position to iomap_add_to_ioend
+
+From: Long Li <leo.lilong@huawei.com>
+
+[ Upstream commit b44679c63e4d3ac820998b6bd59fba89a72ad3e7 ]
+
+This is a preparatory patch for fixing zero padding issues in concurrent
+append write scenarios. In the following patches, we need to obtain
+byte-granular writeback end position for io_size trimming after EOF
+handling.
+
+Due to concurrent writeback and truncate operations, inode size may
+shrink. Resampling inode size would force writeback code to handle the
+newly appeared post-EOF blocks, which is undesirable. As Dave
+explained in [1]:
+
+"Really, the issue is that writeback mappings have to be able to
+handle the range being mapped suddenly appear to be beyond EOF.
+This behaviour is a longstanding writeback constraint, and is what
+iomap_writepage_handle_eof() is attempting to handle.
+
+We handle this by only sampling i_size_read() whilst we have the
+folio locked and can determine the action we should take with that
+folio (i.e. nothing, partial zeroing, or skip altogether). Once
+we've made the decision that the folio is within EOF and taken
+action on it (i.e. moved the folio to writeback state), we cannot
+then resample the inode size because a truncate may have started
+and changed the inode size."
+
+To avoid resampling inode size after EOF handling, we convert end_pos
+to byte-granular writeback position and return it from EOF handling
+function.
+
+Since iomap_set_range_dirty() can handle unaligned lengths, this
+conversion has no impact on it. However, iomap_find_dirty_range()
+requires aligned start and end range to find dirty blocks within the
+given range, so the end position needs to be rounded up when passed
+to it.
+
+LINK [1]: https://lore.kernel.org/linux-xfs/Z1Gg0pAa54MoeYME@localhost.localdomain/
+
+Signed-off-by: Long Li <leo.lilong@huawei.com>
+Link: https://lore.kernel.org/r/20241209114241.3725722-2-leo.lilong@huawei.com
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Stable-dep-of: 51d20d1dacbe ("iomap: fix zero padding data issue in concurrent append writes")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/iomap/buffered-io.c | 21 ++++++++++++---------
+ 1 file changed, 12 insertions(+), 9 deletions(-)
+
+diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
+index ce73d2a48c1e..05e5cc3bf976 100644
+--- a/fs/iomap/buffered-io.c
++++ b/fs/iomap/buffered-io.c
+@@ -1764,7 +1764,8 @@ static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos)
+  */
+ static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
+               struct writeback_control *wbc, struct folio *folio,
+-              struct inode *inode, loff_t pos, unsigned len)
++              struct inode *inode, loff_t pos, loff_t end_pos,
++              unsigned len)
+ {
+       struct iomap_folio_state *ifs = folio->private;
+       size_t poff = offset_in_folio(folio, pos);
+@@ -1790,8 +1791,8 @@ static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
+ static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc,
+               struct writeback_control *wbc, struct folio *folio,
+-              struct inode *inode, u64 pos, unsigned dirty_len,
+-              unsigned *count)
++              struct inode *inode, u64 pos, u64 end_pos,
++              unsigned dirty_len, unsigned *count)
+ {
+       int error;
+@@ -1816,7 +1817,7 @@ static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc,
+                       break;
+               default:
+                       error = iomap_add_to_ioend(wpc, wbc, folio, inode, pos,
+-                                      map_len);
++                                      end_pos, map_len);
+                       if (!error)
+                               (*count)++;
+                       break;
+@@ -1887,11 +1888,11 @@ static bool iomap_writepage_handle_eof(struct folio *folio, struct inode *inode,
+                *    remaining memory is zeroed when mapped, and writes to that
+                *    region are not written out to the file.
+                *
+-               * Also adjust the writeback range to skip all blocks entirely
+-               * beyond i_size.
++               * Also adjust the end_pos to the end of file and skip writeback
++               * for all blocks entirely beyond i_size.
+                */
+               folio_zero_segment(folio, poff, folio_size(folio));
+-              *end_pos = round_up(isize, i_blocksize(inode));
++              *end_pos = isize;
+       }
+       return true;
+@@ -1904,6 +1905,7 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
+       struct inode *inode = folio->mapping->host;
+       u64 pos = folio_pos(folio);
+       u64 end_pos = pos + folio_size(folio);
++      u64 end_aligned = 0;
+       unsigned count = 0;
+       int error = 0;
+       u32 rlen;
+@@ -1945,9 +1947,10 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
+       /*
+        * Walk through the folio to find dirty areas to write back.
+        */
+-      while ((rlen = iomap_find_dirty_range(folio, &pos, end_pos))) {
++      end_aligned = round_up(end_pos, i_blocksize(inode));
++      while ((rlen = iomap_find_dirty_range(folio, &pos, end_aligned))) {
+               error = iomap_writepage_map_blocks(wpc, wbc, folio, inode,
+-                              pos, rlen, &count);
++                              pos, end_pos, rlen, &count);
+               if (error)
+                       break;
+               pos += rlen;
+-- 
+2.39.5
+
diff --git a/queue-6.12/jbd2-flush-filesystem-device-before-updating-tail-se.patch b/queue-6.12/jbd2-flush-filesystem-device-before-updating-tail-se.patch
new file mode 100644 (file)
index 0000000..b798a24
--- /dev/null
@@ -0,0 +1,45 @@
+From 1bf143314daebca0b27cae65a7cf9e32dca65c0b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 3 Dec 2024 09:44:07 +0800
+Subject: jbd2: flush filesystem device before updating tail sequence
+
+From: Zhang Yi <yi.zhang@huawei.com>
+
+[ Upstream commit a0851ea9cd555c333795b85ddd908898b937c4e1 ]
+
+When committing transaction in jbd2_journal_commit_transaction(), the
+disk caches for the filesystem device should be flushed before updating
+the journal tail sequence. However, this step is missed if the journal
+is not located on the filesystem device. As a result, the filesystem may
+become inconsistent following a power failure or system crash. Fix it by
+ensuring that the filesystem device is flushed appropriately.
+
+Fixes: 3339578f0578 ("jbd2: cleanup journal tail after transaction commit")
+Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
+Link: https://lore.kernel.org/r/20241203014407.805916-3-yi.zhang@huaweicloud.com
+Reviewed-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/jbd2/commit.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
+index 4305a1ac808a..f95cf272a1b5 100644
+--- a/fs/jbd2/commit.c
++++ b/fs/jbd2/commit.c
+@@ -776,9 +776,9 @@ void jbd2_journal_commit_transaction(journal_t *journal)
+       /*
+        * If the journal is not located on the file system device,
+        * then we must flush the file system device before we issue
+-       * the commit record
++       * the commit record and update the journal tail sequence.
+        */
+-      if (commit_transaction->t_need_data_flush &&
++      if ((commit_transaction->t_need_data_flush || update_tail) &&
+           (journal->j_fs_dev != journal->j_dev) &&
+           (journal->j_flags & JBD2_BARRIER))
+               blkdev_issue_flush(journal->j_fs_dev);
+-- 
+2.39.5
+
diff --git a/queue-6.12/jbd2-increase-io-priority-for-writing-revoke-records.patch b/queue-6.12/jbd2-increase-io-priority-for-writing-revoke-records.patch
new file mode 100644 (file)
index 0000000..c5c0916
--- /dev/null
@@ -0,0 +1,41 @@
+From 017a44717cbb8444df3a669b652b083704fed6ba Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 3 Dec 2024 09:44:06 +0800
+Subject: jbd2: increase IO priority for writing revoke records
+
+From: Zhang Yi <yi.zhang@huawei.com>
+
+[ Upstream commit ac1e21bd8c883aeac2f1835fc93b39c1e6838b35 ]
+
+Commit '6a3afb6ac6df ("jbd2: increase the journal IO's priority")'
+increases the priority of journal I/O by marking I/O with the
+JBD2_JOURNAL_REQ_FLAGS. However, that commit missed the revoke buffers,
+so this commit also addresses that kind of I/O.
+
+Fixes: 6a3afb6ac6df ("jbd2: increase the journal IO's priority")
+Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
+Link: https://lore.kernel.org/r/20241203014407.805916-2-yi.zhang@huaweicloud.com
+Reviewed-by: Kemeng Shi <shikemeng@huaweicloud.com>
+Reviewed-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/jbd2/revoke.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
+index 4556e4689024..ce63d5fde9c3 100644
+--- a/fs/jbd2/revoke.c
++++ b/fs/jbd2/revoke.c
+@@ -654,7 +654,7 @@ static void flush_descriptor(journal_t *journal,
+       set_buffer_jwrite(descriptor);
+       BUFFER_TRACE(descriptor, "write");
+       set_buffer_dirty(descriptor);
+-      write_dirty_buffer(descriptor, REQ_SYNC);
++      write_dirty_buffer(descriptor, JBD2_JOURNAL_REQ_FLAGS);
+ }
+ #endif
+-- 
+2.39.5
+
diff --git a/queue-6.12/netfs-fix-ceph-copy-to-cache-on-write-begin.patch b/queue-6.12/netfs-fix-ceph-copy-to-cache-on-write-begin.patch
new file mode 100644 (file)
index 0000000..30737c0
--- /dev/null
@@ -0,0 +1,74 @@
+From 5a499cd36c28f96ffffe9d6e5cb5b6ac734b5e8e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 13 Dec 2024 13:50:09 +0000
+Subject: netfs: Fix ceph copy to cache on write-begin
+
+From: David Howells <dhowells@redhat.com>
+
+[ Upstream commit 38cf8e945721ffe708fa675507465da7f4f2a9f7 ]
+
+At the end of netfs_unlock_read_folio() in which folios are marked
+appropriately for copying to the cache (either with by being marked dirty
+and having their private data set or by having PG_private_2 set) and then
+unlocked, the folio_queue struct has the entry pointing to the folio
+cleared.  This presents a problem for netfs_pgpriv2_write_to_the_cache(),
+which is used to write folios marked with PG_private_2 to the cache as it
+expects to be able to trawl the folio_queue list thereafter to find the
+relevant folios, leading to a hang.
+
+Fix this by not clearing the folio_queue entry if we're going to do the
+deprecated copy-to-cache.  The clearance will be done instead as the folios
+are written to the cache.
+
+This can be reproduced by starting cachefiles, mounting a ceph filesystem
+with "-o fsc" and writing to it.
+
+Fixes: 796a4049640b ("netfs: In readahead, put the folio refs as soon extracted")
+Reported-by: Max Kellermann <max.kellermann@ionos.com>
+Closes: https://lore.kernel.org/r/CAKPOu+_4m80thNy5_fvROoxBm689YtA0dZ-=gcmkzwYSY4syqw@mail.gmail.com/
+Signed-off-by: David Howells <dhowells@redhat.com>
+Link: https://lore.kernel.org/r/20241213135013.2964079-10-dhowells@redhat.com
+Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading")
+cc: Jeff Layton <jlayton@kernel.org>
+cc: Ilya Dryomov <idryomov@gmail.com>
+cc: Xiubo Li <xiubli@redhat.com>
+cc: netfs@lists.linux.dev
+cc: ceph-devel@vger.kernel.org
+cc: linux-fsdevel@vger.kernel.org
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/netfs/read_collect.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c
+index d86fa02f68fb..e70eb4ea21c0 100644
+--- a/fs/netfs/read_collect.c
++++ b/fs/netfs/read_collect.c
+@@ -62,10 +62,14 @@ static void netfs_unlock_read_folio(struct netfs_io_subrequest *subreq,
+               } else {
+                       trace_netfs_folio(folio, netfs_folio_trace_read_done);
+               }
++
++              folioq_clear(folioq, slot);
+       } else {
+               // TODO: Use of PG_private_2 is deprecated.
+               if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags))
+                       netfs_pgpriv2_mark_copy_to_cache(subreq, rreq, folioq, slot);
++              else
++                      folioq_clear(folioq, slot);
+       }
+       if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
+@@ -77,8 +81,6 @@ static void netfs_unlock_read_folio(struct netfs_io_subrequest *subreq,
+                       folio_unlock(folio);
+               }
+       }
+-
+-      folioq_clear(folioq, slot);
+ }
+ /*
+-- 
+2.39.5
+
diff --git a/queue-6.12/netfs-fix-enomem-handling-in-buffered-reads.patch b/queue-6.12/netfs-fix-enomem-handling-in-buffered-reads.patch
new file mode 100644 (file)
index 0000000..b8a09ef
--- /dev/null
@@ -0,0 +1,104 @@
+From 79ca233017cd28e3e5016ea9037225491c45bb43 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 13 Dec 2024 13:50:03 +0000
+Subject: netfs: Fix enomem handling in buffered reads
+
+From: David Howells <dhowells@redhat.com>
+
+[ Upstream commit 105549d09a539a876b7c3330ab52d8aceedad358 ]
+
+If netfs_read_to_pagecache() gets an error from either ->prepare_read() or
+from netfs_prepare_read_iterator(), it needs to decrement ->nr_outstanding,
+cancel the subrequest and break out of the issuing loop.  Currently, it
+only does this for two of the cases, but there are two more that aren't
+handled.
+
+Fix this by moving the handling to a common place and jumping to it from
+all four places.  This is in preference to inserting a wrapper around
+netfs_prepare_read_iterator() as proposed by Dmitry Antipov[1].
+
+Link: https://lore.kernel.org/r/20241202093943.227786-1-dmantipov@yandex.ru/ [1]
+
+Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading")
+Reported-by: syzbot+404b4b745080b6210c6c@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=404b4b745080b6210c6c
+Signed-off-by: David Howells <dhowells@redhat.com>
+Link: https://lore.kernel.org/r/20241213135013.2964079-4-dhowells@redhat.com
+Tested-by: syzbot+404b4b745080b6210c6c@syzkaller.appspotmail.com
+cc: Dmitry Antipov <dmantipov@yandex.ru>
+cc: Jeff Layton <jlayton@kernel.org>
+cc: netfs@lists.linux.dev
+cc: linux-fsdevel@vger.kernel.org
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/netfs/buffered_read.c | 28 ++++++++++++++++------------
+ 1 file changed, 16 insertions(+), 12 deletions(-)
+
+diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
+index af46a598f4d7..2dd2260352db 100644
+--- a/fs/netfs/buffered_read.c
++++ b/fs/netfs/buffered_read.c
+@@ -275,22 +275,14 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq)
+                       netfs_stat(&netfs_n_rh_download);
+                       if (rreq->netfs_ops->prepare_read) {
+                               ret = rreq->netfs_ops->prepare_read(subreq);
+-                              if (ret < 0) {
+-                                      atomic_dec(&rreq->nr_outstanding);
+-                                      netfs_put_subrequest(subreq, false,
+-                                                           netfs_sreq_trace_put_cancel);
+-                                      break;
+-                              }
++                              if (ret < 0)
++                                      goto prep_failed;
+                               trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
+                       }
+                       slice = netfs_prepare_read_iterator(subreq);
+-                      if (slice < 0) {
+-                              atomic_dec(&rreq->nr_outstanding);
+-                              netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel);
+-                              ret = slice;
+-                              break;
+-                      }
++                      if (slice < 0)
++                              goto prep_iter_failed;
+                       rreq->netfs_ops->issue_read(subreq);
+                       goto done;
+@@ -302,6 +294,8 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq)
+                       trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
+                       netfs_stat(&netfs_n_rh_zero);
+                       slice = netfs_prepare_read_iterator(subreq);
++                      if (slice < 0)
++                              goto prep_iter_failed;
+                       __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+                       netfs_read_subreq_terminated(subreq, 0, false);
+                       goto done;
+@@ -310,6 +304,8 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq)
+               if (source == NETFS_READ_FROM_CACHE) {
+                       trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
+                       slice = netfs_prepare_read_iterator(subreq);
++                      if (slice < 0)
++                              goto prep_iter_failed;
+                       netfs_read_cache_to_pagecache(rreq, subreq);
+                       goto done;
+               }
+@@ -318,6 +314,14 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq)
+               WARN_ON_ONCE(1);
+               break;
++      prep_iter_failed:
++              ret = slice;
++      prep_failed:
++              subreq->error = ret;
++              atomic_dec(&rreq->nr_outstanding);
++              netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel);
++              break;
++
+       done:
+               size -= slice;
+               start += slice;
+-- 
+2.39.5
+
diff --git a/queue-6.12/netfs-fix-is-caching-check-in-read-retry.patch b/queue-6.12/netfs-fix-is-caching-check-in-read-retry.patch
new file mode 100644 (file)
index 0000000..09211eb
--- /dev/null
@@ -0,0 +1,61 @@
+From 14827a5c1fab0ce07807324f737144e9ff115f5f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 16 Dec 2024 20:34:45 +0000
+Subject: netfs: Fix is-caching check in read-retry
+
+From: David Howells <dhowells@redhat.com>
+
+[ Upstream commit d4e338de17cb6532bf805fae00db8b41e914009b ]
+
+The read-retry code checks the NETFS_RREQ_COPY_TO_CACHE flag to determine
+if there might be failed reads from the cache that need turning into reads
+from the server, with the intention of skipping the complicated part if it
+can.  The code that set the flag, however, got lost during the read-side
+rewrite.
+
+Fix the check to see if the cache_resources are valid instead.  The flag
+can then be removed.
+
+Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading")
+Signed-off-by: David Howells <dhowells@redhat.com>
+Link: https://lore.kernel.org/r/3752048.1734381285@warthog.procyon.org.uk
+cc: Jeff Layton <jlayton@kernel.org>
+cc: netfs@lists.linux.dev
+cc: linux-fsdevel@vger.kernel.org
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/netfs/read_retry.c | 2 +-
+ include/linux/netfs.h | 1 -
+ 2 files changed, 1 insertion(+), 2 deletions(-)
+
+diff --git a/fs/netfs/read_retry.c b/fs/netfs/read_retry.c
+index 0350592ea804..2701f7d45999 100644
+--- a/fs/netfs/read_retry.c
++++ b/fs/netfs/read_retry.c
+@@ -49,7 +49,7 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq)
+        * up to the first permanently failed one.
+        */
+       if (!rreq->netfs_ops->prepare_read &&
+-          !test_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags)) {
++          !rreq->cache_resources.ops) {
+               struct netfs_io_subrequest *subreq;
+               list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
+diff --git a/include/linux/netfs.h b/include/linux/netfs.h
+index 5eaceef41e6c..474481ee8b7c 100644
+--- a/include/linux/netfs.h
++++ b/include/linux/netfs.h
+@@ -269,7 +269,6 @@ struct netfs_io_request {
+       size_t                  prev_donated;   /* Fallback for subreq->prev_donated */
+       refcount_t              ref;
+       unsigned long           flags;
+-#define NETFS_RREQ_COPY_TO_CACHE      1       /* Need to write to the cache */
+ #define NETFS_RREQ_NO_UNLOCK_FOLIO    2       /* Don't unlock no_unlock_folio on completion */
+ #define NETFS_RREQ_DONT_UNLOCK_FOLIOS 3       /* Don't unlock the folios on completion */
+ #define NETFS_RREQ_FAILED             4       /* The request failed */
+-- 
+2.39.5
+
diff --git a/queue-6.12/netfs-fix-missing-barriers-by-using-clear_and_wake_u.patch b/queue-6.12/netfs-fix-missing-barriers-by-using-clear_and_wake_u.patch
new file mode 100644 (file)
index 0000000..175e823
--- /dev/null
@@ -0,0 +1,85 @@
+From 616d54dc5174e16fb498a8dbb7864642636731fc Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 13 Dec 2024 13:50:07 +0000
+Subject: netfs: Fix missing barriers by using clear_and_wake_up_bit()
+
+From: David Howells <dhowells@redhat.com>
+
+[ Upstream commit aa3956418985bda1f68313eadde3267921847978 ]
+
+Use clear_and_wake_up_bit() rather than something like:
+
+       clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
+       wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS);
+
+as a barrier needs to be inserted between the two, which is what
+clear_and_wake_up_bit() provides.
+
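+For reference, clear_and_wake_up_bit() is roughly the following (a
+sketch based on the include/linux/wait_bit.h helper, shown only for
+illustration):
+
+       static inline void clear_and_wake_up_bit(int bit, void *word)
+       {
+               clear_bit_unlock(bit, word);
+               /* Order the clear before waking any waiter */
+               smp_mb__after_atomic();
+               wake_up_bit(word, bit);
+       }
+
+The smp_mb__after_atomic() in the middle is the barrier that the
+open-coded pair above lacks.
+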
+Fixes: 288ace2f57c9 ("netfs: New writeback implementation")
+Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading")
+Signed-off-by: David Howells <dhowells@redhat.com>
+Link: https://lore.kernel.org/r/20241213135013.2964079-8-dhowells@redhat.com
+Reviewed-by: Akira Yokosawa <akiyks@gmail.com>
+cc: Zilin Guan <zilin@seu.edu.cn>
+cc: Akira Yokosawa <akiyks@gmail.com>
+cc: Jeff Layton <jlayton@kernel.org>
+cc: netfs@lists.linux.dev
+cc: linux-fsdevel@vger.kernel.org
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/netfs/read_collect.c  | 3 +--
+ fs/netfs/write_collect.c | 9 +++------
+ 2 files changed, 4 insertions(+), 8 deletions(-)
+
+diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c
+index 3cbb289535a8..d86fa02f68fb 100644
+--- a/fs/netfs/read_collect.c
++++ b/fs/netfs/read_collect.c
+@@ -378,8 +378,7 @@ static void netfs_rreq_assess(struct netfs_io_request *rreq)
+       task_io_account_read(rreq->transferred);
+       trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip);
+-      clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
+-      wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS);
++      clear_and_wake_up_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
+       trace_netfs_rreq(rreq, netfs_rreq_trace_done);
+       netfs_clear_subrequests(rreq, false);
+diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c
+index 1d438be2e1b4..82290c92ba7a 100644
+--- a/fs/netfs/write_collect.c
++++ b/fs/netfs/write_collect.c
+@@ -501,8 +501,7 @@ static void netfs_collect_write_results(struct netfs_io_request *wreq)
+               goto need_retry;
+       if ((notes & MADE_PROGRESS) && test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) {
+               trace_netfs_rreq(wreq, netfs_rreq_trace_unpause);
+-              clear_bit_unlock(NETFS_RREQ_PAUSE, &wreq->flags);
+-              wake_up_bit(&wreq->flags, NETFS_RREQ_PAUSE);
++              clear_and_wake_up_bit(NETFS_RREQ_PAUSE, &wreq->flags);
+       }
+       if (notes & NEED_REASSESS) {
+@@ -605,8 +604,7 @@ void netfs_write_collection_worker(struct work_struct *work)
+       _debug("finished");
+       trace_netfs_rreq(wreq, netfs_rreq_trace_wake_ip);
+-      clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &wreq->flags);
+-      wake_up_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS);
++      clear_and_wake_up_bit(NETFS_RREQ_IN_PROGRESS, &wreq->flags);
+       if (wreq->iocb) {
+               size_t written = min(wreq->transferred, wreq->len);
+@@ -714,8 +712,7 @@ void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error,
+       trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);
+-      clear_bit_unlock(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
+-      wake_up_bit(&subreq->flags, NETFS_SREQ_IN_PROGRESS);
++      clear_and_wake_up_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
+       /* If we are at the head of the queue, wake up the collector,
+        * transferring a ref to it if we were the ones to do so.
+-- 
+2.39.5
+
diff --git a/queue-6.12/netfs-fix-the-non-cancellation-of-copy-when-cache-is.patch b/queue-6.12/netfs-fix-the-non-cancellation-of-copy-when-cache-is.patch
new file mode 100644 (file)
index 0000000..9bbb602
--- /dev/null
@@ -0,0 +1,65 @@
+From 1830a3cebd1a988697ffa193095aba899ceb58d0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 13 Dec 2024 13:50:10 +0000
+Subject: netfs: Fix the (non-)cancellation of copy when cache is temporarily
+ disabled
+
+From: David Howells <dhowells@redhat.com>
+
+[ Upstream commit d0327c824338cdccad058723a31d038ecd553409 ]
+
+When the caching for a cookie is temporarily disabled (e.g. due to a DIO
+write on that file), future copying to the cache for that file is disabled
+until all fds open on that file are closed.  However, if netfslib is using
+the deprecated PG_private_2 method (as currently used by ceph), and
+decides it wants to copy to the cache, netfs_advance_write() will just bail
+at the first check seeing that the cache stream is unavailable, and
+indicate that it dealt with all the content.
+
+This means that we have no subrequests to provide notifications to drive
+the state machine or even to pin the request and the request just gets
+discarded, leaving the folios with PG_private_2 set.
+
+Fix this by jumping directly to cancel the request if the cache is not
+available.  That way, we don't remove mark3 from the folio_queue list and
+netfs_pgpriv2_cancel() will clean up the folios.
+
+This was found by running the generic/013 xfstest against ceph with an
+active cache and the "-o fsc" option passed to ceph.  That would usually
+hang.
+
+Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading")
+Reported-by: Max Kellermann <max.kellermann@ionos.com>
+Closes: https://lore.kernel.org/r/CAKPOu+_4m80thNy5_fvROoxBm689YtA0dZ-=gcmkzwYSY4syqw@mail.gmail.com/
+Signed-off-by: David Howells <dhowells@redhat.com>
+Link: https://lore.kernel.org/r/20241213135013.2964079-11-dhowells@redhat.com
+cc: Jeff Layton <jlayton@kernel.org>
+cc: Ilya Dryomov <idryomov@gmail.com>
+cc: Xiubo Li <xiubli@redhat.com>
+cc: netfs@lists.linux.dev
+cc: ceph-devel@vger.kernel.org
+cc: linux-fsdevel@vger.kernel.org
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/netfs/read_pgpriv2.c | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+diff --git a/fs/netfs/read_pgpriv2.c b/fs/netfs/read_pgpriv2.c
+index ba5af89d37fa..54d5004fec18 100644
+--- a/fs/netfs/read_pgpriv2.c
++++ b/fs/netfs/read_pgpriv2.c
+@@ -170,6 +170,10 @@ void netfs_pgpriv2_write_to_the_cache(struct netfs_io_request *rreq)
+       trace_netfs_write(wreq, netfs_write_trace_copy_to_cache);
+       netfs_stat(&netfs_n_wh_copy_to_cache);
++      if (!wreq->io_streams[1].avail) {
++              netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
++              goto couldnt_start;
++      }
+       for (;;) {
+               error = netfs_pgpriv2_copy_folio(wreq, folio);
+-- 
+2.39.5
+
diff --git a/queue-6.12/nfs-fix-oops-in-nfs_netfs_init_request-when-copying-.patch b/queue-6.12/nfs-fix-oops-in-nfs_netfs_init_request-when-copying-.patch
new file mode 100644 (file)
index 0000000..8d78d85
--- /dev/null
@@ -0,0 +1,69 @@
+From 5717881a402aafc332fa6c0707f3cfce611a6f93 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 13 Dec 2024 13:50:04 +0000
+Subject: nfs: Fix oops in nfs_netfs_init_request() when copying to cache
+
+From: David Howells <dhowells@redhat.com>
+
+[ Upstream commit 86ad1a58f6a9453f49e06ef957a40a8dac00a13f ]
+
+When netfslib wants to copy some data that has just been read on behalf of
+nfs, it creates a new write request and calls nfs_netfs_init_request() to
+initialise it, but with a NULL file pointer.  This causes
+nfs_file_open_context() to oops - however, we don't actually need the nfs
+context as we're only going to write to the cache.
+
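+For context, nfs_file_open_context() is essentially a dereference of
+file->private_data (a sketch of the include/linux/nfs_fs.h helper, for
+illustration only):
+
+       static inline struct nfs_open_context *nfs_file_open_context(struct file *filp)
+       {
+               return filp->private_data;      /* a NULL filp oopses here */
+       }
+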
+Fix this by just returning if we aren't given a file pointer and emit a
+warning if the request was for something other than copy-to-cache.
+
+Further, fix nfs_netfs_free_request() so that it doesn't try to free the
+context if the pointer is NULL.
+
+Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading")
+Reported-by: Max Kellermann <max.kellermann@ionos.com>
+Closes: https://lore.kernel.org/r/CAKPOu+9DyMbKLhyJb7aMLDTb=Fh0T8Teb9sjuf_pze+XWT1VaQ@mail.gmail.com/
+Signed-off-by: David Howells <dhowells@redhat.com>
+Link: https://lore.kernel.org/r/20241213135013.2964079-5-dhowells@redhat.com
+cc: Trond Myklebust <trondmy@kernel.org>
+cc: Anna Schumaker <anna@kernel.org>
+cc: Dave Wysochanski <dwysocha@redhat.com>
+cc: Jeff Layton <jlayton@kernel.org>
+cc: linux-nfs@vger.kernel.org
+cc: netfs@lists.linux.dev
+cc: linux-fsdevel@vger.kernel.org
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/nfs/fscache.c | 9 ++++++++-
+ 1 file changed, 8 insertions(+), 1 deletion(-)
+
+diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
+index 810269ee0a50..d49e4ce27999 100644
+--- a/fs/nfs/fscache.c
++++ b/fs/nfs/fscache.c
+@@ -263,6 +263,12 @@ int nfs_netfs_readahead(struct readahead_control *ractl)
+ static atomic_t nfs_netfs_debug_id;
+ static int nfs_netfs_init_request(struct netfs_io_request *rreq, struct file *file)
+ {
++      if (!file) {
++              if (WARN_ON_ONCE(rreq->origin != NETFS_PGPRIV2_COPY_TO_CACHE))
++                      return -EIO;
++              return 0;
++      }
++
+       rreq->netfs_priv = get_nfs_open_context(nfs_file_open_context(file));
+       rreq->debug_id = atomic_inc_return(&nfs_netfs_debug_id);
+       /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */
+@@ -274,7 +280,8 @@ static int nfs_netfs_init_request(struct netfs_io_request *rreq, struct file *fi
+ static void nfs_netfs_free_request(struct netfs_io_request *rreq)
+ {
+-      put_nfs_open_context(rreq->netfs_priv);
++      if (rreq->netfs_priv)
++              put_nfs_open_context(rreq->netfs_priv);
+ }
+ static struct nfs_netfs_io_data *nfs_netfs_alloc(struct netfs_io_subrequest *sreq)
+-- 
+2.39.5
+
diff --git a/queue-6.12/ovl-pass-realinode-to-ovl_encode_real_fh-instead-of-.patch b/queue-6.12/ovl-pass-realinode-to-ovl_encode_real_fh-instead-of-.patch
new file mode 100644 (file)
index 0000000..dd5f2b4
--- /dev/null
@@ -0,0 +1,132 @@
+From 1648fb8994b3d24f27ba4fdcf09efbaa36a626ef Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun, 5 Jan 2025 17:24:03 +0100
+Subject: ovl: pass realinode to ovl_encode_real_fh() instead of realdentry
+
+From: Amir Goldstein <amir73il@gmail.com>
+
+[ Upstream commit 07aeefae7ff44d80524375253980b1bdee2396b0 ]
+
+We want to be able to encode an fid from an inode with no alias.
+
+Signed-off-by: Amir Goldstein <amir73il@gmail.com>
+Link: https://lore.kernel.org/r/20250105162404.357058-2-amir73il@gmail.com
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Stable-dep-of: c45beebfde34 ("ovl: support encoding fid from inode with no alias")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/overlayfs/copy_up.c   | 11 ++++++-----
+ fs/overlayfs/export.c    |  5 +++--
+ fs/overlayfs/namei.c     |  4 ++--
+ fs/overlayfs/overlayfs.h |  2 +-
+ 4 files changed, 12 insertions(+), 10 deletions(-)
+
+diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
+index 2ed6ad641a20..a11a9d756a7b 100644
+--- a/fs/overlayfs/copy_up.c
++++ b/fs/overlayfs/copy_up.c
+@@ -416,13 +416,13 @@ int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upperdentry,
+       return err;
+ }
+-struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real,
++struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct inode *realinode,
+                                 bool is_upper)
+ {
+       struct ovl_fh *fh;
+       int fh_type, dwords;
+       int buflen = MAX_HANDLE_SZ;
+-      uuid_t *uuid = &real->d_sb->s_uuid;
++      uuid_t *uuid = &realinode->i_sb->s_uuid;
+       int err;
+       /* Make sure the real fid stays 32bit aligned */
+@@ -439,7 +439,8 @@ struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real,
+        * the price or reconnecting the dentry.
+        */
+       dwords = buflen >> 2;
+-      fh_type = exportfs_encode_fh(real, (void *)fh->fb.fid, &dwords, 0);
++      fh_type = exportfs_encode_inode_fh(realinode, (void *)fh->fb.fid,
++                                         &dwords, NULL, 0);
+       buflen = (dwords << 2);
+       err = -EIO;
+@@ -481,7 +482,7 @@ struct ovl_fh *ovl_get_origin_fh(struct ovl_fs *ofs, struct dentry *origin)
+       if (!ovl_can_decode_fh(origin->d_sb))
+               return NULL;
+-      return ovl_encode_real_fh(ofs, origin, false);
++      return ovl_encode_real_fh(ofs, d_inode(origin), false);
+ }
+ int ovl_set_origin_fh(struct ovl_fs *ofs, const struct ovl_fh *fh,
+@@ -506,7 +507,7 @@ static int ovl_set_upper_fh(struct ovl_fs *ofs, struct dentry *upper,
+       const struct ovl_fh *fh;
+       int err;
+-      fh = ovl_encode_real_fh(ofs, upper, true);
++      fh = ovl_encode_real_fh(ofs, d_inode(upper), true);
+       if (IS_ERR(fh))
+               return PTR_ERR(fh);
+diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
+index 5868cb222955..036c9f39a14d 100644
+--- a/fs/overlayfs/export.c
++++ b/fs/overlayfs/export.c
+@@ -223,6 +223,7 @@ static int ovl_check_encode_origin(struct dentry *dentry)
+ static int ovl_dentry_to_fid(struct ovl_fs *ofs, struct dentry *dentry,
+                            u32 *fid, int buflen)
+ {
++      struct inode *inode = d_inode(dentry);
+       struct ovl_fh *fh = NULL;
+       int err, enc_lower;
+       int len;
+@@ -236,8 +237,8 @@ static int ovl_dentry_to_fid(struct ovl_fs *ofs, struct dentry *dentry,
+               goto fail;
+       /* Encode an upper or lower file handle */
+-      fh = ovl_encode_real_fh(ofs, enc_lower ? ovl_dentry_lower(dentry) :
+-                              ovl_dentry_upper(dentry), !enc_lower);
++      fh = ovl_encode_real_fh(ofs, enc_lower ? ovl_inode_lower(inode) :
++                              ovl_inode_upper(inode), !enc_lower);
+       if (IS_ERR(fh))
+               return PTR_ERR(fh);
+diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
+index 5764f91d283e..42b73ae5ba01 100644
+--- a/fs/overlayfs/namei.c
++++ b/fs/overlayfs/namei.c
+@@ -542,7 +542,7 @@ int ovl_verify_origin_xattr(struct ovl_fs *ofs, struct dentry *dentry,
+       struct ovl_fh *fh;
+       int err;
+-      fh = ovl_encode_real_fh(ofs, real, is_upper);
++      fh = ovl_encode_real_fh(ofs, d_inode(real), is_upper);
+       err = PTR_ERR(fh);
+       if (IS_ERR(fh)) {
+               fh = NULL;
+@@ -738,7 +738,7 @@ int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin,
+       struct ovl_fh *fh;
+       int err;
+-      fh = ovl_encode_real_fh(ofs, origin, false);
++      fh = ovl_encode_real_fh(ofs, d_inode(origin), false);
+       if (IS_ERR(fh))
+               return PTR_ERR(fh);
+diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
+index 0bfe35da4b7b..844874b4a91a 100644
+--- a/fs/overlayfs/overlayfs.h
++++ b/fs/overlayfs/overlayfs.h
+@@ -869,7 +869,7 @@ int ovl_copy_up_with_data(struct dentry *dentry);
+ int ovl_maybe_copy_up(struct dentry *dentry, int flags);
+ int ovl_copy_xattr(struct super_block *sb, const struct path *path, struct dentry *new);
+ int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upper, struct kstat *stat);
+-struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real,
++struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct inode *realinode,
+                                 bool is_upper);
+ struct ovl_fh *ovl_get_origin_fh(struct ovl_fs *ofs, struct dentry *origin);
+ int ovl_set_origin_fh(struct ovl_fs *ofs, const struct ovl_fh *fh,
+-- 
+2.39.5
+
diff --git a/queue-6.12/ovl-support-encoding-fid-from-inode-with-no-alias.patch b/queue-6.12/ovl-support-encoding-fid-from-inode-with-no-alias.patch
new file mode 100644 (file)
index 0000000..2c7e210
--- /dev/null
@@ -0,0 +1,165 @@
+From 1f8eea074a5539d62be5677784cb2e61d22a67b3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun, 5 Jan 2025 17:24:04 +0100
+Subject: ovl: support encoding fid from inode with no alias
+
+From: Amir Goldstein <amir73il@gmail.com>
+
+[ Upstream commit c45beebfde34aa71afbc48b2c54cdda623515037 ]
+
+Dmitry Safonov reported that a WARN_ON() assertion can be triggered by
+userspace when calling inotify_show_fdinfo() for an overlayfs watched
+inode, whose dentry aliases were discarded with drop_caches.
+
+The WARN_ON() assertion in inotify_show_fdinfo() was removed, because
+it is possible for encoding a file handle to fail for other reasons, but
+the impact of failing to encode an overlayfs file handle goes beyond
+this assertion.
+
+As shown in the LTP test case mentioned in the link below, failure to
+encode an overlayfs file handle from a non-aliased inode also leads to
+failure to report an fid with FAN_DELETE_SELF fanotify events.
+
+As Dmitry notes in his analysis of the problem, ovl_encode_fh() fails
+if it cannot find an alias for the inode, but this failure can be fixed.
+ovl_encode_fh() seldom uses the alias and in the case of non-decodable
+file handles, as is often the case with fanotify fid info,
+ovl_encode_fh() never needs to use the alias to encode a file handle.
+
+Defer finding an alias until it is actually needed so ovl_encode_fh()
+will not fail in the common case of FAN_DELETE_SELF fanotify events.
+
+Fixes: 16aac5ad1fa9 ("ovl: support encoding non-decodable file handles")
+Reported-by: Dmitry Safonov <dima@arista.com>
+Closes: https://lore.kernel.org/linux-fsdevel/CAOQ4uxiie81voLZZi2zXS1BziXZCM24nXqPAxbu8kxXCUWdwOg@mail.gmail.com/
+Signed-off-by: Amir Goldstein <amir73il@gmail.com>
+Link: https://lore.kernel.org/r/20250105162404.357058-3-amir73il@gmail.com
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/overlayfs/export.c | 46 +++++++++++++++++++++++--------------------
+ 1 file changed, 25 insertions(+), 21 deletions(-)
+
+diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
+index 036c9f39a14d..444aeeccb6da 100644
+--- a/fs/overlayfs/export.c
++++ b/fs/overlayfs/export.c
+@@ -176,35 +176,37 @@ static int ovl_connect_layer(struct dentry *dentry)
+  *
+  * Return 0 for upper file handle, > 0 for lower file handle or < 0 on error.
+  */
+-static int ovl_check_encode_origin(struct dentry *dentry)
++static int ovl_check_encode_origin(struct inode *inode)
+ {
+-      struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
++      struct ovl_fs *ofs = OVL_FS(inode->i_sb);
+       bool decodable = ofs->config.nfs_export;
++      struct dentry *dentry;
++      int err;
+       /* No upper layer? */
+       if (!ovl_upper_mnt(ofs))
+               return 1;
+       /* Lower file handle for non-upper non-decodable */
+-      if (!ovl_dentry_upper(dentry) && !decodable)
++      if (!ovl_inode_upper(inode) && !decodable)
+               return 1;
+       /* Upper file handle for pure upper */
+-      if (!ovl_dentry_lower(dentry))
++      if (!ovl_inode_lower(inode))
+               return 0;
+       /*
+        * Root is never indexed, so if there's an upper layer, encode upper for
+        * root.
+        */
+-      if (dentry == dentry->d_sb->s_root)
++      if (inode == d_inode(inode->i_sb->s_root))
+               return 0;
+       /*
+        * Upper decodable file handle for non-indexed upper.
+        */
+-      if (ovl_dentry_upper(dentry) && decodable &&
+-          !ovl_test_flag(OVL_INDEX, d_inode(dentry)))
++      if (ovl_inode_upper(inode) && decodable &&
++          !ovl_test_flag(OVL_INDEX, inode))
+               return 0;
+       /*
+@@ -213,17 +215,25 @@ static int ovl_check_encode_origin(struct dentry *dentry)
+        * ovl_connect_layer() will try to make origin's layer "connected" by
+        * copying up a "connectable" ancestor.
+        */
+-      if (d_is_dir(dentry) && decodable)
+-              return ovl_connect_layer(dentry);
++      if (!decodable || !S_ISDIR(inode->i_mode))
++              return 1;
++
++      dentry = d_find_any_alias(inode);
++      if (!dentry)
++              return -ENOENT;
++
++      err = ovl_connect_layer(dentry);
++      dput(dentry);
++      if (err < 0)
++              return err;
+       /* Lower file handle for indexed and non-upper dir/non-dir */
+       return 1;
+ }
+-static int ovl_dentry_to_fid(struct ovl_fs *ofs, struct dentry *dentry,
++static int ovl_dentry_to_fid(struct ovl_fs *ofs, struct inode *inode,
+                            u32 *fid, int buflen)
+ {
+-      struct inode *inode = d_inode(dentry);
+       struct ovl_fh *fh = NULL;
+       int err, enc_lower;
+       int len;
+@@ -232,7 +242,7 @@ static int ovl_dentry_to_fid(struct ovl_fs *ofs, struct dentry *dentry,
+        * Check if we should encode a lower or upper file handle and maybe
+        * copy up an ancestor to make lower file handle connectable.
+        */
+-      err = enc_lower = ovl_check_encode_origin(dentry);
++      err = enc_lower = ovl_check_encode_origin(inode);
+       if (enc_lower < 0)
+               goto fail;
+@@ -252,8 +262,8 @@ static int ovl_dentry_to_fid(struct ovl_fs *ofs, struct dentry *dentry,
+       return err;
+ fail:
+-      pr_warn_ratelimited("failed to encode file handle (%pd2, err=%i)\n",
+-                          dentry, err);
++      pr_warn_ratelimited("failed to encode file handle (ino=%lu, err=%i)\n",
++                          inode->i_ino, err);
+       goto out;
+ }
+@@ -261,19 +271,13 @@ static int ovl_encode_fh(struct inode *inode, u32 *fid, int *max_len,
+                        struct inode *parent)
+ {
+       struct ovl_fs *ofs = OVL_FS(inode->i_sb);
+-      struct dentry *dentry;
+       int bytes, buflen = *max_len << 2;
+       /* TODO: encode connectable file handles */
+       if (parent)
+               return FILEID_INVALID;
+-      dentry = d_find_any_alias(inode);
+-      if (!dentry)
+-              return FILEID_INVALID;
+-
+-      bytes = ovl_dentry_to_fid(ofs, dentry, fid, buflen);
+-      dput(dentry);
++      bytes = ovl_dentry_to_fid(ofs, inode, fid, buflen);
+       if (bytes <= 0)
+               return FILEID_INVALID;
+-- 
+2.39.5
+
diff --git a/queue-6.12/series b/queue-6.12/series
new file mode 100644 (file)
index 0000000..3c381c9
--- /dev/null
@@ -0,0 +1,20 @@
+jbd2-increase-io-priority-for-writing-revoke-records.patch
+jbd2-flush-filesystem-device-before-updating-tail-se.patch
+fs-writeback-convert-wbc_account_cgroup_owner-to-tak.patch
+iomap-pass-byte-granular-end-position-to-iomap_add_t.patch
+iomap-fix-zero-padding-data-issue-in-concurrent-appe.patch
+dm-array-fix-releasing-a-faulty-array-block-twice-in.patch
+dm-array-fix-unreleased-btree-blocks-on-closing-a-fa.patch
+dm-array-fix-cursor-index-when-skipping-across-block.patch
+netfs-fix-enomem-handling-in-buffered-reads.patch
+nfs-fix-oops-in-nfs_netfs_init_request-when-copying-.patch
+netfs-fix-missing-barriers-by-using-clear_and_wake_u.patch
+netfs-fix-ceph-copy-to-cache-on-write-begin.patch
+netfs-fix-the-non-cancellation-of-copy-when-cache-is.patch
+netfs-fix-is-caching-check-in-read-retry.patch
+exfat-fix-the-infinite-loop-in-exfat_readdir.patch
+exfat-fix-the-new-buffer-was-not-zeroed-before-writi.patch
+exfat-fix-the-infinite-loop-in-__exfat_free_cluster.patch
+fuse-respect-fopen_keep_cache-on-opendir.patch
+ovl-pass-realinode-to-ovl_encode_real_fh-instead-of-.patch
+ovl-support-encoding-fid-from-inode-with-no-alias.patch