From: Greg Kroah-Hartman Date: Tue, 10 Sep 2024 07:56:47 +0000 (+0200) Subject: 5.15-stable patches X-Git-Tag: v4.19.322~24 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=426532147fa68fe88c15c2efcf9f2a45e1e36135;p=thirdparty%2Fkernel%2Fstable-queue.git 5.15-stable patches added patches: btrfs-fix-race-between-direct-io-write-and-fsync-when-using-same-fd.patch memcg-protect-concurrent-access-to-mem_cgroup_idr.patch --- diff --git a/queue-5.15/btrfs-fix-race-between-direct-io-write-and-fsync-when-using-same-fd.patch b/queue-5.15/btrfs-fix-race-between-direct-io-write-and-fsync-when-using-same-fd.patch new file mode 100644 index 00000000000..7211a12d137 --- /dev/null +++ b/queue-5.15/btrfs-fix-race-between-direct-io-write-and-fsync-when-using-same-fd.patch @@ -0,0 +1,313 @@ +From cd9253c23aedd61eb5ff11f37a36247cd46faf86 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Thu, 29 Aug 2024 18:25:49 +0100 +Subject: btrfs: fix race between direct IO write and fsync when using same fd + +From: Filipe Manana + +commit cd9253c23aedd61eb5ff11f37a36247cd46faf86 upstream. + +If we have 2 threads that are using the same file descriptor and one of +them is doing direct IO writes while the other is doing fsync, we have a +race where we can end up either: + +1) Attempt a fsync without holding the inode's lock, triggering an + assertion failures when assertions are enabled; + +2) Do an invalid memory access from the fsync task because the file private + points to memory allocated on stack by the direct IO task and it may be + used by the fsync task after the stack was destroyed. + +The race happens like this: + +1) A user space program opens a file descriptor with O_DIRECT; + +2) The program spawns 2 threads using libpthread for example; + +3) One of the threads uses the file descriptor to do direct IO writes, + while the other calls fsync using the same file descriptor. + +4) Call task A the thread doing direct IO writes and task B the thread + doing fsyncs; + +5) Task A does a direct IO write, and at btrfs_direct_write() sets the + file's private to an on stack allocated private with the member + 'fsync_skip_inode_lock' set to true; + +6) Task B enters btrfs_sync_file() and sees that there's a private + structure associated to the file which has 'fsync_skip_inode_lock' set + to true, so it skips locking the inode's VFS lock; + +7) Task A completes the direct IO write, and resets the file's private to + NULL since it had no prior private and our private was stack allocated. + Then it unlocks the inode's VFS lock; + +8) Task B enters btrfs_get_ordered_extents_for_logging(), then the + assertion that checks the inode's VFS lock is held fails, since task B + never locked it and task A has already unlocked it. + +The stack trace produced is the following: + + assertion failed: inode_is_locked(&inode->vfs_inode), in fs/btrfs/ordered-data.c:983 + ------------[ cut here ]------------ + kernel BUG at fs/btrfs/ordered-data.c:983! + Oops: invalid opcode: 0000 [#1] PREEMPT SMP PTI + CPU: 9 PID: 5072 Comm: worker Tainted: G U OE 6.10.5-1-default #1 openSUSE Tumbleweed 69f48d427608e1c09e60ea24c6c55e2ca1b049e8 + Hardware name: Acer Predator PH315-52/Covini_CFS, BIOS V1.12 07/28/2020 + RIP: 0010:btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs] + Code: 50 d6 86 c0 e8 (...) 
+ RSP: 0018:ffff9e4a03dcfc78 EFLAGS: 00010246 + RAX: 0000000000000054 RBX: ffff9078a9868e98 RCX: 0000000000000000 + RDX: 0000000000000000 RSI: ffff907dce4a7800 RDI: ffff907dce4a7800 + RBP: ffff907805518800 R08: 0000000000000000 R09: ffff9e4a03dcfb38 + R10: ffff9e4a03dcfb30 R11: 0000000000000003 R12: ffff907684ae7800 + R13: 0000000000000001 R14: ffff90774646b600 R15: 0000000000000000 + FS: 00007f04b96006c0(0000) GS:ffff907dce480000(0000) knlGS:0000000000000000 + CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + CR2: 00007f32acbfc000 CR3: 00000001fd4fa005 CR4: 00000000003726f0 + Call Trace: + + ? __die_body.cold+0x14/0x24 + ? die+0x2e/0x50 + ? do_trap+0xca/0x110 + ? do_error_trap+0x6a/0x90 + ? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a] + ? exc_invalid_op+0x50/0x70 + ? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a] + ? asm_exc_invalid_op+0x1a/0x20 + ? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a] + ? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a] + btrfs_sync_file+0x21a/0x4d0 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a] + ? __seccomp_filter+0x31d/0x4f0 + __x64_sys_fdatasync+0x4f/0x90 + do_syscall_64+0x82/0x160 + ? do_futex+0xcb/0x190 + ? __x64_sys_futex+0x10e/0x1d0 + ? switch_fpu_return+0x4f/0xd0 + ? syscall_exit_to_user_mode+0x72/0x220 + ? do_syscall_64+0x8e/0x160 + ? syscall_exit_to_user_mode+0x72/0x220 + ? do_syscall_64+0x8e/0x160 + ? syscall_exit_to_user_mode+0x72/0x220 + ? do_syscall_64+0x8e/0x160 + ? syscall_exit_to_user_mode+0x72/0x220 + ? do_syscall_64+0x8e/0x160 + entry_SYSCALL_64_after_hwframe+0x76/0x7e + +Another problem here is if task B grabs the private pointer and then uses +it after task A has finished, since the private was allocated in the stack +of task A, it results in some invalid memory access with a hard to predict +result. + +This issue, triggering the assertion, was observed with QEMU workloads by +two users in the Link tags below. + +Fix this by not relying on a file's private to pass information to fsync +that it should skip locking the inode and instead pass this information +through a special value stored in current->journal_info. This is safe +because in the relevant section of the direct IO write path we are not +holding a transaction handle, so current->journal_info is NULL. + +The following C program triggers the issue: + + $ cat repro.c + /* Get the O_DIRECT definition. 
*/ + #ifndef _GNU_SOURCE + #define _GNU_SOURCE + #endif + + #include + #include + #include + #include + #include + #include + #include + #include + + static int fd; + + static ssize_t do_write(int fd, const void *buf, size_t count, off_t offset) + { + while (count > 0) { + ssize_t ret; + + ret = pwrite(fd, buf, count, offset); + if (ret < 0) { + if (errno == EINTR) + continue; + return ret; + } + count -= ret; + buf += ret; + } + return 0; + } + + static void *fsync_loop(void *arg) + { + while (1) { + int ret; + + ret = fsync(fd); + if (ret != 0) { + perror("Fsync failed"); + exit(6); + } + } + } + + int main(int argc, char *argv[]) + { + long pagesize; + void *write_buf; + pthread_t fsyncer; + int ret; + + if (argc != 2) { + fprintf(stderr, "Use: %s \n", argv[0]); + return 1; + } + + fd = open(argv[1], O_WRONLY | O_CREAT | O_TRUNC | O_DIRECT, 0666); + if (fd == -1) { + perror("Failed to open/create file"); + return 1; + } + + pagesize = sysconf(_SC_PAGE_SIZE); + if (pagesize == -1) { + perror("Failed to get page size"); + return 2; + } + + ret = posix_memalign(&write_buf, pagesize, pagesize); + if (ret) { + perror("Failed to allocate buffer"); + return 3; + } + + ret = pthread_create(&fsyncer, NULL, fsync_loop, NULL); + if (ret != 0) { + fprintf(stderr, "Failed to create writer thread: %d\n", ret); + return 4; + } + + while (1) { + ret = do_write(fd, write_buf, pagesize, 0); + if (ret != 0) { + perror("Write failed"); + exit(5); + } + } + + return 0; + } + + $ mkfs.btrfs -f /dev/sdi + $ mount /dev/sdi /mnt/sdi + $ timeout 10 ./repro /mnt/sdi/foo + +Usually the race is triggered within less than 1 second. A test case for +fstests will follow soon. + +Reported-by: Paulo Dias +Link: https://bugzilla.kernel.org/show_bug.cgi?id=219187 +Reported-by: Andreas Jahn +Link: https://bugzilla.kernel.org/show_bug.cgi?id=219199 +Reported-by: syzbot+4704b3cc972bd76024f1@syzkaller.appspotmail.com +Link: https://lore.kernel.org/linux-btrfs/00000000000044ff540620d7dee2@google.com/ +Fixes: 939b656bc8ab ("btrfs: fix corruption after buffer fault in during direct IO append write") +CC: stable@vger.kernel.org # 5.15+ +Reviewed-by: Josef Bacik +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/ctree.h | 1 - + fs/btrfs/file.c | 25 ++++++++++--------------- + fs/btrfs/transaction.h | 6 ++++++ + 3 files changed, 16 insertions(+), 16 deletions(-) + +--- a/fs/btrfs/ctree.h ++++ b/fs/btrfs/ctree.h +@@ -1383,7 +1383,6 @@ struct btrfs_drop_extents_args { + struct btrfs_file_private { + void *filldir_buf; + u64 last_index; +- bool fsync_skip_inode_lock; + }; + + +--- a/fs/btrfs/file.c ++++ b/fs/btrfs/file.c +@@ -1992,13 +1992,6 @@ again: + if (IS_ERR_OR_NULL(dio)) { + err = PTR_ERR_OR_ZERO(dio); + } else { +- struct btrfs_file_private stack_private = { 0 }; +- struct btrfs_file_private *private; +- const bool have_private = (file->private_data != NULL); +- +- if (!have_private) +- file->private_data = &stack_private; +- + /* + * If we have a synchoronous write, we must make sure the fsync + * triggered by the iomap_dio_complete() call below doesn't +@@ -2007,13 +2000,10 @@ again: + * partial writes due to the input buffer (or parts of it) not + * being already faulted in. 
+ */ +- private = file->private_data; +- private->fsync_skip_inode_lock = true; ++ ASSERT(current->journal_info == NULL); ++ current->journal_info = BTRFS_TRANS_DIO_WRITE_STUB; + err = iomap_dio_complete(dio); +- private->fsync_skip_inode_lock = false; +- +- if (!have_private) +- file->private_data = NULL; ++ current->journal_info = NULL; + } + + /* No increment (+=) because iomap returns a cumulative value. */ +@@ -2195,7 +2185,6 @@ static inline bool skip_inode_logging(co + */ + int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) + { +- struct btrfs_file_private *private = file->private_data; + struct dentry *dentry = file_dentry(file); + struct inode *inode = d_inode(dentry); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); +@@ -2205,7 +2194,13 @@ int btrfs_sync_file(struct file *file, l + int ret = 0, err; + u64 len; + bool full_sync; +- const bool skip_ilock = (private ? private->fsync_skip_inode_lock : false); ++ bool skip_ilock = false; ++ ++ if (current->journal_info == BTRFS_TRANS_DIO_WRITE_STUB) { ++ skip_ilock = true; ++ current->journal_info = NULL; ++ lockdep_assert_held(&inode->i_rwsem); ++ } + + trace_btrfs_sync_file(file, datasync); + +--- a/fs/btrfs/transaction.h ++++ b/fs/btrfs/transaction.h +@@ -11,6 +11,12 @@ + #include "delayed-ref.h" + #include "ctree.h" + ++/* ++ * Signal that a direct IO write is in progress, to avoid deadlock for sync ++ * direct IO writes when fsync is called during the direct IO write path. ++ */ ++#define BTRFS_TRANS_DIO_WRITE_STUB ((void *) 1) ++ + enum btrfs_trans_state { + TRANS_STATE_RUNNING, + TRANS_STATE_COMMIT_START, diff --git a/queue-5.15/memcg-protect-concurrent-access-to-mem_cgroup_idr.patch b/queue-5.15/memcg-protect-concurrent-access-to-mem_cgroup_idr.patch new file mode 100644 index 00000000000..d29c2d93837 --- /dev/null +++ b/queue-5.15/memcg-protect-concurrent-access-to-mem_cgroup_idr.patch @@ -0,0 +1,109 @@ +From 9972605a238339b85bd16b084eed5f18414d22db Mon Sep 17 00:00:00 2001 +From: Shakeel Butt +Date: Fri, 2 Aug 2024 16:58:22 -0700 +Subject: memcg: protect concurrent access to mem_cgroup_idr + +From: Shakeel Butt + +commit 9972605a238339b85bd16b084eed5f18414d22db upstream. + +Commit 73f576c04b94 ("mm: memcontrol: fix cgroup creation failure after +many small jobs") decoupled the memcg IDs from the CSS ID space to fix the +cgroup creation failures. It introduced IDR to maintain the memcg ID +space. The IDR depends on external synchronization mechanisms for +modifications. For the mem_cgroup_idr, the idr_alloc() and idr_replace() +happen within css callback and thus are protected through cgroup_mutex +from concurrent modifications. However idr_remove() for mem_cgroup_idr +was not protected against concurrency and can be run concurrently for +different memcgs when they hit their refcnt to zero. Fix that. + +We have been seeing list_lru based kernel crashes at a low frequency in +our fleet for a long time. These crashes were in different part of +list_lru code including list_lru_add(), list_lru_del() and reparenting +code. Upon further inspection, it looked like for a given object (dentry +and inode), the super_block's list_lru didn't have list_lru_one for the +memcg of that object. The initial suspicions were either the object is +not allocated through kmem_cache_alloc_lru() or somehow +memcg_list_lru_alloc() failed to allocate list_lru_one() for a memcg but +returned success. No evidence were found for these cases. 
+ +Looking more deeply, we started seeing situations where valid memcg's id +is not present in mem_cgroup_idr and in some cases multiple valid memcgs +have same id and mem_cgroup_idr is pointing to one of them. So, the most +reasonable explanation is that these situations can happen due to race +between multiple idr_remove() calls or race between +idr_alloc()/idr_replace() and idr_remove(). These races are causing +multiple memcgs to acquire the same ID and then offlining of one of them +would cleanup list_lrus on the system for all of them. Later access from +other memcgs to the list_lru cause crashes due to missing list_lru_one. + +Link: https://lkml.kernel.org/r/20240802235822.1830976-1-shakeel.butt@linux.dev +Fixes: 73f576c04b94 ("mm: memcontrol: fix cgroup creation failure after many small jobs") +Signed-off-by: Shakeel Butt +Acked-by: Muchun Song +Reviewed-by: Roman Gushchin +Acked-by: Johannes Weiner +Cc: Michal Hocko +Cc: +Signed-off-by: Andrew Morton +[ Adapted over commit be740503ed03 ("mm: memcontrol: fix cannot alloc the + maximum memcg ID") and 6f0df8e16eb5 ("memcontrol: ensure memcg acquired by id + is properly set up") both are not in the tree ] +Signed-off-by: Tomas Krcka +Signed-off-by: Greg Kroah-Hartman +--- + mm/memcontrol.c | 23 ++++++++++++++++++++--- + 1 file changed, 20 insertions(+), 3 deletions(-) + +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -5083,11 +5083,28 @@ static struct cftype mem_cgroup_legacy_f + */ + + static DEFINE_IDR(mem_cgroup_idr); ++static DEFINE_SPINLOCK(memcg_idr_lock); ++ ++static int mem_cgroup_alloc_id(void) ++{ ++ int ret; ++ ++ idr_preload(GFP_KERNEL); ++ spin_lock(&memcg_idr_lock); ++ ret = idr_alloc(&mem_cgroup_idr, NULL, 1, MEM_CGROUP_ID_MAX + 1, ++ GFP_NOWAIT); ++ spin_unlock(&memcg_idr_lock); ++ idr_preload_end(); ++ return ret; ++} + + static void mem_cgroup_id_remove(struct mem_cgroup *memcg) + { + if (memcg->id.id > 0) { ++ spin_lock(&memcg_idr_lock); + idr_remove(&mem_cgroup_idr, memcg->id.id); ++ spin_unlock(&memcg_idr_lock); ++ + memcg->id.id = 0; + } + } +@@ -5201,9 +5218,7 @@ static struct mem_cgroup *mem_cgroup_all + if (!memcg) + return ERR_PTR(error); + +- memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL, +- 1, MEM_CGROUP_ID_MAX, +- GFP_KERNEL); ++ memcg->id.id = mem_cgroup_alloc_id(); + if (memcg->id.id < 0) { + error = memcg->id.id; + goto fail; +@@ -5244,7 +5259,9 @@ static struct mem_cgroup *mem_cgroup_all + INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue); + memcg->deferred_split_queue.split_queue_len = 0; + #endif ++ spin_lock(&memcg_idr_lock); + idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); ++ spin_unlock(&memcg_idr_lock); + return memcg; + fail: + mem_cgroup_id_remove(memcg); diff --git a/queue-5.15/series b/queue-5.15/series index 35a86c809c7..879587e2ed3 100644 --- a/queue-5.15/series +++ b/queue-5.15/series @@ -210,3 +210,5 @@ gso-fix-dodgy-bit-handling-for-gso_udp_l4.patch net-drop-bad-gso-csum_start-and-offset-in-virtio_net_hdr.patch x86-mm-fix-pti-for-i386-some-more.patch net-sunrpc-remap-eperm-in-case-of-connection-failure-in-xs_tcp_setup_socket.patch +btrfs-fix-race-between-direct-io-write-and-fsync-when-using-same-fd.patch +memcg-protect-concurrent-access-to-mem_cgroup_idr.patch
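
The btrfs patch above replaces per-file state (the on-stack btrfs_file_private with fsync_skip_inode_lock) with per-task state: the direct IO path stores a sentinel in current->journal_info, and btrfs_sync_file() only ever sees that sentinel when it runs in the same task, so a concurrent fsync on the same file descriptor can no longer observe (or outlive) another thread's stack. The sketch below restates that idea in plain userspace C; it is an illustration only, not kernel code, and every name in it is invented:

/*
 * Per-task hint ("the caller already holds the lock") instead of a per-file
 * hint that other threads sharing the file could observe.  The __thread slot
 * plays the role of current->journal_info; DIO_WRITE_STUB plays the role of
 * BTRFS_TRANS_DIO_WRITE_STUB.
 */
#include <pthread.h>
#include <stdio.h>

#define DIO_WRITE_STUB ((void *)1)

static __thread void *journal_info;	/* per-task, like current->journal_info */
static pthread_mutex_t inode_lock = PTHREAD_MUTEX_INITIALIZER;

static void sync_file(void)
{
	int skip_ilock = 0;

	if (journal_info == DIO_WRITE_STUB) {
		/* Only the task that set the stub can ever see it here. */
		skip_ilock = 1;
		journal_info = NULL;
	}
	if (!skip_ilock)
		pthread_mutex_lock(&inode_lock);
	/* ... flush ordered extents, write the log ... */
	if (!skip_ilock)
		pthread_mutex_unlock(&inode_lock);
}

static void direct_write(void)
{
	pthread_mutex_lock(&inode_lock);
	journal_info = DIO_WRITE_STUB;
	sync_file();	/* stands in for the fsync triggered by iomap_dio_complete() */
	journal_info = NULL;
	pthread_mutex_unlock(&inode_lock);
}

int main(void)
{
	direct_write();	/* sync writer: fsync skips the lock it already holds */
	sync_file();	/* unrelated fsync: takes the lock normally */
	puts("ok");
	return 0;
}

Built with something like "gcc -pthread -o stub-sketch stub-sketch.c" (file name assumed), the program exercises both paths; the property mirrored here is that per-task state can never be observed or torn down by another thread sharing the same struct file, which is exactly what went wrong with the on-stack file private.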
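
The memcg patch above applies the usual rule for IDR users: idr_alloc(), idr_replace() and idr_remove() need external serialization, and once idr_remove() can run outside cgroup_mutex (when a memcg's refcount drops to zero) a dedicated lock is required; idr_preload() plus GFP_NOWAIT keeps the allocation from sleeping under the new spinlock. The fragment below is a userspace analogue of that locking pattern, not the kernel code, with an invented fixed-size table standing in for the IDR:

/*
 * Every mutation of the shared ID table goes through one lock, mirroring
 * memcg_idr_lock around idr_alloc()/idr_replace()/idr_remove().  Without the
 * lock, concurrent removals racing with an allocation can hand the same ID
 * to two objects - the failure mode described in the commit message.
 */
#include <pthread.h>
#include <stdio.h>

#define ID_MAX 64

static void *id_table[ID_MAX + 1];	/* slot 0 intentionally unused */
static pthread_mutex_t id_lock = PTHREAD_MUTEX_INITIALIZER;

static int id_alloc(void)		/* like mem_cgroup_alloc_id() */
{
	int id, ret = -1;

	pthread_mutex_lock(&id_lock);
	for (id = 1; id <= ID_MAX; id++) {
		if (!id_table[id]) {
			id_table[id] = (void *)1;	/* reserved, not yet published */
			ret = id;
			break;
		}
	}
	pthread_mutex_unlock(&id_lock);
	return ret;
}

static void id_replace(int id, void *obj)	/* like idr_replace() under the lock */
{
	pthread_mutex_lock(&id_lock);
	id_table[id] = obj;
	pthread_mutex_unlock(&id_lock);
}

static void id_remove(int id)		/* like mem_cgroup_id_remove() */
{
	pthread_mutex_lock(&id_lock);
	id_table[id] = NULL;
	pthread_mutex_unlock(&id_lock);
}

int main(void)
{
	int id = id_alloc();

	if (id < 0)
		return 1;
	id_replace(id, "memcg");
	printf("allocated id %d\n", id);
	id_remove(id);
	return 0;
}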