--- /dev/null
+From cd9253c23aedd61eb5ff11f37a36247cd46faf86 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Thu, 29 Aug 2024 18:25:49 +0100
+Subject: btrfs: fix race between direct IO write and fsync when using same fd
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit cd9253c23aedd61eb5ff11f37a36247cd46faf86 upstream.
+
+If we have 2 threads that are using the same file descriptor and one of
+them is doing direct IO writes while the other is doing fsync, we have a
+race where we can end up either:
+
+1) Attempt a fsync without holding the inode's lock, triggering an
+ assertion failures when assertions are enabled;
+
+2) Do an invalid memory access from the fsync task because the file private
+ points to memory allocated on stack by the direct IO task and it may be
+ used by the fsync task after the stack was destroyed.
+
+The race happens like this:
+
+1) A user space program opens a file descriptor with O_DIRECT;
+
+2) The program spawns 2 threads using libpthread for example;
+
+3) One of the threads uses the file descriptor to do direct IO writes,
+ while the other calls fsync using the same file descriptor.
+
+4) Call task A the thread doing direct IO writes and task B the thread
+ doing fsyncs;
+
+5) Task A does a direct IO write, and at btrfs_direct_write() sets the
+ file's private to an on stack allocated private with the member
+ 'fsync_skip_inode_lock' set to true;
+
+6) Task B enters btrfs_sync_file() and sees that there's a private
+ structure associated to the file which has 'fsync_skip_inode_lock' set
+ to true, so it skips locking the inode's VFS lock;
+
+7) Task A completes the direct IO write, and resets the file's private to
+ NULL since it had no prior private and our private was stack allocated.
+ Then it unlocks the inode's VFS lock;
+
+8) Task B enters btrfs_get_ordered_extents_for_logging(), then the
+ assertion that checks the inode's VFS lock is held fails, since task B
+ never locked it and task A has already unlocked it.
+
+The stack trace produced is the following:
+
+ assertion failed: inode_is_locked(&inode->vfs_inode), in fs/btrfs/ordered-data.c:983
+ ------------[ cut here ]------------
+ kernel BUG at fs/btrfs/ordered-data.c:983!
+ Oops: invalid opcode: 0000 [#1] PREEMPT SMP PTI
+ CPU: 9 PID: 5072 Comm: worker Tainted: G U OE 6.10.5-1-default #1 openSUSE Tumbleweed 69f48d427608e1c09e60ea24c6c55e2ca1b049e8
+ Hardware name: Acer Predator PH315-52/Covini_CFS, BIOS V1.12 07/28/2020
+ RIP: 0010:btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs]
+ Code: 50 d6 86 c0 e8 (...)
+ RSP: 0018:ffff9e4a03dcfc78 EFLAGS: 00010246
+ RAX: 0000000000000054 RBX: ffff9078a9868e98 RCX: 0000000000000000
+ RDX: 0000000000000000 RSI: ffff907dce4a7800 RDI: ffff907dce4a7800
+ RBP: ffff907805518800 R08: 0000000000000000 R09: ffff9e4a03dcfb38
+ R10: ffff9e4a03dcfb30 R11: 0000000000000003 R12: ffff907684ae7800
+ R13: 0000000000000001 R14: ffff90774646b600 R15: 0000000000000000
+ FS: 00007f04b96006c0(0000) GS:ffff907dce480000(0000) knlGS:0000000000000000
+ CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+ CR2: 00007f32acbfc000 CR3: 00000001fd4fa005 CR4: 00000000003726f0
+ Call Trace:
+ <TASK>
+ ? __die_body.cold+0x14/0x24
+ ? die+0x2e/0x50
+ ? do_trap+0xca/0x110
+ ? do_error_trap+0x6a/0x90
+ ? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
+ ? exc_invalid_op+0x50/0x70
+ ? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
+ ? asm_exc_invalid_op+0x1a/0x20
+ ? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
+ ? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
+ btrfs_sync_file+0x21a/0x4d0 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
+ ? __seccomp_filter+0x31d/0x4f0
+ __x64_sys_fdatasync+0x4f/0x90
+ do_syscall_64+0x82/0x160
+ ? do_futex+0xcb/0x190
+ ? __x64_sys_futex+0x10e/0x1d0
+ ? switch_fpu_return+0x4f/0xd0
+ ? syscall_exit_to_user_mode+0x72/0x220
+ ? do_syscall_64+0x8e/0x160
+ ? syscall_exit_to_user_mode+0x72/0x220
+ ? do_syscall_64+0x8e/0x160
+ ? syscall_exit_to_user_mode+0x72/0x220
+ ? do_syscall_64+0x8e/0x160
+ ? syscall_exit_to_user_mode+0x72/0x220
+ ? do_syscall_64+0x8e/0x160
+ entry_SYSCALL_64_after_hwframe+0x76/0x7e
+
+Another problem here is if task B grabs the private pointer and then uses
+it after task A has finished, since the private was allocated in the stack
+of task A, it results in some invalid memory access with a hard to predict
+result.
+
+This issue, triggering the assertion, was observed with QEMU workloads by
+two users in the Link tags below.
+
+Fix this by not relying on a file's private to pass information to fsync
+that it should skip locking the inode and instead pass this information
+through a special value stored in current->journal_info. This is safe
+because in the relevant section of the direct IO write path we are not
+holding a transaction handle, so current->journal_info is NULL.
+
+The following C program triggers the issue:
+
+ $ cat repro.c
+ /* Get the O_DIRECT definition. */
+ #ifndef _GNU_SOURCE
+ #define _GNU_SOURCE
+ #endif
+
+ #include <stdio.h>
+ #include <stdlib.h>
+ #include <unistd.h>
+ #include <stdint.h>
+ #include <fcntl.h>
+ #include <errno.h>
+ #include <string.h>
+ #include <pthread.h>
+
+ static int fd;
+
+ static ssize_t do_write(int fd, const void *buf, size_t count, off_t offset)
+ {
+ while (count > 0) {
+ ssize_t ret;
+
+ ret = pwrite(fd, buf, count, offset);
+ if (ret < 0) {
+ if (errno == EINTR)
+ continue;
+ return ret;
+ }
+ count -= ret;
+ buf += ret;
+ }
+ return 0;
+ }
+
+ static void *fsync_loop(void *arg)
+ {
+ while (1) {
+ int ret;
+
+ ret = fsync(fd);
+ if (ret != 0) {
+ perror("Fsync failed");
+ exit(6);
+ }
+ }
+ }
+
+ int main(int argc, char *argv[])
+ {
+ long pagesize;
+ void *write_buf;
+ pthread_t fsyncer;
+ int ret;
+
+ if (argc != 2) {
+ fprintf(stderr, "Use: %s <file path>\n", argv[0]);
+ return 1;
+ }
+
+ fd = open(argv[1], O_WRONLY | O_CREAT | O_TRUNC | O_DIRECT, 0666);
+ if (fd == -1) {
+ perror("Failed to open/create file");
+ return 1;
+ }
+
+ pagesize = sysconf(_SC_PAGE_SIZE);
+ if (pagesize == -1) {
+ perror("Failed to get page size");
+ return 2;
+ }
+
+ ret = posix_memalign(&write_buf, pagesize, pagesize);
+ if (ret) {
+ perror("Failed to allocate buffer");
+ return 3;
+ }
+
+ ret = pthread_create(&fsyncer, NULL, fsync_loop, NULL);
+ if (ret != 0) {
+ fprintf(stderr, "Failed to create writer thread: %d\n", ret);
+ return 4;
+ }
+
+ while (1) {
+ ret = do_write(fd, write_buf, pagesize, 0);
+ if (ret != 0) {
+ perror("Write failed");
+ exit(5);
+ }
+ }
+
+ return 0;
+ }
+
+ $ mkfs.btrfs -f /dev/sdi
+ $ mount /dev/sdi /mnt/sdi
+ $ timeout 10 ./repro /mnt/sdi/foo
+
+Usually the race is triggered within less than 1 second. A test case for
+fstests will follow soon.
+
+Reported-by: Paulo Dias <paulo.miguel.dias@gmail.com>
+Link: https://bugzilla.kernel.org/show_bug.cgi?id=219187
+Reported-by: Andreas Jahn <jahn-andi@web.de>
+Link: https://bugzilla.kernel.org/show_bug.cgi?id=219199
+Reported-by: syzbot+4704b3cc972bd76024f1@syzkaller.appspotmail.com
+Link: https://lore.kernel.org/linux-btrfs/00000000000044ff540620d7dee2@google.com/
+Fixes: 939b656bc8ab ("btrfs: fix corruption after buffer fault in during direct IO append write")
+CC: stable@vger.kernel.org # 5.15+
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/ctree.h | 1 -
+ fs/btrfs/file.c | 25 ++++++++++---------------
+ fs/btrfs/transaction.h | 6 ++++++
+ 3 files changed, 16 insertions(+), 16 deletions(-)
+
+--- a/fs/btrfs/ctree.h
++++ b/fs/btrfs/ctree.h
+@@ -1383,7 +1383,6 @@ struct btrfs_drop_extents_args {
+ struct btrfs_file_private {
+ void *filldir_buf;
+ u64 last_index;
+- bool fsync_skip_inode_lock;
+ };
+
+
+--- a/fs/btrfs/file.c
++++ b/fs/btrfs/file.c
+@@ -1992,13 +1992,6 @@ again:
+ if (IS_ERR_OR_NULL(dio)) {
+ err = PTR_ERR_OR_ZERO(dio);
+ } else {
+- struct btrfs_file_private stack_private = { 0 };
+- struct btrfs_file_private *private;
+- const bool have_private = (file->private_data != NULL);
+-
+- if (!have_private)
+- file->private_data = &stack_private;
+-
+ /*
+ * If we have a synchoronous write, we must make sure the fsync
+ * triggered by the iomap_dio_complete() call below doesn't
+@@ -2007,13 +2000,10 @@ again:
+ * partial writes due to the input buffer (or parts of it) not
+ * being already faulted in.
+ */
+- private = file->private_data;
+- private->fsync_skip_inode_lock = true;
++ ASSERT(current->journal_info == NULL);
++ current->journal_info = BTRFS_TRANS_DIO_WRITE_STUB;
+ err = iomap_dio_complete(dio);
+- private->fsync_skip_inode_lock = false;
+-
+- if (!have_private)
+- file->private_data = NULL;
++ current->journal_info = NULL;
+ }
+
+ /* No increment (+=) because iomap returns a cumulative value. */
+@@ -2195,7 +2185,6 @@ static inline bool skip_inode_logging(co
+ */
+ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
+ {
+- struct btrfs_file_private *private = file->private_data;
+ struct dentry *dentry = file_dentry(file);
+ struct inode *inode = d_inode(dentry);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+@@ -2205,7 +2194,13 @@ int btrfs_sync_file(struct file *file, l
+ int ret = 0, err;
+ u64 len;
+ bool full_sync;
+- const bool skip_ilock = (private ? private->fsync_skip_inode_lock : false);
++ bool skip_ilock = false;
++
++ if (current->journal_info == BTRFS_TRANS_DIO_WRITE_STUB) {
++ skip_ilock = true;
++ current->journal_info = NULL;
++ lockdep_assert_held(&inode->i_rwsem);
++ }
+
+ trace_btrfs_sync_file(file, datasync);
+
+--- a/fs/btrfs/transaction.h
++++ b/fs/btrfs/transaction.h
+@@ -11,6 +11,12 @@
+ #include "delayed-ref.h"
+ #include "ctree.h"
+
++/*
++ * Signal that a direct IO write is in progress, to avoid deadlock for sync
++ * direct IO writes when fsync is called during the direct IO write path.
++ */
++#define BTRFS_TRANS_DIO_WRITE_STUB ((void *) 1)
++
+ enum btrfs_trans_state {
+ TRANS_STATE_RUNNING,
+ TRANS_STATE_COMMIT_START,
--- /dev/null
+From 9972605a238339b85bd16b084eed5f18414d22db Mon Sep 17 00:00:00 2001
+From: Shakeel Butt <shakeel.butt@linux.dev>
+Date: Fri, 2 Aug 2024 16:58:22 -0700
+Subject: memcg: protect concurrent access to mem_cgroup_idr
+
+From: Shakeel Butt <shakeel.butt@linux.dev>
+
+commit 9972605a238339b85bd16b084eed5f18414d22db upstream.
+
+Commit 73f576c04b94 ("mm: memcontrol: fix cgroup creation failure after
+many small jobs") decoupled the memcg IDs from the CSS ID space to fix the
+cgroup creation failures. It introduced IDR to maintain the memcg ID
+space. The IDR depends on external synchronization mechanisms for
+modifications. For the mem_cgroup_idr, the idr_alloc() and idr_replace()
+happen within css callback and thus are protected through cgroup_mutex
+from concurrent modifications. However idr_remove() for mem_cgroup_idr
+was not protected against concurrency and can be run concurrently for
+different memcgs when they hit their refcnt to zero. Fix that.
+
+We have been seeing list_lru based kernel crashes at a low frequency in
+our fleet for a long time. These crashes were in different part of
+list_lru code including list_lru_add(), list_lru_del() and reparenting
+code. Upon further inspection, it looked like for a given object (dentry
+and inode), the super_block's list_lru didn't have list_lru_one for the
+memcg of that object. The initial suspicions were either the object is
+not allocated through kmem_cache_alloc_lru() or somehow
+memcg_list_lru_alloc() failed to allocate list_lru_one() for a memcg but
+returned success. No evidence were found for these cases.
+
+Looking more deeply, we started seeing situations where valid memcg's id
+is not present in mem_cgroup_idr and in some cases multiple valid memcgs
+have same id and mem_cgroup_idr is pointing to one of them. So, the most
+reasonable explanation is that these situations can happen due to race
+between multiple idr_remove() calls or race between
+idr_alloc()/idr_replace() and idr_remove(). These races are causing
+multiple memcgs to acquire the same ID and then offlining of one of them
+would cleanup list_lrus on the system for all of them. Later access from
+other memcgs to the list_lru cause crashes due to missing list_lru_one.
+
+Link: https://lkml.kernel.org/r/20240802235822.1830976-1-shakeel.butt@linux.dev
+Fixes: 73f576c04b94 ("mm: memcontrol: fix cgroup creation failure after many small jobs")
+Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
+Acked-by: Muchun Song <muchun.song@linux.dev>
+Reviewed-by: Roman Gushchin <roman.gushchin@linux.dev>
+Acked-by: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+[ Adapted over commit be740503ed03 ("mm: memcontrol: fix cannot alloc the
+ maximum memcg ID") and 6f0df8e16eb5 ("memcontrol: ensure memcg acquired by id
+ is properly set up") both are not in the tree ]
+Signed-off-by: Tomas Krcka <krckatom@amazon.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/memcontrol.c | 23 ++++++++++++++++++++---
+ 1 file changed, 20 insertions(+), 3 deletions(-)
+
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -5083,11 +5083,28 @@ static struct cftype mem_cgroup_legacy_f
+ */
+
+ static DEFINE_IDR(mem_cgroup_idr);
++static DEFINE_SPINLOCK(memcg_idr_lock);
++
++static int mem_cgroup_alloc_id(void)
++{
++ int ret;
++
++ idr_preload(GFP_KERNEL);
++ spin_lock(&memcg_idr_lock);
++ ret = idr_alloc(&mem_cgroup_idr, NULL, 1, MEM_CGROUP_ID_MAX + 1,
++ GFP_NOWAIT);
++ spin_unlock(&memcg_idr_lock);
++ idr_preload_end();
++ return ret;
++}
+
+ static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
+ {
+ if (memcg->id.id > 0) {
++ spin_lock(&memcg_idr_lock);
+ idr_remove(&mem_cgroup_idr, memcg->id.id);
++ spin_unlock(&memcg_idr_lock);
++
+ memcg->id.id = 0;
+ }
+ }
+@@ -5201,9 +5218,7 @@ static struct mem_cgroup *mem_cgroup_all
+ if (!memcg)
+ return ERR_PTR(error);
+
+- memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
+- 1, MEM_CGROUP_ID_MAX,
+- GFP_KERNEL);
++ memcg->id.id = mem_cgroup_alloc_id();
+ if (memcg->id.id < 0) {
+ error = memcg->id.id;
+ goto fail;
+@@ -5244,7 +5259,9 @@ static struct mem_cgroup *mem_cgroup_all
+ INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
+ memcg->deferred_split_queue.split_queue_len = 0;
+ #endif
++ spin_lock(&memcg_idr_lock);
+ idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
++ spin_unlock(&memcg_idr_lock);
+ return memcg;
+ fail:
+ mem_cgroup_id_remove(memcg);