From: Greg Kroah-Hartman Date: Tue, 10 Sep 2024 07:56:47 +0000 (+0200) Subject: 5.15-stable patches X-Git-Tag: v4.19.322~24 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=426532147fa68fe88c15c2efcf9f2a45e1e36135;p=thirdparty%2Fkernel%2Fstable-queue.git 5.15-stable patches added patches: btrfs-fix-race-between-direct-io-write-and-fsync-when-using-same-fd.patch memcg-protect-concurrent-access-to-mem_cgroup_idr.patch --- diff --git a/queue-5.15/btrfs-fix-race-between-direct-io-write-and-fsync-when-using-same-fd.patch b/queue-5.15/btrfs-fix-race-between-direct-io-write-and-fsync-when-using-same-fd.patch new file mode 100644 index 00000000000..7211a12d137 --- /dev/null +++ b/queue-5.15/btrfs-fix-race-between-direct-io-write-and-fsync-when-using-same-fd.patch @@ -0,0 +1,313 @@ +From cd9253c23aedd61eb5ff11f37a36247cd46faf86 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Thu, 29 Aug 2024 18:25:49 +0100 +Subject: btrfs: fix race between direct IO write and fsync when using same fd + +From: Filipe Manana + +commit cd9253c23aedd61eb5ff11f37a36247cd46faf86 upstream. + +If we have 2 threads that are using the same file descriptor and one of +them is doing direct IO writes while the other is doing fsync, we have a +race where we can end up either: + +1) Attempt a fsync without holding the inode's lock, triggering an + assertion failures when assertions are enabled; + +2) Do an invalid memory access from the fsync task because the file private + points to memory allocated on stack by the direct IO task and it may be + used by the fsync task after the stack was destroyed. + +The race happens like this: + +1) A user space program opens a file descriptor with O_DIRECT; + +2) The program spawns 2 threads using libpthread for example; + +3) One of the threads uses the file descriptor to do direct IO writes, + while the other calls fsync using the same file descriptor. + +4) Call task A the thread doing direct IO writes and task B the thread + doing fsyncs; + +5) Task A does a direct IO write, and at btrfs_direct_write() sets the + file's private to an on stack allocated private with the member + 'fsync_skip_inode_lock' set to true; + +6) Task B enters btrfs_sync_file() and sees that there's a private + structure associated to the file which has 'fsync_skip_inode_lock' set + to true, so it skips locking the inode's VFS lock; + +7) Task A completes the direct IO write, and resets the file's private to + NULL since it had no prior private and our private was stack allocated. + Then it unlocks the inode's VFS lock; + +8) Task B enters btrfs_get_ordered_extents_for_logging(), then the + assertion that checks the inode's VFS lock is held fails, since task B + never locked it and task A has already unlocked it. + +The stack trace produced is the following: + + assertion failed: inode_is_locked(&inode->vfs_inode), in fs/btrfs/ordered-data.c:983 + ------------[ cut here ]------------ + kernel BUG at fs/btrfs/ordered-data.c:983! + Oops: invalid opcode: 0000 [#1] PREEMPT SMP PTI + CPU: 9 PID: 5072 Comm: worker Tainted: G U OE 6.10.5-1-default #1 openSUSE Tumbleweed 69f48d427608e1c09e60ea24c6c55e2ca1b049e8 + Hardware name: Acer Predator PH315-52/Covini_CFS, BIOS V1.12 07/28/2020 + RIP: 0010:btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs] + Code: 50 d6 86 c0 e8 (...) 
+ RSP: 0018:ffff9e4a03dcfc78 EFLAGS: 00010246 + RAX: 0000000000000054 RBX: ffff9078a9868e98 RCX: 0000000000000000 + RDX: 0000000000000000 RSI: ffff907dce4a7800 RDI: ffff907dce4a7800 + RBP: ffff907805518800 R08: 0000000000000000 R09: ffff9e4a03dcfb38 + R10: ffff9e4a03dcfb30 R11: 0000000000000003 R12: ffff907684ae7800 + R13: 0000000000000001 R14: ffff90774646b600 R15: 0000000000000000 + FS: 00007f04b96006c0(0000) GS:ffff907dce480000(0000) knlGS:0000000000000000 + CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + CR2: 00007f32acbfc000 CR3: 00000001fd4fa005 CR4: 00000000003726f0 + Call Trace: + + ? __die_body.cold+0x14/0x24 + ? die+0x2e/0x50 + ? do_trap+0xca/0x110 + ? do_error_trap+0x6a/0x90 + ? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a] + ? exc_invalid_op+0x50/0x70 + ? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a] + ? asm_exc_invalid_op+0x1a/0x20 + ? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a] + ? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a] + btrfs_sync_file+0x21a/0x4d0 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a] + ? __seccomp_filter+0x31d/0x4f0 + __x64_sys_fdatasync+0x4f/0x90 + do_syscall_64+0x82/0x160 + ? do_futex+0xcb/0x190 + ? __x64_sys_futex+0x10e/0x1d0 + ? switch_fpu_return+0x4f/0xd0 + ? syscall_exit_to_user_mode+0x72/0x220 + ? do_syscall_64+0x8e/0x160 + ? syscall_exit_to_user_mode+0x72/0x220 + ? do_syscall_64+0x8e/0x160 + ? syscall_exit_to_user_mode+0x72/0x220 + ? do_syscall_64+0x8e/0x160 + ? syscall_exit_to_user_mode+0x72/0x220 + ? do_syscall_64+0x8e/0x160 + entry_SYSCALL_64_after_hwframe+0x76/0x7e + +Another problem here is if task B grabs the private pointer and then uses +it after task A has finished, since the private was allocated in the stack +of task A, it results in some invalid memory access with a hard to predict +result. + +This issue, triggering the assertion, was observed with QEMU workloads by +two users in the Link tags below. + +Fix this by not relying on a file's private to pass information to fsync +that it should skip locking the inode and instead pass this information +through a special value stored in current->journal_info. This is safe +because in the relevant section of the direct IO write path we are not +holding a transaction handle, so current->journal_info is NULL. + +The following C program triggers the issue: + + $ cat repro.c + /* Get the O_DIRECT definition. 
*/ + #ifndef _GNU_SOURCE + #define _GNU_SOURCE + #endif + + #include + #include + #include + #include + #include + #include + #include + #include + + static int fd; + + static ssize_t do_write(int fd, const void *buf, size_t count, off_t offset) + { + while (count > 0) { + ssize_t ret; + + ret = pwrite(fd, buf, count, offset); + if (ret < 0) { + if (errno == EINTR) + continue; + return ret; + } + count -= ret; + buf += ret; + } + return 0; + } + + static void *fsync_loop(void *arg) + { + while (1) { + int ret; + + ret = fsync(fd); + if (ret != 0) { + perror("Fsync failed"); + exit(6); + } + } + } + + int main(int argc, char *argv[]) + { + long pagesize; + void *write_buf; + pthread_t fsyncer; + int ret; + + if (argc != 2) { + fprintf(stderr, "Use: %s \n", argv[0]); + return 1; + } + + fd = open(argv[1], O_WRONLY | O_CREAT | O_TRUNC | O_DIRECT, 0666); + if (fd == -1) { + perror("Failed to open/create file"); + return 1; + } + + pagesize = sysconf(_SC_PAGE_SIZE); + if (pagesize == -1) { + perror("Failed to get page size"); + return 2; + } + + ret = posix_memalign(&write_buf, pagesize, pagesize); + if (ret) { + perror("Failed to allocate buffer"); + return 3; + } + + ret = pthread_create(&fsyncer, NULL, fsync_loop, NULL); + if (ret != 0) { + fprintf(stderr, "Failed to create writer thread: %d\n", ret); + return 4; + } + + while (1) { + ret = do_write(fd, write_buf, pagesize, 0); + if (ret != 0) { + perror("Write failed"); + exit(5); + } + } + + return 0; + } + + $ mkfs.btrfs -f /dev/sdi + $ mount /dev/sdi /mnt/sdi + $ timeout 10 ./repro /mnt/sdi/foo + +Usually the race is triggered within less than 1 second. A test case for +fstests will follow soon. + +Reported-by: Paulo Dias +Link: https://bugzilla.kernel.org/show_bug.cgi?id=219187 +Reported-by: Andreas Jahn +Link: https://bugzilla.kernel.org/show_bug.cgi?id=219199 +Reported-by: syzbot+4704b3cc972bd76024f1@syzkaller.appspotmail.com +Link: https://lore.kernel.org/linux-btrfs/00000000000044ff540620d7dee2@google.com/ +Fixes: 939b656bc8ab ("btrfs: fix corruption after buffer fault in during direct IO append write") +CC: stable@vger.kernel.org # 5.15+ +Reviewed-by: Josef Bacik +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/ctree.h | 1 - + fs/btrfs/file.c | 25 ++++++++++--------------- + fs/btrfs/transaction.h | 6 ++++++ + 3 files changed, 16 insertions(+), 16 deletions(-) + +--- a/fs/btrfs/ctree.h ++++ b/fs/btrfs/ctree.h +@@ -1383,7 +1383,6 @@ struct btrfs_drop_extents_args { + struct btrfs_file_private { + void *filldir_buf; + u64 last_index; +- bool fsync_skip_inode_lock; + }; + + +--- a/fs/btrfs/file.c ++++ b/fs/btrfs/file.c +@@ -1992,13 +1992,6 @@ again: + if (IS_ERR_OR_NULL(dio)) { + err = PTR_ERR_OR_ZERO(dio); + } else { +- struct btrfs_file_private stack_private = { 0 }; +- struct btrfs_file_private *private; +- const bool have_private = (file->private_data != NULL); +- +- if (!have_private) +- file->private_data = &stack_private; +- + /* + * If we have a synchoronous write, we must make sure the fsync + * triggered by the iomap_dio_complete() call below doesn't +@@ -2007,13 +2000,10 @@ again: + * partial writes due to the input buffer (or parts of it) not + * being already faulted in. 
+ */ +- private = file->private_data; +- private->fsync_skip_inode_lock = true; ++ ASSERT(current->journal_info == NULL); ++ current->journal_info = BTRFS_TRANS_DIO_WRITE_STUB; + err = iomap_dio_complete(dio); +- private->fsync_skip_inode_lock = false; +- +- if (!have_private) +- file->private_data = NULL; ++ current->journal_info = NULL; + } + + /* No increment (+=) because iomap returns a cumulative value. */ +@@ -2195,7 +2185,6 @@ static inline bool skip_inode_logging(co + */ + int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) + { +- struct btrfs_file_private *private = file->private_data; + struct dentry *dentry = file_dentry(file); + struct inode *inode = d_inode(dentry); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); +@@ -2205,7 +2194,13 @@ int btrfs_sync_file(struct file *file, l + int ret = 0, err; + u64 len; + bool full_sync; +- const bool skip_ilock = (private ? private->fsync_skip_inode_lock : false); ++ bool skip_ilock = false; ++ ++ if (current->journal_info == BTRFS_TRANS_DIO_WRITE_STUB) { ++ skip_ilock = true; ++ current->journal_info = NULL; ++ lockdep_assert_held(&inode->i_rwsem); ++ } + + trace_btrfs_sync_file(file, datasync); + +--- a/fs/btrfs/transaction.h ++++ b/fs/btrfs/transaction.h +@@ -11,6 +11,12 @@ + #include "delayed-ref.h" + #include "ctree.h" + ++/* ++ * Signal that a direct IO write is in progress, to avoid deadlock for sync ++ * direct IO writes when fsync is called during the direct IO write path. ++ */ ++#define BTRFS_TRANS_DIO_WRITE_STUB ((void *) 1) ++ + enum btrfs_trans_state { + TRANS_STATE_RUNNING, + TRANS_STATE_COMMIT_START, diff --git a/queue-5.15/memcg-protect-concurrent-access-to-mem_cgroup_idr.patch b/queue-5.15/memcg-protect-concurrent-access-to-mem_cgroup_idr.patch new file mode 100644 index 00000000000..d29c2d93837 --- /dev/null +++ b/queue-5.15/memcg-protect-concurrent-access-to-mem_cgroup_idr.patch @@ -0,0 +1,109 @@ +From 9972605a238339b85bd16b084eed5f18414d22db Mon Sep 17 00:00:00 2001 +From: Shakeel Butt +Date: Fri, 2 Aug 2024 16:58:22 -0700 +Subject: memcg: protect concurrent access to mem_cgroup_idr + +From: Shakeel Butt + +commit 9972605a238339b85bd16b084eed5f18414d22db upstream. + +Commit 73f576c04b94 ("mm: memcontrol: fix cgroup creation failure after +many small jobs") decoupled the memcg IDs from the CSS ID space to fix the +cgroup creation failures. It introduced IDR to maintain the memcg ID +space. The IDR depends on external synchronization mechanisms for +modifications. For the mem_cgroup_idr, the idr_alloc() and idr_replace() +happen within css callback and thus are protected through cgroup_mutex +from concurrent modifications. However idr_remove() for mem_cgroup_idr +was not protected against concurrency and can be run concurrently for +different memcgs when they hit their refcnt to zero. Fix that. + +We have been seeing list_lru based kernel crashes at a low frequency in +our fleet for a long time. These crashes were in different part of +list_lru code including list_lru_add(), list_lru_del() and reparenting +code. Upon further inspection, it looked like for a given object (dentry +and inode), the super_block's list_lru didn't have list_lru_one for the +memcg of that object. The initial suspicions were either the object is +not allocated through kmem_cache_alloc_lru() or somehow +memcg_list_lru_alloc() failed to allocate list_lru_one() for a memcg but +returned success. No evidence were found for these cases. 
+ +Looking more deeply, we started seeing situations where valid memcg's id +is not present in mem_cgroup_idr and in some cases multiple valid memcgs +have same id and mem_cgroup_idr is pointing to one of them. So, the most +reasonable explanation is that these situations can happen due to race +between multiple idr_remove() calls or race between +idr_alloc()/idr_replace() and idr_remove(). These races are causing +multiple memcgs to acquire the same ID and then offlining of one of them +would cleanup list_lrus on the system for all of them. Later access from +other memcgs to the list_lru cause crashes due to missing list_lru_one. + +Link: https://lkml.kernel.org/r/20240802235822.1830976-1-shakeel.butt@linux.dev +Fixes: 73f576c04b94 ("mm: memcontrol: fix cgroup creation failure after many small jobs") +Signed-off-by: Shakeel Butt +Acked-by: Muchun Song +Reviewed-by: Roman Gushchin +Acked-by: Johannes Weiner +Cc: Michal Hocko +Cc: +Signed-off-by: Andrew Morton +[ Adapted over commit be740503ed03 ("mm: memcontrol: fix cannot alloc the + maximum memcg ID") and 6f0df8e16eb5 ("memcontrol: ensure memcg acquired by id + is properly set up") both are not in the tree ] +Signed-off-by: Tomas Krcka +Signed-off-by: Greg Kroah-Hartman +--- + mm/memcontrol.c | 23 ++++++++++++++++++++--- + 1 file changed, 20 insertions(+), 3 deletions(-) + +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -5083,11 +5083,28 @@ static struct cftype mem_cgroup_legacy_f + */ + + static DEFINE_IDR(mem_cgroup_idr); ++static DEFINE_SPINLOCK(memcg_idr_lock); ++ ++static int mem_cgroup_alloc_id(void) ++{ ++ int ret; ++ ++ idr_preload(GFP_KERNEL); ++ spin_lock(&memcg_idr_lock); ++ ret = idr_alloc(&mem_cgroup_idr, NULL, 1, MEM_CGROUP_ID_MAX + 1, ++ GFP_NOWAIT); ++ spin_unlock(&memcg_idr_lock); ++ idr_preload_end(); ++ return ret; ++} + + static void mem_cgroup_id_remove(struct mem_cgroup *memcg) + { + if (memcg->id.id > 0) { ++ spin_lock(&memcg_idr_lock); + idr_remove(&mem_cgroup_idr, memcg->id.id); ++ spin_unlock(&memcg_idr_lock); ++ + memcg->id.id = 0; + } + } +@@ -5201,9 +5218,7 @@ static struct mem_cgroup *mem_cgroup_all + if (!memcg) + return ERR_PTR(error); + +- memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL, +- 1, MEM_CGROUP_ID_MAX, +- GFP_KERNEL); ++ memcg->id.id = mem_cgroup_alloc_id(); + if (memcg->id.id < 0) { + error = memcg->id.id; + goto fail; +@@ -5244,7 +5259,9 @@ static struct mem_cgroup *mem_cgroup_all + INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue); + memcg->deferred_split_queue.split_queue_len = 0; + #endif ++ spin_lock(&memcg_idr_lock); + idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); ++ spin_unlock(&memcg_idr_lock); + return memcg; + fail: + mem_cgroup_id_remove(memcg); diff --git a/queue-5.15/series b/queue-5.15/series index 35a86c809c7..879587e2ed3 100644 --- a/queue-5.15/series +++ b/queue-5.15/series @@ -210,3 +210,5 @@ gso-fix-dodgy-bit-handling-for-gso_udp_l4.patch net-drop-bad-gso-csum_start-and-offset-in-virtio_net_hdr.patch x86-mm-fix-pti-for-i386-some-more.patch net-sunrpc-remap-eperm-in-case-of-connection-failure-in-xs_tcp_setup_socket.patch +btrfs-fix-race-between-direct-io-write-and-fsync-when-using-same-fd.patch +memcg-protect-concurrent-access-to-mem_cgroup_idr.patch
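
The btrfs patch above replaces per-file state (the on-stack btrfs_file_private with fsync_skip_inode_lock) with per-task state: the direct IO path stores a sentinel in current->journal_info, and btrfs_sync_file() only ever sees that sentinel when it runs in the same task, so a concurrent fsync on the same file descriptor can no longer observe (or outlive) another thread's stack. The sketch below restates that idea in plain userspace C; it is an illustration only, not kernel code, and every name in it is invented:

/*
 * Per-task hint ("the caller already holds the lock") instead of a per-file
 * hint that other threads sharing the file could observe.  The __thread slot
 * plays the role of current->journal_info; DIO_WRITE_STUB plays the role of
 * BTRFS_TRANS_DIO_WRITE_STUB.
 */
#include <pthread.h>
#include <stdio.h>

#define DIO_WRITE_STUB ((void *)1)

static __thread void *journal_info;	/* per-task, like current->journal_info */
static pthread_mutex_t inode_lock = PTHREAD_MUTEX_INITIALIZER;

static void sync_file(void)
{
	int skip_ilock = 0;

	if (journal_info == DIO_WRITE_STUB) {
		/* Only the task that set the stub can ever see it here. */
		skip_ilock = 1;
		journal_info = NULL;
	}
	if (!skip_ilock)
		pthread_mutex_lock(&inode_lock);
	/* ... flush ordered extents, write the log ... */
	if (!skip_ilock)
		pthread_mutex_unlock(&inode_lock);
}

static void direct_write(void)
{
	pthread_mutex_lock(&inode_lock);
	journal_info = DIO_WRITE_STUB;
	sync_file();	/* stands in for the fsync triggered by iomap_dio_complete() */
	journal_info = NULL;
	pthread_mutex_unlock(&inode_lock);
}

int main(void)
{
	direct_write();	/* sync writer: fsync skips the lock it already holds */
	sync_file();	/* unrelated fsync: takes the lock normally */
	puts("ok");
	return 0;
}

Built with something like "gcc -pthread -o stub-sketch stub-sketch.c" (file name assumed), the program exercises both paths; the property mirrored here is that per-task state can never be observed or torn down by another thread sharing the same struct file, which is exactly what went wrong with the on-stack file private.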
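
The memcg patch above applies the usual rule for IDR users: idr_alloc(), idr_replace() and idr_remove() need external serialization, and once idr_remove() can run outside cgroup_mutex (when a memcg's refcount drops to zero) a dedicated lock is required; idr_preload() plus GFP_NOWAIT keeps the allocation from sleeping under the new spinlock. The fragment below is a userspace analogue of that locking pattern, not the kernel code, with an invented fixed-size table standing in for the IDR:

/*
 * Every mutation of the shared ID table goes through one lock, mirroring
 * memcg_idr_lock around idr_alloc()/idr_replace()/idr_remove().  Without the
 * lock, concurrent removals racing with an allocation can hand the same ID
 * to two objects - the failure mode described in the commit message.
 */
#include <pthread.h>
#include <stdio.h>

#define ID_MAX 64

static void *id_table[ID_MAX + 1];	/* slot 0 intentionally unused */
static pthread_mutex_t id_lock = PTHREAD_MUTEX_INITIALIZER;

static int id_alloc(void)		/* like mem_cgroup_alloc_id() */
{
	int id, ret = -1;

	pthread_mutex_lock(&id_lock);
	for (id = 1; id <= ID_MAX; id++) {
		if (!id_table[id]) {
			id_table[id] = (void *)1;	/* reserved, not yet published */
			ret = id;
			break;
		}
	}
	pthread_mutex_unlock(&id_lock);
	return ret;
}

static void id_replace(int id, void *obj)	/* like idr_replace() under the lock */
{
	pthread_mutex_lock(&id_lock);
	id_table[id] = obj;
	pthread_mutex_unlock(&id_lock);
}

static void id_remove(int id)		/* like mem_cgroup_id_remove() */
{
	pthread_mutex_lock(&id_lock);
	id_table[id] = NULL;
	pthread_mutex_unlock(&id_lock);
}

int main(void)
{
	int id = id_alloc();

	if (id < 0)
		return 1;
	id_replace(id, "memcg");
	printf("allocated id %d\n", id);
	id_remove(id);
	return 0;
}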