From: Greg Kroah-Hartman Date: Tue, 10 Sep 2024 07:56:55 +0000 (+0200) Subject: 6.1-stable patches X-Git-Tag: v4.19.322~23 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=d210e4e474e0b4bcac48e4323610b5d73d1a1dfa;p=thirdparty%2Fkernel%2Fstable-queue.git 6.1-stable patches added patches: bpf-silence-a-warning-in-btf_type_id_size.patch btrfs-fix-race-between-direct-io-write-and-fsync-when-using-same-fd.patch memcg-protect-concurrent-access-to-mem_cgroup_idr.patch --- diff --git a/queue-6.1/bpf-silence-a-warning-in-btf_type_id_size.patch b/queue-6.1/bpf-silence-a-warning-in-btf_type_id_size.patch new file mode 100644 index 00000000000..9f1320d57ac --- /dev/null +++ b/queue-6.1/bpf-silence-a-warning-in-btf_type_id_size.patch @@ -0,0 +1,90 @@ +From e6c2f594ed961273479505b42040782820190305 Mon Sep 17 00:00:00 2001 +From: Yonghong Song +Date: Tue, 30 May 2023 13:50:29 -0700 +Subject: bpf: Silence a warning in btf_type_id_size() + +From: Yonghong Song + +commit e6c2f594ed961273479505b42040782820190305 upstream. + +syzbot reported a warning in [1] with the following stacktrace: + WARNING: CPU: 0 PID: 5005 at kernel/bpf/btf.c:1988 btf_type_id_size+0x2d9/0x9d0 kernel/bpf/btf.c:1988 + ... + RIP: 0010:btf_type_id_size+0x2d9/0x9d0 kernel/bpf/btf.c:1988 + ... + Call Trace: + + map_check_btf kernel/bpf/syscall.c:1024 [inline] + map_create+0x1157/0x1860 kernel/bpf/syscall.c:1198 + __sys_bpf+0x127f/0x5420 kernel/bpf/syscall.c:5040 + __do_sys_bpf kernel/bpf/syscall.c:5162 [inline] + __se_sys_bpf kernel/bpf/syscall.c:5160 [inline] + __x64_sys_bpf+0x79/0xc0 kernel/bpf/syscall.c:5160 + do_syscall_x64 arch/x86/entry/common.c:50 [inline] + do_syscall_64+0x39/0xb0 arch/x86/entry/common.c:80 + entry_SYSCALL_64_after_hwframe+0x63/0xcd + +With the following btf + [1] DECL_TAG 'a' type_id=4 component_idx=-1 + [2] PTR '(anon)' type_id=0 + [3] TYPE_TAG 'a' type_id=2 + [4] VAR 'a' type_id=3, linkage=static +and when the bpf_attr.btf_key_type_id = 1 (DECL_TAG), +the following WARN_ON_ONCE in btf_type_id_size() is triggered: + if (WARN_ON_ONCE(!btf_type_is_modifier(size_type) && + !btf_type_is_var(size_type))) + return NULL; + +Note that 'return NULL' is the correct behavior as we don't want +a DECL_TAG type to be used as a btf_{key,value}_type_id even +for the case like 'DECL_TAG -> STRUCT'. So there +is no correctness issue here, we just want to silence warning. + +To silence the warning, I added DECL_TAG as one of kinds in +btf_type_nosize() which will cause btf_type_id_size() returning +NULL earlier without the warning. + + [1] https://lore.kernel.org/bpf/000000000000e0df8d05fc75ba86@google.com/ + +Reported-by: syzbot+958967f249155967d42a@syzkaller.appspotmail.com +Signed-off-by: Yonghong Song +Link: https://lore.kernel.org/r/20230530205029.264910-1-yhs@fb.com +Signed-off-by: Martin KaFai Lau +Signed-off-by: Diogo Jahchan Koike +Signed-off-by: Greg Kroah-Hartman +--- + kernel/bpf/btf.c | 13 +++++++------ + 1 file changed, 7 insertions(+), 6 deletions(-) + +--- a/kernel/bpf/btf.c ++++ b/kernel/bpf/btf.c +@@ -466,10 +466,16 @@ static bool btf_type_is_fwd(const struct + return BTF_INFO_KIND(t->info) == BTF_KIND_FWD; + } + ++static bool btf_type_is_decl_tag(const struct btf_type *t) ++{ ++ return BTF_INFO_KIND(t->info) == BTF_KIND_DECL_TAG; ++} ++ + static bool btf_type_nosize(const struct btf_type *t) + { + return btf_type_is_void(t) || btf_type_is_fwd(t) || +- btf_type_is_func(t) || btf_type_is_func_proto(t); ++ btf_type_is_func(t) || btf_type_is_func_proto(t) || ++ btf_type_is_decl_tag(t); + } + + static bool btf_type_nosize_or_null(const struct btf_type *t) +@@ -492,11 +498,6 @@ static bool btf_type_is_datasec(const st + return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC; + } + +-static bool btf_type_is_decl_tag(const struct btf_type *t) +-{ +- return BTF_INFO_KIND(t->info) == BTF_KIND_DECL_TAG; +-} +- + static bool btf_type_is_decl_tag_target(const struct btf_type *t) + { + return btf_type_is_func(t) || btf_type_is_struct(t) || diff --git a/queue-6.1/btrfs-fix-race-between-direct-io-write-and-fsync-when-using-same-fd.patch b/queue-6.1/btrfs-fix-race-between-direct-io-write-and-fsync-when-using-same-fd.patch new file mode 100644 index 00000000000..757bde2d49b --- /dev/null +++ b/queue-6.1/btrfs-fix-race-between-direct-io-write-and-fsync-when-using-same-fd.patch @@ -0,0 +1,315 @@ +From cd9253c23aedd61eb5ff11f37a36247cd46faf86 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Thu, 29 Aug 2024 18:25:49 +0100 +Subject: btrfs: fix race between direct IO write and fsync when using same fd + +From: Filipe Manana + +commit cd9253c23aedd61eb5ff11f37a36247cd46faf86 upstream. + +If we have 2 threads that are using the same file descriptor and one of +them is doing direct IO writes while the other is doing fsync, we have a +race where we can end up either: + +1) Attempt a fsync without holding the inode's lock, triggering an + assertion failures when assertions are enabled; + +2) Do an invalid memory access from the fsync task because the file private + points to memory allocated on stack by the direct IO task and it may be + used by the fsync task after the stack was destroyed. + +The race happens like this: + +1) A user space program opens a file descriptor with O_DIRECT; + +2) The program spawns 2 threads using libpthread for example; + +3) One of the threads uses the file descriptor to do direct IO writes, + while the other calls fsync using the same file descriptor. + +4) Call task A the thread doing direct IO writes and task B the thread + doing fsyncs; + +5) Task A does a direct IO write, and at btrfs_direct_write() sets the + file's private to an on stack allocated private with the member + 'fsync_skip_inode_lock' set to true; + +6) Task B enters btrfs_sync_file() and sees that there's a private + structure associated to the file which has 'fsync_skip_inode_lock' set + to true, so it skips locking the inode's VFS lock; + +7) Task A completes the direct IO write, and resets the file's private to + NULL since it had no prior private and our private was stack allocated. + Then it unlocks the inode's VFS lock; + +8) Task B enters btrfs_get_ordered_extents_for_logging(), then the + assertion that checks the inode's VFS lock is held fails, since task B + never locked it and task A has already unlocked it. + +The stack trace produced is the following: + + assertion failed: inode_is_locked(&inode->vfs_inode), in fs/btrfs/ordered-data.c:983 + ------------[ cut here ]------------ + kernel BUG at fs/btrfs/ordered-data.c:983! + Oops: invalid opcode: 0000 [#1] PREEMPT SMP PTI + CPU: 9 PID: 5072 Comm: worker Tainted: G U OE 6.10.5-1-default #1 openSUSE Tumbleweed 69f48d427608e1c09e60ea24c6c55e2ca1b049e8 + Hardware name: Acer Predator PH315-52/Covini_CFS, BIOS V1.12 07/28/2020 + RIP: 0010:btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs] + Code: 50 d6 86 c0 e8 (...) + RSP: 0018:ffff9e4a03dcfc78 EFLAGS: 00010246 + RAX: 0000000000000054 RBX: ffff9078a9868e98 RCX: 0000000000000000 + RDX: 0000000000000000 RSI: ffff907dce4a7800 RDI: ffff907dce4a7800 + RBP: ffff907805518800 R08: 0000000000000000 R09: ffff9e4a03dcfb38 + R10: ffff9e4a03dcfb30 R11: 0000000000000003 R12: ffff907684ae7800 + R13: 0000000000000001 R14: ffff90774646b600 R15: 0000000000000000 + FS: 00007f04b96006c0(0000) GS:ffff907dce480000(0000) knlGS:0000000000000000 + CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + CR2: 00007f32acbfc000 CR3: 00000001fd4fa005 CR4: 00000000003726f0 + Call Trace: + + ? __die_body.cold+0x14/0x24 + ? die+0x2e/0x50 + ? do_trap+0xca/0x110 + ? do_error_trap+0x6a/0x90 + ? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a] + ? exc_invalid_op+0x50/0x70 + ? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a] + ? asm_exc_invalid_op+0x1a/0x20 + ? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a] + ? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a] + btrfs_sync_file+0x21a/0x4d0 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a] + ? __seccomp_filter+0x31d/0x4f0 + __x64_sys_fdatasync+0x4f/0x90 + do_syscall_64+0x82/0x160 + ? do_futex+0xcb/0x190 + ? __x64_sys_futex+0x10e/0x1d0 + ? switch_fpu_return+0x4f/0xd0 + ? syscall_exit_to_user_mode+0x72/0x220 + ? do_syscall_64+0x8e/0x160 + ? syscall_exit_to_user_mode+0x72/0x220 + ? do_syscall_64+0x8e/0x160 + ? syscall_exit_to_user_mode+0x72/0x220 + ? do_syscall_64+0x8e/0x160 + ? syscall_exit_to_user_mode+0x72/0x220 + ? do_syscall_64+0x8e/0x160 + entry_SYSCALL_64_after_hwframe+0x76/0x7e + +Another problem here is if task B grabs the private pointer and then uses +it after task A has finished, since the private was allocated in the stack +of task A, it results in some invalid memory access with a hard to predict +result. + +This issue, triggering the assertion, was observed with QEMU workloads by +two users in the Link tags below. + +Fix this by not relying on a file's private to pass information to fsync +that it should skip locking the inode and instead pass this information +through a special value stored in current->journal_info. This is safe +because in the relevant section of the direct IO write path we are not +holding a transaction handle, so current->journal_info is NULL. + +The following C program triggers the issue: + + $ cat repro.c + /* Get the O_DIRECT definition. */ + #ifndef _GNU_SOURCE + #define _GNU_SOURCE + #endif + + #include + #include + #include + #include + #include + #include + #include + #include + + static int fd; + + static ssize_t do_write(int fd, const void *buf, size_t count, off_t offset) + { + while (count > 0) { + ssize_t ret; + + ret = pwrite(fd, buf, count, offset); + if (ret < 0) { + if (errno == EINTR) + continue; + return ret; + } + count -= ret; + buf += ret; + } + return 0; + } + + static void *fsync_loop(void *arg) + { + while (1) { + int ret; + + ret = fsync(fd); + if (ret != 0) { + perror("Fsync failed"); + exit(6); + } + } + } + + int main(int argc, char *argv[]) + { + long pagesize; + void *write_buf; + pthread_t fsyncer; + int ret; + + if (argc != 2) { + fprintf(stderr, "Use: %s \n", argv[0]); + return 1; + } + + fd = open(argv[1], O_WRONLY | O_CREAT | O_TRUNC | O_DIRECT, 0666); + if (fd == -1) { + perror("Failed to open/create file"); + return 1; + } + + pagesize = sysconf(_SC_PAGE_SIZE); + if (pagesize == -1) { + perror("Failed to get page size"); + return 2; + } + + ret = posix_memalign(&write_buf, pagesize, pagesize); + if (ret) { + perror("Failed to allocate buffer"); + return 3; + } + + ret = pthread_create(&fsyncer, NULL, fsync_loop, NULL); + if (ret != 0) { + fprintf(stderr, "Failed to create writer thread: %d\n", ret); + return 4; + } + + while (1) { + ret = do_write(fd, write_buf, pagesize, 0); + if (ret != 0) { + perror("Write failed"); + exit(5); + } + } + + return 0; + } + + $ mkfs.btrfs -f /dev/sdi + $ mount /dev/sdi /mnt/sdi + $ timeout 10 ./repro /mnt/sdi/foo + +Usually the race is triggered within less than 1 second. A test case for +fstests will follow soon. + +Reported-by: Paulo Dias +Link: https://bugzilla.kernel.org/show_bug.cgi?id=219187 +Reported-by: Andreas Jahn +Link: https://bugzilla.kernel.org/show_bug.cgi?id=219199 +Reported-by: syzbot+4704b3cc972bd76024f1@syzkaller.appspotmail.com +Link: https://lore.kernel.org/linux-btrfs/00000000000044ff540620d7dee2@google.com/ +Fixes: 939b656bc8ab ("btrfs: fix corruption after buffer fault in during direct IO append write") +CC: stable@vger.kernel.org # 5.15+ +Reviewed-by: Josef Bacik +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + +--- + fs/btrfs/ctree.h | 1 - + fs/btrfs/file.c | 25 ++++++++++--------------- + fs/btrfs/transaction.h | 6 ++++++ + 3 files changed, 16 insertions(+), 16 deletions(-) + +--- a/fs/btrfs/ctree.h ++++ b/fs/btrfs/ctree.h +@@ -1553,7 +1553,6 @@ struct btrfs_drop_extents_args { + struct btrfs_file_private { + void *filldir_buf; + u64 last_index; +- bool fsync_skip_inode_lock; + }; + + +--- a/fs/btrfs/file.c ++++ b/fs/btrfs/file.c +@@ -1534,13 +1534,6 @@ again: + if (IS_ERR_OR_NULL(dio)) { + err = PTR_ERR_OR_ZERO(dio); + } else { +- struct btrfs_file_private stack_private = { 0 }; +- struct btrfs_file_private *private; +- const bool have_private = (file->private_data != NULL); +- +- if (!have_private) +- file->private_data = &stack_private; +- + /* + * If we have a synchoronous write, we must make sure the fsync + * triggered by the iomap_dio_complete() call below doesn't +@@ -1549,13 +1542,10 @@ again: + * partial writes due to the input buffer (or parts of it) not + * being already faulted in. + */ +- private = file->private_data; +- private->fsync_skip_inode_lock = true; ++ ASSERT(current->journal_info == NULL); ++ current->journal_info = BTRFS_TRANS_DIO_WRITE_STUB; + err = iomap_dio_complete(dio); +- private->fsync_skip_inode_lock = false; +- +- if (!have_private) +- file->private_data = NULL; ++ current->journal_info = NULL; + } + + /* No increment (+=) because iomap returns a cumulative value. */ +@@ -1795,7 +1785,6 @@ static inline bool skip_inode_logging(co + */ + int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) + { +- struct btrfs_file_private *private = file->private_data; + struct dentry *dentry = file_dentry(file); + struct inode *inode = d_inode(dentry); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); +@@ -1805,7 +1794,13 @@ int btrfs_sync_file(struct file *file, l + int ret = 0, err; + u64 len; + bool full_sync; +- const bool skip_ilock = (private ? private->fsync_skip_inode_lock : false); ++ bool skip_ilock = false; ++ ++ if (current->journal_info == BTRFS_TRANS_DIO_WRITE_STUB) { ++ skip_ilock = true; ++ current->journal_info = NULL; ++ lockdep_assert_held(&inode->i_rwsem); ++ } + + trace_btrfs_sync_file(file, datasync); + +--- a/fs/btrfs/transaction.h ++++ b/fs/btrfs/transaction.h +@@ -11,6 +11,12 @@ + #include "delayed-ref.h" + #include "ctree.h" + ++/* ++ * Signal that a direct IO write is in progress, to avoid deadlock for sync ++ * direct IO writes when fsync is called during the direct IO write path. ++ */ ++#define BTRFS_TRANS_DIO_WRITE_STUB ((void *) 1) ++ + enum btrfs_trans_state { + TRANS_STATE_RUNNING, + TRANS_STATE_COMMIT_START, diff --git a/queue-6.1/memcg-protect-concurrent-access-to-mem_cgroup_idr.patch b/queue-6.1/memcg-protect-concurrent-access-to-mem_cgroup_idr.patch new file mode 100644 index 00000000000..bd63f40e20d --- /dev/null +++ b/queue-6.1/memcg-protect-concurrent-access-to-mem_cgroup_idr.patch @@ -0,0 +1,107 @@ +From 9972605a238339b85bd16b084eed5f18414d22db Mon Sep 17 00:00:00 2001 +From: Shakeel Butt +Date: Fri, 2 Aug 2024 16:58:22 -0700 +Subject: memcg: protect concurrent access to mem_cgroup_idr + +From: Shakeel Butt + +commit 9972605a238339b85bd16b084eed5f18414d22db upstream. + +Commit 73f576c04b94 ("mm: memcontrol: fix cgroup creation failure after +many small jobs") decoupled the memcg IDs from the CSS ID space to fix the +cgroup creation failures. It introduced IDR to maintain the memcg ID +space. The IDR depends on external synchronization mechanisms for +modifications. For the mem_cgroup_idr, the idr_alloc() and idr_replace() +happen within css callback and thus are protected through cgroup_mutex +from concurrent modifications. However idr_remove() for mem_cgroup_idr +was not protected against concurrency and can be run concurrently for +different memcgs when they hit their refcnt to zero. Fix that. + +We have been seeing list_lru based kernel crashes at a low frequency in +our fleet for a long time. These crashes were in different part of +list_lru code including list_lru_add(), list_lru_del() and reparenting +code. Upon further inspection, it looked like for a given object (dentry +and inode), the super_block's list_lru didn't have list_lru_one for the +memcg of that object. The initial suspicions were either the object is +not allocated through kmem_cache_alloc_lru() or somehow +memcg_list_lru_alloc() failed to allocate list_lru_one() for a memcg but +returned success. No evidence were found for these cases. + +Looking more deeply, we started seeing situations where valid memcg's id +is not present in mem_cgroup_idr and in some cases multiple valid memcgs +have same id and mem_cgroup_idr is pointing to one of them. So, the most +reasonable explanation is that these situations can happen due to race +between multiple idr_remove() calls or race between +idr_alloc()/idr_replace() and idr_remove(). These races are causing +multiple memcgs to acquire the same ID and then offlining of one of them +would cleanup list_lrus on the system for all of them. Later access from +other memcgs to the list_lru cause crashes due to missing list_lru_one. + +Link: https://lkml.kernel.org/r/20240802235822.1830976-1-shakeel.butt@linux.dev +Fixes: 73f576c04b94 ("mm: memcontrol: fix cgroup creation failure after many small jobs") +Signed-off-by: Shakeel Butt +Acked-by: Muchun Song +Reviewed-by: Roman Gushchin +Acked-by: Johannes Weiner +Cc: Michal Hocko +Cc: +Signed-off-by: Andrew Morton +[ Adapted over commit 6f0df8e16eb5 ("memcontrol: ensure memcg acquired by id is + properly set up") not in the tree ] +Signed-off-by: Tomas Krcka +Signed-off-by: Greg Kroah-Hartman +--- + mm/memcontrol.c | 22 ++++++++++++++++++++-- + 1 file changed, 20 insertions(+), 2 deletions(-) + +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -5143,11 +5143,28 @@ static struct cftype mem_cgroup_legacy_f + */ + + static DEFINE_IDR(mem_cgroup_idr); ++static DEFINE_SPINLOCK(memcg_idr_lock); ++ ++static int mem_cgroup_alloc_id(void) ++{ ++ int ret; ++ ++ idr_preload(GFP_KERNEL); ++ spin_lock(&memcg_idr_lock); ++ ret = idr_alloc(&mem_cgroup_idr, NULL, 1, MEM_CGROUP_ID_MAX + 1, ++ GFP_NOWAIT); ++ spin_unlock(&memcg_idr_lock); ++ idr_preload_end(); ++ return ret; ++} + + static void mem_cgroup_id_remove(struct mem_cgroup *memcg) + { + if (memcg->id.id > 0) { ++ spin_lock(&memcg_idr_lock); + idr_remove(&mem_cgroup_idr, memcg->id.id); ++ spin_unlock(&memcg_idr_lock); ++ + memcg->id.id = 0; + } + } +@@ -5270,8 +5287,7 @@ static struct mem_cgroup *mem_cgroup_all + if (!memcg) + return ERR_PTR(error); + +- memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL, +- 1, MEM_CGROUP_ID_MAX + 1, GFP_KERNEL); ++ memcg->id.id = mem_cgroup_alloc_id(); + if (memcg->id.id < 0) { + error = memcg->id.id; + goto fail; +@@ -5316,7 +5332,9 @@ static struct mem_cgroup *mem_cgroup_all + INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue); + memcg->deferred_split_queue.split_queue_len = 0; + #endif ++ spin_lock(&memcg_idr_lock); + idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); ++ spin_unlock(&memcg_idr_lock); + lru_gen_init_memcg(memcg); + return memcg; + fail: diff --git a/queue-6.1/series b/queue-6.1/series index a36cbf0ca7a..2604a537c35 100644 --- a/queue-6.1/series +++ b/queue-6.1/series @@ -185,3 +185,6 @@ gpio-rockchip-fix-of-node-leak-in-probe.patch gpio-modepin-enable-module-autoloading.patch ublk_drv-fix-null-pointer-dereference-in-ublk_ctrl_s.patch x86-mm-fix-pti-for-i386-some-more.patch +btrfs-fix-race-between-direct-io-write-and-fsync-when-using-same-fd.patch +bpf-silence-a-warning-in-btf_type_id_size.patch +memcg-protect-concurrent-access-to-mem_cgroup_idr.patch