From 84cfbcb649044d29bda75206b60a8a05c6d48050 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 9 Dec 2023 13:44:08 +0100 Subject: [PATCH] 6.1-stable patches added patches: cgroup_freezer-cgroup_freezing-check-if-not-frozen.patch checkstack-fix-printed-address.patch hugetlb-fix-null-ptr-deref-in-hugetlb_vma_lock_write.patch mm-fix-oops-when-filemap_map_pmd-without-prealloc_pte.patch nilfs2-fix-missing-error-check-for-sb_set_blocksize-call.patch nilfs2-prevent-warning-in-nilfs_sufile_set_segment_usage.patch packet-move-reference-count-in-packet_sock-to-atomic_long_t.patch platform-surface-aggregator-fix-recv_buf-return-value.patch r8169-fix-rtl8125b-pause-frames-blasting-when-suspended.patch regmap-fix-bogus-error-on-regcache_sync-success.patch ring-buffer-test-last-update-in-32bit-version-of-__rb_time_read.patch tracing-always-update-snapshot-buffer-size.patch tracing-disable-snapshot-buffer-when-stopping-instance-tracers.patch tracing-fix-a-possible-race-when-disabling-buffered-events.patch tracing-fix-incomplete-locking-when-disabling-buffered-events.patch --- ...-cgroup_freezing-check-if-not-frozen.patch | 67 ++++++ .../checkstack-fix-printed-address.patch | 60 ++++++ ...-ptr-deref-in-hugetlb_vma_lock_write.patch | 100 +++++++++ ...filemap_map_pmd-without-prealloc_pte.patch | 62 ++++++ ...rror-check-for-sb_set_blocksize-call.patch | 79 +++++++ ...ng-in-nilfs_sufile_set_segment_usage.patch | 109 ++++++++++ ...ount-in-packet_sock-to-atomic_long_t.patch | 109 ++++++++++ ...aggregator-fix-recv_buf-return-value.patch | 49 +++++ ...pause-frames-blasting-when-suspended.patch | 69 ++++++ ...bogus-error-on-regcache_sync-success.patch | 45 ++++ ...e-in-32bit-version-of-__rb_time_read.patch | 50 +++++ queue-6.1/series | 15 ++ ...g-always-update-snapshot-buffer-size.patch | 83 +++++++ ...uffer-when-stopping-instance-tracers.patch | 203 ++++++++++++++++++ ...-race-when-disabling-buffered-events.patch | 82 +++++++ ...cking-when-disabling-buffered-events.patch | 153 
+++++++++++++ 16 files changed, 1335 insertions(+) create mode 100644 queue-6.1/cgroup_freezer-cgroup_freezing-check-if-not-frozen.patch create mode 100644 queue-6.1/checkstack-fix-printed-address.patch create mode 100644 queue-6.1/hugetlb-fix-null-ptr-deref-in-hugetlb_vma_lock_write.patch create mode 100644 queue-6.1/mm-fix-oops-when-filemap_map_pmd-without-prealloc_pte.patch create mode 100644 queue-6.1/nilfs2-fix-missing-error-check-for-sb_set_blocksize-call.patch create mode 100644 queue-6.1/nilfs2-prevent-warning-in-nilfs_sufile_set_segment_usage.patch create mode 100644 queue-6.1/packet-move-reference-count-in-packet_sock-to-atomic_long_t.patch create mode 100644 queue-6.1/platform-surface-aggregator-fix-recv_buf-return-value.patch create mode 100644 queue-6.1/r8169-fix-rtl8125b-pause-frames-blasting-when-suspended.patch create mode 100644 queue-6.1/regmap-fix-bogus-error-on-regcache_sync-success.patch create mode 100644 queue-6.1/ring-buffer-test-last-update-in-32bit-version-of-__rb_time_read.patch create mode 100644 queue-6.1/tracing-always-update-snapshot-buffer-size.patch create mode 100644 queue-6.1/tracing-disable-snapshot-buffer-when-stopping-instance-tracers.patch create mode 100644 queue-6.1/tracing-fix-a-possible-race-when-disabling-buffered-events.patch create mode 100644 queue-6.1/tracing-fix-incomplete-locking-when-disabling-buffered-events.patch diff --git a/queue-6.1/cgroup_freezer-cgroup_freezing-check-if-not-frozen.patch b/queue-6.1/cgroup_freezer-cgroup_freezing-check-if-not-frozen.patch new file mode 100644 index 00000000000..d91972ebd3c --- /dev/null +++ b/queue-6.1/cgroup_freezer-cgroup_freezing-check-if-not-frozen.patch @@ -0,0 +1,67 @@ +From cff5f49d433fcd0063c8be7dd08fa5bf190c6c37 Mon Sep 17 00:00:00 2001 +From: Tim Van Patten +Date: Wed, 15 Nov 2023 09:20:43 -0700 +Subject: cgroup_freezer: cgroup_freezing: Check if not frozen + +From: Tim Van Patten + +commit cff5f49d433fcd0063c8be7dd08fa5bf190c6c37 upstream. 
+ +__thaw_task() was recently updated to warn if the task being thawed was +part of a freezer cgroup that is still currently freezing: + + void __thaw_task(struct task_struct *p) + { + ... + if (WARN_ON_ONCE(freezing(p))) + goto unlock; + +This has exposed a bug in cgroup1 freezing where when CGROUP_FROZEN is +asserted, the CGROUP_FREEZING bits are not also cleared at the same +time. Meaning, when a cgroup is marked FROZEN it continues to be marked +FREEZING as well. This causes the WARNING to trigger, because +cgroup_freezing() thinks the cgroup is still freezing. + +There are two ways to fix this: + +1. Whenever FROZEN is set, clear FREEZING for the cgroup and all +children cgroups. +2. Update cgroup_freezing() to also verify that FROZEN is not set. + +This patch implements option (2), since it's smaller and more +straightforward. + +Signed-off-by: Tim Van Patten +Tested-by: Mark Hasemeyer +Fixes: f5d39b020809 ("freezer,sched: Rewrite core freezer logic") +Cc: stable@vger.kernel.org # v6.1+ +Signed-off-by: Tejun Heo +Signed-off-by: Greg Kroah-Hartman +--- + kernel/cgroup/legacy_freezer.c | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c +index 122dacb3a443..66d1708042a7 100644 +--- a/kernel/cgroup/legacy_freezer.c ++++ b/kernel/cgroup/legacy_freezer.c +@@ -66,9 +66,15 @@ static struct freezer *parent_freezer(struct freezer *freezer) + bool cgroup_freezing(struct task_struct *task) + { + bool ret; ++ unsigned int state; + + rcu_read_lock(); +- ret = task_freezer(task)->state & CGROUP_FREEZING; ++ /* Check if the cgroup is still FREEZING, but not FROZEN. The extra ++ * !FROZEN check is required, because the FREEZING bit is not cleared ++ * when the state FROZEN is reached. 
++ */ ++ state = task_freezer(task)->state; ++ ret = (state & CGROUP_FREEZING) && !(state & CGROUP_FROZEN); + rcu_read_unlock(); + + return ret; +-- +2.43.0 + diff --git a/queue-6.1/checkstack-fix-printed-address.patch b/queue-6.1/checkstack-fix-printed-address.patch new file mode 100644 index 00000000000..5ba17f2b1e7 --- /dev/null +++ b/queue-6.1/checkstack-fix-printed-address.patch @@ -0,0 +1,60 @@ +From ee34db3f271cea4d4252048617919c2caafe698b Mon Sep 17 00:00:00 2001 +From: Heiko Carstens +Date: Mon, 20 Nov 2023 19:37:17 +0100 +Subject: checkstack: fix printed address + +From: Heiko Carstens + +commit ee34db3f271cea4d4252048617919c2caafe698b upstream. + +All addresses printed by checkstack have an extra incorrect 0 appended at +the end. + +This was introduced with commit 677f1410e058 ("scripts/checkstack.pl: don't +display $dre as different entity"): since then the address is taken from +the line which contains the function name, instead of the line which +contains stack consumption. E.g. on s390: + +0000000000100a30 : +... + 100a44: e3 f0 ff 70 ff 71 lay %r15,-144(%r15) + +So the used regex which matches spaces and hexadecimal numbers to extract +an address now matches a different substring. Subsequently replacing spaces +with 0 appends a zero at the end, instead of replacing leading spaces. + +Fix this by using the proper regex, and simplify the code a bit. 
+ +Link: https://lkml.kernel.org/r/20231120183719.2188479-2-hca@linux.ibm.com +Fixes: 677f1410e058 ("scripts/checkstack.pl: don't display $dre as different entity") +Signed-off-by: Heiko Carstens +Cc: Maninder Singh +Cc: Masahiro Yamada +Cc: Vaneet Narang +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + scripts/checkstack.pl | 8 ++------ + 1 file changed, 2 insertions(+), 6 deletions(-) + +--- a/scripts/checkstack.pl ++++ b/scripts/checkstack.pl +@@ -146,15 +146,11 @@ $total_size = 0; + while (my $line = <STDIN>) { + if ($line =~ m/$funcre/) { + $func = $1; +- next if $line !~ m/^($xs*)/; ++ next if $line !~ m/^($x*)/; + if ($total_size > $min_stack) { + push @stack, "$intro$total_size\n"; + } +- +- $addr = $1; +- $addr =~ s/ /0/g; +- $addr = "0x$addr"; +- ++ $addr = "0x$1"; + $intro = "$addr $func [$file]:"; + my $padlen = 56 - length($intro); + while ($padlen > 0) { diff --git a/queue-6.1/hugetlb-fix-null-ptr-deref-in-hugetlb_vma_lock_write.patch b/queue-6.1/hugetlb-fix-null-ptr-deref-in-hugetlb_vma_lock_write.patch new file mode 100644 index 00000000000..ba09354017e --- /dev/null +++ b/queue-6.1/hugetlb-fix-null-ptr-deref-in-hugetlb_vma_lock_write.patch @@ -0,0 +1,100 @@ +From 187da0f8250aa94bd96266096aef6f694e0b4cd2 Mon Sep 17 00:00:00 2001 +From: Mike Kravetz +Date: Mon, 13 Nov 2023 17:20:33 -0800 +Subject: hugetlb: fix null-ptr-deref in hugetlb_vma_lock_write + +From: Mike Kravetz + +commit 187da0f8250aa94bd96266096aef6f694e0b4cd2 upstream. + +The routine __vma_private_lock tests for the existence of a reserve map +associated with a private hugetlb mapping. A pointer to the reserve map +is in vma->vm_private_data. __vma_private_lock was checking the pointer +for NULL. However, it is possible that the low bits of the pointer could +be used as flags. In such instances, vm_private_data is not NULL and not +a valid pointer. 
This results in the null-ptr-deref reported by syzbot: + +general protection fault, probably for non-canonical address 0xdffffc000000001d: + 0000 [#1] PREEMPT SMP KASAN +KASAN: null-ptr-deref in range [0x00000000000000e8-0x00000000000000ef] +CPU: 0 PID: 5048 Comm: syz-executor139 Not tainted 6.6.0-rc7-syzkaller-00142-g88 +8cf78c29e2 #0 +Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 1 +0/09/2023 +RIP: 0010:__lock_acquire+0x109/0x5de0 kernel/locking/lockdep.c:5004 +... +Call Trace: + + lock_acquire kernel/locking/lockdep.c:5753 [inline] + lock_acquire+0x1ae/0x510 kernel/locking/lockdep.c:5718 + down_write+0x93/0x200 kernel/locking/rwsem.c:1573 + hugetlb_vma_lock_write mm/hugetlb.c:300 [inline] + hugetlb_vma_lock_write+0xae/0x100 mm/hugetlb.c:291 + __hugetlb_zap_begin+0x1e9/0x2b0 mm/hugetlb.c:5447 + hugetlb_zap_begin include/linux/hugetlb.h:258 [inline] + unmap_vmas+0x2f4/0x470 mm/memory.c:1733 + exit_mmap+0x1ad/0xa60 mm/mmap.c:3230 + __mmput+0x12a/0x4d0 kernel/fork.c:1349 + mmput+0x62/0x70 kernel/fork.c:1371 + exit_mm kernel/exit.c:567 [inline] + do_exit+0x9ad/0x2a20 kernel/exit.c:861 + __do_sys_exit kernel/exit.c:991 [inline] + __se_sys_exit kernel/exit.c:989 [inline] + __x64_sys_exit+0x42/0x50 kernel/exit.c:989 + do_syscall_x64 arch/x86/entry/common.c:50 [inline] + do_syscall_64+0x38/0xb0 arch/x86/entry/common.c:80 + entry_SYSCALL_64_after_hwframe+0x63/0xcd + +Mask off low bit flags before checking for NULL pointer. In addition, the +reserve map only 'belongs' to the OWNER (parent in parent/child +relationships) so also check for the OWNER flag. 
+ +Link: https://lkml.kernel.org/r/20231114012033.259600-1-mike.kravetz@oracle.com +Reported-by: syzbot+6ada951e7c0f7bc8a71e@syzkaller.appspotmail.com +Closes: https://lore.kernel.org/linux-mm/00000000000078d1e00608d7878b@google.com/ +Fixes: bf4916922c60 ("hugetlbfs: extend hugetlb_vma_lock to private VMAs") +Signed-off-by: Mike Kravetz +Reviewed-by: Rik van Riel +Cc: Edward Adam Davis +Cc: Muchun Song +Cc: Nathan Chancellor +Cc: Nick Desaulniers +Cc: Tom Rix +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/hugetlb.h | 5 +---- + mm/hugetlb.c | 7 +++++++ + 2 files changed, 8 insertions(+), 4 deletions(-) + +--- a/include/linux/hugetlb.h ++++ b/include/linux/hugetlb.h +@@ -880,10 +880,7 @@ static inline bool hugepage_migration_su + return arch_hugetlb_migration_supported(h); + } + +-static inline bool __vma_private_lock(struct vm_area_struct *vma) +-{ +- return (!(vma->vm_flags & VM_MAYSHARE)) && vma->vm_private_data; +-} ++bool __vma_private_lock(struct vm_area_struct *vma); + + /* + * Movability check is different as compared to migration check. 
+--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -1189,6 +1189,13 @@ static int is_vma_resv_set(struct vm_are + return (get_vma_private_data(vma) & flag) != 0; + } + ++bool __vma_private_lock(struct vm_area_struct *vma) ++{ ++ return !(vma->vm_flags & VM_MAYSHARE) && ++ get_vma_private_data(vma) & ~HPAGE_RESV_MASK && ++ is_vma_resv_set(vma, HPAGE_RESV_OWNER); ++} ++ + void hugetlb_dup_vma_private(struct vm_area_struct *vma) + { + VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); diff --git a/queue-6.1/mm-fix-oops-when-filemap_map_pmd-without-prealloc_pte.patch b/queue-6.1/mm-fix-oops-when-filemap_map_pmd-without-prealloc_pte.patch new file mode 100644 index 00000000000..2e8482b4067 --- /dev/null +++ b/queue-6.1/mm-fix-oops-when-filemap_map_pmd-without-prealloc_pte.patch @@ -0,0 +1,62 @@ +From 9aa1345d66b8132745ffb99b348b1492088da9e2 Mon Sep 17 00:00:00 2001 +From: Hugh Dickins +Date: Fri, 17 Nov 2023 00:49:18 -0800 +Subject: mm: fix oops when filemap_map_pmd() without prealloc_pte +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Hugh Dickins + +commit 9aa1345d66b8132745ffb99b348b1492088da9e2 upstream. + +syzbot reports oops in lockdep's __lock_acquire(), called from +__pte_offset_map_lock() called from filemap_map_pages(); or when I run the +repro, the oops comes in pmd_install(), called from filemap_map_pmd() +called from filemap_map_pages(), just before the __pte_offset_map_lock(). + +The problem is that filemap_map_pmd() has been assuming that when it finds +pmd_none(), a page table has already been prepared in prealloc_pte; and +indeed do_fault_around() has been careful to preallocate one there, when +it finds pmd_none(): but what if *pmd became none in between? + +My 6.6 mods in mm/khugepaged.c, avoiding mmap_lock for write, have made it +easy for *pmd to be cleared while servicing a page fault; but even before +those, a huge *pmd might be zapped while a fault is serviced. 
+ +The difference in symptomatic stack traces comes from the "memory model" +in use: pmd_install() uses pmd_populate() uses page_to_pfn(): in some +models that is strict, and will oops on the NULL prealloc_pte; in other +models, it will construct a bogus value to be populated into *pmd, then +__pte_offset_map_lock() oops when trying to access split ptlock pointer +(or some other symptom in normal case of ptlock embedded not pointer). + +Link: https://lore.kernel.org/linux-mm/20231115065506.19780-1-jose.pekkarinen@foxhound.fi/ +Link: https://lkml.kernel.org/r/6ed0c50c-78ef-0719-b3c5-60c0c010431c@google.com +Fixes: f9ce0be71d1f ("mm: Cleanup faultaround and finish_fault() codepaths") +Signed-off-by: Hugh Dickins +Reported-and-tested-by: syzbot+89edd67979b52675ddec@syzkaller.appspotmail.com +Closes: https://lore.kernel.org/linux-mm/0000000000005e44550608a0806c@google.com/ +Reviewed-by: David Hildenbrand +Cc: Jann Horn , +Cc: José Pekkarinen +Cc: Kirill A. Shutemov +Cc: Matthew Wilcox (Oracle) +Cc: [5.12+] +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + mm/filemap.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/mm/filemap.c ++++ b/mm/filemap.c +@@ -3319,7 +3319,7 @@ static bool filemap_map_pmd(struct vm_fa + } + } + +- if (pmd_none(*vmf->pmd)) ++ if (pmd_none(*vmf->pmd) && vmf->prealloc_pte) + pmd_install(mm, vmf->pmd, &vmf->prealloc_pte); + + /* See comment in handle_pte_fault() */ diff --git a/queue-6.1/nilfs2-fix-missing-error-check-for-sb_set_blocksize-call.patch b/queue-6.1/nilfs2-fix-missing-error-check-for-sb_set_blocksize-call.patch new file mode 100644 index 00000000000..5339d1e4310 --- /dev/null +++ b/queue-6.1/nilfs2-fix-missing-error-check-for-sb_set_blocksize-call.patch @@ -0,0 +1,79 @@ +From d61d0ab573649789bf9eb909c89a1a193b2e3d10 Mon Sep 17 00:00:00 2001 +From: Ryusuke Konishi +Date: Wed, 29 Nov 2023 23:15:47 +0900 +Subject: nilfs2: fix missing error check for sb_set_blocksize call + +From: Ryusuke Konishi + 
+commit d61d0ab573649789bf9eb909c89a1a193b2e3d10 upstream. + +When mounting a filesystem image with a block size larger than the page +size, nilfs2 repeatedly outputs long error messages with stack traces to +the kernel log, such as the following: + + getblk(): invalid block size 8192 requested + logical block size: 512 + ... + Call Trace: + dump_stack_lvl+0x92/0xd4 + dump_stack+0xd/0x10 + bdev_getblk+0x33a/0x354 + __breadahead+0x11/0x80 + nilfs_search_super_root+0xe2/0x704 [nilfs2] + load_nilfs+0x72/0x504 [nilfs2] + nilfs_mount+0x30f/0x518 [nilfs2] + legacy_get_tree+0x1b/0x40 + vfs_get_tree+0x18/0xc4 + path_mount+0x786/0xa88 + __ia32_sys_mount+0x147/0x1a8 + __do_fast_syscall_32+0x56/0xc8 + do_fast_syscall_32+0x29/0x58 + do_SYSENTER_32+0x15/0x18 + entry_SYSENTER_32+0x98/0xf1 + ... + +This overloads the system logger. And to make matters worse, it sometimes +crashes the kernel with a memory access violation. + +This is because the return value of the sb_set_blocksize() call, which +should be checked for errors, is not checked. + +The latter issue is due to out-of-buffer memory being accessed based on a +large block size that caused sb_set_blocksize() to fail for buffers read +with the initial minimum block size that remained unupdated in the +super_block structure. + +Since nilfs2 mkfs tool does not accept block sizes larger than the system +page size, this has been overlooked. However, it is possible to create +this situation by intentionally modifying the tool or by passing a +filesystem image created on a system with a large page size to a system +with a smaller page size and mounting it. + +Fix this issue by inserting the expected error handling for the call to +sb_set_blocksize(). 
+ +Link: https://lkml.kernel.org/r/20231129141547.4726-1-konishi.ryusuke@gmail.com +Signed-off-by: Ryusuke Konishi +Tested-by: Ryusuke Konishi +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + fs/nilfs2/the_nilfs.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +--- a/fs/nilfs2/the_nilfs.c ++++ b/fs/nilfs2/the_nilfs.c +@@ -716,7 +716,11 @@ int init_nilfs(struct the_nilfs *nilfs, + goto failed_sbh; + } + nilfs_release_super_block(nilfs); +- sb_set_blocksize(sb, blocksize); ++ if (!sb_set_blocksize(sb, blocksize)) { ++ nilfs_err(sb, "bad blocksize %d", blocksize); ++ err = -EINVAL; ++ goto out; ++ } + + err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp); + if (err) diff --git a/queue-6.1/nilfs2-prevent-warning-in-nilfs_sufile_set_segment_usage.patch b/queue-6.1/nilfs2-prevent-warning-in-nilfs_sufile_set_segment_usage.patch new file mode 100644 index 00000000000..95d748bda8d --- /dev/null +++ b/queue-6.1/nilfs2-prevent-warning-in-nilfs_sufile_set_segment_usage.patch @@ -0,0 +1,109 @@ +From 675abf8df1353e0e3bde314993e0796c524cfbf0 Mon Sep 17 00:00:00 2001 +From: Ryusuke Konishi +Date: Tue, 5 Dec 2023 17:59:47 +0900 +Subject: nilfs2: prevent WARNING in nilfs_sufile_set_segment_usage() + +From: Ryusuke Konishi + +commit 675abf8df1353e0e3bde314993e0796c524cfbf0 upstream. + +If nilfs2 reads a disk image with corrupted segment usage metadata, and +its segment usage information is marked as an error for the segment at the +write location, nilfs_sufile_set_segment_usage() can trigger WARN_ONs +during log writing. + +Segments newly allocated for writing with nilfs_sufile_alloc() will not +have this error flag set, but this unexpected situation will occur if the +segment indexed by either nilfs->ns_segnum or nilfs->ns_nextnum (active +segment) was marked in error. + +Fix this issue by inserting a sanity check to treat it as a file system +corruption. 
+ +Since error returns are not allowed during the execution phase where +nilfs_sufile_set_segment_usage() is used, this inserts the sanity check +into nilfs_sufile_mark_dirty() which pre-reads the buffer containing the +segment usage record to be updated and sets it up in a dirty state for +writing. + +In addition, nilfs_sufile_set_segment_usage() is also called when +canceling log writing and undoing segment usage update, so in order to +avoid issuing the same kernel warning in that case, in case of +cancellation, avoid checking the error flag in +nilfs_sufile_set_segment_usage(). + +Link: https://lkml.kernel.org/r/20231205085947.4431-1-konishi.ryusuke@gmail.com +Signed-off-by: Ryusuke Konishi +Reported-by: syzbot+14e9f834f6ddecece094@syzkaller.appspotmail.com +Closes: https://syzkaller.appspot.com/bug?extid=14e9f834f6ddecece094 +Tested-by: Ryusuke Konishi +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + fs/nilfs2/sufile.c | 42 +++++++++++++++++++++++++++++++++++------- + 1 file changed, 35 insertions(+), 7 deletions(-) + +--- a/fs/nilfs2/sufile.c ++++ b/fs/nilfs2/sufile.c +@@ -501,15 +501,38 @@ int nilfs_sufile_mark_dirty(struct inode + + down_write(&NILFS_MDT(sufile)->mi_sem); + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh); +- if (!ret) { +- mark_buffer_dirty(bh); +- nilfs_mdt_mark_dirty(sufile); +- kaddr = kmap_atomic(bh->b_page); +- su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr); ++ if (ret) ++ goto out_sem; ++ ++ kaddr = kmap_atomic(bh->b_page); ++ su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr); ++ if (unlikely(nilfs_segment_usage_error(su))) { ++ struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; ++ ++ kunmap_atomic(kaddr); ++ brelse(bh); ++ if (nilfs_segment_is_active(nilfs, segnum)) { ++ nilfs_error(sufile->i_sb, ++ "active segment %llu is erroneous", ++ (unsigned long long)segnum); ++ } else { ++ /* ++ * Segments marked erroneous are never allocated by ++ * 
nilfs_sufile_alloc(); only active segments, ie, ++ * the segments indexed by ns_segnum or ns_nextnum, ++ * can be erroneous here. ++ */ ++ WARN_ON_ONCE(1); ++ } ++ ret = -EIO; ++ } else { + nilfs_segment_usage_set_dirty(su); + kunmap_atomic(kaddr); ++ mark_buffer_dirty(bh); ++ nilfs_mdt_mark_dirty(sufile); + brelse(bh); + } ++out_sem: + up_write(&NILFS_MDT(sufile)->mi_sem); + return ret; + } +@@ -536,9 +559,14 @@ int nilfs_sufile_set_segment_usage(struc + + kaddr = kmap_atomic(bh->b_page); + su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr); +- WARN_ON(nilfs_segment_usage_error(su)); +- if (modtime) ++ if (modtime) { ++ /* ++ * Check segusage error and set su_lastmod only when updating ++ * this entry with a valid timestamp, not for cancellation. ++ */ ++ WARN_ON_ONCE(nilfs_segment_usage_error(su)); + su->su_lastmod = cpu_to_le64(modtime); ++ } + su->su_nblocks = cpu_to_le32(nblocks); + kunmap_atomic(kaddr); + diff --git a/queue-6.1/packet-move-reference-count-in-packet_sock-to-atomic_long_t.patch b/queue-6.1/packet-move-reference-count-in-packet_sock-to-atomic_long_t.patch new file mode 100644 index 00000000000..5dfcda8ae79 --- /dev/null +++ b/queue-6.1/packet-move-reference-count-in-packet_sock-to-atomic_long_t.patch @@ -0,0 +1,109 @@ +From db3fadacaf0c817b222090290d06ca2a338422d0 Mon Sep 17 00:00:00 2001 +From: Daniel Borkmann +Date: Fri, 1 Dec 2023 14:10:21 +0100 +Subject: packet: Move reference count in packet_sock to atomic_long_t + +From: Daniel Borkmann + +commit db3fadacaf0c817b222090290d06ca2a338422d0 upstream. + +In some potential instances the reference count on struct packet_sock +could be saturated and cause overflows which gets the kernel a bit +confused. To prevent this, move to a 64-bit atomic reference count on +64-bit architectures to prevent the possibility of this type to overflow. + +Because we can not handle saturation, using refcount_t is not possible +in this place. 
Maybe someday in the future if it changes it could be +used. Also, instead of using plain atomic64_t, use atomic_long_t instead. +32-bit machines tend to be memory-limited (i.e. anything that increases +a reference uses so much memory that you can't actually get to 2**32 +references). 32-bit architectures also tend to have serious problems +with 64-bit atomics. Hence, atomic_long_t is the more natural solution. + +Reported-by: "The UK's National Cyber Security Centre (NCSC)" +Co-developed-by: Greg Kroah-Hartman +Signed-off-by: Greg Kroah-Hartman +Signed-off-by: Daniel Borkmann +Cc: Linus Torvalds +Cc: stable@kernel.org +Reviewed-by: Willem de Bruijn +Reviewed-by: Eric Dumazet +Link: https://lore.kernel.org/r/20231201131021.19999-1-daniel@iogearbox.net +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + net/packet/af_packet.c | 16 ++++++++-------- + net/packet/internal.h | 2 +- + 2 files changed, 9 insertions(+), 9 deletions(-) + +--- a/net/packet/af_packet.c ++++ b/net/packet/af_packet.c +@@ -4275,7 +4275,7 @@ static void packet_mm_open(struct vm_are + struct sock *sk = sock->sk; + + if (sk) +- atomic_inc(&pkt_sk(sk)->mapped); ++ atomic_long_inc(&pkt_sk(sk)->mapped); + } + + static void packet_mm_close(struct vm_area_struct *vma) +@@ -4285,7 +4285,7 @@ static void packet_mm_close(struct vm_ar + struct sock *sk = sock->sk; + + if (sk) +- atomic_dec(&pkt_sk(sk)->mapped); ++ atomic_long_dec(&pkt_sk(sk)->mapped); + } + + static const struct vm_operations_struct packet_mmap_ops = { +@@ -4380,7 +4380,7 @@ static int packet_set_ring(struct sock * + + err = -EBUSY; + if (!closing) { +- if (atomic_read(&po->mapped)) ++ if (atomic_long_read(&po->mapped)) + goto out; + if (packet_read_pending(rb)) + goto out; +@@ -4483,7 +4483,7 @@ static int packet_set_ring(struct sock * + + err = -EBUSY; + mutex_lock(&po->pg_vec_lock); +- if (closing || atomic_read(&po->mapped) == 0) { ++ if (closing || atomic_long_read(&po->mapped) == 0) { + err = 0; + 
spin_lock_bh(&rb_queue->lock); + swap(rb->pg_vec, pg_vec); +@@ -4501,9 +4501,9 @@ static int packet_set_ring(struct sock * + po->prot_hook.func = (po->rx_ring.pg_vec) ? + tpacket_rcv : packet_rcv; + skb_queue_purge(rb_queue); +- if (atomic_read(&po->mapped)) +- pr_err("packet_mmap: vma is busy: %d\n", +- atomic_read(&po->mapped)); ++ if (atomic_long_read(&po->mapped)) ++ pr_err("packet_mmap: vma is busy: %ld\n", ++ atomic_long_read(&po->mapped)); + } + mutex_unlock(&po->pg_vec_lock); + +@@ -4581,7 +4581,7 @@ static int packet_mmap(struct file *file + } + } + +- atomic_inc(&po->mapped); ++ atomic_long_inc(&po->mapped); + vma->vm_ops = &packet_mmap_ops; + err = 0; + +--- a/net/packet/internal.h ++++ b/net/packet/internal.h +@@ -126,7 +126,7 @@ struct packet_sock { + __be16 num; + struct packet_rollover *rollover; + struct packet_mclist *mclist; +- atomic_t mapped; ++ atomic_long_t mapped; + enum tpacket_versions tp_version; + unsigned int tp_hdrlen; + unsigned int tp_reserve; diff --git a/queue-6.1/platform-surface-aggregator-fix-recv_buf-return-value.patch b/queue-6.1/platform-surface-aggregator-fix-recv_buf-return-value.patch new file mode 100644 index 00000000000..04a12986771 --- /dev/null +++ b/queue-6.1/platform-surface-aggregator-fix-recv_buf-return-value.patch @@ -0,0 +1,49 @@ +From c8820c92caf0770bec976b01fa9e82bb993c5865 Mon Sep 17 00:00:00 2001 +From: Francesco Dolcini +Date: Tue, 28 Nov 2023 20:49:35 +0100 +Subject: platform/surface: aggregator: fix recv_buf() return value +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Francesco Dolcini + +commit c8820c92caf0770bec976b01fa9e82bb993c5865 upstream. + +Serdev recv_buf() callback is supposed to return the amount of bytes +consumed, therefore an int in between 0 and count. + +Do not return negative number in case of issue, when +ssam_controller_receive_buf() returns ESHUTDOWN just returns 0, e.g. 
no +bytes consumed, this keep the exact same behavior as it was before. + +This fixes a potential WARN in serdev-ttyport.c:ttyport_receive_buf(). + +Fixes: c167b9c7e3d6 ("platform/surface: Add Surface Aggregator subsystem") +Cc: stable@vger.kernel.org +Signed-off-by: Francesco Dolcini +Reviewed-by: Maximilian Luz +Link: https://lore.kernel.org/r/20231128194935.11350-1-francesco@dolcini.it +Reviewed-by: Ilpo Järvinen +Signed-off-by: Ilpo Järvinen +Signed-off-by: Greg Kroah-Hartman +--- + drivers/platform/surface/aggregator/core.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +--- a/drivers/platform/surface/aggregator/core.c ++++ b/drivers/platform/surface/aggregator/core.c +@@ -231,9 +231,12 @@ static int ssam_receive_buf(struct serde + size_t n) + { + struct ssam_controller *ctrl; ++ int ret; + + ctrl = serdev_device_get_drvdata(dev); +- return ssam_controller_receive_buf(ctrl, buf, n); ++ ret = ssam_controller_receive_buf(ctrl, buf, n); ++ ++ return ret < 0 ? 0 : ret; + } + + static void ssam_write_wakeup(struct serdev_device *dev) diff --git a/queue-6.1/r8169-fix-rtl8125b-pause-frames-blasting-when-suspended.patch b/queue-6.1/r8169-fix-rtl8125b-pause-frames-blasting-when-suspended.patch new file mode 100644 index 00000000000..ac0a3855a22 --- /dev/null +++ b/queue-6.1/r8169-fix-rtl8125b-pause-frames-blasting-when-suspended.patch @@ -0,0 +1,69 @@ +From 4b0768b6556af56ee9b7cf4e68452a2b6289ae45 Mon Sep 17 00:00:00 2001 +From: ChunHao Lin +Date: Wed, 29 Nov 2023 23:53:50 +0800 +Subject: r8169: fix rtl8125b PAUSE frames blasting when suspended + +From: ChunHao Lin + +commit 4b0768b6556af56ee9b7cf4e68452a2b6289ae45 upstream. + +When FIFO reaches near full state, device will issue pause frame. +If pause slot is enabled(set to 1), in this time, device will issue +pause frame only once. But if pause slot is disabled(set to 0), device +will keep sending pause frames until FIFO reaches near empty state. 
+ +When pause slot is disabled, if there is no one to handle receive +packets, device FIFO will reach near full state and keep sending +pause frames. That will impact entire local area network. + +This issue can be reproduced in Chromebox (not Chromebook) in +developer mode running a test image (and v5.10 kernel): +1) ping -f $CHROMEBOX (from workstation on same local network) +2) run "powerd_dbus_suspend" from command line on the $CHROMEBOX +3) ping $ROUTER (wait until ping fails from workstation) + +Takes about ~20-30 seconds after step 2 for the local network to +stop working. + +Fix this issue by enabling pause slot to only send pause frame once +when FIFO reaches near full state. + +Fixes: f1bce4ad2f1c ("r8169: add support for RTL8125") +Reported-by: Grant Grundler +Tested-by: Grant Grundler +Cc: stable@vger.kernel.org +Signed-off-by: ChunHao Lin +Reviewed-by: Jacob Keller +Reviewed-by: Heiner Kallweit +Link: https://lore.kernel.org/r/20231129155350.5843-1-hau@realtek.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/realtek/r8169_main.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +--- a/drivers/net/ethernet/realtek/r8169_main.c ++++ b/drivers/net/ethernet/realtek/r8169_main.c +@@ -193,6 +193,7 @@ enum rtl_registers { + /* No threshold before first PCI xfer */ + #define RX_FIFO_THRESH (7 << RXCFG_FIFO_SHIFT) + #define RX_EARLY_OFF (1 << 11) ++#define RX_PAUSE_SLOT_ON (1 << 11) /* 8125b and later */ + #define RXCFG_DMA_SHIFT 8 + /* Unlimited maximum PCI burst. */ + #define RX_DMA_BURST (7 << RXCFG_DMA_SHIFT) +@@ -2237,9 +2238,13 @@ static void rtl_init_rxcfg(struct rtl816 + case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_53: + RTL_W32(tp, RxConfig, RX128_INT_EN | RX_MULTI_EN | RX_DMA_BURST | RX_EARLY_OFF); + break; +- case RTL_GIGA_MAC_VER_61 ... 
RTL_GIGA_MAC_VER_63: ++ case RTL_GIGA_MAC_VER_61: + RTL_W32(tp, RxConfig, RX_FETCH_DFLT_8125 | RX_DMA_BURST); + break; ++ case RTL_GIGA_MAC_VER_63: ++ RTL_W32(tp, RxConfig, RX_FETCH_DFLT_8125 | RX_DMA_BURST | ++ RX_PAUSE_SLOT_ON); ++ break; + default: + RTL_W32(tp, RxConfig, RX128_INT_EN | RX_DMA_BURST); + break; diff --git a/queue-6.1/regmap-fix-bogus-error-on-regcache_sync-success.patch b/queue-6.1/regmap-fix-bogus-error-on-regcache_sync-success.patch new file mode 100644 index 00000000000..43e7ef81861 --- /dev/null +++ b/queue-6.1/regmap-fix-bogus-error-on-regcache_sync-success.patch @@ -0,0 +1,45 @@ +From fea88064445a59584460f7f67d102b6e5fc1ca1d Mon Sep 17 00:00:00 2001 +From: Matthias Reichl +Date: Sun, 3 Dec 2023 23:22:16 +0100 +Subject: regmap: fix bogus error on regcache_sync success + +From: Matthias Reichl + +commit fea88064445a59584460f7f67d102b6e5fc1ca1d upstream. + +Since commit 0ec7731655de ("regmap: Ensure range selector registers +are updated after cache sync") opening pcm512x based soundcards fail +with EINVAL and dmesg shows sync cache and pm_runtime_get errors: + +[ 228.794676] pcm512x 1-004c: Failed to sync cache: -22 +[ 228.794740] pcm512x 1-004c: ASoC: error at snd_soc_pcm_component_pm_runtime_get on pcm512x.1-004c: -22 + +This is caused by the cache check result leaking out into the +regcache_sync return value. + +Fix this by making the check local-only, as the comment above the +regcache_read call states a non-zero return value means there's +nothing to do so the return value should not be altered. 
+ +Fixes: 0ec7731655de ("regmap: Ensure range selector registers are updated after cache sync") +Cc: stable@vger.kernel.org +Signed-off-by: Matthias Reichl +Link: https://lore.kernel.org/r/20231203222216.96547-1-hias@horus.com +Signed-off-by: Mark Brown +Signed-off-by: Greg Kroah-Hartman +--- + drivers/base/regmap/regcache.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +--- a/drivers/base/regmap/regcache.c ++++ b/drivers/base/regmap/regcache.c +@@ -410,8 +410,7 @@ out: + rb_entry(node, struct regmap_range_node, node); + + /* If there's nothing in the cache there's nothing to sync */ +- ret = regcache_read(map, this->selector_reg, &i); +- if (ret != 0) ++ if (regcache_read(map, this->selector_reg, &i) != 0) + continue; + + ret = _regmap_write(map, this->selector_reg, i); diff --git a/queue-6.1/ring-buffer-test-last-update-in-32bit-version-of-__rb_time_read.patch b/queue-6.1/ring-buffer-test-last-update-in-32bit-version-of-__rb_time_read.patch new file mode 100644 index 00000000000..6fb64e44bd6 --- /dev/null +++ b/queue-6.1/ring-buffer-test-last-update-in-32bit-version-of-__rb_time_read.patch @@ -0,0 +1,50 @@ +From f458a1453424e03462b5bb539673c9a3cddda480 Mon Sep 17 00:00:00 2001 +From: "Steven Rostedt (Google)" +Date: Wed, 6 Dec 2023 10:00:50 -0500 +Subject: ring-buffer: Test last update in 32bit version of __rb_time_read() + +From: Steven Rostedt (Google) + +commit f458a1453424e03462b5bb539673c9a3cddda480 upstream. + +Since 64 bit cmpxchg() is very expensive on 32bit architectures, the +timestamp used by the ring buffer does some interesting tricks to be able +to still have an atomic 64 bit number. It originally just used 60 bits and +broke it up into two 32 bit words where the extra 2 bits were used for +synchronization. But this was not enough for all use cases, and all 64 +bits were required. + +The 32bit version of the ring buffer timestamp was then broken up into 3 +32bit words using the same counter trick. But one update was not done. 
The +check to see if the read operation was done without interruption only +checked the first two words and not last one (like it had before this +update). Fix it by making sure all three updates happen without +interruption by comparing the initial counter with the last updated +counter. + +Link: https://lore.kernel.org/linux-trace-kernel/20231206100050.3100b7bb@gandalf.local.home + +Cc: stable@vger.kernel.org +Cc: Masami Hiramatsu +Cc: Mark Rutland +Cc: Mathieu Desnoyers +Fixes: f03f2abce4f39 ("ring-buffer: Have 32 bit time stamps use all 64 bits") +Signed-off-by: Steven Rostedt (Google) +Signed-off-by: Greg Kroah-Hartman +--- + kernel/trace/ring_buffer.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/kernel/trace/ring_buffer.c ++++ b/kernel/trace/ring_buffer.c +@@ -646,8 +646,8 @@ static inline bool __rb_time_read(rb_tim + + *cnt = rb_time_cnt(top); + +- /* If top and bottom counts don't match, this interrupted a write */ +- if (*cnt != rb_time_cnt(bottom)) ++ /* If top and msb counts don't match, this interrupted a write */ ++ if (*cnt != rb_time_cnt(msb)) + return false; + + /* The shift to msb will lose its cnt bits */ diff --git a/queue-6.1/series b/queue-6.1/series index 928754c7b09..cf26fc5403a 100644 --- a/queue-6.1/series +++ b/queue-6.1/series @@ -108,3 +108,18 @@ alsa-pcm-fix-out-of-bounds-in-snd_pcm_state_names.patch alsa-hda-realtek-enable-headset-on-lenovo-m90-gen5.patch alsa-hda-realtek-add-new-framework-laptop-to-quirks.patch alsa-hda-realtek-add-framework-laptop-16-to-quirks.patch +ring-buffer-test-last-update-in-32bit-version-of-__rb_time_read.patch +nilfs2-fix-missing-error-check-for-sb_set_blocksize-call.patch +nilfs2-prevent-warning-in-nilfs_sufile_set_segment_usage.patch +cgroup_freezer-cgroup_freezing-check-if-not-frozen.patch +checkstack-fix-printed-address.patch +tracing-always-update-snapshot-buffer-size.patch +tracing-disable-snapshot-buffer-when-stopping-instance-tracers.patch 
+tracing-fix-incomplete-locking-when-disabling-buffered-events.patch +tracing-fix-a-possible-race-when-disabling-buffered-events.patch +packet-move-reference-count-in-packet_sock-to-atomic_long_t.patch +r8169-fix-rtl8125b-pause-frames-blasting-when-suspended.patch +regmap-fix-bogus-error-on-regcache_sync-success.patch +platform-surface-aggregator-fix-recv_buf-return-value.patch +hugetlb-fix-null-ptr-deref-in-hugetlb_vma_lock_write.patch +mm-fix-oops-when-filemap_map_pmd-without-prealloc_pte.patch diff --git a/queue-6.1/tracing-always-update-snapshot-buffer-size.patch b/queue-6.1/tracing-always-update-snapshot-buffer-size.patch new file mode 100644 index 00000000000..c97c4c5cd42 --- /dev/null +++ b/queue-6.1/tracing-always-update-snapshot-buffer-size.patch @@ -0,0 +1,83 @@ +From 7be76461f302ec05cbd62b90b2a05c64299ca01f Mon Sep 17 00:00:00 2001 +From: "Steven Rostedt (Google)" +Date: Tue, 5 Dec 2023 16:52:09 -0500 +Subject: tracing: Always update snapshot buffer size + +From: Steven Rostedt (Google) + +commit 7be76461f302ec05cbd62b90b2a05c64299ca01f upstream. + +It used to be that only the top level instance had a snapshot buffer (for +latency tracers like wakeup and irqsoff). The update of the ring buffer +size would check if the instance was the top level and if so, it would +also update the snapshot buffer as it needs to be the same as the main +buffer. + +Now that lower level instances also have a snapshot buffer, they too need +to update their snapshot buffer sizes when the main buffer is changed, +otherwise the following can be triggered: + + # cd /sys/kernel/tracing + # echo 1500 > buffer_size_kb + # mkdir instances/foo + # echo irqsoff > instances/foo/current_tracer + # echo 1000 > instances/foo/buffer_size_kb + +Produces: + + WARNING: CPU: 2 PID: 856 at kernel/trace/trace.c:1938 update_max_tr_single.part.0+0x27d/0x320 + +Which is: + + ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->array_buffer.buffer, cpu); + + if (ret == -EBUSY) { + [..] 
+ } + + WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); <== here + +That's because ring_buffer_swap_cpu() has: + + int ret = -EINVAL; + + [..] + + /* At least make sure the two buffers are somewhat the same */ + if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages) + goto out; + + [..] + out: + return ret; + } + +Instead, update all instances' snapshot buffer sizes when their main +buffer size is updated. + +Link: https://lkml.kernel.org/r/20231205220010.454662151@goodmis.org + +Cc: stable@vger.kernel.org +Cc: Masami Hiramatsu +Cc: Mark Rutland +Cc: Mathieu Desnoyers +Cc: Andrew Morton +Fixes: 6d9b3fa5e7f6 ("tracing: Move tracing_max_latency into trace_array") +Signed-off-by: Steven Rostedt (Google) +Signed-off-by: Greg Kroah-Hartman +--- + kernel/trace/trace.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +--- a/kernel/trace/trace.c ++++ b/kernel/trace/trace.c +@@ -6306,8 +6306,7 @@ static int __tracing_resize_ring_buffer( + return ret; + + #ifdef CONFIG_TRACER_MAX_TRACE +- if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL) || +- !tr->current_trace->use_max_tr) ++ if (!tr->current_trace->use_max_tr) + goto out; + + ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu); diff --git a/queue-6.1/tracing-disable-snapshot-buffer-when-stopping-instance-tracers.patch b/queue-6.1/tracing-disable-snapshot-buffer-when-stopping-instance-tracers.patch new file mode 100644 index 00000000000..31e1582a93e --- /dev/null +++ b/queue-6.1/tracing-disable-snapshot-buffer-when-stopping-instance-tracers.patch @@ -0,0 +1,203 @@ +From b538bf7d0ec11ca49f536dfda742a5f6db90a798 Mon Sep 17 00:00:00 2001 +From: "Steven Rostedt (Google)" +Date: Tue, 5 Dec 2023 16:52:11 -0500 +Subject: tracing: Disable snapshot buffer when stopping instance tracers + +From: Steven Rostedt (Google) + +commit b538bf7d0ec11ca49f536dfda742a5f6db90a798 upstream. + +It used to be that only the top level instance had a snapshot buffer (for +latency tracers like wakeup and irqsoff). 
Stopping a tracer in an +instance would not disable the snapshot buffer. This could have some +unintended consequences if the irqsoff tracer is enabled. + +Consolidate the tracing_start/stop() with tracing_start/stop_tr() so that +all instances behave the same. The tracing_start/stop() functions will +just call their respective tracing_start/stop_tr() with the global_trace +passed in. + +Link: https://lkml.kernel.org/r/20231205220011.041220035@goodmis.org + +Cc: stable@vger.kernel.org +Cc: Masami Hiramatsu +Cc: Mark Rutland +Cc: Mathieu Desnoyers +Cc: Andrew Morton +Fixes: 6d9b3fa5e7f6 ("tracing: Move tracing_max_latency into trace_array") +Signed-off-by: Steven Rostedt (Google) +Signed-off-by: Greg Kroah-Hartman +--- + kernel/trace/trace.c | 110 +++++++++++++++------------------------------------ + 1 file changed, 34 insertions(+), 76 deletions(-) + +--- a/kernel/trace/trace.c ++++ b/kernel/trace/trace.c +@@ -2297,13 +2297,7 @@ int is_tracing_stopped(void) + return global_trace.stop_count; + } + +-/** +- * tracing_start - quick start of the tracer +- * +- * If tracing is enabled but was stopped by tracing_stop, +- * this will start the tracer back up. 
+- */ +-void tracing_start(void) ++static void tracing_start_tr(struct trace_array *tr) + { + struct trace_buffer *buffer; + unsigned long flags; +@@ -2311,119 +2305,83 @@ void tracing_start(void) + if (tracing_disabled) + return; + +- raw_spin_lock_irqsave(&global_trace.start_lock, flags); +- if (--global_trace.stop_count) { +- if (global_trace.stop_count < 0) { ++ raw_spin_lock_irqsave(&tr->start_lock, flags); ++ if (--tr->stop_count) { ++ if (WARN_ON_ONCE(tr->stop_count < 0)) { + /* Someone screwed up their debugging */ +- WARN_ON_ONCE(1); +- global_trace.stop_count = 0; ++ tr->stop_count = 0; + } + goto out; + } + + /* Prevent the buffers from switching */ +- arch_spin_lock(&global_trace.max_lock); ++ arch_spin_lock(&tr->max_lock); + +- buffer = global_trace.array_buffer.buffer; ++ buffer = tr->array_buffer.buffer; + if (buffer) + ring_buffer_record_enable(buffer); + + #ifdef CONFIG_TRACER_MAX_TRACE +- buffer = global_trace.max_buffer.buffer; ++ buffer = tr->max_buffer.buffer; + if (buffer) + ring_buffer_record_enable(buffer); + #endif + +- arch_spin_unlock(&global_trace.max_lock); +- +- out: +- raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); +-} +- +-static void tracing_start_tr(struct trace_array *tr) +-{ +- struct trace_buffer *buffer; +- unsigned long flags; +- +- if (tracing_disabled) +- return; +- +- /* If global, we need to also start the max tracer */ +- if (tr->flags & TRACE_ARRAY_FL_GLOBAL) +- return tracing_start(); +- +- raw_spin_lock_irqsave(&tr->start_lock, flags); +- +- if (--tr->stop_count) { +- if (tr->stop_count < 0) { +- /* Someone screwed up their debugging */ +- WARN_ON_ONCE(1); +- tr->stop_count = 0; +- } +- goto out; +- } +- +- buffer = tr->array_buffer.buffer; +- if (buffer) +- ring_buffer_record_enable(buffer); ++ arch_spin_unlock(&tr->max_lock); + + out: + raw_spin_unlock_irqrestore(&tr->start_lock, flags); + } + + /** +- * tracing_stop - quick stop of the tracer ++ * tracing_start - quick start of the tracer + * +- * 
Light weight way to stop tracing. Use in conjunction with +- * tracing_start. ++ * If tracing is enabled but was stopped by tracing_stop, ++ * this will start the tracer back up. + */ +-void tracing_stop(void) ++void tracing_start(void) ++ ++{ ++ return tracing_start_tr(&global_trace); ++} ++ ++static void tracing_stop_tr(struct trace_array *tr) + { + struct trace_buffer *buffer; + unsigned long flags; + +- raw_spin_lock_irqsave(&global_trace.start_lock, flags); +- if (global_trace.stop_count++) ++ raw_spin_lock_irqsave(&tr->start_lock, flags); ++ if (tr->stop_count++) + goto out; + + /* Prevent the buffers from switching */ +- arch_spin_lock(&global_trace.max_lock); ++ arch_spin_lock(&tr->max_lock); + +- buffer = global_trace.array_buffer.buffer; ++ buffer = tr->array_buffer.buffer; + if (buffer) + ring_buffer_record_disable(buffer); + + #ifdef CONFIG_TRACER_MAX_TRACE +- buffer = global_trace.max_buffer.buffer; ++ buffer = tr->max_buffer.buffer; + if (buffer) + ring_buffer_record_disable(buffer); + #endif + +- arch_spin_unlock(&global_trace.max_lock); ++ arch_spin_unlock(&tr->max_lock); + + out: +- raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); ++ raw_spin_unlock_irqrestore(&tr->start_lock, flags); + } + +-static void tracing_stop_tr(struct trace_array *tr) ++/** ++ * tracing_stop - quick stop of the tracer ++ * ++ * Light weight way to stop tracing. Use in conjunction with ++ * tracing_start. 
++ */ ++void tracing_stop(void) + { +- struct trace_buffer *buffer; +- unsigned long flags; +- +- /* If global, we need to also stop the max tracer */ +- if (tr->flags & TRACE_ARRAY_FL_GLOBAL) +- return tracing_stop(); +- +- raw_spin_lock_irqsave(&tr->start_lock, flags); +- if (tr->stop_count++) +- goto out; +- +- buffer = tr->array_buffer.buffer; +- if (buffer) +- ring_buffer_record_disable(buffer); +- +- out: +- raw_spin_unlock_irqrestore(&tr->start_lock, flags); ++ return tracing_stop_tr(&global_trace); + } + + static int trace_save_cmdline(struct task_struct *tsk) diff --git a/queue-6.1/tracing-fix-a-possible-race-when-disabling-buffered-events.patch b/queue-6.1/tracing-fix-a-possible-race-when-disabling-buffered-events.patch new file mode 100644 index 00000000000..bca3075363f --- /dev/null +++ b/queue-6.1/tracing-fix-a-possible-race-when-disabling-buffered-events.patch @@ -0,0 +1,82 @@ +From c0591b1cccf708a47bc465c62436d669a4213323 Mon Sep 17 00:00:00 2001 +From: Petr Pavlu +Date: Tue, 5 Dec 2023 17:17:36 +0100 +Subject: tracing: Fix a possible race when disabling buffered events + +From: Petr Pavlu + +commit c0591b1cccf708a47bc465c62436d669a4213323 upstream. + +Function trace_buffered_event_disable() is responsible for freeing pages +backing buffered events and this process can run concurrently with +trace_event_buffer_lock_reserve(). + +The following race is currently possible: + +* Function trace_buffered_event_disable() is called on CPU 0. It + increments trace_buffered_event_cnt on each CPU and waits via + synchronize_rcu() for each user of trace_buffered_event to complete. + +* After synchronize_rcu() is finished, function + trace_buffered_event_disable() has the exclusive access to + trace_buffered_event. All counters trace_buffered_event_cnt are at 1 + and all pointers trace_buffered_event are still valid. + +* At this point, on a different CPU 1, the execution reaches + trace_event_buffer_lock_reserve(). 
The function calls + preempt_disable_notrace() and only now enters an RCU read-side + critical section. The function proceeds and reads a still valid + pointer from trace_buffered_event[CPU1] into the local variable + "entry". However, it doesn't yet read trace_buffered_event_cnt[CPU1] + which happens later. + +* Function trace_buffered_event_disable() continues. It frees + trace_buffered_event[CPU1] and decrements + trace_buffered_event_cnt[CPU1] back to 0. + +* Function trace_event_buffer_lock_reserve() continues. It reads and + increments trace_buffered_event_cnt[CPU1] from 0 to 1. This makes it + believe that it can use the "entry" that it already obtained but the + pointer is now invalid and any access results in a use-after-free. + +Fix the problem by making a second synchronize_rcu() call after all +trace_buffered_event values are set to NULL. This waits on all potential +users in trace_event_buffer_lock_reserve() that still read a previous +pointer from trace_buffered_event. + +Link: https://lore.kernel.org/all/20231127151248.7232-2-petr.pavlu@suse.com/ +Link: https://lkml.kernel.org/r/20231205161736.19663-4-petr.pavlu@suse.com + +Cc: stable@vger.kernel.org +Fixes: 0fc1b09ff1ff ("tracing: Use temp buffer when filtering events") +Signed-off-by: Petr Pavlu +Signed-off-by: Steven Rostedt (Google) +Signed-off-by: Greg Kroah-Hartman +--- + kernel/trace/trace.c | 12 ++++++++---- + 1 file changed, 8 insertions(+), 4 deletions(-) + +--- a/kernel/trace/trace.c ++++ b/kernel/trace/trace.c +@@ -2728,13 +2728,17 @@ void trace_buffered_event_disable(void) + free_page((unsigned long)per_cpu(trace_buffered_event, cpu)); + per_cpu(trace_buffered_event, cpu) = NULL; + } ++ + /* +- * Make sure trace_buffered_event is NULL before clearing +- * trace_buffered_event_cnt. 
++ * Wait for all CPUs that potentially started checking if they can use ++ * their event buffer only after the previous synchronize_rcu() call and ++ * they still read a valid pointer from trace_buffered_event. It must be ++ * ensured they don't see cleared trace_buffered_event_cnt else they ++ * could wrongly decide to use the pointed-to buffer which is now freed. + */ +- smp_wmb(); ++ synchronize_rcu(); + +- /* Do the work on each cpu */ ++ /* For each CPU, relinquish the buffer */ + on_each_cpu_mask(tracing_buffer_mask, enable_trace_buffered_event, NULL, + true); + } diff --git a/queue-6.1/tracing-fix-incomplete-locking-when-disabling-buffered-events.patch b/queue-6.1/tracing-fix-incomplete-locking-when-disabling-buffered-events.patch new file mode 100644 index 00000000000..bd206f50d28 --- /dev/null +++ b/queue-6.1/tracing-fix-incomplete-locking-when-disabling-buffered-events.patch @@ -0,0 +1,153 @@ +From 7fed14f7ac9cf5e38c693836fe4a874720141845 Mon Sep 17 00:00:00 2001 +From: Petr Pavlu +Date: Tue, 5 Dec 2023 17:17:34 +0100 +Subject: tracing: Fix incomplete locking when disabling buffered events + +From: Petr Pavlu + +commit 7fed14f7ac9cf5e38c693836fe4a874720141845 upstream. + +The following warning appears when using buffered events: + +[ 203.556451] WARNING: CPU: 53 PID: 10220 at kernel/trace/ring_buffer.c:3912 ring_buffer_discard_commit+0x2eb/0x420 +[...] +[ 203.670690] CPU: 53 PID: 10220 Comm: stress-ng-sysin Tainted: G E 6.7.0-rc2-default #4 56e6d0fcf5581e6e51eaaecbdaec2a2338c80f3a +[ 203.670704] Hardware name: Intel Corp. 
GROVEPORT/GROVEPORT, BIOS GVPRCRB1.86B.0016.D04.1705030402 05/03/2017 +[ 203.670709] RIP: 0010:ring_buffer_discard_commit+0x2eb/0x420 +[ 203.735721] Code: 4c 8b 4a 50 48 8b 42 48 49 39 c1 0f 84 b3 00 00 00 49 83 e8 01 75 b1 48 8b 42 10 f0 ff 40 08 0f 0b e9 fc fe ff ff f0 ff 47 08 <0f> 0b e9 77 fd ff ff 48 8b 42 10 f0 ff 40 08 0f 0b e9 f5 fe ff ff +[ 203.735734] RSP: 0018:ffffb4ae4f7b7d80 EFLAGS: 00010202 +[ 203.735745] RAX: 0000000000000000 RBX: ffffb4ae4f7b7de0 RCX: ffff8ac10662c000 +[ 203.735754] RDX: ffff8ac0c750be00 RSI: ffff8ac10662c000 RDI: ffff8ac0c004d400 +[ 203.781832] RBP: ffff8ac0c039cea0 R08: 0000000000000000 R09: 0000000000000000 +[ 203.781839] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 +[ 203.781842] R13: ffff8ac10662c000 R14: ffff8ac0c004d400 R15: ffff8ac10662c008 +[ 203.781846] FS: 00007f4cd8a67740(0000) GS:ffff8ad798880000(0000) knlGS:0000000000000000 +[ 203.781851] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +[ 203.781855] CR2: 0000559766a74028 CR3: 00000001804c4000 CR4: 00000000001506f0 +[ 203.781862] Call Trace: +[ 203.781870] +[ 203.851949] trace_event_buffer_commit+0x1ea/0x250 +[ 203.851967] trace_event_raw_event_sys_enter+0x83/0xe0 +[ 203.851983] syscall_trace_enter.isra.0+0x182/0x1a0 +[ 203.851990] do_syscall_64+0x3a/0xe0 +[ 203.852075] entry_SYSCALL_64_after_hwframe+0x6e/0x76 +[ 203.852090] RIP: 0033:0x7f4cd870fa77 +[ 203.982920] Code: 00 b8 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 66 90 b8 89 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d e9 43 0e 00 f7 d8 64 89 01 48 +[ 203.982932] RSP: 002b:00007fff99717dd8 EFLAGS: 00000246 ORIG_RAX: 0000000000000089 +[ 203.982942] RAX: ffffffffffffffda RBX: 0000558ea1d7b6f0 RCX: 00007f4cd870fa77 +[ 203.982948] RDX: 0000000000000000 RSI: 00007fff99717de0 RDI: 0000558ea1d7b6f0 +[ 203.982957] RBP: 00007fff99717de0 R08: 00007fff997180e0 R09: 00007fff997180e0 +[ 203.982962] R10: 00007fff997180e0 R11: 0000000000000246 R12: 
00007fff99717f40 +[ 204.049239] R13: 00007fff99718590 R14: 0000558e9f2127a8 R15: 00007fff997180b0 +[ 204.049256] + +For instance, it can be triggered by running these two commands in +parallel: + + $ while true; do + echo hist:key=id.syscall:val=hitcount > \ + /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger; + done + $ stress-ng --sysinfo $(nproc) + +The warning indicates that the current ring_buffer_per_cpu is not in the +committing state. It happens because the active ring_buffer_event +doesn't actually come from the ring_buffer_per_cpu but is allocated from +trace_buffered_event. + +The bug is in function trace_buffered_event_disable() where the +following normally happens: + +* The code invokes disable_trace_buffered_event() via + smp_call_function_many() and follows it by synchronize_rcu(). This + increments the per-CPU variable trace_buffered_event_cnt on each + target CPU and grants trace_buffered_event_disable() the exclusive + access to the per-CPU variable trace_buffered_event. + +* Maintenance is performed on trace_buffered_event, all per-CPU event + buffers get freed. + +* The code invokes enable_trace_buffered_event() via + smp_call_function_many(). This decrements trace_buffered_event_cnt and + releases the access to trace_buffered_event. + +A problem is that smp_call_function_many() runs a given function on all +target CPUs except on the current one. The following can then occur: + +* Task X executing trace_buffered_event_disable() runs on CPU 0. + +* The control reaches synchronize_rcu() and the task gets rescheduled on + another CPU 1. + +* The RCU synchronization finishes. At this point, + trace_buffered_event_disable() has the exclusive access to all + trace_buffered_event variables except trace_buffered_event[CPU0] + because trace_buffered_event_cnt[CPU0] is never incremented and if the + buffer is currently unused, remains set to 0. + +* A different task Y is scheduled on CPU 0 and hits a trace event. 
The + code in trace_event_buffer_lock_reserve() sees that + trace_buffered_event_cnt[CPU0] is set to 0 and decides to use the + buffer provided by trace_buffered_event[CPU0]. + +* Task X continues its execution in trace_buffered_event_disable(). The + code incorrectly frees the event buffer pointed to by + trace_buffered_event[CPU0] and resets the variable to NULL. + +* Task Y writes event data to the now freed buffer and later detects the + created inconsistency. + +The issue is observable since commit dea499781a11 ("tracing: Fix warning +in trace_buffered_event_disable()") which moved the call of +trace_buffered_event_disable() in __ftrace_event_enable_disable() +earlier, prior to invoking call->class->reg(.. TRACE_REG_UNREGISTER ..). +The underlying problem in trace_buffered_event_disable() is however +present since the original implementation in commit 0fc1b09ff1ff +("tracing: Use temp buffer when filtering events"). + +Fix the problem by replacing the two smp_call_function_many() calls with +on_each_cpu_mask() which invokes a given callback on all CPUs. + +Link: https://lore.kernel.org/all/20231127151248.7232-2-petr.pavlu@suse.com/ +Link: https://lkml.kernel.org/r/20231205161736.19663-2-petr.pavlu@suse.com + +Cc: stable@vger.kernel.org +Fixes: 0fc1b09ff1ff ("tracing: Use temp buffer when filtering events") +Fixes: dea499781a11 ("tracing: Fix warning in trace_buffered_event_disable()") +Signed-off-by: Petr Pavlu +Signed-off-by: Steven Rostedt (Google) +Signed-off-by: Greg Kroah-Hartman +--- + kernel/trace/trace.c | 12 ++++-------- + 1 file changed, 4 insertions(+), 8 deletions(-) + +--- a/kernel/trace/trace.c ++++ b/kernel/trace/trace.c +@@ -2717,11 +2717,9 @@ void trace_buffered_event_disable(void) + if (--trace_buffered_event_ref) + return; + +- preempt_disable(); + /* For each CPU, set the buffer as used. 
*/ +- smp_call_function_many(tracing_buffer_mask, +- disable_trace_buffered_event, NULL, 1); +- preempt_enable(); ++ on_each_cpu_mask(tracing_buffer_mask, disable_trace_buffered_event, ++ NULL, true); + + /* Wait for all current users to finish */ + synchronize_rcu(); +@@ -2736,11 +2734,9 @@ void trace_buffered_event_disable(void) + */ + smp_wmb(); + +- preempt_disable(); + /* Do the work on each cpu */ +- smp_call_function_many(tracing_buffer_mask, +- enable_trace_buffered_event, NULL, 1); +- preempt_enable(); ++ on_each_cpu_mask(tracing_buffer_mask, enable_trace_buffered_event, NULL, ++ true); + } + + static struct trace_buffer *temp_buffer; -- 2.47.3