git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
Fixes for 5.10
author Sasha Levin <sashal@kernel.org>
Wed, 17 Apr 2024 17:16:43 +0000 (13:16 -0400)
committer Sasha Levin <sashal@kernel.org>
Wed, 17 Apr 2024 17:16:43 +0000 (13:16 -0400)
Signed-off-by: Sasha Levin <sashal@kernel.org>
queue-5.10/btrfs-record-delayed-inode-root-in-transaction.patch [new file with mode: 0644]
queue-5.10/ring-buffer-only-update-pages_touched-when-a-new-pag.patch [new file with mode: 0644]
queue-5.10/riscv-enable-per-task-stack-canaries.patch [new file with mode: 0644]
queue-5.10/riscv-process-fix-kernel-gp-leakage.patch [new file with mode: 0644]
queue-5.10/series

diff --git a/queue-5.10/btrfs-record-delayed-inode-root-in-transaction.patch b/queue-5.10/btrfs-record-delayed-inode-root-in-transaction.patch
new file mode 100644
index 0000000..c2f547f
--- /dev/null
@@ -0,0 +1,41 @@
+From 689b640210b16cf7e6c56384591e9042d3bd2b23 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 21 Mar 2024 10:14:24 -0700
+Subject: btrfs: record delayed inode root in transaction
+
+From: Boris Burkov <boris@bur.io>
+
+[ Upstream commit 71537e35c324ea6fbd68377a4f26bb93a831ae35 ]
+
+When running delayed inode updates, we do not record the inode's root in
+the transaction, but we do allocate PREALLOC space for it, which is then
+converted to PERTRANS space. To be sure we free that PERTRANS meta rsv,
+we must ensure that we record the root in the transaction.
+
+Fixes: 4f5427ccce5d ("btrfs: delayed-inode: Use new qgroup meta rsv for delayed inode and item")
+CC: stable@vger.kernel.org # 6.1+
+Reviewed-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: Boris Burkov <boris@bur.io>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/delayed-inode.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
+index bcffe7886530a..cdfc791b3c405 100644
+--- a/fs/btrfs/delayed-inode.c
++++ b/fs/btrfs/delayed-inode.c
+@@ -1135,6 +1135,9 @@ __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
+       if (ret)
+               return ret;
++      ret = btrfs_record_root_in_trans(trans, node->root);
++      if (ret)
++              return ret;
+       ret = btrfs_update_delayed_inode(trans, node->root, path, node);
+       return ret;
+ }
+-- 
+2.43.0
+
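For illustration, here is a toy user-space model (not btrfs code; every name
and number below is illustrative) of the reservation lifecycle the changelog
describes: PERTRANS reservations are only released for roots the transaction
has recorded, so converting space for an unrecorded root leaks it.

/* toy model, not kernel code; names are illustrative */
#include <stdio.h>

#define MAX_ROOTS 4

struct root {
	const char *name;
	int pertrans_rsv;	/* bytes converted from PREALLOC to PERTRANS */
	int recorded;		/* is this root known to the transaction? */
};

struct transaction {
	struct root *roots[MAX_ROOTS];
	int nr;
};

static void record_root_in_trans(struct transaction *t, struct root *r)
{
	if (!r->recorded) {
		t->roots[t->nr++] = r;
		r->recorded = 1;
	}
}

/* what the delayed inode update does with the qgroup meta rsv */
static void convert_prealloc_to_pertrans(struct root *r, int bytes)
{
	r->pertrans_rsv += bytes;
}

/* commit only releases PERTRANS rsv for roots recorded in the transaction */
static void commit_transaction(struct transaction *t)
{
	for (int i = 0; i < t->nr; i++)
		t->roots[i]->pertrans_rsv = 0;
}

int main(void)
{
	struct transaction trans = { 0 };
	struct root fixed = { "recorded root", 0, 0 };
	struct root buggy = { "unrecorded root", 0, 0 };

	record_root_in_trans(&trans, &fixed);	/* the call the fix adds */
	convert_prealloc_to_pertrans(&fixed, 4096);
	convert_prealloc_to_pertrans(&buggy, 4096);	/* never recorded */

	commit_transaction(&trans);
	printf("%s leaks %d bytes\n", buggy.name, buggy.pertrans_rsv); /* 4096 */
	printf("%s leaks %d bytes\n", fixed.name, fixed.pertrans_rsv); /* 0 */
	return 0;
}
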
diff --git a/queue-5.10/ring-buffer-only-update-pages_touched-when-a-new-pag.patch b/queue-5.10/ring-buffer-only-update-pages_touched-when-a-new-pag.patch
new file mode 100644
index 0000000..bfe989a
--- /dev/null
@@ -0,0 +1,88 @@
+From a61d018c82ac88217d69f08bfc706e49abde2980 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 9 Apr 2024 15:13:09 -0400
+Subject: ring-buffer: Only update pages_touched when a new page is touched
+
+From: Steven Rostedt (Google) <rostedt@goodmis.org>
+
+[ Upstream commit ffe3986fece696cf65e0ef99e74c75f848be8e30 ]
+
+The "buffer_percent" logic that is used by the ring buffer splice code to
+only wake up the tasks when there's no data after the buffer is filled to
+the percentage of the "buffer_percent" file is dependent on three
+variables that determine the amount of data that is in the ring buffer:
+
+ 1) pages_read - incremented whenever a new sub-buffer is consumed
+ 2) pages_lost - incremented every time a writer overwrites a sub-buffer
+ 3) pages_touched - incremented when a write goes to a new sub-buffer
+
+The percentage is the calculation of:
+
+  (pages_touched - (pages_lost + pages_read)) / nr_pages
+
+Basically, the amount of data is the total number of sub-bufs that have been
+touched, minus the number of sub-bufs lost and sub-bufs consumed. This is
+divided by the total count to give the buffer percentage. When the
+percentage is greater than the value in the "buffer_percent" file, it
+wakes up splice readers waiting for that amount.
+
+It was observed that over time, the amount read from the splice kept
+decreasing the longer the trace was running. That is, if one asked for
+60%, the reader would get over 60% when tracing first started, but would
+later be woken up at under 60%, with the amount of data read after each
+wakeup slowly decreasing until it was much less than the buffer percent.
+
+This was due to incorrect accounting of the pages_touched increment. This
+value is incremented whenever a writer transfers to a new sub-buffer. But
+the place where it was incremented was incorrect. If a writer overflowed
+the current sub-buffer it would go to the next one. If it gets preempted
+by an interrupt at that time, and the interrupt performs a trace, it too
+will end up going to the next sub-buffer. But only one should increment
+the counter. Unfortunately, that was not the case.
+
+Change the cmpxchg() that does the real switch of the tail-page into a
+try_cmpxchg(), and on success, perform the increment of pages_touched. This
+will only increment the counter once, when the writer moves to a new
+sub-buffer, and not twice when a writer and its preempting writer race
+to move to the same new sub-buffer.
+
+Link: https://lore.kernel.org/linux-trace-kernel/20240409151309.0d0e5056@gandalf.local.home
+
+Cc: stable@vger.kernel.org
+Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+Fixes: 2c2b0a78b3739 ("ring-buffer: Add percentage of ring buffer full to wake up reader")
+Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
+Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/trace/ring_buffer.c | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
+index 2df8e13a29e57..f6a2749cfb880 100644
+--- a/kernel/trace/ring_buffer.c
++++ b/kernel/trace/ring_buffer.c
+@@ -1439,7 +1439,6 @@ static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
+       old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write);
+       old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries);
+-      local_inc(&cpu_buffer->pages_touched);
+       /*
+        * Just make sure we have seen our old_write and synchronize
+        * with any interrupts that come in.
+@@ -1476,8 +1475,9 @@ static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
+                */
+               local_set(&next_page->page->commit, 0);
+-              /* Again, either we update tail_page or an interrupt does */
+-              (void)cmpxchg(&cpu_buffer->tail_page, tail_page, next_page);
++              /* Either we update tail_page or an interrupt does */
++              if (try_cmpxchg(&cpu_buffer->tail_page, &tail_page, next_page))
++                      local_inc(&cpu_buffer->pages_touched);
+       }
+ }
+-- 
+2.43.0
+
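For illustration, a user-space sketch of the fix, with C11 atomics standing
in for the kernel's try_cmpxchg() and local_inc() (the names below are
stand-ins, not the kernel's): only the context that wins the compare-and-swap
counts the new sub-buffer, and the changelog's percentage formula is
included as a helper.

#include <stdatomic.h>
#include <stdio.h>

static _Atomic int tail_page;		/* stands in for cpu_buffer->tail_page */
static atomic_int pages_touched;

/* the calculation from the changelog, expressed as a percent */
static int buffer_percent_full(int touched, int lost, int read, int nr_pages)
{
	return 100 * (touched - (lost + read)) / nr_pages;
}

/* A writer and the interrupt that preempted it may both try to move
 * tail_page from the same old page to the same next page; only the
 * context whose compare-and-swap succeeds counts the new page. */
static void move_tail_page(int old_page, int next_page)
{
	int expected = old_page;

	if (atomic_compare_exchange_strong(&tail_page, &expected, next_page))
		atomic_fetch_add(&pages_touched, 1);
	/* on failure, someone else already moved it and counted it */
}

int main(void)
{
	move_tail_page(0, 1);	/* writer wins the swap */
	move_tail_page(0, 1);	/* preempting "interrupt" loses it */
	printf("pages_touched = %d (was 2 before the fix)\n",
	       atomic_load(&pages_touched));
	printf("fullness: %d%%\n",
	       buffer_percent_full(atomic_load(&pages_touched), 0, 0, 4));
	return 0;
}
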
diff --git a/queue-5.10/riscv-enable-per-task-stack-canaries.patch b/queue-5.10/riscv-enable-per-task-stack-canaries.patch
new file mode 100644
index 0000000..f10648d
--- /dev/null
@@ -0,0 +1,166 @@
+From ff916d422bbc9d7bdb7174ee4cd77c609e1fe795 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 17 Dec 2020 16:29:18 +0000
+Subject: riscv: Enable per-task stack canaries
+
+From: Guo Ren <guoren@linux.alibaba.com>
+
+[ Upstream commit fea2fed201ee5647699018a56fbb6a5e8cc053a5 ]
+
+This enables the use of per-task stack canary values if GCC has
+support for emitting the stack canary reference relative to the
+value of tp, which holds the task struct pointer in the riscv
+kernel.
+
+After comparing the arm64 and x86 implementations, arm64's seems more
+flexible and readable. The key point is how gcc gets the offset of
+stack_canary from gs/el0_sp.
+
+x86: Uses a fixed offset from gs, which is not flexible.
+
+struct fixed_percpu_data {
+       /*
+        * GCC hardcodes the stack canary as %gs:40.  Since the
+        * irq_stack is the object at %gs:0, we reserve the bottom
+        * 48 bytes of the irq stack for the canary.
+        */
+       char            gs_base[40]; // :(
+       unsigned long   stack_canary;
+};
+
+arm64: Use -mstack-protector-guard-offset & guard-reg
+       gcc options:
+       -mstack-protector-guard=sysreg
+       -mstack-protector-guard-reg=sp_el0
+       -mstack-protector-guard-offset=xxx
+
+riscv: Use -mstack-protector-guard-offset & guard-reg
+       gcc options:
+       -mstack-protector-guard=tls
+       -mstack-protector-guard-reg=tp
+       -mstack-protector-guard-offset=xxx
+
+ GCC's implementation has been merged:
+ commit c931e8d5a96463427040b0d11f9c4352ac22b2b0
+ Author: Cooper Qu <cooper.qu@linux.alibaba.com>
+ Date:   Mon Jul 13 16:15:08 2020 +0800
+
+     RISC-V: Add support for TLS stack protector canary access
+
+In the end, this code is inserted by gcc before the return:
+
+*  0xffffffe00020b396 <+120>:   ld      a5,1008(tp) # 0x3f0
+*  0xffffffe00020b39a <+124>:   xor     a5,a5,a4
+*  0xffffffe00020b39c <+126>:   mv      a0,s5
+*  0xffffffe00020b39e <+128>:   bnez    a5,0xffffffe00020b61c <_do_fork+766>
+   0xffffffe00020b3a2 <+132>:   ld      ra,136(sp)
+   0xffffffe00020b3a4 <+134>:   ld      s0,128(sp)
+   0xffffffe00020b3a6 <+136>:   ld      s1,120(sp)
+   0xffffffe00020b3a8 <+138>:   ld      s2,112(sp)
+   0xffffffe00020b3aa <+140>:   ld      s3,104(sp)
+   0xffffffe00020b3ac <+142>:   ld      s4,96(sp)
+   0xffffffe00020b3ae <+144>:   ld      s5,88(sp)
+   0xffffffe00020b3b0 <+146>:   ld      s6,80(sp)
+   0xffffffe00020b3b2 <+148>:   ld      s7,72(sp)
+   0xffffffe00020b3b4 <+150>:   addi    sp,sp,144
+   0xffffffe00020b3b6 <+152>:   ret
+   ...
+*  0xffffffe00020b61c <+766>:   auipc   ra,0x7f8
+*  0xffffffe00020b620 <+770>:   jalr    -1764(ra) # 0xffffffe000a02f38 <__stack_chk_fail>
+
+Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
+Signed-off-by: Cooper Qu <cooper.qu@linux.alibaba.com>
+Reviewed-by: Kees Cook <keescook@chromium.org>
+Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>
+Stable-dep-of: d14fa1fcf69d ("riscv: process: Fix kernel gp leakage")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/riscv/Kconfig                      |  7 +++++++
+ arch/riscv/Makefile                     | 10 ++++++++++
+ arch/riscv/include/asm/stackprotector.h |  3 ++-
+ arch/riscv/kernel/asm-offsets.c         |  3 +++
+ arch/riscv/kernel/process.c             |  2 +-
+ 5 files changed, 23 insertions(+), 2 deletions(-)
+
+diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
+index b28fabfc91bf7..0248da3be3e70 100644
+--- a/arch/riscv/Kconfig
++++ b/arch/riscv/Kconfig
+@@ -445,6 +445,13 @@ config EFI
+         allow the kernel to be booted as an EFI application. This
+         is only useful on systems that have UEFI firmware.
++config CC_HAVE_STACKPROTECTOR_TLS
++      def_bool $(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=tp -mstack-protector-guard-offset=0)
++
++config STACKPROTECTOR_PER_TASK
++      def_bool y
++      depends on STACKPROTECTOR && CC_HAVE_STACKPROTECTOR_TLS
++
+ endmenu
+ config BUILTIN_DTB
+diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile
+index daa679440000a..8572d23fba700 100644
+--- a/arch/riscv/Makefile
++++ b/arch/riscv/Makefile
+@@ -88,6 +88,16 @@ KBUILD_AFLAGS_MODULE += $(call as-option,-Wa$(comma)-mno-relax)
+ # architectures.  It's faster to have GCC emit only aligned accesses.
+ KBUILD_CFLAGS += $(call cc-option,-mstrict-align)
++ifeq ($(CONFIG_STACKPROTECTOR_PER_TASK),y)
++prepare: stack_protector_prepare
++stack_protector_prepare: prepare0
++      $(eval KBUILD_CFLAGS += -mstack-protector-guard=tls               \
++                              -mstack-protector-guard-reg=tp            \
++                              -mstack-protector-guard-offset=$(shell    \
++                      awk '{if ($$2 == "TSK_STACK_CANARY") print $$3;}' \
++                                      include/generated/asm-offsets.h))
++endif
++
+ # arch specific predefines for sparse
+ CHECKFLAGS += -D__riscv -D__riscv_xlen=$(BITS)
+diff --git a/arch/riscv/include/asm/stackprotector.h b/arch/riscv/include/asm/stackprotector.h
+index 5962f8891f06f..09093af46565e 100644
+--- a/arch/riscv/include/asm/stackprotector.h
++++ b/arch/riscv/include/asm/stackprotector.h
+@@ -24,6 +24,7 @@ static __always_inline void boot_init_stack_canary(void)
+       canary &= CANARY_MASK;
+       current->stack_canary = canary;
+-      __stack_chk_guard = current->stack_canary;
++      if (!IS_ENABLED(CONFIG_STACKPROTECTOR_PER_TASK))
++              __stack_chk_guard = current->stack_canary;
+ }
+ #endif /* _ASM_RISCV_STACKPROTECTOR_H */
+diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c
+index db203442c08f9..877ff65b4e136 100644
+--- a/arch/riscv/kernel/asm-offsets.c
++++ b/arch/riscv/kernel/asm-offsets.c
+@@ -66,6 +66,9 @@ void asm_offsets(void)
+       OFFSET(TASK_THREAD_F30, task_struct, thread.fstate.f[30]);
+       OFFSET(TASK_THREAD_F31, task_struct, thread.fstate.f[31]);
+       OFFSET(TASK_THREAD_FCSR, task_struct, thread.fstate.fcsr);
++#ifdef CONFIG_STACKPROTECTOR
++      OFFSET(TSK_STACK_CANARY, task_struct, stack_canary);
++#endif
+       DEFINE(PT_SIZE, sizeof(struct pt_regs));
+       OFFSET(PT_EPC, pt_regs, epc);
+diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c
+index 7868050ff426d..d83d7761a157d 100644
+--- a/arch/riscv/kernel/process.c
++++ b/arch/riscv/kernel/process.c
+@@ -24,7 +24,7 @@
+ register unsigned long gp_in_global __asm__("gp");
+-#ifdef CONFIG_STACKPROTECTOR
++#if defined(CONFIG_STACKPROTECTOR) && !defined(CONFIG_STACKPROTECTOR_PER_TASK)
+ #include <linux/stackprotector.h>
+ unsigned long __stack_chk_guard __read_mostly;
+ EXPORT_SYMBOL(__stack_chk_guard);
+-- 
+2.43.0
+
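For illustration, a plain-C model of the check gcc emits under
-mstack-protector-guard=tls (hand-written pseudo-equivalent code, not the
generated instructions; the struct layout is contrived so the guard lands at
offset 1008, matching the ld a5,1008(tp) in the listing above):

#include <stdio.h>
#include <stdlib.h>

struct task {				/* stands in for task_struct */
	char other[1008];		/* filler so the guard sits at 1008 */
	unsigned long stack_canary;	/* TSK_STACK_CANARY in asm-offsets.h */
};

static struct task *tp;			/* stands in for the tp register */

static void my_stack_chk_fail(void)	/* stands in for __stack_chk_fail */
{
	fprintf(stderr, "stack smashing detected\n");
	abort();
}

static void instrumented_function(void)
{
	/* prologue: copy the per-task guard into a stack slot */
	unsigned long canary = tp->stack_canary;	/* ld a5,1008(tp) */

	/* ... body that might overflow a local buffer ... */

	/* epilogue: re-load the guard and compare */
	if (canary != tp->stack_canary)			/* xor; bnez */
		my_stack_chk_fail();
}

int main(void)
{
	struct task t = { .stack_canary = 0xdeadbeefcafef00dUL };

	tp = &t;
	instrumented_function();
	puts("canary intact");
	return 0;
}
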
diff --git a/queue-5.10/riscv-process-fix-kernel-gp-leakage.patch b/queue-5.10/riscv-process-fix-kernel-gp-leakage.patch
new file mode 100644
index 0000000..2abdf36
--- /dev/null
@@ -0,0 +1,85 @@
+From fbb74578fc995c4ce1080f0fdb743e10d8dc1610 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 27 Mar 2024 02:12:58 -0400
+Subject: riscv: process: Fix kernel gp leakage
+
+From: Stefan O'Rear <sorear@fastmail.com>
+
+[ Upstream commit d14fa1fcf69db9d070e75f1c4425211fa619dfc8 ]
+
+childregs represents the registers which are active for the new thread
+in user context. For a kernel thread, childregs->gp is never used since
+the kernel gp is not touched by switch_to. For a user mode helper, the
+gp value can be observed in user space after execve or possibly by other
+means.
+
+[From the email thread]
+
+The /* Kernel thread */ comment is somewhat inaccurate in that it is also used
+for user_mode_helper threads, which exec a user process, e.g. /sbin/init or
+when /proc/sys/kernel/core_pattern is a pipe. Such threads do not have
+PF_KTHREAD set and are valid targets for ptrace etc. even before they exec.
+
+childregs is the *user* context during syscall execution and it is observable
+from userspace in at least five ways:
+
+1. kernel_execve does not currently clear integer registers, so the starting
+   register state for PID 1 and other user processes started by the kernel has
+   sp = user stack, gp = kernel __global_pointer$, all other integer registers
+   zeroed by the memset in the patch comment.
+
+   This is a bug in its own right, but I'm unwilling to bet that it is the only
+   way to exploit the issue addressed by this patch.
+
+2. ptrace(PTRACE_GETREGSET): you can PTRACE_ATTACH to a user_mode_helper thread
+   before it execs, but ptrace requires SIGSTOP to be delivered which can only
+   happen at user/kernel boundaries.
+
+3. /proc/*/task/*/syscall: this is perfectly happy to read pt_regs for
+   user_mode_helpers before the exec completes, but gp is not one of the
+   registers it returns.
+
+4. PERF_SAMPLE_REGS_USER: LOCKDOWN_PERF normally prevents access to kernel
+   addresses via PERF_SAMPLE_REGS_INTR, but due to this bug kernel addresses
+   are also exposed via PERF_SAMPLE_REGS_USER which is permitted under
+   LOCKDOWN_PERF. I have not attempted to write exploit code.
+
+5. Much of the tracing infrastructure allows access to user registers. I have
+   not attempted to determine which forms of tracing allow access to user
+   registers without already allowing access to kernel registers.
+
+Fixes: 7db91e57a0ac ("RISC-V: Task implementation")
+Cc: stable@vger.kernel.org
+Signed-off-by: Stefan O'Rear <sorear@fastmail.com>
+Reviewed-by: Alexandre Ghiti <alexghiti@rivosinc.com>
+Link: https://lore.kernel.org/r/20240327061258.2370291-1-sorear@fastmail.com
+Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/riscv/kernel/process.c | 3 ---
+ 1 file changed, 3 deletions(-)
+
+diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c
+index d83d7761a157d..9dac6bec316e4 100644
+--- a/arch/riscv/kernel/process.c
++++ b/arch/riscv/kernel/process.c
+@@ -22,8 +22,6 @@
+ #include <asm/switch_to.h>
+ #include <asm/thread_info.h>
+-register unsigned long gp_in_global __asm__("gp");
+-
+ #if defined(CONFIG_STACKPROTECTOR) && !defined(CONFIG_STACKPROTECTOR_PER_TASK)
+ #include <linux/stackprotector.h>
+ unsigned long __stack_chk_guard __read_mostly;
+@@ -117,7 +115,6 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg,
+       if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
+               /* Kernel thread */
+               memset(childregs, 0, sizeof(struct pt_regs));
+-              childregs->gp = gp_in_global;
+               /* Supervisor/Machine, irqs on: */
+               childregs->status = SR_PP | SR_PIE;
+-- 
+2.43.0
+
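For illustration, a hypothetical user-space probe for observation path 1
above (riscv64 only; the file name and build flags are assumptions, not from
the patch). gp must be sampled before the C runtime loads __global_pointer$
into it, so the probe supplies its own _start, avoids libc startup, and
disables gp-relative relaxation so it does not itself depend on gp:

/* build (illustrative):
 *   riscv64-linux-gnu-gcc -static -nostartfiles -fno-stack-protector \
 *       -Wl,--no-relax -o gpprobe gpprobe.c
 * then boot with init=/gpprobe; a kernel without this fix should print
 * a kernel address (__global_pointer$), a fixed kernel should print 0 */
#include <unistd.h>

static void print_hex(unsigned long v)
{
	char buf[19];

	buf[0] = '0';
	buf[1] = 'x';
	for (int i = 0; i < 16; i++)
		buf[2 + i] = "0123456789abcdef"[(v >> (60 - 4 * i)) & 0xf];
	buf[18] = '\n';
	write(1, buf, sizeof(buf));
}

void _start(void)
{
	unsigned long gp;

	/* capture gp before any other code can touch it */
	__asm__ volatile ("mv %0, gp" : "=r" (gp));
	print_hex(gp);
	_exit(0);
}
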
diff --git a/queue-5.10/series b/queue-5.10/series
index b8ecb6603be27568c78b136e00399968c834821a..62be3710b37007f9881738813e457b0bff35b14d 100644
--- a/queue-5.10/series
+++ b/queue-5.10/series
@@ -27,3 +27,7 @@ x86-cpu-actually-turn-off-mitigations-by-default-for-speculation_mitigations-n.p
 selftests-timers-fix-abs-warning-in-posix_timers-test.patch
 x86-apic-force-native_apic_mem_read-to-use-the-mov-instruction.patch
 irqflags-explicitly-ignore-lockdep_hrtimer_exit-argument.patch
+btrfs-record-delayed-inode-root-in-transaction.patch
+riscv-enable-per-task-stack-canaries.patch
+riscv-process-fix-kernel-gp-leakage.patch
+ring-buffer-only-update-pages_touched-when-a-new-pag.patch