From 6b4301bbcd254ed520ffa481100cefbe279e19bc Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Wed, 12 Jun 2024 15:56:50 +0200
Subject: [PATCH] 5.4-stable patches

added patches:
	x86-mm-remove-broken-vsyscall-emulation-code-from-the-page-fault-code.patch
---
 queue-5.4/series                              |   1 +
 ...lation-code-from-the-page-fault-code.patch | 225 ++++++++++++++++++
 2 files changed, 226 insertions(+)
 create mode 100644 queue-5.4/x86-mm-remove-broken-vsyscall-emulation-code-from-the-page-fault-code.patch

diff --git a/queue-5.4/series b/queue-5.4/series
index 69d098e4e96..9dd5437e457 100644
--- a/queue-5.4/series
+++ b/queue-5.4/series
@@ -168,3 +168,4 @@ io_uring-fail-nop-if-non-zero-op-flags-is-passed-in.patch
 afs-don-t-cross-.backup-mountpoint-from-backup-volume.patch
 nilfs2-fix-use-after-free-of-timer-for-log-writer-thread.patch
 vxlan-fix-regression-when-dropping-packets-due-to-invalid-src-addresses.patch
+x86-mm-remove-broken-vsyscall-emulation-code-from-the-page-fault-code.patch
diff --git a/queue-5.4/x86-mm-remove-broken-vsyscall-emulation-code-from-the-page-fault-code.patch b/queue-5.4/x86-mm-remove-broken-vsyscall-emulation-code-from-the-page-fault-code.patch
new file mode 100644
index 00000000000..b3ab6fe7fe0
--- /dev/null
+++ b/queue-5.4/x86-mm-remove-broken-vsyscall-emulation-code-from-the-page-fault-code.patch
@@ -0,0 +1,225 @@
+From 02b670c1f88e78f42a6c5aee155c7b26960ca054 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Mon, 29 Apr 2024 10:00:51 +0200
+Subject: x86/mm: Remove broken vsyscall emulation code from the page fault code
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit 02b670c1f88e78f42a6c5aee155c7b26960ca054 upstream.
+
+The syzbot-reported stack trace from hell in this discussion thread
+actually has three nested page faults:
+
+  https://lore.kernel.org/r/000000000000d5f4fc0616e816d4@google.com
+
+... and I think that's actually the important thing here:
+
+ - the first page fault is from user space, and triggers the vsyscall
+   emulation.
+
+ - the second page fault is from __do_sys_gettimeofday(), and that should
+   just have caused the exception that then sets the return value to
+   -EFAULT
+
+ - the third nested page fault is due to _raw_spin_unlock_irqrestore() ->
+   preempt_schedule() -> trace_sched_switch(), which then causes a BPF
+   trace program to run, which does that bpf_probe_read_compat(), which
+   causes that page fault under pagefault_disable().
+
+It's quite the nasty backtrace, and there's a lot going on.
+
+The problem is literally the vsyscall emulation, which sets
+
+        current->thread.sig_on_uaccess_err = 1;
+
+and that causes the fixup_exception() code to send the signal *despite* the
+exception being caught.
+
+And I think that is in fact completely bogus.  It's completely bogus
+exactly because it sends that signal even when it *shouldn't* be sent -
+like for the BPF user mode trace gathering.
+
+In other words, I think the whole "sig_on_uaccess_err" thing is entirely
+broken, because it makes any nested page-faults do all the wrong things.
+
+Now, arguably, I don't think anybody should enable vsyscall emulation any
+more, but this test case clearly does.
+
+I think we should just make the "send SIGSEGV" be something that the
+vsyscall emulation does on its own, not this broken per-thread state for
+something that isn't actually per thread.
+
+The x86 page fault code actually tried to deal with the "incorrect nesting"
+by having that:
+
+                if (in_interrupt())
+                        return;
+
+which ignores the sig_on_uaccess_err case when it happens in interrupts,
+but as shown by this example, these nested page faults do not need to be
+about interrupts at all.
+
+IOW, I think the only right thing is to remove that horrendously broken
+code.
+
+The attached patch looks like the ObviouslyCorrect(tm) thing to do.
+
+NOTE! This broken code goes back to this commit in 2011:
+
+  4fc3490114bb ("x86-64: Set siginfo and context on vsyscall emulation faults")
+
+... and back then the reason was to get all the siginfo details right.
+Honestly, I do not for a moment believe that it's worth getting the siginfo
+details right here, but part of the commit says:
+
+    This fixes issues with UML when vsyscall=emulate.
+
+... and so my patch to remove this garbage will probably break UML in this
+situation.
+
+I do not believe that anybody should be running with vsyscall=emulate in
+2024 in the first place, much less if you are doing things like UML. But
+let's see if somebody screams.
+
+Reported-and-tested-by: syzbot+83e7f982ca045ab4405c@syzkaller.appspotmail.com
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Tested-by: Jiri Olsa <jolsa@kernel.org>
+Acked-by: Andy Lutomirski <luto@kernel.org>
+Link: https://lore.kernel.org/r/CAHk-=wh9D6f7HUkDgZHKmDCHUQmp+Co89GP+b8+z+G56BKeyNg@mail.gmail.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+[gpiccoli: Backport the patch due to differences in the trees. The main changes
+ between 5.4.y and 5.15.y are due to renaming the fixup function, by
+ commit 6456a2a69ee1 ("x86/fault: Rename no_context() to kernelmode_fixup_or_oops()"),
+ and on processor.h thread_struct due to commit cf122cfba5b1 ("kill uaccess_try()").
+ Following 2 commits cause divergence in the diffs too (in the removed lines):
+ cd072dab453a ("x86/fault: Add a helper function to sanitize error code")
+ d4ffd5df9d18 ("x86/fault: Fix wrong signal when vsyscall fails with pkey").]
+Signed-off-by: Guilherme G. Piccoli <gpiccoli@igalia.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/entry/vsyscall/vsyscall_64.c |   28 ++--------------------------
+ arch/x86/include/asm/processor.h      |    1 -
+ arch/x86/mm/fault.c                   |   27 +--------------------------
+ 3 files changed, 3 insertions(+), 53 deletions(-)
+
+--- a/arch/x86/entry/vsyscall/vsyscall_64.c
++++ b/arch/x86/entry/vsyscall/vsyscall_64.c
+@@ -98,11 +98,6 @@ static int addr_to_vsyscall_nr(unsigned
+ 
+ static bool write_ok_or_segv(unsigned long ptr, size_t size)
+ {
+-	/*
+-	 * XXX: if access_ok, get_user, and put_user handled
+-	 * sig_on_uaccess_err, this could go away.
+-	 */
+-
+ 	if (!access_ok((void __user *)ptr, size)) {
+ 		struct thread_struct *thread = &current->thread;
+ 
+@@ -120,10 +115,8 @@ static bool write_ok_or_segv(unsigned lo
+ bool emulate_vsyscall(unsigned long error_code,
+ 		      struct pt_regs *regs, unsigned long address)
+ {
+-	struct task_struct *tsk;
+ 	unsigned long caller;
+ 	int vsyscall_nr, syscall_nr, tmp;
+-	int prev_sig_on_uaccess_err;
+ 	long ret;
+ 	unsigned long orig_dx;
+ 
+@@ -172,8 +165,6 @@ bool emulate_vsyscall(unsigned long erro
+ 		goto sigsegv;
+ 	}
+ 
+-	tsk = current;
+-
+ 	/*
+ 	 * Check for access_ok violations and find the syscall nr.
+ 	 *
+@@ -233,12 +224,8 @@ bool emulate_vsyscall(unsigned long erro
+ 		goto do_ret;  /* skip requested */
+ 
+ 	/*
+-	 * With a real vsyscall, page faults cause SIGSEGV.  We want to
+-	 * preserve that behavior to make writing exploits harder.
++	 * With a real vsyscall, page faults cause SIGSEGV.
+ 	 */
+-	prev_sig_on_uaccess_err = current->thread.sig_on_uaccess_err;
+-	current->thread.sig_on_uaccess_err = 1;
+-
+ 	ret = -EFAULT;
+ 	switch (vsyscall_nr) {
+ 	case 0:
+@@ -261,23 +248,12 @@ bool emulate_vsyscall(unsigned long erro
+ 		break;
+ 	}
+ 
+-	current->thread.sig_on_uaccess_err = prev_sig_on_uaccess_err;
+-
+ check_fault:
+ 	if (ret == -EFAULT) {
+ 		/* Bad news -- userspace fed a bad pointer to a vsyscall. */
+ 		warn_bad_vsyscall(KERN_INFO, regs,
+ 				  "vsyscall fault (exploit attempt?)");
+-
+-		/*
+-		 * If we failed to generate a signal for any reason,
+-		 * generate one here.  (This should be impossible.)
+-		 */
+-		if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) &&
+-				 !sigismember(&tsk->pending.signal, SIGSEGV)))
+-			goto sigsegv;
+-
+-		return true;  /* Don't emulate the ret. */
++		goto sigsegv;
+ 	}
+ 
+ 	regs->ax = ret;
+--- a/arch/x86/include/asm/processor.h
++++ b/arch/x86/include/asm/processor.h
+@@ -487,7 +487,6 @@ struct thread_struct {
+ 
+ 	mm_segment_t		addr_limit;
+ 
+-	unsigned int		sig_on_uaccess_err:1;
+ 	unsigned int		uaccess_err:1;	/* uaccess failed */
+ 
+ 	/* Floating point and extended processor state */
+--- a/arch/x86/mm/fault.c
++++ b/arch/x86/mm/fault.c
+@@ -743,33 +743,8 @@ no_context(struct pt_regs *regs, unsigne
+ 	}
+ 
+ 	/* Are we prepared to handle this kernel fault? */
+-	if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
+-		/*
+-		 * Any interrupt that takes a fault gets the fixup. This makes
+-		 * the below recursive fault logic only apply to a faults from
+-		 * task context.
+-		 */
+-		if (in_interrupt())
+-			return;
+-
+-		/*
+-		 * Per the above we're !in_interrupt(), aka. task context.
+-		 *
+-		 * In this case we need to make sure we're not recursively
+-		 * faulting through the emulate_vsyscall() logic.
+-		 */
+-		if (current->thread.sig_on_uaccess_err && signal) {
+-			set_signal_archinfo(address, error_code);
+-
+-			/* XXX: hwpoison faults will set the wrong code. */
+-			force_sig_fault(signal, si_code, (void __user *)address);
+-		}
+-
+-		/*
+-		 * Barring that, we can do the fixup and be happy.
+-		 */
++	if (fixup_exception(regs, X86_TRAP_PF, error_code, address))
+ 		return;
+-	}
+ 
+ #ifdef CONFIG_VMAP_STACK
+ 	/*
-- 
2.47.3