--- /dev/null
+From 81065b35e2486c024c7aa86caed452e1f01a59d4 Mon Sep 17 00:00:00 2001
+From: Tony Luck <tony.luck@intel.com>
+Date: Mon, 13 Sep 2021 14:52:39 -0700
+Subject: x86/mce: Avoid infinite loop for copy from user recovery
+
+From: Tony Luck <tony.luck@intel.com>
+
+commit 81065b35e2486c024c7aa86caed452e1f01a59d4 upstream.
+
+There are two cases for machine check recovery:
+
+1) The machine check was triggered by ring3 (application) code.
+ This is the simpler case. The machine check handler simply queues
+ work to be executed on return to user. That code unmaps the page
+ from all users and arranges to send a SIGBUS to the task that
+ triggered the poison.
+
+2) The machine check was triggered in kernel code that is covered by
+ an exception table entry. In this case the machine check handler
+ still queues a work entry to unmap the page, etc., but this work
+ will not run right away because the #MC handler returns to the
+ fixup code address in the exception table entry.
+
+Problems occur if the kernel triggers another machine check before the
+return-to-user path processes the first queued work item.
+
+Specifically, the work is queued using the ->mce_kill_me callback
+structure in the task struct for the current thread. Attempting to queue
+a second work item using this same callback results in a loop in the
+linked list of work functions to call. So when the kernel does return to
+user, it enters an infinite loop, processing the same entry forever.
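+
+To illustrate the failure mode, here is a minimal userspace sketch (an
+analogy, not the kernel's actual task_work implementation) showing how
+adding the same node twice to a singly linked work list turns it into a
+one-node cycle:
+
+	#include <stdio.h>
+
+	struct callback_head {
+		struct callback_head *next;
+		void (*func)(struct callback_head *);
+	};
+
+	/* stand-in for the per-task work list head */
+	static struct callback_head *works;
+
+	static void work_add(struct callback_head *cb)
+	{
+		cb->next = works;	/* 2nd add of the same cb: cb->next = cb */
+		works = cb;
+	}
+
+	static void handler(struct callback_head *cb)
+	{
+		printf("processing %p\n", (void *)cb);
+	}
+
+	int main(void)
+	{
+		static struct callback_head mce_kill_me = { .func = handler };
+
+		work_add(&mce_kill_me);
+		work_add(&mce_kill_me);	/* list is now a one-node cycle */
+
+		/* a task_work_run()-style drain never terminates */
+		for (struct callback_head *cb = works; cb; cb = cb->next)
+			cb->func(cb);
+
+		return 0;
+	}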
+
+There are some legitimate scenarios where the kernel may take a second
+machine check before returning to the user.
+
+1) Some code (e.g. futex) first tries a get_user() with page faults
+ disabled. If this fails, the code retries with page faults enabled,
+ expecting that this will resolve the page fault (see the sketch after
+ this list).
+
+2) Copy-from-user code retries the copy in byte-at-a-time mode to check
+ whether any additional bytes can be copied.
+
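+As a sketch of scenario 1 (compressed into one function for
+illustration; read_user_value() is a made-up name, not the real futex
+code, though get_user(), __get_user() and pagefault_disable()/
+pagefault_enable() are the usual <linux/uaccess.h> helpers):
+
+	static int read_user_value(u32 __user *uaddr, u32 *val)
+	{
+		int ret;
+
+		pagefault_disable();		/* atomic context: no fault handling */
+		ret = __get_user(*val, uaddr);	/* poison here: #MC #1 queues work */
+		pagefault_enable();
+
+		if (!ret)
+			return 0;
+
+		/*
+		 * Retry, expecting a page fault to fix things up. If the
+		 * page is poisoned, this is #MC #2 at the same address,
+		 * taken before the work queued by #MC #1 has run.
+		 */
+		return get_user(*val, uaddr);
+	}
+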
+On the other side of the fence are some bad drivers that do not check
+the return value from individual get_user() calls and may access
+multiple user addresses without noticing that some or all of the calls
+have failed.
+
+Fix by adding a counter (current->mce_count) to keep track of repeated
+machine checks before the queued task work runs. The first machine check
+saves the address information and calls task_work_add(). Subsequent
+machine checks taken before that callback executes check that the
+address is in the same page as that of the first machine check (since
+the callback will offline exactly one page).
+
+The expected worst case is four machine checks before moving on (e.g.
+one user access with page faults disabled, then a repeat to the same
+address with page faults enabled, and then that pair again for the
+byte-at-a-time copy of the tail). Just in case there is some code that
+loops forever, enforce a limit of 10.
+
+ [ bp: Massage commit message, drop noinstr, fix typo, extend panic
+ messages. ]
+
+Fixes: 5567d11c21a1 ("x86/mce: Send #MC singal from task work")
+Signed-off-by: Tony Luck <tony.luck@intel.com>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+Cc: <stable@vger.kernel.org>
+Link: https://lkml.kernel.org/r/YT/IJ9ziLqmtqEPu@agluck-desk2.amr.corp.intel.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kernel/cpu/mce/core.c |   45 ++++++++++++++++++++++++++++++-----------
+ include/linux/sched.h          |    1 +
+ 2 files changed, 34 insertions(+), 12 deletions(-)
+
+--- a/arch/x86/kernel/cpu/mce/core.c
++++ b/arch/x86/kernel/cpu/mce/core.c
+@@ -1241,6 +1241,9 @@ static void __mc_scan_banks(struct mce *
+ 
+ static void kill_me_now(struct callback_head *ch)
+ {
++	struct task_struct *p = container_of(ch, struct task_struct, mce_kill_me);
++
++	p->mce_count = 0;
+ 	force_sig(SIGBUS);
+ }
+ 
+@@ -1249,6 +1252,7 @@ static void kill_me_maybe(struct callbac
+ 	struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
+ 	int flags = MF_ACTION_REQUIRED;
+ 
++	p->mce_count = 0;
+ 	pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr);
+ 
+ 	if (!p->mce_ripv)
+@@ -1269,17 +1273,34 @@ static void kill_me_maybe(struct callbac
+ 	}
+ }
+ 
+-static void queue_task_work(struct mce *m, int kill_it)
++static void queue_task_work(struct mce *m, char *msg, int kill_current_task)
+ {
+-	current->mce_addr = m->addr;
+-	current->mce_kflags = m->kflags;
+-	current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV);
+-	current->mce_whole_page = whole_page(m);
+-
+-	if (kill_it)
+-		current->mce_kill_me.func = kill_me_now;
+-	else
+-		current->mce_kill_me.func = kill_me_maybe;
++	int count = ++current->mce_count;
++
++	/* First call, save all the details */
++	if (count == 1) {
++		current->mce_addr = m->addr;
++		current->mce_kflags = m->kflags;
++		current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV);
++		current->mce_whole_page = whole_page(m);
++
++		if (kill_current_task)
++			current->mce_kill_me.func = kill_me_now;
++		else
++			current->mce_kill_me.func = kill_me_maybe;
++	}
++
++	/* Ten is likely overkill. Don't expect more than two faults before task_work() */
++	if (count > 10)
++		mce_panic("Too many consecutive machine checks while accessing user data", m, msg);
++
++	/* Second or later call, make sure page address matches the one from first call */
++	if (count > 1 && (current->mce_addr >> PAGE_SHIFT) != (m->addr >> PAGE_SHIFT))
++		mce_panic("Consecutive machine checks to different user pages", m, msg);
++
++	/* Do not call task_work_add() more than once */
++	if (count > 1)
++		return;
+ 
+ 	task_work_add(current, &current->mce_kill_me, TWA_RESUME);
+ }
+@@ -1427,7 +1448,7 @@ noinstr void do_machine_check(struct pt_
+ 		/* If this triggers there is no way to recover. Die hard. */
+ 		BUG_ON(!on_thread_stack() || !user_mode(regs));
+ 
+-		queue_task_work(&m, kill_it);
++		queue_task_work(&m, msg, kill_it);
+ 
+ 	} else {
+ 		/*
+@@ -1445,7 +1466,7 @@ noinstr void do_machine_check(struct pt_
+ 		}
+ 
+ 		if (m.kflags & MCE_IN_KERNEL_COPYIN)
+-			queue_task_work(&m, kill_it);
++			queue_task_work(&m, msg, kill_it);
+ 	}
+ out:
+ 	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -1354,6 +1354,7 @@ struct task_struct {
+ 					mce_whole_page : 1,
+ 					__mce_reserved : 62;
+ 	struct callback_head		mce_kill_me;
++	int				mce_count;
+ #endif
+ 
+ 	/*