#ifndef __LINUX_IRQENTRYCOMMON_H
#define __LINUX_IRQENTRYCOMMON_H
+#include <linux/context_tracking.h>
+#include <linux/kmsan.h>
+#include <linux/rseq.h>
#include <linux/static_call_types.h>
#include <linux/syscalls.h>
-#include <linux/context_tracking.h>
#include <linux/tick.h>
-#include <linux/kmsan.h>
#include <linux/unwind_deferred.h>
#include <asm/entry-common.h>
arch_exit_to_user_mode_prepare(regs, ti_work);
+ rseq_exit_to_user_mode();
+
/* Ensure that kernel state is sane for a return to userspace */
kmap_assert_nomap();
lockdep_assert_irqs_disabled();
return true;
}
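
The rseq_exit_to_user_mode() hook invoked above is not defined in this hunk. A minimal sketch of its plausible shape, assuming it is a CONFIG_DEBUG_RSEQ-only sanity check that the event mask was fully consumed before the return to user space (the real helper lives in <linux/rseq.h>):

	static inline void rseq_exit_to_user_mode(void)
	{
		/* Debug kernels: no rseq event may survive the exit path. */
		if (IS_ENABLED(CONFIG_DEBUG_RSEQ))
			WARN_ON_ONCE(current->rseq && current->rseq_event_mask);
	}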
-static int rseq_need_restart(struct task_struct *t, u32 cs_flags)
+static int rseq_check_flags(struct task_struct *t, u32 cs_flags)
{
- u32 flags, event_mask;
+ u32 flags;
int ret;
if (rseq_warn_flags("rseq_cs", cs_flags))
if (rseq_warn_flags("rseq", flags))
return -EINVAL;
-
- /*
- * Load and clear event mask atomically with respect to
- * scheduler preemption and membarrier IPIs.
- */
- scoped_guard(RSEQ_EVENT_GUARD) {
- event_mask = t->rseq_event_mask;
- t->rseq_event_mask = 0;
- }
-
- return !!event_mask;
+ return 0;
}
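
With the event mask handling gone, the renamed rseq_check_flags() becomes a pure validation helper: 0 on success, -EINVAL on bad flags. For reference, a reconstruction of the resulting function; the get_user() load of the thread's flags word between the two checks is elided by the trimmed hunks and assumed unchanged:

	static int rseq_check_flags(struct task_struct *t, u32 cs_flags)
	{
		u32 flags;
		int ret;

		if (rseq_warn_flags("rseq_cs", cs_flags))
			return -EINVAL;

		/* Load the thread's flags word from the rseq user area. */
		ret = get_user(flags, &t->rseq->flags);
		if (ret)
			return ret;

		if (rseq_warn_flags("rseq", flags))
			return -EINVAL;

		return 0;
	}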
static int clear_rseq_cs(struct rseq __user *rseq)
return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset;
}
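
The unchanged in_rseq_cs() check above folds both range bounds into a single unsigned compare: ip - start_ip wraps to a huge value when ip lies below start_ip. A standalone illustration with made-up numbers, not kernel code:

	#include <assert.h>
	#include <stdbool.h>

	static bool in_range(unsigned long ip, unsigned long start_ip,
			     unsigned long post_commit_offset)
	{
		/* Unsigned wrap-around makes one compare cover both bounds. */
		return ip - start_ip < post_commit_offset;
	}

	int main(void)
	{
		assert(in_range(0x1020, 0x1000, 0x40));  /* inside the section  */
		assert(!in_range(0x0ff0, 0x1000, 0x40)); /* below start: wraps  */
		assert(!in_range(0x1040, 0x1000, 0x40)); /* at post-commit: out */
		return 0;
	}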
-static int rseq_ip_fixup(struct pt_regs *regs)
+static int rseq_ip_fixup(struct pt_regs *regs, bool abort)
{
unsigned long ip = instruction_pointer(regs);
struct task_struct *t = current;
*/
if (!in_rseq_cs(ip, &rseq_cs))
return clear_rseq_cs(t->rseq);
- ret = rseq_need_restart(t, rseq_cs.flags);
- if (ret <= 0)
+ ret = rseq_check_flags(t, rseq_cs.flags);
+ if (ret < 0)
return ret;
+ if (!abort)
+ return 0;
ret = clear_rseq_cs(t->rseq);
if (ret)
return ret;
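
The new @abort argument moves the restart decision to the caller: the flags are validated on every pass, but rseq_cs is cleared and the instruction pointer moved only when an event was actually pending. The hunk is trimmed before the abort path completes; the remainder, assumed unchanged from mainline rseq.c, rewrites the IP to the abort handler:

	/* Sketch of the unchanged function tail. */
	trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset,
			    rseq_cs.abort_ip);
	instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip);
	return 0;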
return;
/*
- * regs is NULL if and only if the caller is in a syscall path. Skip
- * fixup and leave rseq_cs as is so that rseq_sycall() will detect and
- * kill a misbehaving userspace on debug kernels.
+ * If invoked from hypervisors or io_uring, then @regs is a NULL
+ * pointer, so no fixup can be done. If the syscall which led to
+ * this invocation was issued inside a critical section, the task
+ * either ends up in this code again, or the violation of issuing
+ * a syscall inside a critical section can only be detected by the
+ * debug code in rseq_syscall() on a debug enabled kernel.
*/
if (regs) {
- ret = rseq_ip_fixup(regs);
- if (unlikely(ret < 0))
- goto error;
+ /*
+ * Read and clear the event mask first. If the task was neither
+ * preempted nor migrated and no signal is on the way, there
+ * is no point in doing any of the heavy lifting here on
+ * production kernels. In that case TIF_NOTIFY_RESUME was
+ * raised by some other functionality.
+ *
+ * This is correct because the read/clear operation is
+ * guarded against scheduler preemption, which makes it a
+ * CPU-local atomic operation. If the task is preempted right
+ * after the guard re-enables preemption, then TIF_NOTIFY_RESUME
+ * is set again and this function is invoked another time
+ * _before_ the task can return to user mode.
+ *
+ * On a debug kernel, invoke the fixup code unconditionally
+ * with the result handed in to allow the detection of
+ * inconsistencies.
+ */
+ u32 event_mask;
+
+ scoped_guard(RSEQ_EVENT_GUARD) {
+ event_mask = t->rseq_event_mask;
+ t->rseq_event_mask = 0;
+ }
+
+ if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event_mask) {
+ ret = rseq_ip_fixup(regs, !!event_mask);
+ if (unlikely(ret < 0))
+ goto error;
+ }
}
if (unlikely(rseq_update_cpu_node_id(t)))
goto error;
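
The comment block above leans on RSEQ_EVENT_GUARD making the read/clear of rseq_event_mask CPU-local atomic. For context, a reconstruction of the guard's definition from kernel/rseq.c; membarrier expedited IPIs can raise events from hard interrupt context, so interrupts must be disabled in that configuration:

	#ifdef CONFIG_MEMBARRIER
	# define RSEQ_EVENT_GUARD	irq
	#else
	# define RSEQ_EVENT_GUARD	preempt
	#endif

With that, scoped_guard(RSEQ_EVENT_GUARD) opens an interrupt-disabled or preemption-disabled scope, which is what makes the event mask snapshot race-free against the scheduler and membarrier.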