#ifdef CONFIG_RSEQ
#include <linux/jump_label.h>
#include <linux/rseq.h>
+#include <linux/uaccess.h>
#include <linux/tracepoint-defs.h>
DECLARE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);
+#ifdef RSEQ_BUILD_SLOW_PATH
+#define rseq_inline
+#else
+#define rseq_inline __always_inline
+#endif
+
+bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr);
+
static __always_inline void rseq_note_user_irq_entry(void)
{
if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY))
current->rseq.event.user_irq = true;
}
+/*
+ * Check whether there is a valid critical section and whether the
+ * instruction pointer in @regs is inside the critical section.
+ *
+ * - If the critical section is invalid, terminate the task.
+ *
+ * - If valid and the instruction pointer is inside, set it to the abort IP.
+ *
+ * - If valid and the instruction pointer is outside, clear the critical
+ * section address.
+ *
+ * Returns true if the section was valid and either the fixup or the clear
+ * was done, false otherwise.
+ *
+ * In the failure case, task::rseq_event::fatal is set when an invalid
+ * section was found; it stays false when the failure was an unresolved
+ * page fault.
+ *
+ * If inlined into the exit to user path with interrupts disabled, the
+ * caller has to protect against page faults with pagefault_disable().
+ *
+ * In preemptible task context this would be counterproductive, as the page
+ * faults could not be fully resolved. As a consequence, unresolved page
+ * faults in task context are fatal too.
+ */
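
For reference, the object these checks operate on is the UAPI struct rseq_cs. Below is a minimal user-space sketch of the layout and of the signature rule enforced further down; the struct and field layout follow <linux/rseq.h>, while the helper and its name are purely illustrative:

#include <stdint.h>

/* Mirrors the UAPI struct rseq_cs layout (32-byte aligned) */
struct rseq_cs_sketch {
	uint32_t version;		/* must be 0 */
	uint32_t flags;			/* must be 0 */
	uint64_t start_ip;		/* first instruction of the section */
	uint64_t post_commit_offset;	/* section length in bytes */
	uint64_t abort_ip;		/* must lie outside the section */
};

/* The four bytes preceding abort_ip must contain the signature that
 * was registered via rseq(2); anything else terminates the task. */
static int abort_target_signed(uint64_t abort_ip, uint32_t sig)
{
	return *(const uint32_t *)(uintptr_t)(abort_ip - sizeof(uint32_t)) == sig;
}
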
+
+#ifdef RSEQ_BUILD_SLOW_PATH
+/*
+ * The debug version is put out of line, but kept here so the code stays
+ * together.
+ *
+ * @csaddr has already been checked by the caller to be in user space
+ */
+bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs,
+ unsigned long csaddr)
+{
+ struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr;
+ u64 start_ip, abort_ip, offset, cs_end, head, tasksize = TASK_SIZE;
+ unsigned long ip = instruction_pointer(regs);
+ u64 __user *uc_head = (u64 __user *) ucs;
+ u32 usig, __user *uc_sig;
+
+ scoped_user_rw_access(ucs, efault) {
+ /*
+ * Evaluate the user pile and exit if one of the conditions
+ * is not fulfilled.
+ */
+ unsafe_get_user(start_ip, &ucs->start_ip, efault);
+ if (unlikely(start_ip >= tasksize))
+ goto die;
+ /* If outside, just clear the critical section. */
+ if (ip < start_ip)
+ goto clear;
+
+ unsafe_get_user(offset, &ucs->post_commit_offset, efault);
+ cs_end = start_ip + offset;
+ /* Check for overflow and wraparound */
+ if (unlikely(cs_end >= tasksize || cs_end < start_ip))
+ goto die;
+
+ /* If not inside, clear it. */
+ if (ip >= cs_end)
+ goto clear;
+
+ unsafe_get_user(abort_ip, &ucs->abort_ip, efault);
+ /* Ensure it's "valid" */
+ if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig)))
+ goto die;
+ /* Validate that the abort IP is not in the critical section */
+ if (unlikely(abort_ip - start_ip < offset))
+ goto die;
+
+ /*
+ * Check version and flags for 0. No point in emitting
+ * deprecated warnings before dying. That could be done in
+ * the slow path eventually, but *shrug*.
+ */
+ unsafe_get_user(head, uc_head, efault);
+ if (unlikely(head))
+ goto die;
+
+ /* abort_ip - 4 is >= 0. See abort_ip check above */
+ uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig));
+ unsafe_get_user(usig, uc_sig, efault);
+ if (unlikely(usig != t->rseq.sig))
+ goto die;
+
+ /* rseq_event.user_irq is only valid if CONFIG_GENERIC_IRQ_ENTRY=y */
+ if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
+ /* If not in interrupt from user context, let it die */
+ if (unlikely(!t->rseq.event.user_irq))
+ goto die;
+ }
+ unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
+ instruction_pointer_set(regs, (unsigned long)abort_ip);
+ rseq_stat_inc(rseq_stats.fixup);
+ break;
+ clear:
+ unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
+ rseq_stat_inc(rseq_stats.clear);
+ abort_ip = 0ULL;
+ }
+
+ if (unlikely(abort_ip))
+ rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
+ return true;
+die:
+ t->rseq.event.fatal = true;
+efault:
+ return false;
+}
+
+#endif /* RSEQ_BUILD_SLOW_PATH */
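
A side note on the single unsafe_get_user() of uc_head above: version and flags are adjacent u32s at the start of the descriptor, so one u64 load checks both for zero at once, independent of endianness. A stand-alone sketch of that equivalence (names are illustrative):

#include <assert.h>
#include <stdint.h>

union cs_head {
	struct {
		uint32_t version;
		uint32_t flags;
	};
	uint64_t head;		/* overlays both fields */
};

int main(void)
{
	union cs_head h = { .version = 0, .flags = 0 };

	assert(h.head == 0);	/* zero iff both fields are zero */
	h.flags = 1;
	assert(h.head != 0);
	return 0;
}
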
+
+/*
+ * This only ensures that abort_ip is in the user address space and
+ * validates that it is preceded by the signature.
+ *
+ * No other sanity checks are done here, that's what the debug code is for.
+ */
+static rseq_inline bool
+rseq_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr)
+{
+ struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr;
+ unsigned long ip = instruction_pointer(regs);
+ u64 start_ip, abort_ip, offset;
+ u32 usig, __user *uc_sig;
+
+ rseq_stat_inc(rseq_stats.cs);
+
+ if (unlikely(csaddr >= TASK_SIZE)) {
+ t->rseq.event.fatal = true;
+ return false;
+ }
+
+ if (static_branch_unlikely(&rseq_debug_enabled))
+ return rseq_debug_update_user_cs(t, regs, csaddr);
+
+ scoped_user_rw_access(ucs, efault) {
+ unsafe_get_user(start_ip, &ucs->start_ip, efault);
+ unsafe_get_user(offset, &ucs->post_commit_offset, efault);
+ unsafe_get_user(abort_ip, &ucs->abort_ip, efault);
+
+ /*
+ * No sanity checks. If user space screwed it up, it can
+ * keep the pieces. That's what debug code is for.
+ *
+ * If outside, just clear the critical section.
+ */
+ if (ip - start_ip >= offset)
+ goto clear;
+
+ /*
+ * Two requirements for @abort_ip:
+ * - Must be in user space as x86 IRET would happily return to
+ * the kernel.
+ * - The four bytes preceding the instruction at @abort_ip must
+ * contain the signature.
+ *
+ * The latter protects against the following attack vector:
+ *
+ * An attacker with limited abilities to write, creates a critical
+ * section descriptor, sets the abort IP to a library function or
+ * some other ROP gadget and stores the address of the descriptor
+ * in TLS::rseq::rseq_cs. An RSEQ abort would then evade ROP
+ * protection.
+ */
+ if (abort_ip >= TASK_SIZE || abort_ip < sizeof(*uc_sig))
+ goto die;
+
+ /* The address is guaranteed to be >= 0 and < TASK_SIZE */
+ uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig));
+ unsafe_get_user(usig, uc_sig, efault);
+ if (unlikely(usig != t->rseq.sig))
+ goto die;
+
+ /* Invalidate the critical section */
+ unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
+ /* Update the instruction pointer */
+ instruction_pointer_set(regs, (unsigned long)abort_ip);
+ rseq_stat_inc(rseq_stats.fixup);
+ break;
+ clear:
+ unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
+ rseq_stat_inc(rseq_stats.clear);
+ abort_ip = 0ULL;
+ }
+
+ if (unlikely(abort_ip))
+ rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
+ return true;
+die:
+ t->rseq.event.fatal = true;
+efault:
+ return false;
+}
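
The "ip - start_ip >= offset" test above is the classic trick of folding both range checks into one unsigned comparison: when ip is below start_ip, the subtraction wraps around to a huge value that cannot be smaller than offset. A self-contained demonstration (the function name is illustrative):

#include <assert.h>
#include <stdint.h>

/* True only for start_ip <= ip < start_ip + offset */
static int in_cs(uint64_t ip, uint64_t start_ip, uint64_t offset)
{
	return ip - start_ip < offset;
}

int main(void)
{
	assert(!in_cs(0x0fff, 0x1000, 0x100));	/* below start: wraps around */
	assert( in_cs(0x1000, 0x1000, 0x100));	/* first instruction */
	assert( in_cs(0x10ff, 0x1000, 0x100));	/* last byte inside */
	assert(!in_cs(0x1100, 0x1000, 0x100));	/* one past the end */
	return 0;
}
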
+
static __always_inline void rseq_exit_to_user_mode(void)
{
	struct rseq_event *ev = &current->rseq.event;
return -EFAULT;
}
-/*
- * Get the user-space pointer value stored in the 'rseq_cs' field.
- */
-static int rseq_get_rseq_cs_ptr_val(struct rseq __user *rseq, u64 *rseq_cs)
-{
- if (!rseq_cs)
- return -EFAULT;
-
-#ifdef CONFIG_64BIT
- if (get_user(*rseq_cs, &rseq->rseq_cs))
- return -EFAULT;
-#else
- if (copy_from_user(rseq_cs, &rseq->rseq_cs, sizeof(*rseq_cs)))
- return -EFAULT;
-#endif
-
- return 0;
-}
-
-/*
- * If the rseq_cs field of 'struct rseq' contains a valid pointer to
- * user-space, copy 'struct rseq_cs' from user-space and validate its fields.
- */
-static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
-{
- struct rseq_cs __user *urseq_cs;
- u64 ptr;
- u32 __user *usig;
- u32 sig;
- int ret;
-
- ret = rseq_get_rseq_cs_ptr_val(t->rseq.usrptr, &ptr);
- if (ret)
- return ret;
-
- /* If the rseq_cs pointer is NULL, return a cleared struct rseq_cs. */
- if (!ptr) {
- memset(rseq_cs, 0, sizeof(*rseq_cs));
- return 0;
- }
- /* Check that the pointer value fits in the user-space process space. */
- if (ptr >= TASK_SIZE)
- return -EINVAL;
- urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr;
- if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs)))
- return -EFAULT;
-
- if (rseq_cs->start_ip >= TASK_SIZE ||
- rseq_cs->start_ip + rseq_cs->post_commit_offset >= TASK_SIZE ||
- rseq_cs->abort_ip >= TASK_SIZE ||
- rseq_cs->version > 0)
- return -EINVAL;
- /* Check for overflow. */
- if (rseq_cs->start_ip + rseq_cs->post_commit_offset < rseq_cs->start_ip)
- return -EINVAL;
- /* Ensure that abort_ip is not in the critical section. */
- if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset)
- return -EINVAL;
-
- usig = (u32 __user *)(unsigned long)(rseq_cs->abort_ip - sizeof(u32));
- ret = get_user(sig, usig);
- if (ret)
- return ret;
-
- if (current->rseq.sig != sig) {
- printk_ratelimited(KERN_WARNING
- "Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n",
- sig, current->rseq.sig, current->pid, usig);
- return -EINVAL;
- }
- return 0;
-}
-
-static bool rseq_warn_flags(const char *str, u32 flags)
-{
- u32 test_flags;
-
- if (!flags)
- return false;
- test_flags = flags & RSEQ_CS_NO_RESTART_FLAGS;
- if (test_flags)
- pr_warn_once("Deprecated flags (%u) in %s ABI structure", test_flags, str);
- test_flags = flags & ~RSEQ_CS_NO_RESTART_FLAGS;
- if (test_flags)
- pr_warn_once("Unknown flags (%u) in %s ABI structure", test_flags, str);
- return true;
-}
-
-static int rseq_check_flags(struct task_struct *t, u32 cs_flags)
-{
- u32 flags;
- int ret;
-
- if (rseq_warn_flags("rseq_cs", cs_flags))
- return -EINVAL;
-
- /* Get thread flags. */
- ret = get_user(flags, &t->rseq.usrptr->flags);
- if (ret)
- return ret;
-
- if (rseq_warn_flags("rseq", flags))
- return -EINVAL;
- return 0;
-}
-
-static int clear_rseq_cs(struct rseq __user *rseq)
-{
- /*
- * The rseq_cs field is set to NULL on preemption or signal
- * delivery on top of rseq assembly block, as well as on top
- * of code outside of the rseq assembly block. This performs
- * a lazy clear of the rseq_cs field.
- *
- * Set rseq_cs to NULL.
- */
-#ifdef CONFIG_64BIT
- return put_user(0UL, &rseq->rseq_cs);
-#else
- if (clear_user(&rseq->rseq_cs, sizeof(rseq->rseq_cs)))
- return -EFAULT;
- return 0;
-#endif
-}
-
-/*
- * Unsigned comparison will be true when ip >= start_ip, and when
- * ip < start_ip + post_commit_offset.
- */
-static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs)
-{
- return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset;
-}
-
-static int rseq_ip_fixup(struct pt_regs *regs, bool abort)
+static bool rseq_handle_cs(struct task_struct *t, struct pt_regs *regs)
{
- unsigned long ip = instruction_pointer(regs);
- struct task_struct *t = current;
- struct rseq_cs rseq_cs;
- int ret;
-
- rseq_stat_inc(rseq_stats.cs);
-
- ret = rseq_get_rseq_cs(t, &rseq_cs);
- if (ret)
- return ret;
-
- /*
- * Handle potentially not being within a critical section.
- * If not nested over a rseq critical section, restart is useless.
- * Clear the rseq_cs pointer and return.
- */
- if (!in_rseq_cs(ip, &rseq_cs)) {
- rseq_stat_inc(rseq_stats.clear);
- return clear_rseq_cs(t->rseq.usrptr);
- }
- ret = rseq_check_flags(t, rseq_cs.flags);
- if (ret < 0)
- return ret;
- if (!abort)
- return 0;
- ret = clear_rseq_cs(t->rseq.usrptr);
- if (ret)
- return ret;
- rseq_stat_inc(rseq_stats.fixup);
- trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset,
- rseq_cs.abort_ip);
- instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip);
- return 0;
+ struct rseq __user *urseq = t->rseq.usrptr;
+ u64 csaddr;
+
+ scoped_user_read_access(urseq, efault)
+ unsafe_get_user(csaddr, &urseq->rseq_cs, efault);
+ if (likely(!csaddr))
+ return true;
+ return rseq_update_user_cs(t, regs, csaddr);
+efault:
+ return false;
}
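
rseq_handle_cs() is the kernel half of a lazy protocol: user space stores the descriptor address into the rseq_cs field of its registered struct rseq before entering the critical section, and the kernel clears that field when it handles an event. A sketch of the user-space half, assuming a librseq-style __rseq_abi TLS symbol (glibc instead publishes the registration via __rseq_offset):

#include <stdint.h>
#include <linux/rseq.h>

extern __thread struct rseq __rseq_abi;	/* registered area, e.g. by librseq */

static inline void rseq_arm(const struct rseq_cs *cs)
{
	/* The kernel only inspects the field on return to this task.
	 * Real implementations use a WRITE_ONCE style store to
	 * prevent tearing on 32-bit. */
	__rseq_abi.rseq_cs = (uint64_t)(uintptr_t)cs;
}
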
/*
void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
{
struct task_struct *t = current;
- int ret, sig;
bool event;
+ int sig;
/*
* If invoked from hypervisors before entering the guest via
if (!IS_ENABLED(CONFIG_DEBUG_RSEQ) && !event)
return;
- ret = rseq_ip_fixup(regs, event);
- if (unlikely(ret < 0))
+ if (!rseq_handle_cs(t, regs))
goto error;
if (unlikely(rseq_update_cpu_node_id(t)))
}
#ifdef CONFIG_DEBUG_RSEQ
+/*
+ * Unsigned comparison will be true when ip >= start_ip, and when
+ * ip < start_ip + post_commit_offset.
+ */
+static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs)
+{
+ return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset;
+}
+
+/*
+ * If the rseq_cs field of 'struct rseq' contains a valid pointer to
+ * user-space, copy 'struct rseq_cs' from user-space and validate its fields.
+ */
+static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
+{
+ struct rseq __user *urseq = t->rseq.usrptr;
+ struct rseq_cs __user *urseq_cs;
+ u32 __user *usig;
+ u64 ptr;
+ u32 sig;
+ int ret;
+
+	if (get_user(ptr, &urseq->rseq_cs))
+ return -EFAULT;
+
+ /* If the rseq_cs pointer is NULL, return a cleared struct rseq_cs. */
+ if (!ptr) {
+ memset(rseq_cs, 0, sizeof(*rseq_cs));
+ return 0;
+ }
+ /* Check that the pointer value fits in the user-space process space. */
+ if (ptr >= TASK_SIZE)
+ return -EINVAL;
+ urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr;
+ if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs)))
+ return -EFAULT;
+
+ if (rseq_cs->start_ip >= TASK_SIZE ||
+ rseq_cs->start_ip + rseq_cs->post_commit_offset >= TASK_SIZE ||
+ rseq_cs->abort_ip >= TASK_SIZE ||
+ rseq_cs->version > 0)
+ return -EINVAL;
+ /* Check for overflow. */
+ if (rseq_cs->start_ip + rseq_cs->post_commit_offset < rseq_cs->start_ip)
+ return -EINVAL;
+ /* Ensure that abort_ip is not in the critical section. */
+ if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset)
+ return -EINVAL;
+
+ usig = (u32 __user *)(unsigned long)(rseq_cs->abort_ip - sizeof(u32));
+ ret = get_user(sig, usig);
+ if (ret)
+ return ret;
+
+ if (current->rseq.sig != sig) {
+ printk_ratelimited(KERN_WARNING
+ "Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n",
+ sig, current->rseq.sig, current->pid, usig);
+ return -EINVAL;
+ }
+ return 0;
+}
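
The signature check at the end mirrors the fast path: user space has to plant the registered signature in the four bytes directly preceding the abort target. An x86-64 sketch in the style of the rseq selftests; the signature value and label name are illustrative, and real implementations typically encode the signature inside a trapping instruction to keep disassemblers happy:

void abort_trampoline(void);

asm(	".pushsection .text, \"ax\"\n\t"
	".long 0x53053053\n\t"	/* signature, must match the rseq(2) registration */
	".globl abort_trampoline\n\t"
	"abort_trampoline:\n\t"
	"ret\n\t"
	".popsection\n\t");
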
/*
* Terminate the process if a syscall is issued within a restartable