--- /dev/null
+From 3e1ab8301f1fefaccfcc1491b5dbc75ece355783 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 6 Feb 2021 20:10:24 -0500
+Subject: bpf: Allow variable-offset stack access
+
+From: Andrei Matei <andreimatei1@gmail.com>
+
+[ Upstream commit 01f810ace9ed37255f27608a0864abebccf0aab3 ]
+
+Before this patch, variable offset access to the stack was disallowed
+for regular instructions, but was allowed for "indirect" accesses (i.e.
+helpers). This patch removes the restriction, allowing reading from and
+writing to the stack through stack pointers with variable offsets. This
+makes stack-allocated buffers more usable in programs, and brings stack
+pointers closer to other types of pointers.
+
+The motivation is being able to use stack-allocated buffers for data
+manipulation. When the stack size limit is sufficient, allocating
+buffers on the stack is simpler than using per-cpu arrays or other
+alternatives.
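+
+As a hedged illustration (hypothetical fragment, not part of the patch),
+this is the kind of pattern the change permits:
+
+  static int probe(unsigned long len)
+  {
+          char buf[64] = {};
+          unsigned int off = len & 63; /* verifier-visible bound [0, 63] */
+
+          buf[off] = 1;        /* variable-offset stack write */
+          return buf[off ^ 1]; /* variable-offset stack read  */
+  }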
+
+In unprivileged programs, variable-offset reads and writes are
+disallowed (they were already disallowed for the indirect access case)
+because the speculative execution checking code doesn't support them.
+Additionally, when writing through a variable-offset stack pointer, if
+any pointers are in the accessible range, there is a risk of later
+leaking pointers because the write cannot be tracked precisely.
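+
+A hedged sketch of the rejected case (the stack layout and names are
+illustrative; the compiler picks the actual slots):
+
+  static long sketch(long *p, unsigned int idx) /* p gets spilled */
+  {
+          char buf[16];  /* say fp-24..fp-9, with p spilled at fp-8 */
+
+          if (idx > 16)  /* off-by-one: the range [0, 16] reaches fp-8 */
+                  idx = 16;
+          buf[idx] = 0;  /* unprivileged: rejected with "spilled ptr in
+                          * range of var-offset stack write" */
+          return *p + buf[0];
+  }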
+
+Writes with variable offset mark the whole range as initialized, even
+though we don't know which stack slots are actually written. This is
+done in order to not reject future reads of these slots. Note that this
+doesn't affect writes done through helpers; as before, helpers need the
+whole stack range to be initialized to begin with.
+
+All the stack slots in range are considered scalars after the write;
+variable-offset register spills are not tracked.
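+
+A minimal sketch of that tracking, under the same assumptions as above:
+
+  static char after_write(unsigned int idx)
+  {
+          char buf[8];   /* eight slot bytes, initially STACK_INVALID */
+
+          idx &= 7;      /* idx in [0, 7]                             */
+          buf[idx] = 1;  /* the whole range becomes STACK_MISC        */
+          return buf[3]; /* now accepted; reads an unknown scalar     */
+  }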
+
+For reads, all the stack slots in the variable range need to be
+initialized (but see above about what writes do); otherwise the read is
+rejected. All registers spilled in stack slots that might be read are
+marked as having been read; however, reads through such pointers don't
+do register filling: the target register will always be either a scalar
+or a constant zero.
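+
+Hedged sketch of the read side (names illustrative):
+
+  static char after_read(unsigned int idx, char x)
+  {
+          char buf[8] = {}; /* all slot bytes are STACK_ZERO */
+          char a, b;
+
+          idx &= 7;
+          a = buf[idx];     /* all-zero range: a is a known const zero */
+          buf[1] = x;       /* one byte becomes STACK_MISC             */
+          b = buf[idx];     /* mixed range: b is an unknown scalar     */
+          return a + b;
+  }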
+
+Signed-off-by: Andrei Matei <andreimatei1@gmail.com>
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Link: https://lore.kernel.org/bpf/20210207011027.676572-2-andreimatei1@gmail.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/linux/bpf.h | 5 +
+ include/linux/bpf_verifier.h | 3 +-
+ kernel/bpf/verifier.c | 657 +++++++++++++++++++++++++++--------
+ 3 files changed, 518 insertions(+), 147 deletions(-)
+
+diff --git a/include/linux/bpf.h b/include/linux/bpf.h
+index b416bba3a62b..8ad819132dde 100644
+--- a/include/linux/bpf.h
++++ b/include/linux/bpf.h
+@@ -1259,6 +1259,11 @@ static inline bool bpf_allow_ptr_leaks(void)
+ return perfmon_capable();
+ }
+
++static inline bool bpf_allow_uninit_stack(void)
++{
++ return perfmon_capable();
++}
++
+ static inline bool bpf_allow_ptr_to_map_access(void)
+ {
+ return perfmon_capable();
+diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
+index e83ef6f6bf43..85bac3191e12 100644
+--- a/include/linux/bpf_verifier.h
++++ b/include/linux/bpf_verifier.h
+@@ -187,7 +187,7 @@ struct bpf_func_state {
+ * 0 = main function, 1 = first callee.
+ */
+ u32 frameno;
+- /* subprog number == index within subprog_stack_depth
++ /* subprog number == index within subprog_info
+ * zero == main subprog
+ */
+ u32 subprogno;
+@@ -390,6 +390,7 @@ struct bpf_verifier_env {
+ u32 used_map_cnt; /* number of used maps */
+ u32 id_gen; /* used to generate unique reg IDs */
+ bool allow_ptr_leaks;
++ bool allow_uninit_stack;
+ bool allow_ptr_to_map_access;
+ bool bpf_capable;
+ bool bypass_spec_v1;
+diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
+index 2e09e691a6be..94923c2bdd81 100644
+--- a/kernel/bpf/verifier.c
++++ b/kernel/bpf/verifier.c
+@@ -2268,12 +2268,14 @@ static void save_register_state(struct bpf_func_state *state,
+ state->stack[spi].slot_type[i] = STACK_SPILL;
+ }
+
+-/* check_stack_read/write functions track spill/fill of registers,
++/* check_stack_{read,write}_fixed_off functions track spill/fill of registers,
+ * stack boundary and alignment are checked in check_mem_access()
+ */
+-static int check_stack_write(struct bpf_verifier_env *env,
+- struct bpf_func_state *state, /* func where register points to */
+- int off, int size, int value_regno, int insn_idx)
++static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
++ /* stack frame we're writing to */
++ struct bpf_func_state *state,
++ int off, int size, int value_regno,
++ int insn_idx)
+ {
+ struct bpf_func_state *cur; /* state of the current function */
+ int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
+@@ -2399,9 +2401,175 @@ static int check_stack_write(struct bpf_verifier_env *env,
+ return 0;
+ }
+
+-static int check_stack_read(struct bpf_verifier_env *env,
+- struct bpf_func_state *reg_state /* func where register points to */,
+- int off, int size, int value_regno)
++/* Write the stack: 'stack[ptr_regno + off] = value_regno'. 'ptr_regno' is
++ * known to contain a variable offset.
++ * This function checks whether the write is permitted and conservatively
++ * tracks the effects of the write, considering that each stack slot in the
++ * dynamic range is potentially written to.
++ *
++ * 'off' includes 'regno->off'.
++ * 'value_regno' can be -1, meaning that an unknown value is being written to
++ * the stack.
++ *
++ * Spilled pointers in range are not marked as written because we don't know
++ * what's going to be actually written. This means that read propagation for
++ * future reads cannot be terminated by this write.
++ *
++ * For privileged programs, uninitialized stack slots are considered
++ * initialized by this write (even though we don't know exactly what offsets
++ * are going to be written to). The idea is that we don't want the verifier to
++ * reject future reads that access slots written to through variable offsets.
++ */
++static int check_stack_write_var_off(struct bpf_verifier_env *env,
++ /* func where register points to */
++ struct bpf_func_state *state,
++ int ptr_regno, int off, int size,
++ int value_regno, int insn_idx)
++{
++ struct bpf_func_state *cur; /* state of the current function */
++ int min_off, max_off;
++ int i, err;
++ struct bpf_reg_state *ptr_reg = NULL, *value_reg = NULL;
++ bool writing_zero = false;
++ /* set if the fact that we're writing a zero is used to let any
++ * stack slots remain STACK_ZERO
++ */
++ bool zero_used = false;
++
++ cur = env->cur_state->frame[env->cur_state->curframe];
++ ptr_reg = &cur->regs[ptr_regno];
++ min_off = ptr_reg->smin_value + off;
++ max_off = ptr_reg->smax_value + off + size;
++ if (value_regno >= 0)
++ value_reg = &cur->regs[value_regno];
++ if (value_reg && register_is_null(value_reg))
++ writing_zero = true;
++
++ err = realloc_func_state(state, round_up(-min_off, BPF_REG_SIZE),
++ state->acquired_refs, true);
++ if (err)
++ return err;
++
++
++ /* Variable offset writes destroy any spilled pointers in range. */
++ for (i = min_off; i < max_off; i++) {
++ u8 new_type, *stype;
++ int slot, spi;
++
++ slot = -i - 1;
++ spi = slot / BPF_REG_SIZE;
++ stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE];
++
++ if (!env->allow_ptr_leaks
++ && *stype != NOT_INIT
++ && *stype != SCALAR_VALUE) {
++ /* Reject the write if there are spilled pointers in
++ * range. If we didn't reject here, the ptr status
++ * would be erased below (even though not all slots are
++ * actually overwritten), possibly opening the door to
++ * leaks.
++ */
++ verbose(env, "spilled ptr in range of var-offset stack write; insn %d, ptr off: %d",
++ insn_idx, i);
++ return -EINVAL;
++ }
++
++ /* Erase all spilled pointers. */
++ state->stack[spi].spilled_ptr.type = NOT_INIT;
++
++ /* Update the slot type. */
++ new_type = STACK_MISC;
++ if (writing_zero && *stype == STACK_ZERO) {
++ new_type = STACK_ZERO;
++ zero_used = true;
++ }
++ /* If the slot is STACK_INVALID, we check whether it's OK to
++ * pretend that it will be initialized by this write. The slot
++ * might not actually be written to, and so if we mark it as
++ * initialized future reads might leak uninitialized memory.
++ * For privileged programs, we will accept such reads to slots
++ * that may or may not be written because, if we rejected
++ * them, the error would be too confusing.
++ */
++ if (*stype == STACK_INVALID && !env->allow_uninit_stack) {
++ verbose(env, "uninit stack in range of var-offset write prohibited for !root; insn %d, off: %d",
++ insn_idx, i);
++ return -EINVAL;
++ }
++ *stype = new_type;
++ }
++ if (zero_used) {
++ /* backtracking doesn't work for STACK_ZERO yet. */
++ err = mark_chain_precision(env, value_regno);
++ if (err)
++ return err;
++ }
++ return 0;
++}
++
++/* When register 'dst_regno' is assigned some values from stack[min_off,
++ * max_off), we set the register's type according to the types of the
++ * respective stack slots. If all the stack values are known to be zeros, then
++ * so is the destination reg. Otherwise, the register is considered to be
++ * SCALAR. This function does not deal with register filling; the caller must
++ * ensure that all spilled registers in the stack range have been marked as
++ * read.
++ */
++static void mark_reg_stack_read(struct bpf_verifier_env *env,
++ /* func where src register points to */
++ struct bpf_func_state *ptr_state,
++ int min_off, int max_off, int dst_regno)
++{
++ struct bpf_verifier_state *vstate = env->cur_state;
++ struct bpf_func_state *state = vstate->frame[vstate->curframe];
++ int i, slot, spi;
++ u8 *stype;
++ int zeros = 0;
++
++ for (i = min_off; i < max_off; i++) {
++ slot = -i - 1;
++ spi = slot / BPF_REG_SIZE;
++ stype = ptr_state->stack[spi].slot_type;
++ if (stype[slot % BPF_REG_SIZE] != STACK_ZERO)
++ break;
++ zeros++;
++ }
++ if (zeros == max_off - min_off) {
++ /* any access_size read into register is zero extended,
++ * so the whole register == const_zero
++ */
++ __mark_reg_const_zero(&state->regs[dst_regno]);
++ /* backtracking doesn't support STACK_ZERO yet,
++ * so mark it precise here, so that later
++ * backtracking can stop here.
++ * Backtracking may not need this if this register
++ * doesn't participate in pointer adjustment.
++ * Forward propagation of precise flag is not
++ * necessary either. This mark is only to stop
++ * backtracking. Any register that contributed
++ * to const 0 was marked precise before spill.
++ */
++ state->regs[dst_regno].precise = true;
++ } else {
++ /* have read misc data from the stack */
++ mark_reg_unknown(env, state->regs, dst_regno);
++ }
++ state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
++}
++
++/* Read the stack at 'off' and put the results into the register indicated by
++ * 'dst_regno'. It handles reg filling if the addressed stack slot is a
++ * spilled reg.
++ *
++ * 'dst_regno' can be -1, meaning that the read value is not going to a
++ * register.
++ *
++ * The access is assumed to be within the current stack bounds.
++ */
++static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
++ /* func where src register points to */
++ struct bpf_func_state *reg_state,
++ int off, int size, int dst_regno)
+ {
+ struct bpf_verifier_state *vstate = env->cur_state;
+ struct bpf_func_state *state = vstate->frame[vstate->curframe];
+@@ -2409,11 +2577,6 @@ static int check_stack_read(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg;
+ u8 *stype;
+
+- if (reg_state->allocated_stack <= slot) {
+- verbose(env, "invalid read from stack off %d+0 size %d\n",
+- off, size);
+- return -EACCES;
+- }
+ stype = reg_state->stack[spi].slot_type;
+ reg = &reg_state->stack[spi].spilled_ptr;
+
+@@ -2424,9 +2587,9 @@ static int check_stack_read(struct bpf_verifier_env *env,
+ verbose(env, "invalid size of register fill\n");
+ return -EACCES;
+ }
+- if (value_regno >= 0) {
+- mark_reg_unknown(env, state->regs, value_regno);
+- state->regs[value_regno].live |= REG_LIVE_WRITTEN;
++ if (dst_regno >= 0) {
++ mark_reg_unknown(env, state->regs, dst_regno);
++ state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
+ }
+ mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
+ return 0;
+@@ -2438,16 +2601,16 @@ static int check_stack_read(struct bpf_verifier_env *env,
+ }
+ }
+
+- if (value_regno >= 0) {
++ if (dst_regno >= 0) {
+ /* restore register state from stack */
+- state->regs[value_regno] = *reg;
++ state->regs[dst_regno] = *reg;
+ /* mark reg as written since spilled pointer state likely
+ * has its liveness marks cleared by is_state_visited()
+ * which resets stack/reg liveness for state transitions
+ */
+- state->regs[value_regno].live |= REG_LIVE_WRITTEN;
++ state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
+ } else if (__is_pointer_value(env->allow_ptr_leaks, reg)) {
+- /* If value_regno==-1, the caller is asking us whether
++ /* If dst_regno==-1, the caller is asking us whether
+ * it is acceptable to use this value as a SCALAR_VALUE
+ * (e.g. for XADD).
+ * We must not allow unprivileged callers to do that
+@@ -2459,70 +2622,167 @@ static int check_stack_read(struct bpf_verifier_env *env,
+ }
+ mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
+ } else {
+- int zeros = 0;
++ u8 type;
+
+ for (i = 0; i < size; i++) {
+- if (stype[(slot - i) % BPF_REG_SIZE] == STACK_MISC)
++ type = stype[(slot - i) % BPF_REG_SIZE];
++ if (type == STACK_MISC)
+ continue;
+- if (stype[(slot - i) % BPF_REG_SIZE] == STACK_ZERO) {
+- zeros++;
++ if (type == STACK_ZERO)
+ continue;
+- }
+ verbose(env, "invalid read from stack off %d+%d size %d\n",
+ off, i, size);
+ return -EACCES;
+ }
+ mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
+- if (value_regno >= 0) {
+- if (zeros == size) {
+- /* any size read into register is zero extended,
+- * so the whole register == const_zero
+- */
+- __mark_reg_const_zero(&state->regs[value_regno]);
+- /* backtracking doesn't support STACK_ZERO yet,
+- * so mark it precise here, so that later
+- * backtracking can stop here.
+- * Backtracking may not need this if this register
+- * doesn't participate in pointer adjustment.
+- * Forward propagation of precise flag is not
+- * necessary either. This mark is only to stop
+- * backtracking. Any register that contributed
+- * to const 0 was marked precise before spill.
+- */
+- state->regs[value_regno].precise = true;
+- } else {
+- /* have read misc data from the stack */
+- mark_reg_unknown(env, state->regs, value_regno);
+- }
+- state->regs[value_regno].live |= REG_LIVE_WRITTEN;
+- }
++ if (dst_regno >= 0)
++ mark_reg_stack_read(env, reg_state, off, off + size, dst_regno);
+ }
+ return 0;
+ }
+
+-static int check_stack_access(struct bpf_verifier_env *env,
+- const struct bpf_reg_state *reg,
+- int off, int size)
++enum stack_access_src {
++ ACCESS_DIRECT = 1, /* the access is performed by an instruction */
++ ACCESS_HELPER = 2, /* the access is performed by a helper */
++};
++
++static int check_stack_range_initialized(struct bpf_verifier_env *env,
++ int regno, int off, int access_size,
++ bool zero_size_allowed,
++ enum stack_access_src type,
++ struct bpf_call_arg_meta *meta);
++
++static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno)
++{
++ return cur_regs(env) + regno;
++}
++
++/* Read the stack at 'ptr_regno + off' and put the result into the register
++ * 'dst_regno'.
++ * 'off' includes the pointer register's fixed offset (i.e. 'ptr_regno.off'),
++ * but not its variable offset.
++ * 'size' is assumed to be <= reg size and the access is assumed to be aligned.
++ *
++ * As opposed to check_stack_read_fixed_off, this function doesn't deal with
++ * filling registers (i.e. reads of a spilled register cannot be detected when
++ * the offset is not fixed). We conservatively mark 'dst_regno' as containing
++ * SCALAR_VALUE. That's why we assert that the 'ptr_regno' has a variable
++ * offset; for a fixed offset check_stack_read_fixed_off should be used
++ * instead.
++ */
++static int check_stack_read_var_off(struct bpf_verifier_env *env,
++ int ptr_regno, int off, int size, int dst_regno)
+ {
+- /* Stack accesses must be at a fixed offset, so that we
+- * can determine what type of data were returned. See
+- * check_stack_read().
++ /* The state of the source register. */
++ struct bpf_reg_state *reg = reg_state(env, ptr_regno);
++ struct bpf_func_state *ptr_state = func(env, reg);
++ int err;
++ int min_off, max_off;
++
++ /* Note that we pass a NULL meta, so raw access will not be permitted.
+ */
+- if (!tnum_is_const(reg->var_off)) {
++ err = check_stack_range_initialized(env, ptr_regno, off, size,
++ false, ACCESS_DIRECT, NULL);
++ if (err)
++ return err;
++
++ min_off = reg->smin_value + off;
++ max_off = reg->smax_value + off;
++ mark_reg_stack_read(env, ptr_state, min_off, max_off + size, dst_regno);
++ return 0;
++}
++
++/* check_stack_read dispatches to check_stack_read_fixed_off or
++ * check_stack_read_var_off.
++ *
++ * The caller must ensure that the offset falls within the allocated stack
++ * bounds.
++ *
++ * 'dst_regno' is a register which will receive the value from the stack. It
++ * can be -1, meaning that the read value is not going to a register.
++ */
++static int check_stack_read(struct bpf_verifier_env *env,
++ int ptr_regno, int off, int size,
++ int dst_regno)
++{
++ struct bpf_reg_state *reg = reg_state(env, ptr_regno);
++ struct bpf_func_state *state = func(env, reg);
++ int err;
++ /* Some accesses are only permitted with a static offset. */
++ bool var_off = !tnum_is_const(reg->var_off);
++
++ /* The offset is required to be static when reads don't go to a
++ * register, in order to not leak pointers (see
++ * check_stack_read_fixed_off).
++ */
++ if (dst_regno < 0 && var_off) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+- verbose(env, "variable stack access var_off=%s off=%d size=%d\n",
++ verbose(env, "variable offset stack pointer cannot be passed into helper function; var_off=%s off=%d size=%d\n",
+ tn_buf, off, size);
+ return -EACCES;
+ }
++ /* Variable offset is prohibited for unprivileged mode for simplicity
++ * since it requires corresponding support in Spectre masking for stack
++ * ALU. See also retrieve_ptr_limit().
++ */
++ if (!env->bypass_spec_v1 && var_off) {
++ char tn_buf[48];
+
+- if (off >= 0 || off < -MAX_BPF_STACK) {
+- verbose(env, "invalid stack off=%d size=%d\n", off, size);
++ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
++ verbose(env, "R%d variable offset stack access prohibited for !root, var_off=%s\n",
++ ptr_regno, tn_buf);
+ return -EACCES;
+ }
+
+- return 0;
++ if (!var_off) {
++ off += reg->var_off.value;
++ err = check_stack_read_fixed_off(env, state, off, size,
++ dst_regno);
++ } else {
++ /* Variable offset stack reads need more conservative handling
++ * than fixed offset ones. Note that dst_regno >= 0 on this
++ * branch.
++ */
++ err = check_stack_read_var_off(env, ptr_regno, off, size,
++ dst_regno);
++ }
++ return err;
++}
++
++
++/* check_stack_write dispatches to check_stack_write_fixed_off or
++ * check_stack_write_var_off.
++ *
++ * 'ptr_regno' is the register used as a pointer into the stack.
++ * 'off' includes 'ptr_regno->off', but not its variable offset (if any).
++ * 'value_regno' is the register whose value we're writing to the stack. It can
++ * be -1, meaning that we're not writing from a register.
++ *
++ * The caller must ensure that the offset falls within the maximum stack size.
++ */
++static int check_stack_write(struct bpf_verifier_env *env,
++ int ptr_regno, int off, int size,
++ int value_regno, int insn_idx)
++{
++ struct bpf_reg_state *reg = reg_state(env, ptr_regno);
++ struct bpf_func_state *state = func(env, reg);
++ int err;
++
++ if (tnum_is_const(reg->var_off)) {
++ off += reg->var_off.value;
++ err = check_stack_write_fixed_off(env, state, off, size,
++ value_regno, insn_idx);
++ } else {
++ /* Variable offset stack writes need more conservative handling
++ * than fixed offset ones.
++ */
++ err = check_stack_write_var_off(env, state,
++ ptr_regno, off, size,
++ value_regno, insn_idx);
++ }
++ return err;
+ }
+
+ static int check_map_access_type(struct bpf_verifier_env *env, u32 regno,
+@@ -2851,11 +3111,6 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx,
+ return -EACCES;
+ }
+
+-static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno)
+-{
+- return cur_regs(env) + regno;
+-}
+-
+ static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
+ {
+ return __is_pointer_value(env->allow_ptr_leaks, reg_state(env, regno));
+@@ -2974,8 +3229,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
+ break;
+ case PTR_TO_STACK:
+ pointer_desc = "stack ";
+- /* The stack spill tracking logic in check_stack_write()
+- * and check_stack_read() relies on stack accesses being
++ /* The stack spill tracking logic in check_stack_write_fixed_off()
++ * and check_stack_read_fixed_off() relies on stack accesses being
+ * aligned.
+ */
+ strict = true;
+@@ -3393,6 +3648,91 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,
+ return 0;
+ }
+
++/* Check that the stack access at the given offset is within bounds. The
++ * maximum valid offset is -1.
++ *
++ * The minimum valid offset is -MAX_BPF_STACK for writes, and
++ * -state->allocated_stack for reads.
++ */
++static int check_stack_slot_within_bounds(int off,
++ struct bpf_func_state *state,
++ enum bpf_access_type t)
++{
++ int min_valid_off;
++
++ if (t == BPF_WRITE)
++ min_valid_off = -MAX_BPF_STACK;
++ else
++ min_valid_off = -state->allocated_stack;
++
++ if (off < min_valid_off || off > -1)
++ return -EACCES;
++ return 0;
++}
++
++/* Check that the stack access at 'regno + off' falls within the maximum stack
++ * bounds.
++ *
++ * 'off' includes 'regno->off', but not its dynamic part (if any).
++ */
++static int check_stack_access_within_bounds(
++ struct bpf_verifier_env *env,
++ int regno, int off, int access_size,
++ enum stack_access_src src, enum bpf_access_type type)
++{
++ struct bpf_reg_state *regs = cur_regs(env);
++ struct bpf_reg_state *reg = regs + regno;
++ struct bpf_func_state *state = func(env, reg);
++ int min_off, max_off;
++ int err;
++ char *err_extra;
++
++ if (src == ACCESS_HELPER)
++ /* We don't know if helpers are reading or writing (or both). */
++ err_extra = " indirect access to";
++ else if (type == BPF_READ)
++ err_extra = " read from";
++ else
++ err_extra = " write to";
++
++ if (tnum_is_const(reg->var_off)) {
++ min_off = reg->var_off.value + off;
++ if (access_size > 0)
++ max_off = min_off + access_size - 1;
++ else
++ max_off = min_off;
++ } else {
++ if (reg->smax_value >= BPF_MAX_VAR_OFF ||
++ reg->smin_value <= -BPF_MAX_VAR_OFF) {
++ verbose(env, "invalid unbounded variable-offset%s stack R%d\n",
++ err_extra, regno);
++ return -EACCES;
++ }
++ min_off = reg->smin_value + off;
++ if (access_size > 0)
++ max_off = reg->smax_value + off + access_size - 1;
++ else
++ max_off = min_off;
++ }
++
++ err = check_stack_slot_within_bounds(min_off, state, type);
++ if (!err)
++ err = check_stack_slot_within_bounds(max_off, state, type);
++
++ if (err) {
++ if (tnum_is_const(reg->var_off)) {
++ verbose(env, "invalid%s stack R%d off=%d size=%d\n",
++ err_extra, regno, off, access_size);
++ } else {
++ char tn_buf[48];
++
++ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
++ verbose(env, "invalid variable-offset%s stack R%d var_off=%s size=%d\n",
++ err_extra, regno, tn_buf, access_size);
++ }
++ }
++ return err;
++}
+
+ /* check whether memory at (regno + off) is accessible for t = (read | write)
+ * if t==write, value_regno is a register which value is stored into memory
+@@ -3505,8 +3845,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
+ }
+
+ } else if (reg->type == PTR_TO_STACK) {
+- off += reg->var_off.value;
+- err = check_stack_access(env, reg, off, size);
++ /* Basic bounds checks. */
++ err = check_stack_access_within_bounds(env, regno, off, size, ACCESS_DIRECT, t);
+ if (err)
+ return err;
+
+@@ -3515,12 +3855,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
+ if (err)
+ return err;
+
+- if (t == BPF_WRITE)
+- err = check_stack_write(env, state, off, size,
+- value_regno, insn_idx);
+- else
+- err = check_stack_read(env, state, off, size,
++ if (t == BPF_READ)
++ err = check_stack_read(env, regno, off, size,
+ value_regno);
++ else
++ err = check_stack_write(env, regno, off, size,
++ value_regno, insn_idx);
+ } else if (reg_is_pkt_pointer(reg)) {
+ if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
+ verbose(env, "cannot write into packet\n");
+@@ -3642,49 +3982,53 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
+ BPF_SIZE(insn->code), BPF_WRITE, -1, true);
+ }
+
+-static int __check_stack_boundary(struct bpf_verifier_env *env, u32 regno,
+- int off, int access_size,
+- bool zero_size_allowed)
++/* When register 'regno' is used to read the stack (either directly or through
++ * a helper function) make sure that it's within stack boundary and, depending
++ * on the access type, that all elements of the stack are initialized.
++ *
++ * 'off' includes 'regno->off', but not its dynamic part (if any).
++ *
++ * All registers that have been spilled on the stack in the slots within the
++ * read offsets are marked as read.
++ */
++static int check_stack_range_initialized(
++ struct bpf_verifier_env *env, int regno, int off,
++ int access_size, bool zero_size_allowed,
++ enum stack_access_src type, struct bpf_call_arg_meta *meta)
+ {
+ struct bpf_reg_state *reg = reg_state(env, regno);
++ struct bpf_func_state *state = func(env, reg);
++ int err, min_off, max_off, i, j, slot, spi;
++ char *err_extra = type == ACCESS_HELPER ? " indirect" : "";
++ enum bpf_access_type bounds_check_type;
++ /* Some accesses can write anything into the stack, others are
++ * read-only.
++ */
++ bool clobber = false;
+
+- if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
+- access_size < 0 || (access_size == 0 && !zero_size_allowed)) {
+- if (tnum_is_const(reg->var_off)) {
+- verbose(env, "invalid stack type R%d off=%d access_size=%d\n",
+- regno, off, access_size);
+- } else {
+- char tn_buf[48];
+-
+- tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+- verbose(env, "invalid stack type R%d var_off=%s access_size=%d\n",
+- regno, tn_buf, access_size);
+- }
++ if (access_size == 0 && !zero_size_allowed) {
++ verbose(env, "invalid zero-sized read\n");
+ return -EACCES;
+ }
+- return 0;
+-}
+
+-/* when register 'regno' is passed into function that will read 'access_size'
+- * bytes from that pointer, make sure that it's within stack boundary
+- * and all elements of stack are initialized.
+- * Unlike most pointer bounds-checking functions, this one doesn't take an
+- * 'off' argument, so it has to add in reg->off itself.
+- */
+-static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
+- int access_size, bool zero_size_allowed,
+- struct bpf_call_arg_meta *meta)
+-{
+- struct bpf_reg_state *reg = reg_state(env, regno);
+- struct bpf_func_state *state = func(env, reg);
+- int err, min_off, max_off, i, j, slot, spi;
++ if (type == ACCESS_HELPER) {
++ /* The bounds checks for writes are more permissive than for
++ * reads. However, if raw_mode is not set, we'll do extra
++ * checks below.
++ */
++ bounds_check_type = BPF_WRITE;
++ clobber = true;
++ } else {
++ bounds_check_type = BPF_READ;
++ }
++ err = check_stack_access_within_bounds(env, regno, off, access_size,
++ type, bounds_check_type);
++ if (err)
++ return err;
++
+
+ if (tnum_is_const(reg->var_off)) {
+- min_off = max_off = reg->var_off.value + reg->off;
+- err = __check_stack_boundary(env, regno, min_off, access_size,
+- zero_size_allowed);
+- if (err)
+- return err;
++ min_off = max_off = reg->var_off.value + off;
+ } else {
+ /* Variable offset is prohibited for unprivileged mode for
+ * simplicity since it requires corresponding support in
+@@ -3695,8 +4039,8 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+- verbose(env, "R%d indirect variable offset stack access prohibited for !root, var_off=%s\n",
+- regno, tn_buf);
++ verbose(env, "R%d%s variable offset stack access prohibited for !root, var_off=%s\n",
++ regno, err_extra, tn_buf);
+ return -EACCES;
+ }
+ /* Only initialized buffer on stack is allowed to be accessed
+@@ -3708,28 +4052,8 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
+ if (meta && meta->raw_mode)
+ meta = NULL;
+
+- if (reg->smax_value >= BPF_MAX_VAR_OFF ||
+- reg->smax_value <= -BPF_MAX_VAR_OFF) {
+- verbose(env, "R%d unbounded indirect variable offset stack access\n",
+- regno);
+- return -EACCES;
+- }
+- min_off = reg->smin_value + reg->off;
+- max_off = reg->smax_value + reg->off;
+- err = __check_stack_boundary(env, regno, min_off, access_size,
+- zero_size_allowed);
+- if (err) {
+- verbose(env, "R%d min value is outside of stack bound\n",
+- regno);
+- return err;
+- }
+- err = __check_stack_boundary(env, regno, max_off, access_size,
+- zero_size_allowed);
+- if (err) {
+- verbose(env, "R%d max value is outside of stack bound\n",
+- regno);
+- return err;
+- }
++ min_off = reg->smin_value + off;
++ max_off = reg->smax_value + off;
+ }
+
+ if (meta && meta->raw_mode) {
+@@ -3749,8 +4073,10 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
+ if (*stype == STACK_MISC)
+ goto mark;
+ if (*stype == STACK_ZERO) {
+- /* helper can write anything into the stack */
+- *stype = STACK_MISC;
++ if (clobber) {
++ /* helper can write anything into the stack */
++ *stype = STACK_MISC;
++ }
+ goto mark;
+ }
+
+@@ -3761,22 +4087,24 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
+ if (state->stack[spi].slot_type[0] == STACK_SPILL &&
+ (state->stack[spi].spilled_ptr.type == SCALAR_VALUE ||
+ env->allow_ptr_leaks)) {
+- __mark_reg_unknown(env, &state->stack[spi].spilled_ptr);
+- for (j = 0; j < BPF_REG_SIZE; j++)
+- state->stack[spi].slot_type[j] = STACK_MISC;
++ if (clobber) {
++ __mark_reg_unknown(env, &state->stack[spi].spilled_ptr);
++ for (j = 0; j < BPF_REG_SIZE; j++)
++ state->stack[spi].slot_type[j] = STACK_MISC;
++ }
+ goto mark;
+ }
+
+ err:
+ if (tnum_is_const(reg->var_off)) {
+- verbose(env, "invalid indirect read from stack off %d+%d size %d\n",
+- min_off, i - min_off, access_size);
++ verbose(env, "invalid%s read from stack R%d off %d+%d size %d\n",
++ err_extra, regno, min_off, i - min_off, access_size);
+ } else {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+- verbose(env, "invalid indirect read from stack var_off %s+%d size %d\n",
+- tn_buf, i - min_off, access_size);
++ verbose(env, "invalid%s read from stack R%d var_off %s+%d size %d\n",
++ err_extra, regno, tn_buf, i - min_off, access_size);
+ }
+ return -EACCES;
+ mark:
+@@ -3825,8 +4153,10 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
+ "rdwr",
+ &env->prog->aux->max_rdwr_access);
+ case PTR_TO_STACK:
+- return check_stack_boundary(env, regno, access_size,
+- zero_size_allowed, meta);
++ return check_stack_range_initialized(
++ env,
++ regno, reg->off, access_size,
++ zero_size_allowed, ACCESS_HELPER, meta);
+ default: /* scalar_value or invalid ptr */
+ /* Allow zero-byte read from NULL, regardless of pointer type */
+ if (zero_size_allowed && access_size == 0 &&
+@@ -5519,6 +5849,41 @@ static int sanitize_err(struct bpf_verifier_env *env,
+ return -EACCES;
+ }
+
++/* check that stack access falls within stack limits and that 'reg' doesn't
++ * have a variable offset.
++ *
++ * Variable offset is prohibited for unprivileged mode for simplicity since it
++ * requires corresponding support in Spectre masking for stack ALU. See also
++ * retrieve_ptr_limit().
++ *
++ *
++ * 'off' includes 'reg->off'.
++ */
++static int check_stack_access_for_ptr_arithmetic(
++ struct bpf_verifier_env *env,
++ int regno,
++ const struct bpf_reg_state *reg,
++ int off)
++{
++ if (!tnum_is_const(reg->var_off)) {
++ char tn_buf[48];
++
++ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
++ verbose(env, "R%d variable stack access prohibited for !root, var_off=%s off=%d\n",
++ regno, tn_buf, off);
++ return -EACCES;
++ }
++
++ if (off >= 0 || off < -MAX_BPF_STACK) {
++ verbose(env, "R%d stack pointer arithmetic goes out of range, "
++ "prohibited for !root; off=%d\n", regno, off);
++ return -EACCES;
++ }
++
++ return 0;
++}
++
++
+ /* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off.
+ * Caller should also handle BPF_MOV case separately.
+ * If we return -EACCES, caller may want to try again treating pointer as a
+@@ -5753,10 +6118,9 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
+ "prohibited for !root\n", dst);
+ return -EACCES;
+ } else if (dst_reg->type == PTR_TO_STACK &&
+- check_stack_access(env, dst_reg, dst_reg->off +
+- dst_reg->var_off.value, 1)) {
+- verbose(env, "R%d stack pointer arithmetic goes out of range, "
+- "prohibited for !root\n", dst);
++ check_stack_access_for_ptr_arithmetic(
++ env, dst, dst_reg, dst_reg->off +
++ dst_reg->var_off.value)) {
+ return -EACCES;
+ }
+ }
+@@ -11952,6 +12316,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
+ env->strict_alignment = false;
+
+ env->allow_ptr_leaks = bpf_allow_ptr_leaks();
++ env->allow_uninit_stack = bpf_allow_uninit_stack();
+ env->allow_ptr_to_map_access = bpf_allow_ptr_to_map_access();
+ env->bypass_spec_v1 = bpf_bypass_spec_v1();
+ env->bypass_spec_v4 = bpf_bypass_spec_v4();
+--
+2.30.2
+
--- /dev/null
+From 3a78b31ba7c8d170c0fd8d9ed2b42a01cde42d5a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 24 Mar 2021 10:38:26 +0100
+Subject: bpf: Tighten speculative pointer arithmetic mask
+
+From: Daniel Borkmann <daniel@iogearbox.net>
+
+[ Upstream commit 7fedb63a8307dda0ec3b8969a3b233a1dd7ea8e0 ]
+
+This work tightens the offset mask we use for unprivileged pointer arithmetic
+in order to mitigate a corner case reported by Piotr and Benedict where in
+the speculative domain it is possible to advance, for example, the map value
+pointer by up to value_size-1 bytes out-of-bounds in order to leak kernel
+memory via a side channel to user space.
+
+Before this change, the ptr_limit computed by the retrieve_ptr_limit()
+helper represented the largest valid distance when moving the pointer to
+the right or left, which was then fed as aux->alu_limit to generate
+masking instructions against the offset register. After the change, the
+derived aux->alu_limit represents the largest potential value of the
+offset register which we mask against, which is just a narrower subset
+of the former limit.
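+
+A worked example with illustrative numbers: take a fresh map value
+pointer (value_size = 16, pointer umax_value = 0) plus a scalar bounded
+to [0, 10]. The old scheme masked against the largest valid distance,
+ptr_limit = value_size - 0 - 1 = 15. The new scheme takes the limit
+before the operation (0) and after it (10) and masks against
+abs(0 - 10) = 10, the largest value the offset register itself can hold.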
+
+For minimal complexity, we call sanitize_ptr_alu() from two observation
+points in adjust_ptr_min_max_vals(), that is, before and after the
+simulated alu operation. In the first step, we retrieve the alu_state
+and alu_limit before the operation, and we branch off a verifier path
+and push it to the verification stack as we did before; that path checks
+the dst_reg under truncation, in other words, when the speculative
+domain would attempt to move the pointer out-of-bounds.
+
+In the second step, we retrieve the new alu_limit and calculate the
+absolute distance between the two. Moreover, we commit the alu_state and
+final alu_limit
+via update_alu_sanitation_state() to the env's instruction aux data, and bail
+out from there if there is a mismatch due to coming from different verification
+paths with different states.
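+
+Schematically (condensed from the diff below; error handling elided):
+
+  /* phase 1: before the simulated ALU op, using a temporary aux */
+  sanitize_ptr_alu(env, insn, ptr_reg, off_reg, dst_reg, &tmp_aux, false);
+  /* ... simulate BPF_ADD/BPF_SUB into dst_reg ... */
+  /* phase 2: after the op, commit the narrowed limit, where
+   * alu_limit = abs(tmp_aux->alu_limit - alu_limit)
+   */
+  sanitize_ptr_alu(env, insn, dst_reg, off_reg, dst_reg, &tmp_aux, true);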
+
+Reported-by: Piotr Krysiuk <piotras@gmail.com>
+Reported-by: Benedict Schlueter <benedict.schlueter@rub.de>
+Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
+Reviewed-by: John Fastabend <john.fastabend@gmail.com>
+Acked-by: Alexei Starovoitov <ast@kernel.org>
+Tested-by: Benedict Schlueter <benedict.schlueter@rub.de>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/bpf/verifier.c | 73 ++++++++++++++++++++++++++-----------------
+ 1 file changed, 44 insertions(+), 29 deletions(-)
+
+diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
+index 1b97fd364ce2..b9180509917e 100644
+--- a/kernel/bpf/verifier.c
++++ b/kernel/bpf/verifier.c
+@@ -5674,7 +5674,7 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg,
+ bool off_is_neg = off_reg->smin_value < 0;
+ bool mask_to_left = (opcode == BPF_ADD && off_is_neg) ||
+ (opcode == BPF_SUB && !off_is_neg);
+- u32 off, max = 0, ptr_limit = 0;
++ u32 max = 0, ptr_limit = 0;
+
+ if (!tnum_is_const(off_reg->var_off) &&
+ (off_reg->smin_value < 0) != (off_reg->smax_value < 0))
+@@ -5683,26 +5683,18 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg,
+ switch (ptr_reg->type) {
+ case PTR_TO_STACK:
+ /* Offset 0 is out-of-bounds, but acceptable start for the
+- * left direction, see BPF_REG_FP.
++ * left direction, see BPF_REG_FP. Also, unknown scalar
++ * offset where we would need to deal with min/max bounds is
++ * currently prohibited for unprivileged.
+ */
+ max = MAX_BPF_STACK + mask_to_left;
+- /* Indirect variable offset stack access is prohibited in
+- * unprivileged mode so it's not handled here.
+- */
+- off = ptr_reg->off + ptr_reg->var_off.value;
+- if (mask_to_left)
+- ptr_limit = MAX_BPF_STACK + off;
+- else
+- ptr_limit = -off - 1;
++ ptr_limit = -(ptr_reg->var_off.value + ptr_reg->off);
+ break;
+ case PTR_TO_MAP_VALUE:
+ max = ptr_reg->map_ptr->value_size;
+- if (mask_to_left) {
+- ptr_limit = ptr_reg->umax_value + ptr_reg->off;
+- } else {
+- off = ptr_reg->smin_value + ptr_reg->off;
+- ptr_limit = ptr_reg->map_ptr->value_size - off - 1;
+- }
++ ptr_limit = (mask_to_left ?
++ ptr_reg->smin_value :
++ ptr_reg->umax_value) + ptr_reg->off;
+ break;
+ default:
+ return REASON_TYPE;
+@@ -5757,10 +5749,12 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,
+ struct bpf_insn *insn,
+ const struct bpf_reg_state *ptr_reg,
+ const struct bpf_reg_state *off_reg,
+- struct bpf_reg_state *dst_reg)
++ struct bpf_reg_state *dst_reg,
++ struct bpf_insn_aux_data *tmp_aux,
++ const bool commit_window)
+ {
++ struct bpf_insn_aux_data *aux = commit_window ? cur_aux(env) : tmp_aux;
+ struct bpf_verifier_state *vstate = env->cur_state;
+- struct bpf_insn_aux_data *aux = cur_aux(env);
+ bool off_is_neg = off_reg->smin_value < 0;
+ bool ptr_is_dst_reg = ptr_reg == dst_reg;
+ u8 opcode = BPF_OP(insn->code);
+@@ -5779,18 +5773,33 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,
+ if (vstate->speculative)
+ goto do_sim;
+
+- alu_state = off_is_neg ? BPF_ALU_NEG_VALUE : 0;
+- alu_state |= ptr_is_dst_reg ?
+- BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST;
+-
+ err = retrieve_ptr_limit(ptr_reg, off_reg, &alu_limit, opcode);
+ if (err < 0)
+ return err;
+
++ if (commit_window) {
++ /* In commit phase we narrow the masking window based on
++ * the observed pointer move after the simulated operation.
++ */
++ alu_state = tmp_aux->alu_state;
++ alu_limit = abs(tmp_aux->alu_limit - alu_limit);
++ } else {
++ alu_state = off_is_neg ? BPF_ALU_NEG_VALUE : 0;
++ alu_state |= ptr_is_dst_reg ?
++ BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST;
++ }
++
+ err = update_alu_sanitation_state(aux, alu_state, alu_limit);
+ if (err < 0)
+ return err;
+ do_sim:
++ /* If we're in commit phase, we're done here given we already
++ * pushed the truncated dst_reg into the speculative verification
++ * stack.
++ */
++ if (commit_window)
++ return 0;
++
+ /* Simulate and find potential out-of-bounds access under
+ * speculative execution from truncation as a result of
+ * masking when off was not within expected range. If off
+@@ -5933,6 +5942,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
+ smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value;
+ u64 umin_val = off_reg->umin_value, umax_val = off_reg->umax_value,
+ umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value;
++ struct bpf_insn_aux_data tmp_aux = {};
+ u8 opcode = BPF_OP(insn->code);
+ u32 dst = insn->dst_reg;
+ int ret;
+@@ -5999,12 +6009,15 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
+ /* pointer types do not carry 32-bit bounds at the moment. */
+ __mark_reg32_unbounded(dst_reg);
+
+- switch (opcode) {
+- case BPF_ADD:
+- ret = sanitize_ptr_alu(env, insn, ptr_reg, off_reg, dst_reg);
++ if (sanitize_needed(opcode)) {
++ ret = sanitize_ptr_alu(env, insn, ptr_reg, off_reg, dst_reg,
++ &tmp_aux, false);
+ if (ret < 0)
+ return sanitize_err(env, insn, ret, off_reg, dst_reg);
++ }
+
++ switch (opcode) {
++ case BPF_ADD:
+ /* We can take a fixed offset as long as it doesn't overflow
+ * the s32 'off' field
+ */
+@@ -6055,10 +6068,6 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
+ }
+ break;
+ case BPF_SUB:
+- ret = sanitize_ptr_alu(env, insn, ptr_reg, off_reg, dst_reg);
+- if (ret < 0)
+- return sanitize_err(env, insn, ret, off_reg, dst_reg);
+-
+ if (dst_reg == off_reg) {
+ /* scalar -= pointer. Creates an unknown scalar */
+ verbose(env, "R%d tried to subtract pointer from scalar\n",
+@@ -6141,6 +6150,12 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
+
+ if (sanitize_check_bounds(env, insn, dst_reg) < 0)
+ return -EACCES;
++ if (sanitize_needed(opcode)) {
++ ret = sanitize_ptr_alu(env, insn, dst_reg, off_reg, dst_reg,
++ &tmp_aux, true);
++ if (ret < 0)
++ return sanitize_err(env, insn, ret, off_reg, dst_reg);
++ }
+
+ return 0;
+ }
+--
+2.30.2
+