bpf, arm64: JIT support for private stack
author    Puranjay Mohan <puranjay@kernel.org>
Thu, 24 Jul 2025 12:02:54 +0000 (12:02 +0000)
committer Daniel Borkmann <daniel@iogearbox.net>
Sat, 26 Jul 2025 19:26:56 +0000 (21:26 +0200)
The private stack is allocated per-CPU in bpf_int_jit_compile() with
16-byte alignment. The allocation includes a guard region at each end
to detect stack overflows and underflows at runtime.

Memory layout:

              +------------------------------------------------------+
              |                                                      |
              |  16 bytes padding (underflow guard - stack top)      |
              |  [ detects accesses above the top of the stack ]     |
     BPF FP ->+------------------------------------------------------+
              |                                                      |
              |  BPF private stack (sized by verifier)               |
              |  [ 16-byte aligned ]                                 |
              |                                                      |
BPF PRIV SP ->+------------------------------------------------------+
              |                                                      |
              |  16 bytes padding (overflow guard - stack bottom)    |
              |  [ detects writes below the bottom of the stack ]    |
              |                                                      |
              +------------------------------------------------------+
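
The layout above is created in bpf_int_jit_compile(); a condensed
sketch of that path (the full version is in the diff below):

    /* Condensed from bpf_int_jit_compile(): per-CPU allocation of the
     * private stack, bracketed by the two poisoned guard regions.
     */
    priv_stack_alloc_sz = round_up(prog->aux->stack_depth, 16) +
                          2 * PRIV_STACK_GUARD_SZ;
    priv_stack_ptr = __alloc_percpu_gfp(priv_stack_alloc_sz, 16, GFP_KERNEL);
    if (!priv_stack_ptr)
            goto out_priv_stack;    /* allocation failure aborts the JIT */
    priv_stack_init_guard(priv_stack_ptr, priv_stack_alloc_sz);
    prog->aux->priv_stack_ptr = priv_stack_ptr;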

The guard words are checked when the program is freed; if either guard
has been clobbered, the kernel emits a message like:

    BPF private stack overflow/underflow detected for prog <prog_name>
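
The check walks every possible CPU; a condensed sketch of
priv_stack_check_guard() (full version in the diff below):

    /* Condensed from priv_stack_check_guard(); the real check tests
     * both 8-byte words of each guard region.
     */
    int cpu, underflow_idx = (alloc_size - PRIV_STACK_GUARD_SZ) >> 3;

    for_each_possible_cpu(cpu) {
            u64 *stack_ptr = per_cpu_ptr(priv_stack_ptr, cpu);

            if (stack_ptr[0] != PRIV_STACK_GUARD_VAL ||             /* overflow */
                stack_ptr[underflow_idx] != PRIV_STACK_GUARD_VAL) { /* underflow */
                    pr_err("BPF private stack overflow/underflow detected for prog %s\n",
                           bpf_jit_get_prog_name(prog));
                    break;
            }
    }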

After commit bd737fcb6485 ("bpf, arm64: Get rid of fpb"), JITed BPF
programs use the stack in two ways:

1. Via the BPF frame pointer (top of stack), using negative offsets.
2. Via the stack pointer (bottom of stack), using positive offsets in
   LDR/STR instructions.

When a private stack is used, the ARM64 callee-saved register x27
replaces the stack pointer for these accesses. BPF frame pointer usage
is unchanged, but the frame pointer now points to the top of the
private stack.
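
In the prologue, the frame pointer is set to priv_sp + stack_size, so
existing negative-offset accesses keep working. For FP-relative loads
and stores, build_insn() then picks the base register accordingly; a
condensed sketch from the diff below:

    /* Condensed from build_insn(): FP-relative accesses are rebased
     * onto the private stack pointer (x27) when one is in use.
     */
    if (dst == fp) {
            dst_adj = ctx->priv_sp_used ? priv_sp : A64_SP;
            off_adj = off + ctx->stack_size;
    } else {
            dst_adj = dst;
            off_adj = off;
    }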

Relevant tests (enabled in a following patch):

 #415/1   struct_ops_private_stack/private_stack:OK
 #415/2   struct_ops_private_stack/private_stack_fail:OK
 #415/3   struct_ops_private_stack/private_stack_recur:OK
 #415     struct_ops_private_stack:OK
 #549/1   verifier_private_stack/Private stack, single prog:OK
 #549/2   verifier_private_stack/Private stack, subtree > MAX_BPF_STACK:OK
 #549/3   verifier_private_stack/No private stack:OK
 #549/4   verifier_private_stack/Private stack, callback:OK
 #549/5   verifier_private_stack/Private stack, exception in main prog:OK
 #549/6   verifier_private_stack/Private stack, exception in subprog:OK
 #549/7   verifier_private_stack/Private stack, async callback, not nested:OK
 #549/8   verifier_private_stack/Private stack, async callback, potential nesting:OK
 #549     verifier_private_stack:OK
 Summary: 2/11 PASSED, 0 SKIPPED, 0 FAILED

Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/bpf/20250724120257.7299-3-puranjay@kernel.org
arch/arm64/net/bpf_jit_comp.c

index 97ab651c0bd5dc9fdfbe66265c39f575a7534a11..97dfd54328098d85828f3fa6cae71f551f2ca2e0 100644
@@ -30,6 +30,7 @@
 #define TMP_REG_2 (MAX_BPF_JIT_REG + 1)
 #define TCCNT_PTR (MAX_BPF_JIT_REG + 2)
 #define TMP_REG_3 (MAX_BPF_JIT_REG + 3)
+#define PRIVATE_SP (MAX_BPF_JIT_REG + 4)
 #define ARENA_VM_START (MAX_BPF_JIT_REG + 5)
 
 #define check_imm(bits, imm) do {                              \
@@ -68,6 +69,8 @@ static const int bpf2a64[] = {
        [TCCNT_PTR] = A64_R(26),
        /* temporary register for blinding constants */
        [BPF_REG_AX] = A64_R(9),
+       /* callee saved register for private stack pointer */
+       [PRIVATE_SP] = A64_R(27),
        /* callee saved register for kern_vm_start address */
        [ARENA_VM_START] = A64_R(28),
 };
@@ -86,6 +89,7 @@ struct jit_ctx {
        u64 user_vm_start;
        u64 arena_vm_start;
        bool fp_used;
+       bool priv_sp_used;
        bool write;
 };
 
@@ -98,6 +102,10 @@ struct bpf_plt {
 #define PLT_TARGET_SIZE   sizeof_field(struct bpf_plt, target)
 #define PLT_TARGET_OFFSET offsetof(struct bpf_plt, target)
 
+/* Guard size and poison value to detect private stack overflow/underflow */
+#define PRIV_STACK_GUARD_SZ    16
+#define PRIV_STACK_GUARD_VAL   0xEB9F12345678eb9fULL
+
 static inline void emit(const u32 insn, struct jit_ctx *ctx)
 {
        if (ctx->image != NULL && ctx->write)
@@ -387,8 +395,11 @@ static void find_used_callee_regs(struct jit_ctx *ctx)
        if (reg_used & 8)
                ctx->used_callee_reg[i++] = bpf2a64[BPF_REG_9];
 
-       if (reg_used & 16)
+       if (reg_used & 16) {
                ctx->used_callee_reg[i++] = bpf2a64[BPF_REG_FP];
+               if (ctx->priv_sp_used)
+                       ctx->used_callee_reg[i++] = bpf2a64[PRIVATE_SP];
+       }
 
        if (ctx->arena_vm_start)
                ctx->used_callee_reg[i++] = bpf2a64[ARENA_VM_START];
@@ -462,6 +473,19 @@ static void pop_callee_regs(struct jit_ctx *ctx)
        }
 }
 
+static void emit_percpu_ptr(const u8 dst_reg, void __percpu *ptr,
+                           struct jit_ctx *ctx)
+{
+       const u8 tmp = bpf2a64[TMP_REG_1];
+
+       emit_a64_mov_i64(dst_reg, (__force const u64)ptr, ctx);
+       if (cpus_have_cap(ARM64_HAS_VIRT_HOST_EXTN))
+               emit(A64_MRS_TPIDR_EL2(tmp), ctx);
+       else
+               emit(A64_MRS_TPIDR_EL1(tmp), ctx);
+       emit(A64_ADD(1, dst_reg, dst_reg, tmp), ctx);
+}
+
 #define BTI_INSNS (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) ? 1 : 0)
 #define PAC_INSNS (IS_ENABLED(CONFIG_ARM64_PTR_AUTH_KERNEL) ? 1 : 0)
 
@@ -477,6 +501,8 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf)
        const bool is_main_prog = !bpf_is_subprog(prog);
        const u8 fp = bpf2a64[BPF_REG_FP];
        const u8 arena_vm_base = bpf2a64[ARENA_VM_START];
+       const u8 priv_sp = bpf2a64[PRIVATE_SP];
+       void __percpu *priv_stack_ptr;
        const int idx0 = ctx->idx;
        int cur_offset;
 
@@ -552,15 +578,23 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf)
                emit(A64_SUB_I(1, A64_SP, A64_FP, 96), ctx);
        }
 
-       if (ctx->fp_used)
-               /* Set up BPF prog stack base register */
-               emit(A64_MOV(1, fp, A64_SP), ctx);
-
        /* Stack must be multiples of 16B */
        ctx->stack_size = round_up(prog->aux->stack_depth, 16);
 
+       if (ctx->fp_used) {
+               if (ctx->priv_sp_used) {
+                       /* Set up private stack pointer */
+                       priv_stack_ptr = prog->aux->priv_stack_ptr + PRIV_STACK_GUARD_SZ;
+                       emit_percpu_ptr(priv_sp, priv_stack_ptr, ctx);
+                       emit(A64_ADD_I(1, fp, priv_sp, ctx->stack_size), ctx);
+               } else {
+                       /* Set up BPF prog stack base register */
+                       emit(A64_MOV(1, fp, A64_SP), ctx);
+               }
+       }
+
        /* Set up function call stack */
-       if (ctx->stack_size)
+       if (ctx->stack_size && !ctx->priv_sp_used)
                emit(A64_SUB_I(1, A64_SP, A64_SP, ctx->stack_size), ctx);
 
        if (ctx->arena_vm_start)
@@ -624,7 +658,7 @@ static int emit_bpf_tail_call(struct jit_ctx *ctx)
        emit(A64_STR64I(tcc, ptr, 0), ctx);
 
        /* restore SP */
-       if (ctx->stack_size)
+       if (ctx->stack_size && !ctx->priv_sp_used)
                emit(A64_ADD_I(1, A64_SP, A64_SP, ctx->stack_size), ctx);
 
        pop_callee_regs(ctx);
@@ -992,7 +1026,7 @@ static void build_epilogue(struct jit_ctx *ctx, bool was_classic)
        const u8 ptr = bpf2a64[TCCNT_PTR];
 
        /* We're done with BPF stack */
-       if (ctx->stack_size)
+       if (ctx->stack_size && !ctx->priv_sp_used)
                emit(A64_ADD_I(1, A64_SP, A64_SP, ctx->stack_size), ctx);
 
        pop_callee_regs(ctx);
@@ -1121,6 +1155,7 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
        const u8 tmp2 = bpf2a64[TMP_REG_2];
        const u8 fp = bpf2a64[BPF_REG_FP];
        const u8 arena_vm_base = bpf2a64[ARENA_VM_START];
+       const u8 priv_sp = bpf2a64[PRIVATE_SP];
        const s16 off = insn->off;
        const s32 imm = insn->imm;
        const int i = insn - ctx->prog->insnsi;
@@ -1565,7 +1600,7 @@ emit_cond_jmp:
                        src = tmp2;
                }
                if (src == fp) {
-                       src_adj = A64_SP;
+                       src_adj = ctx->priv_sp_used ? priv_sp : A64_SP;
                        off_adj = off + ctx->stack_size;
                } else {
                        src_adj = src;
@@ -1655,7 +1690,7 @@ emit_cond_jmp:
                        dst = tmp2;
                }
                if (dst == fp) {
-                       dst_adj = A64_SP;
+                       dst_adj = ctx->priv_sp_used ? priv_sp : A64_SP;
                        off_adj = off + ctx->stack_size;
                } else {
                        dst_adj = dst;
@@ -1717,7 +1752,7 @@ emit_cond_jmp:
                        dst = tmp2;
                }
                if (dst == fp) {
-                       dst_adj = A64_SP;
+                       dst_adj = ctx->priv_sp_used ? priv_sp : A64_SP;
                        off_adj = off + ctx->stack_size;
                } else {
                        dst_adj = dst;
@@ -1860,6 +1895,39 @@ static inline void bpf_flush_icache(void *start, void *end)
        flush_icache_range((unsigned long)start, (unsigned long)end);
 }
 
+static void priv_stack_init_guard(void __percpu *priv_stack_ptr, int alloc_size)
+{
+       int cpu, underflow_idx = (alloc_size - PRIV_STACK_GUARD_SZ) >> 3;
+       u64 *stack_ptr;
+
+       for_each_possible_cpu(cpu) {
+               stack_ptr = per_cpu_ptr(priv_stack_ptr, cpu);
+               stack_ptr[0] = PRIV_STACK_GUARD_VAL;
+               stack_ptr[1] = PRIV_STACK_GUARD_VAL;
+               stack_ptr[underflow_idx] = PRIV_STACK_GUARD_VAL;
+               stack_ptr[underflow_idx + 1] = PRIV_STACK_GUARD_VAL;
+       }
+}
+
+static void priv_stack_check_guard(void __percpu *priv_stack_ptr, int alloc_size,
+                                  struct bpf_prog *prog)
+{
+       int cpu, underflow_idx = (alloc_size - PRIV_STACK_GUARD_SZ) >> 3;
+       u64 *stack_ptr;
+
+       for_each_possible_cpu(cpu) {
+               stack_ptr = per_cpu_ptr(priv_stack_ptr, cpu);
+               if (stack_ptr[0] != PRIV_STACK_GUARD_VAL ||
+                   stack_ptr[1] != PRIV_STACK_GUARD_VAL ||
+                   stack_ptr[underflow_idx] != PRIV_STACK_GUARD_VAL ||
+                   stack_ptr[underflow_idx + 1] != PRIV_STACK_GUARD_VAL) {
+                       pr_err("BPF private stack overflow/underflow detected for prog %s\n",
+                              bpf_jit_get_prog_name(prog));
+                       break;
+               }
+       }
+}
+
 struct arm64_jit_data {
        struct bpf_binary_header *header;
        u8 *ro_image;
@@ -1872,9 +1940,11 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
        int image_size, prog_size, extable_size, extable_align, extable_offset;
        struct bpf_prog *tmp, *orig_prog = prog;
        struct bpf_binary_header *header;
-       struct bpf_binary_header *ro_header;
+       struct bpf_binary_header *ro_header = NULL;
        struct arm64_jit_data *jit_data;
+       void __percpu *priv_stack_ptr = NULL;
        bool was_classic = bpf_prog_was_classic(prog);
+       int priv_stack_alloc_sz;
        bool tmp_blinded = false;
        bool extra_pass = false;
        struct jit_ctx ctx;
@@ -1906,6 +1976,23 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
                }
                prog->aux->jit_data = jit_data;
        }
+       priv_stack_ptr = prog->aux->priv_stack_ptr;
+       if (!priv_stack_ptr && prog->aux->jits_use_priv_stack) {
+       /* Allocate the private stack: the verifier-calculated stack
+        * size plus two guard regions to detect overflow and
+        * underflow.
+        */
+               priv_stack_alloc_sz = round_up(prog->aux->stack_depth, 16) +
+                                     2 * PRIV_STACK_GUARD_SZ;
+               priv_stack_ptr = __alloc_percpu_gfp(priv_stack_alloc_sz, 16, GFP_KERNEL);
+               if (!priv_stack_ptr) {
+                       prog = orig_prog;
+                       goto out_priv_stack;
+               }
+
+               priv_stack_init_guard(priv_stack_ptr, priv_stack_alloc_sz);
+               prog->aux->priv_stack_ptr = priv_stack_ptr;
+       }
        if (jit_data->ctx.offset) {
                ctx = jit_data->ctx;
                ro_image_ptr = jit_data->ro_image;
@@ -1929,6 +2016,9 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
        ctx.user_vm_start = bpf_arena_get_user_vm_start(prog->aux->arena);
        ctx.arena_vm_start = bpf_arena_get_kern_vm_start(prog->aux->arena);
 
+       if (priv_stack_ptr)
+               ctx.priv_sp_used = true;
+
        /* Pass 1: Estimate the maximum image size.
         *
         * BPF line info needs ctx->offset[i] to be the offset of
@@ -2068,7 +2158,12 @@ skip_init_ctx:
                        ctx.offset[i] *= AARCH64_INSN_SIZE;
                bpf_prog_fill_jited_linfo(prog, ctx.offset + 1);
 out_off:
+               if (!ro_header && priv_stack_ptr) {
+                       free_percpu(priv_stack_ptr);
+                       prog->aux->priv_stack_ptr = NULL;
+               }
                kvfree(ctx.offset);
+out_priv_stack:
                kfree(jit_data);
                prog->aux->jit_data = NULL;
        }
@@ -2087,6 +2182,11 @@ out_free_hdr:
        goto out_off;
 }
 
+bool bpf_jit_supports_private_stack(void)
+{
+       return true;
+}
+
 bool bpf_jit_supports_kfunc_call(void)
 {
        return true;
@@ -2932,6 +3032,8 @@ void bpf_jit_free(struct bpf_prog *prog)
        if (prog->jited) {
                struct arm64_jit_data *jit_data = prog->aux->jit_data;
                struct bpf_binary_header *hdr;
+               void __percpu *priv_stack_ptr;
+               int priv_stack_alloc_sz;
 
                /*
                 * If we fail the final pass of JIT (from jit_subprogs),
@@ -2945,6 +3047,13 @@ void bpf_jit_free(struct bpf_prog *prog)
                }
                hdr = bpf_jit_binary_pack_hdr(prog);
                bpf_jit_binary_pack_free(hdr, NULL);
+               priv_stack_ptr = prog->aux->priv_stack_ptr;
+               if (priv_stack_ptr) {
+                       priv_stack_alloc_sz = round_up(prog->aux->stack_depth, 16) +
+                                             2 * PRIV_STACK_GUARD_SZ;
+                       priv_stack_check_guard(priv_stack_ptr, priv_stack_alloc_sz, prog);
+                       free_percpu(prog->aux->priv_stack_ptr);
+               }
                WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(prog));
        }