From: Greg Kroah-Hartman Date: Fri, 9 Mar 2018 22:17:02 +0000 (-0800) Subject: 4.15-stable patches X-Git-Tag: v3.18.99~9 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=0f18866ec5ff822fc59634662c0197926234190e;p=thirdparty%2Fkernel%2Fstable-queue.git 4.15-stable patches added patches: bpf-add-schedule-points-in-percpu-arrays-management.patch bpf-allow-xadd-only-on-aligned-memory.patch bpf-arm64-fix-out-of-bounds-access-in-tail-call.patch bpf-fix-memory-leak-in-lpm_trie-map_free-callback-function.patch bpf-fix-mlock-precharge-on-arraymaps.patch bpf-fix-rcu-lockdep-warning-for-lpm_trie-map_free-callback.patch bpf-ppc64-fix-out-of-bounds-access-in-tail-call.patch bpf-x64-implement-retpoline-for-tail-call.patch --- diff --git a/queue-4.15/bpf-add-schedule-points-in-percpu-arrays-management.patch b/queue-4.15/bpf-add-schedule-points-in-percpu-arrays-management.patch new file mode 100644 index 00000000000..b342b7c9328 --- /dev/null +++ b/queue-4.15/bpf-add-schedule-points-in-percpu-arrays-management.patch @@ -0,0 +1,52 @@ +From foo@baz Fri Mar 9 14:15:30 PST 2018 +From: Daniel Borkmann +Date: Thu, 8 Mar 2018 13:16:47 +0100 +Subject: bpf: add schedule points in percpu arrays management +To: gregkh@linuxfoundation.org +Cc: ast@kernel.org, daniel@iogearbox.net, stable@vger.kernel.org, Eric Dumazet +Message-ID: <96ee52a83c789138e21bf415a34569d15b12948a.1520507630.git.daniel@iogearbox.net> + +From: Eric Dumazet + +[ upstream commit 32fff239de37ef226d5b66329dd133f64d63b22d ] + +syszbot managed to trigger RCU detected stalls in +bpf_array_free_percpu() + +It takes time to allocate a huge percpu map, but even more time to free +it. + +Since we run in process context, use cond_resched() to yield cpu if +needed. + +Fixes: a10423b87a7e ("bpf: introduce BPF_MAP_TYPE_PERCPU_ARRAY map") +Signed-off-by: Eric Dumazet +Reported-by: syzbot +Signed-off-by: Daniel Borkmann +Signed-off-by: Greg Kroah-Hartman +--- + kernel/bpf/arraymap.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +--- a/kernel/bpf/arraymap.c ++++ b/kernel/bpf/arraymap.c +@@ -26,8 +26,10 @@ static void bpf_array_free_percpu(struct + { + int i; + +- for (i = 0; i < array->map.max_entries; i++) ++ for (i = 0; i < array->map.max_entries; i++) { + free_percpu(array->pptrs[i]); ++ cond_resched(); ++ } + } + + static int bpf_array_alloc_percpu(struct bpf_array *array) +@@ -43,6 +45,7 @@ static int bpf_array_alloc_percpu(struct + return -ENOMEM; + } + array->pptrs[i] = ptr; ++ cond_resched(); + } + + return 0; diff --git a/queue-4.15/bpf-allow-xadd-only-on-aligned-memory.patch b/queue-4.15/bpf-allow-xadd-only-on-aligned-memory.patch new file mode 100644 index 00000000000..0d7648f2db3 --- /dev/null +++ b/queue-4.15/bpf-allow-xadd-only-on-aligned-memory.patch @@ -0,0 +1,258 @@ +From foo@baz Fri Mar 9 14:15:30 PST 2018 +From: Daniel Borkmann +Date: Thu, 8 Mar 2018 13:16:48 +0100 +Subject: bpf: allow xadd only on aligned memory +To: gregkh@linuxfoundation.org +Cc: ast@kernel.org, daniel@iogearbox.net, stable@vger.kernel.org +Message-ID: <3cc2bcb1248a1821430d0e7574a8dbd9d387f7ec.1520507630.git.daniel@iogearbox.net> + +From: Daniel Borkmann + +[ upstream commit ca36960211eb228bcbc7aaebfa0d027368a94c60 ] + +The requirements around atomic_add() / atomic64_add() resp. their +JIT implementations differ across architectures. E.g. 
while x86_64 +seems just fine with BPF's xadd on unaligned memory, on arm64 it +triggers via interpreter but also JIT the following crash: + + [ 830.864985] Unable to handle kernel paging request at virtual address ffff8097d7ed6703 + [...] + [ 830.916161] Internal error: Oops: 96000021 [#1] SMP + [ 830.984755] CPU: 37 PID: 2788 Comm: test_verifier Not tainted 4.16.0-rc2+ #8 + [ 830.991790] Hardware name: Huawei TaiShan 2280 /BC11SPCD, BIOS 1.29 07/17/2017 + [ 830.998998] pstate: 80400005 (Nzcv daif +PAN -UAO) + [ 831.003793] pc : __ll_sc_atomic_add+0x4/0x18 + [ 831.008055] lr : ___bpf_prog_run+0x1198/0x1588 + [ 831.012485] sp : ffff00001ccabc20 + [ 831.015786] x29: ffff00001ccabc20 x28: ffff8017d56a0f00 + [ 831.021087] x27: 0000000000000001 x26: 0000000000000000 + [ 831.026387] x25: 000000c168d9db98 x24: 0000000000000000 + [ 831.031686] x23: ffff000008203878 x22: ffff000009488000 + [ 831.036986] x21: ffff000008b14e28 x20: ffff00001ccabcb0 + [ 831.042286] x19: ffff0000097b5080 x18: 0000000000000a03 + [ 831.047585] x17: 0000000000000000 x16: 0000000000000000 + [ 831.052885] x15: 0000ffffaeca8000 x14: 0000000000000000 + [ 831.058184] x13: 0000000000000000 x12: 0000000000000000 + [ 831.063484] x11: 0000000000000001 x10: 0000000000000000 + [ 831.068783] x9 : 0000000000000000 x8 : 0000000000000000 + [ 831.074083] x7 : 0000000000000000 x6 : 000580d428000000 + [ 831.079383] x5 : 0000000000000018 x4 : 0000000000000000 + [ 831.084682] x3 : ffff00001ccabcb0 x2 : 0000000000000001 + [ 831.089982] x1 : ffff8097d7ed6703 x0 : 0000000000000001 + [ 831.095282] Process test_verifier (pid: 2788, stack limit = 0x0000000018370044) + [ 831.102577] Call trace: + [ 831.105012] __ll_sc_atomic_add+0x4/0x18 + [ 831.108923] __bpf_prog_run32+0x4c/0x70 + [ 831.112748] bpf_test_run+0x78/0xf8 + [ 831.116224] bpf_prog_test_run_xdp+0xb4/0x120 + [ 831.120567] SyS_bpf+0x77c/0x1110 + [ 831.123873] el0_svc_naked+0x30/0x34 + [ 831.127437] Code: 97fffe97 17ffffec 00000000 f9800031 (885f7c31) + +Reason for this is because memory is required to be aligned. In +case of BPF, we always enforce alignment in terms of stack access, +but not when accessing map values or packet data when the underlying +arch (e.g. arm64) has CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS set. + +xadd on packet data that is local to us anyway is just wrong, so +forbid this case entirely. The only place where xadd makes sense in +fact are map values; xadd on stack is wrong as well, but it's been +around for much longer. Specifically enforce strict alignment in case +of xadd, so that we handle this case generically and avoid such crashes +in the first place. 
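+
+As a companion illustration (not part of the upstream change), the same
+BPF_W xadd is still accepted once the map value access is aligned; a
+sketch of such a selftest entry, mirroring the rejected "unaligned map"
+case added below but using offset 0 instead of 3:
+
+  {
+          "xadd/w check aligned map (sketch)",
+          .insns = {
+          BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+          BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+          BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+          BPF_LD_MAP_FD(BPF_REG_1, 0),
+          BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+                       BPF_FUNC_map_lookup_elem),
+          BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+          BPF_EXIT_INSN(),
+          BPF_MOV64_IMM(BPF_REG_1, 1),
+          /* 4-byte atomic add at a 4-byte aligned offset in the value */
+          BPF_STX_XADD(BPF_W, BPF_REG_0, BPF_REG_1, 0),
+          BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0),
+          BPF_EXIT_INSN(),
+          },
+          .fixup_map1 = { 3 },
+          .result = ACCEPT,
+          .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+  },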
+ +Fixes: 17a5267067f3 ("bpf: verifier (add verifier core)") +Signed-off-by: Daniel Borkmann +Signed-off-by: Alexei Starovoitov +Signed-off-by: Daniel Borkmann +Signed-off-by: Greg Kroah-Hartman +--- + kernel/bpf/verifier.c | 42 ++++++++++++-------- + tools/testing/selftests/bpf/test_verifier.c | 58 ++++++++++++++++++++++++++++ + 2 files changed, 84 insertions(+), 16 deletions(-) + +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -985,6 +985,13 @@ static bool is_ctx_reg(struct bpf_verifi + return reg->type == PTR_TO_CTX; + } + ++static bool is_pkt_reg(struct bpf_verifier_env *env, int regno) ++{ ++ const struct bpf_reg_state *reg = cur_regs(env) + regno; ++ ++ return type_is_pkt_pointer(reg->type); ++} ++ + static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, + int off, int size, bool strict) +@@ -1045,10 +1052,10 @@ static int check_generic_ptr_alignment(s + } + + static int check_ptr_alignment(struct bpf_verifier_env *env, +- const struct bpf_reg_state *reg, +- int off, int size) ++ const struct bpf_reg_state *reg, int off, ++ int size, bool strict_alignment_once) + { +- bool strict = env->strict_alignment; ++ bool strict = env->strict_alignment || strict_alignment_once; + const char *pointer_desc = ""; + + switch (reg->type) { +@@ -1108,9 +1115,9 @@ static void coerce_reg_to_size(struct bp + * if t==write && value_regno==-1, some unknown value is stored into memory + * if t==read && value_regno==-1, don't care what we read from memory + */ +-static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, int off, +- int bpf_size, enum bpf_access_type t, +- int value_regno) ++static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, ++ int off, int bpf_size, enum bpf_access_type t, ++ int value_regno, bool strict_alignment_once) + { + struct bpf_verifier_state *state = env->cur_state; + struct bpf_reg_state *regs = cur_regs(env); +@@ -1122,7 +1129,7 @@ static int check_mem_access(struct bpf_v + return size; + + /* alignment checks will add in reg->off themselves */ +- err = check_ptr_alignment(env, reg, off, size); ++ err = check_ptr_alignment(env, reg, off, size, strict_alignment_once); + if (err) + return err; + +@@ -1265,21 +1272,23 @@ static int check_xadd(struct bpf_verifie + return -EACCES; + } + +- if (is_ctx_reg(env, insn->dst_reg)) { +- verbose(env, "BPF_XADD stores into R%d context is not allowed\n", +- insn->dst_reg); ++ if (is_ctx_reg(env, insn->dst_reg) || ++ is_pkt_reg(env, insn->dst_reg)) { ++ verbose(env, "BPF_XADD stores into R%d %s is not allowed\n", ++ insn->dst_reg, is_ctx_reg(env, insn->dst_reg) ? ++ "context" : "packet"); + return -EACCES; + } + + /* check whether atomic_add can read the memory */ + err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, +- BPF_SIZE(insn->code), BPF_READ, -1); ++ BPF_SIZE(insn->code), BPF_READ, -1, true); + if (err) + return err; + + /* check whether atomic_add can write into the same memory */ + return check_mem_access(env, insn_idx, insn->dst_reg, insn->off, +- BPF_SIZE(insn->code), BPF_WRITE, -1); ++ BPF_SIZE(insn->code), BPF_WRITE, -1, true); + } + + /* Does this register contain a constant zero? */ +@@ -1763,7 +1772,8 @@ static int check_call(struct bpf_verifie + * is inferred from register state. 
+ */ + for (i = 0; i < meta.access_size; i++) { +- err = check_mem_access(env, insn_idx, meta.regno, i, BPF_B, BPF_WRITE, -1); ++ err = check_mem_access(env, insn_idx, meta.regno, i, BPF_B, ++ BPF_WRITE, -1, false); + if (err) + return err; + } +@@ -3933,7 +3943,7 @@ static int do_check(struct bpf_verifier_ + */ + err = check_mem_access(env, insn_idx, insn->src_reg, insn->off, + BPF_SIZE(insn->code), BPF_READ, +- insn->dst_reg); ++ insn->dst_reg, false); + if (err) + return err; + +@@ -3985,7 +3995,7 @@ static int do_check(struct bpf_verifier_ + /* check that memory (dst_reg + off) is writeable */ + err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, + BPF_SIZE(insn->code), BPF_WRITE, +- insn->src_reg); ++ insn->src_reg, false); + if (err) + return err; + +@@ -4020,7 +4030,7 @@ static int do_check(struct bpf_verifier_ + /* check that memory (dst_reg + off) is writeable */ + err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, + BPF_SIZE(insn->code), BPF_WRITE, +- -1); ++ -1, false); + if (err) + return err; + +--- a/tools/testing/selftests/bpf/test_verifier.c ++++ b/tools/testing/selftests/bpf/test_verifier.c +@@ -8852,6 +8852,64 @@ static struct bpf_test tests[] = { + .result = REJECT, + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK, + }, ++ { ++ "xadd/w check unaligned stack", ++ .insns = { ++ BPF_MOV64_IMM(BPF_REG_0, 1), ++ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8), ++ BPF_STX_XADD(BPF_W, BPF_REG_10, BPF_REG_0, -7), ++ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8), ++ BPF_EXIT_INSN(), ++ }, ++ .result = REJECT, ++ .errstr = "misaligned stack access off", ++ .prog_type = BPF_PROG_TYPE_SCHED_CLS, ++ }, ++ { ++ "xadd/w check unaligned map", ++ .insns = { ++ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), ++ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), ++ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), ++ BPF_LD_MAP_FD(BPF_REG_1, 0), ++ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, ++ BPF_FUNC_map_lookup_elem), ++ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), ++ BPF_EXIT_INSN(), ++ BPF_MOV64_IMM(BPF_REG_1, 1), ++ BPF_STX_XADD(BPF_W, BPF_REG_0, BPF_REG_1, 3), ++ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 3), ++ BPF_EXIT_INSN(), ++ }, ++ .fixup_map1 = { 3 }, ++ .result = REJECT, ++ .errstr = "misaligned value access off", ++ .prog_type = BPF_PROG_TYPE_SCHED_CLS, ++ }, ++ { ++ "xadd/w check unaligned pkt", ++ .insns = { ++ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, ++ offsetof(struct xdp_md, data)), ++ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, ++ offsetof(struct xdp_md, data_end)), ++ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2), ++ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), ++ BPF_JMP_REG(BPF_JLT, BPF_REG_1, BPF_REG_3, 2), ++ BPF_MOV64_IMM(BPF_REG_0, 99), ++ BPF_JMP_IMM(BPF_JA, 0, 0, 6), ++ BPF_MOV64_IMM(BPF_REG_0, 1), ++ BPF_ST_MEM(BPF_W, BPF_REG_2, 0, 0), ++ BPF_ST_MEM(BPF_W, BPF_REG_2, 3, 0), ++ BPF_STX_XADD(BPF_W, BPF_REG_2, BPF_REG_0, 1), ++ BPF_STX_XADD(BPF_W, BPF_REG_2, BPF_REG_0, 2), ++ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_2, 1), ++ BPF_EXIT_INSN(), ++ }, ++ .result = REJECT, ++ .errstr = "BPF_XADD stores into R2 packet", ++ .prog_type = BPF_PROG_TYPE_XDP, ++ }, + }; + + static int probe_filter_length(const struct bpf_insn *fp) diff --git a/queue-4.15/bpf-arm64-fix-out-of-bounds-access-in-tail-call.patch b/queue-4.15/bpf-arm64-fix-out-of-bounds-access-in-tail-call.patch new file mode 100644 index 00000000000..8c312098be9 --- /dev/null +++ b/queue-4.15/bpf-arm64-fix-out-of-bounds-access-in-tail-call.patch @@ -0,0 +1,158 @@ +From foo@baz Fri Mar 9 14:15:30 PST 2018 +From: Daniel Borkmann +Date: Thu, 8 Mar 2018 
13:16:46 +0100 +Subject: bpf, arm64: fix out of bounds access in tail call +To: gregkh@linuxfoundation.org +Cc: ast@kernel.org, daniel@iogearbox.net, stable@vger.kernel.org +Message-ID: + +From: Daniel Borkmann + +[ upstream commit 16338a9b3ac30740d49f5dfed81bac0ffa53b9c7 ] + +I recently noticed a crash on arm64 when feeding a bogus index +into BPF tail call helper. The crash would not occur when the +interpreter is used, but only in case of JIT. Output looks as +follows: + + [ 347.007486] Unable to handle kernel paging request at virtual address fffb850e96492510 + [...] + [ 347.043065] [fffb850e96492510] address between user and kernel address ranges + [ 347.050205] Internal error: Oops: 96000004 [#1] SMP + [...] + [ 347.190829] x13: 0000000000000000 x12: 0000000000000000 + [ 347.196128] x11: fffc047ebe782800 x10: ffff808fd7d0fd10 + [ 347.201427] x9 : 0000000000000000 x8 : 0000000000000000 + [ 347.206726] x7 : 0000000000000000 x6 : 001c991738000000 + [ 347.212025] x5 : 0000000000000018 x4 : 000000000000ba5a + [ 347.217325] x3 : 00000000000329c4 x2 : ffff808fd7cf0500 + [ 347.222625] x1 : ffff808fd7d0fc00 x0 : ffff808fd7cf0500 + [ 347.227926] Process test_verifier (pid: 4548, stack limit = 0x000000007467fa61) + [ 347.235221] Call trace: + [ 347.237656] 0xffff000002f3a4fc + [ 347.240784] bpf_test_run+0x78/0xf8 + [ 347.244260] bpf_prog_test_run_skb+0x148/0x230 + [ 347.248694] SyS_bpf+0x77c/0x1110 + [ 347.251999] el0_svc_naked+0x30/0x34 + [ 347.255564] Code: 9100075a d280220a 8b0a002a d37df04b (f86b694b) + [...] + +In this case the index used in BPF r3 is the same as in r1 +at the time of the call, meaning we fed a pointer as index; +here, it had the value 0xffff808fd7cf0500 which sits in x2. + +While I found tail calls to be working in general (also for +hitting the error cases), I noticed the following in the code +emission: + + # bpftool p d j i 988 + [...] + 38: ldr w10, [x1,x10] + 3c: cmp w2, w10 + 40: b.ge 0x000000000000007c <-- signed cmp + 44: mov x10, #0x20 // #32 + 48: cmp x26, x10 + 4c: b.gt 0x000000000000007c + 50: add x26, x26, #0x1 + 54: mov x10, #0x110 // #272 + 58: add x10, x1, x10 + 5c: lsl x11, x2, #3 + 60: ldr x11, [x10,x11] <-- faulting insn (f86b694b) + 64: cbz x11, 0x000000000000007c + [...] + +Meaning, the tests passed because commit ddb55992b04d ("arm64: +bpf: implement bpf_tail_call() helper") was using signed compares +instead of unsigned which as a result had the test wrongly passing. + +Change this but also the tail call count test both into unsigned +and cap the index as u32. Latter we did as well in 90caccdd8cc0 +("bpf: fix bpf_tail_call() x64 JIT") and is needed in addition here, +too. Tested on HiSilicon Hi1616. + +Result after patch: + + # bpftool p d j i 268 + [...] + 38: ldr w10, [x1,x10] + 3c: add w2, w2, #0x0 + 40: cmp w2, w10 + 44: b.cs 0x0000000000000080 + 48: mov x10, #0x20 // #32 + 4c: cmp x26, x10 + 50: b.hi 0x0000000000000080 + 54: add x26, x26, #0x1 + 58: mov x10, #0x110 // #272 + 5c: add x10, x1, x10 + 60: lsl x11, x2, #3 + 64: ldr x11, [x10,x11] + 68: cbz x11, 0x0000000000000080 + [...] 
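+
+The difference between the two condition codes is easiest to see on the
+bogus index itself (stand-alone user-space sketch for illustration only,
+not part of the fix; 0xd7cf0500 are the low 32 bits of the pointer value
+that was fed in as index above):
+
+  #include <stdint.h>
+  #include <stdio.h>
+
+  int main(void)
+  {
+          uint32_t max_entries = 1;        /* map with a single slot */
+          uint32_t index = 0xd7cf0500;     /* bogus index from BPF r3 */
+
+          /* old A64_COND_GE check: signed, the huge index looks
+           * negative and wrongly passes the bounds test
+           */
+          printf("signed   out-of-bounds: %d\n",
+                 (int32_t)index >= (int32_t)max_entries);
+
+          /* new A64_COND_CS check: unsigned, correctly rejected */
+          printf("unsigned out-of-bounds: %d\n", index >= max_entries);
+          return 0;
+  }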
+ +Fixes: ddb55992b04d ("arm64: bpf: implement bpf_tail_call() helper") +Signed-off-by: Daniel Borkmann +Signed-off-by: Alexei Starovoitov +Signed-off-by: Daniel Borkmann +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm64/net/bpf_jit_comp.c | 5 +++-- + tools/testing/selftests/bpf/test_verifier.c | 26 ++++++++++++++++++++++++++ + 2 files changed, 29 insertions(+), 2 deletions(-) + +--- a/arch/arm64/net/bpf_jit_comp.c ++++ b/arch/arm64/net/bpf_jit_comp.c +@@ -238,8 +238,9 @@ static int emit_bpf_tail_call(struct jit + off = offsetof(struct bpf_array, map.max_entries); + emit_a64_mov_i64(tmp, off, ctx); + emit(A64_LDR32(tmp, r2, tmp), ctx); ++ emit(A64_MOV(0, r3, r3), ctx); + emit(A64_CMP(0, r3, tmp), ctx); +- emit(A64_B_(A64_COND_GE, jmp_offset), ctx); ++ emit(A64_B_(A64_COND_CS, jmp_offset), ctx); + + /* if (tail_call_cnt > MAX_TAIL_CALL_CNT) + * goto out; +@@ -247,7 +248,7 @@ static int emit_bpf_tail_call(struct jit + */ + emit_a64_mov_i64(tmp, MAX_TAIL_CALL_CNT, ctx); + emit(A64_CMP(1, tcc, tmp), ctx); +- emit(A64_B_(A64_COND_GT, jmp_offset), ctx); ++ emit(A64_B_(A64_COND_HI, jmp_offset), ctx); + emit(A64_ADD_I(1, tcc, tcc, 1), ctx); + + /* prog = array->ptrs[index]; +--- a/tools/testing/selftests/bpf/test_verifier.c ++++ b/tools/testing/selftests/bpf/test_verifier.c +@@ -2255,6 +2255,32 @@ static struct bpf_test tests[] = { + .result = ACCEPT, + }, + { ++ "runtime/jit: pass negative index to tail_call", ++ .insns = { ++ BPF_MOV64_IMM(BPF_REG_3, -1), ++ BPF_LD_MAP_FD(BPF_REG_2, 0), ++ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, ++ BPF_FUNC_tail_call), ++ BPF_MOV64_IMM(BPF_REG_0, 0), ++ BPF_EXIT_INSN(), ++ }, ++ .fixup_prog = { 1 }, ++ .result = ACCEPT, ++ }, ++ { ++ "runtime/jit: pass > 32bit index to tail_call", ++ .insns = { ++ BPF_LD_IMM64(BPF_REG_3, 0x100000000ULL), ++ BPF_LD_MAP_FD(BPF_REG_2, 0), ++ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, ++ BPF_FUNC_tail_call), ++ BPF_MOV64_IMM(BPF_REG_0, 0), ++ BPF_EXIT_INSN(), ++ }, ++ .fixup_prog = { 2 }, ++ .result = ACCEPT, ++ }, ++ { + "stack pointer arithmetic", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 4), diff --git a/queue-4.15/bpf-fix-memory-leak-in-lpm_trie-map_free-callback-function.patch b/queue-4.15/bpf-fix-memory-leak-in-lpm_trie-map_free-callback-function.patch new file mode 100644 index 00000000000..9bbf75ba9e2 --- /dev/null +++ b/queue-4.15/bpf-fix-memory-leak-in-lpm_trie-map_free-callback-function.patch @@ -0,0 +1,67 @@ +From foo@baz Fri Mar 9 14:15:30 PST 2018 +From: Daniel Borkmann +Date: Thu, 8 Mar 2018 13:16:43 +0100 +Subject: bpf: fix memory leak in lpm_trie map_free callback function +To: gregkh@linuxfoundation.org +Cc: ast@kernel.org, daniel@iogearbox.net, stable@vger.kernel.org, Yonghong Song +Message-ID: <92a9bac0950f4c6def7378bc548eeaea6518b4da.1520507630.git.daniel@iogearbox.net> + +From: Yonghong Song + +[ upstream commit 9a3efb6b661f71d5675369ace9257833f0e78ef3 ] + +There is a memory leak happening in lpm_trie map_free callback +function trie_free. The trie structure itself does not get freed. + +Also, trie_free function did not do synchronize_rcu before freeing +various data structures. This is incorrect as some rcu_read_lock +region(s) for lookup, update, delete or get_next_key may not complete yet. +The fix is to add synchronize_rcu in the beginning of trie_free. +The useless spin_lock is removed from this function as well. 
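+
+The resulting teardown shape, sketched with hypothetical names (the
+actual code operates on struct lpm_trie and its node slots, see the
+hunk below):
+
+  static void example_map_free(struct bpf_map *map)
+  {
+          struct example_map *emap = container_of(map, struct example_map, map);
+
+          /* No new lookup/update/delete/get_next_key can be launched
+           * once map_free runs; wait for the read-side sections still
+           * in flight instead of taking the spinlock.
+           */
+          synchronize_rcu();
+
+          /* ... walk the tree and free every node, lock-free ... */
+
+          /* and free the top-level object itself (the original leak) */
+          kfree(emap);
+  }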
+ +Fixes: b95a5c4db09b ("bpf: add a longest prefix match trie map implementation") +Reported-by: Mathieu Malaterre +Reported-by: Alexei Starovoitov +Tested-by: Mathieu Malaterre +Signed-off-by: Yonghong Song +Signed-off-by: Alexei Starovoitov +Signed-off-by: Daniel Borkmann +Signed-off-by: Greg Kroah-Hartman +--- + kernel/bpf/lpm_trie.c | 11 +++++++---- + 1 file changed, 7 insertions(+), 4 deletions(-) + +--- a/kernel/bpf/lpm_trie.c ++++ b/kernel/bpf/lpm_trie.c +@@ -560,7 +560,10 @@ static void trie_free(struct bpf_map *ma + struct lpm_trie_node __rcu **slot; + struct lpm_trie_node *node; + +- raw_spin_lock(&trie->lock); ++ /* Wait for outstanding programs to complete ++ * update/lookup/delete/get_next_key and free the trie. ++ */ ++ synchronize_rcu(); + + /* Always start at the root and walk down to a node that has no + * children. Then free that node, nullify its reference in the parent +@@ -574,7 +577,7 @@ static void trie_free(struct bpf_map *ma + node = rcu_dereference_protected(*slot, + lockdep_is_held(&trie->lock)); + if (!node) +- goto unlock; ++ goto out; + + if (rcu_access_pointer(node->child[0])) { + slot = &node->child[0]; +@@ -592,8 +595,8 @@ static void trie_free(struct bpf_map *ma + } + } + +-unlock: +- raw_spin_unlock(&trie->lock); ++out: ++ kfree(trie); + } + + static int trie_get_next_key(struct bpf_map *map, void *key, void *next_key) diff --git a/queue-4.15/bpf-fix-mlock-precharge-on-arraymaps.patch b/queue-4.15/bpf-fix-mlock-precharge-on-arraymaps.patch new file mode 100644 index 00000000000..247afa0e8c5 --- /dev/null +++ b/queue-4.15/bpf-fix-mlock-precharge-on-arraymaps.patch @@ -0,0 +1,95 @@ +From foo@baz Fri Mar 9 14:15:30 PST 2018 +From: Daniel Borkmann +Date: Thu, 8 Mar 2018 13:16:42 +0100 +Subject: bpf: fix mlock precharge on arraymaps +To: gregkh@linuxfoundation.org +Cc: ast@kernel.org, daniel@iogearbox.net, stable@vger.kernel.org, Dennis Zhou +Message-ID: <95c9eef7020bed6b0c05547ad9987ef060c0b9cb.1520507630.git.daniel@iogearbox.net> + +From: Daniel Borkmann + +[ upstream commit 9c2d63b843a5c8a8d0559cc067b5398aa5ec3ffc ] + +syzkaller recently triggered OOM during percpu map allocation; +while there is work in progress by Dennis Zhou to add __GFP_NORETRY +semantics for percpu allocator under pressure, there seems also a +missing bpf_map_precharge_memlock() check in array map allocation. + +Given today the actual bpf_map_charge_memlock() happens after the +find_and_alloc_map() in syscall path, the bpf_map_precharge_memlock() +is there to bail out early before we go and do the map setup work +when we find that we hit the limits anyway. Therefore add this for +array map as well. 
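+
+For a sense of scale (example numbers only, assuming 4 KiB pages): a
+BPF_MAP_TYPE_PERCPU_ARRAY with 1M entries of 8 bytes on a machine with
+64 possible CPUs now precharges 8 MiB + 512 MiB, i.e. 133120 pages,
+against the memlock limit before bpf_map_area_alloc() is ever called.
+A stand-alone sketch of the arithmetic:
+
+  #include <stdint.h>
+  #include <stdio.h>
+
+  #define EX_PAGE_SIZE 4096ULL                     /* assumed page size */
+
+  int main(void)
+  {
+          uint64_t max_entries = 1 << 20, elem_size = 8, possible_cpus = 64;
+          uint64_t cost = max_entries * elem_size;          /* base array_size */
+
+          cost += max_entries * elem_size * possible_cpus;  /* percpu part */
+          printf("pages to precharge: %llu\n",
+                 (unsigned long long)((cost + EX_PAGE_SIZE - 1) / EX_PAGE_SIZE));
+          return 0;
+  }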
+ +Fixes: 6c9059817432 ("bpf: pre-allocate hash map elements") +Fixes: a10423b87a7e ("bpf: introduce BPF_MAP_TYPE_PERCPU_ARRAY map") +Reported-by: syzbot+adb03f3f0bb57ce3acda@syzkaller.appspotmail.com +Signed-off-by: Daniel Borkmann +Cc: Dennis Zhou +Signed-off-by: Alexei Starovoitov +Signed-off-by: Daniel Borkmann +Signed-off-by: Greg Kroah-Hartman +--- + kernel/bpf/arraymap.c | 28 ++++++++++++++++------------ + 1 file changed, 16 insertions(+), 12 deletions(-) + +--- a/kernel/bpf/arraymap.c ++++ b/kernel/bpf/arraymap.c +@@ -52,11 +52,11 @@ static int bpf_array_alloc_percpu(struct + static struct bpf_map *array_map_alloc(union bpf_attr *attr) + { + bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; +- int numa_node = bpf_map_attr_numa_node(attr); ++ int ret, numa_node = bpf_map_attr_numa_node(attr); + u32 elem_size, index_mask, max_entries; + bool unpriv = !capable(CAP_SYS_ADMIN); ++ u64 cost, array_size, mask64; + struct bpf_array *array; +- u64 array_size, mask64; + + /* check sanity of attributes */ + if (attr->max_entries == 0 || attr->key_size != 4 || +@@ -101,8 +101,19 @@ static struct bpf_map *array_map_alloc(u + array_size += (u64) max_entries * elem_size; + + /* make sure there is no u32 overflow later in round_up() */ +- if (array_size >= U32_MAX - PAGE_SIZE) ++ cost = array_size; ++ if (cost >= U32_MAX - PAGE_SIZE) + return ERR_PTR(-ENOMEM); ++ if (percpu) { ++ cost += (u64)attr->max_entries * elem_size * num_possible_cpus(); ++ if (cost >= U32_MAX - PAGE_SIZE) ++ return ERR_PTR(-ENOMEM); ++ } ++ cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; ++ ++ ret = bpf_map_precharge_memlock(cost); ++ if (ret < 0) ++ return ERR_PTR(ret); + + /* allocate all map elements and zero-initialize them */ + array = bpf_map_area_alloc(array_size, numa_node); +@@ -118,20 +129,13 @@ static struct bpf_map *array_map_alloc(u + array->map.max_entries = attr->max_entries; + array->map.map_flags = attr->map_flags; + array->map.numa_node = numa_node; ++ array->map.pages = cost; + array->elem_size = elem_size; + +- if (!percpu) +- goto out; +- +- array_size += (u64) attr->max_entries * elem_size * num_possible_cpus(); +- +- if (array_size >= U32_MAX - PAGE_SIZE || +- bpf_array_alloc_percpu(array)) { ++ if (percpu && bpf_array_alloc_percpu(array)) { + bpf_map_area_free(array); + return ERR_PTR(-ENOMEM); + } +-out: +- array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT; + + return &array->map; + } diff --git a/queue-4.15/bpf-fix-rcu-lockdep-warning-for-lpm_trie-map_free-callback.patch b/queue-4.15/bpf-fix-rcu-lockdep-warning-for-lpm_trie-map_free-callback.patch new file mode 100644 index 00000000000..5cf4d11a90a --- /dev/null +++ b/queue-4.15/bpf-fix-rcu-lockdep-warning-for-lpm_trie-map_free-callback.patch @@ -0,0 +1,62 @@ +From foo@baz Fri Mar 9 14:15:30 PST 2018 +From: Daniel Borkmann +Date: Thu, 8 Mar 2018 13:16:44 +0100 +Subject: bpf: fix rcu lockdep warning for lpm_trie map_free callback +To: gregkh@linuxfoundation.org +Cc: ast@kernel.org, daniel@iogearbox.net, stable@vger.kernel.org, Yonghong Song +Message-ID: <3181ff4299652cbcfec7f1f2632ce1e486a3a25a.1520507630.git.daniel@iogearbox.net> + +From: Yonghong Song + +[ upstream commit 6c5f61023c5b0edb0c8a64c902fe97c6453b1852 ] + +Commit 9a3efb6b661f ("bpf: fix memory leak in lpm_trie map_free callback function") +fixed a memory leak and removed unnecessary locks in map_free callback function. +Unfortrunately, it introduced a lockdep warning. 
When lockdep checking is turned on, +running tools/testing/selftests/bpf/test_lpm_map will have: + + [ 98.294321] ============================= + [ 98.294807] WARNING: suspicious RCU usage + [ 98.295359] 4.16.0-rc2+ #193 Not tainted + [ 98.295907] ----------------------------- + [ 98.296486] /home/yhs/work/bpf/kernel/bpf/lpm_trie.c:572 suspicious rcu_dereference_check() usage! + [ 98.297657] + [ 98.297657] other info that might help us debug this: + [ 98.297657] + [ 98.298663] + [ 98.298663] rcu_scheduler_active = 2, debug_locks = 1 + [ 98.299536] 2 locks held by kworker/2:1/54: + [ 98.300152] #0: ((wq_completion)"events"){+.+.}, at: [<00000000196bc1f0>] process_one_work+0x157/0x5c0 + [ 98.301381] #1: ((work_completion)(&map->work)){+.+.}, at: [<00000000196bc1f0>] process_one_work+0x157/0x5c0 + +Since actual trie tree removal happens only after no other +accesses to the tree are possible, replacing + rcu_dereference_protected(*slot, lockdep_is_held(&trie->lock)) +with + rcu_dereference_protected(*slot, 1) +fixed the issue. + +Fixes: 9a3efb6b661f ("bpf: fix memory leak in lpm_trie map_free callback function") +Reported-by: Eric Dumazet +Suggested-by: Eric Dumazet +Signed-off-by: Yonghong Song +Reviewed-by: Eric Dumazet +Acked-by: David S. Miller +Signed-off-by: Daniel Borkmann +Signed-off-by: Greg Kroah-Hartman +--- + kernel/bpf/lpm_trie.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +--- a/kernel/bpf/lpm_trie.c ++++ b/kernel/bpf/lpm_trie.c +@@ -574,8 +574,7 @@ static void trie_free(struct bpf_map *ma + slot = &trie->root; + + for (;;) { +- node = rcu_dereference_protected(*slot, +- lockdep_is_held(&trie->lock)); ++ node = rcu_dereference_protected(*slot, 1); + if (!node) + goto out; + diff --git a/queue-4.15/bpf-ppc64-fix-out-of-bounds-access-in-tail-call.patch b/queue-4.15/bpf-ppc64-fix-out-of-bounds-access-in-tail-call.patch new file mode 100644 index 00000000000..72acb3f793a --- /dev/null +++ b/queue-4.15/bpf-ppc64-fix-out-of-bounds-access-in-tail-call.patch @@ -0,0 +1,42 @@ +From foo@baz Fri Mar 9 14:15:30 PST 2018 +From: Daniel Borkmann +Date: Thu, 8 Mar 2018 13:16:49 +0100 +Subject: bpf, ppc64: fix out of bounds access in tail call +To: gregkh@linuxfoundation.org +Cc: ast@kernel.org, daniel@iogearbox.net, stable@vger.kernel.org +Message-ID: + +From: Daniel Borkmann + +[ upstream commit d269176e766c71c998cb75b4ea8cbc321cc0019d ] + +While working on 16338a9b3ac3 ("bpf, arm64: fix out of bounds access in +tail call") I noticed that ppc64 JIT is partially affected as well. While +the bound checking is correctly performed as unsigned comparison, the +register with the index value however, is never truncated into 32 bit +space, so e.g. a index value of 0x100000000ULL with a map of 1 element +would pass with PPC_CMPLW() whereas we later on continue with the full +64 bit register value. Therefore, as we do in interpreter and other JITs +truncate the value to 32 bit initially in order to fix access. + +Fixes: ce0761419fae ("powerpc/bpf: Implement support for tail calls") +Signed-off-by: Daniel Borkmann +Reviewed-by: Naveen N. Rao +Tested-by: Naveen N. 
Rao +Signed-off-by: Alexei Starovoitov +Signed-off-by: Daniel Borkmann +Signed-off-by: Greg Kroah-Hartman +--- + arch/powerpc/net/bpf_jit_comp64.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/arch/powerpc/net/bpf_jit_comp64.c ++++ b/arch/powerpc/net/bpf_jit_comp64.c +@@ -242,6 +242,7 @@ static void bpf_jit_emit_tail_call(u32 * + * goto out; + */ + PPC_LWZ(b2p[TMP_REG_1], b2p_bpf_array, offsetof(struct bpf_array, map.max_entries)); ++ PPC_RLWINM(b2p_index, b2p_index, 0, 0, 31); + PPC_CMPLW(b2p_index, b2p[TMP_REG_1]); + PPC_BCC(COND_GE, out); + diff --git a/queue-4.15/bpf-x64-implement-retpoline-for-tail-call.patch b/queue-4.15/bpf-x64-implement-retpoline-for-tail-call.patch new file mode 100644 index 00000000000..1f4209e4268 --- /dev/null +++ b/queue-4.15/bpf-x64-implement-retpoline-for-tail-call.patch @@ -0,0 +1,181 @@ +From foo@baz Fri Mar 9 14:15:30 PST 2018 +From: Daniel Borkmann +Date: Thu, 8 Mar 2018 13:16:45 +0100 +Subject: bpf, x64: implement retpoline for tail call +To: gregkh@linuxfoundation.org +Cc: ast@kernel.org, daniel@iogearbox.net, stable@vger.kernel.org +Message-ID: + +From: Daniel Borkmann + +[ upstream commit a493a87f38cfa48caaa95c9347be2d914c6fdf29 ] + +Implement a retpoline [0] for the BPF tail call JIT'ing that converts +the indirect jump via jmp %rax that is used to make the long jump into +another JITed BPF image. Since this is subject to speculative execution, +we need to control the transient instruction sequence here as well +when CONFIG_RETPOLINE is set, and direct it into a pause + lfence loop. +The latter aligns also with what gcc / clang emits (e.g. [1]). + +JIT dump after patch: + + # bpftool p d x i 1 + 0: (18) r2 = map[id:1] + 2: (b7) r3 = 0 + 3: (85) call bpf_tail_call#12 + 4: (b7) r0 = 2 + 5: (95) exit + +With CONFIG_RETPOLINE: + + # bpftool p d j i 1 + [...] + 33: cmp %edx,0x24(%rsi) + 36: jbe 0x0000000000000072 |* + 38: mov 0x24(%rbp),%eax + 3e: cmp $0x20,%eax + 41: ja 0x0000000000000072 | + 43: add $0x1,%eax + 46: mov %eax,0x24(%rbp) + 4c: mov 0x90(%rsi,%rdx,8),%rax + 54: test %rax,%rax + 57: je 0x0000000000000072 | + 59: mov 0x28(%rax),%rax + 5d: add $0x25,%rax + 61: callq 0x000000000000006d |+ + 66: pause | + 68: lfence | + 6b: jmp 0x0000000000000066 | + 6d: mov %rax,(%rsp) | + 71: retq | + 72: mov $0x2,%eax + [...] + + * relative fall-through jumps in error case + + retpoline for indirect jump + +Without CONFIG_RETPOLINE: + + # bpftool p d j i 1 + [...] + 33: cmp %edx,0x24(%rsi) + 36: jbe 0x0000000000000063 |* + 38: mov 0x24(%rbp),%eax + 3e: cmp $0x20,%eax + 41: ja 0x0000000000000063 | + 43: add $0x1,%eax + 46: mov %eax,0x24(%rbp) + 4c: mov 0x90(%rsi,%rdx,8),%rax + 54: test %rax,%rax + 57: je 0x0000000000000063 | + 59: mov 0x28(%rax),%rax + 5d: add $0x25,%rax + 61: jmpq *%rax |- + 63: mov $0x2,%eax + [...] 
+ + * relative fall-through jumps in error case + - plain indirect jump as before + + [0] https://support.google.com/faqs/answer/7625886 + [1] https://github.com/gcc-mirror/gcc/commit/a31e654fa107be968b802786d747e962c2fcdb2b + +Signed-off-by: Daniel Borkmann +Signed-off-by: Alexei Starovoitov +Signed-off-by: Daniel Borkmann +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/include/asm/nospec-branch.h | 37 +++++++++++++++++++++++++++++++++++ + arch/x86/net/bpf_jit_comp.c | 9 ++++---- + 2 files changed, 42 insertions(+), 4 deletions(-) + +--- a/arch/x86/include/asm/nospec-branch.h ++++ b/arch/x86/include/asm/nospec-branch.h +@@ -177,4 +177,41 @@ static inline void indirect_branch_predi + } + + #endif /* __ASSEMBLY__ */ ++ ++/* ++ * Below is used in the eBPF JIT compiler and emits the byte sequence ++ * for the following assembly: ++ * ++ * With retpolines configured: ++ * ++ * callq do_rop ++ * spec_trap: ++ * pause ++ * lfence ++ * jmp spec_trap ++ * do_rop: ++ * mov %rax,(%rsp) ++ * retq ++ * ++ * Without retpolines configured: ++ * ++ * jmp *%rax ++ */ ++#ifdef CONFIG_RETPOLINE ++# define RETPOLINE_RAX_BPF_JIT_SIZE 17 ++# define RETPOLINE_RAX_BPF_JIT() \ ++ EMIT1_off32(0xE8, 7); /* callq do_rop */ \ ++ /* spec_trap: */ \ ++ EMIT2(0xF3, 0x90); /* pause */ \ ++ EMIT3(0x0F, 0xAE, 0xE8); /* lfence */ \ ++ EMIT2(0xEB, 0xF9); /* jmp spec_trap */ \ ++ /* do_rop: */ \ ++ EMIT4(0x48, 0x89, 0x04, 0x24); /* mov %rax,(%rsp) */ \ ++ EMIT1(0xC3); /* retq */ ++#else ++# define RETPOLINE_RAX_BPF_JIT_SIZE 2 ++# define RETPOLINE_RAX_BPF_JIT() \ ++ EMIT2(0xFF, 0xE0); /* jmp *%rax */ ++#endif ++ + #endif /* _ASM_X86_NOSPEC_BRANCH_H_ */ +--- a/arch/x86/net/bpf_jit_comp.c ++++ b/arch/x86/net/bpf_jit_comp.c +@@ -13,6 +13,7 @@ + #include + #include + #include ++#include + #include + + int bpf_jit_enable __read_mostly; +@@ -287,7 +288,7 @@ static void emit_bpf_tail_call(u8 **ppro + EMIT2(0x89, 0xD2); /* mov edx, edx */ + EMIT3(0x39, 0x56, /* cmp dword ptr [rsi + 16], edx */ + offsetof(struct bpf_array, map.max_entries)); +-#define OFFSET1 43 /* number of bytes to jump */ ++#define OFFSET1 (41 + RETPOLINE_RAX_BPF_JIT_SIZE) /* number of bytes to jump */ + EMIT2(X86_JBE, OFFSET1); /* jbe out */ + label1 = cnt; + +@@ -296,7 +297,7 @@ static void emit_bpf_tail_call(u8 **ppro + */ + EMIT2_off32(0x8B, 0x85, 36); /* mov eax, dword ptr [rbp + 36] */ + EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ +-#define OFFSET2 32 ++#define OFFSET2 (30 + RETPOLINE_RAX_BPF_JIT_SIZE) + EMIT2(X86_JA, OFFSET2); /* ja out */ + label2 = cnt; + EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ +@@ -310,7 +311,7 @@ static void emit_bpf_tail_call(u8 **ppro + * goto out; + */ + EMIT3(0x48, 0x85, 0xC0); /* test rax,rax */ +-#define OFFSET3 10 ++#define OFFSET3 (8 + RETPOLINE_RAX_BPF_JIT_SIZE) + EMIT2(X86_JE, OFFSET3); /* je out */ + label3 = cnt; + +@@ -323,7 +324,7 @@ static void emit_bpf_tail_call(u8 **ppro + * rdi == ctx (1st arg) + * rax == prog->bpf_func + prologue_size + */ +- EMIT2(0xFF, 0xE0); /* jmp rax */ ++ RETPOLINE_RAX_BPF_JIT(); + + /* out: */ + BUILD_BUG_ON(cnt - label1 != OFFSET1);