From: Greg Kroah-Hartman Date: Thu, 1 Feb 2018 08:07:15 +0000 (+0100) Subject: 4.4-stable patches X-Git-Tag: v4.4.115~20 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=799a7f55316065e246d021868340d13e6a71ee2d;p=thirdparty%2Fkernel%2Fstable-queue.git 4.4-stable patches added patches: bpf-arsh-is-not-supported-in-32-bit-alu-thus-reject-it.patch bpf-avoid-false-sharing-of-map-refcount-with-max_entries.patch bpf-fix-32-bit-divide-by-zero.patch bpf-fix-bpf_tail_call-x64-jit.patch bpf-fix-branch-pruning-logic.patch bpf-fix-divides-by-zero.patch bpf-introduce-bpf_jit_always_on-config.patch bpf-reject-stores-into-ctx-via-st-and-xadd.patch x86-bpf_jit-small-optimization-in-emit_bpf_tail_call.patch --- diff --git a/queue-4.4/bpf-arsh-is-not-supported-in-32-bit-alu-thus-reject-it.patch b/queue-4.4/bpf-arsh-is-not-supported-in-32-bit-alu-thus-reject-it.patch new file mode 100644 index 00000000000..df0365d0e93 --- /dev/null +++ b/queue-4.4/bpf-arsh-is-not-supported-in-32-bit-alu-thus-reject-it.patch @@ -0,0 +1,49 @@ +From foo@baz Thu Feb 1 09:05:44 CET 2018 +From: Daniel Borkmann +Date: Tue, 30 Jan 2018 03:37:42 +0100 +Subject: bpf: arsh is not supported in 32 bit alu thus reject it +To: gregkh@linuxfoundation.org +Cc: ast@kernel.org, daniel@iogearbox.net, stable@vger.kernel.org +Message-ID: <60932351924d42bf28628b0a01a693602cc0d9b9.1517279268.git.daniel@iogearbox.net> + +From: Daniel Borkmann + +[ upstream commit 7891a87efc7116590eaba57acc3c422487802c6f ] + +The following snippet was throwing an 'unknown opcode cc' warning +in BPF interpreter: + + 0: (18) r0 = 0x0 + 2: (7b) *(u64 *)(r10 -16) = r0 + 3: (cc) (u32) r0 s>>= (u32) r0 + 4: (95) exit + +Although a number of JITs do support BPF_ALU | BPF_ARSH | BPF_{K,X} +generation, not all of them do and interpreter does neither. We can +leave existing ones and implement it later in bpf-next for the +remaining ones, but reject this properly in verifier for the time +being. + +Fixes: 17a5267067f3 ("bpf: verifier (add verifier core)") +Reported-by: syzbot+93c4904c5c70348a6890@syzkaller.appspotmail.com +Signed-off-by: Daniel Borkmann +Signed-off-by: Alexei Starovoitov +Signed-off-by: Greg Kroah-Hartman +--- + kernel/bpf/verifier.c | 5 +++++ + 1 file changed, 5 insertions(+) + +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -1165,6 +1165,11 @@ static int check_alu_op(struct verifier_ + return -EINVAL; + } + ++ if (opcode == BPF_ARSH && BPF_CLASS(insn->code) != BPF_ALU64) { ++ verbose("BPF_ARSH not supported for 32 bit ALU\n"); ++ return -EINVAL; ++ } ++ + if ((opcode == BPF_LSH || opcode == BPF_RSH || + opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) { + int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 
64 : 32; diff --git a/queue-4.4/bpf-avoid-false-sharing-of-map-refcount-with-max_entries.patch b/queue-4.4/bpf-avoid-false-sharing-of-map-refcount-with-max_entries.patch new file mode 100644 index 00000000000..93ab57f62ef --- /dev/null +++ b/queue-4.4/bpf-avoid-false-sharing-of-map-refcount-with-max_entries.patch @@ -0,0 +1,126 @@ +From foo@baz Thu Feb 1 09:05:44 CET 2018 +From: Daniel Borkmann +Date: Tue, 30 Jan 2018 03:37:43 +0100 +Subject: bpf: avoid false sharing of map refcount with max_entries +To: gregkh@linuxfoundation.org +Cc: ast@kernel.org, daniel@iogearbox.net, stable@vger.kernel.org +Message-ID: <6c5f91e38c952be4831f6764a92cedb7a48be095.1517279268.git.daniel@iogearbox.net> + +From: Daniel Borkmann + +[ upstream commit be95a845cc4402272994ce290e3ad928aff06cb9 ] + +In addition to commit b2157399cc98 ("bpf: prevent out-of-bounds +speculation") also change the layout of struct bpf_map such that +false sharing of fast-path members like max_entries is avoided +when the maps reference counter is altered. Therefore enforce +them to be placed into separate cachelines. + +pahole dump after change: + + struct bpf_map { + const struct bpf_map_ops * ops; /* 0 8 */ + struct bpf_map * inner_map_meta; /* 8 8 */ + void * security; /* 16 8 */ + enum bpf_map_type map_type; /* 24 4 */ + u32 key_size; /* 28 4 */ + u32 value_size; /* 32 4 */ + u32 max_entries; /* 36 4 */ + u32 map_flags; /* 40 4 */ + u32 pages; /* 44 4 */ + u32 id; /* 48 4 */ + int numa_node; /* 52 4 */ + bool unpriv_array; /* 56 1 */ + + /* XXX 7 bytes hole, try to pack */ + + /* --- cacheline 1 boundary (64 bytes) --- */ + struct user_struct * user; /* 64 8 */ + atomic_t refcnt; /* 72 4 */ + atomic_t usercnt; /* 76 4 */ + struct work_struct work; /* 80 32 */ + char name[16]; /* 112 16 */ + /* --- cacheline 2 boundary (128 bytes) --- */ + + /* size: 128, cachelines: 2, members: 17 */ + /* sum members: 121, holes: 1, sum holes: 7 */ + }; + +Now all entries in the first cacheline are read only throughout +the life time of the map, set up once during map creation. Overall +struct size and number of cachelines doesn't change from the +reordering. struct bpf_map is usually first member and embedded +in map structs in specific map implementations, so also avoid those +members to sit at the end where it could potentially share the +cacheline with first map values e.g. in the array since remote +CPUs could trigger map updates just as well for those (easily +dirtying members like max_entries intentionally as well) while +having subsequent values in cache. + +Quoting from Google's Project Zero blog [1]: + + Additionally, at least on the Intel machine on which this was + tested, bouncing modified cache lines between cores is slow, + apparently because the MESI protocol is used for cache coherence + [8]. Changing the reference counter of an eBPF array on one + physical CPU core causes the cache line containing the reference + counter to be bounced over to that CPU core, making reads of the + reference counter on all other CPU cores slow until the changed + reference counter has been written back to memory. Because the + length and the reference counter of an eBPF array are stored in + the same cache line, this also means that changing the reference + counter on one physical CPU core causes reads of the eBPF array's + length to be slow on other physical CPU cores (intentional false + sharing). 
+ +While this doesn't 'control' the out-of-bounds speculation through +masking the index as in commit b2157399cc98, triggering a manipulation +of the map's reference counter is really trivial, so lets not allow +to easily affect max_entries from it. + +Splitting to separate cachelines also generally makes sense from +a performance perspective anyway in that fast-path won't have a +cache miss if the map gets pinned, reused in other progs, etc out +of control path, thus also avoids unintentional false sharing. + + [1] https://googleprojectzero.blogspot.ch/2018/01/reading-privileged-memory-with-side.html + +Signed-off-by: Daniel Borkmann +Signed-off-by: Alexei Starovoitov +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/bpf.h | 16 ++++++++++++---- + 1 file changed, 12 insertions(+), 4 deletions(-) + +--- a/include/linux/bpf.h ++++ b/include/linux/bpf.h +@@ -31,17 +31,25 @@ struct bpf_map_ops { + }; + + struct bpf_map { +- atomic_t refcnt; ++ /* 1st cacheline with read-mostly members of which some ++ * are also accessed in fast-path (e.g. ops, max_entries). ++ */ ++ const struct bpf_map_ops *ops ____cacheline_aligned; + enum bpf_map_type map_type; + u32 key_size; + u32 value_size; + u32 max_entries; + u32 pages; + bool unpriv_array; +- struct user_struct *user; +- const struct bpf_map_ops *ops; +- struct work_struct work; ++ /* 7 bytes hole */ ++ ++ /* 2nd cacheline with misc members to avoid false sharing ++ * particularly with refcounting. ++ */ ++ struct user_struct *user ____cacheline_aligned; ++ atomic_t refcnt; + atomic_t usercnt; ++ struct work_struct work; + }; + + struct bpf_map_type_list { diff --git a/queue-4.4/bpf-fix-32-bit-divide-by-zero.patch b/queue-4.4/bpf-fix-32-bit-divide-by-zero.patch new file mode 100644 index 00000000000..92064330670 --- /dev/null +++ b/queue-4.4/bpf-fix-32-bit-divide-by-zero.patch @@ -0,0 +1,67 @@ +From foo@baz Thu Feb 1 09:05:44 CET 2018 +From: Daniel Borkmann +Date: Tue, 30 Jan 2018 03:37:45 +0100 +Subject: bpf: fix 32-bit divide by zero +To: gregkh@linuxfoundation.org +Cc: ast@kernel.org, daniel@iogearbox.net, stable@vger.kernel.org +Message-ID: <7e8a78250e8cf1f486b4cdb005e3ff313b992816.1517279268.git.daniel@iogearbox.net> + +From: Alexei Starovoitov + +[ upstream commit 68fda450a7df51cff9e5a4d4a4d9d0d5f2589153 ] + +due to some JITs doing if (src_reg == 0) check in 64-bit mode +for div/mod operations mask upper 32-bits of src register +before doing the check + +Fixes: 622582786c9e ("net: filter: x86: internal BPF JIT") +Fixes: 7a12b5031c6b ("sparc64: Add eBPF JIT.") +Reported-by: syzbot+48340bb518e88849e2e3@syzkaller.appspotmail.com +Signed-off-by: Alexei Starovoitov +Signed-off-by: Daniel Borkmann +Signed-off-by: Greg Kroah-Hartman +--- + kernel/bpf/verifier.c | 18 ++++++++++++++++++ + net/core/filter.c | 4 ++++ + 2 files changed, 22 insertions(+) + +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -2248,6 +2248,24 @@ static int fixup_bpf_calls(struct verifi + int i, cnt, delta = 0; + + for (i = 0; i < insn_cnt; i++, insn++) { ++ if (insn->code == (BPF_ALU | BPF_MOD | BPF_X) || ++ insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { ++ /* due to JIT bugs clear upper 32-bits of src register ++ * before div/mod operation ++ */ ++ insn_buf[0] = BPF_MOV32_REG(insn->src_reg, insn->src_reg); ++ insn_buf[1] = *insn; ++ cnt = 2; ++ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); ++ if (!new_prog) ++ return -ENOMEM; ++ ++ delta += cnt - 1; ++ env->prog = prog = new_prog; ++ insn = new_prog->insnsi + i + delta; ++ continue; ++ } ++ + 
if (insn->code != (BPF_JMP | BPF_CALL)) + continue; + +--- a/net/core/filter.c ++++ b/net/core/filter.c +@@ -430,6 +430,10 @@ do_pass: + convert_bpf_extensions(fp, &insn)) + break; + ++ if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) || ++ fp->code == (BPF_ALU | BPF_MOD | BPF_X)) ++ *insn++ = BPF_MOV32_REG(BPF_REG_X, BPF_REG_X); ++ + *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k); + break; + diff --git a/queue-4.4/bpf-fix-bpf_tail_call-x64-jit.patch b/queue-4.4/bpf-fix-bpf_tail_call-x64-jit.patch new file mode 100644 index 00000000000..0aa3cfed2ee --- /dev/null +++ b/queue-4.4/bpf-fix-bpf_tail_call-x64-jit.patch @@ -0,0 +1,60 @@ +From foo@baz Thu Feb 1 09:05:44 CET 2018 +From: Daniel Borkmann +Date: Tue, 30 Jan 2018 03:37:40 +0100 +Subject: bpf: fix bpf_tail_call() x64 JIT +To: gregkh@linuxfoundation.org +Cc: ast@kernel.org, daniel@iogearbox.net, stable@vger.kernel.org, Alexei Starovoitov , "David S . Miller" +Message-ID: <1d696e8c8bf884fb67aca8fe4ab8ba132b8a2ed1.1517279268.git.daniel@iogearbox.net> + +From: Alexei Starovoitov + +[ upstream commit 90caccdd8cc0215705f18b92771b449b01e2474a ] + +- bpf prog_array just like all other types of bpf array accepts 32-bit index. + Clarify that in the comment. +- fix x64 JIT of bpf_tail_call which was incorrectly loading 8 instead of 4 bytes +- tighten corresponding check in the interpreter to stay consistent + +The JIT bug can be triggered after introduction of BPF_F_NUMA_NODE flag +in commit 96eabe7a40aa in 4.14. Before that the map_flags would stay zero and +though JIT code is wrong it will check bounds correctly. +Hence two fixes tags. All other JITs don't have this problem. + +Signed-off-by: Alexei Starovoitov +Fixes: 96eabe7a40aa ("bpf: Allow selecting numa node during map creation") +Fixes: b52f00e6a715 ("x86: bpf_jit: implement bpf_tail_call() helper") +Acked-by: Daniel Borkmann +Acked-by: Martin KaFai Lau +Reviewed-by: Eric Dumazet +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/net/bpf_jit_comp.c | 4 ++-- + kernel/bpf/core.c | 2 +- + 2 files changed, 3 insertions(+), 3 deletions(-) + +--- a/arch/x86/net/bpf_jit_comp.c ++++ b/arch/x86/net/bpf_jit_comp.c +@@ -266,9 +266,9 @@ static void emit_bpf_tail_call(u8 **ppro + /* if (index >= array->map.max_entries) + * goto out; + */ +- EMIT4(0x48, 0x8B, 0x46, /* mov rax, qword ptr [rsi + 16] */ ++ EMIT2(0x89, 0xD2); /* mov edx, edx */ ++ EMIT3(0x39, 0x56, /* cmp dword ptr [rsi + 16], edx */ + offsetof(struct bpf_array, map.max_entries)); +- EMIT3(0x48, 0x39, 0xD0); /* cmp rax, rdx */ + #define OFFSET1 43 /* number of bytes to jump */ + EMIT2(X86_JBE, OFFSET1); /* jbe out */ + label1 = cnt; +--- a/kernel/bpf/core.c ++++ b/kernel/bpf/core.c +@@ -517,7 +517,7 @@ select_insn: + struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2; + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct bpf_prog *prog; +- u64 index = BPF_R3; ++ u32 index = BPF_R3; + + if (unlikely(index >= array->map.max_entries)) + goto out; diff --git a/queue-4.4/bpf-fix-branch-pruning-logic.patch b/queue-4.4/bpf-fix-branch-pruning-logic.patch new file mode 100644 index 00000000000..11ec20edcc8 --- /dev/null +++ b/queue-4.4/bpf-fix-branch-pruning-logic.patch @@ -0,0 +1,116 @@ +From foo@baz Thu Feb 1 09:05:44 CET 2018 +From: Daniel Borkmann +Date: Tue, 30 Jan 2018 03:37:38 +0100 +Subject: bpf: fix branch pruning logic +To: gregkh@linuxfoundation.org +Cc: ast@kernel.org, daniel@iogearbox.net, stable@vger.kernel.org +Message-ID: + +From: Alexei Starovoitov + +[ Upstream commit c131187db2d3fa2f8bf32fdf4e9a4ef805168467 ] + +when the verifier detects that register contains a runtime constant +and it's compared with another constant it will prune exploration +of the branch that is guaranteed not to be taken at runtime. +This is all correct, but malicious program may be constructed +in such a way that it always has a constant comparison and +the other branch is never taken under any conditions. +In this case such path through the program will not be explored +by the verifier. It won't be taken at run-time either, but since +all instructions are JITed the malicious program may cause JITs +to complain about using reserved fields, etc. +To fix the issue we have to track the instructions explored by +the verifier and sanitize instructions that are dead at run time +with NOPs. We cannot reject such dead code, since llvm generates +it for valid C code, since it doesn't do as much data flow +analysis as the verifier does. 
+ +Fixes: 17a5267067f3 ("bpf: verifier (add verifier core)") +Signed-off-by: Alexei Starovoitov +Acked-by: Daniel Borkmann +Signed-off-by: Daniel Borkmann +Signed-off-by: Greg Kroah-Hartman +--- + kernel/bpf/verifier.c | 28 ++++++++++++++++++++++++++++ + 1 file changed, 28 insertions(+) + +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -191,6 +191,7 @@ struct bpf_insn_aux_data { + enum bpf_reg_type ptr_type; /* pointer type for load/store insns */ + struct bpf_map *map_ptr; /* pointer for call insn into lookup_elem */ + }; ++ bool seen; /* this insn was processed by the verifier */ + }; + + #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ +@@ -1793,6 +1794,7 @@ static int do_check(struct verifier_env + print_bpf_insn(env, insn); + } + ++ env->insn_aux_data[insn_idx].seen = true; + if (class == BPF_ALU || class == BPF_ALU64) { + err = check_alu_op(env, insn); + if (err) +@@ -1988,6 +1990,7 @@ process_bpf_exit: + return err; + + insn_idx++; ++ env->insn_aux_data[insn_idx].seen = true; + } else { + verbose("invalid BPF_LD mode\n"); + return -EINVAL; +@@ -2125,6 +2128,7 @@ static int adjust_insn_aux_data(struct v + u32 off, u32 cnt) + { + struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data; ++ int i; + + if (cnt == 1) + return 0; +@@ -2134,6 +2138,8 @@ static int adjust_insn_aux_data(struct v + memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off); + memcpy(new_data + off + cnt - 1, old_data + off, + sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); ++ for (i = off; i < off + cnt - 1; i++) ++ new_data[i].seen = true; + env->insn_aux_data = new_data; + vfree(old_data); + return 0; +@@ -2152,6 +2158,25 @@ static struct bpf_prog *bpf_patch_insn_d + return new_prog; + } + ++/* The verifier does more data flow analysis than llvm and will not explore ++ * branches that are dead at run time. Malicious programs can have dead code ++ * too. Therefore replace all dead at-run-time code with nops. ++ */ ++static void sanitize_dead_code(struct verifier_env *env) ++{ ++ struct bpf_insn_aux_data *aux_data = env->insn_aux_data; ++ struct bpf_insn nop = BPF_MOV64_REG(BPF_REG_0, BPF_REG_0); ++ struct bpf_insn *insn = env->prog->insnsi; ++ const int insn_cnt = env->prog->len; ++ int i; ++ ++ for (i = 0; i < insn_cnt; i++) { ++ if (aux_data[i].seen) ++ continue; ++ memcpy(insn + i, &nop, sizeof(nop)); ++ } ++} ++ + /* convert load instructions that access fields of 'struct __sk_buff' + * into sequence of instructions that access fields of 'struct sk_buff' + */ +@@ -2371,6 +2396,9 @@ skip_full_check: + free_states(env); + + if (ret == 0) ++ sanitize_dead_code(env); ++ ++ if (ret == 0) + /* program is valid, convert *(u32*)(ctx + off) accesses */ + ret = convert_ctx_accesses(env); + diff --git a/queue-4.4/bpf-fix-divides-by-zero.patch b/queue-4.4/bpf-fix-divides-by-zero.patch new file mode 100644 index 00000000000..642ef43548a --- /dev/null +++ b/queue-4.4/bpf-fix-divides-by-zero.patch @@ -0,0 +1,46 @@ +From foo@baz Thu Feb 1 09:05:44 CET 2018 +From: Daniel Borkmann +Date: Tue, 30 Jan 2018 03:37:44 +0100 +Subject: bpf: fix divides by zero +To: gregkh@linuxfoundation.org +Cc: ast@kernel.org, daniel@iogearbox.net, stable@vger.kernel.org, Eric Dumazet +Message-ID: + +From: Eric Dumazet + +[ upstream commit c366287ebd698ef5e3de300d90cd62ee9ee7373e ] + +Divides by zero are not nice, lets avoid them if possible. + +Also do_div() seems not needed when dealing with 32bit operands, +but this seems a minor detail. 
+ +Fixes: bd4cf0ed331a ("net: filter: rework/optimize internal BPF interpreter's instruction set") +Signed-off-by: Eric Dumazet +Reported-by: syzbot +Signed-off-by: Alexei Starovoitov +Signed-off-by: Greg Kroah-Hartman +--- + kernel/bpf/core.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/kernel/bpf/core.c ++++ b/kernel/bpf/core.c +@@ -444,7 +444,7 @@ select_insn: + DST = tmp; + CONT; + ALU_MOD_X: +- if (unlikely(SRC == 0)) ++ if (unlikely((u32)SRC == 0)) + return 0; + tmp = (u32) DST; + DST = do_div(tmp, (u32) SRC); +@@ -463,7 +463,7 @@ select_insn: + DST = div64_u64(DST, SRC); + CONT; + ALU_DIV_X: +- if (unlikely(SRC == 0)) ++ if (unlikely((u32)SRC == 0)) + return 0; + tmp = (u32) DST; + do_div(tmp, (u32) SRC); diff --git a/queue-4.4/bpf-introduce-bpf_jit_always_on-config.patch b/queue-4.4/bpf-introduce-bpf_jit_always_on-config.patch new file mode 100644 index 00000000000..8045b6bd173 --- /dev/null +++ b/queue-4.4/bpf-introduce-bpf_jit_always_on-config.patch @@ -0,0 +1,254 @@ +From foo@baz Thu Feb 1 09:05:44 CET 2018 +From: Daniel Borkmann +Date: Tue, 30 Jan 2018 03:37:41 +0100 +Subject: bpf: introduce BPF_JIT_ALWAYS_ON config +To: gregkh@linuxfoundation.org +Cc: ast@kernel.org, daniel@iogearbox.net, stable@vger.kernel.org +Message-ID: <8fa0284c9e3811cc7ae467dd3490da45ff76b46b.1517279268.git.daniel@iogearbox.net> + +From: Alexei Starovoitov + +[ upstream commit 290af86629b25ffd1ed6232c4e9107da031705cb ] + +The BPF interpreter has been used as part of the Spectre 2 attack CVE-2017-5715. + +A quote from the Google Project Zero blog: +"At this point, it would normally be necessary to locate gadgets in +the host kernel code that can be used to actually leak data by reading +from an attacker-controlled location, shifting and masking the result +appropriately and then using the result of that as offset to an +attacker-controlled address for a load. But piecing gadgets together +and figuring out which ones work in a speculation context seems annoying. +So instead, we decided to use the eBPF interpreter, which is built into +the host kernel - while there is no legitimate way to invoke it from inside +a VM, the presence of the code in the host kernel's text section is sufficient +to make it usable for the attack, just like with ordinary ROP gadgets." + +To make the attacker's job harder, introduce a BPF_JIT_ALWAYS_ON config +option that removes the interpreter from the kernel in favor of JIT-only mode. +So far eBPF JIT is supported by: +x64, arm64, arm32, sparc64, s390, powerpc64, mips64 + +The start of JITed program is randomized and code page is marked as read-only. +In addition "constant blinding" can be turned on with net.core.bpf_jit_harden + +v2->v3: +- move __bpf_prog_ret0 under ifdef (Daniel) + +v1->v2: +- fix init order, test_bpf and cBPF (Daniel's feedback) +- fix offloaded bpf (Jakub's feedback) +- add 'return 0' dummy in case something can invoke prog->bpf_func +- retarget bpf tree. For bpf-next the patch would need one extra hunk. + It will be sent when the trees are merged back to net-next + +Considered doing: + int bpf_jit_enable __read_mostly = BPF_EBPF_JIT_DEFAULT; +but it seems better to land the patch as-is and in bpf-next remove +bpf_jit_enable global variable from all JITs, consolidate in one place +and remove this jit_init() function.
+ +Signed-off-by: Alexei Starovoitov +Signed-off-by: Daniel Borkmann +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm64/Kconfig | 1 + + arch/s390/Kconfig | 1 + + arch/x86/Kconfig | 1 + + init/Kconfig | 7 +++++++ + kernel/bpf/core.c | 24 +++++++++++++++++++++++- + lib/test_bpf.c | 13 ++++++++----- + net/Kconfig | 3 +++ + net/core/filter.c | 4 +++- + net/core/sysctl_net_core.c | 6 ++++++ + net/socket.c | 9 +++++++++ + 10 files changed, 62 insertions(+), 7 deletions(-) + +--- a/arch/arm64/Kconfig ++++ b/arch/arm64/Kconfig +@@ -54,6 +54,7 @@ config ARM64 + select HAVE_ARCH_SECCOMP_FILTER + select HAVE_ARCH_TRACEHOOK + select HAVE_BPF_JIT ++ select HAVE_EBPF_JIT + select HAVE_C_RECORDMCOUNT + select HAVE_CC_STACKPROTECTOR + select HAVE_CMPXCHG_DOUBLE +--- a/arch/s390/Kconfig ++++ b/arch/s390/Kconfig +@@ -123,6 +123,7 @@ config S390 + select HAVE_ARCH_TRACEHOOK + select HAVE_ARCH_TRANSPARENT_HUGEPAGE + select HAVE_BPF_JIT if PACK_STACK && HAVE_MARCH_Z196_FEATURES ++ select HAVE_EBPF_JIT if PACK_STACK && HAVE_MARCH_Z196_FEATURES + select HAVE_CMPXCHG_DOUBLE + select HAVE_CMPXCHG_LOCAL + select HAVE_DEBUG_KMEMLEAK +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -88,6 +88,7 @@ config X86 + select HAVE_ARCH_TRACEHOOK + select HAVE_ARCH_TRANSPARENT_HUGEPAGE + select HAVE_BPF_JIT if X86_64 ++ select HAVE_EBPF_JIT if X86_64 + select HAVE_CC_STACKPROTECTOR + select HAVE_CMPXCHG_DOUBLE + select HAVE_CMPXCHG_LOCAL +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -1556,6 +1556,13 @@ config BPF_SYSCALL + Enable the bpf() system call that allows to manipulate eBPF + programs and maps via file descriptors. + ++config BPF_JIT_ALWAYS_ON ++ bool "Permanently enable BPF JIT and remove BPF interpreter" ++ depends on BPF_SYSCALL && HAVE_EBPF_JIT && BPF_JIT ++ help ++ Enables BPF JIT and removes BPF interpreter to avoid ++ speculative execution of BPF instructions by the interpreter ++ + config SHMEM + bool "Use full shmem filesystem" if EXPERT + default y +--- a/kernel/bpf/core.c ++++ b/kernel/bpf/core.c +@@ -256,6 +256,7 @@ noinline u64 __bpf_call_base(u64 r1, u64 + } + EXPORT_SYMBOL_GPL(__bpf_call_base); + ++#ifndef CONFIG_BPF_JIT_ALWAYS_ON + /** + * __bpf_prog_run - run eBPF program on a given context + * @ctx: is the data we are operating on +@@ -725,6 +726,13 @@ load_byte: + return 0; + } + ++#else ++static unsigned int __bpf_prog_ret0(void *ctx, const struct bpf_insn *insn) ++{ ++ return 0; ++} ++#endif ++ + bool bpf_prog_array_compatible(struct bpf_array *array, + const struct bpf_prog *fp) + { +@@ -771,9 +779,23 @@ static int bpf_check_tail_call(const str + */ + int bpf_prog_select_runtime(struct bpf_prog *fp) + { ++#ifndef CONFIG_BPF_JIT_ALWAYS_ON + fp->bpf_func = (void *) __bpf_prog_run; +- ++#else ++ fp->bpf_func = (void *) __bpf_prog_ret0; ++#endif ++ ++ /* eBPF JITs can rewrite the program in case constant ++ * blinding is active. However, in case of error during ++ * blinding, bpf_int_jit_compile() must always return a ++ * valid program, which in this case would simply not ++ * be JITed, but falls back to the interpreter. ++ */ + bpf_int_jit_compile(fp); ++#ifdef CONFIG_BPF_JIT_ALWAYS_ON ++ if (!fp->jited) ++ return -ENOTSUPP; ++#endif + bpf_prog_lock_ro(fp); + + /* The tail call compatibility check can only be done at +--- a/lib/test_bpf.c ++++ b/lib/test_bpf.c +@@ -5304,9 +5304,8 @@ static struct bpf_prog *generate_filter( + return NULL; + } + } +- /* We don't expect to fail. 
*/ + if (*err) { +- pr_cont("FAIL to attach err=%d len=%d\n", ++ pr_cont("FAIL to prog_create err=%d len=%d\n", + *err, fprog.len); + return NULL; + } +@@ -5325,7 +5324,11 @@ static struct bpf_prog *generate_filter( + fp->type = BPF_PROG_TYPE_SOCKET_FILTER; + memcpy(fp->insnsi, fptr, fp->len * sizeof(struct bpf_insn)); + +- bpf_prog_select_runtime(fp); ++ *err = bpf_prog_select_runtime(fp); ++ if (*err) { ++ pr_cont("FAIL to select_runtime err=%d\n", *err); ++ return NULL; ++ } + break; + } + +@@ -5511,8 +5514,8 @@ static __init int test_bpf(void) + pass_cnt++; + continue; + } +- +- return err; ++ err_cnt++; ++ continue; + } + + pr_cont("jited:%u ", fp->jited); +--- a/net/Kconfig ++++ b/net/Kconfig +@@ -388,3 +388,6 @@ endif # if NET + # Used by archs to tell that they support BPF_JIT + config HAVE_BPF_JIT + bool ++ ++config HAVE_EBPF_JIT ++ bool +--- a/net/core/filter.c ++++ b/net/core/filter.c +@@ -984,7 +984,9 @@ static struct bpf_prog *bpf_migrate_filt + */ + goto out_err_free; + +- bpf_prog_select_runtime(fp); ++ err = bpf_prog_select_runtime(fp); ++ if (err) ++ goto out_err_free; + + kfree(old_prog); + return fp; +--- a/net/core/sysctl_net_core.c ++++ b/net/core/sysctl_net_core.c +@@ -292,7 +292,13 @@ static struct ctl_table net_core_table[] + .data = &bpf_jit_enable, + .maxlen = sizeof(int), + .mode = 0644, ++#ifndef CONFIG_BPF_JIT_ALWAYS_ON + .proc_handler = proc_dointvec ++#else ++ .proc_handler = proc_dointvec_minmax, ++ .extra1 = &one, ++ .extra2 = &one, ++#endif + }, + #endif + { +--- a/net/socket.c ++++ b/net/socket.c +@@ -2534,6 +2534,15 @@ out_fs: + + core_initcall(sock_init); /* early initcall */ + ++static int __init jit_init(void) ++{ ++#ifdef CONFIG_BPF_JIT_ALWAYS_ON ++ bpf_jit_enable = 1; ++#endif ++ return 0; ++} ++pure_initcall(jit_init); ++ + #ifdef CONFIG_PROC_FS + void socket_seq_show(struct seq_file *seq) + { diff --git a/queue-4.4/bpf-reject-stores-into-ctx-via-st-and-xadd.patch b/queue-4.4/bpf-reject-stores-into-ctx-via-st-and-xadd.patch new file mode 100644 index 00000000000..2e1d163dbfa --- /dev/null +++ b/queue-4.4/bpf-reject-stores-into-ctx-via-st-and-xadd.patch @@ -0,0 +1,72 @@ +From foo@baz Thu Feb 1 09:05:44 CET 2018 +From: Daniel Borkmann +Date: Tue, 30 Jan 2018 03:37:46 +0100 +Subject: bpf: reject stores into ctx via st and xadd +To: gregkh@linuxfoundation.org +Cc: ast@kernel.org, daniel@iogearbox.net, stable@vger.kernel.org +Message-ID: <7d49693fcf1d0f23f0f14e8da18acfe03da9fc18.1517279268.git.daniel@iogearbox.net> + +From: Daniel Borkmann + +[ upstream commit f37a8cb84cce18762e8f86a70bd6a49a66ab964c ] + +Alexei found that verifier does not reject stores into context +via BPF_ST instead of BPF_STX. And while looking at it, we +also should not allow XADD variant of BPF_STX. + +The context rewriter is only assuming either BPF_LDX_MEM- or +BPF_STX_MEM-type operations, thus reject anything other than +that so that assumptions in the rewriter properly hold. Add +test cases as well for BPF selftests. 
+ +Fixes: d691f9e8d440 ("bpf: allow programs to write to certain skb fields") +Reported-by: Alexei Starovoitov +Signed-off-by: Daniel Borkmann +Signed-off-by: Alexei Starovoitov +Signed-off-by: Greg Kroah-Hartman +--- + kernel/bpf/verifier.c | 19 +++++++++++++++++++ + 1 file changed, 19 insertions(+) + +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -683,6 +683,13 @@ static bool is_pointer_value(struct veri + } + } + ++static bool is_ctx_reg(struct verifier_env *env, int regno) ++{ ++ const struct reg_state *reg = &env->cur_state.regs[regno]; ++ ++ return reg->type == PTR_TO_CTX; ++} ++ + /* check whether memory at (regno + off) is accessible for t = (read | write) + * if t==write, value_regno is a register which value is stored into memory + * if t==read, value_regno is a register which will receive the value from memory +@@ -779,6 +786,12 @@ static int check_xadd(struct verifier_en + return -EACCES; + } + ++ if (is_ctx_reg(env, insn->dst_reg)) { ++ verbose("BPF_XADD stores into R%d context is not allowed\n", ++ insn->dst_reg); ++ return -EACCES; ++ } ++ + /* check whether atomic_add can read the memory */ + err = check_mem_access(env, insn->dst_reg, insn->off, + BPF_SIZE(insn->code), BPF_READ, -1); +@@ -1909,6 +1922,12 @@ static int do_check(struct verifier_env + if (err) + return err; + ++ if (is_ctx_reg(env, insn->dst_reg)) { ++ verbose("BPF_ST stores into R%d context is not allowed\n", ++ insn->dst_reg); ++ return -EACCES; ++ } ++ + /* check that memory (dst_reg + off) is writeable */ + err = check_mem_access(env, insn->dst_reg, insn->off, + BPF_SIZE(insn->code), BPF_WRITE, diff --git a/queue-4.4/series b/queue-4.4/series index f87e6b5e722..db87e518728 100644 --- a/queue-4.4/series +++ b/queue-4.4/series @@ -1 +1,10 @@ loop-fix-concurrent-lo_open-lo_release.patch +bpf-fix-branch-pruning-logic.patch +x86-bpf_jit-small-optimization-in-emit_bpf_tail_call.patch +bpf-fix-bpf_tail_call-x64-jit.patch +bpf-introduce-bpf_jit_always_on-config.patch +bpf-arsh-is-not-supported-in-32-bit-alu-thus-reject-it.patch +bpf-avoid-false-sharing-of-map-refcount-with-max_entries.patch +bpf-fix-divides-by-zero.patch +bpf-fix-32-bit-divide-by-zero.patch +bpf-reject-stores-into-ctx-via-st-and-xadd.patch diff --git a/queue-4.4/x86-bpf_jit-small-optimization-in-emit_bpf_tail_call.patch b/queue-4.4/x86-bpf_jit-small-optimization-in-emit_bpf_tail_call.patch new file mode 100644 index 00000000000..12e538595d6 --- /dev/null +++ b/queue-4.4/x86-bpf_jit-small-optimization-in-emit_bpf_tail_call.patch @@ -0,0 +1,70 @@ +From foo@baz Thu Feb 1 09:05:44 CET 2018 +From: Daniel Borkmann +Date: Tue, 30 Jan 2018 03:37:39 +0100 +Subject: x86: bpf_jit: small optimization in emit_bpf_tail_call() +To: gregkh@linuxfoundation.org +Cc: ast@kernel.org, daniel@iogearbox.net, stable@vger.kernel.org, Eric Dumazet , "David S . Miller" +Message-ID: + +From: Eric Dumazet + +[ upstream commit 84ccac6e7854ebbfb56d2fc6d5bef9be49bb304c ] + +Saves 4 bytes replacing following instructions : + +lea rax, [rsi + rdx * 8 + offsetof(...)] +mov rax, qword ptr [rax] +cmp rax, 0 + +by : + +mov rax, [rsi + rdx * 8 + offsetof(...)] +test rax, rax + +Signed-off-by: Eric Dumazet +Cc: Alexei Starovoitov +Cc: Daniel Borkmann +Acked-by: Daniel Borkmann +Acked-by: Alexei Starovoitov +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/net/bpf_jit_comp.c | 9 ++++----- + 1 file changed, 4 insertions(+), 5 deletions(-) + +--- a/arch/x86/net/bpf_jit_comp.c ++++ b/arch/x86/net/bpf_jit_comp.c +@@ -269,7 +269,7 @@ static void emit_bpf_tail_call(u8 **ppro + EMIT4(0x48, 0x8B, 0x46, /* mov rax, qword ptr [rsi + 16] */ + offsetof(struct bpf_array, map.max_entries)); + EMIT3(0x48, 0x39, 0xD0); /* cmp rax, rdx */ +-#define OFFSET1 47 /* number of bytes to jump */ ++#define OFFSET1 43 /* number of bytes to jump */ + EMIT2(X86_JBE, OFFSET1); /* jbe out */ + label1 = cnt; + +@@ -278,21 +278,20 @@ static void emit_bpf_tail_call(u8 **ppro + */ + EMIT2_off32(0x8B, 0x85, -STACKSIZE + 36); /* mov eax, dword ptr [rbp - 516] */ + EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ +-#define OFFSET2 36 ++#define OFFSET2 32 + EMIT2(X86_JA, OFFSET2); /* ja out */ + label2 = cnt; + EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ + EMIT2_off32(0x89, 0x85, -STACKSIZE + 36); /* mov dword ptr [rbp - 516], eax */ + + /* prog = array->ptrs[index]; */ +- EMIT4_off32(0x48, 0x8D, 0x84, 0xD6, /* lea rax, [rsi + rdx * 8 + offsetof(...)] */ ++ EMIT4_off32(0x48, 0x8B, 0x84, 0xD6, /* mov rax, [rsi + rdx * 8 + offsetof(...)] */ + offsetof(struct bpf_array, ptrs)); +- EMIT3(0x48, 0x8B, 0x00); /* mov rax, qword ptr [rax] */ + + /* if (prog == NULL) + * goto out; + */ +- EMIT4(0x48, 0x83, 0xF8, 0x00); /* cmp rax, 0 */ ++ EMIT3(0x48, 0x85, 0xC0); /* test rax,rax */ + #define OFFSET3 10 + EMIT2(X86_JE, OFFSET3); /* je out */ + label3 = cnt;
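
The cacheline split made by bpf-avoid-false-sharing-of-map-refcount-with-max_entries.patch above can be seen with a toy userspace struct. This is only a sketch of the idea, not the kernel's struct bpf_map: the field names are made up, C11 alignas(64) stands in for ____cacheline_aligned, and 64-byte cache lines are assumed. Read-mostly, fast-path fields stay together in the first cache line while the frequently written reference counter is pushed onto the next one, so bouncing the counter between CPUs no longer drags max_entries along with it.

/* Toy layout, not the kernel's struct bpf_map; alignas(64) stands in
 * for ____cacheline_aligned, assuming 64-byte cache lines.
 */
#include <stdalign.h>
#include <stddef.h>
#include <stdio.h>

struct toy_map {
	/* 1st cache line: read-mostly, fast-path fields */
	alignas(64) const void *ops;
	unsigned int key_size;
	unsigned int value_size;
	unsigned int max_entries;
	/* 2nd cache line: fields written on every map get/put */
	alignas(64) int refcnt;
	int usercnt;
};

int main(void)
{
	printf("max_entries at offset %zu, refcnt at offset %zu, sizeof %zu\n",
	       offsetof(struct toy_map, max_entries),
	       offsetof(struct toy_map, refcnt),
	       sizeof(struct toy_map));
	return 0;
}

On a typical 64-bit build this prints offsets 16 and 64 and a total size of 128 bytes, matching the two-cacheline pahole layout quoted in the patch description.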
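
In the same spirit, the guard behind bpf-fix-divides-by-zero.patch and bpf-fix-32-bit-divide-by-zero.patch above can be illustrated with a small standalone C program. This is a userspace sketch only, with plain variables standing in for BPF registers: when only the low 32 bits of the source register take part in the division, a value whose upper half is non-zero passes a naive 64-bit zero check even though the actual 32-bit divisor is zero, which is why the source register has to be masked before the check.

/* Userspace sketch only; dst/src are plain variables, not BPF registers. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t dst = 100;
	uint64_t src = 0xdeadbeef00000000ULL;	/* low 32 bits are zero */

	if (src != 0)	/* naive check: looks at all 64 bits */
		printf("64-bit check passed, yet the 32-bit divisor is %u\n",
		       (uint32_t)src);

	if ((uint32_t)src == 0) {	/* masked check, as after the patches */
		printf("32-bit divisor is zero, refusing to divide\n");
		return 0;	/* the kernel side likewise refuses the operation */
	}

	dst = (uint32_t)dst / (uint32_t)src;
	printf("dst = %llu\n", (unsigned long long)dst);
	return 0;
}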
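
Finally, the dead-code neutralization described in bpf-fix-branch-pruning-logic.patch boils down to a single pass over the program: anything the verifier never visited is overwritten with a harmless no-op before a JIT ever sees it. The toy program below mimics that pass on a made-up byte array standing in for instructions; only the seen[] bookkeeping and the overwrite step correspond to the real sanitize_dead_code().

/* Toy pass; opcodes are invented, NOP stands in for BPF_MOV64_REG(r0, r0). */
#include <stdbool.h>
#include <stdio.h>

#define NOP 0x00

int main(void)
{
	unsigned char insn[6] = { 0xb7, 0x55, 0x77, 0x07, 0x95, 0xcc };
	bool seen[6]          = { true, true, false, false, true, false };
	int i;

	for (i = 0; i < 6; i++)
		if (!seen[i])
			insn[i] = NOP;	/* never reached at run time: neutralize it */

	for (i = 0; i < 6; i++)
		printf("insn[%d] = 0x%02x%s\n", i, insn[i],
		       seen[i] ? "" : " (replaced with nop)");
	return 0;
}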