--- /dev/null
+From f1463802276802f1c4ac79c1035e9d73146678c2 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 13 Jul 2021 08:18:31 +0000
+Subject: bpf: Fix leakage due to insufficient speculative store bypass
+ mitigation
+
+From: Daniel Borkmann <daniel@iogearbox.net>
+
+[ Upstream commit 2039f26f3aca5b0e419b98f65dd36481337b86ee ]
+
+Spectre v4 gadgets make use of memory disambiguation, which is a set of
+techniques that execute memory access instructions, that is, loads and
+stores, out of program order; Intel's optimization manual, section 2.4.4.5:
+
+ A load instruction micro-op may depend on a preceding store. Many
+ microarchitectures block loads until all preceding store addresses are
+ known. The memory disambiguator predicts which loads will not depend on
+ any previous stores. When the disambiguator predicts that a load does
+ not have such a dependency, the load takes its data from the L1 data
+ cache. Eventually, the prediction is verified. If an actual conflict is
+ detected, the load and all succeeding instructions are re-executed.
+
+af86ca4e3088 ("bpf: Prevent memory disambiguation attack") tried to mitigate
+this attack by sanitizing the memory locations through preemptive "fast"
+(low latency) stores of zero prior to the actual "slow" (high latency) store
+of a pointer value such that upon dependency misprediction the CPU then
+speculatively executes the load of the pointer value and retrieves the zero
+value instead of the attacker controlled scalar value previously stored at
+that location, meaning, subsequent access in the speculative domain is then
+redirected to the "zero page".
+
+The sanitized preemptive store of zero prior to the actual "slow" store is
+done through a simple ST instruction based on r10 (frame pointer) with
+relative offset to the stack location that the verifier has been tracking
+on the original used register for STX, which does not have to be r10. Thus,
+there are no memory dependencies for this store, since it's only using r10
+and immediate constant of zero; hence af86ca4e3088 /assumed/ a low latency
+operation.
+
+However, a recent attack demonstrated that this mitigation is not sufficient
+since the preemptive store of zero could also be turned into a "slow" store
+and is thus bypassed as well:
+
+ [...]
+ // r2 = oob address (e.g. scalar)
+ // r7 = pointer to map value
+ 31: (7b) *(u64 *)(r10 -16) = r2
+ // r9 will remain "fast" register, r10 will become "slow" register below
+ 32: (bf) r9 = r10
+ // JIT maps BPF reg to x86 reg:
+ // r9 -> r15 (callee saved)
+ // r10 -> rbp
+ // train store forward prediction to break dependency link between both r9
+ // and r10 by evicting them from the predictor's LRU table.
+ 33: (61) r0 = *(u32 *)(r7 +24576)
+ 34: (63) *(u32 *)(r7 +29696) = r0
+ 35: (61) r0 = *(u32 *)(r7 +24580)
+ 36: (63) *(u32 *)(r7 +29700) = r0
+ 37: (61) r0 = *(u32 *)(r7 +24584)
+ 38: (63) *(u32 *)(r7 +29704) = r0
+ 39: (61) r0 = *(u32 *)(r7 +24588)
+ 40: (63) *(u32 *)(r7 +29708) = r0
+ [...]
+ 543: (61) r0 = *(u32 *)(r7 +25596)
+ 544: (63) *(u32 *)(r7 +30716) = r0
+ // prepare call to bpf_ringbuf_output() helper. the latter will cause rbp
+ // to spill to stack memory while r13/r14/r15 (all callee saved regs) remain
+ // in hardware registers. rbp becomes slow due to push/pop latency. below is
+ // disasm of bpf_ringbuf_output() helper for better visual context:
+ //
+ // ffffffff8117ee20: 41 54 push r12
+ // ffffffff8117ee22: 55 push rbp
+ // ffffffff8117ee23: 53 push rbx
+ // ffffffff8117ee24: 48 f7 c1 fc ff ff ff test rcx,0xfffffffffffffffc
+ // ffffffff8117ee2b: 0f 85 af 00 00 00 jne ffffffff8117eee0 <-- jump taken
+ // [...]
+ // ffffffff8117eee0: 49 c7 c4 ea ff ff ff mov r12,0xffffffffffffffea
+ // ffffffff8117eee7: 5b pop rbx
+ // ffffffff8117eee8: 5d pop rbp
+ // ffffffff8117eee9: 4c 89 e0 mov rax,r12
+ // ffffffff8117eeec: 41 5c pop r12
+ // ffffffff8117eeee: c3 ret
+ 545: (18) r1 = map[id:4]
+ 547: (bf) r2 = r7
+ 548: (b7) r3 = 0
+ 549: (b7) r4 = 4
+ 550: (85) call bpf_ringbuf_output#194288
+ // instruction 551 inserted by verifier \
+ 551: (7a) *(u64 *)(r10 -16) = 0 | /both/ are now slow stores here
+ // storing map value pointer r7 at fp-16 | since value of r10 is "slow".
+ 552: (7b) *(u64 *)(r10 -16) = r7 /
+ // following "fast" read to the same memory location, but due to dependency
+ // misprediction it will speculatively execute before insn 551/552 completes.
+ 553: (79) r2 = *(u64 *)(r9 -16)
+ // in speculative domain contains attacker controlled r2. in non-speculative
+ // domain this contains r7, and thus accesses r7 +0 below.
+ 554: (71) r3 = *(u8 *)(r2 +0)
+ // leak r3
+
+As can be seen, the current speculative store bypass mitigation which the
+verifier inserts at line 551 is insufficient since /both/, the write of
+the zero sanitation as well as the map value pointer are a high latency
+instruction due to prior memory access via push/pop of r10 (rbp) in contrast
+to the low latency read in line 553 as r9 (r15) which stays in hardware
+registers. Thus, architecturally, fp-16 is r7, however, microarchitecturally,
+fp-16 can still be r2.
+
+Initial thoughts to address this issue was to track spilled pointer loads
+from stack and enforce their load via LDX through r10 as well so that /both/
+the preemptive store of zero /as well as/ the load use the /same/ register
+such that a dependency is created between the store and load. However, this
+option is not sufficient either since it can be bypassed as well under
+speculation. An updated attack with pointer spill/fills now _all_ based on
+r10 would look as follows:
+
+ [...]
+ // r2 = oob address (e.g. scalar)
+ // r7 = pointer to map value
+ [...]
+ // longer store forward prediction training sequence than before.
+ 2062: (61) r0 = *(u32 *)(r7 +25588)
+ 2063: (63) *(u32 *)(r7 +30708) = r0
+ 2064: (61) r0 = *(u32 *)(r7 +25592)
+ 2065: (63) *(u32 *)(r7 +30712) = r0
+ 2066: (61) r0 = *(u32 *)(r7 +25596)
+ 2067: (63) *(u32 *)(r7 +30716) = r0
+ // store the speculative load address (scalar) this time after the store
+ // forward prediction training.
+ 2068: (7b) *(u64 *)(r10 -16) = r2
+ // preoccupy the CPU store port by running sequence of dummy stores.
+ 2069: (63) *(u32 *)(r7 +29696) = r0
+ 2070: (63) *(u32 *)(r7 +29700) = r0
+ 2071: (63) *(u32 *)(r7 +29704) = r0
+ 2072: (63) *(u32 *)(r7 +29708) = r0
+ 2073: (63) *(u32 *)(r7 +29712) = r0
+ 2074: (63) *(u32 *)(r7 +29716) = r0
+ 2075: (63) *(u32 *)(r7 +29720) = r0
+ 2076: (63) *(u32 *)(r7 +29724) = r0
+ 2077: (63) *(u32 *)(r7 +29728) = r0
+ 2078: (63) *(u32 *)(r7 +29732) = r0
+ 2079: (63) *(u32 *)(r7 +29736) = r0
+ 2080: (63) *(u32 *)(r7 +29740) = r0
+ 2081: (63) *(u32 *)(r7 +29744) = r0
+ 2082: (63) *(u32 *)(r7 +29748) = r0
+ 2083: (63) *(u32 *)(r7 +29752) = r0
+ 2084: (63) *(u32 *)(r7 +29756) = r0
+ 2085: (63) *(u32 *)(r7 +29760) = r0
+ 2086: (63) *(u32 *)(r7 +29764) = r0
+ 2087: (63) *(u32 *)(r7 +29768) = r0
+ 2088: (63) *(u32 *)(r7 +29772) = r0
+ 2089: (63) *(u32 *)(r7 +29776) = r0
+ 2090: (63) *(u32 *)(r7 +29780) = r0
+ 2091: (63) *(u32 *)(r7 +29784) = r0
+ 2092: (63) *(u32 *)(r7 +29788) = r0
+ 2093: (63) *(u32 *)(r7 +29792) = r0
+ 2094: (63) *(u32 *)(r7 +29796) = r0
+ 2095: (63) *(u32 *)(r7 +29800) = r0
+ 2096: (63) *(u32 *)(r7 +29804) = r0
+ 2097: (63) *(u32 *)(r7 +29808) = r0
+ 2098: (63) *(u32 *)(r7 +29812) = r0
+ // overwrite scalar with dummy pointer; same as before, also including the
+ // sanitation store with 0 from the current mitigation by the verifier.
+ 2099: (7a) *(u64 *)(r10 -16) = 0 | /both/ are now slow stores here
+ 2100: (7b) *(u64 *)(r10 -16) = r7 | since store unit is still busy.
+ // load from stack intended to bypass stores.
+ 2101: (79) r2 = *(u64 *)(r10 -16)
+ 2102: (71) r3 = *(u8 *)(r2 +0)
+ // leak r3
+ [...]
+
+Looking at the CPU microarchitecture, the scheduler might issue loads (such
+as seen in line 2101) before stores (line 2099,2100) because the load execution
+units become available while the store execution unit is still busy with the
+sequence of dummy stores (line 2069-2098). And so the load may use the prior
+stored scalar from r2 at address r10 -16 for speculation. The updated attack
+may work less reliable on CPU microarchitectures where loads and stores share
+execution resources.
+
+This concludes that the sanitizing with zero stores from af86ca4e3088 ("bpf:
+Prevent memory disambiguation attack") is insufficient. Moreover, the detection
+of stack reuse from af86ca4e3088 where previously data (STACK_MISC) has been
+written to a given stack slot where a pointer value is now to be stored does
+not have sufficient coverage as precondition for the mitigation either; for
+several reasons outlined as follows:
+
+ 1) Stack content from prior program runs could still be preserved and is
+ therefore not "random", best example is to split a speculative store
+ bypass attack between tail calls, program A would prepare and store the
+ oob address at a given stack slot and then tail call into program B which
+ does the "slow" store of a pointer to the stack with subsequent "fast"
+ read. From program B PoV such stack slot type is STACK_INVALID, and
+ therefore also must be subject to mitigation.
+
+ 2) The STACK_SPILL must not be coupled to register_is_const(&stack->spilled_ptr)
+ condition, for example, the previous content of that memory location could
+ also be a pointer to map or map value. Without the fix, a speculative
+ store bypass is not mitigated in such precondition and can then lead to
+ a type confusion in the speculative domain leaking kernel memory near
+ these pointer types.
+
+While brainstorming on various alternative mitigation possibilities, we also
+stumbled upon a retrospective from Chrome developers [0]:
+
+ [...] For variant 4, we implemented a mitigation to zero the unused memory
+ of the heap prior to allocation, which cost about 1% when done concurrently
+ and 4% for scavenging. Variant 4 defeats everything we could think of. We
+ explored more mitigations for variant 4 but the threat proved to be more
+ pervasive and dangerous than we anticipated. For example, stack slots used
+ by the register allocator in the optimizing compiler could be subject to
+ type confusion, leading to pointer crafting. Mitigating type confusion for
+ stack slots alone would have required a complete redesign of the backend of
+ the optimizing compiler, perhaps man years of work, without a guarantee of
+ completeness. [...]
+
+From BPF side, the problem space is reduced, however, options are rather
+limited. One idea that has been explored was to xor-obfuscate pointer spills
+to the BPF stack:
+
+ [...]
+ // preoccupy the CPU store port by running sequence of dummy stores.
+ [...]
+ 2106: (63) *(u32 *)(r7 +29796) = r0
+ 2107: (63) *(u32 *)(r7 +29800) = r0
+ 2108: (63) *(u32 *)(r7 +29804) = r0
+ 2109: (63) *(u32 *)(r7 +29808) = r0
+ 2110: (63) *(u32 *)(r7 +29812) = r0
+ // overwrite scalar with dummy pointer; xored with random 'secret' value
+ // of 943576462 before store ...
+ 2111: (b4) w11 = 943576462
+ 2112: (af) r11 ^= r7
+ 2113: (7b) *(u64 *)(r10 -16) = r11
+ 2114: (79) r11 = *(u64 *)(r10 -16)
+ 2115: (b4) w2 = 943576462
+ 2116: (af) r2 ^= r11
+ // ... and restored with the same 'secret' value with the help of AX reg.
+ 2117: (71) r3 = *(u8 *)(r2 +0)
+ [...]
+
+While the above would not prevent speculation, it would make data leakage
+infeasible by directing it to random locations. In order to be effective
+and prevent type confusion under speculation, such random secret would have
+to be regenerated for each store. The additional complexity involved for a
+tracking mechanism that prevents jumps such that restoring spilled pointers
+would not get corrupted is not worth the gain for unprivileged. Hence, the
+fix in here eventually opted for emitting a non-public BPF_ST | BPF_NOSPEC
+instruction which the x86 JIT translates into a lfence opcode. Inserting the
+latter in between the store and load instruction is one of the mitigations
+options [1]. The x86 instruction manual notes:
+
+ [...] An LFENCE that follows an instruction that stores to memory might
+ complete before the data being stored have become globally visible. [...]
+
+The latter meaning that the preceding store instruction finished execution
+and the store is at minimum guaranteed to be in the CPU's store queue, but
+it's not guaranteed to be in that CPU's L1 cache at that point (globally
+visible). The latter would only be guaranteed via sfence. So the load which
+is guaranteed to execute after the lfence for that local CPU would have to
+rely on store-to-load forwarding. [2], in section 2.3 on store buffers says:
+
+ [...] For every store operation that is added to the ROB, an entry is
+ allocated in the store buffer. This entry requires both the virtual and
+ physical address of the target. Only if there is no free entry in the store
+ buffer, the frontend stalls until there is an empty slot available in the
+ store buffer again. Otherwise, the CPU can immediately continue adding
+ subsequent instructions to the ROB and execute them out of order. On Intel
+ CPUs, the store buffer has up to 56 entries. [...]
+
+One small upside on the fix is that it lifts constraints from af86ca4e3088
+where the sanitize_stack_off relative to r10 must be the same when coming
+from different paths. The BPF_ST | BPF_NOSPEC gets emitted after a BPF_STX
+or BPF_ST instruction. This happens either when we store a pointer or data
+value to the BPF stack for the first time, or upon later pointer spills.
+The former needs to be enforced since otherwise stale stack data could be
+leaked under speculation as outlined earlier. For non-x86 JITs the BPF_ST |
+BPF_NOSPEC mapping is currently optimized away, but others could emit a
+speculation barrier as well if necessary. For real-world unprivileged
+programs e.g. generated by LLVM, pointer spill/fill is only generated upon
+register pressure and LLVM only tries to do that for pointers which are not
+used often. The program main impact will be the initial BPF_ST | BPF_NOSPEC
+sanitation for the STACK_INVALID case when the first write to a stack slot
+occurs e.g. upon map lookup. In future we might refine ways to mitigate
+the latter cost.
+
+ [0] https://arxiv.org/pdf/1902.05178.pdf
+ [1] https://msrc-blog.microsoft.com/2018/05/21/analysis-and-mitigation-of-speculative-store-bypass-cve-2018-3639/
+ [2] https://arxiv.org/pdf/1905.05725.pdf
+
+Fixes: af86ca4e3088 ("bpf: Prevent memory disambiguation attack")
+Fixes: f7cf25b2026d ("bpf: track spill/fill of constants")
+Co-developed-by: Piotr Krysiuk <piotras@gmail.com>
+Co-developed-by: Benedict Schlueter <benedict.schlueter@rub.de>
+Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
+Signed-off-by: Piotr Krysiuk <piotras@gmail.com>
+Signed-off-by: Benedict Schlueter <benedict.schlueter@rub.de>
+Acked-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/linux/bpf_verifier.h | 2 +-
+ kernel/bpf/verifier.c | 87 +++++++++++++-----------------------
+ 2 files changed, 33 insertions(+), 56 deletions(-)
+
+diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
+index 06841517ab1e..f9832dae4586 100644
+--- a/include/linux/bpf_verifier.h
++++ b/include/linux/bpf_verifier.h
+@@ -333,8 +333,8 @@ struct bpf_insn_aux_data {
+ };
+ u64 map_key_state; /* constant (32 bit) key tracking for maps */
+ int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
+- int sanitize_stack_off; /* stack slot to be cleared */
+ u32 seen; /* this insn was processed by the verifier at env->pass_cnt */
++ bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */
+ bool zext_dst; /* this insn zero extends dst reg */
+ u8 alu_state; /* used in combination with alu_limit */
+
+diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
+index e6db39a00de2..6c602066c2c7 100644
+--- a/kernel/bpf/verifier.c
++++ b/kernel/bpf/verifier.c
+@@ -2607,6 +2607,19 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
+ cur = env->cur_state->frame[env->cur_state->curframe];
+ if (value_regno >= 0)
+ reg = &cur->regs[value_regno];
++ if (!env->bypass_spec_v4) {
++ bool sanitize = reg && is_spillable_regtype(reg->type);
++
++ for (i = 0; i < size; i++) {
++ if (state->stack[spi].slot_type[i] == STACK_INVALID) {
++ sanitize = true;
++ break;
++ }
++ }
++
++ if (sanitize)
++ env->insn_aux_data[insn_idx].sanitize_stack_spill = true;
++ }
+
+ if (reg && size == BPF_REG_SIZE && register_is_bounded(reg) &&
+ !register_is_null(reg) && env->bpf_capable) {
+@@ -2629,47 +2642,10 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
+ verbose(env, "invalid size of register spill\n");
+ return -EACCES;
+ }
+-
+ if (state != cur && reg->type == PTR_TO_STACK) {
+ verbose(env, "cannot spill pointers to stack into stack frame of the caller\n");
+ return -EINVAL;
+ }
+-
+- if (!env->bypass_spec_v4) {
+- bool sanitize = false;
+-
+- if (state->stack[spi].slot_type[0] == STACK_SPILL &&
+- register_is_const(&state->stack[spi].spilled_ptr))
+- sanitize = true;
+- for (i = 0; i < BPF_REG_SIZE; i++)
+- if (state->stack[spi].slot_type[i] == STACK_MISC) {
+- sanitize = true;
+- break;
+- }
+- if (sanitize) {
+- int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off;
+- int soff = (-spi - 1) * BPF_REG_SIZE;
+-
+- /* detected reuse of integer stack slot with a pointer
+- * which means either llvm is reusing stack slot or
+- * an attacker is trying to exploit CVE-2018-3639
+- * (speculative store bypass)
+- * Have to sanitize that slot with preemptive
+- * store of zero.
+- */
+- if (*poff && *poff != soff) {
+- /* disallow programs where single insn stores
+- * into two different stack slots, since verifier
+- * cannot sanitize them
+- */
+- verbose(env,
+- "insn %d cannot access two stack slots fp%d and fp%d",
+- insn_idx, *poff, soff);
+- return -EINVAL;
+- }
+- *poff = soff;
+- }
+- }
+ save_register_state(state, spi, reg);
+ } else {
+ u8 type = STACK_MISC;
+@@ -11891,35 +11867,33 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ bpf_convert_ctx_access_t convert_ctx_access;
++ bool ctx_access;
+
+ if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) ||
+ insn->code == (BPF_LDX | BPF_MEM | BPF_H) ||
+ insn->code == (BPF_LDX | BPF_MEM | BPF_W) ||
+- insn->code == (BPF_LDX | BPF_MEM | BPF_DW))
++ insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) {
+ type = BPF_READ;
+- else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) ||
+- insn->code == (BPF_STX | BPF_MEM | BPF_H) ||
+- insn->code == (BPF_STX | BPF_MEM | BPF_W) ||
+- insn->code == (BPF_STX | BPF_MEM | BPF_DW))
++ ctx_access = true;
++ } else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) ||
++ insn->code == (BPF_STX | BPF_MEM | BPF_H) ||
++ insn->code == (BPF_STX | BPF_MEM | BPF_W) ||
++ insn->code == (BPF_STX | BPF_MEM | BPF_DW) ||
++ insn->code == (BPF_ST | BPF_MEM | BPF_B) ||
++ insn->code == (BPF_ST | BPF_MEM | BPF_H) ||
++ insn->code == (BPF_ST | BPF_MEM | BPF_W) ||
++ insn->code == (BPF_ST | BPF_MEM | BPF_DW)) {
+ type = BPF_WRITE;
+- else
++ ctx_access = BPF_CLASS(insn->code) == BPF_STX;
++ } else {
+ continue;
++ }
+
+ if (type == BPF_WRITE &&
+- env->insn_aux_data[i + delta].sanitize_stack_off) {
++ env->insn_aux_data[i + delta].sanitize_stack_spill) {
+ struct bpf_insn patch[] = {
+- /* Sanitize suspicious stack slot with zero.
+- * There are no memory dependencies for this store,
+- * since it's only using frame pointer and immediate
+- * constant of zero
+- */
+- BPF_ST_MEM(BPF_DW, BPF_REG_FP,
+- env->insn_aux_data[i + delta].sanitize_stack_off,
+- 0),
+- /* the original STX instruction will immediately
+- * overwrite the same stack slot with appropriate value
+- */
+ *insn,
++ BPF_ST_NOSPEC(),
+ };
+
+ cnt = ARRAY_SIZE(patch);
+@@ -11933,6 +11907,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
+ continue;
+ }
+
++ if (!ctx_access)
++ continue;
++
+ switch (env->insn_aux_data[i + delta].ptr_type) {
+ case PTR_TO_CTX:
+ if (!ops->convert_ctx_access)
+--
+2.30.2
+
--- /dev/null
+From c39f7adcc01e57641251b699dcdec65174c52d0b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 13 Jul 2021 08:18:31 +0000
+Subject: bpf: Introduce BPF nospec instruction for mitigating Spectre v4
+
+From: Daniel Borkmann <daniel@iogearbox.net>
+
+[ Upstream commit f5e81d1117501546b7be050c5fbafa6efd2c722c ]
+
+In case of JITs, each of the JIT backends compiles the BPF nospec instruction
+/either/ to a machine instruction which emits a speculation barrier /or/ to
+/no/ machine instruction in case the underlying architecture is not affected
+by Speculative Store Bypass or has different mitigations in place already.
+
+This covers both x86 and (implicitly) arm64: In case of x86, we use 'lfence'
+instruction for mitigation. In case of arm64, we rely on the firmware mitigation
+as controlled via the ssbd kernel parameter. Whenever the mitigation is enabled,
+it works for all of the kernel code with no need to provide any additional
+instructions here (hence only comment in arm64 JIT). Other archs can follow
+as needed. The BPF nospec instruction is specifically targeting Spectre v4
+since i) we don't use a serialization barrier for the Spectre v1 case, and
+ii) mitigation instructions for v1 and v4 might be different on some archs.
+
+The BPF nospec is required for a future commit, where the BPF verifier does
+annotate intermediate BPF programs with speculation barriers.
+
+Co-developed-by: Piotr Krysiuk <piotras@gmail.com>
+Co-developed-by: Benedict Schlueter <benedict.schlueter@rub.de>
+Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
+Signed-off-by: Piotr Krysiuk <piotras@gmail.com>
+Signed-off-by: Benedict Schlueter <benedict.schlueter@rub.de>
+Acked-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/arm/net/bpf_jit_32.c | 3 +++
+ arch/arm64/net/bpf_jit_comp.c | 13 +++++++++++++
+ arch/mips/net/ebpf_jit.c | 3 +++
+ arch/powerpc/net/bpf_jit_comp32.c | 6 ++++++
+ arch/powerpc/net/bpf_jit_comp64.c | 6 ++++++
+ arch/riscv/net/bpf_jit_comp32.c | 4 ++++
+ arch/riscv/net/bpf_jit_comp64.c | 4 ++++
+ arch/s390/net/bpf_jit_comp.c | 5 +++++
+ arch/sparc/net/bpf_jit_comp_64.c | 3 +++
+ arch/x86/net/bpf_jit_comp.c | 7 +++++++
+ arch/x86/net/bpf_jit_comp32.c | 6 ++++++
+ include/linux/filter.h | 15 +++++++++++++++
+ kernel/bpf/core.c | 19 ++++++++++++++++++-
+ kernel/bpf/disasm.c | 16 +++++++++-------
+ 14 files changed, 102 insertions(+), 8 deletions(-)
+
+diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c
+index 897634d0a67c..a951276f0547 100644
+--- a/arch/arm/net/bpf_jit_32.c
++++ b/arch/arm/net/bpf_jit_32.c
+@@ -1602,6 +1602,9 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
+ rn = arm_bpf_get_reg32(src_lo, tmp2[1], ctx);
+ emit_ldx_r(dst, rn, off, ctx, BPF_SIZE(code));
+ break;
++ /* speculation barrier */
++ case BPF_ST | BPF_NOSPEC:
++ break;
+ /* ST: *(size *)(dst + off) = imm */
+ case BPF_ST | BPF_MEM | BPF_W:
+ case BPF_ST | BPF_MEM | BPF_H:
+diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
+index f7b194878a99..5a876af34230 100644
+--- a/arch/arm64/net/bpf_jit_comp.c
++++ b/arch/arm64/net/bpf_jit_comp.c
+@@ -829,6 +829,19 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
+ return ret;
+ break;
+
++ /* speculation barrier */
++ case BPF_ST | BPF_NOSPEC:
++ /*
++ * Nothing required here.
++ *
++ * In case of arm64, we rely on the firmware mitigation of
++ * Speculative Store Bypass as controlled via the ssbd kernel
++ * parameter. Whenever the mitigation is enabled, it works
++ * for all of the kernel code with no need to provide any
++ * additional instructions.
++ */
++ break;
++
+ /* ST: *(size *)(dst + off) = imm */
+ case BPF_ST | BPF_MEM | BPF_W:
+ case BPF_ST | BPF_MEM | BPF_H:
+diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c
+index 939dd06764bc..3a73e9375712 100644
+--- a/arch/mips/net/ebpf_jit.c
++++ b/arch/mips/net/ebpf_jit.c
+@@ -1355,6 +1355,9 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
+ }
+ break;
+
++ case BPF_ST | BPF_NOSPEC: /* speculation barrier */
++ break;
++
+ case BPF_ST | BPF_B | BPF_MEM:
+ case BPF_ST | BPF_H | BPF_MEM:
+ case BPF_ST | BPF_W | BPF_MEM:
+diff --git a/arch/powerpc/net/bpf_jit_comp32.c b/arch/powerpc/net/bpf_jit_comp32.c
+index 68476780047a..6c0915285b17 100644
+--- a/arch/powerpc/net/bpf_jit_comp32.c
++++ b/arch/powerpc/net/bpf_jit_comp32.c
+@@ -737,6 +737,12 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context *
+ }
+ break;
+
++ /*
++ * BPF_ST NOSPEC (speculation barrier)
++ */
++ case BPF_ST | BPF_NOSPEC:
++ break;
++
+ /*
+ * BPF_ST(X)
+ */
+diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
+index 94411af24013..d3ad8dfba1f6 100644
+--- a/arch/powerpc/net/bpf_jit_comp64.c
++++ b/arch/powerpc/net/bpf_jit_comp64.c
+@@ -627,6 +627,12 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context *
+ }
+ break;
+
++ /*
++ * BPF_ST NOSPEC (speculation barrier)
++ */
++ case BPF_ST | BPF_NOSPEC:
++ break;
++
+ /*
+ * BPF_ST(X)
+ */
+diff --git a/arch/riscv/net/bpf_jit_comp32.c b/arch/riscv/net/bpf_jit_comp32.c
+index 81de865f4c7c..e6497424cbf6 100644
+--- a/arch/riscv/net/bpf_jit_comp32.c
++++ b/arch/riscv/net/bpf_jit_comp32.c
+@@ -1251,6 +1251,10 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
+ return -1;
+ break;
+
++ /* speculation barrier */
++ case BPF_ST | BPF_NOSPEC:
++ break;
++
+ case BPF_ST | BPF_MEM | BPF_B:
+ case BPF_ST | BPF_MEM | BPF_H:
+ case BPF_ST | BPF_MEM | BPF_W:
+diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c
+index 87e3bf5b9086..3af4131c22c7 100644
+--- a/arch/riscv/net/bpf_jit_comp64.c
++++ b/arch/riscv/net/bpf_jit_comp64.c
+@@ -939,6 +939,10 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
+ emit_ld(rd, 0, RV_REG_T1, ctx);
+ break;
+
++ /* speculation barrier */
++ case BPF_ST | BPF_NOSPEC:
++ break;
++
+ /* ST: *(size *)(dst + off) = imm */
+ case BPF_ST | BPF_MEM | BPF_B:
+ emit_imm(RV_REG_T1, imm, ctx);
+diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
+index 2ae419f5115a..88419263a89a 100644
+--- a/arch/s390/net/bpf_jit_comp.c
++++ b/arch/s390/net/bpf_jit_comp.c
+@@ -1153,6 +1153,11 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
+ break;
+ }
+ break;
++ /*
++ * BPF_NOSPEC (speculation barrier)
++ */
++ case BPF_ST | BPF_NOSPEC:
++ break;
+ /*
+ * BPF_ST(X)
+ */
+diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c
+index 4b8d3c65d266..9a2f20cbd48b 100644
+--- a/arch/sparc/net/bpf_jit_comp_64.c
++++ b/arch/sparc/net/bpf_jit_comp_64.c
+@@ -1287,6 +1287,9 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
+ return 1;
+ break;
+ }
++ /* speculation barrier */
++ case BPF_ST | BPF_NOSPEC:
++ break;
+ /* ST: *(size *)(dst + off) = imm */
+ case BPF_ST | BPF_MEM | BPF_W:
+ case BPF_ST | BPF_MEM | BPF_H:
+diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
+index 66e304a84deb..ee9971bbe034 100644
+--- a/arch/x86/net/bpf_jit_comp.c
++++ b/arch/x86/net/bpf_jit_comp.c
+@@ -1235,6 +1235,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
+ }
+ break;
+
++ /* speculation barrier */
++ case BPF_ST | BPF_NOSPEC:
++ if (boot_cpu_has(X86_FEATURE_XMM2))
++ /* Emit 'lfence' */
++ EMIT3(0x0F, 0xAE, 0xE8);
++ break;
++
+ /* ST: *(u8*)(dst_reg + off) = imm */
+ case BPF_ST | BPF_MEM | BPF_B:
+ if (is_ereg(dst_reg))
+diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c
+index 3da88ded6ee3..3bfda5f502cb 100644
+--- a/arch/x86/net/bpf_jit_comp32.c
++++ b/arch/x86/net/bpf_jit_comp32.c
+@@ -1886,6 +1886,12 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
+ i++;
+ break;
+ }
++ /* speculation barrier */
++ case BPF_ST | BPF_NOSPEC:
++ if (boot_cpu_has(X86_FEATURE_XMM2))
++ /* Emit 'lfence' */
++ EMIT3(0x0F, 0xAE, 0xE8);
++ break;
+ /* ST: *(u8*)(dst_reg + off) = imm */
+ case BPF_ST | BPF_MEM | BPF_H:
+ case BPF_ST | BPF_MEM | BPF_B:
+diff --git a/include/linux/filter.h b/include/linux/filter.h
+index 9a09547bc7ba..16e5cebea82c 100644
+--- a/include/linux/filter.h
++++ b/include/linux/filter.h
+@@ -73,6 +73,11 @@ struct ctl_table_header;
+ /* unused opcode to mark call to interpreter with arguments */
+ #define BPF_CALL_ARGS 0xe0
+
++/* unused opcode to mark speculation barrier for mitigating
++ * Speculative Store Bypass
++ */
++#define BPF_NOSPEC 0xc0
++
+ /* As per nm, we expose JITed images as text (code) section for
+ * kallsyms. That way, tools like perf can find it to match
+ * addresses.
+@@ -390,6 +395,16 @@ static inline bool insn_is_zext(const struct bpf_insn *insn)
+ .off = 0, \
+ .imm = 0 })
+
++/* Speculation barrier */
++
++#define BPF_ST_NOSPEC() \
++ ((struct bpf_insn) { \
++ .code = BPF_ST | BPF_NOSPEC, \
++ .dst_reg = 0, \
++ .src_reg = 0, \
++ .off = 0, \
++ .imm = 0 })
++
+ /* Internal classic blocks for direct assignment */
+
+ #define __BPF_STMT(CODE, K) \
+diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
+index 9b1577498373..b1a5fc04492b 100644
+--- a/kernel/bpf/core.c
++++ b/kernel/bpf/core.c
+@@ -32,6 +32,8 @@
+ #include <linux/perf_event.h>
+ #include <linux/extable.h>
+ #include <linux/log2.h>
++
++#include <asm/barrier.h>
+ #include <asm/unaligned.h>
+
+ /* Registers */
+@@ -1377,6 +1379,7 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn)
+ /* Non-UAPI available opcodes. */
+ [BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS,
+ [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL,
++ [BPF_ST | BPF_NOSPEC] = &&ST_NOSPEC,
+ [BPF_LDX | BPF_PROBE_MEM | BPF_B] = &&LDX_PROBE_MEM_B,
+ [BPF_LDX | BPF_PROBE_MEM | BPF_H] = &&LDX_PROBE_MEM_H,
+ [BPF_LDX | BPF_PROBE_MEM | BPF_W] = &&LDX_PROBE_MEM_W,
+@@ -1621,7 +1624,21 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn)
+ COND_JMP(s, JSGE, >=)
+ COND_JMP(s, JSLE, <=)
+ #undef COND_JMP
+- /* STX and ST and LDX*/
++ /* ST, STX and LDX*/
++ ST_NOSPEC:
++ /* Speculation barrier for mitigating Speculative Store Bypass.
++ * In case of arm64, we rely on the firmware mitigation as
++ * controlled via the ssbd kernel parameter. Whenever the
++ * mitigation is enabled, it works for all of the kernel code
++ * with no need to provide any additional instructions here.
++ * In case of x86, we use 'lfence' insn for mitigation. We
++ * reuse preexisting logic from Spectre v1 mitigation that
++ * happens to produce the required code on x86 for v4 as well.
++ */
++#ifdef CONFIG_X86
++ barrier_nospec();
++#endif
++ CONT;
+ #define LDST(SIZEOP, SIZE) \
+ STX_MEM_##SIZEOP: \
+ *(SIZE *)(unsigned long) (DST + insn->off) = SRC; \
+diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
+index bbfc6bb79240..ca3cd9aaa6ce 100644
+--- a/kernel/bpf/disasm.c
++++ b/kernel/bpf/disasm.c
+@@ -206,15 +206,17 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
+ verbose(cbs->private_data, "BUG_%02x\n", insn->code);
+ }
+ } else if (class == BPF_ST) {
+- if (BPF_MODE(insn->code) != BPF_MEM) {
++ if (BPF_MODE(insn->code) == BPF_MEM) {
++ verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n",
++ insn->code,
++ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
++ insn->dst_reg,
++ insn->off, insn->imm);
++ } else if (BPF_MODE(insn->code) == 0xc0 /* BPF_NOSPEC, no UAPI */) {
++ verbose(cbs->private_data, "(%02x) nospec\n", insn->code);
++ } else {
+ verbose(cbs->private_data, "BUG_st_%02x\n", insn->code);
+- return;
+ }
+- verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n",
+- insn->code,
+- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+- insn->dst_reg,
+- insn->off, insn->imm);
+ } else if (class == BPF_LDX) {
+ if (BPF_MODE(insn->code) != BPF_MEM) {
+ verbose(cbs->private_data, "BUG_ldx_%02x\n", insn->code);
+--
+2.30.2
+