From ebcadb84a1796b30b6c6e356cee8a13da1007ac6 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 1 Sep 2021 11:49:55 +0200 Subject: [PATCH] 4.14-stable patches added patches: bpf-do-not-use-ax-register-in-interpreter-on-div-mod.patch bpf-fix-32-bit-src-register-truncation-on-div-mod.patch bpf-fix-subprog-verifier-bypass-by-div-mod-by-0-exception.patch bpf-fix-truncation-handling-for-mod32-dst-reg-wrt-zero.patch kvm-x86-mmu-treat-nx-as-used-not-reserved-for-all-tdp-shadow-mmus.patch kvm-x86-mmu-use-the-correct-inherited-permissions-to-get-shadow-page.patch --- ...x-register-in-interpreter-on-div-mod.patch | 102 +++++++ ...t-src-register-truncation-on-div-mod.patch | 181 ++++++++++++ ...ier-bypass-by-div-mod-by-0-exception.patch | 273 ++++++++++++++++++ ...-handling-for-mod32-dst-reg-wrt-zero.patch | 129 +++++++++ ...not-reserved-for-all-tdp-shadow-mmus.patch | 50 ++++ ...rited-permissions-to-get-shadow-page.patch | 154 ++++++++++ queue-4.14/series | 6 + 7 files changed, 895 insertions(+) create mode 100644 queue-4.14/bpf-do-not-use-ax-register-in-interpreter-on-div-mod.patch create mode 100644 queue-4.14/bpf-fix-32-bit-src-register-truncation-on-div-mod.patch create mode 100644 queue-4.14/bpf-fix-subprog-verifier-bypass-by-div-mod-by-0-exception.patch create mode 100644 queue-4.14/bpf-fix-truncation-handling-for-mod32-dst-reg-wrt-zero.patch create mode 100644 queue-4.14/kvm-x86-mmu-treat-nx-as-used-not-reserved-for-all-tdp-shadow-mmus.patch create mode 100644 queue-4.14/kvm-x86-mmu-use-the-correct-inherited-permissions-to-get-shadow-page.patch diff --git a/queue-4.14/bpf-do-not-use-ax-register-in-interpreter-on-div-mod.patch b/queue-4.14/bpf-do-not-use-ax-register-in-interpreter-on-div-mod.patch new file mode 100644 index 00000000000..b3a4a878a61 --- /dev/null +++ b/queue-4.14/bpf-do-not-use-ax-register-in-interpreter-on-div-mod.patch @@ -0,0 +1,102 @@ +From foo@baz Wed Sep 1 11:42:16 AM CEST 2021 +From: Thadeu Lima de Souza Cascardo +Date: Mon, 30 Aug 2021 15:32:08 -0300 +Subject: bpf: Do not use ax register in interpreter on div/mod +To: stable@vger.kernel.org +Cc: bpf@vger.kernel.org, Daniel Borkmann , Alexei Starovoitov , John Fastabend , Pavel Machek , Thadeu Lima de Souza Cascardo +Message-ID: <20210830183211.339054-2-cascardo@canonical.com> + +From: Daniel Borkmann + +Partially undo old commit 144cd91c4c2b ("bpf: move tmp variable into ax +register in interpreter"). The reason we need this here is because ax +register will be used for holding temporary state for div/mod instruction +which otherwise interpreter would corrupt. This will cause a small +8 byte +stack increase for interpreter, but with the gain that we can use it from +verifier rewrites as scratch register. + +Signed-off-by: Daniel Borkmann +Reviewed-by: John Fastabend +[cascardo: This partial revert is needed in order to support using AX for +the following two commits, as there is no JMP32 on 4.19.y] +Signed-off-by: Thadeu Lima de Souza Cascardo +Signed-off-by: Greg Kroah-Hartman +--- + kernel/bpf/core.c | 32 +++++++++++++++----------------- + 1 file changed, 15 insertions(+), 17 deletions(-) + +--- a/kernel/bpf/core.c ++++ b/kernel/bpf/core.c +@@ -616,9 +616,6 @@ static int bpf_jit_blind_insn(const stru + * below. + * + * Constant blinding is only used by JITs, not in the interpreter. +- * The interpreter uses AX in some occasions as a local temporary +- * register e.g. in DIV or MOD instructions. +- * + * In restricted circumstances, the verifier can also use the AX + * register for rewrites as long as they do not interfere with + * the above cases! +@@ -951,6 +948,7 @@ static unsigned int ___bpf_prog_run(u64 + u32 tail_call_cnt = 0; + void *ptr; + int off; ++ u64 tmp; + + #define CONT ({ insn++; goto select_insn; }) + #define CONT_JMP ({ insn++; goto select_insn; }) +@@ -1013,22 +1011,22 @@ select_insn: + ALU64_MOD_X: + if (unlikely(SRC == 0)) + return 0; +- div64_u64_rem(DST, SRC, &AX); +- DST = AX; ++ div64_u64_rem(DST, SRC, &tmp); ++ DST = tmp; + CONT; + ALU_MOD_X: + if (unlikely((u32)SRC == 0)) + return 0; +- AX = (u32) DST; +- DST = do_div(AX, (u32) SRC); ++ tmp = (u32) DST; ++ DST = do_div(tmp, (u32) SRC); + CONT; + ALU64_MOD_K: +- div64_u64_rem(DST, IMM, &AX); +- DST = AX; ++ div64_u64_rem(DST, IMM, &tmp); ++ DST = tmp; + CONT; + ALU_MOD_K: +- AX = (u32) DST; +- DST = do_div(AX, (u32) IMM); ++ tmp = (u32) DST; ++ DST = do_div(tmp, (u32) IMM); + CONT; + ALU64_DIV_X: + if (unlikely(SRC == 0)) +@@ -1038,17 +1036,17 @@ select_insn: + ALU_DIV_X: + if (unlikely((u32)SRC == 0)) + return 0; +- AX = (u32) DST; +- do_div(AX, (u32) SRC); +- DST = (u32) AX; ++ tmp = (u32) DST; ++ do_div(tmp, (u32) SRC); ++ DST = (u32) tmp; + CONT; + ALU64_DIV_K: + DST = div64_u64(DST, IMM); + CONT; + ALU_DIV_K: +- AX = (u32) DST; +- do_div(AX, (u32) IMM); +- DST = (u32) AX; ++ tmp = (u32) DST; ++ do_div(tmp, (u32) IMM); ++ DST = (u32) tmp; + CONT; + ALU_END_TO_BE: + switch (IMM) { diff --git a/queue-4.14/bpf-fix-32-bit-src-register-truncation-on-div-mod.patch b/queue-4.14/bpf-fix-32-bit-src-register-truncation-on-div-mod.patch new file mode 100644 index 00000000000..dbcb8c8c561 --- /dev/null +++ b/queue-4.14/bpf-fix-32-bit-src-register-truncation-on-div-mod.patch @@ -0,0 +1,181 @@ +From foo@baz Wed Sep 1 11:42:16 AM CEST 2021 +From: Thadeu Lima de Souza Cascardo +Date: Mon, 30 Aug 2021 15:32:10 -0300 +Subject: bpf: Fix 32 bit src register truncation on div/mod +To: stable@vger.kernel.org +Cc: bpf@vger.kernel.org, Daniel Borkmann , Alexei Starovoitov , John Fastabend , Pavel Machek , Salvatore Bonaccorso , Thadeu Lima de Souza Cascardo +Message-ID: <20210830183211.339054-4-cascardo@canonical.com> + +From: Daniel Borkmann + +Commit e88b2c6e5a4d9ce30d75391e4d950da74bb2bd90 upstream. + +While reviewing a different fix, John and I noticed an oddity in one of the +BPF program dumps that stood out, for example: + + # bpftool p d x i 13 + 0: (b7) r0 = 808464450 + 1: (b4) w4 = 808464432 + 2: (bc) w0 = w0 + 3: (15) if r0 == 0x0 goto pc+1 + 4: (9c) w4 %= w0 + [...] + +In line 2 we noticed that the mov32 would 32 bit truncate the original src +register for the div/mod operation. While for the two operations the dst +register is typically marked unknown e.g. from adjust_scalar_min_max_vals() +the src register is not, and thus verifier keeps tracking original bounds, +simplified: + + 0: R1=ctx(id=0,off=0,imm=0) R10=fp0 + 0: (b7) r0 = -1 + 1: R0_w=invP-1 R1=ctx(id=0,off=0,imm=0) R10=fp0 + 1: (b7) r1 = -1 + 2: R0_w=invP-1 R1_w=invP-1 R10=fp0 + 2: (3c) w0 /= w1 + 3: R0_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R1_w=invP-1 R10=fp0 + 3: (77) r1 >>= 32 + 4: R0_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R1_w=invP4294967295 R10=fp0 + 4: (bf) r0 = r1 + 5: R0_w=invP4294967295 R1_w=invP4294967295 R10=fp0 + 5: (95) exit + processed 6 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0 + +Runtime result of r0 at exit is 0 instead of expected -1. Remove the +verifier mov32 src rewrite in div/mod and replace it with a jmp32 test +instead. After the fix, we result in the following code generation when +having dividend r1 and divisor r6: + + div, 64 bit: div, 32 bit: + + 0: (b7) r6 = 8 0: (b7) r6 = 8 + 1: (b7) r1 = 8 1: (b7) r1 = 8 + 2: (55) if r6 != 0x0 goto pc+2 2: (56) if w6 != 0x0 goto pc+2 + 3: (ac) w1 ^= w1 3: (ac) w1 ^= w1 + 4: (05) goto pc+1 4: (05) goto pc+1 + 5: (3f) r1 /= r6 5: (3c) w1 /= w6 + 6: (b7) r0 = 0 6: (b7) r0 = 0 + 7: (95) exit 7: (95) exit + + mod, 64 bit: mod, 32 bit: + + 0: (b7) r6 = 8 0: (b7) r6 = 8 + 1: (b7) r1 = 8 1: (b7) r1 = 8 + 2: (15) if r6 == 0x0 goto pc+1 2: (16) if w6 == 0x0 goto pc+1 + 3: (9f) r1 %= r6 3: (9c) w1 %= w6 + 4: (b7) r0 = 0 4: (b7) r0 = 0 + 5: (95) exit 5: (95) exit + +x86 in particular can throw a 'divide error' exception for div +instruction not only for divisor being zero, but also for the case +when the quotient is too large for the designated register. For the +edx:eax and rdx:rax dividend pair it is not an issue in x86 BPF JIT +since we always zero edx (rdx). Hence really the only protection +needed is against divisor being zero. + +Fixes: 68fda450a7df ("bpf: fix 32-bit divide by zero") +Co-developed-by: John Fastabend +Signed-off-by: John Fastabend +Signed-off-by: Daniel Borkmann +[Salvatore Bonaccorso: This is an earlier version of the patch provided +by Daniel Borkmann which does not rely on availability of the BPF_JMP32 +instruction class. This means it is not even strictly a backport of the +upstream commit mentioned but based on Daniel's and John's work to +address the issue.] +Tested-by: Salvatore Bonaccorso +Signed-off-by: Thadeu Lima de Souza Cascardo +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/filter.h | 24 ++++++++++++++++++++++++ + kernel/bpf/verifier.c | 22 +++++++++++----------- + 2 files changed, 35 insertions(+), 11 deletions(-) + +--- a/include/linux/filter.h ++++ b/include/linux/filter.h +@@ -67,6 +67,14 @@ struct bpf_prog_aux; + + /* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */ + ++#define BPF_ALU_REG(CLASS, OP, DST, SRC) \ ++ ((struct bpf_insn) { \ ++ .code = CLASS | BPF_OP(OP) | BPF_X, \ ++ .dst_reg = DST, \ ++ .src_reg = SRC, \ ++ .off = 0, \ ++ .imm = 0 }) ++ + #define BPF_ALU64_REG(OP, DST, SRC) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_OP(OP) | BPF_X, \ +@@ -113,6 +121,14 @@ struct bpf_prog_aux; + + /* Short form of mov, dst_reg = src_reg */ + ++#define BPF_MOV_REG(CLASS, DST, SRC) \ ++ ((struct bpf_insn) { \ ++ .code = CLASS | BPF_MOV | BPF_X, \ ++ .dst_reg = DST, \ ++ .src_reg = SRC, \ ++ .off = 0, \ ++ .imm = 0 }) ++ + #define BPF_MOV64_REG(DST, SRC) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_MOV | BPF_X, \ +@@ -147,6 +163,14 @@ struct bpf_prog_aux; + .off = 0, \ + .imm = IMM }) + ++#define BPF_RAW_REG(insn, DST, SRC) \ ++ ((struct bpf_insn) { \ ++ .code = (insn).code, \ ++ .dst_reg = DST, \ ++ .src_reg = SRC, \ ++ .off = (insn).off, \ ++ .imm = (insn).imm }) ++ + /* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */ + #define BPF_LD_IMM64(DST, IMM) \ + BPF_LD_IMM64_RAW(DST, 0, IMM) +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -4844,28 +4844,28 @@ static int fixup_bpf_calls(struct bpf_ve + insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { + bool is64 = BPF_CLASS(insn->code) == BPF_ALU64; + struct bpf_insn mask_and_div[] = { +- BPF_MOV32_REG(insn->src_reg, insn->src_reg), ++ BPF_MOV_REG(BPF_CLASS(insn->code), BPF_REG_AX, insn->src_reg), + /* Rx div 0 -> 0 */ +- BPF_JMP_IMM(BPF_JNE, insn->src_reg, 0, 2), +- BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg), ++ BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, 2), ++ BPF_RAW_REG(*insn, insn->dst_reg, BPF_REG_AX), + BPF_JMP_IMM(BPF_JA, 0, 0, 1), +- *insn, ++ BPF_ALU_REG(BPF_CLASS(insn->code), BPF_XOR, insn->dst_reg, insn->dst_reg), + }; + struct bpf_insn mask_and_mod[] = { +- BPF_MOV32_REG(insn->src_reg, insn->src_reg), ++ BPF_MOV_REG(BPF_CLASS(insn->code), BPF_REG_AX, insn->src_reg), + /* Rx mod 0 -> Rx */ +- BPF_JMP_IMM(BPF_JEQ, insn->src_reg, 0, 1), +- *insn, ++ BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, 1), ++ BPF_RAW_REG(*insn, insn->dst_reg, BPF_REG_AX), + }; + struct bpf_insn *patchlet; + + if (insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) || + insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { +- patchlet = mask_and_div + (is64 ? 1 : 0); +- cnt = ARRAY_SIZE(mask_and_div) - (is64 ? 1 : 0); ++ patchlet = mask_and_div; ++ cnt = ARRAY_SIZE(mask_and_div); + } else { +- patchlet = mask_and_mod + (is64 ? 1 : 0); +- cnt = ARRAY_SIZE(mask_and_mod) - (is64 ? 1 : 0); ++ patchlet = mask_and_mod; ++ cnt = ARRAY_SIZE(mask_and_mod); + } + + new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt); diff --git a/queue-4.14/bpf-fix-subprog-verifier-bypass-by-div-mod-by-0-exception.patch b/queue-4.14/bpf-fix-subprog-verifier-bypass-by-div-mod-by-0-exception.patch new file mode 100644 index 00000000000..0cc96039cb0 --- /dev/null +++ b/queue-4.14/bpf-fix-subprog-verifier-bypass-by-div-mod-by-0-exception.patch @@ -0,0 +1,273 @@ +From foo@baz Wed Sep 1 11:42:16 AM CEST 2021 +From: Thadeu Lima de Souza Cascardo +Date: Mon, 30 Aug 2021 15:32:09 -0300 +Subject: bpf: fix subprog verifier bypass by div/mod by 0 exception +To: stable@vger.kernel.org +Cc: bpf@vger.kernel.org, Daniel Borkmann , Alexei Starovoitov , John Fastabend , Pavel Machek , Thadeu Lima de Souza Cascardo +Message-ID: <20210830183211.339054-3-cascardo@canonical.com> + +From: Daniel Borkmann + +Commit f6b1b3bf0d5f681631a293cfe1ca934b81716f1e upstream. + +One of the ugly leftovers from the early eBPF days is that div/mod +operations based on registers have a hard-coded src_reg == 0 test +in the interpreter as well as in JIT code generators that would +return from the BPF program with exit code 0. This was basically +adopted from cBPF interpreter for historical reasons. + +There are multiple reasons why this is very suboptimal and prone +to bugs. To name one: the return code mapping for such abnormal +program exit of 0 does not always match with a suitable program +type's exit code mapping. For example, '0' in tc means action 'ok' +where the packet gets passed further up the stack, which is just +undesirable for such cases (e.g. when implementing policy) and +also does not match with other program types. + +While trying to work out an exception handling scheme, I also +noticed that programs crafted like the following will currently +pass the verifier: + + 0: (bf) r6 = r1 + 1: (85) call pc+8 + caller: + R6=ctx(id=0,off=0,imm=0) R10=fp0,call_-1 + callee: + frame1: R1=ctx(id=0,off=0,imm=0) R10=fp0,call_1 + 10: (b4) (u32) r2 = (u32) 0 + 11: (b4) (u32) r3 = (u32) 1 + 12: (3c) (u32) r3 /= (u32) r2 + 13: (61) r0 = *(u32 *)(r1 +76) + 14: (95) exit + returning from callee: + frame1: R0_w=pkt(id=0,off=0,r=0,imm=0) + R1=ctx(id=0,off=0,imm=0) R2_w=inv0 + R3_w=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) + R10=fp0,call_1 + to caller at 2: + R0_w=pkt(id=0,off=0,r=0,imm=0) R6=ctx(id=0,off=0,imm=0) + R10=fp0,call_-1 + + from 14 to 2: R0=pkt(id=0,off=0,r=0,imm=0) + R6=ctx(id=0,off=0,imm=0) R10=fp0,call_-1 + 2: (bf) r1 = r6 + 3: (61) r1 = *(u32 *)(r1 +80) + 4: (bf) r2 = r0 + 5: (07) r2 += 8 + 6: (2d) if r2 > r1 goto pc+1 + R0=pkt(id=0,off=0,r=8,imm=0) R1=pkt_end(id=0,off=0,imm=0) + R2=pkt(id=0,off=8,r=8,imm=0) R6=ctx(id=0,off=0,imm=0) + R10=fp0,call_-1 + 7: (71) r0 = *(u8 *)(r0 +0) + 8: (b7) r0 = 1 + 9: (95) exit + + from 6 to 8: safe + processed 16 insns (limit 131072), stack depth 0+0 + +Basically what happens is that in the subprog we make use of a +div/mod by 0 exception and in the 'normal' subprog's exit path +we just return skb->data back to the main prog. This has the +implication that the verifier thinks we always get a pkt pointer +in R0 while we still have the implicit 'return 0' from the div +as an alternative unconditional return path earlier. Thus, R0 +then contains 0, meaning back in the parent prog we get the +address range of [0x0, skb->data_end] as read and writeable. +Similar can be crafted with other pointer register types. + +Since i) BPF_ABS/IND is not allowed in programs that contain +BPF to BPF calls (and generally it's also disadvised to use in +native eBPF context), ii) unknown opcodes don't return zero +anymore, iii) we don't return an exception code in dead branches, +the only last missing case affected and to fix is the div/mod +handling. + +What we would really need is some infrastructure to propagate +exceptions all the way to the original prog unwinding the +current stack and returning that code to the caller of the +BPF program. In user space such exception handling for similar +runtimes is typically implemented with setjmp(3) and longjmp(3) +as one possibility which is not available in the kernel, +though (kgdb used to implement it in kernel long time ago). I +implemented a PoC exception handling mechanism into the BPF +interpreter with porting setjmp()/longjmp() into x86_64 and +adding a new internal BPF_ABRT opcode that can use a program +specific exception code for all exception cases we have (e.g. +div/mod by 0, unknown opcodes, etc). While this seems to work +in the constrained BPF environment (meaning, here, we don't +need to deal with state e.g. from memory allocations that we +would need to undo before going into exception state), it still +has various drawbacks: i) we would need to implement the +setjmp()/longjmp() for every arch supported in the kernel and +for x86_64, arm64, sparc64 JITs currently supporting calls, +ii) it has unconditional additional cost on main program +entry to store CPU register state in initial setjmp() call, +and we would need some way to pass the jmp_buf down into +___bpf_prog_run() for main prog and all subprogs, but also +storing on stack is not really nice (other option would be +per-cpu storage for this, but it also has the drawback that +we need to disable preemption for every BPF program types). +All in all this approach would add a lot of complexity. + +Another poor-man's solution would be to have some sort of +additional shared register or scratch buffer to hold state +for exceptions, and test that after every call return to +chain returns and pass R0 all the way down to BPF prog caller. +This is also problematic in various ways: i) an additional +register doesn't map well into JITs, and some other scratch +space could only be on per-cpu storage, which, again has the +side-effect that this only works when we disable preemption, +or somewhere in the input context which is not available +everywhere either, and ii) this adds significant runtime +overhead by putting conditionals after each and every call, +as well as implementation complexity. + +Yet another option is to teach verifier that div/mod can +return an integer, which however is also complex to implement +as verifier would need to walk such fake 'mov r0,; exit;' +sequeuence and there would still be no guarantee for having +propagation of this further down to the BPF caller as proper +exception code. For parent prog, it is also is not distinguishable +from a normal return of a constant scalar value. + +The approach taken here is a completely different one with +little complexity and no additional overhead involved in +that we make use of the fact that a div/mod by 0 is undefined +behavior. Instead of bailing out, we adapt the same behavior +as on some major archs like ARMv8 [0] into eBPF as well: +X div 0 results in 0, and X mod 0 results in X. aarch64 and +aarch32 ISA do not generate any traps or otherwise aborts +of program execution for unsigned divides. I verified this +also with a test program compiled by gcc and clang, and the +behavior matches with the spec. Going forward we adapt the +eBPF verifier to emit such rewrites once div/mod by register +was seen. cBPF is not touched and will keep existing 'return 0' +semantics. Given the options, it seems the most suitable from +all of them, also since major archs have similar schemes in +place. Given this is all in the realm of undefined behavior, +we still have the option to adapt if deemed necessary and +this way we would also have the option of more flexibility +from LLVM code generation side (which is then fully visible +to verifier). Thus, this patch i) fixes the panic seen in +above program and ii) doesn't bypass the verifier observations. + + [0] ARM Architecture Reference Manual, ARMv8 [ARM DDI 0487B.b] + http://infocenter.arm.com/help/topic/com.arm.doc.ddi0487b.b/DDI0487B_b_armv8_arm.pdf + 1) aarch64 instruction set: section C3.4.7 and C6.2.279 (UDIV) + "A division by zero results in a zero being written to + the destination register, without any indication that + the division by zero occurred." + 2) aarch32 instruction set: section F1.4.8 and F5.1.263 (UDIV) + "For the SDIV and UDIV instructions, division by zero + always returns a zero result." + +Fixes: f4d7e40a5b71 ("bpf: introduce function calls (verification)") +Signed-off-by: Daniel Borkmann +Acked-by: Alexei Starovoitov +Signed-off-by: Alexei Starovoitov +Signed-off-by: Thadeu Lima de Souza Cascardo +Signed-off-by: Greg Kroah-Hartman +--- + kernel/bpf/core.c | 8 -------- + kernel/bpf/verifier.c | 38 ++++++++++++++++++++++++++++++-------- + net/core/filter.c | 9 ++++++++- + 3 files changed, 38 insertions(+), 17 deletions(-) + +--- a/kernel/bpf/core.c ++++ b/kernel/bpf/core.c +@@ -1009,14 +1009,10 @@ select_insn: + (*(s64 *) &DST) >>= IMM; + CONT; + ALU64_MOD_X: +- if (unlikely(SRC == 0)) +- return 0; + div64_u64_rem(DST, SRC, &tmp); + DST = tmp; + CONT; + ALU_MOD_X: +- if (unlikely((u32)SRC == 0)) +- return 0; + tmp = (u32) DST; + DST = do_div(tmp, (u32) SRC); + CONT; +@@ -1029,13 +1025,9 @@ select_insn: + DST = do_div(tmp, (u32) IMM); + CONT; + ALU64_DIV_X: +- if (unlikely(SRC == 0)) +- return 0; + DST = div64_u64(DST, SRC); + CONT; + ALU_DIV_X: +- if (unlikely((u32)SRC == 0)) +- return 0; + tmp = (u32) DST; + do_div(tmp, (u32) SRC); + DST = (u32) tmp; +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -4838,15 +4838,37 @@ static int fixup_bpf_calls(struct bpf_ve + struct bpf_insn_aux_data *aux; + + for (i = 0; i < insn_cnt; i++, insn++) { +- if (insn->code == (BPF_ALU | BPF_MOD | BPF_X) || ++ if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) || ++ insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) || ++ insn->code == (BPF_ALU | BPF_MOD | BPF_X) || + insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { +- /* due to JIT bugs clear upper 32-bits of src register +- * before div/mod operation +- */ +- insn_buf[0] = BPF_MOV32_REG(insn->src_reg, insn->src_reg); +- insn_buf[1] = *insn; +- cnt = 2; +- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); ++ bool is64 = BPF_CLASS(insn->code) == BPF_ALU64; ++ struct bpf_insn mask_and_div[] = { ++ BPF_MOV32_REG(insn->src_reg, insn->src_reg), ++ /* Rx div 0 -> 0 */ ++ BPF_JMP_IMM(BPF_JNE, insn->src_reg, 0, 2), ++ BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg), ++ BPF_JMP_IMM(BPF_JA, 0, 0, 1), ++ *insn, ++ }; ++ struct bpf_insn mask_and_mod[] = { ++ BPF_MOV32_REG(insn->src_reg, insn->src_reg), ++ /* Rx mod 0 -> Rx */ ++ BPF_JMP_IMM(BPF_JEQ, insn->src_reg, 0, 1), ++ *insn, ++ }; ++ struct bpf_insn *patchlet; ++ ++ if (insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) || ++ insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { ++ patchlet = mask_and_div + (is64 ? 1 : 0); ++ cnt = ARRAY_SIZE(mask_and_div) - (is64 ? 1 : 0); ++ } else { ++ patchlet = mask_and_mod + (is64 ? 1 : 0); ++ cnt = ARRAY_SIZE(mask_and_mod) - (is64 ? 1 : 0); ++ } ++ ++ new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt); + if (!new_prog) + return -ENOMEM; + +--- a/net/core/filter.c ++++ b/net/core/filter.c +@@ -458,8 +458,15 @@ do_pass: + break; + + if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) || +- fp->code == (BPF_ALU | BPF_MOD | BPF_X)) ++ fp->code == (BPF_ALU | BPF_MOD | BPF_X)) { + *insn++ = BPF_MOV32_REG(BPF_REG_X, BPF_REG_X); ++ /* Error with exception code on div/mod by 0. ++ * For cBPF programs, this was always return 0. ++ */ ++ *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_X, 0, 2); ++ *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); ++ *insn++ = BPF_EXIT_INSN(); ++ } + + *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k); + break; diff --git a/queue-4.14/bpf-fix-truncation-handling-for-mod32-dst-reg-wrt-zero.patch b/queue-4.14/bpf-fix-truncation-handling-for-mod32-dst-reg-wrt-zero.patch new file mode 100644 index 00000000000..c425bcd5676 --- /dev/null +++ b/queue-4.14/bpf-fix-truncation-handling-for-mod32-dst-reg-wrt-zero.patch @@ -0,0 +1,129 @@ +From foo@baz Wed Sep 1 11:42:16 AM CEST 2021 +From: Thadeu Lima de Souza Cascardo +Date: Mon, 30 Aug 2021 15:32:11 -0300 +Subject: bpf: Fix truncation handling for mod32 dst reg wrt zero +To: stable@vger.kernel.org +Cc: bpf@vger.kernel.org, Daniel Borkmann , Alexei Starovoitov , John Fastabend , Pavel Machek , Salvatore Bonaccorso , Thadeu Lima de Souza Cascardo +Message-ID: <20210830183211.339054-5-cascardo@canonical.com> + +From: Daniel Borkmann + +Commit 9b00f1b78809309163dda2d044d9e94a3c0248a3 upstream. + +Recently noticed that when mod32 with a known src reg of 0 is performed, +then the dst register is 32-bit truncated in verifier: + + 0: R1=ctx(id=0,off=0,imm=0) R10=fp0 + 0: (b7) r0 = 0 + 1: R0_w=inv0 R1=ctx(id=0,off=0,imm=0) R10=fp0 + 1: (b7) r1 = -1 + 2: R0_w=inv0 R1_w=inv-1 R10=fp0 + 2: (b4) w2 = -1 + 3: R0_w=inv0 R1_w=inv-1 R2_w=inv4294967295 R10=fp0 + 3: (9c) w1 %= w0 + 4: R0_w=inv0 R1_w=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R2_w=inv4294967295 R10=fp0 + 4: (b7) r0 = 1 + 5: R0_w=inv1 R1_w=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R2_w=inv4294967295 R10=fp0 + 5: (1d) if r1 == r2 goto pc+1 + R0_w=inv1 R1_w=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R2_w=inv4294967295 R10=fp0 + 6: R0_w=inv1 R1_w=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R2_w=inv4294967295 R10=fp0 + 6: (b7) r0 = 2 + 7: R0_w=inv2 R1_w=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R2_w=inv4294967295 R10=fp0 + 7: (95) exit + 7: R0=inv1 R1=inv(id=0,umin_value=4294967295,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R2=inv4294967295 R10=fp0 + 7: (95) exit + +However, as a runtime result, we get 2 instead of 1, meaning the dst +register does not contain (u32)-1 in this case. The reason is fairly +straight forward given the 0 test leaves the dst register as-is: + + # ./bpftool p d x i 23 + 0: (b7) r0 = 0 + 1: (b7) r1 = -1 + 2: (b4) w2 = -1 + 3: (16) if w0 == 0x0 goto pc+1 + 4: (9c) w1 %= w0 + 5: (b7) r0 = 1 + 6: (1d) if r1 == r2 goto pc+1 + 7: (b7) r0 = 2 + 8: (95) exit + +This was originally not an issue given the dst register was marked as +completely unknown (aka 64 bit unknown). However, after 468f6eafa6c4 +("bpf: fix 32-bit ALU op verification") the verifier casts the register +output to 32 bit, and hence it becomes 32 bit unknown. Note that for +the case where the src register is unknown, the dst register is marked +64 bit unknown. After the fix, the register is truncated by the runtime +and the test passes: + + # ./bpftool p d x i 23 + 0: (b7) r0 = 0 + 1: (b7) r1 = -1 + 2: (b4) w2 = -1 + 3: (16) if w0 == 0x0 goto pc+2 + 4: (9c) w1 %= w0 + 5: (05) goto pc+1 + 6: (bc) w1 = w1 + 7: (b7) r0 = 1 + 8: (1d) if r1 == r2 goto pc+1 + 9: (b7) r0 = 2 + 10: (95) exit + +Semantics also match with {R,W}x mod{64,32} 0 -> {R,W}x. Invalid div +has always been {R,W}x div{64,32} 0 -> 0. Rewrites are as follows: + + mod32: mod64: + + (16) if w0 == 0x0 goto pc+2 (15) if r0 == 0x0 goto pc+1 + (9c) w1 %= w0 (9f) r1 %= r0 + (05) goto pc+1 + (bc) w1 = w1 + +Fixes: 468f6eafa6c4 ("bpf: fix 32-bit ALU op verification") +Signed-off-by: Daniel Borkmann +Reviewed-by: John Fastabend +[Salvatore Bonaccorso: This is an earlier version based on work by +Daniel and John which does not rely on availability of the BPF_JMP32 +instruction class. This means it is not even strictly a backport of the +upstream commit mentioned but based on Daniel's and John's work to +address the issue and was finalized by Thadeu Lima de Souza Cascardo.] +Tested-by: Salvatore Bonaccorso +Signed-off-by: Thadeu Lima de Souza Cascardo +Signed-off-by: Greg Kroah-Hartman +--- + kernel/bpf/verifier.c | 9 +++++---- + 1 file changed, 5 insertions(+), 4 deletions(-) + +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -4845,7 +4845,7 @@ static int fixup_bpf_calls(struct bpf_ve + bool is64 = BPF_CLASS(insn->code) == BPF_ALU64; + struct bpf_insn mask_and_div[] = { + BPF_MOV_REG(BPF_CLASS(insn->code), BPF_REG_AX, insn->src_reg), +- /* Rx div 0 -> 0 */ ++ /* [R,W]x div 0 -> 0 */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, 2), + BPF_RAW_REG(*insn, insn->dst_reg, BPF_REG_AX), + BPF_JMP_IMM(BPF_JA, 0, 0, 1), +@@ -4853,9 +4853,10 @@ static int fixup_bpf_calls(struct bpf_ve + }; + struct bpf_insn mask_and_mod[] = { + BPF_MOV_REG(BPF_CLASS(insn->code), BPF_REG_AX, insn->src_reg), +- /* Rx mod 0 -> Rx */ +- BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, 1), ++ BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, 1 + (is64 ? 0 : 1)), + BPF_RAW_REG(*insn, insn->dst_reg, BPF_REG_AX), ++ BPF_JMP_IMM(BPF_JA, 0, 0, 1), ++ BPF_MOV32_REG(insn->dst_reg, insn->dst_reg), + }; + struct bpf_insn *patchlet; + +@@ -4865,7 +4866,7 @@ static int fixup_bpf_calls(struct bpf_ve + cnt = ARRAY_SIZE(mask_and_div); + } else { + patchlet = mask_and_mod; +- cnt = ARRAY_SIZE(mask_and_mod); ++ cnt = ARRAY_SIZE(mask_and_mod) - (is64 ? 2 : 0); + } + + new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt); diff --git a/queue-4.14/kvm-x86-mmu-treat-nx-as-used-not-reserved-for-all-tdp-shadow-mmus.patch b/queue-4.14/kvm-x86-mmu-treat-nx-as-used-not-reserved-for-all-tdp-shadow-mmus.patch new file mode 100644 index 00000000000..f823e3ebb18 --- /dev/null +++ b/queue-4.14/kvm-x86-mmu-treat-nx-as-used-not-reserved-for-all-tdp-shadow-mmus.patch @@ -0,0 +1,50 @@ +From foo@baz Wed Sep 1 11:30:26 AM CEST 2021 +From: Sean Christopherson +Date: Tue, 22 Jun 2021 10:56:47 -0700 +Subject: KVM: x86/mmu: Treat NX as used (not reserved) for all !TDP shadow MMUs + +From: Sean Christopherson + +commit 112022bdb5bc372e00e6e43cb88ee38ea67b97bd upstream + +Mark NX as being used for all non-nested shadow MMUs, as KVM will set the +NX bit for huge SPTEs if the iTLB mutli-hit mitigation is enabled. +Checking the mitigation itself is not sufficient as it can be toggled on +at any time and KVM doesn't reset MMU contexts when that happens. KVM +could reset the contexts, but that would require purging all SPTEs in all +MMUs, for no real benefit. And, KVM already forces EFER.NX=1 when TDP is +disabled (for WP=0, SMEP=1, NX=0), so technically NX is never reserved +for shadow MMUs. + +Fixes: b8e8c8303ff2 ("kvm: mmu: ITLB_MULTIHIT mitigation") +Cc: stable@vger.kernel.org +Signed-off-by: Sean Christopherson +Message-Id: <20210622175739.3610207-3-seanjc@google.com> +Signed-off-by: Paolo Bonzini +[sudip: use old path and adjust context] +Signed-off-by: Sudip Mukherjee +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/mmu.c | 11 ++++++++++- + 1 file changed, 10 insertions(+), 1 deletion(-) + +--- a/arch/x86/kvm/mmu.c ++++ b/arch/x86/kvm/mmu.c +@@ -4326,7 +4326,16 @@ static void reset_rsvds_bits_mask_ept(st + void + reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) + { +- bool uses_nx = context->nx || context->base_role.smep_andnot_wp; ++ /* ++ * KVM uses NX when TDP is disabled to handle a variety of scenarios, ++ * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and ++ * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0. ++ * The iTLB multi-hit workaround can be toggled at any time, so assume ++ * NX can be used by any non-nested shadow MMU to avoid having to reset ++ * MMU contexts. Note, KVM forces EFER.NX=1 when TDP is disabled. ++ */ ++ bool uses_nx = context->nx || !tdp_enabled || ++ context->base_role.smep_andnot_wp; + struct rsvd_bits_validate *shadow_zero_check; + int i; + diff --git a/queue-4.14/kvm-x86-mmu-use-the-correct-inherited-permissions-to-get-shadow-page.patch b/queue-4.14/kvm-x86-mmu-use-the-correct-inherited-permissions-to-get-shadow-page.patch new file mode 100644 index 00000000000..2e2690c2965 --- /dev/null +++ b/queue-4.14/kvm-x86-mmu-use-the-correct-inherited-permissions-to-get-shadow-page.patch @@ -0,0 +1,154 @@ +From b1bd5cba3306691c771d558e94baa73e8b0b96b7 Mon Sep 17 00:00:00 2001 +From: Lai Jiangshan +Date: Thu, 3 Jun 2021 13:24:55 +0800 +Subject: KVM: X86: MMU: Use the correct inherited permissions to get shadow page + +From: Lai Jiangshan + +commit b1bd5cba3306691c771d558e94baa73e8b0b96b7 upstream. + +When computing the access permissions of a shadow page, use the effective +permissions of the walk up to that point, i.e. the logic AND of its parents' +permissions. Two guest PxE entries that point at the same table gfn need to +be shadowed with different shadow pages if their parents' permissions are +different. KVM currently uses the effective permissions of the last +non-leaf entry for all non-leaf entries. Because all non-leaf SPTEs have +full ("uwx") permissions, and the effective permissions are recorded only +in role.access and merged into the leaves, this can lead to incorrect +reuse of a shadow page and eventually to a missing guest protection page +fault. + +For example, here is a shared pagetable: + + pgd[] pud[] pmd[] virtual address pointers + /->pmd1(u--)->pte1(uw-)->page1 <- ptr1 (u--) + /->pud1(uw-)--->pmd2(uw-)->pte2(uw-)->page2 <- ptr2 (uw-) + pgd-| (shared pmd[] as above) + \->pud2(u--)--->pmd1(u--)->pte1(uw-)->page1 <- ptr3 (u--) + \->pmd2(uw-)->pte2(uw-)->page2 <- ptr4 (u--) + + pud1 and pud2 point to the same pmd table, so: + - ptr1 and ptr3 points to the same page. + - ptr2 and ptr4 points to the same page. + +(pud1 and pud2 here are pud entries, while pmd1 and pmd2 here are pmd entries) + +- First, the guest reads from ptr1 first and KVM prepares a shadow + page table with role.access=u--, from ptr1's pud1 and ptr1's pmd1. + "u--" comes from the effective permissions of pgd, pud1 and + pmd1, which are stored in pt->access. "u--" is used also to get + the pagetable for pud1, instead of "uw-". + +- Then the guest writes to ptr2 and KVM reuses pud1 which is present. + The hypervisor set up a shadow page for ptr2 with pt->access is "uw-" + even though the pud1 pmd (because of the incorrect argument to + kvm_mmu_get_page in the previous step) has role.access="u--". + +- Then the guest reads from ptr3. The hypervisor reuses pud1's + shadow pmd for pud2, because both use "u--" for their permissions. + Thus, the shadow pmd already includes entries for both pmd1 and pmd2. + +- At last, the guest writes to ptr4. This causes no vmexit or pagefault, + because pud1's shadow page structures included an "uw-" page even though + its role.access was "u--". + +Any kind of shared pagetable might have the similar problem when in +virtual machine without TDP enabled if the permissions are different +from different ancestors. + +In order to fix the problem, we change pt->access to be an array, and +any access in it will not include permissions ANDed from child ptes. + +The test code is: https://lore.kernel.org/kvm/20210603050537.19605-1-jiangshanlai@gmail.com/ +Remember to test it with TDP disabled. + +The problem had existed long before the commit 41074d07c78b ("KVM: MMU: +Fix inherited permissions for emulated guest pte updates"), and it +is hard to find which is the culprit. So there is no fixes tag here. + +Signed-off-by: Lai Jiangshan +Message-Id: <20210603052455.21023-1-jiangshanlai@gmail.com> +Cc: stable@vger.kernel.org +Fixes: cea0f0e7ea54 ("[PATCH] KVM: MMU: Shadow page table caching") +Signed-off-by: Paolo Bonzini +[OP: - apply arch/x86/kvm/mmu/* changes to arch/x86/kvm + - apply documentation changes to Documentation/virtual/kvm/mmu.txt + - add vcpu parameter to gpte_access() call] +Signed-off-by: Ovidiu Panait +Signed-off-by: Greg Kroah-Hartman + +--- + Documentation/virtual/kvm/mmu.txt | 4 ++-- + arch/x86/kvm/paging_tmpl.h | 14 +++++++++----- + 2 files changed, 11 insertions(+), 7 deletions(-) + +--- a/Documentation/virtual/kvm/mmu.txt ++++ b/Documentation/virtual/kvm/mmu.txt +@@ -152,8 +152,8 @@ Shadow pages contain the following infor + shadow pages) so role.quadrant takes values in the range 0..3. Each + quadrant maps 1GB virtual address space. + role.access: +- Inherited guest access permissions in the form uwx. Note execute +- permission is positive, not negative. ++ Inherited guest access permissions from the parent ptes in the form uwx. ++ Note execute permission is positive, not negative. + role.invalid: + The page is invalid and should not be used. It is a root page that is + currently pinned (by a cpu hardware register pointing to it); once it is +--- a/arch/x86/kvm/paging_tmpl.h ++++ b/arch/x86/kvm/paging_tmpl.h +@@ -93,8 +93,8 @@ struct guest_walker { + gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; + pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS]; + bool pte_writable[PT_MAX_FULL_LEVELS]; +- unsigned pt_access; +- unsigned pte_access; ++ unsigned int pt_access[PT_MAX_FULL_LEVELS]; ++ unsigned int pte_access; + gfn_t gfn; + struct x86_exception fault; + }; +@@ -388,13 +388,15 @@ retry_walk: + } + + walker->ptes[walker->level - 1] = pte; ++ ++ /* Convert to ACC_*_MASK flags for struct guest_walker. */ ++ walker->pt_access[walker->level - 1] = FNAME(gpte_access)(vcpu, pt_access ^ walk_nx_mask); + } while (!is_last_gpte(mmu, walker->level, pte)); + + pte_pkey = FNAME(gpte_pkeys)(vcpu, pte); + accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0; + + /* Convert to ACC_*_MASK flags for struct guest_walker. */ +- walker->pt_access = FNAME(gpte_access)(vcpu, pt_access ^ walk_nx_mask); + walker->pte_access = FNAME(gpte_access)(vcpu, pte_access ^ walk_nx_mask); + errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access); + if (unlikely(errcode)) +@@ -433,7 +435,8 @@ retry_walk: + } + + pgprintk("%s: pte %llx pte_access %x pt_access %x\n", +- __func__, (u64)pte, walker->pte_access, walker->pt_access); ++ __func__, (u64)pte, walker->pte_access, ++ walker->pt_access[walker->level - 1]); + return 1; + + error: +@@ -602,7 +605,7 @@ static int FNAME(fetch)(struct kvm_vcpu + { + struct kvm_mmu_page *sp = NULL; + struct kvm_shadow_walk_iterator it; +- unsigned direct_access, access = gw->pt_access; ++ unsigned int direct_access, access; + int top_level, ret; + gfn_t gfn, base_gfn; + +@@ -634,6 +637,7 @@ static int FNAME(fetch)(struct kvm_vcpu + sp = NULL; + if (!is_shadow_present_pte(*it.sptep)) { + table_gfn = gw->table_gfn[it.level - 2]; ++ access = gw->pt_access[it.level - 2]; + sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1, + false, access); + } diff --git a/queue-4.14/series b/queue-4.14/series index 2d7c264a682..d16840aed2b 100644 --- a/queue-4.14/series +++ b/queue-4.14/series @@ -18,3 +18,9 @@ drm-nouveau-disp-power-down-unused-dp-links-during-i.patch net-rds-dma_map_sg-is-entitled-to-merge-entries.patch vt_kdsetmode-extend-console-locking.patch fbmem-add-margin-check-to-fb_check_caps.patch +kvm-x86-mmu-treat-nx-as-used-not-reserved-for-all-tdp-shadow-mmus.patch +kvm-x86-mmu-use-the-correct-inherited-permissions-to-get-shadow-page.patch +bpf-do-not-use-ax-register-in-interpreter-on-div-mod.patch +bpf-fix-subprog-verifier-bypass-by-div-mod-by-0-exception.patch +bpf-fix-32-bit-src-register-truncation-on-div-mod.patch +bpf-fix-truncation-handling-for-mod32-dst-reg-wrt-zero.patch -- 2.47.3