git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
4.19-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Wed, 15 Sep 2021 12:56:30 +0000 (14:56 +0200)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Wed, 15 Sep 2021 12:56:30 +0000 (14:56 +0200)
added patches:
arm64-head-avoid-over-mapping-in-map_memory.patch
block-bfq-fix-bfq_set_next_ioprio_data.patch
bpf-correct-slot_type-marking-logic-to-allow-more-stack-slot-sharing.patch
bpf-fix-leakage-due-to-insufficient-speculative-store-bypass-mitigation.patch
bpf-fix-pointer-arithmetic-mask-tightening-under-state-pruning.patch
bpf-introduce-bpf-nospec-instruction-for-mitigating-spectre-v4.patch
bpf-reject-indirect-var_off-stack-access-in-raw-mode.patch
bpf-reject-indirect-var_off-stack-access-in-unpriv-mode.patch
bpf-sanity-check-max-value-for-var_off-stack-access.patch
bpf-support-variable-offset-stack-access-from-helpers.patch
bpf-track-spill-fill-of-constants.patch
bpf-verifier-allocate-idmap-scratch-in-verifier-env.patch
bpf-verifier-per-register-parent-pointers.patch
crypto-public_key-fix-overflow-during-implicit-conversion.patch
dm-crypt-avoid-percpu_counter-spinlock-contention-in-crypt_page_alloc.patch
media-rc-loopback-return-number-of-emitters-rather-than-error.patch
media-uvc-don-t-do-dma-on-stack.patch
power-supply-max17042-handle-fails-of-reading-status-register.patch
selftests-bpf-fix-tests-due-to-const-spill-fill.patch
selftests-bpf-test-variable-offset-stack-access.patch
soc-aspeed-lpc-ctrl-fix-boundary-check-for-mmap.patch
tools-thermal-tmon-add-cross-compiling-support.patch
vmci-fix-null-pointer-dereference-when-unmapping-queue-pair.patch

24 files changed:
queue-4.19/arm64-head-avoid-over-mapping-in-map_memory.patch [new file with mode: 0644]
queue-4.19/block-bfq-fix-bfq_set_next_ioprio_data.patch [new file with mode: 0644]
queue-4.19/bpf-correct-slot_type-marking-logic-to-allow-more-stack-slot-sharing.patch [new file with mode: 0644]
queue-4.19/bpf-fix-leakage-due-to-insufficient-speculative-store-bypass-mitigation.patch [new file with mode: 0644]
queue-4.19/bpf-fix-pointer-arithmetic-mask-tightening-under-state-pruning.patch [new file with mode: 0644]
queue-4.19/bpf-introduce-bpf-nospec-instruction-for-mitigating-spectre-v4.patch [new file with mode: 0644]
queue-4.19/bpf-reject-indirect-var_off-stack-access-in-raw-mode.patch [new file with mode: 0644]
queue-4.19/bpf-reject-indirect-var_off-stack-access-in-unpriv-mode.patch [new file with mode: 0644]
queue-4.19/bpf-sanity-check-max-value-for-var_off-stack-access.patch [new file with mode: 0644]
queue-4.19/bpf-support-variable-offset-stack-access-from-helpers.patch [new file with mode: 0644]
queue-4.19/bpf-track-spill-fill-of-constants.patch [new file with mode: 0644]
queue-4.19/bpf-verifier-allocate-idmap-scratch-in-verifier-env.patch [new file with mode: 0644]
queue-4.19/bpf-verifier-per-register-parent-pointers.patch [new file with mode: 0644]
queue-4.19/crypto-public_key-fix-overflow-during-implicit-conversion.patch [new file with mode: 0644]
queue-4.19/dm-crypt-avoid-percpu_counter-spinlock-contention-in-crypt_page_alloc.patch [new file with mode: 0644]
queue-4.19/media-rc-loopback-return-number-of-emitters-rather-than-error.patch [new file with mode: 0644]
queue-4.19/media-uvc-don-t-do-dma-on-stack.patch [new file with mode: 0644]
queue-4.19/power-supply-max17042-handle-fails-of-reading-status-register.patch [new file with mode: 0644]
queue-4.19/selftests-bpf-fix-tests-due-to-const-spill-fill.patch [new file with mode: 0644]
queue-4.19/selftests-bpf-test-variable-offset-stack-access.patch [new file with mode: 0644]
queue-4.19/series
queue-4.19/soc-aspeed-lpc-ctrl-fix-boundary-check-for-mmap.patch [new file with mode: 0644]
queue-4.19/tools-thermal-tmon-add-cross-compiling-support.patch [new file with mode: 0644]
queue-4.19/vmci-fix-null-pointer-dereference-when-unmapping-queue-pair.patch [new file with mode: 0644]

diff --git a/queue-4.19/arm64-head-avoid-over-mapping-in-map_memory.patch b/queue-4.19/arm64-head-avoid-over-mapping-in-map_memory.patch
new file mode 100644 (file)
index 0000000..90eb7e0
--- /dev/null
@@ -0,0 +1,106 @@
+From 90268574a3e8a6b883bd802d702a2738577e1006 Mon Sep 17 00:00:00 2001
+From: Mark Rutland <mark.rutland@arm.com>
+Date: Mon, 23 Aug 2021 11:12:53 +0100
+Subject: arm64: head: avoid over-mapping in map_memory
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+commit 90268574a3e8a6b883bd802d702a2738577e1006 upstream.
+
+The `compute_indices` and `populate_entries` macros operate on inclusive
+bounds, and thus the `map_memory` macro which uses them also operates
+on inclusive bounds.
+
+We pass `_end` and `_idmap_text_end` to `map_memory`, but these are
+exclusive bounds, and if one of these is sufficiently aligned (as a
+result of kernel configuration, physical placement, and KASLR), then:
+
+* In `compute_indices`, the computed `iend` will be in the page/block *after*
+  the final byte of the intended mapping.
+
+* In `populate_entries`, an unnecessary entry will be created at the end
+  of each level of table. At the leaf level, this entry will map up to
+  SWAPPER_BLOCK_SIZE bytes of physical addresses that we did not intend
+  to map.
+
+As we may map up to SWAPPER_BLOCK_SIZE bytes more than intended, we may
+violate the boot protocol and map physical addresses past the 2MiB-aligned
+end address we are permitted to map. As we map these with Normal memory
+attributes, this may result in further problems depending on what these
+physical addresses correspond to.
+
+The final entry at each level may require an additional table at that
+level. As EARLY_ENTRIES() calculates an inclusive bound, we allocate
+enough memory for this.
+
+Avoid the extraneous mapping by having map_memory convert the exclusive
+end address to an inclusive end address by subtracting one, and do
+likewise in EARLY_ENTRIES() when calculating the number of required
+tables. For clarity, comments are updated to more clearly document which
+boundaries the macros operate on.  For consistency with the other
+macros, the comments in map_memory are also updated to describe `vstart`
+and `vend` as virtual addresses.
+
+Fixes: 0370b31e4845 ("arm64: Extend early page table code to allow for larger kernels")
+Cc: <stable@vger.kernel.org> # 4.16.x
+Signed-off-by: Mark Rutland <mark.rutland@arm.com>
+Cc: Anshuman Khandual <anshuman.khandual@arm.com>
+Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
+Cc: Steve Capper <steve.capper@arm.com>
+Cc: Will Deacon <will@kernel.org>
+Acked-by: Will Deacon <will@kernel.org>
+Link: https://lore.kernel.org/r/20210823101253.55567-1-mark.rutland@arm.com
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/include/asm/kernel-pgtable.h |    4 ++--
+ arch/arm64/kernel/head.S                |   11 ++++++-----
+ 2 files changed, 8 insertions(+), 7 deletions(-)
+
+--- a/arch/arm64/include/asm/kernel-pgtable.h
++++ b/arch/arm64/include/asm/kernel-pgtable.h
+@@ -76,8 +76,8 @@
+ #define EARLY_KASLR   (0)
+ #endif
+-#define EARLY_ENTRIES(vstart, vend, shift) (((vend) >> (shift)) \
+-                                      - ((vstart) >> (shift)) + 1 + EARLY_KASLR)
++#define EARLY_ENTRIES(vstart, vend, shift) \
++      ((((vend) - 1) >> (shift)) - ((vstart) >> (shift)) + 1 + EARLY_KASLR)
+ #define EARLY_PGDS(vstart, vend) (EARLY_ENTRIES(vstart, vend, PGDIR_SHIFT))
+--- a/arch/arm64/kernel/head.S
++++ b/arch/arm64/kernel/head.S
+@@ -202,7 +202,7 @@ ENDPROC(preserve_boot_args)
+  * to be composed of multiple pages. (This effectively scales the end index).
+  *
+  *    vstart: virtual address of start of range
+- *    vend:   virtual address of end of range
++ *    vend:   virtual address of end of range - we map [vstart, vend]
+  *    shift:  shift used to transform virtual address into index
+  *    ptrs:   number of entries in page table
+  *    istart: index in table corresponding to vstart
+@@ -239,17 +239,18 @@ ENDPROC(preserve_boot_args)
+  *
+  *    tbl:    location of page table
+  *    rtbl:   address to be used for first level page table entry (typically tbl + PAGE_SIZE)
+- *    vstart: start address to map
+- *    vend:   end address to map - we map [vstart, vend]
++ *    vstart: virtual address of start of range
++ *    vend:   virtual address of end of range - we map [vstart, vend - 1]
+  *    flags:  flags to use to map last level entries
+  *    phys:   physical address corresponding to vstart - physical memory is contiguous
+  *    pgds:   the number of pgd entries
+  *
+  * Temporaries:       istart, iend, tmp, count, sv - these need to be different registers
+- * Preserves: vstart, vend, flags
+- * Corrupts:  tbl, rtbl, istart, iend, tmp, count, sv
++ * Preserves: vstart, flags
++ * Corrupts:  tbl, rtbl, vend, istart, iend, tmp, count, sv
+  */
+       .macro map_memory, tbl, rtbl, vstart, vend, flags, phys, pgds, istart, iend, tmp, count, sv
++      sub \vend, \vend, #1
+       add \rtbl, \tbl, #PAGE_SIZE
+       mov \sv, \rtbl
+       mov \count, #0
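
The off-by-one the commit message describes can be reproduced in isolation. The following is a minimal standalone C sketch, not part of the queued patch: it contrasts the old EARLY_ENTRIES() arithmetic, which treats the exclusive end address as if it were inclusive, with the fixed form that subtracts one first. The 2MiB shift is just an example and EARLY_KASLR is omitted.

  #include <stdio.h>

  #define SHIFT 21  /* example: 2MiB swapper blocks */

  /* old behaviour: the exclusive vend is used as if it were inclusive */
  static unsigned long entries_old(unsigned long vstart, unsigned long vend)
  {
          return (vend >> SHIFT) - (vstart >> SHIFT) + 1;
  }

  /* fixed behaviour: convert the exclusive end to inclusive by subtracting 1 */
  static unsigned long entries_new(unsigned long vstart, unsigned long vend)
  {
          return ((vend - 1) >> SHIFT) - (vstart >> SHIFT) + 1;
  }

  int main(void)
  {
          unsigned long vstart = 0x40000000UL;
          unsigned long vend   = 0x40400000UL;  /* exclusive end, 2MiB aligned */

          /* prints old=3 new=2: the old form covers one extra 2MiB block */
          printf("old=%lu new=%lu\n",
                 entries_old(vstart, vend), entries_new(vstart, vend));
          return 0;
  }
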
diff --git a/queue-4.19/block-bfq-fix-bfq_set_next_ioprio_data.patch b/queue-4.19/block-bfq-fix-bfq_set_next_ioprio_data.patch
new file mode 100644 (file)
index 0000000..06beda4
--- /dev/null
@@ -0,0 +1,38 @@
+From a680dd72ec336b81511e3bff48efac6dbfa563e7 Mon Sep 17 00:00:00 2001
+From: Damien Le Moal <damien.lemoal@wdc.com>
+Date: Wed, 11 Aug 2021 12:36:57 +0900
+Subject: block: bfq: fix bfq_set_next_ioprio_data()
+
+From: Damien Le Moal <damien.lemoal@wdc.com>
+
+commit a680dd72ec336b81511e3bff48efac6dbfa563e7 upstream.
+
+For a request that has a priority level equal to or larger than
+IOPRIO_BE_NR, bfq_set_next_ioprio_data() prints a critical warning but
+defaults to setting the request new_ioprio field to IOPRIO_BE_NR. This
+is not consistent with the warning and the allowed values for priority
+levels. Fix this by setting the request new_ioprio field to
+IOPRIO_BE_NR - 1, the lowest priority level allowed.
+
+Cc: <stable@vger.kernel.org>
+Fixes: aee69d78dec0 ("block, bfq: introduce the BFQ-v0 I/O scheduler as an extra scheduler")
+Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
+Reviewed-by: Hannes Reinecke <hare@suse.de>
+Link: https://lore.kernel.org/r/20210811033702.368488-2-damien.lemoal@wdc.com
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ block/bfq-iosched.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/block/bfq-iosched.c
++++ b/block/bfq-iosched.c
+@@ -4241,7 +4241,7 @@ bfq_set_next_ioprio_data(struct bfq_queu
+       if (bfqq->new_ioprio >= IOPRIO_BE_NR) {
+               pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n",
+                       bfqq->new_ioprio);
+-              bfqq->new_ioprio = IOPRIO_BE_NR;
++              bfqq->new_ioprio = IOPRIO_BE_NR - 1;
+       }
+       bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio);
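
As a quick illustration of the clamp applied above (a sketch, not part of the queued patch): valid best-effort levels run from 0 to IOPRIO_BE_NR - 1, so an out-of-range value must fall back to IOPRIO_BE_NR - 1, the lowest allowed level, rather than to IOPRIO_BE_NR itself.

  #define IOPRIO_BE_NR 8  /* number of best-effort priority levels */

  static int clamp_be_ioprio(int ioprio)
  {
          /* levels 0..7 are valid; anything higher maps to the lowest level */
          return (ioprio >= IOPRIO_BE_NR) ? IOPRIO_BE_NR - 1 : ioprio;
  }
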
diff --git a/queue-4.19/bpf-correct-slot_type-marking-logic-to-allow-more-stack-slot-sharing.patch b/queue-4.19/bpf-correct-slot_type-marking-logic-to-allow-more-stack-slot-sharing.patch
new file mode 100644 (file)
index 0000000..45fadfe
--- /dev/null
@@ -0,0 +1,118 @@
+From foo@baz Wed Sep 15 02:37:47 PM CEST 2021
+From: Ovidiu Panait <ovidiu.panait@windriver.com>
+Date: Mon, 13 Sep 2021 18:35:26 +0300
+Subject: bpf: correct slot_type marking logic to allow more stack slot sharing
+To: stable@vger.kernel.org
+Cc: bpf@vger.kernel.org, daniel@iogearbox.net
+Message-ID: <20210913153537.2162465-3-ovidiu.panait@windriver.com>
+
+From: Jiong Wang <jiong.wang@netronome.com>
+
+commit 0bae2d4d62d523f06ff1a8e88ce38b45400acd28 upstream.
+
+Verifier is supposed to support sharing stack slot allocated to ptr with
+SCALAR_VALUE for privileged program. However this doesn't happen for some
+cases.
+
+The reason is that the verifier is not clearing slot_type STACK_SPILL for
+all bytes; it only clears part of them, while the verifier is using:
+
+  slot_type[0] == STACK_SPILL
+
+as a convention to check one slot is ptr type.
+
+So, the consequence of partially clearing slot_type is that the verifier
+could treat a partially overridden ptr slot, which should now be a
+SCALAR_VALUE slot, still as a ptr slot, and reject some valid programs.
+
+Before this patch, test_xdp_noinline.o under bpf selftests, and bpf_lxc.o
+and bpf_netdev.o under the Cilium bpf repo, when built with -mattr=+alu32,
+are rejected due to this issue. After this patch, they are all accepted.
+
+There is no processed insn number change before and after this patch on
+Cilium bpf programs.
+
+Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
+Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
+Reviewed-by: Daniel Borkmann <daniel@iogearbox.net>
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+[OP: adjusted context for 4.19]
+Signed-off-by: Ovidiu Panait <ovidiu.panait@windriver.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/bpf/verifier.c                       |    5 ++++
+ tools/testing/selftests/bpf/test_verifier.c |   34 ++++++++++++++++++++++++++--
+ 2 files changed, 37 insertions(+), 2 deletions(-)
+
+--- a/kernel/bpf/verifier.c
++++ b/kernel/bpf/verifier.c
+@@ -1039,6 +1039,10 @@ static int check_stack_write(struct bpf_
+               /* regular write of data into stack destroys any spilled ptr */
+               state->stack[spi].spilled_ptr.type = NOT_INIT;
++              /* Mark slots as STACK_MISC if they belonged to spilled ptr. */
++              if (state->stack[spi].slot_type[0] == STACK_SPILL)
++                      for (i = 0; i < BPF_REG_SIZE; i++)
++                              state->stack[spi].slot_type[i] = STACK_MISC;
+               /* only mark the slot as written if all 8 bytes were written
+                * otherwise read propagation may incorrectly stop too soon
+@@ -1056,6 +1060,7 @@ static int check_stack_write(struct bpf_
+                   register_is_null(&cur->regs[value_regno]))
+                       type = STACK_ZERO;
++              /* Mark slots affected by this stack write. */
+               for (i = 0; i < size; i++)
+                       state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] =
+                               type;
+--- a/tools/testing/selftests/bpf/test_verifier.c
++++ b/tools/testing/selftests/bpf/test_verifier.c
+@@ -956,16 +956,46 @@ static struct bpf_test tests[] = {
+                       BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8),
+                       /* mess up with R1 pointer on stack */
+                       BPF_ST_MEM(BPF_B, BPF_REG_10, -7, 0x23),
+-                      /* fill back into R0 should fail */
++                      /* fill back into R0 is fine for priv.
++                       * R0 now becomes SCALAR_VALUE.
++                       */
+                       BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8),
++                      /* Load from R0 should fail. */
++                      BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 8),
+                       BPF_EXIT_INSN(),
+               },
+               .errstr_unpriv = "attempt to corrupt spilled",
+-              .errstr = "corrupted spill",
++              .errstr = "R0 invalid mem access 'inv",
+               .result = REJECT,
+               .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+       },
+       {
++              "check corrupted spill/fill, LSB",
++              .insns = {
++                      BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8),
++                      BPF_ST_MEM(BPF_H, BPF_REG_10, -8, 0xcafe),
++                      BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8),
++                      BPF_EXIT_INSN(),
++              },
++              .errstr_unpriv = "attempt to corrupt spilled",
++              .result_unpriv = REJECT,
++              .result = ACCEPT,
++              .retval = POINTER_VALUE,
++      },
++      {
++              "check corrupted spill/fill, MSB",
++              .insns = {
++                      BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8),
++                      BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0x12345678),
++                      BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8),
++                      BPF_EXIT_INSN(),
++              },
++              .errstr_unpriv = "attempt to corrupt spilled",
++              .result_unpriv = REJECT,
++              .result = ACCEPT,
++              .retval = POINTER_VALUE,
++      },
++      {
+               "invalid src register in STX",
+               .insns = {
+                       BPF_STX_MEM(BPF_B, BPF_REG_10, -1, -1),
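
The slot_type convention referenced in the commit message can be summarized with a simplified sketch (not part of the queued patch; names and types are reduced to the essentials): every 8-byte stack slot carries one slot_type entry per byte, and the verifier consults only slot_type[0] to decide whether the slot still holds a spilled pointer, which is why a partial overwrite has to downgrade all eight bytes.

  #define BPF_REG_SIZE 8

  enum slot_kind { STACK_INVALID, STACK_SPILL, STACK_MISC };

  /* the verifier's convention: only byte 0 is consulted */
  static int slot_holds_spilled_ptr(const enum slot_kind slot_type[BPF_REG_SIZE])
  {
          return slot_type[0] == STACK_SPILL;
  }

  /* a (possibly partial) scalar write into the slot */
  static void scalar_write(enum slot_kind slot_type[BPF_REG_SIZE], int size)
  {
          int i;

          /* a formerly spilled pointer slot must be downgraded entirely,
           * otherwise slot_type[0] keeps saying STACK_SPILL and the slot is
           * still treated as a pointer */
          if (slot_holds_spilled_ptr(slot_type))
                  for (i = 0; i < BPF_REG_SIZE; i++)
                          slot_type[i] = STACK_MISC;

          /* then mark the bytes actually written by this store */
          for (i = 0; i < size; i++)
                  slot_type[i] = STACK_MISC;
  }
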
diff --git a/queue-4.19/bpf-fix-leakage-due-to-insufficient-speculative-store-bypass-mitigation.patch b/queue-4.19/bpf-fix-leakage-due-to-insufficient-speculative-store-bypass-mitigation.patch
new file mode 100644 (file)
index 0000000..80d01b0
--- /dev/null
@@ -0,0 +1,450 @@
+From foo@baz Wed Sep 15 02:37:47 PM CEST 2021
+From: Ovidiu Panait <ovidiu.panait@windriver.com>
+Date: Mon, 13 Sep 2021 18:35:35 +0300
+Subject: bpf: Fix leakage due to insufficient speculative store bypass mitigation
+To: stable@vger.kernel.org
+Cc: bpf@vger.kernel.org, daniel@iogearbox.net
+Message-ID: <20210913153537.2162465-12-ovidiu.panait@windriver.com>
+
+From: Daniel Borkmann <daniel@iogearbox.net>
+
+commit 2039f26f3aca5b0e419b98f65dd36481337b86ee upstream.
+
+Spectre v4 gadgets make use of memory disambiguation, which is a set of
+techniques that execute memory access instructions, that is, loads and
+stores, out of program order; Intel's optimization manual, section 2.4.4.5:
+
+  A load instruction micro-op may depend on a preceding store. Many
+  microarchitectures block loads until all preceding store addresses are
+  known. The memory disambiguator predicts which loads will not depend on
+  any previous stores. When the disambiguator predicts that a load does
+  not have such a dependency, the load takes its data from the L1 data
+  cache. Eventually, the prediction is verified. If an actual conflict is
+  detected, the load and all succeeding instructions are re-executed.
+
+af86ca4e3088 ("bpf: Prevent memory disambiguation attack") tried to mitigate
+this attack by sanitizing the memory locations through preemptive "fast"
+(low latency) stores of zero prior to the actual "slow" (high latency) store
+of a pointer value such that upon dependency misprediction the CPU then
+speculatively executes the load of the pointer value and retrieves the zero
+value instead of the attacker controlled scalar value previously stored at
+that location, meaning, subsequent access in the speculative domain is then
+redirected to the "zero page".
+
+The sanitized preemptive store of zero prior to the actual "slow" store is
+done through a simple ST instruction based on r10 (frame pointer) with
+relative offset to the stack location that the verifier has been tracking
+on the original used register for STX, which does not have to be r10. Thus,
+there are no memory dependencies for this store, since it's only using r10
+and immediate constant of zero; hence af86ca4e3088 /assumed/ a low latency
+operation.
+
+However, a recent attack demonstrated that this mitigation is not sufficient
+since the preemptive store of zero could also be turned into a "slow" store
+and is thus bypassed as well:
+
+  [...]
+  // r2 = oob address (e.g. scalar)
+  // r7 = pointer to map value
+  31: (7b) *(u64 *)(r10 -16) = r2
+  // r9 will remain "fast" register, r10 will become "slow" register below
+  32: (bf) r9 = r10
+  // JIT maps BPF reg to x86 reg:
+  //  r9  -> r15 (callee saved)
+  //  r10 -> rbp
+  // train store forward prediction to break dependency link between both r9
+  // and r10 by evicting them from the predictor's LRU table.
+  33: (61) r0 = *(u32 *)(r7 +24576)
+  34: (63) *(u32 *)(r7 +29696) = r0
+  35: (61) r0 = *(u32 *)(r7 +24580)
+  36: (63) *(u32 *)(r7 +29700) = r0
+  37: (61) r0 = *(u32 *)(r7 +24584)
+  38: (63) *(u32 *)(r7 +29704) = r0
+  39: (61) r0 = *(u32 *)(r7 +24588)
+  40: (63) *(u32 *)(r7 +29708) = r0
+  [...]
+  543: (61) r0 = *(u32 *)(r7 +25596)
+  544: (63) *(u32 *)(r7 +30716) = r0
+  // prepare call to bpf_ringbuf_output() helper. the latter will cause rbp
+  // to spill to stack memory while r13/r14/r15 (all callee saved regs) remain
+  // in hardware registers. rbp becomes slow due to push/pop latency. below is
+  // disasm of bpf_ringbuf_output() helper for better visual context:
+  //
+  // ffffffff8117ee20: 41 54                 push   r12
+  // ffffffff8117ee22: 55                    push   rbp
+  // ffffffff8117ee23: 53                    push   rbx
+  // ffffffff8117ee24: 48 f7 c1 fc ff ff ff  test   rcx,0xfffffffffffffffc
+  // ffffffff8117ee2b: 0f 85 af 00 00 00     jne    ffffffff8117eee0 <-- jump taken
+  // [...]
+  // ffffffff8117eee0: 49 c7 c4 ea ff ff ff  mov    r12,0xffffffffffffffea
+  // ffffffff8117eee7: 5b                    pop    rbx
+  // ffffffff8117eee8: 5d                    pop    rbp
+  // ffffffff8117eee9: 4c 89 e0              mov    rax,r12
+  // ffffffff8117eeec: 41 5c                 pop    r12
+  // ffffffff8117eeee: c3                    ret
+  545: (18) r1 = map[id:4]
+  547: (bf) r2 = r7
+  548: (b7) r3 = 0
+  549: (b7) r4 = 4
+  550: (85) call bpf_ringbuf_output#194288
+  // instruction 551 inserted by verifier    \
+  551: (7a) *(u64 *)(r10 -16) = 0            | /both/ are now slow stores here
+  // storing map value pointer r7 at fp-16   | since value of r10 is "slow".
+  552: (7b) *(u64 *)(r10 -16) = r7           /
+  // following "fast" read to the same memory location, but due to dependency
+  // misprediction it will speculatively execute before insn 551/552 completes.
+  553: (79) r2 = *(u64 *)(r9 -16)
+  // in speculative domain contains attacker controlled r2. in non-speculative
+  // domain this contains r7, and thus accesses r7 +0 below.
+  554: (71) r3 = *(u8 *)(r2 +0)
+  // leak r3
+
+As can be seen, the current speculative store bypass mitigation which the
+verifier inserts at line 551 is insufficient since /both/, the write of
+the zero sanitation as well as the map value pointer are a high latency
+instruction due to prior memory access via push/pop of r10 (rbp) in contrast
+to the low latency read in line 553 as r9 (r15) which stays in hardware
+registers. Thus, architecturally, fp-16 is r7, however, microarchitecturally,
+fp-16 can still be r2.
+
+Initial thoughts to address this issue was to track spilled pointer loads
+from stack and enforce their load via LDX through r10 as well so that /both/
+the preemptive store of zero /as well as/ the load use the /same/ register
+such that a dependency is created between the store and load. However, this
+option is not sufficient either since it can be bypassed as well under
+speculation. An updated attack with pointer spill/fills now _all_ based on
+r10 would look as follows:
+
+  [...]
+  // r2 = oob address (e.g. scalar)
+  // r7 = pointer to map value
+  [...]
+  // longer store forward prediction training sequence than before.
+  2062: (61) r0 = *(u32 *)(r7 +25588)
+  2063: (63) *(u32 *)(r7 +30708) = r0
+  2064: (61) r0 = *(u32 *)(r7 +25592)
+  2065: (63) *(u32 *)(r7 +30712) = r0
+  2066: (61) r0 = *(u32 *)(r7 +25596)
+  2067: (63) *(u32 *)(r7 +30716) = r0
+  // store the speculative load address (scalar) this time after the store
+  // forward prediction training.
+  2068: (7b) *(u64 *)(r10 -16) = r2
+  // preoccupy the CPU store port by running sequence of dummy stores.
+  2069: (63) *(u32 *)(r7 +29696) = r0
+  2070: (63) *(u32 *)(r7 +29700) = r0
+  2071: (63) *(u32 *)(r7 +29704) = r0
+  2072: (63) *(u32 *)(r7 +29708) = r0
+  2073: (63) *(u32 *)(r7 +29712) = r0
+  2074: (63) *(u32 *)(r7 +29716) = r0
+  2075: (63) *(u32 *)(r7 +29720) = r0
+  2076: (63) *(u32 *)(r7 +29724) = r0
+  2077: (63) *(u32 *)(r7 +29728) = r0
+  2078: (63) *(u32 *)(r7 +29732) = r0
+  2079: (63) *(u32 *)(r7 +29736) = r0
+  2080: (63) *(u32 *)(r7 +29740) = r0
+  2081: (63) *(u32 *)(r7 +29744) = r0
+  2082: (63) *(u32 *)(r7 +29748) = r0
+  2083: (63) *(u32 *)(r7 +29752) = r0
+  2084: (63) *(u32 *)(r7 +29756) = r0
+  2085: (63) *(u32 *)(r7 +29760) = r0
+  2086: (63) *(u32 *)(r7 +29764) = r0
+  2087: (63) *(u32 *)(r7 +29768) = r0
+  2088: (63) *(u32 *)(r7 +29772) = r0
+  2089: (63) *(u32 *)(r7 +29776) = r0
+  2090: (63) *(u32 *)(r7 +29780) = r0
+  2091: (63) *(u32 *)(r7 +29784) = r0
+  2092: (63) *(u32 *)(r7 +29788) = r0
+  2093: (63) *(u32 *)(r7 +29792) = r0
+  2094: (63) *(u32 *)(r7 +29796) = r0
+  2095: (63) *(u32 *)(r7 +29800) = r0
+  2096: (63) *(u32 *)(r7 +29804) = r0
+  2097: (63) *(u32 *)(r7 +29808) = r0
+  2098: (63) *(u32 *)(r7 +29812) = r0
+  // overwrite scalar with dummy pointer; same as before, also including the
+  // sanitation store with 0 from the current mitigation by the verifier.
+  2099: (7a) *(u64 *)(r10 -16) = 0         | /both/ are now slow stores here
+  2100: (7b) *(u64 *)(r10 -16) = r7        | since store unit is still busy.
+  // load from stack intended to bypass stores.
+  2101: (79) r2 = *(u64 *)(r10 -16)
+  2102: (71) r3 = *(u8 *)(r2 +0)
+  // leak r3
+  [...]
+
+Looking at the CPU microarchitecture, the scheduler might issue loads (such
+as seen in line 2101) before stores (line 2099,2100) because the load execution
+units become available while the store execution unit is still busy with the
+sequence of dummy stores (line 2069-2098). And so the load may use the prior
+stored scalar from r2 at address r10 -16 for speculation. The updated attack
+may work less reliably on CPU microarchitectures where loads and stores share
+execution resources.
+
+This concludes that the sanitizing with zero stores from af86ca4e3088 ("bpf:
+Prevent memory disambiguation attack") is insufficient. Moreover, the detection
+of stack reuse from af86ca4e3088 where previously data (STACK_MISC) has been
+written to a given stack slot where a pointer value is now to be stored does
+not have sufficient coverage as precondition for the mitigation either; for
+several reasons outlined as follows:
+
+ 1) Stack content from prior program runs could still be preserved and is
+    therefore not "random"; the best example is to split a speculative store
+    bypass attack between tail calls: program A would prepare and store the
+    oob address at a given stack slot and then tail call into program B, which
+    does the "slow" store of a pointer to the stack with a subsequent "fast"
+    read. From program B's PoV such a stack slot type is STACK_INVALID, and
+    therefore also must be subject to mitigation.
+
+ 2) The STACK_SPILL must not be coupled to register_is_const(&stack->spilled_ptr)
+    condition, for example, the previous content of that memory location could
+    also be a pointer to map or map value. Without the fix, a speculative
+    store bypass is not mitigated in such precondition and can then lead to
+    a type confusion in the speculative domain leaking kernel memory near
+    these pointer types.
+
+While brainstorming on various alternative mitigation possibilities, we also
+stumbled upon a retrospective from Chrome developers [0]:
+
+  [...] For variant 4, we implemented a mitigation to zero the unused memory
+  of the heap prior to allocation, which cost about 1% when done concurrently
+  and 4% for scavenging. Variant 4 defeats everything we could think of. We
+  explored more mitigations for variant 4 but the threat proved to be more
+  pervasive and dangerous than we anticipated. For example, stack slots used
+  by the register allocator in the optimizing compiler could be subject to
+  type confusion, leading to pointer crafting. Mitigating type confusion for
+  stack slots alone would have required a complete redesign of the backend of
+  the optimizing compiler, perhaps man years of work, without a guarantee of
+  completeness. [...]
+
+From the BPF side, the problem space is reduced; however, options are rather
+limited. One idea that has been explored was to xor-obfuscate pointer spills
+to the BPF stack:
+
+  [...]
+  // preoccupy the CPU store port by running sequence of dummy stores.
+  [...]
+  2106: (63) *(u32 *)(r7 +29796) = r0
+  2107: (63) *(u32 *)(r7 +29800) = r0
+  2108: (63) *(u32 *)(r7 +29804) = r0
+  2109: (63) *(u32 *)(r7 +29808) = r0
+  2110: (63) *(u32 *)(r7 +29812) = r0
+  // overwrite scalar with dummy pointer; xored with random 'secret' value
+  // of 943576462 before store ...
+  2111: (b4) w11 = 943576462
+  2112: (af) r11 ^= r7
+  2113: (7b) *(u64 *)(r10 -16) = r11
+  2114: (79) r11 = *(u64 *)(r10 -16)
+  2115: (b4) w2 = 943576462
+  2116: (af) r2 ^= r11
+  // ... and restored with the same 'secret' value with the help of AX reg.
+  2117: (71) r3 = *(u8 *)(r2 +0)
+  [...]
+
+While the above would not prevent speculation, it would make data leakage
+infeasible by directing it to random locations. In order to be effective
+and prevent type confusion under speculation, such random secret would have
+to be regenerated for each store. The additional complexity involved for a
+tracking mechanism that prevents jumps such that restoring spilled pointers
+would not get corrupted is not worth the gain for unprivileged. Hence, the
+fix in here eventually opted for emitting a non-public BPF_ST | BPF_NOSPEC
+instruction which the x86 JIT translates into a lfence opcode. Inserting the
+latter in between the store and load instruction is one of the mitigations
+options [1]. The x86 instruction manual notes:
+
+  [...] An LFENCE that follows an instruction that stores to memory might
+  complete before the data being stored have become globally visible. [...]
+
+The latter means that the preceding store instruction finished execution
+and the store is at minimum guaranteed to be in the CPU's store queue, but
+it's not guaranteed to be in that CPU's L1 cache at that point (globally
+visible). The latter would only be guaranteed via sfence. So the load which
+is guaranteed to execute after the lfence for that local CPU would have to
+rely on store-to-load forwarding. [2], in section 2.3 on store buffers says:
+
+  [...] For every store operation that is added to the ROB, an entry is
+  allocated in the store buffer. This entry requires both the virtual and
+  physical address of the target. Only if there is no free entry in the store
+  buffer, the frontend stalls until there is an empty slot available in the
+  store buffer again. Otherwise, the CPU can immediately continue adding
+  subsequent instructions to the ROB and execute them out of order. On Intel
+  CPUs, the store buffer has up to 56 entries. [...]
+
+One small upside of the fix is that it lifts constraints from af86ca4e3088
+where the sanitize_stack_off relative to r10 must be the same when coming
+from different paths. The BPF_ST | BPF_NOSPEC gets emitted after a BPF_STX
+or BPF_ST instruction. This happens either when we store a pointer or data
+value to the BPF stack for the first time, or upon later pointer spills.
+The former needs to be enforced since otherwise stale stack data could be
+leaked under speculation as outlined earlier. For non-x86 JITs the BPF_ST |
+BPF_NOSPEC mapping is currently optimized away, but others could emit a
+speculation barrier as well if necessary. For real-world unprivileged
+programs e.g. generated by LLVM, pointer spill/fill is only generated upon
+register pressure and LLVM only tries to do that for pointers which are not
+used often. The program's main impact will be the initial BPF_ST | BPF_NOSPEC
+sanitation for the STACK_INVALID case when the first write to a stack slot
+occurs e.g. upon map lookup. In future we might refine ways to mitigate
+the latter cost.
+
+  [0] https://arxiv.org/pdf/1902.05178.pdf
+  [1] https://msrc-blog.microsoft.com/2018/05/21/analysis-and-mitigation-of-speculative-store-bypass-cve-2018-3639/
+  [2] https://arxiv.org/pdf/1905.05725.pdf
+
+Fixes: af86ca4e3088 ("bpf: Prevent memory disambiguation attack")
+Fixes: f7cf25b2026d ("bpf: track spill/fill of constants")
+Co-developed-by: Piotr Krysiuk <piotras@gmail.com>
+Co-developed-by: Benedict Schlueter <benedict.schlueter@rub.de>
+Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
+Signed-off-by: Piotr Krysiuk <piotras@gmail.com>
+Signed-off-by: Benedict Schlueter <benedict.schlueter@rub.de>
+Acked-by: Alexei Starovoitov <ast@kernel.org>
+[OP: - apply check_stack_write_fixed_off() changes in check_stack_write()
+     - replace env->bypass_spec_v4 -> env->allow_ptr_leaks]
+Signed-off-by: Ovidiu Panait <ovidiu.panait@windriver.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/bpf_verifier.h |    2 
+ kernel/bpf/verifier.c        |   88 ++++++++++++++++---------------------------
+ 2 files changed, 34 insertions(+), 56 deletions(-)
+
+--- a/include/linux/bpf_verifier.h
++++ b/include/linux/bpf_verifier.h
+@@ -158,8 +158,8 @@ struct bpf_insn_aux_data {
+               u32 alu_limit;                  /* limit for add/sub register with pointer */
+       };
+       int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
+-      int sanitize_stack_off; /* stack slot to be cleared */
+       bool seen; /* this insn was processed by the verifier */
++      bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */
+       u8 alu_state; /* used in combination with alu_limit */
+ };
+--- a/kernel/bpf/verifier.c
++++ b/kernel/bpf/verifier.c
+@@ -1008,6 +1008,19 @@ static int check_stack_write(struct bpf_
+       cur = env->cur_state->frame[env->cur_state->curframe];
+       if (value_regno >= 0)
+               reg = &cur->regs[value_regno];
++      if (!env->allow_ptr_leaks) {
++              bool sanitize = reg && is_spillable_regtype(reg->type);
++
++              for (i = 0; i < size; i++) {
++                      if (state->stack[spi].slot_type[i] == STACK_INVALID) {
++                              sanitize = true;
++                              break;
++                      }
++              }
++
++              if (sanitize)
++                      env->insn_aux_data[insn_idx].sanitize_stack_spill = true;
++      }
+       if (reg && size == BPF_REG_SIZE && register_is_const(reg) &&
+           !register_is_null(reg) && env->allow_ptr_leaks) {
+@@ -1018,47 +1031,10 @@ static int check_stack_write(struct bpf_
+                       verbose(env, "invalid size of register spill\n");
+                       return -EACCES;
+               }
+-
+               if (state != cur && reg->type == PTR_TO_STACK) {
+                       verbose(env, "cannot spill pointers to stack into stack frame of the caller\n");
+                       return -EINVAL;
+               }
+-
+-              if (!env->allow_ptr_leaks) {
+-                      bool sanitize = false;
+-
+-                      if (state->stack[spi].slot_type[0] == STACK_SPILL &&
+-                          register_is_const(&state->stack[spi].spilled_ptr))
+-                              sanitize = true;
+-                      for (i = 0; i < BPF_REG_SIZE; i++)
+-                              if (state->stack[spi].slot_type[i] == STACK_MISC) {
+-                                      sanitize = true;
+-                                      break;
+-                              }
+-                      if (sanitize) {
+-                              int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off;
+-                              int soff = (-spi - 1) * BPF_REG_SIZE;
+-
+-                              /* detected reuse of integer stack slot with a pointer
+-                               * which means either llvm is reusing stack slot or
+-                               * an attacker is trying to exploit CVE-2018-3639
+-                               * (speculative store bypass)
+-                               * Have to sanitize that slot with preemptive
+-                               * store of zero.
+-                               */
+-                              if (*poff && *poff != soff) {
+-                                      /* disallow programs where single insn stores
+-                                       * into two different stack slots, since verifier
+-                                       * cannot sanitize them
+-                                       */
+-                                      verbose(env,
+-                                              "insn %d cannot access two stack slots fp%d and fp%d",
+-                                              insn_idx, *poff, soff);
+-                                      return -EINVAL;
+-                              }
+-                              *poff = soff;
+-                      }
+-              }
+               save_register_state(state, spi, reg);
+       } else {
+               u8 type = STACK_MISC;
+@@ -5867,34 +5843,33 @@ static int convert_ctx_accesses(struct b
+       insn = env->prog->insnsi + delta;
+       for (i = 0; i < insn_cnt; i++, insn++) {
++              bool ctx_access;
++
+               if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) ||
+                   insn->code == (BPF_LDX | BPF_MEM | BPF_H) ||
+                   insn->code == (BPF_LDX | BPF_MEM | BPF_W) ||
+-                  insn->code == (BPF_LDX | BPF_MEM | BPF_DW))
++                  insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) {
+                       type = BPF_READ;
+-              else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) ||
+-                       insn->code == (BPF_STX | BPF_MEM | BPF_H) ||
+-                       insn->code == (BPF_STX | BPF_MEM | BPF_W) ||
+-                       insn->code == (BPF_STX | BPF_MEM | BPF_DW))
++                      ctx_access = true;
++              } else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) ||
++                         insn->code == (BPF_STX | BPF_MEM | BPF_H) ||
++                         insn->code == (BPF_STX | BPF_MEM | BPF_W) ||
++                         insn->code == (BPF_STX | BPF_MEM | BPF_DW) ||
++                         insn->code == (BPF_ST | BPF_MEM | BPF_B) ||
++                         insn->code == (BPF_ST | BPF_MEM | BPF_H) ||
++                         insn->code == (BPF_ST | BPF_MEM | BPF_W) ||
++                         insn->code == (BPF_ST | BPF_MEM | BPF_DW)) {
+                       type = BPF_WRITE;
+-              else
++                      ctx_access = BPF_CLASS(insn->code) == BPF_STX;
++              } else {
+                       continue;
++              }
+               if (type == BPF_WRITE &&
+-                  env->insn_aux_data[i + delta].sanitize_stack_off) {
++                  env->insn_aux_data[i + delta].sanitize_stack_spill) {
+                       struct bpf_insn patch[] = {
+-                              /* Sanitize suspicious stack slot with zero.
+-                               * There are no memory dependencies for this store,
+-                               * since it's only using frame pointer and immediate
+-                               * constant of zero
+-                               */
+-                              BPF_ST_MEM(BPF_DW, BPF_REG_FP,
+-                                         env->insn_aux_data[i + delta].sanitize_stack_off,
+-                                         0),
+-                              /* the original STX instruction will immediately
+-                               * overwrite the same stack slot with appropriate value
+-                               */
+                               *insn,
++                              BPF_ST_NOSPEC(),
+                       };
+                       cnt = ARRAY_SIZE(patch);
+@@ -5908,6 +5883,9 @@ static int convert_ctx_accesses(struct b
+                       continue;
+               }
++              if (!ctx_access)
++                      continue;
++
+               if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX)
+                       continue;
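
The ordering that the inserted barrier enforces can also be sketched outside of BPF. The snippet below is only an illustration assuming an x86 target and hypothetical function names; it is not part of the queued patch. The store that spills the pointer is followed by an lfence, so a later load from the same slot can no longer execute speculatively ahead of it.

  /* conceptual shape of the mitigated spill/fill sequence */
  static inline void nospec_barrier(void)
  {
  #if defined(__x86_64__) || defined(__i386__)
          __asm__ __volatile__("lfence" ::: "memory");  /* what BPF_ST | BPF_NOSPEC becomes */
  #endif
  }

  void spill_then_fill(void **slot, void *ptr, void **out)
  {
          *slot = ptr;        /* "slow" store of the pointer (fp-16 above)     */
          nospec_barrier();   /* emitted right after the store by this fix     */
          *out = *slot;       /* the "fast" load can no longer run ahead of it */
  }
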
diff --git a/queue-4.19/bpf-fix-pointer-arithmetic-mask-tightening-under-state-pruning.patch b/queue-4.19/bpf-fix-pointer-arithmetic-mask-tightening-under-state-pruning.patch
new file mode 100644 (file)
index 0000000..25ca830
--- /dev/null
@@ -0,0 +1,126 @@
+From foo@baz Wed Sep 15 02:37:47 PM CEST 2021
+From: Ovidiu Panait <ovidiu.panait@windriver.com>
+Date: Mon, 13 Sep 2021 18:35:37 +0300
+Subject: bpf: Fix pointer arithmetic mask tightening under state pruning
+To: stable@vger.kernel.org
+Cc: bpf@vger.kernel.org, daniel@iogearbox.net
+Message-ID: <20210913153537.2162465-14-ovidiu.panait@windriver.com>
+
+From: Daniel Borkmann <daniel@iogearbox.net>
+
+commit e042aa532c84d18ff13291d00620502ce7a38dda upstream.
+
+In 7fedb63a8307 ("bpf: Tighten speculative pointer arithmetic mask") we
+narrowed the offset mask for unprivileged pointer arithmetic in order to
+mitigate a corner case where in the speculative domain it is possible to
+advance, for example, the map value pointer by up to value_size-1 out-of-
+bounds in order to leak kernel memory via side-channel to user space.
+
+The verifier's state pruning for scalars leaves one corner case open
+where in the first verification path R_x holds an unknown scalar with an
+aux->alu_limit of e.g. 7, and in a second verification path that same
+register R_x, here denoted as R_x', holds an unknown scalar which has
+tighter bounds and would thus satisfy range_within(R_x, R_x') as well as
+tnum_in(R_x, R_x') for state pruning, yielding an aux->alu_limit of 3:
+Given the second path fits the register constraints for pruning, the final
+generated mask from aux->alu_limit will remain at 7. While technically
+not wrong for the non-speculative domain, it would however be possible
+to craft similar cases where the mask would be too wide as in 7fedb63a8307.
+
+One way to fix it is to detect the presence of unknown scalar map pointer
+arithmetic and force a deeper search on unknown scalars to ensure that
+we do not run into a masking mismatch.
+
+Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
+Acked-by: Alexei Starovoitov <ast@kernel.org>
+[OP: adjusted context for 4.19]
+Signed-off-by: Ovidiu Panait <ovidiu.panait@windriver.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/bpf_verifier.h |    1 +
+ kernel/bpf/verifier.c        |   27 +++++++++++++++++----------
+ 2 files changed, 18 insertions(+), 10 deletions(-)
+
+--- a/include/linux/bpf_verifier.h
++++ b/include/linux/bpf_verifier.h
+@@ -215,6 +215,7 @@ struct bpf_verifier_env {
+       struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
+       u32 used_map_cnt;               /* number of used maps */
+       u32 id_gen;                     /* used to generate unique reg IDs */
++      bool explore_alu_limits;
+       bool allow_ptr_leaks;
+       bool seen_direct_write;
+       struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */
+--- a/kernel/bpf/verifier.c
++++ b/kernel/bpf/verifier.c
+@@ -2871,6 +2871,12 @@ static int sanitize_ptr_alu(struct bpf_v
+               alu_state |= off_is_imm ? BPF_ALU_IMMEDIATE : 0;
+               alu_state |= ptr_is_dst_reg ?
+                            BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST;
++
++              /* Limit pruning on unknown scalars to enable deep search for
++               * potential masking differences from other program paths.
++               */
++              if (!off_is_imm)
++                      env->explore_alu_limits = true;
+       }
+       err = update_alu_sanitation_state(aux, alu_state, alu_limit);
+@@ -4813,8 +4819,8 @@ static bool check_ids(u32 old_id, u32 cu
+ }
+ /* Returns true if (rold safe implies rcur safe) */
+-static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
+-                  struct bpf_id_pair *idmap)
++static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
++                  struct bpf_reg_state *rcur, struct bpf_id_pair *idmap)
+ {
+       bool equal;
+@@ -4840,6 +4846,8 @@ static bool regsafe(struct bpf_reg_state
+               return false;
+       switch (rold->type) {
+       case SCALAR_VALUE:
++              if (env->explore_alu_limits)
++                      return false;
+               if (rcur->type == SCALAR_VALUE) {
+                       /* new val must satisfy old val knowledge */
+                       return range_within(rold, rcur) &&
+@@ -4916,9 +4924,8 @@ static bool regsafe(struct bpf_reg_state
+       return false;
+ }
+-static bool stacksafe(struct bpf_func_state *old,
+-                    struct bpf_func_state *cur,
+-                    struct bpf_id_pair *idmap)
++static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
++                    struct bpf_func_state *cur, struct bpf_id_pair *idmap)
+ {
+       int i, spi;
+@@ -4960,9 +4967,8 @@ static bool stacksafe(struct bpf_func_st
+                       continue;
+               if (old->stack[spi].slot_type[0] != STACK_SPILL)
+                       continue;
+-              if (!regsafe(&old->stack[spi].spilled_ptr,
+-                           &cur->stack[spi].spilled_ptr,
+-                           idmap))
++              if (!regsafe(env, &old->stack[spi].spilled_ptr,
++                           &cur->stack[spi].spilled_ptr, idmap))
+                       /* when explored and current stack slot are both storing
+                        * spilled registers, check that stored pointers types
+                        * are the same as well.
+@@ -5011,10 +5017,11 @@ static bool func_states_equal(struct bpf
+       memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch));
+       for (i = 0; i < MAX_BPF_REG; i++)
+-              if (!regsafe(&old->regs[i], &cur->regs[i], env->idmap_scratch))
++              if (!regsafe(env, &old->regs[i], &cur->regs[i],
++                           env->idmap_scratch))
+                       return false;
+-      if (!stacksafe(old, cur, env->idmap_scratch))
++      if (!stacksafe(env, old, cur, env->idmap_scratch))
+               return false;
+       return true;
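
To see why a stale, wider alu_limit matters, the following is a deliberately simplified conceptual model, not the real ALU sanitation code and not part of the queued patch: offsets above the limit collapse to zero while offsets at or below it pass through, so a pruned path that actually needed a limit of 3 but inherited the mask built for a limit of 7 still lets speculative offsets 4..7 through.

  #include <stdio.h>

  /* simplified model of the runtime mask derived from aux->alu_limit */
  static unsigned long masked_off(unsigned long off, unsigned long alu_limit)
  {
          return off > alu_limit ? 0 : off;
  }

  int main(void)
  {
          /* the first verified path allowed alu_limit 7; the pruned path
           * would only have allowed 3, yet the emitted mask stays at 7 */
          printf("off=5 with limit 7 -> %lu (a limit of 3 would give 0)\n",
                 masked_off(5, 7));
          return 0;
  }
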
diff --git a/queue-4.19/bpf-introduce-bpf-nospec-instruction-for-mitigating-spectre-v4.patch b/queue-4.19/bpf-introduce-bpf-nospec-instruction-for-mitigating-spectre-v4.patch
new file mode 100644 (file)
index 0000000..d68d950
--- /dev/null
@@ -0,0 +1,269 @@
+From foo@baz Wed Sep 15 02:37:47 PM CEST 2021
+From: Ovidiu Panait <ovidiu.panait@windriver.com>
+Date: Mon, 13 Sep 2021 18:35:34 +0300
+Subject: bpf: Introduce BPF nospec instruction for mitigating Spectre v4
+To: stable@vger.kernel.org
+Cc: bpf@vger.kernel.org, daniel@iogearbox.net
+Message-ID: <20210913153537.2162465-11-ovidiu.panait@windriver.com>
+
+From: Daniel Borkmann <daniel@iogearbox.net>
+
+commit f5e81d1117501546b7be050c5fbafa6efd2c722c upstream.
+
+In case of JITs, each of the JIT backends compiles the BPF nospec instruction
+/either/ to a machine instruction which emits a speculation barrier /or/ to
+/no/ machine instruction in case the underlying architecture is not affected
+by Speculative Store Bypass or has different mitigations in place already.
+
+This covers both x86 and (implicitly) arm64: In case of x86, we use 'lfence'
+instruction for mitigation. In case of arm64, we rely on the firmware mitigation
+as controlled via the ssbd kernel parameter. Whenever the mitigation is enabled,
+it works for all of the kernel code with no need to provide any additional
+instructions here (hence only comment in arm64 JIT). Other archs can follow
+as needed. The BPF nospec instruction is specifically targeting Spectre v4
+since i) we don't use a serialization barrier for the Spectre v1 case, and
+ii) mitigation instructions for v1 and v4 might be different on some archs.
+
+The BPF nospec is required for a future commit, where the BPF verifier does
+annotate intermediate BPF programs with speculation barriers.
+
+Co-developed-by: Piotr Krysiuk <piotras@gmail.com>
+Co-developed-by: Benedict Schlueter <benedict.schlueter@rub.de>
+Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
+Signed-off-by: Piotr Krysiuk <piotras@gmail.com>
+Signed-off-by: Benedict Schlueter <benedict.schlueter@rub.de>
+Acked-by: Alexei Starovoitov <ast@kernel.org>
+[OP: adjusted context for 4.19, drop riscv and ppc32 changes]
+Signed-off-by: Ovidiu Panait <ovidiu.panait@windriver.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm/net/bpf_jit_32.c         |    3 +++
+ arch/arm64/net/bpf_jit_comp.c     |   13 +++++++++++++
+ arch/mips/net/ebpf_jit.c          |    3 +++
+ arch/powerpc/net/bpf_jit_comp64.c |    6 ++++++
+ arch/s390/net/bpf_jit_comp.c      |    5 +++++
+ arch/sparc/net/bpf_jit_comp_64.c  |    3 +++
+ arch/x86/net/bpf_jit_comp.c       |    7 +++++++
+ arch/x86/net/bpf_jit_comp32.c     |    6 ++++++
+ include/linux/filter.h            |   15 +++++++++++++++
+ kernel/bpf/core.c                 |   18 +++++++++++++++++-
+ kernel/bpf/disasm.c               |   16 +++++++++-------
+ 11 files changed, 87 insertions(+), 8 deletions(-)
+
+--- a/arch/arm/net/bpf_jit_32.c
++++ b/arch/arm/net/bpf_jit_32.c
+@@ -1578,6 +1578,9 @@ exit:
+               rn = arm_bpf_get_reg32(src_lo, tmp2[1], ctx);
+               emit_ldx_r(dst, rn, off, ctx, BPF_SIZE(code));
+               break;
++      /* speculation barrier */
++      case BPF_ST | BPF_NOSPEC:
++              break;
+       /* ST: *(size *)(dst + off) = imm */
+       case BPF_ST | BPF_MEM | BPF_W:
+       case BPF_ST | BPF_MEM | BPF_H:
+--- a/arch/arm64/net/bpf_jit_comp.c
++++ b/arch/arm64/net/bpf_jit_comp.c
+@@ -685,6 +685,19 @@ emit_cond_jmp:
+               }
+               break;
++      /* speculation barrier */
++      case BPF_ST | BPF_NOSPEC:
++              /*
++               * Nothing required here.
++               *
++               * In case of arm64, we rely on the firmware mitigation of
++               * Speculative Store Bypass as controlled via the ssbd kernel
++               * parameter. Whenever the mitigation is enabled, it works
++               * for all of the kernel code with no need to provide any
++               * additional instructions.
++               */
++              break;
++
+       /* ST: *(size *)(dst + off) = imm */
+       case BPF_ST | BPF_MEM | BPF_W:
+       case BPF_ST | BPF_MEM | BPF_H:
+--- a/arch/mips/net/ebpf_jit.c
++++ b/arch/mips/net/ebpf_jit.c
+@@ -1282,6 +1282,9 @@ jeq_common:
+               }
+               break;
++      case BPF_ST | BPF_NOSPEC: /* speculation barrier */
++              break;
++
+       case BPF_ST | BPF_B | BPF_MEM:
+       case BPF_ST | BPF_H | BPF_MEM:
+       case BPF_ST | BPF_W | BPF_MEM:
+--- a/arch/powerpc/net/bpf_jit_comp64.c
++++ b/arch/powerpc/net/bpf_jit_comp64.c
+@@ -597,6 +597,12 @@ emit_clear:
+                       break;
+               /*
++               * BPF_ST NOSPEC (speculation barrier)
++               */
++              case BPF_ST | BPF_NOSPEC:
++                      break;
++
++              /*
+                * BPF_ST(X)
+                */
+               case BPF_STX | BPF_MEM | BPF_B: /* *(u8 *)(dst + off) = src */
+--- a/arch/s390/net/bpf_jit_comp.c
++++ b/arch/s390/net/bpf_jit_comp.c
+@@ -884,6 +884,11 @@ static noinline int bpf_jit_insn(struct
+               }
+               break;
+       /*
++       * BPF_NOSPEC (speculation barrier)
++       */
++      case BPF_ST | BPF_NOSPEC:
++              break;
++      /*
+        * BPF_ST(X)
+        */
+       case BPF_STX | BPF_MEM | BPF_B: /* *(u8 *)(dst + off) = src_reg */
+--- a/arch/sparc/net/bpf_jit_comp_64.c
++++ b/arch/sparc/net/bpf_jit_comp_64.c
+@@ -1261,6 +1261,9 @@ static int build_insn(const struct bpf_i
+               emit(opcode | RS1(src) | rs2 | RD(dst), ctx);
+               break;
+       }
++      /* speculation barrier */
++      case BPF_ST | BPF_NOSPEC:
++              break;
+       /* ST: *(size *)(dst + off) = imm */
+       case BPF_ST | BPF_MEM | BPF_W:
+       case BPF_ST | BPF_MEM | BPF_H:
+--- a/arch/x86/net/bpf_jit_comp.c
++++ b/arch/x86/net/bpf_jit_comp.c
+@@ -731,6 +731,13 @@ static int do_jit(struct bpf_prog *bpf_p
+                       }
+                       break;
++                      /* speculation barrier */
++              case BPF_ST | BPF_NOSPEC:
++                      if (boot_cpu_has(X86_FEATURE_XMM2))
++                              /* Emit 'lfence' */
++                              EMIT3(0x0F, 0xAE, 0xE8);
++                      break;
++
+                       /* ST: *(u8*)(dst_reg + off) = imm */
+               case BPF_ST | BPF_MEM | BPF_B:
+                       if (is_ereg(dst_reg))
+--- a/arch/x86/net/bpf_jit_comp32.c
++++ b/arch/x86/net/bpf_jit_comp32.c
+@@ -1683,6 +1683,12 @@ static int do_jit(struct bpf_prog *bpf_p
+                       i++;
+                       break;
+               }
++              /* speculation barrier */
++              case BPF_ST | BPF_NOSPEC:
++                      if (boot_cpu_has(X86_FEATURE_XMM2))
++                              /* Emit 'lfence' */
++                              EMIT3(0x0F, 0xAE, 0xE8);
++                      break;
+               /* ST: *(u8*)(dst_reg + off) = imm */
+               case BPF_ST | BPF_MEM | BPF_H:
+               case BPF_ST | BPF_MEM | BPF_B:
+--- a/include/linux/filter.h
++++ b/include/linux/filter.h
+@@ -64,6 +64,11 @@ struct sock_reuseport;
+ /* unused opcode to mark call to interpreter with arguments */
+ #define BPF_CALL_ARGS 0xe0
++/* unused opcode to mark speculation barrier for mitigating
++ * Speculative Store Bypass
++ */
++#define BPF_NOSPEC    0xc0
++
+ /* As per nm, we expose JITed images as text (code) section for
+  * kallsyms. That way, tools like perf can find it to match
+  * addresses.
+@@ -352,6 +357,16 @@ struct sock_reuseport;
+               .dst_reg = 0,                                   \
+               .src_reg = 0,                                   \
+               .off   = 0,                                     \
++              .imm   = 0 })
++
++/* Speculation barrier */
++
++#define BPF_ST_NOSPEC()                                               \
++      ((struct bpf_insn) {                                    \
++              .code  = BPF_ST | BPF_NOSPEC,                   \
++              .dst_reg = 0,                                   \
++              .src_reg = 0,                                   \
++              .off   = 0,                                     \
+               .imm   = 0 })
+ /* Internal classic blocks for direct assignment */
+--- a/kernel/bpf/core.c
++++ b/kernel/bpf/core.c
+@@ -33,6 +33,7 @@
+ #include <linux/rcupdate.h>
+ #include <linux/perf_event.h>
++#include <asm/barrier.h>
+ #include <asm/unaligned.h>
+ /* Registers */
+@@ -1050,6 +1051,7 @@ static u64 ___bpf_prog_run(u64 *regs, co
+               /* Non-UAPI available opcodes. */
+               [BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS,
+               [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL,
++              [BPF_ST  | BPF_NOSPEC] = &&ST_NOSPEC,
+       };
+ #undef BPF_INSN_3_LBL
+ #undef BPF_INSN_2_LBL
+@@ -1356,7 +1358,21 @@ out:
+       JMP_EXIT:
+               return BPF_R0;
+-      /* STX and ST and LDX*/
++      /* ST, STX and LDX*/
++      ST_NOSPEC:
++              /* Speculation barrier for mitigating Speculative Store Bypass.
++               * In case of arm64, we rely on the firmware mitigation as
++               * controlled via the ssbd kernel parameter. Whenever the
++               * mitigation is enabled, it works for all of the kernel code
++               * with no need to provide any additional instructions here.
++               * In case of x86, we use 'lfence' insn for mitigation. We
++               * reuse preexisting logic from Spectre v1 mitigation that
++               * happens to produce the required code on x86 for v4 as well.
++               */
++#ifdef CONFIG_X86
++              barrier_nospec();
++#endif
++              CONT;
+ #define LDST(SIZEOP, SIZE)                                            \
+       STX_MEM_##SIZEOP:                                               \
+               *(SIZE *)(unsigned long) (DST + insn->off) = SRC;       \
+--- a/kernel/bpf/disasm.c
++++ b/kernel/bpf/disasm.c
+@@ -171,15 +171,17 @@ void print_bpf_insn(const struct bpf_ins
+               else
+                       verbose(cbs->private_data, "BUG_%02x\n", insn->code);
+       } else if (class == BPF_ST) {
+-              if (BPF_MODE(insn->code) != BPF_MEM) {
++              if (BPF_MODE(insn->code) == BPF_MEM) {
++                      verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n",
++                              insn->code,
++                              bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
++                              insn->dst_reg,
++                              insn->off, insn->imm);
++              } else if (BPF_MODE(insn->code) == 0xc0 /* BPF_NOSPEC, no UAPI */) {
++                      verbose(cbs->private_data, "(%02x) nospec\n", insn->code);
++              } else {
+                       verbose(cbs->private_data, "BUG_st_%02x\n", insn->code);
+-                      return;
+               }
+-              verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n",
+-                      insn->code,
+-                      bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+-                      insn->dst_reg,
+-                      insn->off, insn->imm);
+       } else if (class == BPF_LDX) {
+               if (BPF_MODE(insn->code) != BPF_MEM) {
+                       verbose(cbs->private_data, "BUG_ldx_%02x\n", insn->code);
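For illustration only (not part of the patch above): a minimal sketch of how an
instruction-rewriting pass could place the new barrier directly in front of a
store it wants to protect. BPF_ST_NOSPEC() is the macro added above, while
BPF_STX_MEM() and the register constants are existing insn helpers from
filter.h; the two-instruction array itself is purely hypothetical.

        struct bpf_insn patched[] = {
                /* speculation barrier; lowered to barrier_nospec()
                 * (lfence) on x86 in the interpreter, see above
                 */
                BPF_ST_NOSPEC(),
                /* the (hypothetical) store being protected */
                BPF_STX_MEM(BPF_DW, BPF_REG_FP, BPF_REG_1, -8),
        };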
diff --git a/queue-4.19/bpf-reject-indirect-var_off-stack-access-in-raw-mode.patch b/queue-4.19/bpf-reject-indirect-var_off-stack-access-in-raw-mode.patch
new file mode 100644 (file)
index 0000000..46288a2
--- /dev/null
@@ -0,0 +1,51 @@
+From foo@baz Wed Sep 15 02:37:47 PM CEST 2021
+From: Ovidiu Panait <ovidiu.panait@windriver.com>
+Date: Mon, 13 Sep 2021 18:35:28 +0300
+Subject: bpf: Reject indirect var_off stack access in raw mode
+To: stable@vger.kernel.org
+Cc: bpf@vger.kernel.org, daniel@iogearbox.net
+Message-ID: <20210913153537.2162465-5-ovidiu.panait@windriver.com>
+
+From: Andrey Ignatov <rdna@fb.com>
+
+commit f2bcd05ec7b839ff826d2008506ad2d2dff46a59 upstream.
+
+It's hard to guarantee that the whole memory is marked as initialized on
+helper return if uninitialized stack is accessed with a variable offset,
+since the specific bounds are unknown to the verifier. This may cause
+uninitialized stack data to leak.
+
+Reject such an access in check_stack_boundary to prevent possible
+leaking.
+
+There are no known use-cases for indirect uninitialized stack access
+with a variable offset, so it shouldn't break anything.
+
+Fixes: 2011fccfb61b ("bpf: Support variable offset stack access from helpers")
+Reported-by: Daniel Borkmann <daniel@iogearbox.net>
+Signed-off-by: Andrey Ignatov <rdna@fb.com>
+Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
+Signed-off-by: Ovidiu Panait <ovidiu.panait@windriver.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/bpf/verifier.c |    9 +++++++++
+ 1 file changed, 9 insertions(+)
+
+--- a/kernel/bpf/verifier.c
++++ b/kernel/bpf/verifier.c
+@@ -1811,6 +1811,15 @@ static int check_stack_boundary(struct b
+               if (err)
+                       return err;
+       } else {
++              /* Only initialized buffer on stack is allowed to be accessed
++               * with variable offset. With uninitialized buffer it's hard to
++               * guarantee that whole memory is marked as initialized on
++               * helper return since specific bounds are unknown what may
++               * cause uninitialized stack leaking.
++               */
++              if (meta && meta->raw_mode)
++                      meta = NULL;
++
+               min_off = reg->smin_value + reg->off;
+               max_off = reg->umax_value + reg->off;
+               err = __check_stack_boundary(env, regno, min_off, access_size,
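For illustration only (not part of the patch): the shape of program this check
rejects. The helper's destination argument is in raw mode
(ARG_PTR_TO_UNINIT_MEM), so with a variable offset the verifier cannot tell
which bytes the helper initialized. All names and values below are
hypothetical.

        char buf[16];                      /* uninitialized stack buffer */
        int off = some_ctx_value & 8;      /* bounded but unknown offset */

        /* dst is a raw-mode argument; combined with var_off this is now
         * rejected with -EACCES instead of being accepted
         */
        bpf_probe_read(buf + off, 8, src);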
diff --git a/queue-4.19/bpf-reject-indirect-var_off-stack-access-in-unpriv-mode.patch b/queue-4.19/bpf-reject-indirect-var_off-stack-access-in-unpriv-mode.patch
new file mode 100644 (file)
index 0000000..00f47e6
--- /dev/null
@@ -0,0 +1,68 @@
+From foo@baz Wed Sep 15 02:37:47 PM CEST 2021
+From: Ovidiu Panait <ovidiu.panait@windriver.com>
+Date: Mon, 13 Sep 2021 18:35:29 +0300
+Subject: bpf: Reject indirect var_off stack access in unpriv mode
+To: stable@vger.kernel.org
+Cc: bpf@vger.kernel.org, daniel@iogearbox.net
+Message-ID: <20210913153537.2162465-6-ovidiu.panait@windriver.com>
+
+From: Andrey Ignatov <rdna@fb.com>
+
+commit 088ec26d9c2da9d879ab73e3f4117f9df6c566ee upstream.
+
+Proper support of indirect stack access with variable offset in
+unprivileged mode (!root) requires corresponding support in Spectre
+masking for stack ALU in retrieve_ptr_limit().
+
+There are no use-cases for variable offset in unprivileged mode, though,
+so make the verifier reject such accesses for simplicity.
+
+Pointer arithmetic is one (and possibly the only) way to produce a
+variable offset, and it's already rejected in unpriv mode, so the
+verifier won't even get to a helper function whose argument contains a
+variable offset, e.g.:
+
+  0: (7a) *(u64 *)(r10 -16) = 0
+  1: (7a) *(u64 *)(r10 -8) = 0
+  2: (61) r2 = *(u32 *)(r1 +0)
+  3: (57) r2 &= 4
+  4: (17) r2 -= 16
+  5: (0f) r2 += r10
+  variable stack access var_off=(0xfffffffffffffff0; 0x4) off=-16 size=1R2
+  stack pointer arithmetic goes out of range, prohibited for !root
+
+Still, it looks like a good idea to explicitly reject variable offset
+indirect stack access for unprivileged mode in check_stack_boundary().
+
+Fixes: 2011fccfb61b ("bpf: Support variable offset stack access from helpers")
+Reported-by: Daniel Borkmann <daniel@iogearbox.net>
+Signed-off-by: Andrey Ignatov <rdna@fb.com>
+Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
+[OP: drop comment in retrieve_ptr_limit()]
+Signed-off-by: Ovidiu Panait <ovidiu.panait@windriver.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/bpf/verifier.c |   13 +++++++++++++
+ 1 file changed, 13 insertions(+)
+
+--- a/kernel/bpf/verifier.c
++++ b/kernel/bpf/verifier.c
+@@ -1811,6 +1811,19 @@ static int check_stack_boundary(struct b
+               if (err)
+                       return err;
+       } else {
++              /* Variable offset is prohibited for unprivileged mode for
++               * simplicity since it requires corresponding support in
++               * Spectre masking for stack ALU.
++               * See also retrieve_ptr_limit().
++               */
++              if (!env->allow_ptr_leaks) {
++                      char tn_buf[48];
++
++                      tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
++                      verbose(env, "R%d indirect variable offset stack access prohibited for !root, var_off=%s\n",
++                              regno, tn_buf);
++                      return -EACCES;
++              }
+               /* Only initialized buffer on stack is allowed to be accessed
+                * with variable offset. With uninitialized buffer it's hard to
+                * guarantee that whole memory is marked as initialized on
diff --git a/queue-4.19/bpf-sanity-check-max-value-for-var_off-stack-access.patch b/queue-4.19/bpf-sanity-check-max-value-for-var_off-stack-access.patch
new file mode 100644 (file)
index 0000000..ce39129
--- /dev/null
@@ -0,0 +1,91 @@
+From foo@baz Wed Sep 15 02:37:47 PM CEST 2021
+From: Ovidiu Panait <ovidiu.panait@windriver.com>
+Date: Mon, 13 Sep 2021 18:35:30 +0300
+Subject: bpf: Sanity check max value for var_off stack access
+To: stable@vger.kernel.org
+Cc: bpf@vger.kernel.org, daniel@iogearbox.net
+Message-ID: <20210913153537.2162465-7-ovidiu.panait@windriver.com>
+
+From: Andrey Ignatov <rdna@fb.com>
+
+commit 107c26a70ca81bfc33657366ad69d02fdc9efc9d upstream.
+
+As discussed in [1], the max value of a variable offset has to be checked
+for overflow on stack access, otherwise the verifier would accept code
+like this:
+
+  0: (b7) r2 = 6
+  1: (b7) r3 = 28
+  2: (7a) *(u64 *)(r10 -16) = 0
+  3: (7a) *(u64 *)(r10 -8) = 0
+  4: (79) r4 = *(u64 *)(r1 +168)
+  5: (c5) if r4 s< 0x0 goto pc+4
+   R1=ctx(id=0,off=0,imm=0) R2=inv6 R3=inv28
+   R4=inv(id=0,umax_value=9223372036854775807,var_off=(0x0;
+   0x7fffffffffffffff)) R10=fp0,call_-1 fp-8=mmmmmmmm fp-16=mmmmmmmm
+  6: (17) r4 -= 16
+  7: (0f) r4 += r10
+  8: (b7) r5 = 8
+  9: (85) call bpf_getsockopt#57
+  10: (b7) r0 = 0
+  11: (95) exit
+
+, where R4 obviously has an unbounded max value.
+
+Fix it by checking that reg->smax_value is inside (-BPF_MAX_VAR_OFF;
+BPF_MAX_VAR_OFF) range.
+
+reg->smax_value is used instead of reg->umax_value because stack
+pointers are calculated using a negative offset from fp. This is the
+opposite of e.g. map access, where the offset must be non-negative and
+where umax_value is used.
+
+Also dedicated verbose logs are added for both min and max bound check
+failures to have diagnostics consistent with variable offset handling in
+check_map_access().
+
+[1] https://marc.info/?l=linux-netdev&m=155433357510597&w=2
+
+Fixes: 2011fccfb61b ("bpf: Support variable offset stack access from helpers")
+Reported-by: Daniel Borkmann <daniel@iogearbox.net>
+Signed-off-by: Andrey Ignatov <rdna@fb.com>
+Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
+Signed-off-by: Ovidiu Panait <ovidiu.panait@windriver.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/bpf/verifier.c |   18 +++++++++++++++---
+ 1 file changed, 15 insertions(+), 3 deletions(-)
+
+--- a/kernel/bpf/verifier.c
++++ b/kernel/bpf/verifier.c
+@@ -1833,16 +1833,28 @@ static int check_stack_boundary(struct b
+               if (meta && meta->raw_mode)
+                       meta = NULL;
++              if (reg->smax_value >= BPF_MAX_VAR_OFF ||
++                  reg->smax_value <= -BPF_MAX_VAR_OFF) {
++                      verbose(env, "R%d unbounded indirect variable offset stack access\n",
++                              regno);
++                      return -EACCES;
++              }
+               min_off = reg->smin_value + reg->off;
+-              max_off = reg->umax_value + reg->off;
++              max_off = reg->smax_value + reg->off;
+               err = __check_stack_boundary(env, regno, min_off, access_size,
+                                            zero_size_allowed);
+-              if (err)
++              if (err) {
++                      verbose(env, "R%d min value is outside of stack bound\n",
++                              regno);
+                       return err;
++              }
+               err = __check_stack_boundary(env, regno, max_off, access_size,
+                                            zero_size_allowed);
+-              if (err)
++              if (err) {
++                      verbose(env, "R%d max value is outside of stack bound\n",
++                              regno);
+                       return err;
++              }
+       }
+       if (meta && meta->raw_mode) {
diff --git a/queue-4.19/bpf-support-variable-offset-stack-access-from-helpers.patch b/queue-4.19/bpf-support-variable-offset-stack-access-from-helpers.patch
new file mode 100644 (file)
index 0000000..8bfd5e6
--- /dev/null
@@ -0,0 +1,175 @@
+From foo@baz Wed Sep 15 02:37:47 PM CEST 2021
+From: Ovidiu Panait <ovidiu.panait@windriver.com>
+Date: Mon, 13 Sep 2021 18:35:27 +0300
+Subject: bpf: Support variable offset stack access from helpers
+To: stable@vger.kernel.org
+Cc: bpf@vger.kernel.org, daniel@iogearbox.net
+Message-ID: <20210913153537.2162465-4-ovidiu.panait@windriver.com>
+
+From: Andrey Ignatov <rdna@fb.com>
+
+commit 2011fccfb61bbd1d7c8864b2b3ed7012342e9ba3 upstream.
+
+Currently there is a difference in how the verifier checks memory access
+for helper arguments for PTR_TO_MAP_VALUE and PTR_TO_STACK with regard
+to the variable part of the offset.
+
+check_map_access, which is used for PTR_TO_MAP_VALUE, can handle variable
+offsets just fine, so that a BPF program can call a helper like this:
+
+  some_helper(map_value_ptr + off, size);
+
+, where the offset is unknown at load time, but is checked by the program
+to be in a safe range (off >= 0 && off + size < map_value_size).
+
+But that's not the case for check_stack_boundary, which is used for
+PTR_TO_STACK, and the same code with a pointer to the stack is rejected
+by the verifier:
+
+  some_helper(stack_value_ptr + off, size);
+
+For example:
+  0: (7a) *(u64 *)(r10 -16) = 0
+  1: (7a) *(u64 *)(r10 -8) = 0
+  2: (61) r2 = *(u32 *)(r1 +0)
+  3: (57) r2 &= 4
+  4: (17) r2 -= 16
+  5: (0f) r2 += r10
+  6: (18) r1 = 0xffff888111343a80
+  8: (85) call bpf_map_lookup_elem#1
+  invalid variable stack read R2 var_off=(0xfffffffffffffff0; 0x4)
+
+Add support for variable offset access to check_stack_boundary so that,
+if the offset is checked by the program to be in a safe range, it is
+accepted by the verifier.
+
+Signed-off-by: Andrey Ignatov <rdna@fb.com>
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+[OP: replace reg_state(env, regno) helper with "cur_regs(env) + regno"]
+Signed-off-by: Ovidiu Panait <ovidiu.panait@windriver.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/bpf/verifier.c |   75 ++++++++++++++++++++++++++++++++++++--------------
+ 1 file changed, 54 insertions(+), 21 deletions(-)
+
+--- a/kernel/bpf/verifier.c
++++ b/kernel/bpf/verifier.c
+@@ -1755,6 +1755,29 @@ static int check_xadd(struct bpf_verifie
+                               BPF_SIZE(insn->code), BPF_WRITE, -1, true);
+ }
++static int __check_stack_boundary(struct bpf_verifier_env *env, u32 regno,
++                                int off, int access_size,
++                                bool zero_size_allowed)
++{
++      struct bpf_reg_state *reg = cur_regs(env) + regno;
++
++      if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
++          access_size < 0 || (access_size == 0 && !zero_size_allowed)) {
++              if (tnum_is_const(reg->var_off)) {
++                      verbose(env, "invalid stack type R%d off=%d access_size=%d\n",
++                              regno, off, access_size);
++              } else {
++                      char tn_buf[48];
++
++                      tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
++                      verbose(env, "invalid stack type R%d var_off=%s access_size=%d\n",
++                              regno, tn_buf, access_size);
++              }
++              return -EACCES;
++      }
++      return 0;
++}
++
+ /* when register 'regno' is passed into function that will read 'access_size'
+  * bytes from that pointer, make sure that it's within stack boundary
+  * and all elements of stack are initialized.
+@@ -1767,7 +1790,7 @@ static int check_stack_boundary(struct b
+ {
+       struct bpf_reg_state *reg = cur_regs(env) + regno;
+       struct bpf_func_state *state = func(env, reg);
+-      int off, i, slot, spi;
++      int err, min_off, max_off, i, slot, spi;
+       if (reg->type != PTR_TO_STACK) {
+               /* Allow zero-byte read from NULL, regardless of pointer type */
+@@ -1781,21 +1804,23 @@ static int check_stack_boundary(struct b
+               return -EACCES;
+       }
+-      /* Only allow fixed-offset stack reads */
+-      if (!tnum_is_const(reg->var_off)) {
+-              char tn_buf[48];
+-
+-              tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+-              verbose(env, "invalid variable stack read R%d var_off=%s\n",
+-                      regno, tn_buf);
+-              return -EACCES;
+-      }
+-      off = reg->off + reg->var_off.value;
+-      if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
+-          access_size < 0 || (access_size == 0 && !zero_size_allowed)) {
+-              verbose(env, "invalid stack type R%d off=%d access_size=%d\n",
+-                      regno, off, access_size);
+-              return -EACCES;
++      if (tnum_is_const(reg->var_off)) {
++              min_off = max_off = reg->var_off.value + reg->off;
++              err = __check_stack_boundary(env, regno, min_off, access_size,
++                                           zero_size_allowed);
++              if (err)
++                      return err;
++      } else {
++              min_off = reg->smin_value + reg->off;
++              max_off = reg->umax_value + reg->off;
++              err = __check_stack_boundary(env, regno, min_off, access_size,
++                                           zero_size_allowed);
++              if (err)
++                      return err;
++              err = __check_stack_boundary(env, regno, max_off, access_size,
++                                           zero_size_allowed);
++              if (err)
++                      return err;
+       }
+       if (meta && meta->raw_mode) {
+@@ -1804,10 +1829,10 @@ static int check_stack_boundary(struct b
+               return 0;
+       }
+-      for (i = 0; i < access_size; i++) {
++      for (i = min_off; i < max_off + access_size; i++) {
+               u8 *stype;
+-              slot = -(off + i) - 1;
++              slot = -i - 1;
+               spi = slot / BPF_REG_SIZE;
+               if (state->allocated_stack <= slot)
+                       goto err;
+@@ -1820,8 +1845,16 @@ static int check_stack_boundary(struct b
+                       goto mark;
+               }
+ err:
+-              verbose(env, "invalid indirect read from stack off %d+%d size %d\n",
+-                      off, i, access_size);
++              if (tnum_is_const(reg->var_off)) {
++                      verbose(env, "invalid indirect read from stack off %d+%d size %d\n",
++                              min_off, i - min_off, access_size);
++              } else {
++                      char tn_buf[48];
++
++                      tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
++                      verbose(env, "invalid indirect read from stack var_off %s+%d size %d\n",
++                              tn_buf, i - min_off, access_size);
++              }
+               return -EACCES;
+ mark:
+               /* reading any byte out of 8-byte 'spill_slot' will cause
+@@ -1830,7 +1863,7 @@ mark:
+               mark_reg_read(env, &state->stack[spi].spilled_ptr,
+                             state->stack[spi].spilled_ptr.parent);
+       }
+-      return update_stack_depth(env, state, off);
++      return update_stack_depth(env, state, min_off);
+ }
+ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
diff --git a/queue-4.19/bpf-track-spill-fill-of-constants.patch b/queue-4.19/bpf-track-spill-fill-of-constants.patch
new file mode 100644 (file)
index 0000000..e86241a
--- /dev/null
@@ -0,0 +1,277 @@
+From foo@baz Wed Sep 15 02:37:47 PM CEST 2021
+From: Ovidiu Panait <ovidiu.panait@windriver.com>
+Date: Mon, 13 Sep 2021 18:35:32 +0300
+Subject: bpf: track spill/fill of constants
+To: stable@vger.kernel.org
+Cc: bpf@vger.kernel.org, daniel@iogearbox.net
+Message-ID: <20210913153537.2162465-9-ovidiu.panait@windriver.com>
+
+From: Alexei Starovoitov <ast@kernel.org>
+
+commit f7cf25b2026dc8441e0fa3a202c2aa8a56211e30 upstream.
+
+Compilers often spill induction variables into the stack,
+hence it is necessary for the verifier to track scalar values
+of the registers through stack slots.
+
+Also, a few bpf programs were incorrectly rejected in the past, since
+the verifier was not able to track such constants while they were used
+to compute offsets into packet headers.
+
+Tracking constants through the stack significantly decreases
+the chances of state pruning, since two different constants
+are considered to be different by state equivalency.
+The end result is that cilium tests suffer a serious degradation in the
+number of states processed and a corresponding increase in verification time.
+
+                     before  after
+bpf_lb-DLB_L3.o      1838    6441
+bpf_lb-DLB_L4.o      3218    5908
+bpf_lb-DUNKNOWN.o    1064    1064
+bpf_lxc-DDROP_ALL.o  26935   93790
+bpf_lxc-DUNKNOWN.o   34439   123886
+bpf_netdev.o         9721    31413
+bpf_overlay.o        6184    18561
+bpf_lxc_jit.o        39389   359445
+
+After further debugging it turned out that cilium progs are also
+getting hurt by clang due to the same constant tracking issue.
+Newer clang generates better code by spilling less to the stack.
+Instead it keeps more constants in the registers, which
+hurts state pruning since the verifier already tracks constants
+in the registers:
+                  old clang  new clang
+                         (no spill/fill tracking introduced by this patch)
+bpf_lb-DLB_L3.o      1838    1923
+bpf_lb-DLB_L4.o      3218    3077
+bpf_lb-DUNKNOWN.o    1064    1062
+bpf_lxc-DDROP_ALL.o  26935   166729
+bpf_lxc-DUNKNOWN.o   34439   174607
+bpf_netdev.o         9721    8407
+bpf_overlay.o        6184    5420
+bpf_lxc_jit.o        39389   39389
+
+The final table is depressing:
+                  old clang  old clang    new clang  new clang
+                           const spill/fill        const spill/fill
+bpf_lb-DLB_L3.o      1838    6441          1923      8128
+bpf_lb-DLB_L4.o      3218    5908          3077      6707
+bpf_lb-DUNKNOWN.o    1064    1064          1062      1062
+bpf_lxc-DDROP_ALL.o  26935   93790         166729    380712
+bpf_lxc-DUNKNOWN.o   34439   123886        174607    440652
+bpf_netdev.o         9721    31413         8407      31904
+bpf_overlay.o        6184    18561         5420      23569
+bpf_lxc_jit.o        39389   359445        39389     359445
+
+Tracking constants in the registers hurts state pruning already.
+Adding tracking of constants through the stack hurts pruning even more.
+A later patch addresses this general constant tracking issue
+with coarse/precise logic.
+
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Acked-by: Andrii Nakryiko <andriin@fb.com>
+Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
+[OP: - drop verbose_linfo() calls, as the function is not implemented in 4.19
+     - adjust mark_reg_read() calls to match the prototype in 4.19]
+Signed-off-by: Ovidiu Panait <ovidiu.panait@windriver.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/bpf/verifier.c |   88 ++++++++++++++++++++++++++++++++++++--------------
+ 1 file changed, 64 insertions(+), 24 deletions(-)
+
+--- a/kernel/bpf/verifier.c
++++ b/kernel/bpf/verifier.c
+@@ -963,6 +963,23 @@ static bool register_is_null(struct bpf_
+       return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0);
+ }
++static bool register_is_const(struct bpf_reg_state *reg)
++{
++      return reg->type == SCALAR_VALUE && tnum_is_const(reg->var_off);
++}
++
++static void save_register_state(struct bpf_func_state *state,
++                              int spi, struct bpf_reg_state *reg)
++{
++      int i;
++
++      state->stack[spi].spilled_ptr = *reg;
++      state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
++
++      for (i = 0; i < BPF_REG_SIZE; i++)
++              state->stack[spi].slot_type[i] = STACK_SPILL;
++}
++
+ /* check_stack_read/write functions track spill/fill of registers,
+  * stack boundary and alignment are checked in check_mem_access()
+  */
+@@ -972,7 +989,7 @@ static int check_stack_write(struct bpf_
+ {
+       struct bpf_func_state *cur; /* state of the current function */
+       int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
+-      enum bpf_reg_type type;
++      struct bpf_reg_state *reg = NULL;
+       err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE),
+                                true);
+@@ -989,27 +1006,36 @@ static int check_stack_write(struct bpf_
+       }
+       cur = env->cur_state->frame[env->cur_state->curframe];
+-      if (value_regno >= 0 &&
+-          is_spillable_regtype((type = cur->regs[value_regno].type))) {
++      if (value_regno >= 0)
++              reg = &cur->regs[value_regno];
++      if (reg && size == BPF_REG_SIZE && register_is_const(reg) &&
++          !register_is_null(reg) && env->allow_ptr_leaks) {
++              save_register_state(state, spi, reg);
++      } else if (reg && is_spillable_regtype(reg->type)) {
+               /* register containing pointer is being spilled into stack */
+               if (size != BPF_REG_SIZE) {
+                       verbose(env, "invalid size of register spill\n");
+                       return -EACCES;
+               }
+-              if (state != cur && type == PTR_TO_STACK) {
++              if (state != cur && reg->type == PTR_TO_STACK) {
+                       verbose(env, "cannot spill pointers to stack into stack frame of the caller\n");
+                       return -EINVAL;
+               }
+-              /* save register state */
+-              state->stack[spi].spilled_ptr = cur->regs[value_regno];
+-              state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
+-
+-              for (i = 0; i < BPF_REG_SIZE; i++) {
+-                      if (state->stack[spi].slot_type[i] == STACK_MISC &&
+-                          !env->allow_ptr_leaks) {
++              if (!env->allow_ptr_leaks) {
++                      bool sanitize = false;
++
++                      if (state->stack[spi].slot_type[0] == STACK_SPILL &&
++                          register_is_const(&state->stack[spi].spilled_ptr))
++                              sanitize = true;
++                      for (i = 0; i < BPF_REG_SIZE; i++)
++                              if (state->stack[spi].slot_type[i] == STACK_MISC) {
++                                      sanitize = true;
++                                      break;
++                              }
++                      if (sanitize) {
+                               int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off;
+                               int soff = (-spi - 1) * BPF_REG_SIZE;
+@@ -1032,8 +1058,8 @@ static int check_stack_write(struct bpf_
+                               }
+                               *poff = soff;
+                       }
+-                      state->stack[spi].slot_type[i] = STACK_SPILL;
+               }
++              save_register_state(state, spi, reg);
+       } else {
+               u8 type = STACK_MISC;
+@@ -1056,8 +1082,7 @@ static int check_stack_write(struct bpf_
+                       state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
+               /* when we zero initialize stack slots mark them as such */
+-              if (value_regno >= 0 &&
+-                  register_is_null(&cur->regs[value_regno]))
++              if (reg && register_is_null(reg))
+                       type = STACK_ZERO;
+               /* Mark slots affected by this stack write. */
+@@ -1075,6 +1100,7 @@ static int check_stack_read(struct bpf_v
+       struct bpf_verifier_state *vstate = env->cur_state;
+       struct bpf_func_state *state = vstate->frame[vstate->curframe];
+       int i, slot = -off - 1, spi = slot / BPF_REG_SIZE;
++      struct bpf_reg_state *reg;
+       u8 *stype;
+       if (reg_state->allocated_stack <= slot) {
+@@ -1083,11 +1109,20 @@ static int check_stack_read(struct bpf_v
+               return -EACCES;
+       }
+       stype = reg_state->stack[spi].slot_type;
++      reg = &reg_state->stack[spi].spilled_ptr;
+       if (stype[0] == STACK_SPILL) {
+               if (size != BPF_REG_SIZE) {
+-                      verbose(env, "invalid size of register spill\n");
+-                      return -EACCES;
++                      if (reg->type != SCALAR_VALUE) {
++                              verbose(env, "invalid size of register fill\n");
++                              return -EACCES;
++                      }
++                      if (value_regno >= 0) {
++                              mark_reg_unknown(env, state->regs, value_regno);
++                              state->regs[value_regno].live |= REG_LIVE_WRITTEN;
++                      }
++                      mark_reg_read(env, reg, reg->parent);
++                      return 0;
+               }
+               for (i = 1; i < BPF_REG_SIZE; i++) {
+                       if (stype[(slot - i) % BPF_REG_SIZE] != STACK_SPILL) {
+@@ -1098,16 +1133,14 @@ static int check_stack_read(struct bpf_v
+               if (value_regno >= 0) {
+                       /* restore register state from stack */
+-                      state->regs[value_regno] = reg_state->stack[spi].spilled_ptr;
++                      state->regs[value_regno] = *reg;
+                       /* mark reg as written since spilled pointer state likely
+                        * has its liveness marks cleared by is_state_visited()
+                        * which resets stack/reg liveness for state transitions
+                        */
+                       state->regs[value_regno].live |= REG_LIVE_WRITTEN;
+               }
+-              mark_reg_read(env, &reg_state->stack[spi].spilled_ptr,
+-                            reg_state->stack[spi].spilled_ptr.parent);
+-              return 0;
++              mark_reg_read(env, reg, reg->parent);
+       } else {
+               int zeros = 0;
+@@ -1122,8 +1155,7 @@ static int check_stack_read(struct bpf_v
+                               off, i, size);
+                       return -EACCES;
+               }
+-              mark_reg_read(env, &reg_state->stack[spi].spilled_ptr,
+-                            reg_state->stack[spi].spilled_ptr.parent);
++              mark_reg_read(env, reg, reg->parent);
+               if (value_regno >= 0) {
+                       if (zeros == size) {
+                               /* any size read into register is zero extended,
+@@ -1136,8 +1168,8 @@ static int check_stack_read(struct bpf_v
+                       }
+                       state->regs[value_regno].live |= REG_LIVE_WRITTEN;
+               }
+-              return 0;
+       }
++      return 0;
+ }
+ static int check_stack_access(struct bpf_verifier_env *env,
+@@ -1790,7 +1822,7 @@ static int check_stack_boundary(struct b
+ {
+       struct bpf_reg_state *reg = cur_regs(env) + regno;
+       struct bpf_func_state *state = func(env, reg);
+-      int err, min_off, max_off, i, slot, spi;
++      int err, min_off, max_off, i, j, slot, spi;
+       if (reg->type != PTR_TO_STACK) {
+               /* Allow zero-byte read from NULL, regardless of pointer type */
+@@ -1878,6 +1910,14 @@ static int check_stack_boundary(struct b
+                       *stype = STACK_MISC;
+                       goto mark;
+               }
++              if (state->stack[spi].slot_type[0] == STACK_SPILL &&
++                  state->stack[spi].spilled_ptr.type == SCALAR_VALUE) {
++                      __mark_reg_unknown(&state->stack[spi].spilled_ptr);
++                      for (j = 0; j < BPF_REG_SIZE; j++)
++                              state->stack[spi].slot_type[j] = STACK_MISC;
++                      goto mark;
++              }
++
+ err:
+               if (tnum_is_const(reg->var_off)) {
+                       verbose(env, "invalid indirect read from stack off %d+%d size %d\n",
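For illustration only (not part of the patch): a C-level sketch of the
compiler pattern this tracking lets the verifier accept, where a known
constant is spilled to an 8-byte stack slot and filled back before it bounds
a packet access. 'slot', 'data' and 'data_end' are placeholders.

        u64 slot;                          /* stands for an fp-8 stack slot */
        int len = 20;                      /* known constant                */

        slot = len;                        /* spill: slot keeps SCALAR 20   */
        /* ... the register holding len gets clobbered ... */
        len = slot;                        /* fill: len is known to be 20   */
        if (data + len > data_end)         /* bound check remains provable  */
                return 0;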
diff --git a/queue-4.19/bpf-verifier-allocate-idmap-scratch-in-verifier-env.patch b/queue-4.19/bpf-verifier-allocate-idmap-scratch-in-verifier-env.patch
new file mode 100644 (file)
index 0000000..49f527a
--- /dev/null
@@ -0,0 +1,149 @@
+From foo@baz Wed Sep 15 02:37:47 PM CEST 2021
+From: Ovidiu Panait <ovidiu.panait@windriver.com>
+Date: Mon, 13 Sep 2021 18:35:36 +0300
+Subject: bpf: verifier: Allocate idmap scratch in verifier env
+To: stable@vger.kernel.org
+Cc: bpf@vger.kernel.org, daniel@iogearbox.net
+Message-ID: <20210913153537.2162465-13-ovidiu.panait@windriver.com>
+
+From: Lorenz Bauer <lmb@cloudflare.com>
+
+commit c9e73e3d2b1eb1ea7ff068e05007eec3bd8ef1c9 upstream.
+
+func_states_equal makes a very short-lived allocation for idmap,
+probably because it's too large to fit on the stack. However, the
+function is called quite often, leading to a lot of alloc / free
+churn. Replace the temporary allocation with dedicated scratch
+space in struct bpf_verifier_env.
+
+Signed-off-by: Lorenz Bauer <lmb@cloudflare.com>
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Acked-by: Edward Cree <ecree.xilinx@gmail.com>
+Link: https://lore.kernel.org/bpf/20210429134656.122225-4-lmb@cloudflare.com
+[OP: adjusted context for 4.19]
+Signed-off-by: Ovidiu Panait <ovidiu.panait@windriver.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/bpf_verifier.h |    8 ++++++++
+ kernel/bpf/verifier.c        |   42 +++++++++++++-----------------------------
+ 2 files changed, 21 insertions(+), 29 deletions(-)
+
+--- a/include/linux/bpf_verifier.h
++++ b/include/linux/bpf_verifier.h
+@@ -127,6 +127,13 @@ struct bpf_func_state {
+       struct bpf_stack_state *stack;
+ };
++struct bpf_id_pair {
++      u32 old;
++      u32 cur;
++};
++
++/* Maximum number of register states that can exist at once */
++#define BPF_ID_MAP_SIZE (MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE)
+ #define MAX_CALL_FRAMES 8
+ struct bpf_verifier_state {
+       /* call stack tracking */
+@@ -213,6 +220,7 @@ struct bpf_verifier_env {
+       struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */
+       struct bpf_verifier_log log;
+       struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS + 1];
++      struct bpf_id_pair idmap_scratch[BPF_ID_MAP_SIZE];
+       u32 subprog_cnt;
+ };
+--- a/kernel/bpf/verifier.c
++++ b/kernel/bpf/verifier.c
+@@ -4783,13 +4783,6 @@ static bool range_within(struct bpf_reg_
+              old->smax_value >= cur->smax_value;
+ }
+-/* Maximum number of register states that can exist at once */
+-#define ID_MAP_SIZE   (MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE)
+-struct idpair {
+-      u32 old;
+-      u32 cur;
+-};
+-
+ /* If in the old state two registers had the same id, then they need to have
+  * the same id in the new state as well.  But that id could be different from
+  * the old state, so we need to track the mapping from old to new ids.
+@@ -4800,11 +4793,11 @@ struct idpair {
+  * So we look through our idmap to see if this old id has been seen before.  If
+  * so, we require the new id to match; otherwise, we add the id pair to the map.
+  */
+-static bool check_ids(u32 old_id, u32 cur_id, struct idpair *idmap)
++static bool check_ids(u32 old_id, u32 cur_id, struct bpf_id_pair *idmap)
+ {
+       unsigned int i;
+-      for (i = 0; i < ID_MAP_SIZE; i++) {
++      for (i = 0; i < BPF_ID_MAP_SIZE; i++) {
+               if (!idmap[i].old) {
+                       /* Reached an empty slot; haven't seen this id before */
+                       idmap[i].old = old_id;
+@@ -4821,7 +4814,7 @@ static bool check_ids(u32 old_id, u32 cu
+ /* Returns true if (rold safe implies rcur safe) */
+ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
+-                  struct idpair *idmap)
++                  struct bpf_id_pair *idmap)
+ {
+       bool equal;
+@@ -4925,7 +4918,7 @@ static bool regsafe(struct bpf_reg_state
+ static bool stacksafe(struct bpf_func_state *old,
+                     struct bpf_func_state *cur,
+-                    struct idpair *idmap)
++                    struct bpf_id_pair *idmap)
+ {
+       int i, spi;
+@@ -5011,29 +5004,20 @@ static bool stacksafe(struct bpf_func_st
+  * whereas register type in current state is meaningful, it means that
+  * the current state will reach 'bpf_exit' instruction safely
+  */
+-static bool func_states_equal(struct bpf_func_state *old,
++static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_state *old,
+                             struct bpf_func_state *cur)
+ {
+-      struct idpair *idmap;
+-      bool ret = false;
+       int i;
+-      idmap = kcalloc(ID_MAP_SIZE, sizeof(struct idpair), GFP_KERNEL);
+-      /* If we failed to allocate the idmap, just say it's not safe */
+-      if (!idmap)
+-              return false;
++      memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch));
++      for (i = 0; i < MAX_BPF_REG; i++)
++              if (!regsafe(&old->regs[i], &cur->regs[i], env->idmap_scratch))
++                      return false;
+-      for (i = 0; i < MAX_BPF_REG; i++) {
+-              if (!regsafe(&old->regs[i], &cur->regs[i], idmap))
+-                      goto out_free;
+-      }
++      if (!stacksafe(old, cur, env->idmap_scratch))
++              return false;
+-      if (!stacksafe(old, cur, idmap))
+-              goto out_free;
+-      ret = true;
+-out_free:
+-      kfree(idmap);
+-      return ret;
++      return true;
+ }
+ static bool states_equal(struct bpf_verifier_env *env,
+@@ -5057,7 +5041,7 @@ static bool states_equal(struct bpf_veri
+       for (i = 0; i <= old->curframe; i++) {
+               if (old->frame[i]->callsite != cur->frame[i]->callsite)
+                       return false;
+-              if (!func_states_equal(old->frame[i], cur->frame[i]))
++              if (!func_states_equal(env, old->frame[i], cur->frame[i]))
+                       return false;
+       }
+       return true;
diff --git a/queue-4.19/bpf-verifier-per-register-parent-pointers.patch b/queue-4.19/bpf-verifier-per-register-parent-pointers.patch
new file mode 100644 (file)
index 0000000..93e86be
--- /dev/null
@@ -0,0 +1,412 @@
+From foo@baz Wed Sep 15 02:37:47 PM CEST 2021
+From: Ovidiu Panait <ovidiu.panait@windriver.com>
+Date: Mon, 13 Sep 2021 18:35:25 +0300
+Subject: bpf/verifier: per-register parent pointers
+To: stable@vger.kernel.org
+Cc: bpf@vger.kernel.org, daniel@iogearbox.net
+Message-ID: <20210913153537.2162465-2-ovidiu.panait@windriver.com>
+
+From: Edward Cree <ecree@solarflare.com>
+
+commit 679c782de14bd48c19dd74cd1af20a2bc05dd936 upstream.
+
+By giving each register its own liveness chain, we elide the skip_callee()
+ logic.  Instead, each register's parent is the state it inherits from;
+ both check_func_call() and prepare_func_exit() automatically connect
+ reg states to the correct chain since when they copy the reg state across
+ (r1-r5 into the callee as args, and r0 out as the return value) they also
+ copy the parent pointer.
+
+Signed-off-by: Edward Cree <ecree@solarflare.com>
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+[OP: adjusted context for 4.19]
+Signed-off-by: Ovidiu Panait <ovidiu.panait@windriver.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/bpf_verifier.h |    8 -
+ kernel/bpf/verifier.c        |  183 ++++++++++---------------------------------
+ 2 files changed, 47 insertions(+), 144 deletions(-)
+
+--- a/include/linux/bpf_verifier.h
++++ b/include/linux/bpf_verifier.h
+@@ -41,6 +41,7 @@ enum bpf_reg_liveness {
+ };
+ struct bpf_reg_state {
++      /* Ordering of fields matters.  See states_equal() */
+       enum bpf_reg_type type;
+       union {
+               /* valid when type == PTR_TO_PACKET */
+@@ -62,7 +63,6 @@ struct bpf_reg_state {
+        * came from, when one is tested for != NULL.
+        */
+       u32 id;
+-      /* Ordering of fields matters.  See states_equal() */
+       /* For scalar types (SCALAR_VALUE), this represents our knowledge of
+        * the actual value.
+        * For pointer types, this represents the variable part of the offset
+@@ -79,15 +79,15 @@ struct bpf_reg_state {
+       s64 smax_value; /* maximum possible (s64)value */
+       u64 umin_value; /* minimum possible (u64)value */
+       u64 umax_value; /* maximum possible (u64)value */
++      /* parentage chain for liveness checking */
++      struct bpf_reg_state *parent;
+       /* Inside the callee two registers can be both PTR_TO_STACK like
+        * R1=fp-8 and R2=fp-8, but one of them points to this function stack
+        * while another to the caller's stack. To differentiate them 'frameno'
+        * is used which is an index in bpf_verifier_state->frame[] array
+        * pointing to bpf_func_state.
+-       * This field must be second to last, for states_equal() reasons.
+        */
+       u32 frameno;
+-      /* This field must be last, for states_equal() reasons. */
+       enum bpf_reg_liveness live;
+ };
+@@ -110,7 +110,6 @@ struct bpf_stack_state {
+  */
+ struct bpf_func_state {
+       struct bpf_reg_state regs[MAX_BPF_REG];
+-      struct bpf_verifier_state *parent;
+       /* index of call instruction that called into this func */
+       int callsite;
+       /* stack frame number of this function state from pov of
+@@ -132,7 +131,6 @@ struct bpf_func_state {
+ struct bpf_verifier_state {
+       /* call stack tracking */
+       struct bpf_func_state *frame[MAX_CALL_FRAMES];
+-      struct bpf_verifier_state *parent;
+       u32 curframe;
+       bool speculative;
+ };
+--- a/kernel/bpf/verifier.c
++++ b/kernel/bpf/verifier.c
+@@ -380,9 +380,9 @@ static int copy_stack_state(struct bpf_f
+ /* do_check() starts with zero-sized stack in struct bpf_verifier_state to
+  * make it consume minimal amount of memory. check_stack_write() access from
+  * the program calls into realloc_func_state() to grow the stack size.
+- * Note there is a non-zero 'parent' pointer inside bpf_verifier_state
+- * which this function copies over. It points to previous bpf_verifier_state
+- * which is never reallocated
++ * Note there is a non-zero parent pointer inside each reg of bpf_verifier_state
++ * which this function copies over. It points to corresponding reg in previous
++ * bpf_verifier_state which is never reallocated
+  */
+ static int realloc_func_state(struct bpf_func_state *state, int size,
+                             bool copy_old)
+@@ -467,7 +467,6 @@ static int copy_verifier_state(struct bp
+       }
+       dst_state->speculative = src->speculative;
+       dst_state->curframe = src->curframe;
+-      dst_state->parent = src->parent;
+       for (i = 0; i <= src->curframe; i++) {
+               dst = dst_state->frame[i];
+               if (!dst) {
+@@ -739,6 +738,7 @@ static void init_reg_state(struct bpf_ve
+       for (i = 0; i < MAX_BPF_REG; i++) {
+               mark_reg_not_init(env, regs, i);
+               regs[i].live = REG_LIVE_NONE;
++              regs[i].parent = NULL;
+       }
+       /* frame pointer */
+@@ -883,74 +883,21 @@ next:
+       return 0;
+ }
+-static
+-struct bpf_verifier_state *skip_callee(struct bpf_verifier_env *env,
+-                                     const struct bpf_verifier_state *state,
+-                                     struct bpf_verifier_state *parent,
+-                                     u32 regno)
+-{
+-      struct bpf_verifier_state *tmp = NULL;
+-
+-      /* 'parent' could be a state of caller and
+-       * 'state' could be a state of callee. In such case
+-       * parent->curframe < state->curframe
+-       * and it's ok for r1 - r5 registers
+-       *
+-       * 'parent' could be a callee's state after it bpf_exit-ed.
+-       * In such case parent->curframe > state->curframe
+-       * and it's ok for r0 only
+-       */
+-      if (parent->curframe == state->curframe ||
+-          (parent->curframe < state->curframe &&
+-           regno >= BPF_REG_1 && regno <= BPF_REG_5) ||
+-          (parent->curframe > state->curframe &&
+-             regno == BPF_REG_0))
+-              return parent;
+-
+-      if (parent->curframe > state->curframe &&
+-          regno >= BPF_REG_6) {
+-              /* for callee saved regs we have to skip the whole chain
+-               * of states that belong to callee and mark as LIVE_READ
+-               * the registers before the call
+-               */
+-              tmp = parent;
+-              while (tmp && tmp->curframe != state->curframe) {
+-                      tmp = tmp->parent;
+-              }
+-              if (!tmp)
+-                      goto bug;
+-              parent = tmp;
+-      } else {
+-              goto bug;
+-      }
+-      return parent;
+-bug:
+-      verbose(env, "verifier bug regno %d tmp %p\n", regno, tmp);
+-      verbose(env, "regno %d parent frame %d current frame %d\n",
+-              regno, parent->curframe, state->curframe);
+-      return NULL;
+-}
+-
++/* Parentage chain of this register (or stack slot) should take care of all
++ * issues like callee-saved registers, stack slot allocation time, etc.
++ */
+ static int mark_reg_read(struct bpf_verifier_env *env,
+-                       const struct bpf_verifier_state *state,
+-                       struct bpf_verifier_state *parent,
+-                       u32 regno)
++                       const struct bpf_reg_state *state,
++                       struct bpf_reg_state *parent)
+ {
+       bool writes = parent == state->parent; /* Observe write marks */
+-      if (regno == BPF_REG_FP)
+-              /* We don't need to worry about FP liveness because it's read-only */
+-              return 0;
+-
+       while (parent) {
+               /* if read wasn't screened by an earlier write ... */
+-              if (writes && state->frame[state->curframe]->regs[regno].live & REG_LIVE_WRITTEN)
++              if (writes && state->live & REG_LIVE_WRITTEN)
+                       break;
+-              parent = skip_callee(env, state, parent, regno);
+-              if (!parent)
+-                      return -EFAULT;
+               /* ... then we depend on parent's value */
+-              parent->frame[parent->curframe]->regs[regno].live |= REG_LIVE_READ;
++              parent->live |= REG_LIVE_READ;
+               state = parent;
+               parent = state->parent;
+               writes = true;
+@@ -976,7 +923,10 @@ static int check_reg_arg(struct bpf_veri
+                       verbose(env, "R%d !read_ok\n", regno);
+                       return -EACCES;
+               }
+-              return mark_reg_read(env, vstate, vstate->parent, regno);
++              /* We don't need to worry about FP liveness because it's read-only */
++              if (regno != BPF_REG_FP)
++                      return mark_reg_read(env, &regs[regno],
++                                           regs[regno].parent);
+       } else {
+               /* check whether register used as dest operand can be written to */
+               if (regno == BPF_REG_FP) {
+@@ -1087,8 +1037,8 @@ static int check_stack_write(struct bpf_
+       } else {
+               u8 type = STACK_MISC;
+-              /* regular write of data into stack */
+-              state->stack[spi].spilled_ptr = (struct bpf_reg_state) {};
++              /* regular write of data into stack destroys any spilled ptr */
++              state->stack[spi].spilled_ptr.type = NOT_INIT;
+               /* only mark the slot as written if all 8 bytes were written
+                * otherwise read propagation may incorrectly stop too soon
+@@ -1113,61 +1063,6 @@ static int check_stack_write(struct bpf_
+       return 0;
+ }
+-/* registers of every function are unique and mark_reg_read() propagates
+- * the liveness in the following cases:
+- * - from callee into caller for R1 - R5 that were used as arguments
+- * - from caller into callee for R0 that used as result of the call
+- * - from caller to the same caller skipping states of the callee for R6 - R9,
+- *   since R6 - R9 are callee saved by implicit function prologue and
+- *   caller's R6 != callee's R6, so when we propagate liveness up to
+- *   parent states we need to skip callee states for R6 - R9.
+- *
+- * stack slot marking is different, since stacks of caller and callee are
+- * accessible in both (since caller can pass a pointer to caller's stack to
+- * callee which can pass it to another function), hence mark_stack_slot_read()
+- * has to propagate the stack liveness to all parent states at given frame number.
+- * Consider code:
+- * f1() {
+- *   ptr = fp - 8;
+- *   *ptr = ctx;
+- *   call f2 {
+- *      .. = *ptr;
+- *   }
+- *   .. = *ptr;
+- * }
+- * First *ptr is reading from f1's stack and mark_stack_slot_read() has
+- * to mark liveness at the f1's frame and not f2's frame.
+- * Second *ptr is also reading from f1's stack and mark_stack_slot_read() has
+- * to propagate liveness to f2 states at f1's frame level and further into
+- * f1 states at f1's frame level until write into that stack slot
+- */
+-static void mark_stack_slot_read(struct bpf_verifier_env *env,
+-                               const struct bpf_verifier_state *state,
+-                               struct bpf_verifier_state *parent,
+-                               int slot, int frameno)
+-{
+-      bool writes = parent == state->parent; /* Observe write marks */
+-
+-      while (parent) {
+-              if (parent->frame[frameno]->allocated_stack <= slot * BPF_REG_SIZE)
+-                      /* since LIVE_WRITTEN mark is only done for full 8-byte
+-                       * write the read marks are conservative and parent
+-                       * state may not even have the stack allocated. In such case
+-                       * end the propagation, since the loop reached beginning
+-                       * of the function
+-                       */
+-                      break;
+-              /* if read wasn't screened by an earlier write ... */
+-              if (writes && state->frame[frameno]->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN)
+-                      break;
+-              /* ... then we depend on parent's value */
+-              parent->frame[frameno]->stack[slot].spilled_ptr.live |= REG_LIVE_READ;
+-              state = parent;
+-              parent = state->parent;
+-              writes = true;
+-      }
+-}
+-
+ static int check_stack_read(struct bpf_verifier_env *env,
+                           struct bpf_func_state *reg_state /* func where register points to */,
+                           int off, int size, int value_regno)
+@@ -1205,8 +1100,8 @@ static int check_stack_read(struct bpf_v
+                        */
+                       state->regs[value_regno].live |= REG_LIVE_WRITTEN;
+               }
+-              mark_stack_slot_read(env, vstate, vstate->parent, spi,
+-                                   reg_state->frameno);
++              mark_reg_read(env, &reg_state->stack[spi].spilled_ptr,
++                            reg_state->stack[spi].spilled_ptr.parent);
+               return 0;
+       } else {
+               int zeros = 0;
+@@ -1222,8 +1117,8 @@ static int check_stack_read(struct bpf_v
+                               off, i, size);
+                       return -EACCES;
+               }
+-              mark_stack_slot_read(env, vstate, vstate->parent, spi,
+-                                   reg_state->frameno);
++              mark_reg_read(env, &reg_state->stack[spi].spilled_ptr,
++                            reg_state->stack[spi].spilled_ptr.parent);
+               if (value_regno >= 0) {
+                       if (zeros == size) {
+                               /* any size read into register is zero extended,
+@@ -1927,8 +1822,8 @@ mark:
+               /* reading any byte out of 8-byte 'spill_slot' will cause
+                * the whole slot to be marked as 'read'
+                */
+-              mark_stack_slot_read(env, env->cur_state, env->cur_state->parent,
+-                                   spi, state->frameno);
++              mark_reg_read(env, &state->stack[spi].spilled_ptr,
++                            state->stack[spi].spilled_ptr.parent);
+       }
+       return update_stack_depth(env, state, off);
+ }
+@@ -2384,11 +2279,13 @@ static int check_func_call(struct bpf_ve
+                       state->curframe + 1 /* frameno within this callchain */,
+                       subprog /* subprog number within this prog */);
+-      /* copy r1 - r5 args that callee can access */
++      /* copy r1 - r5 args that callee can access.  The copy includes parent
++       * pointers, which connects us up to the liveness chain
++       */
+       for (i = BPF_REG_1; i <= BPF_REG_5; i++)
+               callee->regs[i] = caller->regs[i];
+-      /* after the call regsiters r0 - r5 were scratched */
++      /* after the call registers r0 - r5 were scratched */
+       for (i = 0; i < CALLER_SAVED_REGS; i++) {
+               mark_reg_not_init(env, caller->regs, caller_saved[i]);
+               check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
+@@ -4844,7 +4741,7 @@ static bool regsafe(struct bpf_reg_state
+               /* explored state didn't use this */
+               return true;
+-      equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, frameno)) == 0;
++      equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, parent)) == 0;
+       if (rold->type == PTR_TO_STACK)
+               /* two stack pointers are equal only if they're pointing to
+@@ -5083,7 +4980,7 @@ static bool states_equal(struct bpf_veri
+  * equivalent state (jump target or such) we didn't arrive by the straight-line
+  * code, so read marks in the state must propagate to the parent regardless
+  * of the state's write marks. That's what 'parent == state->parent' comparison
+- * in mark_reg_read() and mark_stack_slot_read() is for.
++ * in mark_reg_read() is for.
+  */
+ static int propagate_liveness(struct bpf_verifier_env *env,
+                             const struct bpf_verifier_state *vstate,
+@@ -5104,7 +5001,8 @@ static int propagate_liveness(struct bpf
+               if (vparent->frame[vparent->curframe]->regs[i].live & REG_LIVE_READ)
+                       continue;
+               if (vstate->frame[vstate->curframe]->regs[i].live & REG_LIVE_READ) {
+-                      err = mark_reg_read(env, vstate, vparent, i);
++                      err = mark_reg_read(env, &vstate->frame[vstate->curframe]->regs[i],
++                                          &vparent->frame[vstate->curframe]->regs[i]);
+                       if (err)
+                               return err;
+               }
+@@ -5119,7 +5017,8 @@ static int propagate_liveness(struct bpf
+                       if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ)
+                               continue;
+                       if (state->stack[i].spilled_ptr.live & REG_LIVE_READ)
+-                              mark_stack_slot_read(env, vstate, vparent, i, frame);
++                              mark_reg_read(env, &state->stack[i].spilled_ptr,
++                                            &parent->stack[i].spilled_ptr);
+               }
+       }
+       return err;
+@@ -5129,7 +5028,7 @@ static int is_state_visited(struct bpf_v
+ {
+       struct bpf_verifier_state_list *new_sl;
+       struct bpf_verifier_state_list *sl;
+-      struct bpf_verifier_state *cur = env->cur_state;
++      struct bpf_verifier_state *cur = env->cur_state, *new;
+       int i, j, err, states_cnt = 0;
+       sl = env->explored_states[insn_idx];
+@@ -5175,16 +5074,18 @@ static int is_state_visited(struct bpf_v
+               return -ENOMEM;
+       /* add new state to the head of linked list */
+-      err = copy_verifier_state(&new_sl->state, cur);
++      new = &new_sl->state;
++      err = copy_verifier_state(new, cur);
+       if (err) {
+-              free_verifier_state(&new_sl->state, false);
++              free_verifier_state(new, false);
+               kfree(new_sl);
+               return err;
+       }
+       new_sl->next = env->explored_states[insn_idx];
+       env->explored_states[insn_idx] = new_sl;
+       /* connect new state to parentage chain */
+-      cur->parent = &new_sl->state;
++      for (i = 0; i < BPF_REG_FP; i++)
++              cur_regs(env)[i].parent = &new->frame[new->curframe]->regs[i];
+       /* clear write marks in current state: the writes we did are not writes
+        * our child did, so they don't screen off its reads from us.
+        * (There are no read marks in current state, because reads always mark
+@@ -5197,9 +5098,13 @@ static int is_state_visited(struct bpf_v
+       /* all stack frames are accessible from callee, clear them all */
+       for (j = 0; j <= cur->curframe; j++) {
+               struct bpf_func_state *frame = cur->frame[j];
++              struct bpf_func_state *newframe = new->frame[j];
+-              for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++)
++              for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++) {
+                       frame->stack[i].spilled_ptr.live = REG_LIVE_NONE;
++                      frame->stack[i].spilled_ptr.parent =
++                                              &newframe->stack[i].spilled_ptr;
++              }
+       }
+       return 0;
+ }
diff --git a/queue-4.19/crypto-public_key-fix-overflow-during-implicit-conversion.patch b/queue-4.19/crypto-public_key-fix-overflow-during-implicit-conversion.patch
new file mode 100644 (file)
index 0000000..37ed417
--- /dev/null
@@ -0,0 +1,70 @@
+From f985911b7bc75d5c98ed24d8aaa8b94c590f7c6a Mon Sep 17 00:00:00 2001
+From: zhenwei pi <pizhenwei@bytedance.com>
+Date: Thu, 19 Aug 2021 20:37:10 +0800
+Subject: crypto: public_key: fix overflow during implicit conversion
+
+From: zhenwei pi <pizhenwei@bytedance.com>
+
+commit f985911b7bc75d5c98ed24d8aaa8b94c590f7c6a upstream.
+
+A kernel warning like this was hit; it can be reproduced by verifying a
+256-byte data file with the keyctl command by running this script:
+RAWDATA=rawdata
+SIGDATA=sigdata
+
+modprobe pkcs8_key_parser
+
+rm -rf *.der *.pem *.pfx
+rm -rf $RAWDATA
+dd if=/dev/random of=$RAWDATA bs=256 count=1
+
+openssl req -nodes -x509 -newkey rsa:4096 -keyout key.pem -out cert.pem \
+  -subj "/C=CN/ST=GD/L=SZ/O=vihoo/OU=dev/CN=xx.com/emailAddress=yy@xx.com"
+
+KEY_ID=`openssl pkcs8 -in key.pem -topk8 -nocrypt -outform DER | keyctl \
+  padd asymmetric 123 @s`
+
+keyctl pkey_sign $KEY_ID 0 $RAWDATA enc=pkcs1 hash=sha1 > $SIGDATA
+keyctl pkey_verify $KEY_ID 0 $RAWDATA $SIGDATA enc=pkcs1 hash=sha1
+
+Then the kernel reports:
+ WARNING: CPU: 5 PID: 344556 at crypto/rsa-pkcs1pad.c:540
+   pkcs1pad_verify+0x160/0x190
+ ...
+ Call Trace:
+  public_key_verify_signature+0x282/0x380
+  ? software_key_query+0x12d/0x180
+  ? keyctl_pkey_params_get+0xd6/0x130
+  asymmetric_key_verify_signature+0x66/0x80
+  keyctl_pkey_verify+0xa5/0x100
+  do_syscall_64+0x35/0xb0
+  entry_SYSCALL_64_after_hwframe+0x44/0xae
+
+The cause of this issue: in asymmetric_key_verify_signature(),
+'.digest_size(u8) = params->in_len(u32)' overflows the u8 value,
+so use u32 instead of u8 for the digest_size field. Also reorder
+struct public_key_signature; this saves 8 bytes on a 64-bit machine.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: zhenwei pi <pizhenwei@bytedance.com>
+Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
+Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/crypto/public_key.h |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/include/crypto/public_key.h
++++ b/include/crypto/public_key.h
+@@ -35,9 +35,9 @@ extern void public_key_free(struct publi
+ struct public_key_signature {
+       struct asymmetric_key_id *auth_ids[2];
+       u8 *s;                  /* Signature */
+-      u32 s_size;             /* Number of bytes in signature */
+       u8 *digest;
+-      u8 digest_size;         /* Number of bytes in digest */
++      u32 s_size;             /* Number of bytes in signature */
++      u32 digest_size;        /* Number of bytes in digest */
+       const char *pkey_algo;
+       const char *hash_algo;
+ };
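To illustrate the overflow fixed above: assigning a 32-bit length to the old
u8 digest_size field silently truncates it modulo 256, so a 256-byte size
becomes 0 and trips the pkcs1pad length check. A minimal user-space sketch of
that narrowing conversion (the struct and values are illustrative stand-ins,
not the kernel definitions):

    #include <stdint.h>
    #include <stdio.h>

    struct sig_sketch {
        uint8_t  digest_size_u8;    /* old field type: truncates */
        uint32_t digest_size_u32;   /* new field type: keeps the full length */
    };

    int main(void)
    {
        uint32_t in_len = 256;      /* e.g. params->in_len for 256 bytes of data */
        struct sig_sketch s;

        s.digest_size_u8  = in_len; /* implicit conversion: 256 -> 0 */
        s.digest_size_u32 = in_len; /* stays 256 */

        printf("u8: %u  u32: %u\n", s.digest_size_u8, s.digest_size_u32);
        return 0;
    }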
diff --git a/queue-4.19/dm-crypt-avoid-percpu_counter-spinlock-contention-in-crypt_page_alloc.patch b/queue-4.19/dm-crypt-avoid-percpu_counter-spinlock-contention-in-crypt_page_alloc.patch
new file mode 100644 (file)
index 0000000..960ff86
--- /dev/null
@@ -0,0 +1,128 @@
+From 528b16bfc3ae5f11638e71b3b63a81f9999df727 Mon Sep 17 00:00:00 2001
+From: Arne Welzel <arne.welzel@corelight.com>
+Date: Sat, 14 Aug 2021 00:40:38 +0200
+Subject: dm crypt: Avoid percpu_counter spinlock contention in crypt_page_alloc()
+
+From: Arne Welzel <arne.welzel@corelight.com>
+
+commit 528b16bfc3ae5f11638e71b3b63a81f9999df727 upstream.
+
+On systems with many cores using dm-crypt, heavy spinlock contention in
+percpu_counter_compare() can be observed when the page allocation limit
+for a given device is reached or close to be reached. This is due
+to percpu_counter_compare() taking a spinlock to compute an exact
+result on potentially many CPUs at the same time.
+
+Switch to non-exact comparison of allocated and allowed pages by using
+the value returned by percpu_counter_read_positive() to avoid taking
+the percpu_counter spinlock.
+
+This may over/under estimate the actual number of allocated pages by at
+most (batch-1) * num_online_cpus().
+
+Currently, batch is bounded by 32. The system on which this issue was
+first observed has 256 CPUs and 512GB of RAM. With a 4k page size, this
+change may over/under estimate by 31MB. With ~10G (2%) allowed dm-crypt
+allocations, this seems an acceptable error. Certainly preferred over
+running into the spinlock contention.
+
+This behavior was reproduced on an EC2 c5.24xlarge instance with 96 CPUs
+and 192GB RAM as follows, but can be provoked on systems with fewer CPUs
+as well.
+
+ * Disable swap
+ * Tune vm settings to promote regular writeback
+     $ echo 50 > /proc/sys/vm/dirty_expire_centisecs
+     $ echo 25 > /proc/sys/vm/dirty_writeback_centisecs
+     $ echo $((128 * 1024 * 1024)) > /proc/sys/vm/dirty_background_bytes
+
+ * Create 8 dmcrypt devices based on files on a tmpfs
+ * Create and mount an ext4 filesystem on each crypt device
+ * Run stress-ng --hdd 8 within one of the above filesystems
+
+Total %system usage collected from sysstat goes to ~35%. Write throughput
+on the underlying loop device is ~2GB/s. perf profiling an individual
+kworker kcryptd thread shows the following profile, indicating spinlock
+contention in percpu_counter_compare():
+
+    99.98%     0.00%  kworker/u193:46  [kernel.kallsyms]  [k] ret_from_fork
+      |
+      --ret_from_fork
+        kthread
+        worker_thread
+        |
+         --99.92%--process_one_work
+            |
+            |--80.52%--kcryptd_crypt
+            |    |
+            |    |--62.58%--mempool_alloc
+            |    |  |
+            |    |   --62.24%--crypt_page_alloc
+            |    |     |
+            |    |      --61.51%--__percpu_counter_compare
+            |    |        |
+            |    |         --61.34%--__percpu_counter_sum
+            |    |           |
+            |    |           |--58.68%--_raw_spin_lock_irqsave
+            |    |           |  |
+            |    |           |   --58.30%--native_queued_spin_lock_slowpath
+            |    |           |
+            |    |            --0.69%--cpumask_next
+            |    |                |
+            |    |                 --0.51%--_find_next_bit
+            |    |
+            |    |--10.61%--crypt_convert
+            |    |          |
+            |    |          |--6.05%--xts_crypt
+            ...
+
+After applying this patch and running the same test, %system usage is
+lowered to ~7% and write throughput on the loop device increases
+to ~2.7GB/s. perf report shows mempool_alloc() as ~8% rather than ~62%
+in the profile and not hitting the percpu_counter() spinlock anymore.
+
+    |--8.15%--mempool_alloc
+    |    |
+    |    |--3.93%--crypt_page_alloc
+    |    |    |
+    |    |     --3.75%--__alloc_pages
+    |    |         |
+    |    |          --3.62%--get_page_from_freelist
+    |    |              |
+    |    |               --3.22%--rmqueue_bulk
+    |    |                   |
+    |    |                    --2.59%--_raw_spin_lock
+    |    |                      |
+    |    |                       --2.57%--native_queued_spin_lock_slowpath
+    |    |
+    |     --3.05%--_raw_spin_lock_irqsave
+    |               |
+    |                --2.49%--native_queued_spin_lock_slowpath
+
+Suggested-by: DJ Gregor <dj@corelight.com>
+Reviewed-by: Mikulas Patocka <mpatocka@redhat.com>
+Signed-off-by: Arne Welzel <arne.welzel@corelight.com>
+Fixes: 5059353df86e ("dm crypt: limit the number of allocated pages")
+Cc: stable@vger.kernel.org
+Signed-off-by: Mike Snitzer <snitzer@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/md/dm-crypt.c |    7 ++++++-
+ 1 file changed, 6 insertions(+), 1 deletion(-)
+
+--- a/drivers/md/dm-crypt.c
++++ b/drivers/md/dm-crypt.c
+@@ -2181,7 +2181,12 @@ static void *crypt_page_alloc(gfp_t gfp_
+       struct crypt_config *cc = pool_data;
+       struct page *page;
+-      if (unlikely(percpu_counter_compare(&cc->n_allocated_pages, dm_crypt_pages_per_client) >= 0) &&
++      /*
++       * Note, percpu_counter_read_positive() may over (and under) estimate
++       * the current usage by at most (batch - 1) * num_online_cpus() pages,
++       * but avoids potential spinlock contention of an exact result.
++       */
++      if (unlikely(percpu_counter_read_positive(&cc->n_allocated_pages) >= dm_crypt_pages_per_client) &&
+           likely(gfp_mask & __GFP_NORETRY))
+               return NULL;
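The worst-case drift cited above follows directly from the percpu_counter
batch size; a small stand-alone sketch of that arithmetic (batch, CPU count
and page size are the figures quoted in the message, not values read from a
live system):

    #include <stdio.h>

    int main(void)
    {
        long batch = 32;        /* upper bound on the percpu_counter batch */
        long cpus  = 256;       /* CPUs on the system described above */
        long page  = 4096;      /* 4k page size */

        /* percpu_counter_read_positive() may drift from the exact sum by
         * up to (batch - 1) pages per online CPU.
         */
        long drift_pages = (batch - 1) * cpus;

        printf("max drift: %ld pages (~%ld MiB)\n",
               drift_pages, drift_pages * page / (1024 * 1024));
        return 0;
    }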
diff --git a/queue-4.19/media-rc-loopback-return-number-of-emitters-rather-than-error.patch b/queue-4.19/media-rc-loopback-return-number-of-emitters-rather-than-error.patch
new file mode 100644 (file)
index 0000000..a0a30e2
--- /dev/null
@@ -0,0 +1,31 @@
+From 6b7f554be8c92319d7e6df92fd247ebb9beb4a45 Mon Sep 17 00:00:00 2001
+From: Sean Young <sean@mess.org>
+Date: Sat, 3 Jul 2021 15:37:17 +0200
+Subject: media: rc-loopback: return number of emitters rather than error
+
+From: Sean Young <sean@mess.org>
+
+commit 6b7f554be8c92319d7e6df92fd247ebb9beb4a45 upstream.
+
+The LIRC_SET_TRANSMITTER_MASK ioctl should return the number of emitters
+if an invalid list was set.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Sean Young <sean@mess.org>
+Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/media/rc/rc-loopback.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/media/rc/rc-loopback.c
++++ b/drivers/media/rc/rc-loopback.c
+@@ -52,7 +52,7 @@ static int loop_set_tx_mask(struct rc_de
+       if ((mask & (RXMASK_REGULAR | RXMASK_LEARNING)) != mask) {
+               dprintk("invalid tx mask: %u\n", mask);
+-              return -EINVAL;
++              return 2;
+       }
+       dprintk("setting tx mask: %u\n", mask);
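From user space, the contract restored by this patch looks roughly like the
sketch below: a mask that selects non-existent transmitters makes
LIRC_SET_TRANSMITTER_MASK return the emitter count instead of -EINVAL (the
/dev/lirc0 path and the all-ones mask are assumptions for illustration):

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/lirc.h>

    int main(void)
    {
        int fd = open("/dev/lirc0", O_RDWR);
        if (fd < 0)
            return 1;

        unsigned int mask = 0xffffffff;   /* deliberately selects bogus emitters */
        int ret = ioctl(fd, LIRC_SET_TRANSMITTER_MASK, &mask);
        if (ret > 0)
            printf("invalid mask: device has %d emitters\n", ret);
        return 0;
    }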
diff --git a/queue-4.19/media-uvc-don-t-do-dma-on-stack.patch b/queue-4.19/media-uvc-don-t-do-dma-on-stack.patch
new file mode 100644 (file)
index 0000000..1127d20
--- /dev/null
@@ -0,0 +1,96 @@
+From 1a10d7fdb6d0e235e9d230916244cc2769d3f170 Mon Sep 17 00:00:00 2001
+From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
+Date: Thu, 17 Jun 2021 14:33:29 +0200
+Subject: media: uvc: don't do DMA on stack
+
+From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
+
+commit 1a10d7fdb6d0e235e9d230916244cc2769d3f170 upstream.
+
+As warned by smatch:
+       drivers/media/usb/uvc/uvc_v4l2.c:911 uvc_ioctl_g_input() error: doing dma on the stack (&i)
+       drivers/media/usb/uvc/uvc_v4l2.c:943 uvc_ioctl_s_input() error: doing dma on the stack (&i)
+
+These two functions call uvc_query_ctrl() passing a pointer to
+data on the stack, which is then used to send URBs via
+usb_control_msg(). DMA to or from the stack is not supported and
+should not work anymore on modern Linux versions.
+
+So, use a kmalloc'ed buffer.
+
+Cc: stable@vger.kernel.org     # Kernel 4.9 and upper
+Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
+Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/media/usb/uvc/uvc_v4l2.c |   34 +++++++++++++++++++++++-----------
+ 1 file changed, 23 insertions(+), 11 deletions(-)
+
+--- a/drivers/media/usb/uvc/uvc_v4l2.c
++++ b/drivers/media/usb/uvc/uvc_v4l2.c
+@@ -900,8 +900,8 @@ static int uvc_ioctl_g_input(struct file
+ {
+       struct uvc_fh *handle = fh;
+       struct uvc_video_chain *chain = handle->chain;
++      u8 *buf;
+       int ret;
+-      u8 i;
+       if (chain->selector == NULL ||
+           (chain->dev->quirks & UVC_QUIRK_IGNORE_SELECTOR_UNIT)) {
+@@ -909,22 +909,27 @@ static int uvc_ioctl_g_input(struct file
+               return 0;
+       }
++      buf = kmalloc(1, GFP_KERNEL);
++      if (!buf)
++              return -ENOMEM;
++
+       ret = uvc_query_ctrl(chain->dev, UVC_GET_CUR, chain->selector->id,
+                            chain->dev->intfnum,  UVC_SU_INPUT_SELECT_CONTROL,
+-                           &i, 1);
+-      if (ret < 0)
+-              return ret;
++                           buf, 1);
++      if (!ret)
++              *input = *buf - 1;
+-      *input = i - 1;
+-      return 0;
++      kfree(buf);
++
++      return ret;
+ }
+ static int uvc_ioctl_s_input(struct file *file, void *fh, unsigned int input)
+ {
+       struct uvc_fh *handle = fh;
+       struct uvc_video_chain *chain = handle->chain;
++      u8 *buf;
+       int ret;
+-      u32 i;
+       ret = uvc_acquire_privileges(handle);
+       if (ret < 0)
+@@ -940,10 +945,17 @@ static int uvc_ioctl_s_input(struct file
+       if (input >= chain->selector->bNrInPins)
+               return -EINVAL;
+-      i = input + 1;
+-      return uvc_query_ctrl(chain->dev, UVC_SET_CUR, chain->selector->id,
+-                            chain->dev->intfnum, UVC_SU_INPUT_SELECT_CONTROL,
+-                            &i, 1);
++      buf = kmalloc(1, GFP_KERNEL);
++      if (!buf)
++              return -ENOMEM;
++
++      *buf = input + 1;
++      ret = uvc_query_ctrl(chain->dev, UVC_SET_CUR, chain->selector->id,
++                           chain->dev->intfnum, UVC_SU_INPUT_SELECT_CONTROL,
++                           buf, 1);
++      kfree(buf);
++
++      return ret;
+ }
+ static int uvc_ioctl_queryctrl(struct file *file, void *fh,
diff --git a/queue-4.19/power-supply-max17042-handle-fails-of-reading-status-register.patch b/queue-4.19/power-supply-max17042-handle-fails-of-reading-status-register.patch
new file mode 100644 (file)
index 0000000..d0c6dab
--- /dev/null
@@ -0,0 +1,43 @@
+From 54784ffa5b267f57161eb8fbb811499f22a0a0bf Mon Sep 17 00:00:00 2001
+From: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
+Date: Mon, 16 Aug 2021 10:27:14 +0200
+Subject: power: supply: max17042: handle fails of reading status register
+
+From: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
+
+commit 54784ffa5b267f57161eb8fbb811499f22a0a0bf upstream.
+
+Reading the status register can fail in the interrupt handler.  In that
+case, regmap_read() will not store anything useful in the passed
+'val' variable, and a random stack value will be used to determine the
+type of interrupt.
+
+Handle regmap_read() failure to avoid acting on the interrupt type and
+triggering a power supply changed event based on a random stack value.
+
+Fixes: 39e7213edc4f ("max17042_battery: Support regmap to access device's registers")
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
+Reviewed-by: Hans de Goede <hdegoede@redhat.com>
+Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/power/supply/max17042_battery.c |    6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+--- a/drivers/power/supply/max17042_battery.c
++++ b/drivers/power/supply/max17042_battery.c
+@@ -856,8 +856,12 @@ static irqreturn_t max17042_thread_handl
+ {
+       struct max17042_chip *chip = dev;
+       u32 val;
++      int ret;
++
++      ret = regmap_read(chip->regmap, MAX17042_STATUS, &val);
++      if (ret)
++              return IRQ_HANDLED;
+-      regmap_read(chip->regmap, MAX17042_STATUS, &val);
+       if ((val & STATUS_INTR_SOCMIN_BIT) ||
+               (val & STATUS_INTR_SOCMAX_BIT)) {
+               dev_info(&chip->client->dev, "SOC threshold INTR\n");
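The same defensive pattern in isolation: an IRQ thread must not act on 'val'
unless regmap_read() actually succeeded. A minimal sketch (the register
offset and handler name are placeholders, not driver definitions):

    #include <linux/interrupt.h>
    #include <linux/regmap.h>

    #define EXAMPLE_STATUS_REG 0x00   /* placeholder register address */

    static irqreturn_t example_thread_handler(int irq, void *data)
    {
        struct regmap *map = data;
        unsigned int val;

        if (regmap_read(map, EXAMPLE_STATUS_REG, &val))
            return IRQ_HANDLED;       /* read failed: 'val' was never written */

        /* ... only here is it safe to test bits in 'val' ... */
        return IRQ_HANDLED;
    }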
diff --git a/queue-4.19/selftests-bpf-fix-tests-due-to-const-spill-fill.patch b/queue-4.19/selftests-bpf-fix-tests-due-to-const-spill-fill.patch
new file mode 100644 (file)
index 0000000..38805e9
--- /dev/null
@@ -0,0 +1,192 @@
+From foo@baz Wed Sep 15 02:37:47 PM CEST 2021
+From: Ovidiu Panait <ovidiu.panait@windriver.com>
+Date: Mon, 13 Sep 2021 18:35:33 +0300
+Subject: selftests/bpf: fix tests due to const spill/fill
+To: stable@vger.kernel.org
+Cc: bpf@vger.kernel.org, daniel@iogearbox.net
+Message-ID: <20210913153537.2162465-10-ovidiu.panait@windriver.com>
+
+From: Alexei Starovoitov <ast@kernel.org>
+
+commit fc559a70d57c6ee5443f7a750858503e94cdc941 upstream.
+
+Fix tests that incorrectly assumed that the verifier
+cannot track constants through the stack.
+
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Acked-by: Andrii Nakryiko <andriin@fb.com>
+Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
+[OP: backport to 4.19]
+Signed-off-by: Ovidiu Panait <ovidiu.panait@windriver.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ tools/testing/selftests/bpf/test_verifier.c |   31 +++++++++++++++-------------
+ 1 file changed, 17 insertions(+), 14 deletions(-)
+
+--- a/tools/testing/selftests/bpf/test_verifier.c
++++ b/tools/testing/selftests/bpf/test_verifier.c
+@@ -3888,7 +3888,8 @@ static struct bpf_test tests[] = {
+                                   offsetof(struct __sk_buff, data)),
+                       BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+                                   offsetof(struct __sk_buff, data_end)),
+-                      BPF_MOV64_IMM(BPF_REG_0, 0xffffffff),
++                      BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
++                          offsetof(struct __sk_buff, mark)),
+                       BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8),
+                       BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8),
+                       BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xffff),
+@@ -6560,9 +6561,9 @@ static struct bpf_test tests[] = {
+       {
+               "helper access to variable memory: stack, bitwise AND, zero included",
+               .insns = {
++                      BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 8),
+                       BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64),
+-                      BPF_MOV64_IMM(BPF_REG_2, 16),
+                       BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128),
+                       BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128),
+                       BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 64),
+@@ -6577,9 +6578,9 @@ static struct bpf_test tests[] = {
+       {
+               "helper access to variable memory: stack, bitwise AND + JMP, wrong max",
+               .insns = {
++                      BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 8),
+                       BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64),
+-                      BPF_MOV64_IMM(BPF_REG_2, 16),
+                       BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128),
+                       BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128),
+                       BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 65),
+@@ -6653,9 +6654,9 @@ static struct bpf_test tests[] = {
+       {
+               "helper access to variable memory: stack, JMP, bounds + offset",
+               .insns = {
++                      BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 8),
+                       BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64),
+-                      BPF_MOV64_IMM(BPF_REG_2, 16),
+                       BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128),
+                       BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128),
+                       BPF_JMP_IMM(BPF_JGT, BPF_REG_2, 64, 5),
+@@ -6674,9 +6675,9 @@ static struct bpf_test tests[] = {
+       {
+               "helper access to variable memory: stack, JMP, wrong max",
+               .insns = {
++                      BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 8),
+                       BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64),
+-                      BPF_MOV64_IMM(BPF_REG_2, 16),
+                       BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128),
+                       BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128),
+                       BPF_JMP_IMM(BPF_JGT, BPF_REG_2, 65, 4),
+@@ -6694,9 +6695,9 @@ static struct bpf_test tests[] = {
+       {
+               "helper access to variable memory: stack, JMP, no max check",
+               .insns = {
++                      BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 8),
+                       BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64),
+-                      BPF_MOV64_IMM(BPF_REG_2, 16),
+                       BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128),
+                       BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128),
+                       BPF_MOV64_IMM(BPF_REG_4, 0),
+@@ -6714,9 +6715,9 @@ static struct bpf_test tests[] = {
+       {
+               "helper access to variable memory: stack, JMP, no min check",
+               .insns = {
++                      BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 8),
+                       BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64),
+-                      BPF_MOV64_IMM(BPF_REG_2, 16),
+                       BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128),
+                       BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128),
+                       BPF_JMP_IMM(BPF_JGT, BPF_REG_2, 64, 3),
+@@ -6732,9 +6733,9 @@ static struct bpf_test tests[] = {
+       {
+               "helper access to variable memory: stack, JMP (signed), no min check",
+               .insns = {
++                      BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 8),
+                       BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64),
+-                      BPF_MOV64_IMM(BPF_REG_2, 16),
+                       BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128),
+                       BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128),
+                       BPF_JMP_IMM(BPF_JSGT, BPF_REG_2, 64, 3),
+@@ -6776,6 +6777,7 @@ static struct bpf_test tests[] = {
+       {
+               "helper access to variable memory: map, JMP, wrong max",
+               .insns = {
++                      BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 8),
+                       BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+                       BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),
+@@ -6783,7 +6785,7 @@ static struct bpf_test tests[] = {
+                       BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+                       BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 10),
+                       BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+-                      BPF_MOV64_IMM(BPF_REG_2, sizeof(struct test_val)),
++                      BPF_MOV64_REG(BPF_REG_2, BPF_REG_6),
+                       BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -128),
+                       BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -128),
+                       BPF_JMP_IMM(BPF_JSGT, BPF_REG_2,
+@@ -6795,7 +6797,7 @@ static struct bpf_test tests[] = {
+                       BPF_MOV64_IMM(BPF_REG_0, 0),
+                       BPF_EXIT_INSN(),
+               },
+-              .fixup_map2 = { 3 },
++              .fixup_map2 = { 4 },
+               .errstr = "invalid access to map value, value_size=48 off=0 size=49",
+               .result = REJECT,
+               .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+@@ -6830,6 +6832,7 @@ static struct bpf_test tests[] = {
+       {
+               "helper access to variable memory: map adjusted, JMP, wrong max",
+               .insns = {
++                      BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 8),
+                       BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+                       BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),
+@@ -6838,7 +6841,7 @@ static struct bpf_test tests[] = {
+                       BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 11),
+                       BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 20),
+-                      BPF_MOV64_IMM(BPF_REG_2, sizeof(struct test_val)),
++                      BPF_MOV64_REG(BPF_REG_2, BPF_REG_6),
+                       BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -128),
+                       BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -128),
+                       BPF_JMP_IMM(BPF_JSGT, BPF_REG_2,
+@@ -6850,7 +6853,7 @@ static struct bpf_test tests[] = {
+                       BPF_MOV64_IMM(BPF_REG_0, 0),
+                       BPF_EXIT_INSN(),
+               },
+-              .fixup_map2 = { 3 },
++              .fixup_map2 = { 4 },
+               .errstr = "R1 min value is outside of the array range",
+               .result = REJECT,
+               .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+@@ -6872,8 +6875,8 @@ static struct bpf_test tests[] = {
+       {
+               "helper access to variable memory: size > 0 not allowed on NULL (ARG_PTR_TO_MEM_OR_NULL)",
+               .insns = {
++                      BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0),
+                       BPF_MOV64_IMM(BPF_REG_1, 0),
+-                      BPF_MOV64_IMM(BPF_REG_2, 1),
+                       BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -128),
+                       BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -128),
+                       BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 64),
+@@ -7100,6 +7103,7 @@ static struct bpf_test tests[] = {
+       {
+               "helper access to variable memory: 8 bytes leak",
+               .insns = {
++                      BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 8),
+                       BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64),
+                       BPF_MOV64_IMM(BPF_REG_0, 0),
+@@ -7110,7 +7114,6 @@ static struct bpf_test tests[] = {
+                       BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -24),
+                       BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16),
+                       BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8),
+-                      BPF_MOV64_IMM(BPF_REG_2, 1),
+                       BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -128),
+                       BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -128),
+                       BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 63),
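The reason these hunks swap BPF_MOV64_IMM for a context load: after the
spill/fill tracking change, a constant stored to the stack and loaded back is
still a known constant, so it no longer exercises the unknown-size paths
these tests target. A small sketch using the same insn macros as
test_verifier.c (the selftests' bpf_insn/filter macro environment is
assumed):

    /* Before: R2 stays a known constant even across a stack round-trip. */
    struct bpf_insn known_constant[] = {
        BPF_MOV64_IMM(BPF_REG_2, 16),                      /* R2 = 16 */
        BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -128),  /* spill to fp-128 */
        BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -128),  /* fill: still 16 */
    };

    /* After: load a context field instead, so R2 is genuinely unknown. */
    struct bpf_insn unknown_value[] = {
        BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 8),
    };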
diff --git a/queue-4.19/selftests-bpf-test-variable-offset-stack-access.patch b/queue-4.19/selftests-bpf-test-variable-offset-stack-access.patch
new file mode 100644 (file)
index 0000000..f644bea
--- /dev/null
@@ -0,0 +1,132 @@
+From foo@baz Wed Sep 15 02:37:47 PM CEST 2021
+From: Ovidiu Panait <ovidiu.panait@windriver.com>
+Date: Mon, 13 Sep 2021 18:35:31 +0300
+Subject: selftests/bpf: Test variable offset stack access
+To: stable@vger.kernel.org
+Cc: bpf@vger.kernel.org, daniel@iogearbox.net
+Message-ID: <20210913153537.2162465-8-ovidiu.panait@windriver.com>
+
+From: Andrey Ignatov <rdna@fb.com>
+
+commit 8ff80e96e3ccea5ff0a890d4f18997e0344dbec2 upstream.
+
+Test different scenarios of indirect variable-offset stack access: out of
+bound access (>0), min_off below initialized part of the stack,
+max_off+size above initialized part of the stack, initialized stack.
+
+Example of output:
+  ...
+  #856/p indirect variable-offset stack access, out of bound OK
+  #857/p indirect variable-offset stack access, max_off+size > max_initialized OK
+  #858/p indirect variable-offset stack access, min_off < min_initialized OK
+  #859/p indirect variable-offset stack access, ok OK
+  ...
+
+Signed-off-by: Andrey Ignatov <rdna@fb.com>
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+[OP: backport to 4.19]
+Signed-off-by: Ovidiu Panait <ovidiu.panait@windriver.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ tools/testing/selftests/bpf/test_verifier.c |   79 +++++++++++++++++++++++++++-
+ 1 file changed, 77 insertions(+), 2 deletions(-)
+
+--- a/tools/testing/selftests/bpf/test_verifier.c
++++ b/tools/testing/selftests/bpf/test_verifier.c
+@@ -8495,7 +8495,7 @@ static struct bpf_test tests[] = {
+               .prog_type = BPF_PROG_TYPE_LWT_IN,
+       },
+       {
+-              "indirect variable-offset stack access",
++              "indirect variable-offset stack access, out of bound",
+               .insns = {
+                       /* Fill the top 8 bytes of the stack */
+                       BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+@@ -8516,11 +8516,86 @@ static struct bpf_test tests[] = {
+                       BPF_EXIT_INSN(),
+               },
+               .fixup_map1 = { 5 },
+-              .errstr = "variable stack read R2",
++              .errstr = "invalid stack type R2 var_off",
+               .result = REJECT,
+               .prog_type = BPF_PROG_TYPE_LWT_IN,
+       },
+       {
++              "indirect variable-offset stack access, max_off+size > max_initialized",
++              .insns = {
++              /* Fill only the second from top 8 bytes of the stack. */
++              BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, 0),
++              /* Get an unknown value. */
++              BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0),
++              /* Make it small and 4-byte aligned. */
++              BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 4),
++              BPF_ALU64_IMM(BPF_SUB, BPF_REG_2, 16),
++              /* Add it to fp.  We now have either fp-12 or fp-16, but we don't know
++               * which. fp-12 size 8 is partially uninitialized stack.
++               */
++              BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_10),
++              /* Dereference it indirectly. */
++              BPF_LD_MAP_FD(BPF_REG_1, 0),
++              BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
++              BPF_MOV64_IMM(BPF_REG_0, 0),
++              BPF_EXIT_INSN(),
++              },
++              .fixup_map1 = { 5 },
++              .errstr = "invalid indirect read from stack var_off",
++              .result = REJECT,
++              .prog_type = BPF_PROG_TYPE_LWT_IN,
++      },
++      {
++              "indirect variable-offset stack access, min_off < min_initialized",
++              .insns = {
++              /* Fill only the top 8 bytes of the stack. */
++              BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
++              /* Get an unknown value */
++              BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0),
++              /* Make it small and 4-byte aligned. */
++              BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 4),
++              BPF_ALU64_IMM(BPF_SUB, BPF_REG_2, 16),
++              /* Add it to fp.  We now have either fp-12 or fp-16, but we don't know
++               * which. fp-16 size 8 is partially uninitialized stack.
++               */
++              BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_10),
++              /* Dereference it indirectly. */
++              BPF_LD_MAP_FD(BPF_REG_1, 0),
++              BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
++              BPF_MOV64_IMM(BPF_REG_0, 0),
++              BPF_EXIT_INSN(),
++              },
++              .fixup_map1 = { 5 },
++              .errstr = "invalid indirect read from stack var_off",
++              .result = REJECT,
++              .prog_type = BPF_PROG_TYPE_LWT_IN,
++      },
++      {
++              "indirect variable-offset stack access, ok",
++              .insns = {
++              /* Fill the top 16 bytes of the stack. */
++              BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, 0),
++              BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
++              /* Get an unknown value. */
++              BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0),
++              /* Make it small and 4-byte aligned. */
++              BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 4),
++              BPF_ALU64_IMM(BPF_SUB, BPF_REG_2, 16),
++              /* Add it to fp.  We now have either fp-12 or fp-16, we don't know
++               * which, but either way it points to initialized stack.
++               */
++              BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_10),
++              /* Dereference it indirectly. */
++              BPF_LD_MAP_FD(BPF_REG_1, 0),
++              BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
++              BPF_MOV64_IMM(BPF_REG_0, 0),
++              BPF_EXIT_INSN(),
++              },
++              .fixup_map1 = { 6 },
++              .result = ACCEPT,
++              .prog_type = BPF_PROG_TYPE_LWT_IN,
++      },
++      {
+               "direct stack access with 32-bit wraparound. test1",
+               .insns = {
+                       BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
index 7860e5d7774c02cd8ee83530da0882bb655f424d..e4562c9525e9317192f861135c89999d4a9e5933 100644 (file)
@@ -126,3 +126,26 @@ powerpc-perf-hv-gpci-fix-counter-value-parsing.patch
 xen-fix-setting-of-max_pfn-in-shared_info.patch
 include-linux-list.h-add-a-macro-to-test-if-entry-is-pointing-to-the-head.patch
 9p-xen-fix-end-of-loop-tests-for-list_for_each_entry.patch
+bpf-verifier-per-register-parent-pointers.patch
+bpf-correct-slot_type-marking-logic-to-allow-more-stack-slot-sharing.patch
+bpf-support-variable-offset-stack-access-from-helpers.patch
+bpf-reject-indirect-var_off-stack-access-in-raw-mode.patch
+bpf-reject-indirect-var_off-stack-access-in-unpriv-mode.patch
+bpf-sanity-check-max-value-for-var_off-stack-access.patch
+selftests-bpf-test-variable-offset-stack-access.patch
+bpf-track-spill-fill-of-constants.patch
+selftests-bpf-fix-tests-due-to-const-spill-fill.patch
+bpf-introduce-bpf-nospec-instruction-for-mitigating-spectre-v4.patch
+bpf-fix-leakage-due-to-insufficient-speculative-store-bypass-mitigation.patch
+bpf-verifier-allocate-idmap-scratch-in-verifier-env.patch
+bpf-fix-pointer-arithmetic-mask-tightening-under-state-pruning.patch
+tools-thermal-tmon-add-cross-compiling-support.patch
+soc-aspeed-lpc-ctrl-fix-boundary-check-for-mmap.patch
+arm64-head-avoid-over-mapping-in-map_memory.patch
+crypto-public_key-fix-overflow-during-implicit-conversion.patch
+block-bfq-fix-bfq_set_next_ioprio_data.patch
+power-supply-max17042-handle-fails-of-reading-status-register.patch
+dm-crypt-avoid-percpu_counter-spinlock-contention-in-crypt_page_alloc.patch
+vmci-fix-null-pointer-dereference-when-unmapping-queue-pair.patch
+media-uvc-don-t-do-dma-on-stack.patch
+media-rc-loopback-return-number-of-emitters-rather-than-error.patch
diff --git a/queue-4.19/soc-aspeed-lpc-ctrl-fix-boundary-check-for-mmap.patch b/queue-4.19/soc-aspeed-lpc-ctrl-fix-boundary-check-for-mmap.patch
new file mode 100644 (file)
index 0000000..b506f2e
--- /dev/null
@@ -0,0 +1,38 @@
+From b49a0e69a7b1a68c8d3f64097d06dabb770fec96 Mon Sep 17 00:00:00 2001
+From: Iwona Winiarska <iwona.winiarska@intel.com>
+Date: Wed, 4 Aug 2021 01:48:18 +0200
+Subject: soc: aspeed: lpc-ctrl: Fix boundary check for mmap
+
+From: Iwona Winiarska <iwona.winiarska@intel.com>
+
+commit b49a0e69a7b1a68c8d3f64097d06dabb770fec96 upstream.
+
+The check mixes pages (vm_pgoff) with bytes (vm_start, vm_end) on one
+side of the comparison, and uses resource address (rather than just the
+resource size) on the other side of the comparison.
+This can allow malicious userspace to easily bypass the boundary check and
+map pages that are located outside memory-region reserved by the driver.
+
+Fixes: 6c4e97678501 ("drivers/misc: Add Aspeed LPC control driver")
+Cc: stable@vger.kernel.org
+Signed-off-by: Iwona Winiarska <iwona.winiarska@intel.com>
+Reviewed-by: Andrew Jeffery <andrew@aj.id.au>
+Tested-by: Andrew Jeffery <andrew@aj.id.au>
+Reviewed-by: Joel Stanley <joel@aj.id.au>
+Signed-off-by: Joel Stanley <joel@jms.id.au>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/misc/aspeed-lpc-ctrl.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/misc/aspeed-lpc-ctrl.c
++++ b/drivers/misc/aspeed-lpc-ctrl.c
+@@ -50,7 +50,7 @@ static int aspeed_lpc_ctrl_mmap(struct f
+       unsigned long vsize = vma->vm_end - vma->vm_start;
+       pgprot_t prot = vma->vm_page_prot;
+-      if (vma->vm_pgoff + vsize > lpc_ctrl->mem_base + lpc_ctrl->mem_size)
++      if (vma->vm_pgoff + vma_pages(vma) > lpc_ctrl->mem_size >> PAGE_SHIFT)
+               return -EINVAL;
+       /* ast2400/2500 AHB accesses are not cache coherent */
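A worked example of the unit mix-up described above, with everything
converted to pages the way the fixed check does (the region size and offsets
are made-up numbers, not values from a real BMC):

    #include <stdio.h>

    int main(void)
    {
        unsigned long page_shift = 12;            /* 4k pages */
        unsigned long mem_size   = 16ul << 20;    /* 16 MiB reserved region */
        unsigned long vm_pgoff   = 5000;          /* mmap offset, in PAGES */
        unsigned long vsize      = 8ul << 20;     /* requested size, in BYTES */

        /* Fixed check: compare pages with pages. */
        unsigned long want = vm_pgoff + (vsize >> page_shift);   /* 7048 pages */
        unsigned long have = mem_size >> page_shift;             /* 4096 pages */

        printf("%s\n", want > have ? "reject mapping" : "allow mapping");
        return 0;
    }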
diff --git a/queue-4.19/tools-thermal-tmon-add-cross-compiling-support.patch b/queue-4.19/tools-thermal-tmon-add-cross-compiling-support.patch
new file mode 100644 (file)
index 0000000..7db4d0c
--- /dev/null
@@ -0,0 +1,34 @@
+From b5f7912bb604b47a0fe024560488a7556dce8ee7 Mon Sep 17 00:00:00 2001
+From: Rolf Eike Beer <eb@emlix.com>
+Date: Fri, 30 Jul 2021 13:51:54 +0200
+Subject: tools/thermal/tmon: Add cross compiling support
+
+From: Rolf Eike Beer <eb@emlix.com>
+
+commit b5f7912bb604b47a0fe024560488a7556dce8ee7 upstream.
+
+Default to prefixed pkg-config when crosscompiling, this matches what
+other parts of the tools/ directory already do.
+
+[dlezcano] : Reworked description
+
+Signed-off-by: Rolf Eike Beer <eb@emlix.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
+Link: https://lore.kernel.org/r/31302992.qZodDJZGDc@devpool47
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ tools/thermal/tmon/Makefile |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/tools/thermal/tmon/Makefile
++++ b/tools/thermal/tmon/Makefile
+@@ -10,7 +10,7 @@ CFLAGS+= -O1 ${WARNFLAGS}
+ # Add "-fstack-protector" only if toolchain supports it.
+ CFLAGS+= $(call cc-option,-fstack-protector)
+ CC?= $(CROSS_COMPILE)gcc
+-PKG_CONFIG?= pkg-config
++PKG_CONFIG?= $(CROSS_COMPILE)pkg-config
+ CFLAGS+=-D VERSION=\"$(VERSION)\"
+ LDFLAGS+=
diff --git a/queue-4.19/vmci-fix-null-pointer-dereference-when-unmapping-queue-pair.patch b/queue-4.19/vmci-fix-null-pointer-dereference-when-unmapping-queue-pair.patch
new file mode 100644 (file)
index 0000000..e4ae52b
--- /dev/null
@@ -0,0 +1,79 @@
+From a30dc6cf0dc51419021550152e435736aaef8799 Mon Sep 17 00:00:00 2001
+From: Wang Hai <wanghai38@huawei.com>
+Date: Wed, 18 Aug 2021 20:48:45 +0800
+Subject: VMCI: fix NULL pointer dereference when unmapping queue pair
+
+From: Wang Hai <wanghai38@huawei.com>
+
+commit a30dc6cf0dc51419021550152e435736aaef8799 upstream.
+
+I got a NULL pointer dereference report when doing fuzz test:
+
+Call Trace:
+  qp_release_pages+0xae/0x130
+  qp_host_unregister_user_memory.isra.25+0x2d/0x80
+  vmci_qp_broker_unmap+0x191/0x320
+  ? vmci_host_do_alloc_queuepair.isra.9+0x1c0/0x1c0
+  vmci_host_unlocked_ioctl+0x59f/0xd50
+  ? do_vfs_ioctl+0x14b/0xa10
+  ? tomoyo_file_ioctl+0x28/0x30
+  ? vmci_host_do_alloc_queuepair.isra.9+0x1c0/0x1c0
+  __x64_sys_ioctl+0xea/0x120
+  do_syscall_64+0x34/0xb0
+  entry_SYSCALL_64_after_hwframe+0x44/0xae
+
+When a queue pair is created by the following call, it will not
+register the user memory if the page_store is NULL, and the
+entry->state will be set to VMCIQPB_CREATED_NO_MEM.
+
+vmci_host_unlocked_ioctl
+  vmci_host_do_alloc_queuepair
+    vmci_qp_broker_alloc
+      qp_broker_alloc
+        qp_broker_create // set entry->state = VMCIQPB_CREATED_NO_MEM;
+
+When unmapping this queue pair, qp_host_unregister_user_memory() will
+be called to unregister the non-existent user memory, which results
+in a NULL pointer dereference. It also changes the state from
+VMCIQPB_CREATED_NO_MEM to VMCIQPB_CREATED_MEM, which should not
+happen during this operation.
+
+The user memory should only be unregistered on unmap when the qp
+broker has memory, and only registered on map when the qp broker
+has no memory.
+
+Fixes: 06164d2b72aa ("VMCI: queue pairs implementation.")
+Cc: stable <stable@vger.kernel.org>
+Reported-by: Hulk Robot <hulkci@huawei.com>
+Reviewed-by: Jorgen Hansen <jhansen@vmware.com>
+Signed-off-by: Wang Hai <wanghai38@huawei.com>
+Link: https://lore.kernel.org/r/20210818124845.488312-1-wanghai38@huawei.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/misc/vmw_vmci/vmci_queue_pair.c |    6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+--- a/drivers/misc/vmw_vmci/vmci_queue_pair.c
++++ b/drivers/misc/vmw_vmci/vmci_queue_pair.c
+@@ -2249,7 +2249,8 @@ int vmci_qp_broker_map(struct vmci_handl
+       result = VMCI_SUCCESS;
+-      if (context_id != VMCI_HOST_CONTEXT_ID) {
++      if (context_id != VMCI_HOST_CONTEXT_ID &&
++          !QPBROKERSTATE_HAS_MEM(entry)) {
+               struct vmci_qp_page_store page_store;
+               page_store.pages = guest_mem;
+@@ -2356,7 +2357,8 @@ int vmci_qp_broker_unmap(struct vmci_han
+               goto out;
+       }
+-      if (context_id != VMCI_HOST_CONTEXT_ID) {
++      if (context_id != VMCI_HOST_CONTEXT_ID &&
++          QPBROKERSTATE_HAS_MEM(entry)) {
+               qp_acquire_queue_mutex(entry->produce_q);
+               result = qp_save_headers(entry);
+               if (result < VMCI_SUCCESS)