--- /dev/null
+From pulehui@huaweicloud.com Fri Aug 4 17:25:38 2023
+From: Pu Lehui <pulehui@huaweicloud.com>
+Date: Fri, 4 Aug 2023 23:24:56 +0800
+Subject: bpf: aggressively forget precise markings during state checkpointing
+To: stable@vger.kernel.org, Greg KH <greg@kroah.com>, Eduard Zingerman <eddyz87@gmail.com>, Luiz Capitulino <luizcap@amazon.com>
+Cc: Alexei Starovoitov <ast@kernel.org>, Daniel Borkmann <daniel@iogearbox.net>, Andrii Nakryiko <andrii@kernel.org>, Pu Lehui <pulehui@huawei.com>, Pu Lehui <pulehui@huaweicloud.com>
+Message-ID: <20230804152459.2565673-4-pulehui@huaweicloud.com>
+
+From: Andrii Nakryiko <andrii@kernel.org>
+
+[ Upstream commit 7a830b53c17bbadcf99f778f28aaaa4e6c41df5f ]
+
+Exploit the property of the about-to-be-checkpointed state to forget all
+precise markings up to that point even more aggressively. We now clear
+all potentially inherited precise markings right before checkpointing
+and branching off into a child state. If any of the child states
+requires precise knowledge of any SCALAR register, those markings will
+be propagated backwards later on, before this state is finalized,
+preserving correctness.
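+
+A condensed sketch of the resulting flow in is_state_visited(), simplified
+from the hunks below (unrelated bookkeeping and error handling omitted):
+
+    /* no compatible explored state was found: checkpoint the current state */
+    if (env->bpf_capable)
+        /* forget precise markings inherited from the parent state */
+        mark_all_scalars_imprecise(env, cur);
+
+    /* the checkpointed copy therefore starts out fully imprecise */
+    new = &new_sl->state;
+    err = copy_verifier_state(new, cur);
+
+    /* if a child state later needs an exact SCALAR range,
+     * __mark_chain_precision() retroactively re-marks the affected
+     * registers in this (now parent) state
+     */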
+
+There is a single selftests BPF program change, but a tremendous one: a
+25x reduction in the number of verified instructions and states in
+trace_virtqueue_add_sgs.
+
+Cilium results are more modest, but span a wider range of programs.
+
+SELFTESTS RESULTS
+=================
+
+$ ./veristat -C -e file,prog,insns,states ~/imprecise-early-results.csv ~/imprecise-aggressive-results.csv | grep -v '+0'
+File Program Total insns (A) Total insns (B) Total insns (DIFF) Total states (A) Total states (B) Total states (DIFF)
+------------------- ----------------------- --------------- --------------- ------------------ ---------------- ---------------- -------------------
+loop6.bpf.linked1.o trace_virtqueue_add_sgs 398057 15114 -382943 (-96.20%) 8717 336 -8381 (-96.15%)
+------------------- ----------------------- --------------- --------------- ------------------ ---------------- ---------------- -------------------
+
+CILIUM RESULTS
+==============
+
+$ ./veristat -C -e file,prog,insns,states ~/imprecise-early-results-cilium.csv ~/imprecise-aggressive-results-cilium.csv | grep -v '+0'
+File Program Total insns (A) Total insns (B) Total insns (DIFF) Total states (A) Total states (B) Total states (DIFF)
+------------- -------------------------------- --------------- --------------- ------------------ ---------------- ---------------- -------------------
+bpf_host.o tail_handle_nat_fwd_ipv4 23426 23221 -205 (-0.88%) 1537 1515 -22 (-1.43%)
+bpf_host.o tail_handle_nat_fwd_ipv6 13009 12904 -105 (-0.81%) 719 708 -11 (-1.53%)
+bpf_host.o tail_nodeport_nat_ingress_ipv6 5261 5196 -65 (-1.24%) 247 243 -4 (-1.62%)
+bpf_host.o tail_nodeport_nat_ipv6_egress 3446 3406 -40 (-1.16%) 203 198 -5 (-2.46%)
+bpf_lxc.o tail_handle_nat_fwd_ipv4 23426 23221 -205 (-0.88%) 1537 1515 -22 (-1.43%)
+bpf_lxc.o tail_handle_nat_fwd_ipv6 13009 12904 -105 (-0.81%) 719 708 -11 (-1.53%)
+bpf_lxc.o tail_ipv4_ct_egress 5074 4897 -177 (-3.49%) 255 248 -7 (-2.75%)
+bpf_lxc.o tail_ipv4_ct_ingress 5100 4923 -177 (-3.47%) 255 248 -7 (-2.75%)
+bpf_lxc.o tail_ipv4_ct_ingress_policy_only 5100 4923 -177 (-3.47%) 255 248 -7 (-2.75%)
+bpf_lxc.o tail_ipv6_ct_egress 4558 4536 -22 (-0.48%) 188 187 -1 (-0.53%)
+bpf_lxc.o tail_ipv6_ct_ingress 4578 4556 -22 (-0.48%) 188 187 -1 (-0.53%)
+bpf_lxc.o tail_ipv6_ct_ingress_policy_only 4578 4556 -22 (-0.48%) 188 187 -1 (-0.53%)
+bpf_lxc.o tail_nodeport_nat_ingress_ipv6 5261 5196 -65 (-1.24%) 247 243 -4 (-1.62%)
+bpf_overlay.o tail_nodeport_nat_ingress_ipv6 5261 5196 -65 (-1.24%) 247 243 -4 (-1.62%)
+bpf_overlay.o tail_nodeport_nat_ipv6_egress 3482 3442 -40 (-1.15%) 204 201 -3 (-1.47%)
+bpf_xdp.o tail_nodeport_nat_egress_ipv4 17200 15619 -1581 (-9.19%) 1111 1010 -101 (-9.09%)
+------------- -------------------------------- --------------- --------------- ------------------ ---------------- ---------------- -------------------
+
+Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
+Link: https://lore.kernel.org/r/20221104163649.121784-6-andrii@kernel.org
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Stable-dep-of: ecdf985d7615 ("bpf: track immediate values written to stack by BPF_ST instruction")
+Signed-off-by: Pu Lehui <pulehui@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/bpf/verifier.c | 37 +++++++++++++++++++++++++++++++++++++
+ 1 file changed, 37 insertions(+)
+
+--- a/kernel/bpf/verifier.c
++++ b/kernel/bpf/verifier.c
+@@ -2407,6 +2407,31 @@ static void mark_all_scalars_precise(str
+ }
+ }
+
++static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
++{
++ struct bpf_func_state *func;
++ struct bpf_reg_state *reg;
++ int i, j;
++
++ for (i = 0; i <= st->curframe; i++) {
++ func = st->frame[i];
++ for (j = 0; j < BPF_REG_FP; j++) {
++ reg = &func->regs[j];
++ if (reg->type != SCALAR_VALUE)
++ continue;
++ reg->precise = false;
++ }
++ for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) {
++ if (!is_spilled_reg(&func->stack[j]))
++ continue;
++ reg = &func->stack[j].spilled_ptr;
++ if (reg->type != SCALAR_VALUE)
++ continue;
++ reg->precise = false;
++ }
++ }
++}
++
+ /*
+ * __mark_chain_precision() backtracks BPF program instruction sequence and
+ * chain of verifier states making sure that register *regno* (if regno >= 0)
+@@ -2485,6 +2510,14 @@ static void mark_all_scalars_precise(str
+ * be imprecise. If any child state does require this register to be precise,
+ * we'll mark it precise later retroactively during precise markings
+ * propagation from child state to parent states.
++ *
++ * Skipping precise marking setting in current state is a mild version of
++ * relying on the above observation. But we can utilize this property even
++ * more aggressively by proactively forgetting any precise marking in the
++ * current state (which we inherited from the parent state), right before we
++ * checkpoint it and branch off into new child state. This is done by
++ * mark_all_scalars_imprecise() to hopefully get more permissive and generic
++ * finalized states which help in short circuiting more future states.
+ */
+ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int regno,
+ int spi)
+@@ -10984,6 +11017,10 @@ next:
+ env->prev_jmps_processed = env->jmps_processed;
+ env->prev_insn_processed = env->insn_processed;
+
++ /* forget precise markings we inherited, see __mark_chain_precision */
++ if (env->bpf_capable)
++ mark_all_scalars_imprecise(env, cur);
++
+ /* add new state to the head of linked list */
+ new = &new_sl->state;
+ err = copy_verifier_state(new, cur);
--- /dev/null
+From pulehui@huaweicloud.com Fri Aug 4 17:25:37 2023
+From: Pu Lehui <pulehui@huaweicloud.com>
+Date: Fri, 4 Aug 2023 23:24:54 +0800
+Subject: bpf: allow precision tracking for programs with subprogs
+To: stable@vger.kernel.org, Greg KH <greg@kroah.com>, Eduard Zingerman <eddyz87@gmail.com>, Luiz Capitulino <luizcap@amazon.com>
+Cc: Alexei Starovoitov <ast@kernel.org>, Daniel Borkmann <daniel@iogearbox.net>, Andrii Nakryiko <andrii@kernel.org>, Pu Lehui <pulehui@huawei.com>, Pu Lehui <pulehui@huaweicloud.com>
+Message-ID: <20230804152459.2565673-2-pulehui@huaweicloud.com>
+
+From: Andrii Nakryiko <andrii@kernel.org>
+
+[ Upstream commit be2ef8161572ec1973124ebc50f56dafc2925e07 ]
+
+Stop forcing precise=true for SCALAR registers when a BPF program has
+any subprograms. The current restriction means that any BPF program, as
+soon as it uses subprograms, loses all of the precision tracking
+benefits in reducing the number of verified states.
+
+This patch keeps the fallback mark_all_scalars_precise() behavior if
+precise marking has to cross function frames. E.g., if a subprogram
+requires R1 (the first input arg) to be marked precise, ideally we'd
+need to backtrack to the parent function and keep marking R1 and its
+dependencies as precise. But right now we give up and force all the
+SCALARs in all of the current and parent states to precise=true. We can
+lift that restriction in the future.
+
+But this patch fixes two issues identified when trying to enable
+precision tracking for subprogs.
+
+First, prevent "escaping" from the top-most state in a global subprog.
+With an entry-level BPF program we never end up requesting precision
+for the R1-R5 registers, because R2-R5 are not initialized (and so not
+readable in a correct BPF program), and R1 is PTR_TO_CTX, not a SCALAR,
+and so is implicitly precise. With global subprogs, though, it's
+different: a global subprog a) can have up to 5 SCALAR input arguments,
+which might get marked as precise=true, and b) is validated in
+isolation from its main entry BPF program. b) means that we can end up
+exhausting the parent state chain and still not mark all registers in
+reg_mask as precise, which would trigger a verifier bug warning.
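+
+For illustration, a hypothetical program (not part of this patch) with a
+global subprog: the subprog is validated in isolation, so its SCALAR
+arguments start out unknown and any precision requested for them has
+nowhere to "escape" to.
+
+    #include <linux/bpf.h>
+    #include <bpf/bpf_helpers.h>
+
+    /* non-static + noinline => global subprog, verified as its own entry
+     * point; 'len' arrives in R1 as an unknown SCALAR
+     */
+    __noinline int clamp_len(int len)
+    {
+        if (len < 0 || len > 64)
+            return 0;
+        return len;
+    }
+
+    SEC("tc")
+    int entry(struct __sk_buff *skb)
+    {
+        return clamp_len(skb->len) > 0;
+    }
+
+    char _license[] SEC("license") = "GPL";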
+
+To handle that, we need to consider two cases. First, if the very first
+state is not immediately "checkpointed" (i.e., stored in the state
+lookup hashtable), it will get correct first_insn_idx and last_insn_idx
+instruction indices set during state checkpointing. As such, this case
+is already handled: __mark_chain_precision() simply does nothing when
+we reach the very first parent state; st->parent will be NULL and we'll
+just stop. Perhaps some extra check for reg_mask and stack_mask is due
+here, but this patch doesn't address that issue.
+
+The more problematic second case is when the global function's initial
+state is immediately checkpointed before we manage to process the very
+first instruction. This happens because, when there is a call to a
+global subprog from the main program, the subprog's very first
+instruction is marked as a pruning point, so before we manage to
+process the first instruction we have to check and checkpoint the
+state. This patch adds special handling for such an "empty" state,
+which is identified by st->last_insn_idx being set to -1. In that case,
+we check that we are indeed validating a global subprog, and with some
+sanity checking we mark the input args as precise if requested.
+
+Note that we also initialize state->first_insn_idx with the correct
+start insn_idx offset. For the main program zero is the correct value,
+but for any subprog it's quite confusing to not have first_insn_idx
+set. This doesn't have any functional impact, but helps with debugging
+and state printing. We also explicitly initialize state->last_insn_idx
+instead of relying on is_state_visited() to do this with
+env->prev_insn_idx, which will be -1 on the very first instruction.
+This concludes the changes necessary to handle precision tracking
+specifically for global subprogs.
+
+The second identified problem was the missed handling of BPF helper
+functions that call into subprogs (e.g., bpf_loop and a few others).
+From the precision tracking and backtracking logic's standpoint those
+are effectively calls into subprogs and should be treated like
+BPF_PSEUDO_CALL calls.
+
+This patch takes the least intrusive approach and just checks against a
+short list of current BPF helpers that do call subprogs, encapsulated
+in the is_callback_calling_function() function. But to prevent
+accidentally forgetting to add new BPF helpers to this "list", we also
+do a sanity check in __check_func_call(), which has to be called for
+each such special BPF helper, validating that the helper is indeed
+recognized as a callback-calling one. This should catch any missed
+checks in the future. Adding special flags to the function proto
+definitions seemed like overkill in this case.
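+
+For illustration, a hypothetical snippet (not part of this patch): from the
+backtracking logic's standpoint the helper call below behaves like a
+BPF_PSEUDO_CALL into the callback subprog, even though the instruction is a
+plain helper call with insn->src_reg == 0.
+
+    #include <linux/bpf.h>
+    #include <bpf/bpf_helpers.h>
+
+    struct {
+        __uint(type, BPF_MAP_TYPE_HASH);
+        __uint(max_entries, 16);
+        __type(key, int);
+        __type(value, long);
+    } counts SEC(".maps");
+
+    static long count_elem(void *map, int *key, long *val, void *ctx)
+    {
+        long *cnt = ctx;
+
+        (*cnt)++;
+        return 0;       /* 0 == keep iterating */
+    }
+
+    SEC("tc")
+    int count_entries(struct __sk_buff *skb)
+    {
+        long n = 0;
+
+        /* helper that internally calls the count_elem() subprog */
+        bpf_for_each_map_elem(&counts, count_elem, &n, 0);
+        return n > 0;
+    }
+
+    char _license[] SEC("license") = "GPL";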
+
+With the above changes, it's possible to remove the forceful setting of
+reg->precise to true in __mark_reg_unknown, which turns on precision
+tracking both inside subprogs and in entry progs that have subprogs. No
+warnings or errors were detected across all the selftests, nor when
+validating with veristat against internal Meta BPF objects and Cilium
+objects. Further, some BPF programs show a noticeable reduction in the
+number of states and instructions validated due to more effective
+precision tracking, especially benefiting the syncookie test.
+
+$ ./veristat -C -e file,prog,insns,states ~/baseline-results.csv ~/subprog-precise-results.csv | grep -v '+0'
+File Program Total insns (A) Total insns (B) Total insns (DIFF) Total states (A) Total states (B) Total states (DIFF)
+---------------------------------------- -------------------------- --------------- --------------- ------------------ ---------------- ---------------- -------------------
+pyperf600_bpf_loop.bpf.linked1.o on_event 3966 3678 -288 (-7.26%) 306 276 -30 (-9.80%)
+pyperf_global.bpf.linked1.o on_event 7563 7530 -33 (-0.44%) 520 517 -3 (-0.58%)
+pyperf_subprogs.bpf.linked1.o on_event 36358 36934 +576 (+1.58%) 2499 2531 +32 (+1.28%)
+setget_sockopt.bpf.linked1.o skops_sockopt 3965 4038 +73 (+1.84%) 343 347 +4 (+1.17%)
+test_cls_redirect_subprogs.bpf.linked1.o cls_redirect 64965 64901 -64 (-0.10%) 4619 4612 -7 (-0.15%)
+test_misc_tcp_hdr_options.bpf.linked1.o misc_estab 1491 1307 -184 (-12.34%) 110 100 -10 (-9.09%)
+test_pkt_access.bpf.linked1.o test_pkt_access 354 349 -5 (-1.41%) 25 24 -1 (-4.00%)
+test_sock_fields.bpf.linked1.o egress_read_sock_fields 435 375 -60 (-13.79%) 22 20 -2 (-9.09%)
+test_sysctl_loop2.bpf.linked1.o sysctl_tcp_mem 1508 1501 -7 (-0.46%) 29 28 -1 (-3.45%)
+test_tc_dtime.bpf.linked1.o egress_fwdns_prio100 468 435 -33 (-7.05%) 45 41 -4 (-8.89%)
+test_tc_dtime.bpf.linked1.o ingress_fwdns_prio100 398 408 +10 (+2.51%) 42 39 -3 (-7.14%)
+test_tc_dtime.bpf.linked1.o ingress_fwdns_prio101 1096 842 -254 (-23.18%) 97 73 -24 (-24.74%)
+test_tcp_hdr_options.bpf.linked1.o estab 2758 2408 -350 (-12.69%) 208 181 -27 (-12.98%)
+test_urandom_usdt.bpf.linked1.o urand_read_with_sema 466 448 -18 (-3.86%) 31 28 -3 (-9.68%)
+test_urandom_usdt.bpf.linked1.o urand_read_without_sema 466 448 -18 (-3.86%) 31 28 -3 (-9.68%)
+test_urandom_usdt.bpf.linked1.o urandlib_read_with_sema 466 448 -18 (-3.86%) 31 28 -3 (-9.68%)
+test_urandom_usdt.bpf.linked1.o urandlib_read_without_sema 466 448 -18 (-3.86%) 31 28 -3 (-9.68%)
+test_xdp_noinline.bpf.linked1.o balancer_ingress_v6 4302 4294 -8 (-0.19%) 257 256 -1 (-0.39%)
+xdp_synproxy_kern.bpf.linked1.o syncookie_tc 583722 405757 -177965 (-30.49%) 35846 25735 -10111 (-28.21%)
+xdp_synproxy_kern.bpf.linked1.o syncookie_xdp 609123 479055 -130068 (-21.35%) 35452 29145 -6307 (-17.79%)
+---------------------------------------- -------------------------- --------------- --------------- ------------------ ---------------- ---------------- -------------------
+
+Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
+Link: https://lore.kernel.org/r/20221104163649.121784-4-andrii@kernel.org
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Stable-dep-of: ecdf985d7615 ("bpf: track immediate values written to stack by BPF_ST instruction")
+Signed-off-by: Pu Lehui <pulehui@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/bpf/verifier.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++-
+ 1 file changed, 58 insertions(+), 1 deletion(-)
+
+--- a/kernel/bpf/verifier.c
++++ b/kernel/bpf/verifier.c
+@@ -517,6 +517,12 @@ static bool is_ptr_cast_function(enum bp
+ func_id == BPF_FUNC_skc_to_tcp_request_sock;
+ }
+
++static bool is_callback_calling_function(enum bpf_func_id func_id)
++{
++ return func_id == BPF_FUNC_for_each_map_elem ||
++ func_id == BPF_FUNC_timer_set_callback;
++}
++
+ static bool is_cmpxchg_insn(const struct bpf_insn *insn)
+ {
+ return BPF_CLASS(insn->code) == BPF_STX &&
+@@ -1446,7 +1452,7 @@ static void __mark_reg_unknown(const str
+ reg->type = SCALAR_VALUE;
+ reg->var_off = tnum_unknown;
+ reg->frameno = 0;
+- reg->precise = env->subprog_cnt > 1 || !env->bpf_capable;
++ reg->precise = !env->bpf_capable;
+ __mark_reg_unbounded(reg);
+ }
+
+@@ -2267,6 +2273,11 @@ static int backtrack_insn(struct bpf_ver
+ */
+ if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && insn->imm == 0)
+ return -ENOTSUPP;
++ /* BPF helpers that invoke callback subprogs are
++ * equivalent to BPF_PSEUDO_CALL above
++ */
++ if (insn->src_reg == 0 && is_callback_calling_function(insn->imm))
++ return -ENOTSUPP;
+ /* regular helper call sets R0 */
+ *reg_mask &= ~1;
+ if (*reg_mask & 0x3f) {
+@@ -2445,12 +2456,42 @@ static int __mark_chain_precision(struct
+ return 0;
+ if (!reg_mask && !stack_mask)
+ return 0;
++
+ for (;;) {
+ DECLARE_BITMAP(mask, 64);
+ u32 history = st->jmp_history_cnt;
+
+ if (env->log.level & BPF_LOG_LEVEL)
+ verbose(env, "last_idx %d first_idx %d\n", last_idx, first_idx);
++
++ if (last_idx < 0) {
++ /* we are at the entry into subprog, which
++ * is expected for global funcs, but only if
++ * requested precise registers are R1-R5
++ * (which are global func's input arguments)
++ */
++ if (st->curframe == 0 &&
++ st->frame[0]->subprogno > 0 &&
++ st->frame[0]->callsite == BPF_MAIN_FUNC &&
++ stack_mask == 0 && (reg_mask & ~0x3e) == 0) {
++ bitmap_from_u64(mask, reg_mask);
++ for_each_set_bit(i, mask, 32) {
++ reg = &st->frame[0]->regs[i];
++ if (reg->type != SCALAR_VALUE) {
++ reg_mask &= ~(1u << i);
++ continue;
++ }
++ reg->precise = true;
++ }
++ return 0;
++ }
++
++ verbose(env, "BUG backtracing func entry subprog %d reg_mask %x stack_mask %llx\n",
++ st->frame[0]->subprogno, reg_mask, stack_mask);
++ WARN_ONCE(1, "verifier backtracking bug");
++ return -EFAULT;
++ }
++
+ for (i = last_idx;;) {
+ if (skip_first) {
+ err = 0;
+@@ -5806,6 +5847,10 @@ typedef int (*set_callee_state_fn)(struc
+ struct bpf_func_state *callee,
+ int insn_idx);
+
++static int set_callee_state(struct bpf_verifier_env *env,
++ struct bpf_func_state *caller,
++ struct bpf_func_state *callee, int insn_idx);
++
+ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ int *insn_idx, int subprog,
+ set_callee_state_fn set_callee_state_cb)
+@@ -5856,6 +5901,16 @@ static int __check_func_call(struct bpf_
+ }
+ }
+
++ /* set_callee_state is used for direct subprog calls, but we are
++ * interested in validating only BPF helpers that can call subprogs as
++ * callbacks
++ */
++ if (set_callee_state_cb != set_callee_state && !is_callback_calling_function(insn->imm)) {
++ verbose(env, "verifier bug: helper %s#%d is not marked as callback-calling\n",
++ func_id_name(insn->imm), insn->imm);
++ return -EFAULT;
++ }
++
+ if (insn->code == (BPF_JMP | BPF_CALL) &&
+ insn->src_reg == 0 &&
+ insn->imm == BPF_FUNC_timer_set_callback) {
+@@ -13209,6 +13264,8 @@ static int do_check_common(struct bpf_ve
+ BPF_MAIN_FUNC /* callsite */,
+ 0 /* frameno */,
+ subprog);
++ state->first_insn_idx = env->subprog_info[subprog].start;
++ state->last_insn_idx = -1;
+
+ regs = state->frame[state->curframe]->regs;
+ if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) {
--- /dev/null
+From pulehui@huaweicloud.com Fri Aug 4 17:25:42 2023
+From: Pu Lehui <pulehui@huaweicloud.com>
+Date: Fri, 4 Aug 2023 23:24:55 +0800
+Subject: bpf: stop setting precise in current state
+To: stable@vger.kernel.org, Greg KH <greg@kroah.com>, Eduard Zingerman <eddyz87@gmail.com>, Luiz Capitulino <luizcap@amazon.com>
+Cc: Alexei Starovoitov <ast@kernel.org>, Daniel Borkmann <daniel@iogearbox.net>, Andrii Nakryiko <andrii@kernel.org>, Pu Lehui <pulehui@huawei.com>, Pu Lehui <pulehui@huaweicloud.com>
+Message-ID: <20230804152459.2565673-3-pulehui@huaweicloud.com>
+
+From: Andrii Nakryiko <andrii@kernel.org>
+
+[ Upstream commit f63181b6ae79fd3b034cde641db774268c2c3acf ]
+
+Setting reg->precise to true in the current state is not necessary from
+a correctness standpoint, but it does pessimise the whole precision (or
+rather "imprecision", because that's what we want to keep as much as
+possible) tracking. Why this is the case is somewhat subtle, and my
+best attempt to explain it is recorded in an extensive comment for the
+__mark_chain_precision() function. Some more careful thinking and code
+reading is probably still required to grok this completely,
+unfortunately. Whiteboarding and a bunch of extra handwaving in person
+would be even more helpful, but that is impractical in a Git commit.
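+
+The practical effect on state comparison is easiest to see in the SCALAR
+case of regsafe(); a simplified sketch of the hunk below:
+
+    if (rcur->type == SCALAR_VALUE) {
+        if (!rold->precise)
+            return true;    /* checkpointed state never needed the exact range */
+        /* old state was precise: current value must satisfy old knowledge */
+        return range_within(rold, rcur) &&
+               tnum_in(rold->var_off, rcur->var_off);
+    }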
+
+The next patch pushes this imprecision property even further, building
+on top of the insights described in this patch.
+
+The end results are pretty nice: we get a reduction in the total number
+of instructions and states verified due to better state reuse, as some
+of the states are now more generic and permissive due to fewer
+unnecessary precise=true requirements.
+
+SELFTESTS RESULTS
+=================
+
+$ ./veristat -C -e file,prog,insns,states ~/subprog-precise-results.csv ~/imprecise-early-results.csv | grep -v '+0'
+File Program Total insns (A) Total insns (B) Total insns (DIFF) Total states (A) Total states (B) Total states (DIFF)
+--------------------------------------- ---------------------- --------------- --------------- ------------------ ---------------- ---------------- -------------------
+bpf_iter_ksym.bpf.linked1.o dump_ksym 347 285 -62 (-17.87%) 20 19 -1 (-5.00%)
+pyperf600_bpf_loop.bpf.linked1.o on_event 3678 3736 +58 (+1.58%) 276 285 +9 (+3.26%)
+setget_sockopt.bpf.linked1.o skops_sockopt 4038 3947 -91 (-2.25%) 347 343 -4 (-1.15%)
+test_l4lb.bpf.linked1.o balancer_ingress 4559 2611 -1948 (-42.73%) 118 105 -13 (-11.02%)
+test_l4lb_noinline.bpf.linked1.o balancer_ingress 6279 6268 -11 (-0.18%) 237 236 -1 (-0.42%)
+test_misc_tcp_hdr_options.bpf.linked1.o misc_estab 1307 1303 -4 (-0.31%) 100 99 -1 (-1.00%)
+test_sk_lookup.bpf.linked1.o ctx_narrow_access 456 447 -9 (-1.97%) 39 38 -1 (-2.56%)
+test_sysctl_loop1.bpf.linked1.o sysctl_tcp_mem 1389 1384 -5 (-0.36%) 26 25 -1 (-3.85%)
+test_tc_dtime.bpf.linked1.o egress_fwdns_prio101 518 485 -33 (-6.37%) 51 46 -5 (-9.80%)
+test_tc_dtime.bpf.linked1.o egress_host 519 468 -51 (-9.83%) 50 44 -6 (-12.00%)
+test_tc_dtime.bpf.linked1.o ingress_fwdns_prio101 842 1000 +158 (+18.76%) 73 88 +15 (+20.55%)
+xdp_synproxy_kern.bpf.linked1.o syncookie_tc 405757 373173 -32584 (-8.03%) 25735 22882 -2853 (-11.09%)
+xdp_synproxy_kern.bpf.linked1.o syncookie_xdp 479055 371590 -107465 (-22.43%) 29145 22207 -6938 (-23.81%)
+--------------------------------------- ---------------------- --------------- --------------- ------------------ ---------------- ---------------- -------------------
+
+The slight regression in test_tc_dtime.bpf.linked1.o/ingress_fwdns_prio101
+is left for a follow-up; there might be some more precision-related bugs
+in the existing BPF verifier logic.
+
+CILIUM RESULTS
+==============
+
+$ ./veristat -C -e file,prog,insns,states ~/subprog-precise-results-cilium.csv ~/imprecise-early-results-cilium.csv | grep -v '+0'
+File Program Total insns (A) Total insns (B) Total insns (DIFF) Total states (A) Total states (B) Total states (DIFF)
+------------- ------------------------------ --------------- --------------- ------------------ ---------------- ---------------- -------------------
+bpf_host.o cil_from_host 762 556 -206 (-27.03%) 43 37 -6 (-13.95%)
+bpf_host.o tail_handle_nat_fwd_ipv4 23541 23426 -115 (-0.49%) 1538 1537 -1 (-0.07%)
+bpf_host.o tail_nodeport_nat_egress_ipv4 33592 33566 -26 (-0.08%) 2163 2161 -2 (-0.09%)
+bpf_lxc.o tail_handle_nat_fwd_ipv4 23541 23426 -115 (-0.49%) 1538 1537 -1 (-0.07%)
+bpf_overlay.o tail_nodeport_nat_egress_ipv4 33581 33543 -38 (-0.11%) 2160 2157 -3 (-0.14%)
+bpf_xdp.o tail_handle_nat_fwd_ipv4 21659 20920 -739 (-3.41%) 1440 1376 -64 (-4.44%)
+bpf_xdp.o tail_handle_nat_fwd_ipv6 17084 17039 -45 (-0.26%) 907 905 -2 (-0.22%)
+bpf_xdp.o tail_lb_ipv4 73442 73430 -12 (-0.02%) 4370 4369 -1 (-0.02%)
+bpf_xdp.o tail_lb_ipv6 152114 151895 -219 (-0.14%) 6493 6479 -14 (-0.22%)
+bpf_xdp.o tail_nodeport_nat_egress_ipv4 17377 17200 -177 (-1.02%) 1125 1111 -14 (-1.24%)
+bpf_xdp.o tail_nodeport_nat_ingress_ipv6 6405 6397 -8 (-0.12%) 309 308 -1 (-0.32%)
+bpf_xdp.o tail_rev_nodeport_lb4 7126 6934 -192 (-2.69%) 414 402 -12 (-2.90%)
+bpf_xdp.o tail_rev_nodeport_lb6 18059 17905 -154 (-0.85%) 1105 1096 -9 (-0.81%)
+------------- ------------------------------ --------------- --------------- ------------------ ---------------- ---------------- -------------------
+
+Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
+Link: https://lore.kernel.org/r/20221104163649.121784-5-andrii@kernel.org
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Stable-dep-of: ecdf985d7615 ("bpf: track immediate values written to stack by BPF_ST instruction")
+Signed-off-by: Pu Lehui <pulehui@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/bpf/verifier.c | 103 ++++++++++++++++++++++++++++++++++++++++++++------
+ 1 file changed, 91 insertions(+), 12 deletions(-)
+
+--- a/kernel/bpf/verifier.c
++++ b/kernel/bpf/verifier.c
+@@ -2382,8 +2382,11 @@ static void mark_all_scalars_precise(str
+
+ /* big hammer: mark all scalars precise in this path.
+ * pop_stack may still get !precise scalars.
++ * We also skip current state and go straight to first parent state,
++ * because precision markings in current non-checkpointed state are
++ * not needed. See why in the comment in __mark_chain_precision below.
+ */
+- for (; st; st = st->parent)
++ for (st = st->parent; st; st = st->parent) {
+ for (i = 0; i <= st->curframe; i++) {
+ func = st->frame[i];
+ for (j = 0; j < BPF_REG_FP; j++) {
+@@ -2401,8 +2404,88 @@ static void mark_all_scalars_precise(str
+ reg->precise = true;
+ }
+ }
++ }
+ }
+
++/*
++ * __mark_chain_precision() backtracks BPF program instruction sequence and
++ * chain of verifier states making sure that register *regno* (if regno >= 0)
++ * and/or stack slot *spi* (if spi >= 0) are marked as precisely tracked
++ * SCALARS, as well as any other registers and slots that contribute to
++ * a tracked state of given registers/stack slots, depending on specific BPF
++ * assembly instructions (see backtrack_insns() for exact instruction handling
++ * logic). This backtracking relies on recorded jmp_history and is able to
++ * traverse entire chain of parent states. This process ends only when all the
++ * necessary registers/slots and their transitive dependencies are marked as
++ * precise.
++ *
++ * One important and subtle aspect is that precise marks *do not matter* in
++ * the currently verified state (current state). It is important to understand
++ * why this is the case.
++ *
++ * First, note that current state is the state that is not yet "checkpointed",
++ * i.e., it is not yet put into env->explored_states, and it has no children
++ * states as well. It's ephemeral, and can end up either a) being discarded if
++ * compatible explored state is found at some point or BPF_EXIT instruction is
++ * reached or b) checkpointed and put into env->explored_states, branching out
++ * into one or more children states.
++ *
++ * In the former case, precise markings in current state are completely
++ * ignored by state comparison code (see regsafe() for details). Only
++ * checkpointed ("old") state precise markings are important, and if old
++ * state's register/slot is precise, regsafe() assumes current state's
++ * register/slot as precise and checks value ranges exactly and precisely. If
++ * states turn out to be compatible, current state's necessary precise
++ * markings and any required parent states' precise markings are enforced
++ * after the fact with propagate_precision() logic. But it's
++ * important to realize that in this case, even after marking current state
++ * registers/slots as precise, we immediately discard current state. So what
++ * actually matters is any of the precise markings propagated into current
++ * state's parent states, which are always checkpointed (due to b) case above).
++ * As such, for scenario a) it doesn't matter if current state has precise
++ * markings set or not.
++ *
++ * Now, for the scenario b), checkpointing and forking into child(ren)
++ * state(s). Note that before current state gets to checkpointing step, any
++ * processed instruction always assumes precise SCALAR register/slot
++ * knowledge: if precise value or range is useful to prune jump branch, BPF
++ * verifier takes this opportunity enthusiastically. Similarly, when
++ * register's value is used to calculate offset or memory address, exact
++ * knowledge of SCALAR range is assumed, checked, and enforced. So, similar to
++ * what we mentioned above about state comparison ignoring precise markings
++ * during state comparison, BPF verifier ignores and also assumes precise
++ * markings *at will* during instruction verification process. But as verifier
++ * assumes precision, it also propagates any precision dependencies across
++ * parent states, which are not yet finalized, so can be further restricted
++ * based on new knowledge gained from restrictions enforced by their children
++ * states. This is so that once those parent states are finalized, i.e., when
++ * they have no more active children state, state comparison logic in
++ * is_state_visited() would enforce strict and precise SCALAR ranges, if
++ * required for correctness.
++ *
++ * To build a bit more intuition, note also that once a state is checkpointed,
++ * the path we took to get to that state is not important. This is crucial
++ * property for state pruning. When state is checkpointed and finalized at
++ * some instruction index, it can be correctly and safely used to "short
++ * circuit" any *compatible* state that reaches exactly the same instruction
++ * index. I.e., if we jumped to that instruction from a completely different
++ * code path than original finalized state was derived from, it doesn't
++ * matter, current state can be discarded because from that instruction
++ * forward having a compatible state will ensure we will safely reach the
++ * exit. States describe preconditions for further exploration, but completely
++ * forget the history of how we got here.
++ *
++ * This also means that even if we needed precise SCALAR range to get to
++ * finalized state, but from that point forward *that same* SCALAR register is
++ * never used in a precise context (i.e., its precise value is not needed for
++ * correctness), it's correct and safe to mark such register as "imprecise"
++ * (i.e., precise marking set to false). This is what we rely on when we do
++ * not set precise marking in current state. If no child state requires
++ * precision for any given SCALAR register, it's safe to dictate that it can
++ * be imprecise. If any child state does require this register to be precise,
++ * we'll mark it precise later retroactively during precise markings
++ * propagation from child state to parent states.
++ */
+ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int regno,
+ int spi)
+ {
+@@ -2420,6 +2503,10 @@ static int __mark_chain_precision(struct
+ if (!env->bpf_capable)
+ return 0;
+
++ /* Do sanity checks against current state of register and/or stack
++ * slot, but don't set precise flag in current state, as precision
++ * tracking in the current state is unnecessary.
++ */
+ func = st->frame[frame];
+ if (regno >= 0) {
+ reg = &func->regs[regno];
+@@ -2427,11 +2514,7 @@ static int __mark_chain_precision(struct
+ WARN_ONCE(1, "backtracing misuse");
+ return -EFAULT;
+ }
+- if (!reg->precise)
+- new_marks = true;
+- else
+- reg_mask = 0;
+- reg->precise = true;
++ new_marks = true;
+ }
+
+ while (spi >= 0) {
+@@ -2444,11 +2527,7 @@ static int __mark_chain_precision(struct
+ stack_mask = 0;
+ break;
+ }
+- if (!reg->precise)
+- new_marks = true;
+- else
+- stack_mask = 0;
+- reg->precise = true;
++ new_marks = true;
+ break;
+ }
+
+@@ -10356,7 +10435,7 @@ static bool regsafe(struct bpf_verifier_
+ if (env->explore_alu_limits)
+ return false;
+ if (rcur->type == SCALAR_VALUE) {
+- if (!rold->precise && !rcur->precise)
++ if (!rold->precise)
+ return true;
+ /* new val must satisfy old val knowledge */
+ return range_within(rold, rcur) &&
--- /dev/null
+From pulehui@huaweicloud.com Fri Aug 4 17:25:38 2023
+From: Pu Lehui <pulehui@huaweicloud.com>
+Date: Fri, 4 Aug 2023 23:24:59 +0800
+Subject: selftests/bpf: Fix sk_assign on s390x
+To: stable@vger.kernel.org, Greg KH <greg@kroah.com>, Eduard Zingerman <eddyz87@gmail.com>, Luiz Capitulino <luizcap@amazon.com>
+Cc: Alexei Starovoitov <ast@kernel.org>, Daniel Borkmann <daniel@iogearbox.net>, Andrii Nakryiko <andrii@kernel.org>, Pu Lehui <pulehui@huawei.com>, Pu Lehui <pulehui@huaweicloud.com>
+Message-ID: <20230804152459.2565673-7-pulehui@huaweicloud.com>
+
+From: Ilya Leoshkevich <iii@linux.ibm.com>
+
+[ Upstream commit 7ce878ca81bca7811e669db4c394b86780e0dbe4 ]
+
+sk_assign is failing on an s390x machine running Debian "bookworm" for
+two reasons: a legacy server_map definition and an uninitialized addrlen
+in the recvfrom() call.
+
+Fix by adding a new-style server_map definition and dropping addrlen
+(recvfrom() allows NULL values for src_addr and addrlen).
+
+Since the test should support tc built without libbpf, build the prog
+twice: once with the old-style definition and once with the new-style
+definition, then select the right one at runtime. This could be done at
+compile time too, but that would not be cross-compilation friendly.
+
+Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
+Link: https://lore.kernel.org/r/20230129190501.1624747-2-iii@linux.ibm.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Pu Lehui <pulehui@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ tools/testing/selftests/bpf/prog_tests/sk_assign.c | 25 ++++++++++----
+ tools/testing/selftests/bpf/progs/test_sk_assign.c | 11 ++++++
+ tools/testing/selftests/bpf/progs/test_sk_assign_libbpf.c | 3 +
+ 3 files changed, 33 insertions(+), 6 deletions(-)
+ create mode 100644 tools/testing/selftests/bpf/progs/test_sk_assign_libbpf.c
+
+--- a/tools/testing/selftests/bpf/prog_tests/sk_assign.c
++++ b/tools/testing/selftests/bpf/prog_tests/sk_assign.c
+@@ -29,7 +29,23 @@ static int stop, duration;
+ static bool
+ configure_stack(void)
+ {
++ char tc_version[128];
+ char tc_cmd[BUFSIZ];
++ char *prog;
++ FILE *tc;
++
++ /* Check whether tc is built with libbpf. */
++ tc = popen("tc -V", "r");
++ if (CHECK_FAIL(!tc))
++ return false;
++ if (CHECK_FAIL(!fgets(tc_version, sizeof(tc_version), tc)))
++ return false;
++ if (strstr(tc_version, ", libbpf "))
++ prog = "test_sk_assign_libbpf.o";
++ else
++ prog = "test_sk_assign.o";
++ if (CHECK_FAIL(pclose(tc)))
++ return false;
+
+ /* Move to a new networking namespace */
+ if (CHECK_FAIL(unshare(CLONE_NEWNET)))
+@@ -46,8 +62,8 @@ configure_stack(void)
+ /* Load qdisc, BPF program */
+ if (CHECK_FAIL(system("tc qdisc add dev lo clsact")))
+ return false;
+- sprintf(tc_cmd, "%s %s %s %s", "tc filter add dev lo ingress bpf",
+- "direct-action object-file ./test_sk_assign.o",
++ sprintf(tc_cmd, "%s %s %s %s %s", "tc filter add dev lo ingress bpf",
++ "direct-action object-file", prog,
+ "section classifier/sk_assign_test",
+ (env.verbosity < VERBOSE_VERY) ? " 2>/dev/null" : "verbose");
+ if (CHECK(system(tc_cmd), "BPF load failed;",
+@@ -129,15 +145,12 @@ get_port(int fd)
+ static ssize_t
+ rcv_msg(int srv_client, int type)
+ {
+- struct sockaddr_storage ss;
+ char buf[BUFSIZ];
+- socklen_t slen;
+
+ if (type == SOCK_STREAM)
+ return read(srv_client, &buf, sizeof(buf));
+ else
+- return recvfrom(srv_client, &buf, sizeof(buf), 0,
+- (struct sockaddr *)&ss, &slen);
++ return recvfrom(srv_client, &buf, sizeof(buf), 0, NULL, NULL);
+ }
+
+ static int
+--- a/tools/testing/selftests/bpf/progs/test_sk_assign.c
++++ b/tools/testing/selftests/bpf/progs/test_sk_assign.c
+@@ -16,6 +16,16 @@
+ #include <bpf/bpf_helpers.h>
+ #include <bpf/bpf_endian.h>
+
++#if defined(IPROUTE2_HAVE_LIBBPF)
++/* Use a new-style map definition. */
++struct {
++ __uint(type, BPF_MAP_TYPE_SOCKMAP);
++ __type(key, int);
++ __type(value, __u64);
++ __uint(pinning, LIBBPF_PIN_BY_NAME);
++ __uint(max_entries, 1);
++} server_map SEC(".maps");
++#else
+ /* Pin map under /sys/fs/bpf/tc/globals/<map name> */
+ #define PIN_GLOBAL_NS 2
+
+@@ -35,6 +45,7 @@ struct {
+ .max_elem = 1,
+ .pinning = PIN_GLOBAL_NS,
+ };
++#endif
+
+ int _version SEC("version") = 1;
+ char _license[] SEC("license") = "GPL";
+--- /dev/null
++++ b/tools/testing/selftests/bpf/progs/test_sk_assign_libbpf.c
+@@ -0,0 +1,3 @@
++// SPDX-License-Identifier: GPL-2.0
++#define IPROUTE2_HAVE_LIBBPF
++#include "test_sk_assign.c"
--- /dev/null
+From pulehui@huaweicloud.com Fri Aug 4 17:25:38 2023
+From: Pu Lehui <pulehui@huaweicloud.com>
+Date: Fri, 4 Aug 2023 23:24:57 +0800
+Subject: selftests/bpf: make test_align selftest more robust
+To: stable@vger.kernel.org, Greg KH <greg@kroah.com>, Eduard Zingerman <eddyz87@gmail.com>, Luiz Capitulino <luizcap@amazon.com>
+Cc: Alexei Starovoitov <ast@kernel.org>, Daniel Borkmann <daniel@iogearbox.net>, Andrii Nakryiko <andrii@kernel.org>, Pu Lehui <pulehui@huawei.com>, Pu Lehui <pulehui@huaweicloud.com>
+Message-ID: <20230804152459.2565673-5-pulehui@huaweicloud.com>
+
+From: Andrii Nakryiko <andrii@kernel.org>
+
+[ Upstream commit 4f999b767769b76378c3616c624afd6f4bb0d99f ]
+
+The test_align selftest relies on the BPF verifier log emitting register
+states for specific instructions in an expected format. Unfortunately,
+the BPF verifier's precision backtracking log interferes with such
+expectations, and instructions on which precision propagation happens
+sometimes don't output the full expected register states. This does
+indeed look like something to be improved in the BPF verifier, but it is
+beyond the scope of this patch set.
+
+So to make test_align a bit more robust, inject a few dummy R4 = R5
+instructions which capture the desired state of R5 and won't have
+precision tracking logs on them. This fixes the tests until we can
+improve BPF verifier output in the presence of precision tracking.
+
+Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
+Link: https://lore.kernel.org/r/20221104163649.121784-7-andrii@kernel.org
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Stable-dep-of: ecdf985d7615 ("bpf: track immediate values written to stack by BPF_ST instruction")
+Signed-off-by: Pu Lehui <pulehui@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ tools/testing/selftests/bpf/prog_tests/align.c | 36 +++++++++++++++----------
+ 1 file changed, 23 insertions(+), 13 deletions(-)
+
+--- a/tools/testing/selftests/bpf/prog_tests/align.c
++++ b/tools/testing/selftests/bpf/prog_tests/align.c
+@@ -2,7 +2,7 @@
+ #include <test_progs.h>
+
+ #define MAX_INSNS 512
+-#define MAX_MATCHES 16
++#define MAX_MATCHES 24
+
+ struct bpf_reg_match {
+ unsigned int line;
+@@ -267,6 +267,7 @@ static struct bpf_align_test tests[] = {
+ */
+ BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6),
++ BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 14),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
+@@ -280,6 +281,7 @@ static struct bpf_align_test tests[] = {
+ BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 14),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6),
++ BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 4),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
+@@ -311,44 +313,52 @@ static struct bpf_align_test tests[] = {
+ {15, "R4=pkt(id=1,off=18,r=18,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ {15, "R5=pkt(id=1,off=14,r=18,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ /* Variable offset is added to R5 packet pointer,
+- * resulting in auxiliary alignment of 4.
++ * resulting in auxiliary alignment of 4. To avoid BPF
++ * verifier's precision backtracking logging
++ * interfering we also have a no-op R4 = R5
++ * instruction to validate R5 state. We also check
++ * that R4 is what it should be in such case.
+ */
+- {18, "R5_w=pkt(id=2,off=0,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
++ {19, "R4_w=pkt(id=2,off=0,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
++ {19, "R5_w=pkt(id=2,off=0,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ /* Constant offset is added to R5, resulting in
+ * reg->off of 14.
+ */
+- {19, "R5_w=pkt(id=2,off=14,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
++ {20, "R5_w=pkt(id=2,off=14,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ /* At the time the word size load is performed from R5,
+ * its total fixed offset is NET_IP_ALIGN + reg->off
+ * (14) which is 16. Then the variable offset is 4-byte
+ * aligned, so the total offset is 4-byte aligned and
+ * meets the load's requirements.
+ */
+- {23, "R4=pkt(id=2,off=18,r=18,umax_value=1020,var_off=(0x0; 0x3fc))"},
+- {23, "R5=pkt(id=2,off=14,r=18,umax_value=1020,var_off=(0x0; 0x3fc))"},
++ {24, "R4=pkt(id=2,off=18,r=18,umax_value=1020,var_off=(0x0; 0x3fc))"},
++ {24, "R5=pkt(id=2,off=14,r=18,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ /* Constant offset is added to R5 packet pointer,
+ * resulting in reg->off value of 14.
+ */
+- {26, "R5_w=pkt(id=0,off=14,r=8"},
++ {27, "R5_w=pkt(id=0,off=14,r=8"},
+ /* Variable offset is added to R5, resulting in a
+- * variable offset of (4n).
++ * variable offset of (4n). See comment for insn #19
++ * for R4 = R5 trick.
+ */
+- {27, "R5_w=pkt(id=3,off=14,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
++ {29, "R4_w=pkt(id=3,off=14,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
++ {29, "R5_w=pkt(id=3,off=14,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ /* Constant is added to R5 again, setting reg->off to 18. */
+- {28, "R5_w=pkt(id=3,off=18,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
++ {30, "R5_w=pkt(id=3,off=18,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ /* And once more we add a variable; resulting var_off
+ * is still (4n), fixed offset is not changed.
+ * Also, we create a new reg->id.
+ */
+- {29, "R5_w=pkt(id=4,off=18,r=0,umax_value=2040,var_off=(0x0; 0x7fc)"},
++ {32, "R4_w=pkt(id=4,off=18,r=0,umax_value=2040,var_off=(0x0; 0x7fc)"},
++ {32, "R5_w=pkt(id=4,off=18,r=0,umax_value=2040,var_off=(0x0; 0x7fc)"},
+ /* At the time the word size load is performed from R5,
+ * its total fixed offset is NET_IP_ALIGN + reg->off (18)
+ * which is 20. Then the variable offset is (4n), so
+ * the total offset is 4-byte aligned and meets the
+ * load's requirements.
+ */
+- {33, "R4=pkt(id=4,off=22,r=22,umax_value=2040,var_off=(0x0; 0x7fc)"},
+- {33, "R5=pkt(id=4,off=18,r=22,umax_value=2040,var_off=(0x0; 0x7fc)"},
++ {35, "R4=pkt(id=4,off=22,r=22,umax_value=2040,var_off=(0x0; 0x7fc)"},
++ {35, "R5=pkt(id=4,off=18,r=22,umax_value=2040,var_off=(0x0; 0x7fc)"},
+ },
+ },
+ {
--- /dev/null
+From pulehui@huaweicloud.com Fri Aug 4 17:25:37 2023
+From: Pu Lehui <pulehui@huaweicloud.com>
+Date: Fri, 4 Aug 2023 23:24:58 +0800
+Subject: selftests/bpf: Workaround verification failure for fexit_bpf2bpf/func_replace_return_code
+To: stable@vger.kernel.org, Greg KH <greg@kroah.com>, Eduard Zingerman <eddyz87@gmail.com>, Luiz Capitulino <luizcap@amazon.com>
+Cc: Alexei Starovoitov <ast@kernel.org>, Daniel Borkmann <daniel@iogearbox.net>, Andrii Nakryiko <andrii@kernel.org>, Pu Lehui <pulehui@huawei.com>, Pu Lehui <pulehui@huaweicloud.com>
+Message-ID: <20230804152459.2565673-6-pulehui@huaweicloud.com>
+
+From: Yonghong Song <yhs@fb.com>
+
+[ Upstream commit 63d78b7e8ca2d0eb8c687a355fa19d01b6fcc723 ]
+
+With latest llvm17, selftest fexit_bpf2bpf/func_replace_return_code
+has the following verification failure:
+
+ 0: R1=ctx(off=0,imm=0) R10=fp0
+ ; int connect_v4_prog(struct bpf_sock_addr *ctx)
+ 0: (bf) r7 = r1 ; R1=ctx(off=0,imm=0) R7_w=ctx(off=0,imm=0)
+ 1: (b4) w6 = 0 ; R6_w=0
+ ; memset(&tuple.ipv4.saddr, 0, sizeof(tuple.ipv4.saddr));
+ ...
+ ; return do_bind(ctx) ? 1 : 0;
+ 179: (bf) r1 = r7 ; R1=ctx(off=0,imm=0) R7=ctx(off=0,imm=0)
+ 180: (85) call pc+147
+ Func#3 is global and valid. Skipping.
+ 181: R0_w=scalar()
+ 181: (bc) w6 = w0 ; R0_w=scalar() R6_w=scalar(umax=4294967295,var_off=(0x0; 0xffffffff))
+ 182: (05) goto pc-129
+ ; }
+ 54: (bc) w0 = w6 ; R0_w=scalar(umax=4294967295,var_off=(0x0; 0xffffffff)) R6_w=scalar(umax=4294967295,var_off=(0x0; 0xffffffff))
+ 55: (95) exit
+ At program exit the register R0 has value (0x0; 0xffffffff) should have been in (0x0; 0x1)
+ processed 281 insns (limit 1000000) max_states_per_insn 1 total_states 26 peak_states 26 mark_read 13
+ -- END PROG LOAD LOG --
+ libbpf: prog 'connect_v4_prog': failed to load: -22
+
+The corresponding source code:
+
+ __attribute__ ((noinline))
+ int do_bind(struct bpf_sock_addr *ctx)
+ {
+ struct sockaddr_in sa = {};
+
+ sa.sin_family = AF_INET;
+ sa.sin_port = bpf_htons(0);
+ sa.sin_addr.s_addr = bpf_htonl(SRC_REWRITE_IP4);
+
+ if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)) != 0)
+ return 0;
+
+ return 1;
+ }
+ ...
+ SEC("cgroup/connect4")
+ int connect_v4_prog(struct bpf_sock_addr *ctx)
+ {
+ ...
+ return do_bind(ctx) ? 1 : 0;
+ }
+
+Insn 180 is a call to 'do_bind'. The call's return value is also the
+return value of the program. Since do_bind() returns 0/1, it is
+legitimate for the compiler to optimize 'return do_bind(ctx) ? 1 : 0' to
+'return do_bind(ctx)'. However, such an optimization breaks the verifier,
+as the return value of 'do_bind()' is marked as an arbitrary scalar,
+which violates the requirement that the prog return value be 0/1.
+
+There are two ways to fix this problem: (1) changing 'return 1' in
+do_bind() to e.g. 'return 10' so the compiler has to emit
+'do_bind(ctx) ? 1 : 0', or (2) as suggested by Andrii, marking do_bind()
+with the __weak attribute so the compiler cannot make any assumption
+about do_bind()'s return value.
+
+This patch adopts the __weak approach, which is simpler and more
+resistant to potential compiler optimizations.
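+
+For comparison, a sketch of the alternative fix (1) mentioned above (not
+what this patch applies):
+
+    __attribute__ ((noinline))
+    int do_bind(struct bpf_sock_addr *ctx)
+    {
+        struct sockaddr_in sa = {};
+
+        sa.sin_family = AF_INET;
+        sa.sin_port = bpf_htons(0);
+        sa.sin_addr.s_addr = bpf_htonl(SRC_REWRITE_IP4);
+
+        if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)) != 0)
+            return 0;
+
+        return 10;      /* any nonzero value other than 1 forces the caller
+                         * to keep the explicit 'do_bind(ctx) ? 1 : 0' */
+    }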
+
+Suggested-by: Andrii Nakryiko <andrii@kernel.org>
+Signed-off-by: Yonghong Song <yhs@fb.com>
+Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
+Link: https://lore.kernel.org/bpf/20230310012410.2920570-1-yhs@fb.com
+Signed-off-by: Pu Lehui <pulehui@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ tools/testing/selftests/bpf/progs/connect4_prog.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/tools/testing/selftests/bpf/progs/connect4_prog.c
++++ b/tools/testing/selftests/bpf/progs/connect4_prog.c
+@@ -33,7 +33,7 @@
+
+ int _version SEC("version") = 1;
+
+-__attribute__ ((noinline))
++__attribute__ ((noinline)) __weak
+ int do_bind(struct bpf_sock_addr *ctx)
+ {
+ struct sockaddr_in sa = {};
hwmon-pmbus-bel-pfe-enable-pmbus_skip_status_check-for-pfe1100.patch
radix-tree-test-suite-fix-incorrect-allocation-size-for-pthreads.patch
nilfs2-fix-use-after-free-of-nilfs_root-in-dirtying-inodes-via-iput.patch
+bpf-allow-precision-tracking-for-programs-with-subprogs.patch
+bpf-stop-setting-precise-in-current-state.patch
+bpf-aggressively-forget-precise-markings-during-state-checkpointing.patch
+selftests-bpf-make-test_align-selftest-more-robust.patch
+selftests-bpf-workaround-verification-failure-for-fexit_bpf2bpf-func_replace_return_code.patch
+selftests-bpf-fix-sk_assign-on-s390x.patch