--- /dev/null
+From pulehui@huaweicloud.com Fri Aug 4 17:25:38 2023
+From: Pu Lehui <pulehui@huaweicloud.com>
+Date: Fri, 4 Aug 2023 23:24:56 +0800
+Subject: bpf: aggressively forget precise markings during state checkpointing
+To: stable@vger.kernel.org, Greg KH <greg@kroah.com>, Eduard Zingerman <eddyz87@gmail.com>, Luiz Capitulino <luizcap@amazon.com>
+Cc: Alexei Starovoitov <ast@kernel.org>, Daniel Borkmann <daniel@iogearbox.net>, Andrii Nakryiko <andrii@kernel.org>, Pu Lehui <pulehui@huawei.com>, Pu Lehui <pulehui@huaweicloud.com>
+Message-ID: <20230804152459.2565673-4-pulehui@huaweicloud.com>
+
+From: Andrii Nakryiko <andrii@kernel.org>
+
+[ Upstream commit 7a830b53c17bbadcf99f778f28aaaa4e6c41df5f ]
+
+Exploit the property of the about-to-be-checkpointed state to forget all
+precise markings up to that point even more aggressively. We now clear
+all potentially inherited precise markings right before checkpointing
+and branching off into a child state. If any of the child states
+requires precise knowledge of any SCALAR register, those markings will
+be propagated backwards later on, before this state is finalized,
+preserving correctness.
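+
+A condensed sketch of the resulting flow in is_state_visited(), simplified
+from the hunks below (unrelated bookkeeping and error handling omitted):
+
+    /* no compatible explored state was found: checkpoint the current state */
+    if (env->bpf_capable)
+        /* forget precise markings inherited from the parent state */
+        mark_all_scalars_imprecise(env, cur);
+
+    /* the checkpointed copy therefore starts out fully imprecise */
+    new = &new_sl->state;
+    err = copy_verifier_state(new, cur);
+
+    /* if a child state later needs an exact SCALAR range,
+     * __mark_chain_precision() retroactively re-marks the affected
+     * registers in this (now parent) state
+     */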
+
+There is a single selftests BPF program change, but a tremendous one: a
+25x reduction in the number of verified instructions and states in
+trace_virtqueue_add_sgs.
+
+Cilium results are more modest, but span a wider range of programs.
+
+SELFTESTS RESULTS
+=================
+
+$ ./veristat -C -e file,prog,insns,states ~/imprecise-early-results.csv ~/imprecise-aggressive-results.csv | grep -v '+0'
+File Program Total insns (A) Total insns (B) Total insns (DIFF) Total states (A) Total states (B) Total states (DIFF)
+------------------- ----------------------- --------------- --------------- ------------------ ---------------- ---------------- -------------------
+loop6.bpf.linked1.o trace_virtqueue_add_sgs 398057 15114 -382943 (-96.20%) 8717 336 -8381 (-96.15%)
+------------------- ----------------------- --------------- --------------- ------------------ ---------------- ---------------- -------------------
+
+CILIUM RESULTS
+==============
+
+$ ./veristat -C -e file,prog,insns,states ~/imprecise-early-results-cilium.csv ~/imprecise-aggressive-results-cilium.csv | grep -v '+0'
+File Program Total insns (A) Total insns (B) Total insns (DIFF) Total states (A) Total states (B) Total states (DIFF)
+------------- -------------------------------- --------------- --------------- ------------------ ---------------- ---------------- -------------------
+bpf_host.o tail_handle_nat_fwd_ipv4 23426 23221 -205 (-0.88%) 1537 1515 -22 (-1.43%)
+bpf_host.o tail_handle_nat_fwd_ipv6 13009 12904 -105 (-0.81%) 719 708 -11 (-1.53%)
+bpf_host.o tail_nodeport_nat_ingress_ipv6 5261 5196 -65 (-1.24%) 247 243 -4 (-1.62%)
+bpf_host.o tail_nodeport_nat_ipv6_egress 3446 3406 -40 (-1.16%) 203 198 -5 (-2.46%)
+bpf_lxc.o tail_handle_nat_fwd_ipv4 23426 23221 -205 (-0.88%) 1537 1515 -22 (-1.43%)
+bpf_lxc.o tail_handle_nat_fwd_ipv6 13009 12904 -105 (-0.81%) 719 708 -11 (-1.53%)
+bpf_lxc.o tail_ipv4_ct_egress 5074 4897 -177 (-3.49%) 255 248 -7 (-2.75%)
+bpf_lxc.o tail_ipv4_ct_ingress 5100 4923 -177 (-3.47%) 255 248 -7 (-2.75%)
+bpf_lxc.o tail_ipv4_ct_ingress_policy_only 5100 4923 -177 (-3.47%) 255 248 -7 (-2.75%)
+bpf_lxc.o tail_ipv6_ct_egress 4558 4536 -22 (-0.48%) 188 187 -1 (-0.53%)
+bpf_lxc.o tail_ipv6_ct_ingress 4578 4556 -22 (-0.48%) 188 187 -1 (-0.53%)
+bpf_lxc.o tail_ipv6_ct_ingress_policy_only 4578 4556 -22 (-0.48%) 188 187 -1 (-0.53%)
+bpf_lxc.o tail_nodeport_nat_ingress_ipv6 5261 5196 -65 (-1.24%) 247 243 -4 (-1.62%)
+bpf_overlay.o tail_nodeport_nat_ingress_ipv6 5261 5196 -65 (-1.24%) 247 243 -4 (-1.62%)
+bpf_overlay.o tail_nodeport_nat_ipv6_egress 3482 3442 -40 (-1.15%) 204 201 -3 (-1.47%)
+bpf_xdp.o tail_nodeport_nat_egress_ipv4 17200 15619 -1581 (-9.19%) 1111 1010 -101 (-9.09%)
+------------- -------------------------------- --------------- --------------- ------------------ ---------------- ---------------- -------------------
+
+Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
+Link: https://lore.kernel.org/r/20221104163649.121784-6-andrii@kernel.org
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Stable-dep-of: ecdf985d7615 ("bpf: track immediate values written to stack by BPF_ST instruction")
+Signed-off-by: Pu Lehui <pulehui@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/bpf/verifier.c | 37 +++++++++++++++++++++++++++++++++++++
+ 1 file changed, 37 insertions(+)
+
+--- a/kernel/bpf/verifier.c
++++ b/kernel/bpf/verifier.c
+@@ -2407,6 +2407,31 @@ static void mark_all_scalars_precise(str
+ }
+ }
+
++static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
++{
++ struct bpf_func_state *func;
++ struct bpf_reg_state *reg;
++ int i, j;
++
++ for (i = 0; i <= st->curframe; i++) {
++ func = st->frame[i];
++ for (j = 0; j < BPF_REG_FP; j++) {
++ reg = &func->regs[j];
++ if (reg->type != SCALAR_VALUE)
++ continue;
++ reg->precise = false;
++ }
++ for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) {
++ if (!is_spilled_reg(&func->stack[j]))
++ continue;
++ reg = &func->stack[j].spilled_ptr;
++ if (reg->type != SCALAR_VALUE)
++ continue;
++ reg->precise = false;
++ }
++ }
++}
++
+ /*
+ * __mark_chain_precision() backtracks BPF program instruction sequence and
+ * chain of verifier states making sure that register *regno* (if regno >= 0)
+@@ -2485,6 +2510,14 @@ static void mark_all_scalars_precise(str
+ * be imprecise. If any child state does require this register to be precise,
+ * we'll mark it precise later retroactively during precise markings
+ * propagation from child state to parent states.
++ *
++ * Skipping precise marking setting in current state is a mild version of
++ * relying on the above observation. But we can utilize this property even
++ * more aggressively by proactively forgetting any precise marking in the
++ * current state (which we inherited from the parent state), right before we
++ * checkpoint it and branch off into new child state. This is done by
++ * mark_all_scalars_imprecise() to hopefully get more permissive and generic
++ * finalized states which help in short circuiting more future states.
+ */
+ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int regno,
+ int spi)
+@@ -10984,6 +11017,10 @@ next:
+ env->prev_jmps_processed = env->jmps_processed;
+ env->prev_insn_processed = env->insn_processed;
+
++ /* forget precise markings we inherited, see __mark_chain_precision */
++ if (env->bpf_capable)
++ mark_all_scalars_imprecise(env, cur);
++
+ /* add new state to the head of linked list */
+ new = &new_sl->state;
+ err = copy_verifier_state(new, cur);
--- /dev/null
+From pulehui@huaweicloud.com Fri Aug 4 17:25:37 2023
+From: Pu Lehui <pulehui@huaweicloud.com>
+Date: Fri, 4 Aug 2023 23:24:54 +0800
+Subject: bpf: allow precision tracking for programs with subprogs
+To: stable@vger.kernel.org, Greg KH <greg@kroah.com>, Eduard Zingerman <eddyz87@gmail.com>, Luiz Capitulino <luizcap@amazon.com>
+Cc: Alexei Starovoitov <ast@kernel.org>, Daniel Borkmann <daniel@iogearbox.net>, Andrii Nakryiko <andrii@kernel.org>, Pu Lehui <pulehui@huawei.com>, Pu Lehui <pulehui@huaweicloud.com>
+Message-ID: <20230804152459.2565673-2-pulehui@huaweicloud.com>
+
+From: Andrii Nakryiko <andrii@kernel.org>
+
+[ Upstream commit be2ef8161572ec1973124ebc50f56dafc2925e07 ]
+
+Stop forcing precise=true for SCALAR registers when a BPF program has
+any subprograms. The current restriction means that any BPF program, as
+soon as it uses subprograms, loses all of the precision tracking
+benefits in reducing the number of verified states.
+
+This patch keeps the fallback mark_all_scalars_precise() behavior if
+precise marking has to cross function frames. E.g., if a subprogram
+requires R1 (the first input arg) to be marked precise, ideally we'd
+need to backtrack to the parent function and keep marking R1 and its
+dependencies as precise. But right now we give up and force all the
+SCALARs in all of the current and parent states to precise=true. We can
+lift that restriction in the future.
+
+But this patch fixes two issues identified when trying to enable
+precision tracking for subprogs.
+
+First, prevent "escaping" from the top-most state in a global subprog.
+With an entry-level BPF program we never end up requesting precision
+for the R1-R5 registers, because R2-R5 are not initialized (and so not
+readable in a correct BPF program), and R1 is PTR_TO_CTX, not a SCALAR,
+and so is implicitly precise. With global subprogs, though, it's
+different: a global subprog a) can have up to 5 SCALAR input arguments,
+which might get marked as precise=true, and b) is validated in
+isolation from its main entry BPF program. b) means that we can end up
+exhausting the parent state chain and still not mark all registers in
+reg_mask as precise, which would trigger a verifier bug warning.
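+
+For illustration, a hypothetical program (not part of this patch) with a
+global subprog: the subprog is validated in isolation, so its SCALAR
+arguments start out unknown and any precision requested for them has
+nowhere to "escape" to.
+
+    #include <linux/bpf.h>
+    #include <bpf/bpf_helpers.h>
+
+    /* non-static + noinline => global subprog, verified as its own entry
+     * point; 'len' arrives in R1 as an unknown SCALAR
+     */
+    __noinline int clamp_len(int len)
+    {
+        if (len < 0 || len > 64)
+            return 0;
+        return len;
+    }
+
+    SEC("tc")
+    int entry(struct __sk_buff *skb)
+    {
+        return clamp_len(skb->len) > 0;
+    }
+
+    char _license[] SEC("license") = "GPL";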
+
+To handle that, we need to consider two cases. First, if the very first
+state is not immediately "checkpointed" (i.e., stored in the state
+lookup hashtable), it will get correct first_insn_idx and last_insn_idx
+instruction indices set during state checkpointing. As such, this case
+is already handled: __mark_chain_precision() simply does nothing when
+we reach the very first parent state; st->parent will be NULL and we'll
+just stop. Perhaps some extra check for reg_mask and stack_mask is due
+here, but this patch doesn't address that issue.
+
+The more problematic second case is when the global function's initial
+state is immediately checkpointed before we manage to process the very
+first instruction. This happens because, when there is a call to a
+global subprog from the main program, the subprog's very first
+instruction is marked as a pruning point, so before we manage to
+process the first instruction we have to check and checkpoint the
+state. This patch adds special handling for such an "empty" state,
+which is identified by st->last_insn_idx being set to -1. In that case,
+we check that we are indeed validating a global subprog, and with some
+sanity checking we mark the input args as precise if requested.
+
+Note that we also initialize state->first_insn_idx with the correct
+start insn_idx offset. For the main program zero is the correct value,
+but for any subprog it's quite confusing to not have first_insn_idx
+set. This doesn't have any functional impact, but helps with debugging
+and state printing. We also explicitly initialize state->last_insn_idx
+instead of relying on is_state_visited() to do this with
+env->prev_insn_idx, which will be -1 on the very first instruction.
+This concludes the changes necessary to handle precision tracking
+specifically for global subprogs.
+
+The second identified problem was the missed handling of BPF helper
+functions that call into subprogs (e.g., bpf_loop and a few others).
+From the precision tracking and backtracking logic's standpoint those
+are effectively calls into subprogs and should be treated like
+BPF_PSEUDO_CALL calls.
+
+This patch takes the least intrusive approach and just checks against a
+short list of current BPF helpers that do call subprogs, encapsulated
+in the is_callback_calling_function() function. But to prevent
+accidentally forgetting to add new BPF helpers to this "list", we also
+do a sanity check in __check_func_call(), which has to be called for
+each such special BPF helper, validating that the helper is indeed
+recognized as a callback-calling one. This should catch any missed
+checks in the future. Adding special flags to the function proto
+definitions seemed like overkill in this case.
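+
+For illustration, a hypothetical snippet (not part of this patch): from the
+backtracking logic's standpoint the helper call below behaves like a
+BPF_PSEUDO_CALL into the callback subprog, even though the instruction is a
+plain helper call with insn->src_reg == 0.
+
+    #include <linux/bpf.h>
+    #include <bpf/bpf_helpers.h>
+
+    struct {
+        __uint(type, BPF_MAP_TYPE_HASH);
+        __uint(max_entries, 16);
+        __type(key, int);
+        __type(value, long);
+    } counts SEC(".maps");
+
+    static long count_elem(void *map, int *key, long *val, void *ctx)
+    {
+        long *cnt = ctx;
+
+        (*cnt)++;
+        return 0;       /* 0 == keep iterating */
+    }
+
+    SEC("tc")
+    int count_entries(struct __sk_buff *skb)
+    {
+        long n = 0;
+
+        /* helper that internally calls the count_elem() subprog */
+        bpf_for_each_map_elem(&counts, count_elem, &n, 0);
+        return n > 0;
+    }
+
+    char _license[] SEC("license") = "GPL";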
+
+With the above changes, it's possible to remove the forceful setting of
+reg->precise to true in __mark_reg_unknown, which turns on precision
+tracking both inside subprogs and in entry progs that have subprogs. No
+warnings or errors were detected across all the selftests, nor when
+validating with veristat against internal Meta BPF objects and Cilium
+objects. Further, some BPF programs show a noticeable reduction in the
+number of states and instructions validated due to more effective
+precision tracking, especially benefiting the syncookie test.
+
+$ ./veristat -C -e file,prog,insns,states ~/baseline-results.csv ~/subprog-precise-results.csv | grep -v '+0'
+File Program Total insns (A) Total insns (B) Total insns (DIFF) Total states (A) Total states (B) Total states (DIFF)
+---------------------------------------- -------------------------- --------------- --------------- ------------------ ---------------- ---------------- -------------------
+pyperf600_bpf_loop.bpf.linked1.o on_event 3966 3678 -288 (-7.26%) 306 276 -30 (-9.80%)
+pyperf_global.bpf.linked1.o on_event 7563 7530 -33 (-0.44%) 520 517 -3 (-0.58%)
+pyperf_subprogs.bpf.linked1.o on_event 36358 36934 +576 (+1.58%) 2499 2531 +32 (+1.28%)
+setget_sockopt.bpf.linked1.o skops_sockopt 3965 4038 +73 (+1.84%) 343 347 +4 (+1.17%)
+test_cls_redirect_subprogs.bpf.linked1.o cls_redirect 64965 64901 -64 (-0.10%) 4619 4612 -7 (-0.15%)
+test_misc_tcp_hdr_options.bpf.linked1.o misc_estab 1491 1307 -184 (-12.34%) 110 100 -10 (-9.09%)
+test_pkt_access.bpf.linked1.o test_pkt_access 354 349 -5 (-1.41%) 25 24 -1 (-4.00%)
+test_sock_fields.bpf.linked1.o egress_read_sock_fields 435 375 -60 (-13.79%) 22 20 -2 (-9.09%)
+test_sysctl_loop2.bpf.linked1.o sysctl_tcp_mem 1508 1501 -7 (-0.46%) 29 28 -1 (-3.45%)
+test_tc_dtime.bpf.linked1.o egress_fwdns_prio100 468 435 -33 (-7.05%) 45 41 -4 (-8.89%)
+test_tc_dtime.bpf.linked1.o ingress_fwdns_prio100 398 408 +10 (+2.51%) 42 39 -3 (-7.14%)
+test_tc_dtime.bpf.linked1.o ingress_fwdns_prio101 1096 842 -254 (-23.18%) 97 73 -24 (-24.74%)
+test_tcp_hdr_options.bpf.linked1.o estab 2758 2408 -350 (-12.69%) 208 181 -27 (-12.98%)
+test_urandom_usdt.bpf.linked1.o urand_read_with_sema 466 448 -18 (-3.86%) 31 28 -3 (-9.68%)
+test_urandom_usdt.bpf.linked1.o urand_read_without_sema 466 448 -18 (-3.86%) 31 28 -3 (-9.68%)
+test_urandom_usdt.bpf.linked1.o urandlib_read_with_sema 466 448 -18 (-3.86%) 31 28 -3 (-9.68%)
+test_urandom_usdt.bpf.linked1.o urandlib_read_without_sema 466 448 -18 (-3.86%) 31 28 -3 (-9.68%)
+test_xdp_noinline.bpf.linked1.o balancer_ingress_v6 4302 4294 -8 (-0.19%) 257 256 -1 (-0.39%)
+xdp_synproxy_kern.bpf.linked1.o syncookie_tc 583722 405757 -177965 (-30.49%) 35846 25735 -10111 (-28.21%)
+xdp_synproxy_kern.bpf.linked1.o syncookie_xdp 609123 479055 -130068 (-21.35%) 35452 29145 -6307 (-17.79%)
+---------------------------------------- -------------------------- --------------- --------------- ------------------ ---------------- ---------------- -------------------
+
+Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
+Link: https://lore.kernel.org/r/20221104163649.121784-4-andrii@kernel.org
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Stable-dep-of: ecdf985d7615 ("bpf: track immediate values written to stack by BPF_ST instruction")
+Signed-off-by: Pu Lehui <pulehui@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/bpf/verifier.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++-
+ 1 file changed, 58 insertions(+), 1 deletion(-)
+
+--- a/kernel/bpf/verifier.c
++++ b/kernel/bpf/verifier.c
+@@ -517,6 +517,12 @@ static bool is_ptr_cast_function(enum bp
+ func_id == BPF_FUNC_skc_to_tcp_request_sock;
+ }
+
++static bool is_callback_calling_function(enum bpf_func_id func_id)
++{
++ return func_id == BPF_FUNC_for_each_map_elem ||
++ func_id == BPF_FUNC_timer_set_callback;
++}
++
+ static bool is_cmpxchg_insn(const struct bpf_insn *insn)
+ {
+ return BPF_CLASS(insn->code) == BPF_STX &&
+@@ -1446,7 +1452,7 @@ static void __mark_reg_unknown(const str
+ reg->type = SCALAR_VALUE;
+ reg->var_off = tnum_unknown;
+ reg->frameno = 0;
+- reg->precise = env->subprog_cnt > 1 || !env->bpf_capable;
++ reg->precise = !env->bpf_capable;
+ __mark_reg_unbounded(reg);
+ }
+
+@@ -2267,6 +2273,11 @@ static int backtrack_insn(struct bpf_ver
+ */
+ if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && insn->imm == 0)
+ return -ENOTSUPP;
++ /* BPF helpers that invoke callback subprogs are
++ * equivalent to BPF_PSEUDO_CALL above
++ */
++ if (insn->src_reg == 0 && is_callback_calling_function(insn->imm))
++ return -ENOTSUPP;
+ /* regular helper call sets R0 */
+ *reg_mask &= ~1;
+ if (*reg_mask & 0x3f) {
+@@ -2445,12 +2456,42 @@ static int __mark_chain_precision(struct
+ return 0;
+ if (!reg_mask && !stack_mask)
+ return 0;
++
+ for (;;) {
+ DECLARE_BITMAP(mask, 64);
+ u32 history = st->jmp_history_cnt;
+
+ if (env->log.level & BPF_LOG_LEVEL)
+ verbose(env, "last_idx %d first_idx %d\n", last_idx, first_idx);
++
++ if (last_idx < 0) {
++ /* we are at the entry into subprog, which
++ * is expected for global funcs, but only if
++ * requested precise registers are R1-R5
++ * (which are global func's input arguments)
++ */
++ if (st->curframe == 0 &&
++ st->frame[0]->subprogno > 0 &&
++ st->frame[0]->callsite == BPF_MAIN_FUNC &&
++ stack_mask == 0 && (reg_mask & ~0x3e) == 0) {
++ bitmap_from_u64(mask, reg_mask);
++ for_each_set_bit(i, mask, 32) {
++ reg = &st->frame[0]->regs[i];
++ if (reg->type != SCALAR_VALUE) {
++ reg_mask &= ~(1u << i);
++ continue;
++ }
++ reg->precise = true;
++ }
++ return 0;
++ }
++
++ verbose(env, "BUG backtracing func entry subprog %d reg_mask %x stack_mask %llx\n",
++ st->frame[0]->subprogno, reg_mask, stack_mask);
++ WARN_ONCE(1, "verifier backtracking bug");
++ return -EFAULT;
++ }
++
+ for (i = last_idx;;) {
+ if (skip_first) {
+ err = 0;
+@@ -5806,6 +5847,10 @@ typedef int (*set_callee_state_fn)(struc
+ struct bpf_func_state *callee,
+ int insn_idx);
+
++static int set_callee_state(struct bpf_verifier_env *env,
++ struct bpf_func_state *caller,
++ struct bpf_func_state *callee, int insn_idx);
++
+ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ int *insn_idx, int subprog,
+ set_callee_state_fn set_callee_state_cb)
+@@ -5856,6 +5901,16 @@ static int __check_func_call(struct bpf_
+ }
+ }
+
++ /* set_callee_state is used for direct subprog calls, but we are
++ * interested in validating only BPF helpers that can call subprogs as
++ * callbacks
++ */
++ if (set_callee_state_cb != set_callee_state && !is_callback_calling_function(insn->imm)) {
++ verbose(env, "verifier bug: helper %s#%d is not marked as callback-calling\n",
++ func_id_name(insn->imm), insn->imm);
++ return -EFAULT;
++ }
++
+ if (insn->code == (BPF_JMP | BPF_CALL) &&
+ insn->src_reg == 0 &&
+ insn->imm == BPF_FUNC_timer_set_callback) {
+@@ -13209,6 +13264,8 @@ static int do_check_common(struct bpf_ve
+ BPF_MAIN_FUNC /* callsite */,
+ 0 /* frameno */,
+ subprog);
++ state->first_insn_idx = env->subprog_info[subprog].start;
++ state->last_insn_idx = -1;
+
+ regs = state->frame[state->curframe]->regs;
+ if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) {
--- /dev/null
+From pulehui@huaweicloud.com Fri Aug 4 17:25:42 2023
+From: Pu Lehui <pulehui@huaweicloud.com>
+Date: Fri, 4 Aug 2023 23:24:55 +0800
+Subject: bpf: stop setting precise in current state
+To: stable@vger.kernel.org, Greg KH <greg@kroah.com>, Eduard Zingerman <eddyz87@gmail.com>, Luiz Capitulino <luizcap@amazon.com>
+Cc: Alexei Starovoitov <ast@kernel.org>, Daniel Borkmann <daniel@iogearbox.net>, Andrii Nakryiko <andrii@kernel.org>, Pu Lehui <pulehui@huawei.com>, Pu Lehui <pulehui@huaweicloud.com>
+Message-ID: <20230804152459.2565673-3-pulehui@huaweicloud.com>
+
+From: Andrii Nakryiko <andrii@kernel.org>
+
+[ Upstream commit f63181b6ae79fd3b034cde641db774268c2c3acf ]
+
+Setting reg->precise to true in the current state is not necessary from
+a correctness standpoint, but it does pessimise the whole precision (or
+rather "imprecision", because that's what we want to keep as much as
+possible) tracking. Why this is the case is somewhat subtle, and my
+best attempt to explain it is recorded in an extensive comment for the
+__mark_chain_precision() function. Some more careful thinking and code
+reading is probably still required to grok this completely,
+unfortunately. Whiteboarding and a bunch of extra handwaving in person
+would be even more helpful, but that is impractical in a Git commit.
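+
+The practical effect on state comparison is easiest to see in the SCALAR
+case of regsafe(); a simplified sketch of the hunk below:
+
+    if (rcur->type == SCALAR_VALUE) {
+        if (!rold->precise)
+            return true;    /* checkpointed state never needed the exact range */
+        /* old state was precise: current value must satisfy old knowledge */
+        return range_within(rold, rcur) &&
+               tnum_in(rold->var_off, rcur->var_off);
+    }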
+
+The next patch pushes this imprecision property even further, building
+on top of the insights described in this patch.
+
+The end results are pretty nice: we get a reduction in the total number
+of instructions and states verified due to better state reuse, as some
+of the states are now more generic and permissive due to fewer
+unnecessary precise=true requirements.
+
+SELFTESTS RESULTS
+=================
+
+$ ./veristat -C -e file,prog,insns,states ~/subprog-precise-results.csv ~/imprecise-early-results.csv | grep -v '+0'
+File Program Total insns (A) Total insns (B) Total insns (DIFF) Total states (A) Total states (B) Total states (DIFF)
+--------------------------------------- ---------------------- --------------- --------------- ------------------ ---------------- ---------------- -------------------
+bpf_iter_ksym.bpf.linked1.o dump_ksym 347 285 -62 (-17.87%) 20 19 -1 (-5.00%)
+pyperf600_bpf_loop.bpf.linked1.o on_event 3678 3736 +58 (+1.58%) 276 285 +9 (+3.26%)
+setget_sockopt.bpf.linked1.o skops_sockopt 4038 3947 -91 (-2.25%) 347 343 -4 (-1.15%)
+test_l4lb.bpf.linked1.o balancer_ingress 4559 2611 -1948 (-42.73%) 118 105 -13 (-11.02%)
+test_l4lb_noinline.bpf.linked1.o balancer_ingress 6279 6268 -11 (-0.18%) 237 236 -1 (-0.42%)
+test_misc_tcp_hdr_options.bpf.linked1.o misc_estab 1307 1303 -4 (-0.31%) 100 99 -1 (-1.00%)
+test_sk_lookup.bpf.linked1.o ctx_narrow_access 456 447 -9 (-1.97%) 39 38 -1 (-2.56%)
+test_sysctl_loop1.bpf.linked1.o sysctl_tcp_mem 1389 1384 -5 (-0.36%) 26 25 -1 (-3.85%)
+test_tc_dtime.bpf.linked1.o egress_fwdns_prio101 518 485 -33 (-6.37%) 51 46 -5 (-9.80%)
+test_tc_dtime.bpf.linked1.o egress_host 519 468 -51 (-9.83%) 50 44 -6 (-12.00%)
+test_tc_dtime.bpf.linked1.o ingress_fwdns_prio101 842 1000 +158 (+18.76%) 73 88 +15 (+20.55%)
+xdp_synproxy_kern.bpf.linked1.o syncookie_tc 405757 373173 -32584 (-8.03%) 25735 22882 -2853 (-11.09%)
+xdp_synproxy_kern.bpf.linked1.o syncookie_xdp 479055 371590 -107465 (-22.43%) 29145 22207 -6938 (-23.81%)
+--------------------------------------- ---------------------- --------------- --------------- ------------------ ---------------- ---------------- -------------------
+
+The slight regression in test_tc_dtime.bpf.linked1.o/ingress_fwdns_prio101
+is left for a follow-up; there might be some more precision-related bugs
+in the existing BPF verifier logic.
+
+CILIUM RESULTS
+==============
+
+$ ./veristat -C -e file,prog,insns,states ~/subprog-precise-results-cilium.csv ~/imprecise-early-results-cilium.csv | grep -v '+0'
+File Program Total insns (A) Total insns (B) Total insns (DIFF) Total states (A) Total states (B) Total states (DIFF)
+------------- ------------------------------ --------------- --------------- ------------------ ---------------- ---------------- -------------------
+bpf_host.o cil_from_host 762 556 -206 (-27.03%) 43 37 -6 (-13.95%)
+bpf_host.o tail_handle_nat_fwd_ipv4 23541 23426 -115 (-0.49%) 1538 1537 -1 (-0.07%)
+bpf_host.o tail_nodeport_nat_egress_ipv4 33592 33566 -26 (-0.08%) 2163 2161 -2 (-0.09%)
+bpf_lxc.o tail_handle_nat_fwd_ipv4 23541 23426 -115 (-0.49%) 1538 1537 -1 (-0.07%)
+bpf_overlay.o tail_nodeport_nat_egress_ipv4 33581 33543 -38 (-0.11%) 2160 2157 -3 (-0.14%)
+bpf_xdp.o tail_handle_nat_fwd_ipv4 21659 20920 -739 (-3.41%) 1440 1376 -64 (-4.44%)
+bpf_xdp.o tail_handle_nat_fwd_ipv6 17084 17039 -45 (-0.26%) 907 905 -2 (-0.22%)
+bpf_xdp.o tail_lb_ipv4 73442 73430 -12 (-0.02%) 4370 4369 -1 (-0.02%)
+bpf_xdp.o tail_lb_ipv6 152114 151895 -219 (-0.14%) 6493 6479 -14 (-0.22%)
+bpf_xdp.o tail_nodeport_nat_egress_ipv4 17377 17200 -177 (-1.02%) 1125 1111 -14 (-1.24%)
+bpf_xdp.o tail_nodeport_nat_ingress_ipv6 6405 6397 -8 (-0.12%) 309 308 -1 (-0.32%)
+bpf_xdp.o tail_rev_nodeport_lb4 7126 6934 -192 (-2.69%) 414 402 -12 (-2.90%)
+bpf_xdp.o tail_rev_nodeport_lb6 18059 17905 -154 (-0.85%) 1105 1096 -9 (-0.81%)
+------------- ------------------------------ --------------- --------------- ------------------ ---------------- ---------------- -------------------
+
+Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
+Link: https://lore.kernel.org/r/20221104163649.121784-5-andrii@kernel.org
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Stable-dep-of: ecdf985d7615 ("bpf: track immediate values written to stack by BPF_ST instruction")
+Signed-off-by: Pu Lehui <pulehui@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/bpf/verifier.c | 103 ++++++++++++++++++++++++++++++++++++++++++++------
+ 1 file changed, 91 insertions(+), 12 deletions(-)
+
+--- a/kernel/bpf/verifier.c
++++ b/kernel/bpf/verifier.c
+@@ -2382,8 +2382,11 @@ static void mark_all_scalars_precise(str
+
+ /* big hammer: mark all scalars precise in this path.
+ * pop_stack may still get !precise scalars.
++ * We also skip current state and go straight to first parent state,
++ * because precision markings in current non-checkpointed state are
++ * not needed. See why in the comment in __mark_chain_precision below.
+ */
+- for (; st; st = st->parent)
++ for (st = st->parent; st; st = st->parent) {
+ for (i = 0; i <= st->curframe; i++) {
+ func = st->frame[i];
+ for (j = 0; j < BPF_REG_FP; j++) {
+@@ -2401,8 +2404,88 @@ static void mark_all_scalars_precise(str
+ reg->precise = true;
+ }
+ }
++ }
+ }
+
++/*
++ * __mark_chain_precision() backtracks BPF program instruction sequence and
++ * chain of verifier states making sure that register *regno* (if regno >= 0)
++ * and/or stack slot *spi* (if spi >= 0) are marked as precisely tracked
++ * SCALARS, as well as any other registers and slots that contribute to
++ * a tracked state of given registers/stack slots, depending on specific BPF
++ * assembly instructions (see backtrack_insns() for exact instruction handling
++ * logic). This backtracking relies on recorded jmp_history and is able to
++ * traverse entire chain of parent states. This process ends only when all the
++ * necessary registers/slots and their transitive dependencies are marked as
++ * precise.
++ *
++ * One important and subtle aspect is that precise marks *do not matter* in
++ * the currently verified state (current state). It is important to understand
++ * why this is the case.
++ *
++ * First, note that current state is the state that is not yet "checkpointed",
++ * i.e., it is not yet put into env->explored_states, and it has no children
++ * states as well. It's ephemeral, and can end up either a) being discarded if
++ * compatible explored state is found at some point or BPF_EXIT instruction is
++ * reached or b) checkpointed and put into env->explored_states, branching out
++ * into one or more children states.
++ *
++ * In the former case, precise markings in current state are completely
++ * ignored by state comparison code (see regsafe() for details). Only
++ * checkpointed ("old") state precise markings are important, and if old
++ * state's register/slot is precise, regsafe() assumes current state's
++ * register/slot as precise and checks value ranges exactly and precisely. If
++ * states turn out to be compatible, current state's necessary precise
++ * markings and any required parent states' precise markings are enforced
++ * after the fact with propagate_precision() logic. But it's
++ * important to realize that in this case, even after marking current state
++ * registers/slots as precise, we immediately discard current state. So what
++ * actually matters is any of the precise markings propagated into current
++ * state's parent states, which are always checkpointed (due to b) case above).
++ * As such, for scenario a) it doesn't matter if current state has precise
++ * markings set or not.
++ *
++ * Now, for the scenario b), checkpointing and forking into child(ren)
++ * state(s). Note that before current state gets to checkpointing step, any
++ * processed instruction always assumes precise SCALAR register/slot
++ * knowledge: if precise value or range is useful to prune jump branch, BPF
++ * verifier takes this opportunity enthusiastically. Similarly, when
++ * register's value is used to calculate offset or memory address, exact
++ * knowledge of SCALAR range is assumed, checked, and enforced. So, similar to
++ * what we mentioned above about state comparison ignoring precise markings
++ * during state comparison, BPF verifier ignores and also assumes precise
++ * markings *at will* during instruction verification process. But as verifier
++ * assumes precision, it also propagates any precision dependencies across
++ * parent states, which are not yet finalized, so can be further restricted
++ * based on new knowledge gained from restrictions enforced by their children
++ * states. This is so that once those parent states are finalized, i.e., when
++ * they have no more active children state, state comparison logic in
++ * is_state_visited() would enforce strict and precise SCALAR ranges, if
++ * required for correctness.
++ *
++ * To build a bit more intuition, note also that once a state is checkpointed,
++ * the path we took to get to that state is not important. This is crucial
++ * property for state pruning. When state is checkpointed and finalized at
++ * some instruction index, it can be correctly and safely used to "short
++ * circuit" any *compatible* state that reaches exactly the same instruction
++ * index. I.e., if we jumped to that instruction from a completely different
++ * code path than original finalized state was derived from, it doesn't
++ * matter, current state can be discarded because from that instruction
++ * forward having a compatible state will ensure we will safely reach the
++ * exit. States describe preconditions for further exploration, but completely
++ * forget the history of how we got here.
++ *
++ * This also means that even if we needed precise SCALAR range to get to
++ * finalized state, but from that point forward *that same* SCALAR register is
++ * never used in a precise context (i.e., its precise value is not needed for
++ * correctness), it's correct and safe to mark such register as "imprecise"
++ * (i.e., precise marking set to false). This is what we rely on when we do
++ * not set precise marking in current state. If no child state requires
++ * precision for any given SCALAR register, it's safe to dictate that it can
++ * be imprecise. If any child state does require this register to be precise,
++ * we'll mark it precise later retroactively during precise markings
++ * propagation from child state to parent states.
++ */
+ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int regno,
+ int spi)
+ {
+@@ -2420,6 +2503,10 @@ static int __mark_chain_precision(struct
+ if (!env->bpf_capable)
+ return 0;
+
++ /* Do sanity checks against current state of register and/or stack
++ * slot, but don't set precise flag in current state, as precision
++ * tracking in the current state is unnecessary.
++ */
+ func = st->frame[frame];
+ if (regno >= 0) {
+ reg = &func->regs[regno];
+@@ -2427,11 +2514,7 @@ static int __mark_chain_precision(struct
+ WARN_ONCE(1, "backtracing misuse");
+ return -EFAULT;
+ }
+- if (!reg->precise)
+- new_marks = true;
+- else
+- reg_mask = 0;
+- reg->precise = true;
++ new_marks = true;
+ }
+
+ while (spi >= 0) {
+@@ -2444,11 +2527,7 @@ static int __mark_chain_precision(struct
+ stack_mask = 0;
+ break;
+ }
+- if (!reg->precise)
+- new_marks = true;
+- else
+- stack_mask = 0;
+- reg->precise = true;
++ new_marks = true;
+ break;
+ }
+
+@@ -10356,7 +10435,7 @@ static bool regsafe(struct bpf_verifier_
+ if (env->explore_alu_limits)
+ return false;
+ if (rcur->type == SCALAR_VALUE) {
+- if (!rold->precise && !rcur->precise)
++ if (!rold->precise)
+ return true;
+ /* new val must satisfy old val knowledge */
+ return range_within(rold, rcur) &&
--- /dev/null
+From pulehui@huaweicloud.com Fri Aug 4 17:25:38 2023
+From: Pu Lehui <pulehui@huaweicloud.com>
+Date: Fri, 4 Aug 2023 23:24:59 +0800
+Subject: selftests/bpf: Fix sk_assign on s390x
+To: stable@vger.kernel.org, Greg KH <greg@kroah.com>, Eduard Zingerman <eddyz87@gmail.com>, Luiz Capitulino <luizcap@amazon.com>
+Cc: Alexei Starovoitov <ast@kernel.org>, Daniel Borkmann <daniel@iogearbox.net>, Andrii Nakryiko <andrii@kernel.org>, Pu Lehui <pulehui@huawei.com>, Pu Lehui <pulehui@huaweicloud.com>
+Message-ID: <20230804152459.2565673-7-pulehui@huaweicloud.com>
+
+From: Ilya Leoshkevich <iii@linux.ibm.com>
+
+[ Upstream commit 7ce878ca81bca7811e669db4c394b86780e0dbe4 ]
+
+sk_assign is failing on an s390x machine running Debian "bookworm" for
+two reasons: a legacy server_map definition and an uninitialized addrlen
+in the recvfrom() call.
+
+Fix by adding a new-style server_map definition and dropping addrlen
+(recvfrom() allows NULL values for src_addr and addrlen).
+
+Since the test should support tc built without libbpf, build the prog
+twice: once with the old-style definition and once with the new-style
+definition, then select the right one at runtime. This could be done at
+compile time too, but that would not be cross-compilation friendly.
+
+Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
+Link: https://lore.kernel.org/r/20230129190501.1624747-2-iii@linux.ibm.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Pu Lehui <pulehui@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ tools/testing/selftests/bpf/prog_tests/sk_assign.c | 25 ++++++++++----
+ tools/testing/selftests/bpf/progs/test_sk_assign.c | 11 ++++++
+ tools/testing/selftests/bpf/progs/test_sk_assign_libbpf.c | 3 +
+ 3 files changed, 33 insertions(+), 6 deletions(-)
+ create mode 100644 tools/testing/selftests/bpf/progs/test_sk_assign_libbpf.c
+
+--- a/tools/testing/selftests/bpf/prog_tests/sk_assign.c
++++ b/tools/testing/selftests/bpf/prog_tests/sk_assign.c
+@@ -29,7 +29,23 @@ static int stop, duration;
+ static bool
+ configure_stack(void)
+ {
++ char tc_version[128];
+ char tc_cmd[BUFSIZ];
++ char *prog;
++ FILE *tc;
++
++ /* Check whether tc is built with libbpf. */
++ tc = popen("tc -V", "r");
++ if (CHECK_FAIL(!tc))
++ return false;
++ if (CHECK_FAIL(!fgets(tc_version, sizeof(tc_version), tc)))
++ return false;
++ if (strstr(tc_version, ", libbpf "))
++ prog = "test_sk_assign_libbpf.o";
++ else
++ prog = "test_sk_assign.o";
++ if (CHECK_FAIL(pclose(tc)))
++ return false;
+
+ /* Move to a new networking namespace */
+ if (CHECK_FAIL(unshare(CLONE_NEWNET)))
+@@ -46,8 +62,8 @@ configure_stack(void)
+ /* Load qdisc, BPF program */
+ if (CHECK_FAIL(system("tc qdisc add dev lo clsact")))
+ return false;
+- sprintf(tc_cmd, "%s %s %s %s", "tc filter add dev lo ingress bpf",
+- "direct-action object-file ./test_sk_assign.o",
++ sprintf(tc_cmd, "%s %s %s %s %s", "tc filter add dev lo ingress bpf",
++ "direct-action object-file", prog,
+ "section classifier/sk_assign_test",
+ (env.verbosity < VERBOSE_VERY) ? " 2>/dev/null" : "verbose");
+ if (CHECK(system(tc_cmd), "BPF load failed;",
+@@ -129,15 +145,12 @@ get_port(int fd)
+ static ssize_t
+ rcv_msg(int srv_client, int type)
+ {
+- struct sockaddr_storage ss;
+ char buf[BUFSIZ];
+- socklen_t slen;
+
+ if (type == SOCK_STREAM)
+ return read(srv_client, &buf, sizeof(buf));
+ else
+- return recvfrom(srv_client, &buf, sizeof(buf), 0,
+- (struct sockaddr *)&ss, &slen);
++ return recvfrom(srv_client, &buf, sizeof(buf), 0, NULL, NULL);
+ }
+
+ static int
+--- a/tools/testing/selftests/bpf/progs/test_sk_assign.c
++++ b/tools/testing/selftests/bpf/progs/test_sk_assign.c
+@@ -16,6 +16,16 @@
+ #include <bpf/bpf_helpers.h>
+ #include <bpf/bpf_endian.h>
+
++#if defined(IPROUTE2_HAVE_LIBBPF)
++/* Use a new-style map definition. */
++struct {
++ __uint(type, BPF_MAP_TYPE_SOCKMAP);
++ __type(key, int);
++ __type(value, __u64);
++ __uint(pinning, LIBBPF_PIN_BY_NAME);
++ __uint(max_entries, 1);
++} server_map SEC(".maps");
++#else
+ /* Pin map under /sys/fs/bpf/tc/globals/<map name> */
+ #define PIN_GLOBAL_NS 2
+
+@@ -35,6 +45,7 @@ struct {
+ .max_elem = 1,
+ .pinning = PIN_GLOBAL_NS,
+ };
++#endif
+
+ int _version SEC("version") = 1;
+ char _license[] SEC("license") = "GPL";
+--- /dev/null
++++ b/tools/testing/selftests/bpf/progs/test_sk_assign_libbpf.c
+@@ -0,0 +1,3 @@
++// SPDX-License-Identifier: GPL-2.0
++#define IPROUTE2_HAVE_LIBBPF
++#include "test_sk_assign.c"
--- /dev/null
+From pulehui@huaweicloud.com Fri Aug 4 17:25:38 2023
+From: Pu Lehui <pulehui@huaweicloud.com>
+Date: Fri, 4 Aug 2023 23:24:57 +0800
+Subject: selftests/bpf: make test_align selftest more robust
+To: stable@vger.kernel.org, Greg KH <greg@kroah.com>, Eduard Zingerman <eddyz87@gmail.com>, Luiz Capitulino <luizcap@amazon.com>
+Cc: Alexei Starovoitov <ast@kernel.org>, Daniel Borkmann <daniel@iogearbox.net>, Andrii Nakryiko <andrii@kernel.org>, Pu Lehui <pulehui@huawei.com>, Pu Lehui <pulehui@huaweicloud.com>
+Message-ID: <20230804152459.2565673-5-pulehui@huaweicloud.com>
+
+From: Andrii Nakryiko <andrii@kernel.org>
+
+[ Upstream commit 4f999b767769b76378c3616c624afd6f4bb0d99f ]
+
+The test_align selftest relies on the BPF verifier log emitting register
+states for specific instructions in an expected format. Unfortunately,
+the BPF verifier's precision backtracking log interferes with such
+expectations, and instructions on which precision propagation happens
+sometimes don't output the full expected register states. This does
+indeed look like something to be improved in the BPF verifier, but it is
+beyond the scope of this patch set.
+
+So to make test_align a bit more robust, inject a few dummy R4 = R5
+instructions which capture the desired state of R5 and won't have
+precision tracking logs on them. This fixes the tests until we can
+improve BPF verifier output in the presence of precision tracking.
+
+Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
+Link: https://lore.kernel.org/r/20221104163649.121784-7-andrii@kernel.org
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Stable-dep-of: ecdf985d7615 ("bpf: track immediate values written to stack by BPF_ST instruction")
+Signed-off-by: Pu Lehui <pulehui@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ tools/testing/selftests/bpf/prog_tests/align.c | 36 +++++++++++++++----------
+ 1 file changed, 23 insertions(+), 13 deletions(-)
+
+--- a/tools/testing/selftests/bpf/prog_tests/align.c
++++ b/tools/testing/selftests/bpf/prog_tests/align.c
+@@ -2,7 +2,7 @@
+ #include <test_progs.h>
+
+ #define MAX_INSNS 512
+-#define MAX_MATCHES 16
++#define MAX_MATCHES 24
+
+ struct bpf_reg_match {
+ unsigned int line;
+@@ -267,6 +267,7 @@ static struct bpf_align_test tests[] = {
+ */
+ BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6),
++ BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 14),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
+@@ -280,6 +281,7 @@ static struct bpf_align_test tests[] = {
+ BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 14),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6),
++ BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 4),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
+@@ -311,44 +313,52 @@ static struct bpf_align_test tests[] = {
+ {15, "R4=pkt(id=1,off=18,r=18,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ {15, "R5=pkt(id=1,off=14,r=18,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ /* Variable offset is added to R5 packet pointer,
+- * resulting in auxiliary alignment of 4.
++ * resulting in auxiliary alignment of 4. To avoid BPF
++ * verifier's precision backtracking logging
++ * interfering we also have a no-op R4 = R5
++ * instruction to validate R5 state. We also check
++ * that R4 is what it should be in such case.
+ */
+- {18, "R5_w=pkt(id=2,off=0,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
++ {19, "R4_w=pkt(id=2,off=0,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
++ {19, "R5_w=pkt(id=2,off=0,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ /* Constant offset is added to R5, resulting in
+ * reg->off of 14.
+ */
+- {19, "R5_w=pkt(id=2,off=14,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
++ {20, "R5_w=pkt(id=2,off=14,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ /* At the time the word size load is performed from R5,
+ * its total fixed offset is NET_IP_ALIGN + reg->off
+ * (14) which is 16. Then the variable offset is 4-byte
+ * aligned, so the total offset is 4-byte aligned and
+ * meets the load's requirements.
+ */
+- {23, "R4=pkt(id=2,off=18,r=18,umax_value=1020,var_off=(0x0; 0x3fc))"},
+- {23, "R5=pkt(id=2,off=14,r=18,umax_value=1020,var_off=(0x0; 0x3fc))"},
++ {24, "R4=pkt(id=2,off=18,r=18,umax_value=1020,var_off=(0x0; 0x3fc))"},
++ {24, "R5=pkt(id=2,off=14,r=18,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ /* Constant offset is added to R5 packet pointer,
+ * resulting in reg->off value of 14.
+ */
+- {26, "R5_w=pkt(id=0,off=14,r=8"},
++ {27, "R5_w=pkt(id=0,off=14,r=8"},
+ /* Variable offset is added to R5, resulting in a
+- * variable offset of (4n).
++ * variable offset of (4n). See comment for insn #19
++ * for R4 = R5 trick.
+ */
+- {27, "R5_w=pkt(id=3,off=14,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
++ {29, "R4_w=pkt(id=3,off=14,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
++ {29, "R5_w=pkt(id=3,off=14,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ /* Constant is added to R5 again, setting reg->off to 18. */
+- {28, "R5_w=pkt(id=3,off=18,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
++ {30, "R5_w=pkt(id=3,off=18,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ /* And once more we add a variable; resulting var_off
+ * is still (4n), fixed offset is not changed.
+ * Also, we create a new reg->id.
+ */
+- {29, "R5_w=pkt(id=4,off=18,r=0,umax_value=2040,var_off=(0x0; 0x7fc)"},
++ {32, "R4_w=pkt(id=4,off=18,r=0,umax_value=2040,var_off=(0x0; 0x7fc)"},
++ {32, "R5_w=pkt(id=4,off=18,r=0,umax_value=2040,var_off=(0x0; 0x7fc)"},
+ /* At the time the word size load is performed from R5,
+ * its total fixed offset is NET_IP_ALIGN + reg->off (18)
+ * which is 20. Then the variable offset is (4n), so
+ * the total offset is 4-byte aligned and meets the
+ * load's requirements.
+ */
+- {33, "R4=pkt(id=4,off=22,r=22,umax_value=2040,var_off=(0x0; 0x7fc)"},
+- {33, "R5=pkt(id=4,off=18,r=22,umax_value=2040,var_off=(0x0; 0x7fc)"},
++ {35, "R4=pkt(id=4,off=22,r=22,umax_value=2040,var_off=(0x0; 0x7fc)"},
++ {35, "R5=pkt(id=4,off=18,r=22,umax_value=2040,var_off=(0x0; 0x7fc)"},
+ },
+ },
+ {
--- /dev/null
+From pulehui@huaweicloud.com Fri Aug 4 17:25:37 2023
+From: Pu Lehui <pulehui@huaweicloud.com>
+Date: Fri, 4 Aug 2023 23:24:58 +0800
+Subject: selftests/bpf: Workaround verification failure for fexit_bpf2bpf/func_replace_return_code
+To: stable@vger.kernel.org, Greg KH <greg@kroah.com>, Eduard Zingerman <eddyz87@gmail.com>, Luiz Capitulino <luizcap@amazon.com>
+Cc: Alexei Starovoitov <ast@kernel.org>, Daniel Borkmann <daniel@iogearbox.net>, Andrii Nakryiko <andrii@kernel.org>, Pu Lehui <pulehui@huawei.com>, Pu Lehui <pulehui@huaweicloud.com>
+Message-ID: <20230804152459.2565673-6-pulehui@huaweicloud.com>
+
+From: Yonghong Song <yhs@fb.com>
+
+[ Upstream commit 63d78b7e8ca2d0eb8c687a355fa19d01b6fcc723 ]
+
+With latest llvm17, selftest fexit_bpf2bpf/func_replace_return_code
+has the following verification failure:
+
+ 0: R1=ctx(off=0,imm=0) R10=fp0
+ ; int connect_v4_prog(struct bpf_sock_addr *ctx)
+ 0: (bf) r7 = r1 ; R1=ctx(off=0,imm=0) R7_w=ctx(off=0,imm=0)
+ 1: (b4) w6 = 0 ; R6_w=0
+ ; memset(&tuple.ipv4.saddr, 0, sizeof(tuple.ipv4.saddr));
+ ...
+ ; return do_bind(ctx) ? 1 : 0;
+ 179: (bf) r1 = r7 ; R1=ctx(off=0,imm=0) R7=ctx(off=0,imm=0)
+ 180: (85) call pc+147
+ Func#3 is global and valid. Skipping.
+ 181: R0_w=scalar()
+ 181: (bc) w6 = w0 ; R0_w=scalar() R6_w=scalar(umax=4294967295,var_off=(0x0; 0xffffffff))
+ 182: (05) goto pc-129
+ ; }
+ 54: (bc) w0 = w6 ; R0_w=scalar(umax=4294967295,var_off=(0x0; 0xffffffff)) R6_w=scalar(umax=4294967295,var_off=(0x0; 0xffffffff))
+ 55: (95) exit
+ At program exit the register R0 has value (0x0; 0xffffffff) should have been in (0x0; 0x1)
+ processed 281 insns (limit 1000000) max_states_per_insn 1 total_states 26 peak_states 26 mark_read 13
+ -- END PROG LOAD LOG --
+ libbpf: prog 'connect_v4_prog': failed to load: -22
+
+The corresponding source code:
+
+ __attribute__ ((noinline))
+ int do_bind(struct bpf_sock_addr *ctx)
+ {
+ struct sockaddr_in sa = {};
+
+ sa.sin_family = AF_INET;
+ sa.sin_port = bpf_htons(0);
+ sa.sin_addr.s_addr = bpf_htonl(SRC_REWRITE_IP4);
+
+ if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)) != 0)
+ return 0;
+
+ return 1;
+ }
+ ...
+ SEC("cgroup/connect4")
+ int connect_v4_prog(struct bpf_sock_addr *ctx)
+ {
+ ...
+ return do_bind(ctx) ? 1 : 0;
+ }
+
+Insn 180 is a call to 'do_bind'. The call's return value is also the
+return value of the program. Since do_bind() returns 0/1, it is
+legitimate for the compiler to optimize 'return do_bind(ctx) ? 1 : 0' to
+'return do_bind(ctx)'. However, such an optimization breaks the verifier,
+as the return value of 'do_bind()' is marked as an arbitrary scalar,
+which violates the requirement that the prog return value be 0/1.
+
+There are two ways to fix this problem: (1) changing 'return 1' in
+do_bind() to e.g. 'return 10' so the compiler has to emit
+'do_bind(ctx) ? 1 : 0', or (2) as suggested by Andrii, marking do_bind()
+with the __weak attribute so the compiler cannot make any assumption
+about do_bind()'s return value.
+
+This patch adopts the __weak approach, which is simpler and more
+resistant to potential compiler optimizations.
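+
+For comparison, a sketch of the alternative fix (1) mentioned above (not
+what this patch applies):
+
+    __attribute__ ((noinline))
+    int do_bind(struct bpf_sock_addr *ctx)
+    {
+        struct sockaddr_in sa = {};
+
+        sa.sin_family = AF_INET;
+        sa.sin_port = bpf_htons(0);
+        sa.sin_addr.s_addr = bpf_htonl(SRC_REWRITE_IP4);
+
+        if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)) != 0)
+            return 0;
+
+        return 10;      /* any nonzero value other than 1 forces the caller
+                         * to keep the explicit 'do_bind(ctx) ? 1 : 0' */
+    }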
+
+Suggested-by: Andrii Nakryiko <andrii@kernel.org>
+Signed-off-by: Yonghong Song <yhs@fb.com>
+Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
+Link: https://lore.kernel.org/bpf/20230310012410.2920570-1-yhs@fb.com
+Signed-off-by: Pu Lehui <pulehui@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ tools/testing/selftests/bpf/progs/connect4_prog.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/tools/testing/selftests/bpf/progs/connect4_prog.c
++++ b/tools/testing/selftests/bpf/progs/connect4_prog.c
+@@ -33,7 +33,7 @@
+
+ int _version SEC("version") = 1;
+
+-__attribute__ ((noinline))
++__attribute__ ((noinline)) __weak
+ int do_bind(struct bpf_sock_addr *ctx)
+ {
+ struct sockaddr_in sa = {};
hwmon-pmbus-bel-pfe-enable-pmbus_skip_status_check-for-pfe1100.patch
radix-tree-test-suite-fix-incorrect-allocation-size-for-pthreads.patch
nilfs2-fix-use-after-free-of-nilfs_root-in-dirtying-inodes-via-iput.patch
+bpf-allow-precision-tracking-for-programs-with-subprogs.patch
+bpf-stop-setting-precise-in-current-state.patch
+bpf-aggressively-forget-precise-markings-during-state-checkpointing.patch
+selftests-bpf-make-test_align-selftest-more-robust.patch
+selftests-bpf-workaround-verification-failure-for-fexit_bpf2bpf-func_replace_return_code.patch
+selftests-bpf-fix-sk_assign-on-s390x.patch