From 46ac68a51a8031e1134d27e4ca90992552154a16 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 24 Jul 2023 17:28:06 +0200 Subject: [PATCH] 6.1-stable patches added patches: bpf-aggressively-forget-precise-markings-during-state-checkpointing.patch bpf-allow-precision-tracking-for-programs-with-subprogs.patch bpf-stop-setting-precise-in-current-state.patch selftests-bpf-fix-sk_assign-on-s390x.patch selftests-bpf-make-test_align-selftest-more-robust.patch selftests-bpf-workaround-verification-failure-for-fexit_bpf2bpf-func_replace_return_code.patch --- ...-markings-during-state-checkpointing.patch | 128 +++++++++ ...-tracking-for-programs-with-subprogs.patch | 246 ++++++++++++++++++ ...top-setting-precise-in-current-state.patch | 234 +++++++++++++++++ ...selftests-bpf-fix-sk_assign-on-s390x.patch | 123 +++++++++ ...make-test_align-selftest-more-robust.patch | 134 ++++++++++ ...xit_bpf2bpf-func_replace_return_code.patch | 95 +++++++ queue-6.1/series | 6 + 7 files changed, 966 insertions(+) create mode 100644 queue-6.1/bpf-aggressively-forget-precise-markings-during-state-checkpointing.patch create mode 100644 queue-6.1/bpf-allow-precision-tracking-for-programs-with-subprogs.patch create mode 100644 queue-6.1/bpf-stop-setting-precise-in-current-state.patch create mode 100644 queue-6.1/selftests-bpf-fix-sk_assign-on-s390x.patch create mode 100644 queue-6.1/selftests-bpf-make-test_align-selftest-more-robust.patch create mode 100644 queue-6.1/selftests-bpf-workaround-verification-failure-for-fexit_bpf2bpf-func_replace_return_code.patch diff --git a/queue-6.1/bpf-aggressively-forget-precise-markings-during-state-checkpointing.patch b/queue-6.1/bpf-aggressively-forget-precise-markings-during-state-checkpointing.patch new file mode 100644 index 00000000000..d3ca2081c08 --- /dev/null +++ b/queue-6.1/bpf-aggressively-forget-precise-markings-during-state-checkpointing.patch @@ -0,0 +1,128 @@ +From stable-owner@vger.kernel.org Mon Jul 24 14:42:44 2023 +From: Eduard Zingerman +Date: Mon, 24 Jul 2023 15:42:20 +0300 +Subject: bpf: aggressively forget precise markings during state checkpointing +To: stable@vger.kernel.org, ast@kernel.org +Cc: andrii@kernel.org, daniel@iogearbox.net, martin.lau@linux.dev, yhs@fb.com, mykolal@fb.com, luizcap@amazon.com, Eduard Zingerman +Message-ID: <20230724124223.1176479-4-eddyz87@gmail.com> + +From: Andrii Nakryiko + +[ Upstream commit 7a830b53c17bbadcf99f778f28aaaa4e6c41df5f ] + +Exploit the property of about-to-be-checkpointed state to be able to +forget all precise markings up to that point even more aggressively. We +now clear all potentially inherited precise markings right before +checkpointing and branching off into child state. If any of children +states require precise knowledge of any SCALAR register, those will be +propagated backwards later on before this state is finalized, preserving +correctness. + +There is a single selftests BPF program change, but tremendous one: 25x +reduction in number of verified instructions and states in +trace_virtqueue_add_sgs. + +Cilium results are more modest, but happen across wider range of programs. + +SELFTESTS RESULTS +================= + +$ ./veristat -C -e file,prog,insns,states ~/imprecise-early-results.csv ~/imprecise-aggressive-results.csv | grep -v '+0' +File Program Total insns (A) Total insns (B) Total insns (DIFF) Total states (A) Total states (B) Total states (DIFF) +------------------- ----------------------- --------------- --------------- ------------------ ---------------- ---------------- ------------------- +loop6.bpf.linked1.o trace_virtqueue_add_sgs 398057 15114 -382943 (-96.20%) 8717 336 -8381 (-96.15%) +------------------- ----------------------- --------------- --------------- ------------------ ---------------- ---------------- ------------------- + +CILIUM RESULTS +============== + +$ ./veristat -C -e file,prog,insns,states ~/imprecise-early-results-cilium.csv ~/imprecise-aggressive-results-cilium.csv | grep -v '+0' +File Program Total insns (A) Total insns (B) Total insns (DIFF) Total states (A) Total states (B) Total states (DIFF) +------------- -------------------------------- --------------- --------------- ------------------ ---------------- ---------------- ------------------- +bpf_host.o tail_handle_nat_fwd_ipv4 23426 23221 -205 (-0.88%) 1537 1515 -22 (-1.43%) +bpf_host.o tail_handle_nat_fwd_ipv6 13009 12904 -105 (-0.81%) 719 708 -11 (-1.53%) +bpf_host.o tail_nodeport_nat_ingress_ipv6 5261 5196 -65 (-1.24%) 247 243 -4 (-1.62%) +bpf_host.o tail_nodeport_nat_ipv6_egress 3446 3406 -40 (-1.16%) 203 198 -5 (-2.46%) +bpf_lxc.o tail_handle_nat_fwd_ipv4 23426 23221 -205 (-0.88%) 1537 1515 -22 (-1.43%) +bpf_lxc.o tail_handle_nat_fwd_ipv6 13009 12904 -105 (-0.81%) 719 708 -11 (-1.53%) +bpf_lxc.o tail_ipv4_ct_egress 5074 4897 -177 (-3.49%) 255 248 -7 (-2.75%) +bpf_lxc.o tail_ipv4_ct_ingress 5100 4923 -177 (-3.47%) 255 248 -7 (-2.75%) +bpf_lxc.o tail_ipv4_ct_ingress_policy_only 5100 4923 -177 (-3.47%) 255 248 -7 (-2.75%) +bpf_lxc.o tail_ipv6_ct_egress 4558 4536 -22 (-0.48%) 188 187 -1 (-0.53%) +bpf_lxc.o tail_ipv6_ct_ingress 4578 4556 -22 (-0.48%) 188 187 -1 (-0.53%) +bpf_lxc.o tail_ipv6_ct_ingress_policy_only 4578 4556 -22 (-0.48%) 188 187 -1 (-0.53%) +bpf_lxc.o tail_nodeport_nat_ingress_ipv6 5261 5196 -65 (-1.24%) 247 243 -4 (-1.62%) +bpf_overlay.o tail_nodeport_nat_ingress_ipv6 5261 5196 -65 (-1.24%) 247 243 -4 (-1.62%) +bpf_overlay.o tail_nodeport_nat_ipv6_egress 3482 3442 -40 (-1.15%) 204 201 -3 (-1.47%) +bpf_xdp.o tail_nodeport_nat_egress_ipv4 17200 15619 -1581 (-9.19%) 1111 1010 -101 (-9.09%) +------------- -------------------------------- --------------- --------------- ------------------ ---------------- ---------------- ------------------- + +Signed-off-by: Andrii Nakryiko +Link: https://lore.kernel.org/r/20221104163649.121784-6-andrii@kernel.org +Signed-off-by: Alexei Starovoitov +Signed-off-by: Eduard Zingerman +Signed-off-by: Greg Kroah-Hartman +--- + kernel/bpf/verifier.c | 37 +++++++++++++++++++++++++++++++++++++ + 1 file changed, 37 insertions(+) + +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -2813,6 +2813,31 @@ static void mark_all_scalars_precise(str + } + } + ++static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_verifier_state *st) ++{ ++ struct bpf_func_state *func; ++ struct bpf_reg_state *reg; ++ int i, j; ++ ++ for (i = 0; i <= st->curframe; i++) { ++ func = st->frame[i]; ++ for (j = 0; j < BPF_REG_FP; j++) { ++ reg = &func->regs[j]; ++ if (reg->type != SCALAR_VALUE) ++ continue; ++ reg->precise = false; ++ } ++ for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) { ++ if (!is_spilled_reg(&func->stack[j])) ++ continue; ++ reg = &func->stack[j].spilled_ptr; ++ if (reg->type != SCALAR_VALUE) ++ continue; ++ reg->precise = false; ++ } ++ } ++} ++ + /* + * __mark_chain_precision() backtracks BPF program instruction sequence and + * chain of verifier states making sure that register *regno* (if regno >= 0) +@@ -2891,6 +2916,14 @@ static void mark_all_scalars_precise(str + * be imprecise. If any child state does require this register to be precise, + * we'll mark it precise later retroactively during precise markings + * propagation from child state to parent states. ++ * ++ * Skipping precise marking setting in current state is a mild version of ++ * relying on the above observation. But we can utilize this property even ++ * more aggressively by proactively forgetting any precise marking in the ++ * current state (which we inherited from the parent state), right before we ++ * checkpoint it and branch off into new child state. This is done by ++ * mark_all_scalars_imprecise() to hopefully get more permissive and generic ++ * finalized states which help in short circuiting more future states. + */ + static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int regno, + int spi) +@@ -12296,6 +12329,10 @@ next: + env->prev_jmps_processed = env->jmps_processed; + env->prev_insn_processed = env->insn_processed; + ++ /* forget precise markings we inherited, see __mark_chain_precision */ ++ if (env->bpf_capable) ++ mark_all_scalars_imprecise(env, cur); ++ + /* add new state to the head of linked list */ + new = &new_sl->state; + err = copy_verifier_state(new, cur); diff --git a/queue-6.1/bpf-allow-precision-tracking-for-programs-with-subprogs.patch b/queue-6.1/bpf-allow-precision-tracking-for-programs-with-subprogs.patch new file mode 100644 index 00000000000..acec2f6d51e --- /dev/null +++ b/queue-6.1/bpf-allow-precision-tracking-for-programs-with-subprogs.patch @@ -0,0 +1,246 @@ +From stable-owner@vger.kernel.org Mon Jul 24 14:42:40 2023 +From: Eduard Zingerman +Date: Mon, 24 Jul 2023 15:42:18 +0300 +Subject: bpf: allow precision tracking for programs with subprogs +To: stable@vger.kernel.org, ast@kernel.org +Cc: andrii@kernel.org, daniel@iogearbox.net, martin.lau@linux.dev, yhs@fb.com, mykolal@fb.com, luizcap@amazon.com, Eduard Zingerman +Message-ID: <20230724124223.1176479-2-eddyz87@gmail.com> + +From: Andrii Nakryiko + +[ Upstream commit be2ef8161572ec1973124ebc50f56dafc2925e07 ] + +Stop forcing precise=true for SCALAR registers when BPF program has any +subprograms. Current restriction means that any BPF program, as soon as +it uses subprograms, will end up not getting any of the precision +tracking benefits in reduction of number of verified states. + +This patch keeps the fallback mark_all_scalars_precise() behavior if +precise marking has to cross function frames. E.g., if subprogram +requires R1 (first input arg) to be marked precise, ideally we'd need to +backtrack to the parent function and keep marking R1 and its +dependencies as precise. But right now we give up and force all the +SCALARs in any of the current and parent states to be forced to +precise=true. We can lift that restriction in the future. + +But this patch fixes two issues identified when trying to enable +precision tracking for subprogs. + +First, prevent "escaping" from top-most state in a global subprog. While +with entry-level BPF program we never end up requesting precision for +R1-R5 registers, because R2-R5 are not initialized (and so not readable +in correct BPF program), and R1 is PTR_TO_CTX, not SCALAR, and so is +implicitly precise. With global subprogs, though, it's different, as +global subprog a) can have up to 5 SCALAR input arguments, which might +get marked as precise=true and b) it is validated in isolation from its +main entry BPF program. b) means that we can end up exhausting parent +state chain and still not mark all registers in reg_mask as precise, +which would lead to verifier bug warning. + +To handle that, we need to consider two cases. First, if the very first +state is not immediately "checkpointed" (i.e., stored in state lookup +hashtable), it will get correct first_insn_idx and last_insn_idx +instruction set during state checkpointing. As such, this case is +already handled and __mark_chain_precision() already handles that by +just doing nothing when we reach to the very first parent state. +st->parent will be NULL and we'll just stop. Perhaps some extra check +for reg_mask and stack_mask is due here, but this patch doesn't address +that issue. + +More problematic second case is when global function's initial state is +immediately checkpointed before we manage to process the very first +instruction. This is happening because when there is a call to global +subprog from the main program the very first subprog's instruction is +marked as pruning point, so before we manage to process first +instruction we have to check and checkpoint state. This patch adds +a special handling for such "empty" state, which is identified by having +st->last_insn_idx set to -1. In such case, we check that we are indeed +validating global subprog, and with some sanity checking we mark input +args as precise if requested. + +Note that we also initialize state->first_insn_idx with correct start +insn_idx offset. For main program zero is correct value, but for any +subprog it's quite confusing to not have first_insn_idx set. This +doesn't have any functional impact, but helps with debugging and state +printing. We also explicitly initialize state->last_insns_idx instead of +relying on is_state_visited() to do this with env->prev_insns_idx, which +will be -1 on the very first instruction. This concludes necessary +changes to handle specifically global subprog's precision tracking. + +Second identified problem was missed handling of BPF helper functions +that call into subprogs (e.g., bpf_loop and few others). From precision +tracking and backtracking logic's standpoint those are effectively calls +into subprogs and should be called as BPF_PSEUDO_CALL calls. + +This patch takes the least intrusive way and just checks against a short +list of current BPF helpers that do call subprogs, encapsulated in +is_callback_calling_function() function. But to prevent accidentally +forgetting to add new BPF helpers to this "list", we also do a sanity +check in __check_func_call, which has to be called for each such special +BPF helper, to validate that BPF helper is indeed recognized as +callback-calling one. This should catch any missed checks in the future. +Adding some special flags to be added in function proto definitions +seemed like an overkill in this case. + +With the above changes, it's possible to remove forceful setting of +reg->precise to true in __mark_reg_unknown, which turns on precision +tracking both inside subprogs and entry progs that have subprogs. No +warnings or errors were detected across all the selftests, but also when +validating with veristat against internal Meta BPF objects and Cilium +objects. Further, in some BPF programs there are noticeable reduction in +number of states and instructions validated due to more effective +precision tracking, especially benefiting syncookie test. + +$ ./veristat -C -e file,prog,insns,states ~/baseline-results.csv ~/subprog-precise-results.csv | grep -v '+0' +File Program Total insns (A) Total insns (B) Total insns (DIFF) Total states (A) Total states (B) Total states (DIFF) +---------------------------------------- -------------------------- --------------- --------------- ------------------ ---------------- ---------------- ------------------- +pyperf600_bpf_loop.bpf.linked1.o on_event 3966 3678 -288 (-7.26%) 306 276 -30 (-9.80%) +pyperf_global.bpf.linked1.o on_event 7563 7530 -33 (-0.44%) 520 517 -3 (-0.58%) +pyperf_subprogs.bpf.linked1.o on_event 36358 36934 +576 (+1.58%) 2499 2531 +32 (+1.28%) +setget_sockopt.bpf.linked1.o skops_sockopt 3965 4038 +73 (+1.84%) 343 347 +4 (+1.17%) +test_cls_redirect_subprogs.bpf.linked1.o cls_redirect 64965 64901 -64 (-0.10%) 4619 4612 -7 (-0.15%) +test_misc_tcp_hdr_options.bpf.linked1.o misc_estab 1491 1307 -184 (-12.34%) 110 100 -10 (-9.09%) +test_pkt_access.bpf.linked1.o test_pkt_access 354 349 -5 (-1.41%) 25 24 -1 (-4.00%) +test_sock_fields.bpf.linked1.o egress_read_sock_fields 435 375 -60 (-13.79%) 22 20 -2 (-9.09%) +test_sysctl_loop2.bpf.linked1.o sysctl_tcp_mem 1508 1501 -7 (-0.46%) 29 28 -1 (-3.45%) +test_tc_dtime.bpf.linked1.o egress_fwdns_prio100 468 435 -33 (-7.05%) 45 41 -4 (-8.89%) +test_tc_dtime.bpf.linked1.o ingress_fwdns_prio100 398 408 +10 (+2.51%) 42 39 -3 (-7.14%) +test_tc_dtime.bpf.linked1.o ingress_fwdns_prio101 1096 842 -254 (-23.18%) 97 73 -24 (-24.74%) +test_tcp_hdr_options.bpf.linked1.o estab 2758 2408 -350 (-12.69%) 208 181 -27 (-12.98%) +test_urandom_usdt.bpf.linked1.o urand_read_with_sema 466 448 -18 (-3.86%) 31 28 -3 (-9.68%) +test_urandom_usdt.bpf.linked1.o urand_read_without_sema 466 448 -18 (-3.86%) 31 28 -3 (-9.68%) +test_urandom_usdt.bpf.linked1.o urandlib_read_with_sema 466 448 -18 (-3.86%) 31 28 -3 (-9.68%) +test_urandom_usdt.bpf.linked1.o urandlib_read_without_sema 466 448 -18 (-3.86%) 31 28 -3 (-9.68%) +test_xdp_noinline.bpf.linked1.o balancer_ingress_v6 4302 4294 -8 (-0.19%) 257 256 -1 (-0.39%) +xdp_synproxy_kern.bpf.linked1.o syncookie_tc 583722 405757 -177965 (-30.49%) 35846 25735 -10111 (-28.21%) +xdp_synproxy_kern.bpf.linked1.o syncookie_xdp 609123 479055 -130068 (-21.35%) 35452 29145 -6307 (-17.79%) +---------------------------------------- -------------------------- --------------- --------------- ------------------ ---------------- ---------------- ------------------- + +Signed-off-by: Andrii Nakryiko +Link: https://lore.kernel.org/r/20221104163649.121784-4-andrii@kernel.org +Signed-off-by: Alexei Starovoitov +Signed-off-by: Eduard Zingerman +Signed-off-by: Greg Kroah-Hartman +--- + kernel/bpf/verifier.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 61 insertions(+), 1 deletion(-) + +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -511,6 +511,15 @@ static bool is_dynptr_ref_function(enum + return func_id == BPF_FUNC_dynptr_data; + } + ++static bool is_callback_calling_function(enum bpf_func_id func_id) ++{ ++ return func_id == BPF_FUNC_for_each_map_elem || ++ func_id == BPF_FUNC_timer_set_callback || ++ func_id == BPF_FUNC_find_vma || ++ func_id == BPF_FUNC_loop || ++ func_id == BPF_FUNC_user_ringbuf_drain; ++} ++ + static bool helper_multiple_ref_obj_use(enum bpf_func_id func_id, + const struct bpf_map *map) + { +@@ -1693,7 +1702,7 @@ static void __mark_reg_unknown(const str + reg->type = SCALAR_VALUE; + reg->var_off = tnum_unknown; + reg->frameno = 0; +- reg->precise = env->subprog_cnt > 1 || !env->bpf_capable; ++ reg->precise = !env->bpf_capable; + __mark_reg_unbounded(reg); + } + +@@ -2670,6 +2679,11 @@ static int backtrack_insn(struct bpf_ver + */ + if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && insn->imm == 0) + return -ENOTSUPP; ++ /* BPF helpers that invoke callback subprogs are ++ * equivalent to BPF_PSEUDO_CALL above ++ */ ++ if (insn->src_reg == 0 && is_callback_calling_function(insn->imm)) ++ return -ENOTSUPP; + /* regular helper call sets R0 */ + *reg_mask &= ~1; + if (*reg_mask & 0x3f) { +@@ -2848,12 +2862,42 @@ static int __mark_chain_precision(struct + return 0; + if (!reg_mask && !stack_mask) + return 0; ++ + for (;;) { + DECLARE_BITMAP(mask, 64); + u32 history = st->jmp_history_cnt; + + if (env->log.level & BPF_LOG_LEVEL2) + verbose(env, "last_idx %d first_idx %d\n", last_idx, first_idx); ++ ++ if (last_idx < 0) { ++ /* we are at the entry into subprog, which ++ * is expected for global funcs, but only if ++ * requested precise registers are R1-R5 ++ * (which are global func's input arguments) ++ */ ++ if (st->curframe == 0 && ++ st->frame[0]->subprogno > 0 && ++ st->frame[0]->callsite == BPF_MAIN_FUNC && ++ stack_mask == 0 && (reg_mask & ~0x3e) == 0) { ++ bitmap_from_u64(mask, reg_mask); ++ for_each_set_bit(i, mask, 32) { ++ reg = &st->frame[0]->regs[i]; ++ if (reg->type != SCALAR_VALUE) { ++ reg_mask &= ~(1u << i); ++ continue; ++ } ++ reg->precise = true; ++ } ++ return 0; ++ } ++ ++ verbose(env, "BUG backtracing func entry subprog %d reg_mask %x stack_mask %llx\n", ++ st->frame[0]->subprogno, reg_mask, stack_mask); ++ WARN_ONCE(1, "verifier backtracking bug"); ++ return -EFAULT; ++ } ++ + for (i = last_idx;;) { + if (skip_first) { + err = 0; +@@ -6732,6 +6776,10 @@ typedef int (*set_callee_state_fn)(struc + struct bpf_func_state *callee, + int insn_idx); + ++static int set_callee_state(struct bpf_verifier_env *env, ++ struct bpf_func_state *caller, ++ struct bpf_func_state *callee, int insn_idx); ++ + static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, + int *insn_idx, int subprog, + set_callee_state_fn set_callee_state_cb) +@@ -6782,6 +6830,16 @@ static int __check_func_call(struct bpf_ + } + } + ++ /* set_callee_state is used for direct subprog calls, but we are ++ * interested in validating only BPF helpers that can call subprogs as ++ * callbacks ++ */ ++ if (set_callee_state_cb != set_callee_state && !is_callback_calling_function(insn->imm)) { ++ verbose(env, "verifier bug: helper %s#%d is not marked as callback-calling\n", ++ func_id_name(insn->imm), insn->imm); ++ return -EFAULT; ++ } ++ + if (insn->code == (BPF_JMP | BPF_CALL) && + insn->src_reg == 0 && + insn->imm == BPF_FUNC_timer_set_callback) { +@@ -14713,6 +14771,8 @@ static int do_check_common(struct bpf_ve + BPF_MAIN_FUNC /* callsite */, + 0 /* frameno */, + subprog); ++ state->first_insn_idx = env->subprog_info[subprog].start; ++ state->last_insn_idx = -1; + + regs = state->frame[state->curframe]->regs; + if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) { diff --git a/queue-6.1/bpf-stop-setting-precise-in-current-state.patch b/queue-6.1/bpf-stop-setting-precise-in-current-state.patch new file mode 100644 index 00000000000..0ca70ac779b --- /dev/null +++ b/queue-6.1/bpf-stop-setting-precise-in-current-state.patch @@ -0,0 +1,234 @@ +From stable-owner@vger.kernel.org Mon Jul 24 14:42:43 2023 +From: Eduard Zingerman +Date: Mon, 24 Jul 2023 15:42:19 +0300 +Subject: bpf: stop setting precise in current state +To: stable@vger.kernel.org, ast@kernel.org +Cc: andrii@kernel.org, daniel@iogearbox.net, martin.lau@linux.dev, yhs@fb.com, mykolal@fb.com, luizcap@amazon.com, Eduard Zingerman +Message-ID: <20230724124223.1176479-3-eddyz87@gmail.com> + +From: Andrii Nakryiko + +[ Upstream commit f63181b6ae79fd3b034cde641db774268c2c3acf ] + +Setting reg->precise to true in current state is not necessary from +correctness standpoint, but it does pessimise the whole precision (or +rather "imprecision", because that's what we want to keep as much as +possible) tracking. Why is somewhat subtle and my best attempt to +explain this is recorded in an extensive comment for __mark_chain_precise() +function. Some more careful thinking and code reading is probably required +still to grok this completely, unfortunately. Whiteboarding and a bunch +of extra handwaiving in person would be even more helpful, but is deemed +impractical in Git commit. + +Next patch pushes this imprecision property even further, building on top of +the insights described in this patch. + +End results are pretty nice, we get reduction in number of total instructions +and states verified due to a better states reuse, as some of the states are now +more generic and permissive due to less unnecessary precise=true requirements. + +SELFTESTS RESULTS +================= + +$ ./veristat -C -e file,prog,insns,states ~/subprog-precise-results.csv ~/imprecise-early-results.csv | grep -v '+0' +File Program Total insns (A) Total insns (B) Total insns (DIFF) Total states (A) Total states (B) Total states (DIFF) +--------------------------------------- ---------------------- --------------- --------------- ------------------ ---------------- ---------------- ------------------- +bpf_iter_ksym.bpf.linked1.o dump_ksym 347 285 -62 (-17.87%) 20 19 -1 (-5.00%) +pyperf600_bpf_loop.bpf.linked1.o on_event 3678 3736 +58 (+1.58%) 276 285 +9 (+3.26%) +setget_sockopt.bpf.linked1.o skops_sockopt 4038 3947 -91 (-2.25%) 347 343 -4 (-1.15%) +test_l4lb.bpf.linked1.o balancer_ingress 4559 2611 -1948 (-42.73%) 118 105 -13 (-11.02%) +test_l4lb_noinline.bpf.linked1.o balancer_ingress 6279 6268 -11 (-0.18%) 237 236 -1 (-0.42%) +test_misc_tcp_hdr_options.bpf.linked1.o misc_estab 1307 1303 -4 (-0.31%) 100 99 -1 (-1.00%) +test_sk_lookup.bpf.linked1.o ctx_narrow_access 456 447 -9 (-1.97%) 39 38 -1 (-2.56%) +test_sysctl_loop1.bpf.linked1.o sysctl_tcp_mem 1389 1384 -5 (-0.36%) 26 25 -1 (-3.85%) +test_tc_dtime.bpf.linked1.o egress_fwdns_prio101 518 485 -33 (-6.37%) 51 46 -5 (-9.80%) +test_tc_dtime.bpf.linked1.o egress_host 519 468 -51 (-9.83%) 50 44 -6 (-12.00%) +test_tc_dtime.bpf.linked1.o ingress_fwdns_prio101 842 1000 +158 (+18.76%) 73 88 +15 (+20.55%) +xdp_synproxy_kern.bpf.linked1.o syncookie_tc 405757 373173 -32584 (-8.03%) 25735 22882 -2853 (-11.09%) +xdp_synproxy_kern.bpf.linked1.o syncookie_xdp 479055 371590 -107465 (-22.43%) 29145 22207 -6938 (-23.81%) +--------------------------------------- ---------------------- --------------- --------------- ------------------ ---------------- ---------------- ------------------- + +Slight regression in test_tc_dtime.bpf.linked1.o/ingress_fwdns_prio101 +is left for a follow up, there might be some more precision-related bugs +in existing BPF verifier logic. + +CILIUM RESULTS +============== + +$ ./veristat -C -e file,prog,insns,states ~/subprog-precise-results-cilium.csv ~/imprecise-early-results-cilium.csv | grep -v '+0' +File Program Total insns (A) Total insns (B) Total insns (DIFF) Total states (A) Total states (B) Total states (DIFF) +------------- ------------------------------ --------------- --------------- ------------------ ---------------- ---------------- ------------------- +bpf_host.o cil_from_host 762 556 -206 (-27.03%) 43 37 -6 (-13.95%) +bpf_host.o tail_handle_nat_fwd_ipv4 23541 23426 -115 (-0.49%) 1538 1537 -1 (-0.07%) +bpf_host.o tail_nodeport_nat_egress_ipv4 33592 33566 -26 (-0.08%) 2163 2161 -2 (-0.09%) +bpf_lxc.o tail_handle_nat_fwd_ipv4 23541 23426 -115 (-0.49%) 1538 1537 -1 (-0.07%) +bpf_overlay.o tail_nodeport_nat_egress_ipv4 33581 33543 -38 (-0.11%) 2160 2157 -3 (-0.14%) +bpf_xdp.o tail_handle_nat_fwd_ipv4 21659 20920 -739 (-3.41%) 1440 1376 -64 (-4.44%) +bpf_xdp.o tail_handle_nat_fwd_ipv6 17084 17039 -45 (-0.26%) 907 905 -2 (-0.22%) +bpf_xdp.o tail_lb_ipv4 73442 73430 -12 (-0.02%) 4370 4369 -1 (-0.02%) +bpf_xdp.o tail_lb_ipv6 152114 151895 -219 (-0.14%) 6493 6479 -14 (-0.22%) +bpf_xdp.o tail_nodeport_nat_egress_ipv4 17377 17200 -177 (-1.02%) 1125 1111 -14 (-1.24%) +bpf_xdp.o tail_nodeport_nat_ingress_ipv6 6405 6397 -8 (-0.12%) 309 308 -1 (-0.32%) +bpf_xdp.o tail_rev_nodeport_lb4 7126 6934 -192 (-2.69%) 414 402 -12 (-2.90%) +bpf_xdp.o tail_rev_nodeport_lb6 18059 17905 -154 (-0.85%) 1105 1096 -9 (-0.81%) +------------- ------------------------------ --------------- --------------- ------------------ ---------------- ---------------- ------------------- + +Signed-off-by: Andrii Nakryiko +Link: https://lore.kernel.org/r/20221104163649.121784-5-andrii@kernel.org +Signed-off-by: Alexei Starovoitov +Signed-off-by: Eduard Zingerman +Signed-off-by: Greg Kroah-Hartman +--- + kernel/bpf/verifier.c | 103 ++++++++++++++++++++++++++++++++++++++++++++------ + 1 file changed, 91 insertions(+), 12 deletions(-) + +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -2788,8 +2788,11 @@ static void mark_all_scalars_precise(str + + /* big hammer: mark all scalars precise in this path. + * pop_stack may still get !precise scalars. ++ * We also skip current state and go straight to first parent state, ++ * because precision markings in current non-checkpointed state are ++ * not needed. See why in the comment in __mark_chain_precision below. + */ +- for (; st; st = st->parent) ++ for (st = st->parent; st; st = st->parent) { + for (i = 0; i <= st->curframe; i++) { + func = st->frame[i]; + for (j = 0; j < BPF_REG_FP; j++) { +@@ -2807,8 +2810,88 @@ static void mark_all_scalars_precise(str + reg->precise = true; + } + } ++ } + } + ++/* ++ * __mark_chain_precision() backtracks BPF program instruction sequence and ++ * chain of verifier states making sure that register *regno* (if regno >= 0) ++ * and/or stack slot *spi* (if spi >= 0) are marked as precisely tracked ++ * SCALARS, as well as any other registers and slots that contribute to ++ * a tracked state of given registers/stack slots, depending on specific BPF ++ * assembly instructions (see backtrack_insns() for exact instruction handling ++ * logic). This backtracking relies on recorded jmp_history and is able to ++ * traverse entire chain of parent states. This process ends only when all the ++ * necessary registers/slots and their transitive dependencies are marked as ++ * precise. ++ * ++ * One important and subtle aspect is that precise marks *do not matter* in ++ * the currently verified state (current state). It is important to understand ++ * why this is the case. ++ * ++ * First, note that current state is the state that is not yet "checkpointed", ++ * i.e., it is not yet put into env->explored_states, and it has no children ++ * states as well. It's ephemeral, and can end up either a) being discarded if ++ * compatible explored state is found at some point or BPF_EXIT instruction is ++ * reached or b) checkpointed and put into env->explored_states, branching out ++ * into one or more children states. ++ * ++ * In the former case, precise markings in current state are completely ++ * ignored by state comparison code (see regsafe() for details). Only ++ * checkpointed ("old") state precise markings are important, and if old ++ * state's register/slot is precise, regsafe() assumes current state's ++ * register/slot as precise and checks value ranges exactly and precisely. If ++ * states turn out to be compatible, current state's necessary precise ++ * markings and any required parent states' precise markings are enforced ++ * after the fact with propagate_precision() logic, after the fact. But it's ++ * important to realize that in this case, even after marking current state ++ * registers/slots as precise, we immediately discard current state. So what ++ * actually matters is any of the precise markings propagated into current ++ * state's parent states, which are always checkpointed (due to b) case above). ++ * As such, for scenario a) it doesn't matter if current state has precise ++ * markings set or not. ++ * ++ * Now, for the scenario b), checkpointing and forking into child(ren) ++ * state(s). Note that before current state gets to checkpointing step, any ++ * processed instruction always assumes precise SCALAR register/slot ++ * knowledge: if precise value or range is useful to prune jump branch, BPF ++ * verifier takes this opportunity enthusiastically. Similarly, when ++ * register's value is used to calculate offset or memory address, exact ++ * knowledge of SCALAR range is assumed, checked, and enforced. So, similar to ++ * what we mentioned above about state comparison ignoring precise markings ++ * during state comparison, BPF verifier ignores and also assumes precise ++ * markings *at will* during instruction verification process. But as verifier ++ * assumes precision, it also propagates any precision dependencies across ++ * parent states, which are not yet finalized, so can be further restricted ++ * based on new knowledge gained from restrictions enforced by their children ++ * states. This is so that once those parent states are finalized, i.e., when ++ * they have no more active children state, state comparison logic in ++ * is_state_visited() would enforce strict and precise SCALAR ranges, if ++ * required for correctness. ++ * ++ * To build a bit more intuition, note also that once a state is checkpointed, ++ * the path we took to get to that state is not important. This is crucial ++ * property for state pruning. When state is checkpointed and finalized at ++ * some instruction index, it can be correctly and safely used to "short ++ * circuit" any *compatible* state that reaches exactly the same instruction ++ * index. I.e., if we jumped to that instruction from a completely different ++ * code path than original finalized state was derived from, it doesn't ++ * matter, current state can be discarded because from that instruction ++ * forward having a compatible state will ensure we will safely reach the ++ * exit. States describe preconditions for further exploration, but completely ++ * forget the history of how we got here. ++ * ++ * This also means that even if we needed precise SCALAR range to get to ++ * finalized state, but from that point forward *that same* SCALAR register is ++ * never used in a precise context (i.e., it's precise value is not needed for ++ * correctness), it's correct and safe to mark such register as "imprecise" ++ * (i.e., precise marking set to false). This is what we rely on when we do ++ * not set precise marking in current state. If no child state requires ++ * precision for any given SCALAR register, it's safe to dictate that it can ++ * be imprecise. If any child state does require this register to be precise, ++ * we'll mark it precise later retroactively during precise markings ++ * propagation from child state to parent states. ++ */ + static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int regno, + int spi) + { +@@ -2826,6 +2909,10 @@ static int __mark_chain_precision(struct + if (!env->bpf_capable) + return 0; + ++ /* Do sanity checks against current state of register and/or stack ++ * slot, but don't set precise flag in current state, as precision ++ * tracking in the current state is unnecessary. ++ */ + func = st->frame[frame]; + if (regno >= 0) { + reg = &func->regs[regno]; +@@ -2833,11 +2920,7 @@ static int __mark_chain_precision(struct + WARN_ONCE(1, "backtracing misuse"); + return -EFAULT; + } +- if (!reg->precise) +- new_marks = true; +- else +- reg_mask = 0; +- reg->precise = true; ++ new_marks = true; + } + + while (spi >= 0) { +@@ -2850,11 +2933,7 @@ static int __mark_chain_precision(struct + stack_mask = 0; + break; + } +- if (!reg->precise) +- new_marks = true; +- else +- stack_mask = 0; +- reg->precise = true; ++ new_marks = true; + break; + } + +@@ -11668,7 +11747,7 @@ static bool regsafe(struct bpf_verifier_ + if (env->explore_alu_limits) + return false; + if (rcur->type == SCALAR_VALUE) { +- if (!rold->precise && !rcur->precise) ++ if (!rold->precise) + return true; + /* new val must satisfy old val knowledge */ + return range_within(rold, rcur) && diff --git a/queue-6.1/selftests-bpf-fix-sk_assign-on-s390x.patch b/queue-6.1/selftests-bpf-fix-sk_assign-on-s390x.patch new file mode 100644 index 00000000000..2fe7e9f37a4 --- /dev/null +++ b/queue-6.1/selftests-bpf-fix-sk_assign-on-s390x.patch @@ -0,0 +1,123 @@ +From stable-owner@vger.kernel.org Mon Jul 24 14:42:47 2023 +From: Eduard Zingerman +Date: Mon, 24 Jul 2023 15:42:23 +0300 +Subject: selftests/bpf: Fix sk_assign on s390x +To: stable@vger.kernel.org, ast@kernel.org +Cc: andrii@kernel.org, daniel@iogearbox.net, martin.lau@linux.dev, yhs@fb.com, mykolal@fb.com, luizcap@amazon.com, Ilya Leoshkevich , Eduard Zingerman +Message-ID: <20230724124223.1176479-7-eddyz87@gmail.com> + +From: Ilya Leoshkevich + +[ Upstream commit 7ce878ca81bca7811e669db4c394b86780e0dbe4 ] + +sk_assign is failing on an s390x machine running Debian "bookworm" for +2 reasons: legacy server_map definition and uninitialized addrlen in +recvfrom() call. + +Fix by adding a new-style server_map definition and dropping addrlen +(recvfrom() allows NULL values for src_addr and addrlen). + +Since the test should support tc built without libbpf, build the prog +twice: with the old-style definition and with the new-style definition, +then select the right one at runtime. This could be done at compile +time too, but this would not be cross-compilation friendly. + +Signed-off-by: Ilya Leoshkevich +Link: https://lore.kernel.org/r/20230129190501.1624747-2-iii@linux.ibm.com +Signed-off-by: Alexei Starovoitov +Signed-off-by: Eduard Zingerman +Signed-off-by: Greg Kroah-Hartman +--- + tools/testing/selftests/bpf/prog_tests/sk_assign.c | 25 ++++++++++---- + tools/testing/selftests/bpf/progs/test_sk_assign.c | 11 ++++++ + tools/testing/selftests/bpf/progs/test_sk_assign_libbpf.c | 3 + + 3 files changed, 33 insertions(+), 6 deletions(-) + create mode 100644 tools/testing/selftests/bpf/progs/test_sk_assign_libbpf.c + +--- a/tools/testing/selftests/bpf/prog_tests/sk_assign.c ++++ b/tools/testing/selftests/bpf/prog_tests/sk_assign.c +@@ -29,7 +29,23 @@ static int stop, duration; + static bool + configure_stack(void) + { ++ char tc_version[128]; + char tc_cmd[BUFSIZ]; ++ char *prog; ++ FILE *tc; ++ ++ /* Check whether tc is built with libbpf. */ ++ tc = popen("tc -V", "r"); ++ if (CHECK_FAIL(!tc)) ++ return false; ++ if (CHECK_FAIL(!fgets(tc_version, sizeof(tc_version), tc))) ++ return false; ++ if (strstr(tc_version, ", libbpf ")) ++ prog = "test_sk_assign_libbpf.bpf.o"; ++ else ++ prog = "test_sk_assign.bpf.o"; ++ if (CHECK_FAIL(pclose(tc))) ++ return false; + + /* Move to a new networking namespace */ + if (CHECK_FAIL(unshare(CLONE_NEWNET))) +@@ -46,8 +62,8 @@ configure_stack(void) + /* Load qdisc, BPF program */ + if (CHECK_FAIL(system("tc qdisc add dev lo clsact"))) + return false; +- sprintf(tc_cmd, "%s %s %s %s", "tc filter add dev lo ingress bpf", +- "direct-action object-file ./test_sk_assign.bpf.o", ++ sprintf(tc_cmd, "%s %s %s %s %s", "tc filter add dev lo ingress bpf", ++ "direct-action object-file", prog, + "section tc", + (env.verbosity < VERBOSE_VERY) ? " 2>/dev/null" : "verbose"); + if (CHECK(system(tc_cmd), "BPF load failed;", +@@ -129,15 +145,12 @@ get_port(int fd) + static ssize_t + rcv_msg(int srv_client, int type) + { +- struct sockaddr_storage ss; + char buf[BUFSIZ]; +- socklen_t slen; + + if (type == SOCK_STREAM) + return read(srv_client, &buf, sizeof(buf)); + else +- return recvfrom(srv_client, &buf, sizeof(buf), 0, +- (struct sockaddr *)&ss, &slen); ++ return recvfrom(srv_client, &buf, sizeof(buf), 0, NULL, NULL); + } + + static int +--- a/tools/testing/selftests/bpf/progs/test_sk_assign.c ++++ b/tools/testing/selftests/bpf/progs/test_sk_assign.c +@@ -16,6 +16,16 @@ + #include + #include + ++#if defined(IPROUTE2_HAVE_LIBBPF) ++/* Use a new-style map definition. */ ++struct { ++ __uint(type, BPF_MAP_TYPE_SOCKMAP); ++ __type(key, int); ++ __type(value, __u64); ++ __uint(pinning, LIBBPF_PIN_BY_NAME); ++ __uint(max_entries, 1); ++} server_map SEC(".maps"); ++#else + /* Pin map under /sys/fs/bpf/tc/globals/ */ + #define PIN_GLOBAL_NS 2 + +@@ -35,6 +45,7 @@ struct { + .max_elem = 1, + .pinning = PIN_GLOBAL_NS, + }; ++#endif + + char _license[] SEC("license") = "GPL"; + +--- /dev/null ++++ b/tools/testing/selftests/bpf/progs/test_sk_assign_libbpf.c +@@ -0,0 +1,3 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#define IPROUTE2_HAVE_LIBBPF ++#include "test_sk_assign.c" diff --git a/queue-6.1/selftests-bpf-make-test_align-selftest-more-robust.patch b/queue-6.1/selftests-bpf-make-test_align-selftest-more-robust.patch new file mode 100644 index 00000000000..44b87fce809 --- /dev/null +++ b/queue-6.1/selftests-bpf-make-test_align-selftest-more-robust.patch @@ -0,0 +1,134 @@ +From stable-owner@vger.kernel.org Mon Jul 24 14:42:45 2023 +From: Eduard Zingerman +Date: Mon, 24 Jul 2023 15:42:21 +0300 +Subject: selftests/bpf: make test_align selftest more robust +To: stable@vger.kernel.org, ast@kernel.org +Cc: andrii@kernel.org, daniel@iogearbox.net, martin.lau@linux.dev, yhs@fb.com, mykolal@fb.com, luizcap@amazon.com, Eduard Zingerman +Message-ID: <20230724124223.1176479-5-eddyz87@gmail.com> + +From: Andrii Nakryiko + +[ Upstream commit 4f999b767769b76378c3616c624afd6f4bb0d99f ] + +test_align selftest relies on BPF verifier log emitting register states +for specific instructions in expected format. Unfortunately, BPF +verifier precision backtracking log interferes with such expectations. +And instruction on which precision propagation happens sometimes don't +output full expected register states. This does indeed look like +something to be improved in BPF verifier, but is beyond the scope of +this patch set. + +So to make test_align a bit more robust, inject few dummy R4 = R5 +instructions which capture desired state of R5 and won't have precision +tracking logs on them. This fixes tests until we can improve BPF +verifier output in the presence of precision tracking. + +Signed-off-by: Andrii Nakryiko +Link: https://lore.kernel.org/r/20221104163649.121784-7-andrii@kernel.org +Signed-off-by: Alexei Starovoitov +Signed-off-by: Eduard Zingerman +Signed-off-by: Greg Kroah-Hartman +--- + tools/testing/selftests/bpf/prog_tests/align.c | 38 +++++++++++++++---------- + 1 file changed, 24 insertions(+), 14 deletions(-) + +--- a/tools/testing/selftests/bpf/prog_tests/align.c ++++ b/tools/testing/selftests/bpf/prog_tests/align.c +@@ -2,7 +2,7 @@ + #include + + #define MAX_INSNS 512 +-#define MAX_MATCHES 16 ++#define MAX_MATCHES 24 + + struct bpf_reg_match { + unsigned int line; +@@ -267,6 +267,7 @@ static struct bpf_align_test tests[] = { + */ + BPF_MOV64_REG(BPF_REG_5, BPF_REG_2), + BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6), ++ BPF_MOV64_REG(BPF_REG_4, BPF_REG_5), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 14), + BPF_MOV64_REG(BPF_REG_4, BPF_REG_5), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4), +@@ -280,6 +281,7 @@ static struct bpf_align_test tests[] = { + BPF_MOV64_REG(BPF_REG_5, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 14), + BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6), ++ BPF_MOV64_REG(BPF_REG_4, BPF_REG_5), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 4), + BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6), + BPF_MOV64_REG(BPF_REG_4, BPF_REG_5), +@@ -311,44 +313,52 @@ static struct bpf_align_test tests[] = { + {15, "R4=pkt(id=1,off=18,r=18,umax=1020,var_off=(0x0; 0x3fc))"}, + {15, "R5=pkt(id=1,off=14,r=18,umax=1020,var_off=(0x0; 0x3fc))"}, + /* Variable offset is added to R5 packet pointer, +- * resulting in auxiliary alignment of 4. ++ * resulting in auxiliary alignment of 4. To avoid BPF ++ * verifier's precision backtracking logging ++ * interfering we also have a no-op R4 = R5 ++ * instruction to validate R5 state. We also check ++ * that R4 is what it should be in such case. + */ +- {17, "R5_w=pkt(id=2,off=0,r=0,umax=1020,var_off=(0x0; 0x3fc))"}, ++ {18, "R4_w=pkt(id=2,off=0,r=0,umax=1020,var_off=(0x0; 0x3fc))"}, ++ {18, "R5_w=pkt(id=2,off=0,r=0,umax=1020,var_off=(0x0; 0x3fc))"}, + /* Constant offset is added to R5, resulting in + * reg->off of 14. + */ +- {18, "R5_w=pkt(id=2,off=14,r=0,umax=1020,var_off=(0x0; 0x3fc))"}, ++ {19, "R5_w=pkt(id=2,off=14,r=0,umax=1020,var_off=(0x0; 0x3fc))"}, + /* At the time the word size load is performed from R5, + * its total fixed offset is NET_IP_ALIGN + reg->off + * (14) which is 16. Then the variable offset is 4-byte + * aligned, so the total offset is 4-byte aligned and + * meets the load's requirements. + */ +- {23, "R4=pkt(id=2,off=18,r=18,umax=1020,var_off=(0x0; 0x3fc))"}, +- {23, "R5=pkt(id=2,off=14,r=18,umax=1020,var_off=(0x0; 0x3fc))"}, ++ {24, "R4=pkt(id=2,off=18,r=18,umax=1020,var_off=(0x0; 0x3fc))"}, ++ {24, "R5=pkt(id=2,off=14,r=18,umax=1020,var_off=(0x0; 0x3fc))"}, + /* Constant offset is added to R5 packet pointer, + * resulting in reg->off value of 14. + */ +- {25, "R5_w=pkt(off=14,r=8"}, ++ {26, "R5_w=pkt(off=14,r=8"}, + /* Variable offset is added to R5, resulting in a +- * variable offset of (4n). ++ * variable offset of (4n). See comment for insn #18 ++ * for R4 = R5 trick. + */ +- {26, "R5_w=pkt(id=3,off=14,r=0,umax=1020,var_off=(0x0; 0x3fc))"}, ++ {28, "R4_w=pkt(id=3,off=14,r=0,umax=1020,var_off=(0x0; 0x3fc))"}, ++ {28, "R5_w=pkt(id=3,off=14,r=0,umax=1020,var_off=(0x0; 0x3fc))"}, + /* Constant is added to R5 again, setting reg->off to 18. */ +- {27, "R5_w=pkt(id=3,off=18,r=0,umax=1020,var_off=(0x0; 0x3fc))"}, ++ {29, "R5_w=pkt(id=3,off=18,r=0,umax=1020,var_off=(0x0; 0x3fc))"}, + /* And once more we add a variable; resulting var_off + * is still (4n), fixed offset is not changed. + * Also, we create a new reg->id. + */ +- {28, "R5_w=pkt(id=4,off=18,r=0,umax=2040,var_off=(0x0; 0x7fc)"}, ++ {31, "R4_w=pkt(id=4,off=18,r=0,umax=2040,var_off=(0x0; 0x7fc)"}, ++ {31, "R5_w=pkt(id=4,off=18,r=0,umax=2040,var_off=(0x0; 0x7fc)"}, + /* At the time the word size load is performed from R5, + * its total fixed offset is NET_IP_ALIGN + reg->off (18) + * which is 20. Then the variable offset is (4n), so + * the total offset is 4-byte aligned and meets the + * load's requirements. + */ +- {33, "R4=pkt(id=4,off=22,r=22,umax=2040,var_off=(0x0; 0x7fc)"}, +- {33, "R5=pkt(id=4,off=18,r=22,umax=2040,var_off=(0x0; 0x7fc)"}, ++ {35, "R4=pkt(id=4,off=22,r=22,umax=2040,var_off=(0x0; 0x7fc)"}, ++ {35, "R5=pkt(id=4,off=18,r=22,umax=2040,var_off=(0x0; 0x7fc)"}, + }, + }, + { +@@ -681,6 +691,6 @@ void test_align(void) + if (!test__start_subtest(test->descr)) + continue; + +- CHECK_FAIL(do_test_single(test)); ++ ASSERT_OK(do_test_single(test), test->descr); + } + } diff --git a/queue-6.1/selftests-bpf-workaround-verification-failure-for-fexit_bpf2bpf-func_replace_return_code.patch b/queue-6.1/selftests-bpf-workaround-verification-failure-for-fexit_bpf2bpf-func_replace_return_code.patch new file mode 100644 index 00000000000..ebb5dddeacc --- /dev/null +++ b/queue-6.1/selftests-bpf-workaround-verification-failure-for-fexit_bpf2bpf-func_replace_return_code.patch @@ -0,0 +1,95 @@ +From stable-owner@vger.kernel.org Mon Jul 24 14:42:44 2023 +From: Eduard Zingerman +Date: Mon, 24 Jul 2023 15:42:22 +0300 +Subject: selftests/bpf: Workaround verification failure for fexit_bpf2bpf/func_replace_return_code +To: stable@vger.kernel.org, ast@kernel.org +Cc: andrii@kernel.org, daniel@iogearbox.net, martin.lau@linux.dev, yhs@fb.com, mykolal@fb.com, luizcap@amazon.com, Eduard Zingerman +Message-ID: <20230724124223.1176479-6-eddyz87@gmail.com> + +From: Yonghong Song + +[ Upstream commit 63d78b7e8ca2d0eb8c687a355fa19d01b6fcc723 ] + +With latest llvm17, selftest fexit_bpf2bpf/func_replace_return_code +has the following verification failure: + + 0: R1=ctx(off=0,imm=0) R10=fp0 + ; int connect_v4_prog(struct bpf_sock_addr *ctx) + 0: (bf) r7 = r1 ; R1=ctx(off=0,imm=0) R7_w=ctx(off=0,imm=0) + 1: (b4) w6 = 0 ; R6_w=0 + ; memset(&tuple.ipv4.saddr, 0, sizeof(tuple.ipv4.saddr)); + ... + ; return do_bind(ctx) ? 1 : 0; + 179: (bf) r1 = r7 ; R1=ctx(off=0,imm=0) R7=ctx(off=0,imm=0) + 180: (85) call pc+147 + Func#3 is global and valid. Skipping. + 181: R0_w=scalar() + 181: (bc) w6 = w0 ; R0_w=scalar() R6_w=scalar(umax=4294967295,var_off=(0x0; 0xffffffff)) + 182: (05) goto pc-129 + ; } + 54: (bc) w0 = w6 ; R0_w=scalar(umax=4294967295,var_off=(0x0; 0xffffffff)) R6_w=scalar(umax=4294967295,var_off=(0x0; 0xffffffff)) + 55: (95) exit + At program exit the register R0 has value (0x0; 0xffffffff) should have been in (0x0; 0x1) + processed 281 insns (limit 1000000) max_states_per_insn 1 total_states 26 peak_states 26 mark_read 13 + -- END PROG LOAD LOG -- + libbpf: prog 'connect_v4_prog': failed to load: -22 + +The corresponding source code: + + __attribute__ ((noinline)) + int do_bind(struct bpf_sock_addr *ctx) + { + struct sockaddr_in sa = {}; + + sa.sin_family = AF_INET; + sa.sin_port = bpf_htons(0); + sa.sin_addr.s_addr = bpf_htonl(SRC_REWRITE_IP4); + + if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)) != 0) + return 0; + + return 1; + } + ... + SEC("cgroup/connect4") + int connect_v4_prog(struct bpf_sock_addr *ctx) + { + ... + return do_bind(ctx) ? 1 : 0; + } + +Insn 180 is a call to 'do_bind'. The call's return value is also the return value +for the program. Since do_bind() returns 0/1, so it is legitimate for compiler to +optimize 'return do_bind(ctx) ? 1 : 0' to 'return do_bind(ctx)'. However, such +optimization breaks verifier as the return value of 'do_bind()' is marked as any +scalar which violates the requirement of prog return value 0/1. + +There are two ways to fix this problem, (1) changing 'return 1' in do_bind() to +e.g. 'return 10' so the compiler has to do 'do_bind(ctx) ? 1 :0', or (2) +suggested by Andrii, marking do_bind() with __weak attribute so the compiler +cannot make any assumption on do_bind() return value. + +This patch adopted adding __weak approach which is simpler and more resistant +to potential compiler optimizations. + +Suggested-by: Andrii Nakryiko +Signed-off-by: Yonghong Song +Signed-off-by: Andrii Nakryiko +Link: https://lore.kernel.org/bpf/20230310012410.2920570-1-yhs@fb.com +Signed-off-by: Eduard Zingerman +Signed-off-by: Greg Kroah-Hartman +--- + tools/testing/selftests/bpf/progs/connect4_prog.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/tools/testing/selftests/bpf/progs/connect4_prog.c ++++ b/tools/testing/selftests/bpf/progs/connect4_prog.c +@@ -32,7 +32,7 @@ + #define IFNAMSIZ 16 + #endif + +-__attribute__ ((noinline)) ++__attribute__ ((noinline)) __weak + int do_bind(struct bpf_sock_addr *ctx) + { + struct sockaddr_in sa = {}; diff --git a/queue-6.1/series b/queue-6.1/series index 7f939259784..7fc065207a5 100644 --- a/queue-6.1/series +++ b/queue-6.1/series @@ -169,5 +169,11 @@ spi-dw-remove-misleading-comment-for-mount-evans-soc.patch kallsyms-add-kallsyms_seqs_of_names-to-list-of-special-symbols.patch scripts-kallsyms.c-make-the-comment-up-to-date-with-current-implementation.patch scripts-kallsyms-update-the-usage-in-the-comment-block.patch +bpf-allow-precision-tracking-for-programs-with-subprogs.patch +bpf-stop-setting-precise-in-current-state.patch +bpf-aggressively-forget-precise-markings-during-state-checkpointing.patch +selftests-bpf-make-test_align-selftest-more-robust.patch +selftests-bpf-workaround-verification-failure-for-fexit_bpf2bpf-func_replace_return_code.patch +selftests-bpf-fix-sk_assign-on-s390x.patch x86-cpu-amd-move-the-errata-checking-functionality-up.patch x86-cpu-amd-add-a-zenbleed-fix.patch -- 2.47.3