git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
6.6-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Sat, 27 Jan 2024 01:17:09 +0000 (17:17 -0800)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Sat, 27 Jan 2024 01:17:09 +0000 (17:17 -0800)
added patches:
bpf-correct-loop-detection-for-iterators-convergence.patch
bpf-exact-states-comparison-for-iterator-convergence-checks.patch
bpf-extract-__check_reg_arg-utility-function.patch
bpf-extract-same_callsites-as-utility-function.patch
bpf-extract-setup_func_entry-utility-function.patch
bpf-keep-track-of-max-number-of-bpf_loop-callback-iterations.patch
bpf-move-explored_state-closer-to-the-beginning-of-verifier.c.patch
bpf-print-full-verifier-states-on-infinite-loop-detection.patch
bpf-verify-callbacks-as-if-they-are-called-unknown-number-of-times.patch
bpf-widening-for-callback-iterators.patch
selftests-bpf-check-if-max-number-of-bpf_loop-iterations-is-tracked.patch
selftests-bpf-test-if-state-loops-are-detected-in-a-tricky-case.patch
selftests-bpf-test-widening-for-iterating-callbacks.patch
selftests-bpf-tests-for-iterating-callbacks.patch
selftests-bpf-tests-with-delayed-read-precision-makrs-in-loop-body.patch
selftests-bpf-track-string-payload-offset-as-scalar-in-strobemeta.patch
selftests-bpf-track-tcp-payload-offset-as-scalar-in-xdp_synproxy.patch

18 files changed:
queue-6.6/bpf-correct-loop-detection-for-iterators-convergence.patch [new file with mode: 0644]
queue-6.6/bpf-exact-states-comparison-for-iterator-convergence-checks.patch [new file with mode: 0644]
queue-6.6/bpf-extract-__check_reg_arg-utility-function.patch [new file with mode: 0644]
queue-6.6/bpf-extract-same_callsites-as-utility-function.patch [new file with mode: 0644]
queue-6.6/bpf-extract-setup_func_entry-utility-function.patch [new file with mode: 0644]
queue-6.6/bpf-keep-track-of-max-number-of-bpf_loop-callback-iterations.patch [new file with mode: 0644]
queue-6.6/bpf-move-explored_state-closer-to-the-beginning-of-verifier.c.patch [new file with mode: 0644]
queue-6.6/bpf-print-full-verifier-states-on-infinite-loop-detection.patch [new file with mode: 0644]
queue-6.6/bpf-verify-callbacks-as-if-they-are-called-unknown-number-of-times.patch [new file with mode: 0644]
queue-6.6/bpf-widening-for-callback-iterators.patch [new file with mode: 0644]
queue-6.6/selftests-bpf-check-if-max-number-of-bpf_loop-iterations-is-tracked.patch [new file with mode: 0644]
queue-6.6/selftests-bpf-test-if-state-loops-are-detected-in-a-tricky-case.patch [new file with mode: 0644]
queue-6.6/selftests-bpf-test-widening-for-iterating-callbacks.patch [new file with mode: 0644]
queue-6.6/selftests-bpf-tests-for-iterating-callbacks.patch [new file with mode: 0644]
queue-6.6/selftests-bpf-tests-with-delayed-read-precision-makrs-in-loop-body.patch [new file with mode: 0644]
queue-6.6/selftests-bpf-track-string-payload-offset-as-scalar-in-strobemeta.patch [new file with mode: 0644]
queue-6.6/selftests-bpf-track-tcp-payload-offset-as-scalar-in-xdp_synproxy.patch [new file with mode: 0644]
queue-6.6/series

diff --git a/queue-6.6/bpf-correct-loop-detection-for-iterators-convergence.patch b/queue-6.6/bpf-correct-loop-detection-for-iterators-convergence.patch
new file mode 100644 (file)
index 0000000..2caf719
--- /dev/null
@@ -0,0 +1,339 @@
+From 2a0992829ea3864939d917a5c7b48be6629c6217 Mon Sep 17 00:00:00 2001
+From: Eduard Zingerman <eddyz87@gmail.com>
+Date: Tue, 24 Oct 2023 03:09:15 +0300
+Subject: bpf: correct loop detection for iterators convergence
+
+From: Eduard Zingerman <eddyz87@gmail.com>
+
+commit 2a0992829ea3864939d917a5c7b48be6629c6217 upstream.
+
+It turns out that .branches > 0 in is_state_visited() is not a
+sufficient condition to identify if two verifier states form a loop
+when iterators convergence is computed. This commit adds logic to
+distinguish situations like the ones below:
+
+ (I)            initial       (II)            initial
+                  |                             |
+                  V                             V
+     .---------> hdr                           ..
+     |            |                             |
+     |            V                             V
+     |    .------...                    .------..
+     |    |       |                     |       |
+     |    V       V                     V       V
+     |   ...     ...               .-> hdr     ..
+     |    |       |                |    |       |
+     |    V       V                |    V       V
+     |   succ <- cur               |   succ <- cur
+     |    |                        |    |
+     |    V                        |    V
+     |   ...                       |   ...
+     |    |                        |    |
+     '----'                        '----'
+
+For both (I) and (II) the successor 'succ' of the current state 'cur' was
+previously explored and has a branches count of 0. However, the loop entry
+'hdr' corresponding to 'succ' might be a part of the current DFS path.
+If that is the case, 'succ' and 'cur' are members of the same loop
+and have to be compared exactly.
+
+Co-developed-by: Andrii Nakryiko <andrii.nakryiko@gmail.com>
+Co-developed-by: Alexei Starovoitov <alexei.starovoitov@gmail.com>
+Reviewed-by: Andrii Nakryiko <andrii@kernel.org>
+Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
+Link: https://lore.kernel.org/r/20231024000917.12153-6-eddyz87@gmail.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/bpf_verifier.h |   15 +++
+ kernel/bpf/verifier.c        |  207 ++++++++++++++++++++++++++++++++++++++++++-
+ 2 files changed, 218 insertions(+), 4 deletions(-)
+
+--- a/include/linux/bpf_verifier.h
++++ b/include/linux/bpf_verifier.h
+@@ -372,10 +372,25 @@ struct bpf_verifier_state {
+       struct bpf_active_lock active_lock;
+       bool speculative;
+       bool active_rcu_lock;
++      /* If this state was ever pointed-to by other state's loop_entry field
++       * this flag would be set to true. Used to avoid freeing such states
++       * while they are still in use.
++       */
++      bool used_as_loop_entry;
+       /* first and last insn idx of this verifier state */
+       u32 first_insn_idx;
+       u32 last_insn_idx;
++      /* If this state is a part of states loop this field points to some
++       * parent of this state such that:
++       * - it is also a member of the same states loop;
++       * - DFS states traversal starting from initial state visits loop_entry
++       *   state before this state.
++       * Used to compute topmost loop entry for state loops.
++       * State loops might appear because of open coded iterators logic.
++       * See get_loop_entry() for more information.
++       */
++      struct bpf_verifier_state *loop_entry;
+       /* jmp history recorded from first to last.
+        * backtracking is using it to go from last to first.
+        * For most states jmp_history_cnt is [0-3].
+--- a/kernel/bpf/verifier.c
++++ b/kernel/bpf/verifier.c
+@@ -1772,6 +1772,7 @@ static int copy_verifier_state(struct bp
+       dst_state->first_insn_idx = src->first_insn_idx;
+       dst_state->last_insn_idx = src->last_insn_idx;
+       dst_state->dfs_depth = src->dfs_depth;
++      dst_state->used_as_loop_entry = src->used_as_loop_entry;
+       for (i = 0; i <= src->curframe; i++) {
+               dst = dst_state->frame[i];
+               if (!dst) {
+@@ -1814,11 +1815,176 @@ static bool same_callsites(struct bpf_ve
+       return true;
+ }
++/* Open coded iterators allow back-edges in the state graph in order to
++ * check unbounded loops that use iterators.
++ *
++ * In is_state_visited() it is necessary to know if explored states are
++ * part of some loops in order to decide whether non-exact states
++ * comparison could be used:
++ * - non-exact states comparison establishes sub-state relation and uses
++ *   read and precision marks to do so, these marks are propagated from
++ *   children states and thus are not guaranteed to be final in a loop;
++ * - exact states comparison just checks if current and explored states
++ *   are identical (and thus form a back-edge).
++ *
++ * Paper "A New Algorithm for Identifying Loops in Decompilation"
++ * by Tao Wei, Jian Mao, Wei Zou and Yu Chen [1] presents a convenient
++ * algorithm for loop structure detection and gives an overview of
++ * relevant terminology. It also has helpful illustrations.
++ *
++ * [1] https://api.semanticscholar.org/CorpusID:15784067
++ *
++ * We use a similar algorithm, but because the loop nesting structure is
++ * irrelevant for the verifier, ours is significantly simpler and resembles
++ * the strongly connected components algorithm from Sedgewick's textbook.
++ *
++ * Define the topmost loop entry as the first node of the loop traversed in a
++ * depth-first search starting from the initial state. The goal of the loop
++ * tracking algorithm is to associate topmost loop entries with states
++ * derived from these entries.
++ *
++ * At each step of the DFS states traversal, the algorithm needs to identify
++ * the following situations:
++ *
++ *          initial                     initial                   initial
++ *            |                           |                         |
++ *            V                           V                         V
++ *           ...                         ...           .---------> hdr
++ *            |                           |            |            |
++ *            V                           V            |            V
++ *           cur                     .-> succ          |    .------...
++ *            |                      |    |            |    |       |
++ *            V                      |    V            |    V       V
++ *           succ                    '-- cur           |   ...     ...
++ *                                                     |    |       |
++ *                                                     |    V       V
++ *                                                     |   succ <- cur
++ *                                                     |    |
++ *                                                     |    V
++ *                                                     |   ...
++ *                                                     |    |
++ *                                                     '----'
++ *
++ *  (A) successor state of cur   (B) successor state of cur or its entry
++ *      not yet traversed            are in current DFS path, thus cur and succ
++ *                                   are members of the same outermost loop
++ *
++ *                      initial                  initial
++ *                        |                        |
++ *                        V                        V
++ *                       ...                      ...
++ *                        |                        |
++ *                        V                        V
++ *                .------...               .------...
++ *                |       |                |       |
++ *                V       V                V       V
++ *           .-> hdr     ...              ...     ...
++ *           |    |       |                |       |
++ *           |    V       V                V       V
++ *           |   succ <- cur              succ <- cur
++ *           |    |                        |
++ *           |    V                        V
++ *           |   ...                      ...
++ *           |    |                        |
++ *           '----'                       exit
++ *
++ * (C) successor state of cur is a part of some loop but this loop
++ *     does not include cur or successor state is not in a loop at all.
++ *
++ * Algorithm could be described as the following python code:
++ *
++ *     traversed = set()   # Set of traversed nodes
++ *     entries = {}        # Mapping from node to loop entry
++ *     depths = {}         # Depth level assigned to graph node
++ *     path = set()        # Current DFS path
++ *
++ *     # Find outermost loop entry known for n
++ *     def get_loop_entry(n):
++ *         h = entries.get(n, None)
++ *         while h in entries and entries[h] != h:
++ *             h = entries[h]
++ *         return h
++ *
++ *     # Update n's loop entry if h's outermost entry comes
++ *     # before n's outermost entry in current DFS path.
++ *     def update_loop_entry(n, h):
++ *         n1 = get_loop_entry(n) or n
++ *         h1 = get_loop_entry(h) or h
++ *         if h1 in path and depths[h1] <= depths[n1]:
++ *             entries[n] = h1
++ *
++ *     def dfs(n, depth):
++ *         traversed.add(n)
++ *         path.add(n)
++ *         depths[n] = depth
++ *         for succ in G.successors(n):
++ *             if succ not in traversed:
++ *                 # Case A: explore succ and update cur's loop entry
++ *                 #         only if succ's entry is in current DFS path.
++ *                 dfs(succ, depth + 1)
++ *                 h = get_loop_entry(succ)
++ *                 update_loop_entry(n, h)
++ *             else:
++ *                 # Case B or C depending on `h1 in path` check in update_loop_entry().
++ *                 update_loop_entry(n, succ)
++ *         path.remove(n)
++ *
++ * To adapt this algorithm for use with verifier:
++ * - use st->branches == 0 as a signal that the DFS of succ has finished
++ *   and cur's loop entry has to be updated (case A), handle this in
++ *   update_branch_counts();
++ * - use st->branches > 0 as a signal that st is in the current DFS path;
++ * - handle cases B and C in is_state_visited();
++ * - update topmost loop entry for intermediate states in get_loop_entry().
++ */
++static struct bpf_verifier_state *get_loop_entry(struct bpf_verifier_state *st)
++{
++      struct bpf_verifier_state *topmost = st->loop_entry, *old;
++
++      while (topmost && topmost->loop_entry && topmost != topmost->loop_entry)
++              topmost = topmost->loop_entry;
++      /* Update loop entries for intermediate states to avoid this
++       * traversal in future get_loop_entry() calls.
++       */
++      while (st && st->loop_entry != topmost) {
++              old = st->loop_entry;
++              st->loop_entry = topmost;
++              st = old;
++      }
++      return topmost;
++}
++
++static void update_loop_entry(struct bpf_verifier_state *cur, struct bpf_verifier_state *hdr)
++{
++      struct bpf_verifier_state *cur1, *hdr1;
++
++      cur1 = get_loop_entry(cur) ?: cur;
++      hdr1 = get_loop_entry(hdr) ?: hdr;
++      /* The hdr1->branches check decides between cases B and C in
++       * the comment for get_loop_entry(). If hdr1->branches == 0 then
++       * hdr's topmost loop entry is not in the current DFS path,
++       * hence 'cur' and 'hdr' are not in the same loop and there is
++       * no need to update cur->loop_entry.
++       */
++      if (hdr1->branches && hdr1->dfs_depth <= cur1->dfs_depth) {
++              cur->loop_entry = hdr;
++              hdr->used_as_loop_entry = true;
++      }
++}
++
+ static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
+ {
+       while (st) {
+               u32 br = --st->branches;
++              /* br == 0 signals that DFS exploration for 'st' is finished,
++               * thus it is necessary to update parent's loop entry if it
++               * turned out that st is a part of some loop.
++               * This is a part of 'case A' in get_loop_entry() comment.
++               */
++              if (br == 0 && st->parent && st->loop_entry)
++                      update_loop_entry(st->parent, st->loop_entry);
++
+               /* WARN_ON(br > 1) technically makes sense here,
+                * but see comment in push_stack(), hence:
+                */
+@@ -16262,10 +16428,11 @@ static int is_state_visited(struct bpf_v
+ {
+       struct bpf_verifier_state_list *new_sl;
+       struct bpf_verifier_state_list *sl, **pprev;
+-      struct bpf_verifier_state *cur = env->cur_state, *new;
++      struct bpf_verifier_state *cur = env->cur_state, *new, *loop_entry;
+       int i, j, n, err, states_cnt = 0;
+       bool force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx);
+       bool add_new_state = force_new_state;
++      bool force_exact;
+       /* bpf progs typically have pruning point every 4 instructions
+        * http://vger.kernel.org/bpfconf2019.html#session-1
+@@ -16360,8 +16527,10 @@ static int is_state_visited(struct bpf_v
+                                        */
+                                       spi = __get_spi(iter_reg->off + iter_reg->var_off.value);
+                                       iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr;
+-                                      if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE)
++                                      if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE) {
++                                              update_loop_entry(cur, &sl->state);
+                                               goto hit;
++                                      }
+                               }
+                               goto skip_inf_loop_check;
+                       }
+@@ -16392,7 +16561,36 @@ skip_inf_loop_check:
+                               add_new_state = false;
+                       goto miss;
+               }
+-              if (states_equal(env, &sl->state, cur, false)) {
++              /* If sl->state is a part of a loop and this loop's entry is a part of
++               * current verification path then states have to be compared exactly.
++               * 'force_exact' is needed to catch the following case:
++               *
++               *                initial     Here state 'succ' was processed first,
++               *                  |         it was eventually tracked to produce a
++               *                  V         state identical to 'hdr'.
++               *     .---------> hdr        All branches from 'succ' had been explored
++               *     |            |         and thus 'succ' has its .branches == 0.
++               *     |            V
++               *     |    .------...        Suppose states 'cur' and 'succ' correspond
++               *     |    |       |         to the same instruction + callsites.
++               *     |    V       V         In such case it is necessary to check
++               *     |   ...     ...        if 'succ' and 'cur' are states_equal().
++               *     |    |       |         If 'succ' and 'cur' are a part of the
++               *     |    V       V         same loop exact flag has to be set.
++               *     |   succ <- cur        To check if that is the case, verify
++               *     |    |                 if loop entry of 'succ' is in current
++               *     |    V                 DFS path.
++               *     |   ...
++               *     |    |
++               *     '----'
++               *
++               * Additional details are in the comment before get_loop_entry().
++               */
++              loop_entry = get_loop_entry(&sl->state);
++              force_exact = loop_entry && loop_entry->branches > 0;
++              if (states_equal(env, &sl->state, cur, force_exact)) {
++                      if (force_exact)
++                              update_loop_entry(cur, loop_entry);
+ hit:
+                       sl->hit_cnt++;
+                       /* reached equivalent register/stack state,
+@@ -16441,7 +16639,8 @@ miss:
+                        * speed up verification
+                        */
+                       *pprev = sl->next;
+-                      if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE) {
++                      if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE &&
++                          !sl->state.used_as_loop_entry) {
+                               u32 br = sl->state.branches;
+                               WARN_ONCE(br,
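
As an illustration (not part of the patch), here is a minimal open-coded
iterator program of the kind whose state-graph loops the detection logic above
handles. It is hypothetical and assumes the bpf_iter_num_* kfunc declarations
shipped in recent libbpf bpf_helpers.h:

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>

    SEC("raw_tp")
    int iter_sum(void *ctx)
    {
            struct bpf_iter_num it;
            int *v, sum = 0;

            /* Each bpf_iter_num_next() call site is a verifier checkpoint;
             * states created for successive simulated iterations are the
             * 'hdr', 'succ' and 'cur' nodes that get_loop_entry() and
             * update_loop_entry() link into loops of the state graph.
             */
            bpf_iter_num_new(&it, 0, 10);
            while ((v = bpf_iter_num_next(&it)))
                    sum += *v;
            bpf_iter_num_destroy(&it);
            return sum;
    }

    char _license[] SEC("license") = "GPL";
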
diff --git a/queue-6.6/bpf-exact-states-comparison-for-iterator-convergence-checks.patch b/queue-6.6/bpf-exact-states-comparison-for-iterator-convergence-checks.patch
new file mode 100644 (file)
index 0000000..7481001
--- /dev/null
@@ -0,0 +1,494 @@
+From 2793a8b015f7f1caadb9bce9c63dc659f7522676 Mon Sep 17 00:00:00 2001
+From: Eduard Zingerman <eddyz87@gmail.com>
+Date: Tue, 24 Oct 2023 03:09:13 +0300
+Subject: bpf: exact states comparison for iterator convergence checks
+
+From: Eduard Zingerman <eddyz87@gmail.com>
+
+commit 2793a8b015f7f1caadb9bce9c63dc659f7522676 upstream.
+
+Convergence for open coded iterators is computed in is_state_visited()
+by examining states with branches count > 1 and using states_equal().
+states_equal() computes sub-state relation using read and precision marks.
+Read and precision marks are propagated from children states,
+thus are not guaranteed to be complete inside a loop when branches
+count > 1. This could be demonstrated using the following unsafe program:
+
+     1. r7 = -16
+     2. r6 = bpf_get_prandom_u32()
+     3. while (bpf_iter_num_next(&fp[-8])) {
+     4.   if (r6 != 42) {
+     5.     r7 = -32
+     6.     r6 = bpf_get_prandom_u32()
+     7.     continue
+     8.   }
+     9.   r0 = r10
+    10.   r0 += r7
+    11.   r8 = *(u64 *)(r0 + 0)
+    12.   r6 = bpf_get_prandom_u32()
+    13. }
+
+Here the verifier would first visit path 1-3, create a checkpoint at 3
+with r7=-16, then continue to 4-7,3 with r7=-32.
+
+Because instructions at 9-12 had not been visited yet, the existing
+checkpoint at 3 does not have a read or precision mark for r7.
+Thus states_equal() would return true and the verifier would discard the
+current state, so the unsafe memory access at 11 would not be caught.
+
+This commit fixes this loophole by introducing exact state comparisons
+for iterator convergence logic:
+- registers are compared using regs_exact() regardless of read or
+  precision marks;
+- stack slots have to have identical type.
+
+Unfortunately, this is too strict even for simple programs like the one below:
+
+    i = 0;
+    while(iter_next(&it))
+      i++;
+
+At each iteration step i++ would produce a new distinct state and
+eventually instruction processing limit would be reached.
+
+To avoid such behavior, speculatively forget (widen) the range for
+imprecise scalar registers, if those registers were not precise at the
+end of the previous iteration and do not match exactly.
+
+This is a conservative heuristic that allows a wide range of programs
+to be verified; however, it precludes verification of programs that
+conjure an imprecise value on the first loop iteration and use it as
+precise on the second.
+
+Test case iter_task_vma_for_each() presents one such case:
+
+        unsigned int seen = 0;
+        ...
+        bpf_for_each(task_vma, vma, task, 0) {
+                if (seen >= 1000)
+                        break;
+                ...
+                seen++;
+        }
+
+Here clang generates the following code:
+
+<LBB0_4>:
+      24:       r8 = r6                          ; stash current value of
+                ... body ...                       'seen'
+      29:       r1 = r10
+      30:       r1 += -0x8
+      31:       call bpf_iter_task_vma_next
+      32:       r6 += 0x1                        ; seen++;
+      33:       if r0 == 0x0 goto +0x2 <LBB0_6>  ; exit on next() == NULL
+      34:       r7 += 0x10
+      35:       if r8 < 0x3e7 goto -0xc <LBB0_4> ; loop on seen < 1000
+
+<LBB0_6>:
+      ... exit ...
+
+Note that the counter in r6 is copied to r8 and then incremented, and the
+conditional jump is done using r8. Because of this, the precision mark for
+r6 lags one state behind the precision mark on r8 and the widening logic
+kicks in.
+
+Adding barrier_var(seen) after the conditional is sufficient to force
+clang to use the same register for both counting and the conditional jump.
+
+This issue was discussed in the thread [1], which was started by
+Andrew Werner <awerner32@gmail.com> demonstrating a similar bug
+in callback function handling. The callbacks would be addressed
+in a follow-up patch.
+
+[1] https://lore.kernel.org/bpf/97a90da09404c65c8e810cf83c94ac703705dc0e.camel@gmail.com/
+
+Co-developed-by: Andrii Nakryiko <andrii.nakryiko@gmail.com>
+Co-developed-by: Alexei Starovoitov <alexei.starovoitov@gmail.com>
+Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
+Link: https://lore.kernel.org/r/20231024000917.12153-4-eddyz87@gmail.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/bpf_verifier.h |    1 
+ kernel/bpf/verifier.c        |  218 ++++++++++++++++++++++++++++++++++++-------
+ 2 files changed, 188 insertions(+), 31 deletions(-)
+
+--- a/include/linux/bpf_verifier.h
++++ b/include/linux/bpf_verifier.h
+@@ -383,6 +383,7 @@ struct bpf_verifier_state {
+        */
+       struct bpf_idx_pair *jmp_history;
+       u32 jmp_history_cnt;
++      u32 dfs_depth;
+ };
+ #define bpf_get_spilled_reg(slot, frame)                              \
+--- a/kernel/bpf/verifier.c
++++ b/kernel/bpf/verifier.c
+@@ -1771,6 +1771,7 @@ static int copy_verifier_state(struct bp
+       dst_state->parent = src->parent;
+       dst_state->first_insn_idx = src->first_insn_idx;
+       dst_state->last_insn_idx = src->last_insn_idx;
++      dst_state->dfs_depth = src->dfs_depth;
+       for (i = 0; i <= src->curframe; i++) {
+               dst = dst_state->frame[i];
+               if (!dst) {
+@@ -7554,6 +7555,81 @@ static int process_iter_arg(struct bpf_v
+       return 0;
+ }
++/* Look for a previous loop entry at insn_idx: nearest parent state
++ * stopped at insn_idx with callsites matching those in cur->frame.
++ */
++static struct bpf_verifier_state *find_prev_entry(struct bpf_verifier_env *env,
++                                                struct bpf_verifier_state *cur,
++                                                int insn_idx)
++{
++      struct bpf_verifier_state_list *sl;
++      struct bpf_verifier_state *st;
++
++      /* Explored states are pushed in stack order, most recent states come first */
++      sl = *explored_state(env, insn_idx);
++      for (; sl; sl = sl->next) {
++              /* If st->branches != 0, the state is a part of the current DFS verification path,
++               * hence cur & st form a loop.
++               */
++              st = &sl->state;
++              if (st->insn_idx == insn_idx && st->branches && same_callsites(st, cur) &&
++                  st->dfs_depth < cur->dfs_depth)
++                      return st;
++      }
++
++      return NULL;
++}
++
++static void reset_idmap_scratch(struct bpf_verifier_env *env);
++static bool regs_exact(const struct bpf_reg_state *rold,
++                     const struct bpf_reg_state *rcur,
++                     struct bpf_idmap *idmap);
++
++static void maybe_widen_reg(struct bpf_verifier_env *env,
++                          struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
++                          struct bpf_idmap *idmap)
++{
++      if (rold->type != SCALAR_VALUE)
++              return;
++      if (rold->type != rcur->type)
++              return;
++      if (rold->precise || rcur->precise || regs_exact(rold, rcur, idmap))
++              return;
++      __mark_reg_unknown(env, rcur);
++}
++
++static int widen_imprecise_scalars(struct bpf_verifier_env *env,
++                                 struct bpf_verifier_state *old,
++                                 struct bpf_verifier_state *cur)
++{
++      struct bpf_func_state *fold, *fcur;
++      int i, fr;
++
++      reset_idmap_scratch(env);
++      for (fr = old->curframe; fr >= 0; fr--) {
++              fold = old->frame[fr];
++              fcur = cur->frame[fr];
++
++              for (i = 0; i < MAX_BPF_REG; i++)
++                      maybe_widen_reg(env,
++                                      &fold->regs[i],
++                                      &fcur->regs[i],
++                                      &env->idmap_scratch);
++
++              for (i = 0; i < fold->allocated_stack / BPF_REG_SIZE; i++) {
++                      if (!is_spilled_reg(&fold->stack[i]) ||
++                          !is_spilled_reg(&fcur->stack[i]))
++                              continue;
++
++                      maybe_widen_reg(env,
++                                      &fold->stack[i].spilled_ptr,
++                                      &fcur->stack[i].spilled_ptr,
++                                      &env->idmap_scratch);
++              }
++      }
++      return 0;
++}
++
+ /* process_iter_next_call() is called when verifier gets to iterator's next
+  * "method" (e.g., bpf_iter_num_next() for numbers iterator) call. We'll refer
+  * to it as just "iter_next()" in comments below.
+@@ -7595,25 +7671,47 @@ static int process_iter_arg(struct bpf_v
+  * is some statically known limit on number of iterations (e.g., if there is
+  * an explicit `if n > 100 then break;` statement somewhere in the loop).
+  *
+- * One very subtle but very important aspect is that we *always* simulate NULL
+- * condition first (as the current state) before we simulate non-NULL case.
+- * This has to do with intricacies of scalar precision tracking. By simulating
+- * "exit condition" of iter_next() returning NULL first, we make sure all the
+- * relevant precision marks *that will be set **after** we exit iterator loop*
+- * are propagated backwards to common parent state of NULL and non-NULL
+- * branches. Thanks to that, state equivalence checks done later in forked
+- * state, when reaching iter_next() for ACTIVE iterator, can assume that
+- * precision marks are finalized and won't change. Because simulating another
+- * ACTIVE iterator iteration won't change them (because given same input
+- * states we'll end up with exactly same output states which we are currently
+- * comparing; and verification after the loop already propagated back what
+- * needs to be **additionally** tracked as precise). It's subtle, grok
+- * precision tracking for more intuitive understanding.
++ * Iteration convergence logic in is_state_visited() relies on exact
++ * states comparison, which ignores read and precision marks.
++ * This is necessary because read and precision marks are not finalized
++ * while in the loop. Exact comparison might preclude convergence for
++ * simple programs like the one below:
++ *
++ *     i = 0;
++ *     while(iter_next(&it))
++ *       i++;
++ *
++ * At each iteration step i++ would produce a new distinct state and
++ * eventually instruction processing limit would be reached.
++ *
++ * To avoid such behavior, speculatively forget (widen) the range for
++ * imprecise scalar registers, if those registers were not precise at the
++ * end of the previous iteration and do not match exactly.
++ *
++ * This is a conservative heuristic that allows a wide range of programs to be
++ * verified; however, it precludes verification of programs that conjure an
++ * imprecise value on the first loop iteration and use it as precise on a second.
++ * For example, the following safe program would fail to verify:
++ *
++ *     struct bpf_num_iter it;
++ *     int arr[10];
++ *     int i = 0, a = 0;
++ *     bpf_iter_num_new(&it, 0, 10);
++ *     while (bpf_iter_num_next(&it)) {
++ *       if (a == 0) {
++ *         a = 1;
++ *         i = 7; // Because i changed, the verifier would forget
++ *                // its range on the second loop entry.
++ *       } else {
++ *         arr[i] = 42; // This would fail to verify.
++ *       }
++ *     }
++ *     bpf_iter_num_destroy(&it);
+  */
+ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
+                                 struct bpf_kfunc_call_arg_meta *meta)
+ {
+-      struct bpf_verifier_state *cur_st = env->cur_state, *queued_st;
++      struct bpf_verifier_state *cur_st = env->cur_state, *queued_st, *prev_st;
+       struct bpf_func_state *cur_fr = cur_st->frame[cur_st->curframe], *queued_fr;
+       struct bpf_reg_state *cur_iter, *queued_iter;
+       int iter_frameno = meta->iter.frameno;
+@@ -7631,6 +7729,19 @@ static int process_iter_next_call(struct
+       }
+       if (cur_iter->iter.state == BPF_ITER_STATE_ACTIVE) {
++              /* Because an iter_next() call is a checkpoint, is_state_visited()
++               * should guarantee a parent state with the same call sites and insn_idx.
++               */
++              if (!cur_st->parent || cur_st->parent->insn_idx != insn_idx ||
++                  !same_callsites(cur_st->parent, cur_st)) {
++                      verbose(env, "bug: bad parent state for iter next call");
++                      return -EFAULT;
++              }
++              /* Note cur_st->parent in the call below, it is necessary to skip
++               * checkpoint created for cur_st by is_state_visited()
++               * right at this instruction.
++               */
++              prev_st = find_prev_entry(env, cur_st->parent, insn_idx);
+               /* branch out active iter state */
+               queued_st = push_stack(env, insn_idx + 1, insn_idx, false);
+               if (!queued_st)
+@@ -7639,6 +7750,8 @@ static int process_iter_next_call(struct
+               queued_iter = &queued_st->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
+               queued_iter->iter.state = BPF_ITER_STATE_ACTIVE;
+               queued_iter->iter.depth++;
++              if (prev_st)
++                      widen_imprecise_scalars(env, prev_st, queued_st);
+               queued_fr = queued_st->frame[queued_st->curframe];
+               mark_ptr_not_null_reg(&queued_fr->regs[BPF_REG_0]);
+@@ -15560,8 +15673,11 @@ static bool regs_exact(const struct bpf_
+ /* Returns true if (rold safe implies rcur safe) */
+ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
+-                  struct bpf_reg_state *rcur, struct bpf_idmap *idmap)
++                  struct bpf_reg_state *rcur, struct bpf_idmap *idmap, bool exact)
+ {
++      if (exact)
++              return regs_exact(rold, rcur, idmap);
++
+       if (!(rold->live & REG_LIVE_READ))
+               /* explored state didn't use this */
+               return true;
+@@ -15678,7 +15794,7 @@ static bool regsafe(struct bpf_verifier_
+ }
+ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
+-                    struct bpf_func_state *cur, struct bpf_idmap *idmap)
++                    struct bpf_func_state *cur, struct bpf_idmap *idmap, bool exact)
+ {
+       int i, spi;
+@@ -15691,7 +15807,12 @@ static bool stacksafe(struct bpf_verifie
+               spi = i / BPF_REG_SIZE;
+-              if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)) {
++              if (exact &&
++                  old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
++                  cur->stack[spi].slot_type[i % BPF_REG_SIZE])
++                      return false;
++
++              if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ) && !exact) {
+                       i += BPF_REG_SIZE - 1;
+                       /* explored state didn't use this */
+                       continue;
+@@ -15741,7 +15862,7 @@ static bool stacksafe(struct bpf_verifie
+                        * return false to continue verification of this path
+                        */
+                       if (!regsafe(env, &old->stack[spi].spilled_ptr,
+-                                   &cur->stack[spi].spilled_ptr, idmap))
++                                   &cur->stack[spi].spilled_ptr, idmap, exact))
+                               return false;
+                       break;
+               case STACK_DYNPTR:
+@@ -15823,16 +15944,16 @@ static bool refsafe(struct bpf_func_stat
+  * the current state will reach 'bpf_exit' instruction safely
+  */
+ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_state *old,
+-                            struct bpf_func_state *cur)
++                            struct bpf_func_state *cur, bool exact)
+ {
+       int i;
+       for (i = 0; i < MAX_BPF_REG; i++)
+               if (!regsafe(env, &old->regs[i], &cur->regs[i],
+-                           &env->idmap_scratch))
++                           &env->idmap_scratch, exact))
+                       return false;
+-      if (!stacksafe(env, old, cur, &env->idmap_scratch))
++      if (!stacksafe(env, old, cur, &env->idmap_scratch, exact))
+               return false;
+       if (!refsafe(old, cur, &env->idmap_scratch))
+@@ -15841,17 +15962,23 @@ static bool func_states_equal(struct bpf
+       return true;
+ }
++static void reset_idmap_scratch(struct bpf_verifier_env *env)
++{
++      env->idmap_scratch.tmp_id_gen = env->id_gen;
++      memset(&env->idmap_scratch.map, 0, sizeof(env->idmap_scratch.map));
++}
++
+ static bool states_equal(struct bpf_verifier_env *env,
+                        struct bpf_verifier_state *old,
+-                       struct bpf_verifier_state *cur)
++                       struct bpf_verifier_state *cur,
++                       bool exact)
+ {
+       int i;
+       if (old->curframe != cur->curframe)
+               return false;
+-      env->idmap_scratch.tmp_id_gen = env->id_gen;
+-      memset(&env->idmap_scratch.map, 0, sizeof(env->idmap_scratch.map));
++      reset_idmap_scratch(env);
+       /* Verification state from speculative execution simulation
+        * must never prune a non-speculative execution one.
+@@ -15881,7 +16008,7 @@ static bool states_equal(struct bpf_veri
+       for (i = 0; i <= old->curframe; i++) {
+               if (old->frame[i]->callsite != cur->frame[i]->callsite)
+                       return false;
+-              if (!func_states_equal(env, old->frame[i], cur->frame[i]))
++              if (!func_states_equal(env, old->frame[i], cur->frame[i], exact))
+                       return false;
+       }
+       return true;
+@@ -16136,7 +16263,7 @@ static int is_state_visited(struct bpf_v
+       struct bpf_verifier_state_list *new_sl;
+       struct bpf_verifier_state_list *sl, **pprev;
+       struct bpf_verifier_state *cur = env->cur_state, *new;
+-      int i, j, err, states_cnt = 0;
++      int i, j, n, err, states_cnt = 0;
+       bool force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx);
+       bool add_new_state = force_new_state;
+@@ -16191,9 +16318,33 @@ static int is_state_visited(struct bpf_v
+                        * It's safe to assume that iterator loop will finish, taking into
+                        * account iter_next() contract of eventually returning
+                        * sticky NULL result.
++                       *
++                       * Note that states have to be compared exactly in this case because
++                       * read and precision marks might not be finalized inside the loop.
++                       * E.g. as in the program below:
++                       *
++                       *     1. r7 = -16
++                       *     2. r6 = bpf_get_prandom_u32()
++                       *     3. while (bpf_iter_num_next(&fp[-8])) {
++                       *     4.   if (r6 != 42) {
++                       *     5.     r7 = -32
++                       *     6.     r6 = bpf_get_prandom_u32()
++                       *     7.     continue
++                       *     8.   }
++                       *     9.   r0 = r10
++                       *    10.   r0 += r7
++                       *    11.   r8 = *(u64 *)(r0 + 0)
++                       *    12.   r6 = bpf_get_prandom_u32()
++                       *    13. }
++                       *
++                       * Here the verifier would first visit path 1-3, create a checkpoint at 3
++                       * with r7=-16, then continue to 4-7,3. The existing checkpoint at 3 does
++                       * not have a read or precision mark for r7 yet, thus an inexact states
++                       * comparison would discard the current state with r7=-32
++                       * => unsafe memory access at 11 would not be caught.
+                        */
+                       if (is_iter_next_insn(env, insn_idx)) {
+-                              if (states_equal(env, &sl->state, cur)) {
++                              if (states_equal(env, &sl->state, cur, true)) {
+                                       struct bpf_func_state *cur_frame;
+                                       struct bpf_reg_state *iter_state, *iter_reg;
+                                       int spi;
+@@ -16216,7 +16367,7 @@ static int is_state_visited(struct bpf_v
+                       }
+                       /* attempt to detect infinite loop to avoid unnecessary doomed work */
+                       if (states_maybe_looping(&sl->state, cur) &&
+-                          states_equal(env, &sl->state, cur) &&
++                          states_equal(env, &sl->state, cur, false) &&
+                           !iter_active_depths_differ(&sl->state, cur)) {
+                               verbose_linfo(env, insn_idx, "; ");
+                               verbose(env, "infinite loop detected at insn %d\n", insn_idx);
+@@ -16241,7 +16392,7 @@ skip_inf_loop_check:
+                               add_new_state = false;
+                       goto miss;
+               }
+-              if (states_equal(env, &sl->state, cur)) {
++              if (states_equal(env, &sl->state, cur, false)) {
+ hit:
+                       sl->hit_cnt++;
+                       /* reached equivalent register/stack state,
+@@ -16280,8 +16431,12 @@ miss:
+                * to keep checking from state equivalence point of view.
+                * Higher numbers increase max_states_per_insn and verification time,
+                * but do not meaningfully decrease insn_processed.
++               * 'n' controls how many times a state can miss before eviction.
++               * Use bigger 'n' for checkpoints because evicting checkpoint states
++               * too early would hinder iterator convergence.
+                */
+-              if (sl->miss_cnt > sl->hit_cnt * 3 + 3) {
++              n = is_force_checkpoint(env, insn_idx) && sl->state.branches > 0 ? 64 : 3;
++              if (sl->miss_cnt > sl->hit_cnt * n + n) {
+                       /* the state is unlikely to be useful. Remove it to
+                        * speed up verification
+                        */
+@@ -16355,6 +16510,7 @@ next:
+       cur->parent = new;
+       cur->first_insn_idx = insn_idx;
++      cur->dfs_depth = new->dfs_depth + 1;
+       clear_jmp_history(cur);
+       new_sl->next = *explored_state(env, insn_idx);
+       *explored_state(env, insn_idx) = new_sl;
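
As an illustration (not part of the patch), the counting loop from the commit
message written out as a complete program; hypothetical, assuming the
bpf_iter_num_* kfunc declarations from recent libbpf bpf_helpers.h. It only
converges under exact comparison because of the widening logic added above:

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>

    SEC("raw_tp")
    int iter_count(void *ctx)
    {
            struct bpf_iter_num it;
            int i = 0;

            bpf_iter_num_new(&it, 0, 10);
            /* Without widening, each simulated iteration would carry a new
             * known value of i (0, 1, 2, ...) and the exact states
             * comparison would never converge. Because i is never used in a
             * way that makes it precise, widen_imprecise_scalars() may reset
             * it to an unknown scalar, making the next iteration's state
             * exactly equal to the previous one.
             */
            while (bpf_iter_num_next(&it))
                    i++;
            bpf_iter_num_destroy(&it);
            return 0;
    }

    char _license[] SEC("license") = "GPL";
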
diff --git a/queue-6.6/bpf-extract-__check_reg_arg-utility-function.patch b/queue-6.6/bpf-extract-__check_reg_arg-utility-function.patch
new file mode 100644 (file)
index 0000000..7950158
--- /dev/null
@@ -0,0 +1,70 @@
+From 683b96f9606ab7308ffb23c46ab43cecdef8a241 Mon Sep 17 00:00:00 2001
+From: Eduard Zingerman <eddyz87@gmail.com>
+Date: Tue, 21 Nov 2023 04:06:54 +0200
+Subject: bpf: extract __check_reg_arg() utility function
+
+From: Eduard Zingerman <eddyz87@gmail.com>
+
+commit 683b96f9606ab7308ffb23c46ab43cecdef8a241 upstream.
+
+Split check_reg_arg() into two utility functions:
+- check_reg_arg() operating on registers from the current verifier state;
+- __check_reg_arg() operating on a specific set of registers passed as
+  a parameter.
+
+The __check_reg_arg() function would be used by a follow-up change for
+callback handling.
+
+Acked-by: Andrii Nakryiko <andrii@kernel.org>
+Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
+Link: https://lore.kernel.org/r/20231121020701.26440-5-eddyz87@gmail.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/bpf/verifier.c |   19 +++++++++++++------
+ 1 file changed, 13 insertions(+), 6 deletions(-)
+
+--- a/kernel/bpf/verifier.c
++++ b/kernel/bpf/verifier.c
+@@ -3321,13 +3321,11 @@ static void mark_insn_zext(struct bpf_ve
+       reg->subreg_def = DEF_NOT_SUBREG;
+ }
+-static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
+-                       enum reg_arg_type t)
++static int __check_reg_arg(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno,
++                         enum reg_arg_type t)
+ {
+-      struct bpf_verifier_state *vstate = env->cur_state;
+-      struct bpf_func_state *state = vstate->frame[vstate->curframe];
+       struct bpf_insn *insn = env->prog->insnsi + env->insn_idx;
+-      struct bpf_reg_state *reg, *regs = state->regs;
++      struct bpf_reg_state *reg;
+       bool rw64;
+       if (regno >= MAX_BPF_REG) {
+@@ -3368,6 +3366,15 @@ static int check_reg_arg(struct bpf_veri
+       return 0;
+ }
++static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
++                       enum reg_arg_type t)
++{
++      struct bpf_verifier_state *vstate = env->cur_state;
++      struct bpf_func_state *state = vstate->frame[vstate->curframe];
++
++      return __check_reg_arg(env, state->regs, regno, t);
++}
++
+ static void mark_jmp_point(struct bpf_verifier_env *env, int idx)
+ {
+       env->insn_aux_data[idx].jmp_point = true;
+@@ -9147,7 +9154,7 @@ static void clear_caller_saved_regs(stru
+       /* after the call registers r0 - r5 were scratched */
+       for (i = 0; i < CALLER_SAVED_REGS; i++) {
+               mark_reg_not_init(env, regs, caller_saved[i]);
+-              check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
++              __check_reg_arg(env, regs, caller_saved[i], DST_OP_NO_MARK);
+       }
+ }
diff --git a/queue-6.6/bpf-extract-same_callsites-as-utility-function.patch b/queue-6.6/bpf-extract-same_callsites-as-utility-function.patch
new file mode 100644 (file)
index 0000000..5a391e8
--- /dev/null
@@ -0,0 +1,63 @@
+From 4c97259abc9bc8df7712f76f58ce385581876857 Mon Sep 17 00:00:00 2001
+From: Eduard Zingerman <eddyz87@gmail.com>
+Date: Tue, 24 Oct 2023 03:09:12 +0300
+Subject: bpf: extract same_callsites() as utility function
+
+From: Eduard Zingerman <eddyz87@gmail.com>
+
+commit 4c97259abc9bc8df7712f76f58ce385581876857 upstream.
+
+Extract same_callsites() from clean_live_states() as a utility function.
+This function would be used by the next patch in the set.
+
+Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
+Link: https://lore.kernel.org/r/20231024000917.12153-3-eddyz87@gmail.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/bpf/verifier.c |   20 +++++++++++++++-----
+ 1 file changed, 15 insertions(+), 5 deletions(-)
+
+--- a/kernel/bpf/verifier.c
++++ b/kernel/bpf/verifier.c
+@@ -1799,6 +1799,20 @@ static struct bpf_verifier_state_list **
+       return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)];
+ }
++static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_state *b)
++{
++      int fr;
++
++      if (a->curframe != b->curframe)
++              return false;
++
++      for (fr = a->curframe; fr >= 0; fr--)
++              if (a->frame[fr]->callsite != b->frame[fr]->callsite)
++                      return false;
++
++      return true;
++}
++
+ static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
+ {
+       while (st) {
+@@ -15521,18 +15535,14 @@ static void clean_live_states(struct bpf
+                             struct bpf_verifier_state *cur)
+ {
+       struct bpf_verifier_state_list *sl;
+-      int i;
+       sl = *explored_state(env, insn);
+       while (sl) {
+               if (sl->state.branches)
+                       goto next;
+               if (sl->state.insn_idx != insn ||
+-                  sl->state.curframe != cur->curframe)
++                  !same_callsites(&sl->state, cur))
+                       goto next;
+-              for (i = 0; i <= cur->curframe; i++)
+-                      if (sl->state.frame[i]->callsite != cur->frame[i]->callsite)
+-                              goto next;
+               clean_verifier_state(env, &sl->state);
+ next:
+               sl = sl->next;
diff --git a/queue-6.6/bpf-extract-setup_func_entry-utility-function.patch b/queue-6.6/bpf-extract-setup_func_entry-utility-function.patch
new file mode 100644 (file)
index 0000000..739eafe
--- /dev/null
@@ -0,0 +1,149 @@
+From 58124a98cb8eda69d248d7f1de954c8b2767c945 Mon Sep 17 00:00:00 2001
+From: Eduard Zingerman <eddyz87@gmail.com>
+Date: Tue, 21 Nov 2023 04:06:55 +0200
+Subject: bpf: extract setup_func_entry() utility function
+
+From: Eduard Zingerman <eddyz87@gmail.com>
+
+commit 58124a98cb8eda69d248d7f1de954c8b2767c945 upstream.
+
+Move code for simulated stack frame creation to a separate utility
+function. This function would be used in the follow-up change for
+callback handling.
+
+Acked-by: Andrii Nakryiko <andrii@kernel.org>
+Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
+Link: https://lore.kernel.org/r/20231121020701.26440-6-eddyz87@gmail.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/bpf/verifier.c |   84 ++++++++++++++++++++++++++++----------------------
+ 1 file changed, 48 insertions(+), 36 deletions(-)
+
+--- a/kernel/bpf/verifier.c
++++ b/kernel/bpf/verifier.c
+@@ -9167,11 +9167,10 @@ static int set_callee_state(struct bpf_v
+                           struct bpf_func_state *caller,
+                           struct bpf_func_state *callee, int insn_idx);
+-static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+-                           int *insn_idx, int subprog,
+-                           set_callee_state_fn set_callee_state_cb)
++static int setup_func_entry(struct bpf_verifier_env *env, int subprog, int callsite,
++                          set_callee_state_fn set_callee_state_cb,
++                          struct bpf_verifier_state *state)
+ {
+-      struct bpf_verifier_state *state = env->cur_state;
+       struct bpf_func_state *caller, *callee;
+       int err;
+@@ -9181,13 +9180,53 @@ static int __check_func_call(struct bpf_
+               return -E2BIG;
+       }
+-      caller = state->frame[state->curframe];
+       if (state->frame[state->curframe + 1]) {
+               verbose(env, "verifier bug. Frame %d already allocated\n",
+                       state->curframe + 1);
+               return -EFAULT;
+       }
++      caller = state->frame[state->curframe];
++      callee = kzalloc(sizeof(*callee), GFP_KERNEL);
++      if (!callee)
++              return -ENOMEM;
++      state->frame[state->curframe + 1] = callee;
++
++      /* callee cannot access r0, r6 - r9 for reading and has to write
++       * into its own stack before reading from it.
++       * callee can read/write into caller's stack
++       */
++      init_func_state(env, callee,
++                      /* remember the callsite, it will be used by bpf_exit */
++                      callsite,
++                      state->curframe + 1 /* frameno within this callchain */,
++                      subprog /* subprog number within this prog */);
++      /* Transfer references to the callee */
++      err = copy_reference_state(callee, caller);
++      err = err ?: set_callee_state_cb(env, caller, callee, callsite);
++      if (err)
++              goto err_out;
++
++      /* only increment it after check_reg_arg() finished */
++      state->curframe++;
++
++      return 0;
++
++err_out:
++      free_func_state(callee);
++      state->frame[state->curframe + 1] = NULL;
++      return err;
++}
++
++static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
++                           int *insn_idx, int subprog,
++                           set_callee_state_fn set_callee_state_cb)
++{
++      struct bpf_verifier_state *state = env->cur_state;
++      struct bpf_func_state *caller, *callee;
++      int err;
++
++      caller = state->frame[state->curframe];
+       err = btf_check_subprog_call(env, subprog, caller->regs);
+       if (err == -EFAULT)
+               return err;
+@@ -9256,35 +9295,12 @@ static int __check_func_call(struct bpf_
+               return 0;
+       }
+-      callee = kzalloc(sizeof(*callee), GFP_KERNEL);
+-      if (!callee)
+-              return -ENOMEM;
+-      state->frame[state->curframe + 1] = callee;
+-
+-      /* callee cannot access r0, r6 - r9 for reading and has to write
+-       * into its own stack before reading from it.
+-       * callee can read/write into caller's stack
+-       */
+-      init_func_state(env, callee,
+-                      /* remember the callsite, it will be used by bpf_exit */
+-                      *insn_idx /* callsite */,
+-                      state->curframe + 1 /* frameno within this callchain */,
+-                      subprog /* subprog number within this prog */);
+-
+-      /* Transfer references to the callee */
+-      err = copy_reference_state(callee, caller);
++      err = setup_func_entry(env, subprog, *insn_idx, set_callee_state_cb, state);
+       if (err)
+-              goto err_out;
+-
+-      err = set_callee_state_cb(env, caller, callee, *insn_idx);
+-      if (err)
+-              goto err_out;
++              return err;
+       clear_caller_saved_regs(env, caller->regs);
+-      /* only increment it after check_reg_arg() finished */
+-      state->curframe++;
+-
+       /* and go analyze first insn of the callee */
+       *insn_idx = env->subprog_info[subprog].start - 1;
+@@ -9292,14 +9308,10 @@ static int __check_func_call(struct bpf_
+               verbose(env, "caller:\n");
+               print_verifier_state(env, caller, true);
+               verbose(env, "callee:\n");
+-              print_verifier_state(env, callee, true);
++              print_verifier_state(env, state->frame[state->curframe], true);
+       }
+-      return 0;
+-err_out:
+-      free_func_state(callee);
+-      state->frame[state->curframe + 1] = NULL;
+-      return err;
++      return 0;
+ }
+ int map_set_for_each_callback_args(struct bpf_verifier_env *env,
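
The helpers extracted above feed into callback verification; the next patch
bounds how many times a bpf_loop() callback body is simulated, tracking the
count per frame. As an illustration (not taken from the patches or selftests),
a hypothetical nested bpf_loop() program showing why the counter lives in
bpf_func_state, i.e. per frame, rather than in the verifier state as a whole:

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>

    struct num_ctx { int i; };

    static int inner_cb(__u32 idx, void *data)
    {
            ((struct num_ctx *)data)->i++;
            return 0;
    }

    static int outer_cb(__u32 idx, void *data)
    {
            /* inner_cb bodies are simulated in frame N+2 and counted in
             * outer_cb's frame N+1, independently of the count kept in
             * prog's frame N for outer_cb itself.
             */
            bpf_loop(2, inner_cb, data, 0);
            return 0;
    }

    SEC("raw_tp")
    int prog(void *ctx)
    {
            struct num_ctx c = { .i = 0 };

            bpf_loop(2, outer_cb, &c, 0);
            return 0;
    }

    char _license[] SEC("license") = "GPL";
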
diff --git a/queue-6.6/bpf-keep-track-of-max-number-of-bpf_loop-callback-iterations.patch b/queue-6.6/bpf-keep-track-of-max-number-of-bpf_loop-callback-iterations.patch
new file mode 100644 (file)
index 0000000..492ce43
--- /dev/null
@@ -0,0 +1,181 @@
+From bb124da69c47dd98d69361ec13244ece50bec63e Mon Sep 17 00:00:00 2001
+From: Eduard Zingerman <eddyz87@gmail.com>
+Date: Tue, 21 Nov 2023 04:07:00 +0200
+Subject: bpf: keep track of max number of bpf_loop callback iterations
+
+From: Eduard Zingerman <eddyz87@gmail.com>
+
+commit bb124da69c47dd98d69361ec13244ece50bec63e upstream.
+
+In some cases the verifier can't infer convergence of the bpf_loop()
+iteration. E.g. for the following program:
+
+    static int cb(__u32 idx, struct num_context* ctx)
+    {
+        ctx->i++;
+        return 0;
+    }
+
+    SEC("?raw_tp")
+    int prog(void *_)
+    {
+        struct num_context ctx = { .i = 0 };
+        __u8 choice_arr[2] = { 0, 1 };
+
+        bpf_loop(2, cb, &ctx, 0);
+        return choice_arr[ctx.i];
+    }
+
+Each 'cb' simulation would eventually return to 'prog' and reach the
+'return choice_arr[ctx.i]' statement, at which point ctx.i would be
+marked precise, thus forcing the verifier to track a multitude of separate
+states with {.i=0}, {.i=1}, ... at the bpf_loop() callback entry.
+
+This commit allows "brute force" handling for such cases by limiting the
+number of callback body simulations using the 'umax' value of the first
+bpf_loop() parameter.
+
+For this, extend bpf_func_state with a 'callback_depth' field.
+Increment this field when a callback-visiting state is pushed onto the
+states traversal stack. For frame #N, its 'callback_depth' field counts
+how many times a callback with frame depth N+1 has been executed.
+Use bpf_func_state specifically to allow independent tracking of
+callback depths when multiple nested bpf_loop() calls are present.
+
+Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
+Link: https://lore.kernel.org/r/20231121020701.26440-11-eddyz87@gmail.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/bpf_verifier.h                                   |   11 +++
+ kernel/bpf/verifier.c                                          |   19 ++++-
+ tools/testing/selftests/bpf/progs/verifier_subprog_precision.c |   35 +++++++---
+ 3 files changed, 53 insertions(+), 12 deletions(-)
+
+--- a/include/linux/bpf_verifier.h
++++ b/include/linux/bpf_verifier.h
+@@ -300,6 +300,17 @@ struct bpf_func_state {
+       bool in_callback_fn;
+       struct tnum callback_ret_range;
+       bool in_async_callback_fn;
++      /* For callback-calling functions that limit the number of possible
++       * callback executions (e.g. bpf_loop), this keeps track of the current
++       * simulated iteration number.
++       * The value in frame N refers to the number of times a callback with frame
++       * N+1 was simulated, e.g. for the following call:
++       *
++       *   bpf_loop(..., fn, ...); | suppose current frame is N
++       *                           | fn would be simulated in frame N+1
++       *                           | number of simulations is tracked in frame N
++       */
++      u32 callback_depth;
+       /* The following fields should be last. See copy_func_state() */
+       int acquired_refs;
+--- a/kernel/bpf/verifier.c
++++ b/kernel/bpf/verifier.c
+@@ -9301,6 +9301,8 @@ static int push_callback_call(struct bpf
+               return err;
+       callback_state->callback_unroll_depth++;
++      callback_state->frame[callback_state->curframe - 1]->callback_depth++;
++      caller->callback_depth = 0;
+       return 0;
+ }
+@@ -10090,8 +10092,21 @@ static int check_helper_call(struct bpf_
+               break;
+       case BPF_FUNC_loop:
+               update_loop_inline_state(env, meta.subprogno);
+-              err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+-                                       set_loop_callback_state);
++              /* Verifier relies on R1 value to determine if bpf_loop() iteration
++               * is finished, thus mark it precise.
++               */
++              err = mark_chain_precision(env, BPF_REG_1);
++              if (err)
++                      return err;
++              if (cur_func(env)->callback_depth < regs[BPF_REG_1].umax_value) {
++                      err = push_callback_call(env, insn, insn_idx, meta.subprogno,
++                                               set_loop_callback_state);
++              } else {
++                      cur_func(env)->callback_depth = 0;
++                      if (env->log.level & BPF_LOG_LEVEL2)
++                              verbose(env, "frame%d bpf_loop iteration limit reached\n",
++                                      env->cur_state->curframe);
++              }
+               break;
+       case BPF_FUNC_dynptr_from_mem:
+               if (regs[BPF_REG_1].type != PTR_TO_MAP_VALUE) {
+--- a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c
++++ b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c
+@@ -119,7 +119,23 @@ __naked int global_subprog_result_precis
+ SEC("?raw_tp")
+ __success __log_level(2)
+-/* First simulated path does not include callback body */
++/* First simulated path does not include callback body,
++ * r1 and r4 are always precise for bpf_loop() calls.
++ */
++__msg("9: (85) call bpf_loop#181")
++__msg("mark_precise: frame0: last_idx 9 first_idx 9 subseq_idx -1")
++__msg("mark_precise: frame0: parent state regs=r4 stack=:")
++__msg("mark_precise: frame0: last_idx 8 first_idx 0 subseq_idx 9")
++__msg("mark_precise: frame0: regs=r4 stack= before 8: (b7) r4 = 0")
++__msg("mark_precise: frame0: last_idx 9 first_idx 9 subseq_idx -1")
++__msg("mark_precise: frame0: parent state regs=r1 stack=:")
++__msg("mark_precise: frame0: last_idx 8 first_idx 0 subseq_idx 9")
++__msg("mark_precise: frame0: regs=r1 stack= before 8: (b7) r4 = 0")
++__msg("mark_precise: frame0: regs=r1 stack= before 7: (b7) r3 = 0")
++__msg("mark_precise: frame0: regs=r1 stack= before 6: (bf) r2 = r8")
++__msg("mark_precise: frame0: regs=r1 stack= before 5: (bf) r1 = r6")
++__msg("mark_precise: frame0: regs=r6 stack= before 4: (b7) r6 = 3")
++/* r6 precision propagation */
+ __msg("14: (0f) r1 += r6")
+ __msg("mark_precise: frame0: last_idx 14 first_idx 9")
+ __msg("mark_precise: frame0: regs=r6 stack= before 13: (bf) r1 = r7")
+@@ -134,10 +150,9 @@ __msg("17: (b7) r0 = 0")
+ __msg("18: (95) exit")
+ __msg("returning from callee:")
+ __msg("to caller at 9:")
+-/* r4 (flags) is always precise for bpf_loop() */
+-__msg("frame 0: propagating r4")
++__msg("frame 0: propagating r1,r4")
+ __msg("mark_precise: frame0: last_idx 9 first_idx 9 subseq_idx -1")
+-__msg("mark_precise: frame0: regs=r4 stack= before 18: (95) exit")
++__msg("mark_precise: frame0: regs=r1,r4 stack= before 18: (95) exit")
+ __msg("from 18 to 9: safe")
+ __naked int callback_result_precise(void)
+ {
+@@ -264,12 +279,12 @@ __msg("15: (b7) r0 = 0")
+ __msg("16: (95) exit")
+ __msg("returning from callee:")
+ __msg("to caller at 9:")
+-/* r4 (flags) is always precise for bpf_loop(),
++/* r1, r4 are always precise for bpf_loop(),
+  * r6 was marked before backtracking to callback body.
+  */
+-__msg("frame 0: propagating r4,r6")
++__msg("frame 0: propagating r1,r4,r6")
+ __msg("mark_precise: frame0: last_idx 9 first_idx 9 subseq_idx -1")
+-__msg("mark_precise: frame0: regs=r4,r6 stack= before 16: (95) exit")
++__msg("mark_precise: frame0: regs=r1,r4,r6 stack= before 16: (95) exit")
+ __msg("mark_precise: frame1: regs= stack= before 15: (b7) r0 = 0")
+ __msg("mark_precise: frame1: regs= stack= before 9: (85) call bpf_loop")
+ __msg("mark_precise: frame0: parent state regs= stack=:")
+@@ -422,12 +437,12 @@ __msg("17: (b7) r0 = 0")
+ __msg("18: (95) exit")
+ __msg("returning from callee:")
+ __msg("to caller at 10:")
+-/* r4 (flags) is always precise for bpf_loop(),
++/* r1, r4 are always precise for bpf_loop(),
+  * fp-8 was marked before backtracking to callback body.
+  */
+-__msg("frame 0: propagating r4,fp-8")
++__msg("frame 0: propagating r1,r4,fp-8")
+ __msg("mark_precise: frame0: last_idx 10 first_idx 10 subseq_idx -1")
+-__msg("mark_precise: frame0: regs=r4 stack=-8 before 18: (95) exit")
++__msg("mark_precise: frame0: regs=r1,r4 stack=-8 before 18: (95) exit")
+ __msg("mark_precise: frame1: regs= stack= before 17: (b7) r0 = 0")
+ __msg("mark_precise: frame1: regs= stack= before 10: (85) call bpf_loop#181")
+ __msg("mark_precise: frame0: parent state regs= stack=:")
diff --git a/queue-6.6/bpf-move-explored_state-closer-to-the-beginning-of-verifier.c.patch b/queue-6.6/bpf-move-explored_state-closer-to-the-beginning-of-verifier.c.patch
new file mode 100644 (file)
index 0000000..a18642a
--- /dev/null
@@ -0,0 +1,64 @@
+From 3c4e420cb6536026ddd50eaaff5f30e4f144200d Mon Sep 17 00:00:00 2001
+From: Eduard Zingerman <eddyz87@gmail.com>
+Date: Tue, 24 Oct 2023 03:09:11 +0300
+Subject: bpf: move explored_state() closer to the beginning of verifier.c
+
+From: Eduard Zingerman <eddyz87@gmail.com>
+
+commit 3c4e420cb6536026ddd50eaaff5f30e4f144200d upstream.
+
+Subsequent patches will make use of the explored_state() function.
+Move it up to avoid adding an unnecessary prototype.
+
+Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
+Link: https://lore.kernel.org/r/20231024000917.12153-2-eddyz87@gmail.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/bpf/verifier.c |   28 +++++++++++++---------------
+ 1 file changed, 13 insertions(+), 15 deletions(-)
+
+--- a/kernel/bpf/verifier.c
++++ b/kernel/bpf/verifier.c
+@@ -1786,6 +1786,19 @@ static int copy_verifier_state(struct bp
+       return 0;
+ }
++static u32 state_htab_size(struct bpf_verifier_env *env)
++{
++      return env->prog->len;
++}
++
++static struct bpf_verifier_state_list **explored_state(struct bpf_verifier_env *env, int idx)
++{
++      struct bpf_verifier_state *cur = env->cur_state;
++      struct bpf_func_state *state = cur->frame[cur->curframe];
++
++      return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)];
++}
++
+ static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
+ {
+       while (st) {
+@@ -14702,21 +14715,6 @@ enum {
+       BRANCH = 2,
+ };
+-static u32 state_htab_size(struct bpf_verifier_env *env)
+-{
+-      return env->prog->len;
+-}
+-
+-static struct bpf_verifier_state_list **explored_state(
+-                                      struct bpf_verifier_env *env,
+-                                      int idx)
+-{
+-      struct bpf_verifier_state *cur = env->cur_state;
+-      struct bpf_func_state *state = cur->frame[cur->curframe];
+-
+-      return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)];
+-}
+-
+ static void mark_prune_point(struct bpf_verifier_env *env, int idx)
+ {
+       env->insn_aux_data[idx].prune_point = true;
diff --git a/queue-6.6/bpf-print-full-verifier-states-on-infinite-loop-detection.patch b/queue-6.6/bpf-print-full-verifier-states-on-infinite-loop-detection.patch
new file mode 100644 (file)
index 0000000..b5d831a
--- /dev/null
@@ -0,0 +1,34 @@
+From b4d8239534fddc036abe4a0fdbf474d9894d4641 Mon Sep 17 00:00:00 2001
+From: Eduard Zingerman <eddyz87@gmail.com>
+Date: Tue, 24 Oct 2023 03:09:17 +0300
+Subject: bpf: print full verifier states on infinite loop detection
+
+From: Eduard Zingerman <eddyz87@gmail.com>
+
+commit b4d8239534fddc036abe4a0fdbf474d9894d4641 upstream.
+
+Additional logging in is_state_visited(): if an infinite loop is
+detected, print the full verifier state for both the current and the
+equivalent state.
+
+Acked-by: Andrii Nakryiko <andrii@kernel.org>
+Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
+Link: https://lore.kernel.org/r/20231024000917.12153-8-eddyz87@gmail.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/bpf/verifier.c |    4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/kernel/bpf/verifier.c
++++ b/kernel/bpf/verifier.c
+@@ -16540,6 +16540,10 @@ static int is_state_visited(struct bpf_v
+                           !iter_active_depths_differ(&sl->state, cur)) {
+                               verbose_linfo(env, insn_idx, "; ");
+                               verbose(env, "infinite loop detected at insn %d\n", insn_idx);
++                              verbose(env, "cur state:");
++                              print_verifier_state(env, cur->frame[cur->curframe], true);
++                              verbose(env, "old state:");
++                              print_verifier_state(env, sl->state.frame[cur->curframe], true);
+                               return -EINVAL;
+                       }
+                       /* if the verifier is processing a loop, avoid adding new state
diff --git a/queue-6.6/bpf-verify-callbacks-as-if-they-are-called-unknown-number-of-times.patch b/queue-6.6/bpf-verify-callbacks-as-if-they-are-called-unknown-number-of-times.patch
new file mode 100644 (file)
index 0000000..5405ba9
--- /dev/null
@@ -0,0 +1,660 @@
+From ab5cfac139ab8576fb54630d4cca23c3e690ee90 Mon Sep 17 00:00:00 2001
+From: Eduard Zingerman <eddyz87@gmail.com>
+Date: Tue, 21 Nov 2023 04:06:56 +0200
+Subject: bpf: verify callbacks as if they are called unknown number of times
+
+From: Eduard Zingerman <eddyz87@gmail.com>
+
+commit ab5cfac139ab8576fb54630d4cca23c3e690ee90 upstream.
+
+Prior to this patch, callbacks were handled as regular function calls:
+execution of the callback body was modeled exactly once.
+This patch updates the callback handling logic as follows:
+- introduces a function push_callback_call() that schedules callback
+  body verification in the env->head stack;
+- updates prepare_func_exit() to reschedule callback body verification
+  upon BPF_EXIT;
+- like calls to bpf_*_iter_next(), calls to callback-invoking functions
+  are marked as checkpoints;
+- is_state_visited() is updated to stop callback-based iteration when
+  some identical parent state is found.
+
+Paths where the callback function is invoked zero times are now
+verified first, which made it necessary to modify some selftests
+(see the sketch after this list):
+- the following negative tests required adding release/unlock/drop
+  calls to avoid previously masked unrelated error reports:
+  - cb_refs.c:underflow_prog
+  - exceptions_fail.c:reject_rbtree_add_throw
+  - exceptions_fail.c:reject_with_cp_reference
+- the following precision tracking selftests needed a change in the
+  expected log trace:
+  - verifier_subprog_precision.c:callback_result_precise
+    (note: r0 precision is no longer propagated inside the callback,
+           and I think this is correct behavior)
+  - verifier_subprog_precision.c:parent_callee_saved_reg_precise_with_callback
+  - verifier_subprog_precision.c:parent_stack_slot_precise_with_callback
+
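+A minimal sketch of the zero-iterations case (patterned after the
+verifier_iterating_callbacks selftests; names are illustrative, not
+the verbatim selftest code):
+
+    static int cb(__u32 idx, struct num_context *ctx)
+    {
+        ctx->i = 0;    /* makes the index below safe */
+        return 0;
+    }
+
+    struct num_context ctx = { .i = 32 };
+
+    bpf_loop(100, cb, &ctx, 0);
+    /* unsafe if the callback ran zero times and ctx.i is still 32 */
+    return choice_arr[ctx.i];
+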
+Reported-by: Andrew Werner <awerner32@gmail.com>
+Closes: https://lore.kernel.org/bpf/CA+vRuzPChFNXmouzGG+wsy=6eMcfr1mFG0F3g7rbg-sedGKW3w@mail.gmail.com/
+Acked-by: Andrii Nakryiko <andrii@kernel.org>
+Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
+Link: https://lore.kernel.org/r/20231121020701.26440-7-eddyz87@gmail.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/bpf_verifier.h                                   |    5 
+ kernel/bpf/verifier.c                                          |  272 ++++++----
+ tools/testing/selftests/bpf/progs/cb_refs.c                    |    1 
+ tools/testing/selftests/bpf/progs/verifier_subprog_precision.c |   71 ++
+ 4 files changed, 237 insertions(+), 112 deletions(-)
+
+--- a/include/linux/bpf_verifier.h
++++ b/include/linux/bpf_verifier.h
+@@ -399,6 +399,7 @@ struct bpf_verifier_state {
+       struct bpf_idx_pair *jmp_history;
+       u32 jmp_history_cnt;
+       u32 dfs_depth;
++      u32 callback_unroll_depth;
+ };
+ #define bpf_get_spilled_reg(slot, frame)                              \
+@@ -506,6 +507,10 @@ struct bpf_insn_aux_data {
+        * this instruction, regardless of any heuristics
+        */
+       bool force_checkpoint;
++      /* true if instruction is a call to a helper function that
++       * accepts callback function as a parameter.
++       */
++      bool calls_callback;
+ };
+ #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
+--- a/kernel/bpf/verifier.c
++++ b/kernel/bpf/verifier.c
+@@ -542,12 +542,11 @@ static bool is_dynptr_ref_function(enum
+       return func_id == BPF_FUNC_dynptr_data;
+ }
+-static bool is_callback_calling_kfunc(u32 btf_id);
++static bool is_sync_callback_calling_kfunc(u32 btf_id);
+-static bool is_callback_calling_function(enum bpf_func_id func_id)
++static bool is_sync_callback_calling_function(enum bpf_func_id func_id)
+ {
+       return func_id == BPF_FUNC_for_each_map_elem ||
+-             func_id == BPF_FUNC_timer_set_callback ||
+              func_id == BPF_FUNC_find_vma ||
+              func_id == BPF_FUNC_loop ||
+              func_id == BPF_FUNC_user_ringbuf_drain;
+@@ -558,6 +557,18 @@ static bool is_async_callback_calling_fu
+       return func_id == BPF_FUNC_timer_set_callback;
+ }
++static bool is_callback_calling_function(enum bpf_func_id func_id)
++{
++      return is_sync_callback_calling_function(func_id) ||
++             is_async_callback_calling_function(func_id);
++}
++
++static bool is_sync_callback_calling_insn(struct bpf_insn *insn)
++{
++      return (bpf_helper_call(insn) && is_sync_callback_calling_function(insn->imm)) ||
++             (bpf_pseudo_kfunc_call(insn) && is_sync_callback_calling_kfunc(insn->imm));
++}
++
+ static bool is_storage_get_function(enum bpf_func_id func_id)
+ {
+       return func_id == BPF_FUNC_sk_storage_get ||
+@@ -1772,6 +1783,7 @@ static int copy_verifier_state(struct bp
+       dst_state->first_insn_idx = src->first_insn_idx;
+       dst_state->last_insn_idx = src->last_insn_idx;
+       dst_state->dfs_depth = src->dfs_depth;
++      dst_state->callback_unroll_depth = src->callback_unroll_depth;
+       dst_state->used_as_loop_entry = src->used_as_loop_entry;
+       for (i = 0; i <= src->curframe; i++) {
+               dst = dst_state->frame[i];
+@@ -3613,6 +3625,8 @@ static void fmt_stack_mask(char *buf, ss
+       }
+ }
++static bool calls_callback(struct bpf_verifier_env *env, int insn_idx);
++
+ /* For given verifier state backtrack_insn() is called from the last insn to
+  * the first insn. Its purpose is to compute a bitmask of registers and
+  * stack slots that needs precision in the parent verifier state.
+@@ -3788,16 +3802,13 @@ static int backtrack_insn(struct bpf_ver
+                                       return -EFAULT;
+                               return 0;
+                       }
+-              } else if ((bpf_helper_call(insn) &&
+-                          is_callback_calling_function(insn->imm) &&
+-                          !is_async_callback_calling_function(insn->imm)) ||
+-                         (bpf_pseudo_kfunc_call(insn) && is_callback_calling_kfunc(insn->imm))) {
+-                      /* callback-calling helper or kfunc call, which means
+-                       * we are exiting from subprog, but unlike the subprog
+-                       * call handling above, we shouldn't propagate
+-                       * precision of r1-r5 (if any requested), as they are
+-                       * not actually arguments passed directly to callback
+-                       * subprogs
++              } else if (is_sync_callback_calling_insn(insn) && idx != subseq_idx - 1) {
++                      /* exit from callback subprog to callback-calling helper or
++                       * kfunc call. Use idx/subseq_idx check to discern it from
++                       * straight line code backtracking.
++                       * Unlike the subprog call handling above, we shouldn't
++                       * propagate precision of r1-r5 (if any requested), as they are
++                       * not actually arguments passed directly to callback subprogs
+                        */
+                       if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) {
+                               verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
+@@ -3832,10 +3843,18 @@ static int backtrack_insn(struct bpf_ver
+               } else if (opcode == BPF_EXIT) {
+                       bool r0_precise;
++                      /* Backtracking to a nested function call, 'idx' is a part of
++                       * the inner frame 'subseq_idx' is a part of the outer frame.
++                       * In case of a regular function call, instructions giving
++                       * precision to registers R1-R5 should have been found already.
++                       * In case of a callback, it is ok to have R1-R5 marked for
++                       * backtracking, as these registers are set by the function
++                       * invoking callback.
++                       */
++                      if (subseq_idx >= 0 && calls_callback(env, subseq_idx))
++                              for (i = BPF_REG_1; i <= BPF_REG_5; i++)
++                                      bt_clear_reg(bt, i);
+                       if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
+-                              /* if backtracing was looking for registers R1-R5
+-                               * they should have been found already.
+-                               */
+                               verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
+                               WARN_ONCE(1, "verifier backtracking bug");
+                               return -EFAULT;
+@@ -9218,11 +9237,11 @@ err_out:
+       return err;
+ }
+-static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+-                           int *insn_idx, int subprog,
+-                           set_callee_state_fn set_callee_state_cb)
++static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
++                            int insn_idx, int subprog,
++                            set_callee_state_fn set_callee_state_cb)
+ {
+-      struct bpf_verifier_state *state = env->cur_state;
++      struct bpf_verifier_state *state = env->cur_state, *callback_state;
+       struct bpf_func_state *caller, *callee;
+       int err;
+@@ -9230,43 +9249,21 @@ static int __check_func_call(struct bpf_
+       err = btf_check_subprog_call(env, subprog, caller->regs);
+       if (err == -EFAULT)
+               return err;
+-      if (subprog_is_global(env, subprog)) {
+-              if (err) {
+-                      verbose(env, "Caller passes invalid args into func#%d\n",
+-                              subprog);
+-                      return err;
+-              } else {
+-                      if (env->log.level & BPF_LOG_LEVEL)
+-                              verbose(env,
+-                                      "Func#%d is global and valid. Skipping.\n",
+-                                      subprog);
+-                      clear_caller_saved_regs(env, caller->regs);
+-
+-                      /* All global functions return a 64-bit SCALAR_VALUE */
+-                      mark_reg_unknown(env, caller->regs, BPF_REG_0);
+-                      caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
+-
+-                      /* continue with next insn after call */
+-                      return 0;
+-              }
+-      }
+       /* set_callee_state is used for direct subprog calls, but we are
+        * interested in validating only BPF helpers that can call subprogs as
+        * callbacks
+        */
+-      if (set_callee_state_cb != set_callee_state) {
+-              if (bpf_pseudo_kfunc_call(insn) &&
+-                  !is_callback_calling_kfunc(insn->imm)) {
+-                      verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n",
+-                              func_id_name(insn->imm), insn->imm);
+-                      return -EFAULT;
+-              } else if (!bpf_pseudo_kfunc_call(insn) &&
+-                         !is_callback_calling_function(insn->imm)) { /* helper */
+-                      verbose(env, "verifier bug: helper %s#%d not marked as callback-calling\n",
+-                              func_id_name(insn->imm), insn->imm);
+-                      return -EFAULT;
+-              }
++      if (bpf_pseudo_kfunc_call(insn) &&
++          !is_sync_callback_calling_kfunc(insn->imm)) {
++              verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n",
++                      func_id_name(insn->imm), insn->imm);
++              return -EFAULT;
++      } else if (!bpf_pseudo_kfunc_call(insn) &&
++                 !is_callback_calling_function(insn->imm)) { /* helper */
++              verbose(env, "verifier bug: helper %s#%d not marked as callback-calling\n",
++                      func_id_name(insn->imm), insn->imm);
++              return -EFAULT;
+       }
+       if (insn->code == (BPF_JMP | BPF_CALL) &&
+@@ -9277,25 +9274,76 @@ static int __check_func_call(struct bpf_
+               /* there is no real recursion here. timer callbacks are async */
+               env->subprog_info[subprog].is_async_cb = true;
+               async_cb = push_async_cb(env, env->subprog_info[subprog].start,
+-                                       *insn_idx, subprog);
++                                       insn_idx, subprog);
+               if (!async_cb)
+                       return -EFAULT;
+               callee = async_cb->frame[0];
+               callee->async_entry_cnt = caller->async_entry_cnt + 1;
+               /* Convert bpf_timer_set_callback() args into timer callback args */
+-              err = set_callee_state_cb(env, caller, callee, *insn_idx);
++              err = set_callee_state_cb(env, caller, callee, insn_idx);
+               if (err)
+                       return err;
++              return 0;
++      }
++
++      /* for callback functions enqueue entry to callback and
++       * proceed with next instruction within current frame.
++       */
++      callback_state = push_stack(env, env->subprog_info[subprog].start, insn_idx, false);
++      if (!callback_state)
++              return -ENOMEM;
++
++      err = setup_func_entry(env, subprog, insn_idx, set_callee_state_cb,
++                             callback_state);
++      if (err)
++              return err;
++
++      callback_state->callback_unroll_depth++;
++      return 0;
++}
++
++static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
++                         int *insn_idx)
++{
++      struct bpf_verifier_state *state = env->cur_state;
++      struct bpf_func_state *caller;
++      int err, subprog, target_insn;
++
++      target_insn = *insn_idx + insn->imm + 1;
++      subprog = find_subprog(env, target_insn);
++      if (subprog < 0) {
++              verbose(env, "verifier bug. No program starts at insn %d\n", target_insn);
++              return -EFAULT;
++      }
++
++      caller = state->frame[state->curframe];
++      err = btf_check_subprog_call(env, subprog, caller->regs);
++      if (err == -EFAULT)
++              return err;
++      if (subprog_is_global(env, subprog)) {
++              if (err) {
++                      verbose(env, "Caller passes invalid args into func#%d\n", subprog);
++                      return err;
++              }
++
++              if (env->log.level & BPF_LOG_LEVEL)
++                      verbose(env, "Func#%d is global and valid. Skipping.\n", subprog);
+               clear_caller_saved_regs(env, caller->regs);
++
++              /* All global functions return a 64-bit SCALAR_VALUE */
+               mark_reg_unknown(env, caller->regs, BPF_REG_0);
+               caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
++
+               /* continue with next insn after call */
+               return 0;
+       }
+-      err = setup_func_entry(env, subprog, *insn_idx, set_callee_state_cb, state);
++      /* for regular function entry setup new frame and continue
++       * from that frame.
++       */
++      err = setup_func_entry(env, subprog, *insn_idx, set_callee_state, state);
+       if (err)
+               return err;
+@@ -9355,22 +9403,6 @@ static int set_callee_state(struct bpf_v
+       return 0;
+ }
+-static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+-                         int *insn_idx)
+-{
+-      int subprog, target_insn;
+-
+-      target_insn = *insn_idx + insn->imm + 1;
+-      subprog = find_subprog(env, target_insn);
+-      if (subprog < 0) {
+-              verbose(env, "verifier bug. No program starts at insn %d\n",
+-                      target_insn);
+-              return -EFAULT;
+-      }
+-
+-      return __check_func_call(env, insn, insn_idx, subprog, set_callee_state);
+-}
+-
+ static int set_map_elem_callback_state(struct bpf_verifier_env *env,
+                                      struct bpf_func_state *caller,
+                                      struct bpf_func_state *callee,
+@@ -9601,6 +9633,11 @@ static int prepare_func_exit(struct bpf_
+                       verbose_invalid_scalar(env, r0, &range, "callback return", "R0");
+                       return -EINVAL;
+               }
++              if (!calls_callback(env, callee->callsite)) {
++                      verbose(env, "BUG: in callback at %d, callsite %d !calls_callback\n",
++                              *insn_idx, callee->callsite);
++                      return -EFAULT;
++              }
+       } else {
+               /* return to the caller whatever r0 had in the callee */
+               caller->regs[BPF_REG_0] = *r0;
+@@ -9618,7 +9655,15 @@ static int prepare_func_exit(struct bpf_
+                       return err;
+       }
+-      *insn_idx = callee->callsite + 1;
++      /* for callbacks like bpf_loop or bpf_for_each_map_elem go back to callsite,
++       * there function call logic would reschedule callback visit. If iteration
++       * converges is_state_visited() would prune that visit eventually.
++       */
++      if (callee->in_callback_fn)
++              *insn_idx = callee->callsite;
++      else
++              *insn_idx = callee->callsite + 1;
++
+       if (env->log.level & BPF_LOG_LEVEL) {
+               verbose(env, "returning from callee:\n");
+               print_verifier_state(env, callee, true);
+@@ -10009,24 +10054,24 @@ static int check_helper_call(struct bpf_
+               }
+               break;
+       case BPF_FUNC_for_each_map_elem:
+-              err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
+-                                      set_map_elem_callback_state);
++              err = push_callback_call(env, insn, insn_idx, meta.subprogno,
++                                       set_map_elem_callback_state);
+               break;
+       case BPF_FUNC_timer_set_callback:
+-              err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
+-                                      set_timer_callback_state);
++              err = push_callback_call(env, insn, insn_idx, meta.subprogno,
++                                       set_timer_callback_state);
+               break;
+       case BPF_FUNC_find_vma:
+-              err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
+-                                      set_find_vma_callback_state);
++              err = push_callback_call(env, insn, insn_idx, meta.subprogno,
++                                       set_find_vma_callback_state);
+               break;
+       case BPF_FUNC_snprintf:
+               err = check_bpf_snprintf_call(env, regs);
+               break;
+       case BPF_FUNC_loop:
+               update_loop_inline_state(env, meta.subprogno);
+-              err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
+-                                      set_loop_callback_state);
++              err = push_callback_call(env, insn, insn_idx, meta.subprogno,
++                                       set_loop_callback_state);
+               break;
+       case BPF_FUNC_dynptr_from_mem:
+               if (regs[BPF_REG_1].type != PTR_TO_MAP_VALUE) {
+@@ -10105,8 +10150,8 @@ static int check_helper_call(struct bpf_
+               break;
+       }
+       case BPF_FUNC_user_ringbuf_drain:
+-              err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
+-                                      set_user_ringbuf_callback_state);
++              err = push_callback_call(env, insn, insn_idx, meta.subprogno,
++                                       set_user_ringbuf_callback_state);
+               break;
+       }
+@@ -10956,7 +11001,7 @@ static bool is_bpf_graph_api_kfunc(u32 b
+              btf_id == special_kfunc_list[KF_bpf_refcount_acquire_impl];
+ }
+-static bool is_callback_calling_kfunc(u32 btf_id)
++static bool is_sync_callback_calling_kfunc(u32 btf_id)
+ {
+       return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl];
+ }
+@@ -11660,6 +11705,21 @@ static int check_kfunc_call(struct bpf_v
+               return -EACCES;
+       }
++      /* Check the arguments */
++      err = check_kfunc_args(env, &meta, insn_idx);
++      if (err < 0)
++              return err;
++
++      if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
++              err = push_callback_call(env, insn, insn_idx, meta.subprogno,
++                                       set_rbtree_add_callback_state);
++              if (err) {
++                      verbose(env, "kfunc %s#%d failed callback verification\n",
++                              func_name, meta.func_id);
++                      return err;
++              }
++      }
++
+       rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta);
+       rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta);
+@@ -11694,10 +11754,6 @@ static int check_kfunc_call(struct bpf_v
+               return -EINVAL;
+       }
+-      /* Check the arguments */
+-      err = check_kfunc_args(env, &meta, insn_idx);
+-      if (err < 0)
+-              return err;
+       /* In case of release function, we get register number of refcounted
+        * PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now.
+        */
+@@ -11731,16 +11787,6 @@ static int check_kfunc_call(struct bpf_v
+               }
+       }
+-      if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
+-              err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
+-                                      set_rbtree_add_callback_state);
+-              if (err) {
+-                      verbose(env, "kfunc %s#%d failed callback verification\n",
+-                              func_name, meta.func_id);
+-                      return err;
+-              }
+-      }
+-
+       for (i = 0; i < CALLER_SAVED_REGS; i++)
+               mark_reg_not_init(env, regs, caller_saved[i]);
+@@ -15047,6 +15093,15 @@ static bool is_force_checkpoint(struct b
+       return env->insn_aux_data[insn_idx].force_checkpoint;
+ }
++static void mark_calls_callback(struct bpf_verifier_env *env, int idx)
++{
++      env->insn_aux_data[idx].calls_callback = true;
++}
++
++static bool calls_callback(struct bpf_verifier_env *env, int insn_idx)
++{
++      return env->insn_aux_data[insn_idx].calls_callback;
++}
+ enum {
+       DONE_EXPLORING = 0,
+@@ -15160,6 +15215,21 @@ static int visit_insn(int t, struct bpf_
+                        * async state will be pushed for further exploration.
+                        */
+                       mark_prune_point(env, t);
++              /* For functions that invoke callbacks it is not known how many times
++               * callback would be called. Verifier models callback calling functions
++               * by repeatedly visiting callback bodies and returning to origin call
++               * instruction.
++               * In order to stop such iteration verifier needs to identify when a
++               * state identical to some state from a previous iteration is reached.
++               * Check below forces creation of checkpoint before callback calling
++               * instruction to allow search for such identical states.
++               */
++              if (is_sync_callback_calling_insn(insn)) {
++                      mark_calls_callback(env, t);
++                      mark_force_checkpoint(env, t);
++                      mark_prune_point(env, t);
++                      mark_jmp_point(env, t);
++              }
+               if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
+                       struct bpf_kfunc_call_arg_meta meta;
+@@ -16553,10 +16623,16 @@ static int is_state_visited(struct bpf_v
+                               }
+                               goto skip_inf_loop_check;
+                       }
++                      if (calls_callback(env, insn_idx)) {
++                              if (states_equal(env, &sl->state, cur, true))
++                                      goto hit;
++                              goto skip_inf_loop_check;
++                      }
+                       /* attempt to detect infinite loop to avoid unnecessary doomed work */
+                       if (states_maybe_looping(&sl->state, cur) &&
+                           states_equal(env, &sl->state, cur, false) &&
+-                          !iter_active_depths_differ(&sl->state, cur)) {
++                          !iter_active_depths_differ(&sl->state, cur) &&
++                          sl->state.callback_unroll_depth == cur->callback_unroll_depth) {
+                               verbose_linfo(env, insn_idx, "; ");
+                               verbose(env, "infinite loop detected at insn %d\n", insn_idx);
+                               verbose(env, "cur state:");
+--- a/tools/testing/selftests/bpf/progs/cb_refs.c
++++ b/tools/testing/selftests/bpf/progs/cb_refs.c
+@@ -33,6 +33,7 @@ int underflow_prog(void *ctx)
+       if (!p)
+               return 0;
+       bpf_for_each_map_elem(&array_map, cb1, &p, 0);
++      bpf_kfunc_call_test_release(p);
+       return 0;
+ }
+--- a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c
++++ b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c
+@@ -119,15 +119,26 @@ __naked int global_subprog_result_precis
+ SEC("?raw_tp")
+ __success __log_level(2)
++/* First simulated path does not include callback body */
+ __msg("14: (0f) r1 += r6")
+-__msg("mark_precise: frame0: last_idx 14 first_idx 10")
++__msg("mark_precise: frame0: last_idx 14 first_idx 9")
+ __msg("mark_precise: frame0: regs=r6 stack= before 13: (bf) r1 = r7")
+ __msg("mark_precise: frame0: regs=r6 stack= before 12: (27) r6 *= 4")
+ __msg("mark_precise: frame0: regs=r6 stack= before 11: (25) if r6 > 0x3 goto pc+4")
+ __msg("mark_precise: frame0: regs=r6 stack= before 10: (bf) r6 = r0")
+-__msg("mark_precise: frame0: parent state regs=r0 stack=:")
+-__msg("mark_precise: frame0: last_idx 18 first_idx 0")
+-__msg("mark_precise: frame0: regs=r0 stack= before 18: (95) exit")
++__msg("mark_precise: frame0: regs=r0 stack= before 9: (85) call bpf_loop")
++/* State entering callback body popped from states stack */
++__msg("from 9 to 17: frame1:")
++__msg("17: frame1: R1=scalar() R2=0 R10=fp0 cb")
++__msg("17: (b7) r0 = 0")
++__msg("18: (95) exit")
++__msg("returning from callee:")
++__msg("to caller at 9:")
++/* r4 (flags) is always precise for bpf_loop() */
++__msg("frame 0: propagating r4")
++__msg("mark_precise: frame0: last_idx 9 first_idx 9 subseq_idx -1")
++__msg("mark_precise: frame0: regs=r4 stack= before 18: (95) exit")
++__msg("from 18 to 9: safe")
+ __naked int callback_result_precise(void)
+ {
+       asm volatile (
+@@ -233,20 +244,36 @@ __naked int parent_callee_saved_reg_prec
+ SEC("?raw_tp")
+ __success __log_level(2)
++/* First simulated path does not include callback body */
+ __msg("12: (0f) r1 += r6")
+-__msg("mark_precise: frame0: last_idx 12 first_idx 10")
++__msg("mark_precise: frame0: last_idx 12 first_idx 9")
+ __msg("mark_precise: frame0: regs=r6 stack= before 11: (bf) r1 = r7")
+ __msg("mark_precise: frame0: regs=r6 stack= before 10: (27) r6 *= 4")
++__msg("mark_precise: frame0: regs=r6 stack= before 9: (85) call bpf_loop")
+ __msg("mark_precise: frame0: parent state regs=r6 stack=:")
+-__msg("mark_precise: frame0: last_idx 16 first_idx 0")
+-__msg("mark_precise: frame0: regs=r6 stack= before 16: (95) exit")
+-__msg("mark_precise: frame1: regs= stack= before 15: (b7) r0 = 0")
+-__msg("mark_precise: frame1: regs= stack= before 9: (85) call bpf_loop#181")
++__msg("mark_precise: frame0: last_idx 8 first_idx 0 subseq_idx 9")
+ __msg("mark_precise: frame0: regs=r6 stack= before 8: (b7) r4 = 0")
+ __msg("mark_precise: frame0: regs=r6 stack= before 7: (b7) r3 = 0")
+ __msg("mark_precise: frame0: regs=r6 stack= before 6: (bf) r2 = r8")
+ __msg("mark_precise: frame0: regs=r6 stack= before 5: (b7) r1 = 1")
+ __msg("mark_precise: frame0: regs=r6 stack= before 4: (b7) r6 = 3")
++/* State entering callback body popped from states stack */
++__msg("from 9 to 15: frame1:")
++__msg("15: frame1: R1=scalar() R2=0 R10=fp0 cb")
++__msg("15: (b7) r0 = 0")
++__msg("16: (95) exit")
++__msg("returning from callee:")
++__msg("to caller at 9:")
++/* r4 (flags) is always precise for bpf_loop(),
++ * r6 was marked before backtracking to callback body.
++ */
++__msg("frame 0: propagating r4,r6")
++__msg("mark_precise: frame0: last_idx 9 first_idx 9 subseq_idx -1")
++__msg("mark_precise: frame0: regs=r4,r6 stack= before 16: (95) exit")
++__msg("mark_precise: frame1: regs= stack= before 15: (b7) r0 = 0")
++__msg("mark_precise: frame1: regs= stack= before 9: (85) call bpf_loop")
++__msg("mark_precise: frame0: parent state regs= stack=:")
++__msg("from 16 to 9: safe")
+ __naked int parent_callee_saved_reg_precise_with_callback(void)
+ {
+       asm volatile (
+@@ -373,22 +400,38 @@ __naked int parent_stack_slot_precise_gl
+ SEC("?raw_tp")
+ __success __log_level(2)
++/* First simulated path does not include callback body */
+ __msg("14: (0f) r1 += r6")
+-__msg("mark_precise: frame0: last_idx 14 first_idx 11")
++__msg("mark_precise: frame0: last_idx 14 first_idx 10")
+ __msg("mark_precise: frame0: regs=r6 stack= before 13: (bf) r1 = r7")
+ __msg("mark_precise: frame0: regs=r6 stack= before 12: (27) r6 *= 4")
+ __msg("mark_precise: frame0: regs=r6 stack= before 11: (79) r6 = *(u64 *)(r10 -8)")
++__msg("mark_precise: frame0: regs= stack=-8 before 10: (85) call bpf_loop")
+ __msg("mark_precise: frame0: parent state regs= stack=-8:")
+-__msg("mark_precise: frame0: last_idx 18 first_idx 0")
+-__msg("mark_precise: frame0: regs= stack=-8 before 18: (95) exit")
+-__msg("mark_precise: frame1: regs= stack= before 17: (b7) r0 = 0")
+-__msg("mark_precise: frame1: regs= stack= before 10: (85) call bpf_loop#181")
++__msg("mark_precise: frame0: last_idx 9 first_idx 0 subseq_idx 10")
+ __msg("mark_precise: frame0: regs= stack=-8 before 9: (b7) r4 = 0")
+ __msg("mark_precise: frame0: regs= stack=-8 before 8: (b7) r3 = 0")
+ __msg("mark_precise: frame0: regs= stack=-8 before 7: (bf) r2 = r8")
+ __msg("mark_precise: frame0: regs= stack=-8 before 6: (bf) r1 = r6")
+ __msg("mark_precise: frame0: regs= stack=-8 before 5: (7b) *(u64 *)(r10 -8) = r6")
+ __msg("mark_precise: frame0: regs=r6 stack= before 4: (b7) r6 = 3")
++/* State entering callback body popped from states stack */
++__msg("from 10 to 17: frame1:")
++__msg("17: frame1: R1=scalar() R2=0 R10=fp0 cb")
++__msg("17: (b7) r0 = 0")
++__msg("18: (95) exit")
++__msg("returning from callee:")
++__msg("to caller at 10:")
++/* r4 (flags) is always precise for bpf_loop(),
++ * fp-8 was marked before backtracking to callback body.
++ */
++__msg("frame 0: propagating r4,fp-8")
++__msg("mark_precise: frame0: last_idx 10 first_idx 10 subseq_idx -1")
++__msg("mark_precise: frame0: regs=r4 stack=-8 before 18: (95) exit")
++__msg("mark_precise: frame1: regs= stack= before 17: (b7) r0 = 0")
++__msg("mark_precise: frame1: regs= stack= before 10: (85) call bpf_loop#181")
++__msg("mark_precise: frame0: parent state regs= stack=:")
++__msg("from 18 to 10: safe")
+ __naked int parent_stack_slot_precise_with_callback(void)
+ {
+       asm volatile (
diff --git a/queue-6.6/bpf-widening-for-callback-iterators.patch b/queue-6.6/bpf-widening-for-callback-iterators.patch
new file mode 100644 (file)
index 0000000..03e7d32
--- /dev/null
@@ -0,0 +1,83 @@
+From cafe2c21508a38cdb3ed22708842e957b2572c3e Mon Sep 17 00:00:00 2001
+From: Eduard Zingerman <eddyz87@gmail.com>
+Date: Tue, 21 Nov 2023 04:06:58 +0200
+Subject: bpf: widening for callback iterators
+
+From: Eduard Zingerman <eddyz87@gmail.com>
+
+commit cafe2c21508a38cdb3ed22708842e957b2572c3e upstream.
+
+Callbacks are similar to open coded iterators, so add imprecise
+widening logic for callback body processing. This makes callback-based
+loops behave identically to open coded iterators, e.g. allowing the
+verification of programs like the one below:
+
+  struct ctx { u32 i; };
+  int cb(u32 idx, struct ctx* ctx)
+  {
+          ++ctx->i;
+          return 0;
+  }
+  ...
+  struct ctx ctx = { .i = 0 };
+  bpf_loop(100, cb, &ctx, 0);
+  ...
+
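+A complementary sketch (mirroring the accompanying selftest): a field
+the callback never writes is not widened and keeps its precise value:
+
+  struct ctx { u32 i; u32 j; };
+  int cb(u32 idx, struct ctx* ctx)
+  {
+          ++ctx->i;          /* written in the loop -> widened */
+          return 0;
+  }
+  ...
+  struct ctx ctx = { .i = 0, .j = 1 };
+  bpf_loop(100, cb, &ctx, 0);
+  return choice_arr[ctx.j];  /* .j stays precise, so this verifies */
+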
+Acked-by: Andrii Nakryiko <andrii@kernel.org>
+Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
+Link: https://lore.kernel.org/r/20231121020701.26440-9-eddyz87@gmail.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/bpf/verifier.c |   24 ++++++++++++++++++++++--
+ 1 file changed, 22 insertions(+), 2 deletions(-)
+
+--- a/kernel/bpf/verifier.c
++++ b/kernel/bpf/verifier.c
+@@ -9595,9 +9595,10 @@ static bool in_rbtree_lock_required_cb(s
+ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
+ {
+-      struct bpf_verifier_state *state = env->cur_state;
++      struct bpf_verifier_state *state = env->cur_state, *prev_st;
+       struct bpf_func_state *caller, *callee;
+       struct bpf_reg_state *r0;
++      bool in_callback_fn;
+       int err;
+       callee = state->frame[state->curframe];
+@@ -9659,7 +9660,8 @@ static int prepare_func_exit(struct bpf_
+        * there function call logic would reschedule callback visit. If iteration
+        * converges is_state_visited() would prune that visit eventually.
+        */
+-      if (callee->in_callback_fn)
++      in_callback_fn = callee->in_callback_fn;
++      if (in_callback_fn)
+               *insn_idx = callee->callsite;
+       else
+               *insn_idx = callee->callsite + 1;
+@@ -9673,6 +9675,24 @@ static int prepare_func_exit(struct bpf_
+       /* clear everything in the callee */
+       free_func_state(callee);
+       state->frame[state->curframe--] = NULL;
++
++      /* for callbacks widen imprecise scalars to make programs like below verify:
++       *
++       *   struct ctx { int i; }
++       *   void cb(int idx, struct ctx *ctx) { ctx->i++; ... }
++       *   ...
++       *   struct ctx = { .i = 0; }
++       *   bpf_loop(100, cb, &ctx, 0);
++       *
++       * This is similar to what is done in process_iter_next_call() for open
++       * coded iterators.
++       */
++      prev_st = in_callback_fn ? find_prev_entry(env, state, *insn_idx) : NULL;
++      if (prev_st) {
++              err = widen_imprecise_scalars(env, prev_st, state);
++              if (err)
++                      return err;
++      }
+       return 0;
+ }
diff --git a/queue-6.6/selftests-bpf-check-if-max-number-of-bpf_loop-iterations-is-tracked.patch b/queue-6.6/selftests-bpf-check-if-max-number-of-bpf_loop-iterations-is-tracked.patch
new file mode 100644 (file)
index 0000000..98ea140
--- /dev/null
@@ -0,0 +1,103 @@
+From 57e2a52deeb12ab84c15c6d0fb93638b5b94001b Mon Sep 17 00:00:00 2001
+From: Eduard Zingerman <eddyz87@gmail.com>
+Date: Tue, 21 Nov 2023 04:07:01 +0200
+Subject: selftests/bpf: check if max number of bpf_loop iterations is tracked
+
+From: Eduard Zingerman <eddyz87@gmail.com>
+
+commit 57e2a52deeb12ab84c15c6d0fb93638b5b94001b upstream.
+
+Check that even if the bpf_loop() callback simulation does not converge
+to a specific state, verification can still proceed via "brute force"
+simulation of the maximal number of callback calls.
+
+Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
+Link: https://lore.kernel.org/r/20231121020701.26440-12-eddyz87@gmail.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c |   75 ++++++++++
+ 1 file changed, 75 insertions(+)
+
+--- a/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c
++++ b/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c
+@@ -164,4 +164,79 @@ int unsafe_find_vma(void *unused)
+       return choice_arr[loop_ctx.i];
+ }
++static int iter_limit_cb(__u32 idx, struct num_context *ctx)
++{
++      ctx->i++;
++      return 0;
++}
++
++SEC("?raw_tp")
++__success
++int bpf_loop_iter_limit_ok(void *unused)
++{
++      struct num_context ctx = { .i = 0 };
++
++      bpf_loop(1, iter_limit_cb, &ctx, 0);
++      return choice_arr[ctx.i];
++}
++
++SEC("?raw_tp")
++__failure __msg("invalid access to map value, value_size=2 off=2 size=1")
++int bpf_loop_iter_limit_overflow(void *unused)
++{
++      struct num_context ctx = { .i = 0 };
++
++      bpf_loop(2, iter_limit_cb, &ctx, 0);
++      return choice_arr[ctx.i];
++}
++
++static int iter_limit_level2a_cb(__u32 idx, struct num_context *ctx)
++{
++      ctx->i += 100;
++      return 0;
++}
++
++static int iter_limit_level2b_cb(__u32 idx, struct num_context *ctx)
++{
++      ctx->i += 10;
++      return 0;
++}
++
++static int iter_limit_level1_cb(__u32 idx, struct num_context *ctx)
++{
++      ctx->i += 1;
++      bpf_loop(1, iter_limit_level2a_cb, ctx, 0);
++      bpf_loop(1, iter_limit_level2b_cb, ctx, 0);
++      return 0;
++}
++
++/* Check that path visiting every callback function once had been
++ * reached by verifier. Variables 'ctx{1,2}i' below serve as flags,
++ * with each decimal digit corresponding to a callback visit marker.
++ */
++SEC("socket")
++__success __retval(111111)
++int bpf_loop_iter_limit_nested(void *unused)
++{
++      struct num_context ctx1 = { .i = 0 };
++      struct num_context ctx2 = { .i = 0 };
++      __u64 a, b, c;
++
++      bpf_loop(1, iter_limit_level1_cb, &ctx1, 0);
++      bpf_loop(1, iter_limit_level1_cb, &ctx2, 0);
++      a = ctx1.i;
++      b = ctx2.i;
++      /* Force 'ctx1.i' and 'ctx2.i' precise. */
++      c = choice_arr[(a + b) % 2];
++      /* This makes 'c' zero, but neither clang nor verifier know it. */
++      c /= 10;
++      /* Make sure that verifier does not visit 'impossible' states:
++       * enumerate all possible callback visit masks.
++       */
++      if (a != 0 && a != 1 && a != 11 && a != 101 && a != 111 &&
++          b != 0 && b != 1 && b != 11 && b != 101 && b != 111)
++              asm volatile ("r0 /= 0;" ::: "r0");
++      return 1000 * a + b + c;
++}
++
+ char _license[] SEC("license") = "GPL";
diff --git a/queue-6.6/selftests-bpf-test-if-state-loops-are-detected-in-a-tricky-case.patch b/queue-6.6/selftests-bpf-test-if-state-loops-are-detected-in-a-tricky-case.patch
new file mode 100644 (file)
index 0000000..e4868c9
--- /dev/null
@@ -0,0 +1,228 @@
+From 64870feebecb7130291a55caf0ce839a87405a70 Mon Sep 17 00:00:00 2001
+From: Eduard Zingerman <eddyz87@gmail.com>
+Date: Tue, 24 Oct 2023 03:09:16 +0300
+Subject: selftests/bpf: test if state loops are detected in a tricky case
+
+From: Eduard Zingerman <eddyz87@gmail.com>
+
+commit 64870feebecb7130291a55caf0ce839a87405a70 upstream.
+
+A convoluted test case for the iterator convergence logic that
+demonstrates that states with a branch count equal to 0 might still be
+part of a not yet completely explored loop.
+
+E.g. consider the following state diagram:
+
+               initial     Here state 'succ' was processed first,
+                 |         it was eventually tracked to produce a
+                 V         state identical to 'hdr'.
+    .---------> hdr        All branches from 'succ' had been explored
+    |            |         and thus 'succ' has its .branches == 0.
+    |            V
+    |    .------...        Suppose states 'cur' and 'succ' correspond
+    |    |       |         to the same instruction + callsites.
+    |    V       V         In such case it is necessary to check
+    |   ...     ...        whether 'succ' and 'cur' are identical.
+    |    |       |         If 'succ' and 'cur' are a part of the same loop
+    |    V       V         they have to be compared exactly.
+    |   succ <- cur
+    |    |
+    |    V
+    |   ...
+    |    |
+    '----'
+
+Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
+Link: https://lore.kernel.org/r/20231024000917.12153-7-eddyz87@gmail.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ tools/testing/selftests/bpf/progs/iters.c |  177 ++++++++++++++++++++++++++++++
+ 1 file changed, 177 insertions(+)
+
+--- a/tools/testing/selftests/bpf/progs/iters.c
++++ b/tools/testing/selftests/bpf/progs/iters.c
+@@ -999,6 +999,183 @@ __naked int loop_state_deps1(void)
+ }
+ SEC("?raw_tp")
++__failure
++__msg("math between fp pointer and register with unbounded")
++__flag(BPF_F_TEST_STATE_FREQ)
++__naked int loop_state_deps2(void)
++{
++      /* This is equivalent to C program below.
++       *
++       * The case turns out to be tricky in a sense that:
++       * - states with read+precise mark on c are explored only on a second
++       *   iteration of the first inner loop and in a state which is pushed to
++       *   states stack first.
++       * - states with c=-25 are explored only on a second iteration of the
++       *   second inner loop and in a state which is pushed to states stack
++       *   first.
++       *
++       * Depending on the details of iterator convergence logic
++       * verifier might stop states traversal too early and miss
++       * unsafe c=-25 memory access.
++       *
++       *   j = iter_new();             // fp[-16]
++       *   a = 0;                      // r6
++       *   b = 0;                      // r7
++       *   c = -24;                    // r8
++       *   while (iter_next(j)) {
++       *     i = iter_new();           // fp[-8]
++       *     a = 0;                    // r6
++       *     b = 0;                    // r7
++       *     while (iter_next(i)) {
++       *       if (a == 1) {
++       *         a = 0;
++       *         b = 1;
++       *       } else if (a == 0) {
++       *         a = 1;
++       *         if (random() == 42)
++       *           continue;
++       *         if (b == 1) {
++       *           *(r10 + c) = 7;     // this is not safe
++       *           iter_destroy(i);
++       *           iter_destroy(j);
++       *           return;
++       *         }
++       *       }
++       *     }
++       *     iter_destroy(i);
++       *     i = iter_new();           // fp[-8]
++       *     a = 0;                    // r6
++       *     b = 0;                    // r7
++       *     while (iter_next(i)) {
++       *       if (a == 1) {
++       *         a = 0;
++       *         b = 1;
++       *       } else if (a == 0) {
++       *         a = 1;
++       *         if (random() == 42)
++       *           continue;
++       *         if (b == 1) {
++       *           a = 0;
++       *           c = -25;
++       *         }
++       *       }
++       *     }
++       *     iter_destroy(i);
++       *   }
++       *   iter_destroy(j);
++       *   return;
++       */
++      asm volatile (
++              "r1 = r10;"
++              "r1 += -16;"
++              "r2 = 0;"
++              "r3 = 10;"
++              "call %[bpf_iter_num_new];"
++              "r6 = 0;"
++              "r7 = 0;"
++              "r8 = -24;"
++      "j_loop_%=:"
++              "r1 = r10;"
++              "r1 += -16;"
++              "call %[bpf_iter_num_next];"
++              "if r0 == 0 goto j_loop_end_%=;"
++
++              /* first inner loop */
++              "r1 = r10;"
++              "r1 += -8;"
++              "r2 = 0;"
++              "r3 = 10;"
++              "call %[bpf_iter_num_new];"
++              "r6 = 0;"
++              "r7 = 0;"
++      "i_loop_%=:"
++              "r1 = r10;"
++              "r1 += -8;"
++              "call %[bpf_iter_num_next];"
++              "if r0 == 0 goto i_loop_end_%=;"
++      "check_one_r6_%=:"
++              "if r6 != 1 goto check_zero_r6_%=;"
++              "r6 = 0;"
++              "r7 = 1;"
++              "goto i_loop_%=;"
++      "check_zero_r6_%=:"
++              "if r6 != 0 goto i_loop_%=;"
++              "r6 = 1;"
++              "call %[bpf_get_prandom_u32];"
++              "if r0 != 42 goto check_one_r7_%=;"
++              "goto i_loop_%=;"
++      "check_one_r7_%=:"
++              "if r7 != 1 goto i_loop_%=;"
++              "r0 = r10;"
++              "r0 += r8;"
++              "r1 = 7;"
++              "*(u64 *)(r0 + 0) = r1;"
++              "r1 = r10;"
++              "r1 += -8;"
++              "call %[bpf_iter_num_destroy];"
++              "r1 = r10;"
++              "r1 += -16;"
++              "call %[bpf_iter_num_destroy];"
++              "r0 = 0;"
++              "exit;"
++      "i_loop_end_%=:"
++              "r1 = r10;"
++              "r1 += -8;"
++              "call %[bpf_iter_num_destroy];"
++
++              /* second inner loop */
++              "r1 = r10;"
++              "r1 += -8;"
++              "r2 = 0;"
++              "r3 = 10;"
++              "call %[bpf_iter_num_new];"
++              "r6 = 0;"
++              "r7 = 0;"
++      "i2_loop_%=:"
++              "r1 = r10;"
++              "r1 += -8;"
++              "call %[bpf_iter_num_next];"
++              "if r0 == 0 goto i2_loop_end_%=;"
++      "check2_one_r6_%=:"
++              "if r6 != 1 goto check2_zero_r6_%=;"
++              "r6 = 0;"
++              "r7 = 1;"
++              "goto i2_loop_%=;"
++      "check2_zero_r6_%=:"
++              "if r6 != 0 goto i2_loop_%=;"
++              "r6 = 1;"
++              "call %[bpf_get_prandom_u32];"
++              "if r0 != 42 goto check2_one_r7_%=;"
++              "goto i2_loop_%=;"
++      "check2_one_r7_%=:"
++              "if r7 != 1 goto i2_loop_%=;"
++              "r6 = 0;"
++              "r8 = -25;"
++              "goto i2_loop_%=;"
++      "i2_loop_end_%=:"
++              "r1 = r10;"
++              "r1 += -8;"
++              "call %[bpf_iter_num_destroy];"
++
++              "r6 = 0;"
++              "r7 = 0;"
++              "goto j_loop_%=;"
++      "j_loop_end_%=:"
++              "r1 = r10;"
++              "r1 += -16;"
++              "call %[bpf_iter_num_destroy];"
++              "r0 = 0;"
++              "exit;"
++              :
++              : __imm(bpf_get_prandom_u32),
++                __imm(bpf_iter_num_new),
++                __imm(bpf_iter_num_next),
++                __imm(bpf_iter_num_destroy)
++              : __clobber_all
++      );
++}
++
++SEC("?raw_tp")
+ __success
+ __naked int triple_continue(void)
+ {
diff --git a/queue-6.6/selftests-bpf-test-widening-for-iterating-callbacks.patch b/queue-6.6/selftests-bpf-test-widening-for-iterating-callbacks.patch
new file mode 100644 (file)
index 0000000..cbd8b5e
--- /dev/null
@@ -0,0 +1,56 @@
+From 9f3330aa644d6d979eb064c46e85c62d4b4eac75 Mon Sep 17 00:00:00 2001
+From: Eduard Zingerman <eddyz87@gmail.com>
+Date: Tue, 21 Nov 2023 04:06:59 +0200
+Subject: selftests/bpf: test widening for iterating callbacks
+
+From: Eduard Zingerman <eddyz87@gmail.com>
+
+commit 9f3330aa644d6d979eb064c46e85c62d4b4eac75 upstream.
+
+A test case to verify that imprecise scalar widening is applied to the
+callback entering state when the callback call is simulated repeatedly.
+
+Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
+Link: https://lore.kernel.org/r/20231121020701.26440-10-eddyz87@gmail.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c |   20 ++++++++++
+ 1 file changed, 20 insertions(+)
+
+--- a/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c
++++ b/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c
+@@ -25,6 +25,7 @@ struct buf_context {
+ struct num_context {
+       __u64 i;
++      __u64 j;
+ };
+ __u8 choice_arr[2] = { 0, 1 };
+@@ -69,6 +70,25 @@ int unsafe_on_zero_iter(void *unused)
+       return choice_arr[loop_ctx.i];
+ }
++static int widening_cb(__u32 idx, struct num_context *ctx)
++{
++      ++ctx->i;
++      return 0;
++}
++
++SEC("?raw_tp")
++__success
++int widening(void *unused)
++{
++      struct num_context loop_ctx = { .i = 0, .j = 1 };
++
++      bpf_loop(100, widening_cb, &loop_ctx, 0);
++      /* loop_ctx.j is not changed during callback iterations, so the
++       * verifier should not apply widening to it.
++       */
++      return choice_arr[loop_ctx.j];
++}
++
+ static int loop_detection_cb(__u32 idx, struct num_context *ctx)
+ {
+       for (;;) {}
diff --git a/queue-6.6/selftests-bpf-tests-for-iterating-callbacks.patch b/queue-6.6/selftests-bpf-tests-for-iterating-callbacks.patch
new file mode 100644 (file)
index 0000000..18dfeed
--- /dev/null
@@ -0,0 +1,197 @@
+From 958465e217dbf5fc6677d42d8827fb3073d86afd Mon Sep 17 00:00:00 2001
+From: Eduard Zingerman <eddyz87@gmail.com>
+Date: Tue, 21 Nov 2023 04:06:57 +0200
+Subject: selftests/bpf: tests for iterating callbacks
+
+From: Eduard Zingerman <eddyz87@gmail.com>
+
+commit 958465e217dbf5fc6677d42d8827fb3073d86afd upstream.
+
+A set of test cases to check the behavior of the callback handling
+logic, verifying that the verifier catches the following situations:
+- program not safe on second callback iteration;
+- program not safe on zero callback iterations;
+- infinite loop inside a callback.
+
+Verify that callback logic works for bpf_loop, bpf_for_each_map_elem,
+bpf_user_ringbuf_drain, bpf_find_vma.
+
+Acked-by: Andrii Nakryiko <andrii@kernel.org>
+Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
+Link: https://lore.kernel.org/r/20231121020701.26440-8-eddyz87@gmail.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ tools/testing/selftests/bpf/prog_tests/verifier.c                |    2 
+ tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c |  147 ++++++++++
+ 2 files changed, 149 insertions(+)
+ create mode 100644 tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c
+
+--- a/tools/testing/selftests/bpf/prog_tests/verifier.c
++++ b/tools/testing/selftests/bpf/prog_tests/verifier.c
+@@ -31,6 +31,7 @@
+ #include "verifier_helper_restricted.skel.h"
+ #include "verifier_helper_value_access.skel.h"
+ #include "verifier_int_ptr.skel.h"
++#include "verifier_iterating_callbacks.skel.h"
+ #include "verifier_jeq_infer_not_null.skel.h"
+ #include "verifier_ld_ind.skel.h"
+ #include "verifier_ldsx.skel.h"
+@@ -138,6 +139,7 @@ void test_verifier_helper_packet_access(
+ void test_verifier_helper_restricted(void)    { RUN(verifier_helper_restricted); }
+ void test_verifier_helper_value_access(void)  { RUN(verifier_helper_value_access); }
+ void test_verifier_int_ptr(void)              { RUN(verifier_int_ptr); }
++void test_verifier_iterating_callbacks(void)  { RUN(verifier_iterating_callbacks); }
+ void test_verifier_jeq_infer_not_null(void)   { RUN(verifier_jeq_infer_not_null); }
+ void test_verifier_ld_ind(void)               { RUN(verifier_ld_ind); }
+ void test_verifier_ldsx(void)                  { RUN(verifier_ldsx); }
+--- /dev/null
++++ b/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c
+@@ -0,0 +1,147 @@
++// SPDX-License-Identifier: GPL-2.0
++
++#include <linux/bpf.h>
++#include <bpf/bpf_helpers.h>
++#include "bpf_misc.h"
++
++struct {
++      __uint(type, BPF_MAP_TYPE_ARRAY);
++      __uint(max_entries, 8);
++      __type(key, __u32);
++      __type(value, __u64);
++} map SEC(".maps");
++
++struct {
++      __uint(type, BPF_MAP_TYPE_USER_RINGBUF);
++      __uint(max_entries, 8);
++} ringbuf SEC(".maps");
++
++struct vm_area_struct;
++struct bpf_map;
++
++struct buf_context {
++      char *buf;
++};
++
++struct num_context {
++      __u64 i;
++};
++
++__u8 choice_arr[2] = { 0, 1 };
++
++static int unsafe_on_2nd_iter_cb(__u32 idx, struct buf_context *ctx)
++{
++      if (idx == 0) {
++              ctx->buf = (char *)(0xDEAD);
++              return 0;
++      }
++
++      if (bpf_probe_read_user(ctx->buf, 8, (void *)(0xBADC0FFEE)))
++              return 1;
++
++      return 0;
++}
++
++SEC("?raw_tp")
++__failure __msg("R1 type=scalar expected=fp")
++int unsafe_on_2nd_iter(void *unused)
++{
++      char buf[4];
++      struct buf_context loop_ctx = { .buf = buf };
++
++      bpf_loop(100, unsafe_on_2nd_iter_cb, &loop_ctx, 0);
++      return 0;
++}
++
++static int unsafe_on_zero_iter_cb(__u32 idx, struct num_context *ctx)
++{
++      ctx->i = 0;
++      return 0;
++}
++
++SEC("?raw_tp")
++__failure __msg("invalid access to map value, value_size=2 off=32 size=1")
++int unsafe_on_zero_iter(void *unused)
++{
++      struct num_context loop_ctx = { .i = 32 };
++
++      bpf_loop(100, unsafe_on_zero_iter_cb, &loop_ctx, 0);
++      return choice_arr[loop_ctx.i];
++}
++
++static int loop_detection_cb(__u32 idx, struct num_context *ctx)
++{
++      for (;;) {}
++      return 0;
++}
++
++SEC("?raw_tp")
++__failure __msg("infinite loop detected")
++int loop_detection(void *unused)
++{
++      struct num_context loop_ctx = { .i = 0 };
++
++      bpf_loop(100, loop_detection_cb, &loop_ctx, 0);
++      return 0;
++}
++
++static __always_inline __u64 oob_state_machine(struct num_context *ctx)
++{
++      switch (ctx->i) {
++      case 0:
++              ctx->i = 1;
++              break;
++      case 1:
++              ctx->i = 32;
++              break;
++      }
++      return 0;
++}
++
++static __u64 for_each_map_elem_cb(struct bpf_map *map, __u32 *key, __u64 *val, void *data)
++{
++      return oob_state_machine(data);
++}
++
++SEC("?raw_tp")
++__failure __msg("invalid access to map value, value_size=2 off=32 size=1")
++int unsafe_for_each_map_elem(void *unused)
++{
++      struct num_context loop_ctx = { .i = 0 };
++
++      bpf_for_each_map_elem(&map, for_each_map_elem_cb, &loop_ctx, 0);
++      return choice_arr[loop_ctx.i];
++}
++
++static __u64 ringbuf_drain_cb(struct bpf_dynptr *dynptr, void *data)
++{
++      return oob_state_machine(data);
++}
++
++SEC("?raw_tp")
++__failure __msg("invalid access to map value, value_size=2 off=32 size=1")
++int unsafe_ringbuf_drain(void *unused)
++{
++      struct num_context loop_ctx = { .i = 0 };
++
++      bpf_user_ringbuf_drain(&ringbuf, ringbuf_drain_cb, &loop_ctx, 0);
++      return choice_arr[loop_ctx.i];
++}
++
++static __u64 find_vma_cb(struct task_struct *task, struct vm_area_struct *vma, void *data)
++{
++      return oob_state_machine(data);
++}
++
++SEC("?raw_tp")
++__failure __msg("invalid access to map value, value_size=2 off=32 size=1")
++int unsafe_find_vma(void *unused)
++{
++      struct task_struct *task = bpf_get_current_task_btf();
++      struct num_context loop_ctx = { .i = 0 };
++
++      bpf_find_vma(task, 0, find_vma_cb, &loop_ctx, 0);
++      return choice_arr[loop_ctx.i];
++}
++
++char _license[] SEC("license") = "GPL";
diff --git a/queue-6.6/selftests-bpf-tests-with-delayed-read-precision-makrs-in-loop-body.patch b/queue-6.6/selftests-bpf-tests-with-delayed-read-precision-makrs-in-loop-body.patch
new file mode 100644 (file)
index 0000000..88e5d03
--- /dev/null
@@ -0,0 +1,556 @@
+From 389ede06c2974b2f878a7ebff6b0f4f707f9db74 Mon Sep 17 00:00:00 2001
+From: Eduard Zingerman <eddyz87@gmail.com>
+Date: Tue, 24 Oct 2023 03:09:14 +0300
+Subject: selftests/bpf: tests with delayed read/precision marks in loop body
+
+From: Eduard Zingerman <eddyz87@gmail.com>
+
+commit 389ede06c2974b2f878a7ebff6b0f4f707f9db74 upstream.
+
+These test cases try to hide read and precision marks from loop
+convergence logic: marks would only be assigned on subsequent loop
+iterations or after exploring states pushed to env->head stack first.
+Without the verifier fix to use exact states comparison logic for
+iterator convergence, these tests (except 'triple_continue') would be
+erroneously marked as safe.
+
+Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
+Link: https://lore.kernel.org/r/20231024000917.12153-5-eddyz87@gmail.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ tools/testing/selftests/bpf/progs/iters.c |  518 ++++++++++++++++++++++++++++++
+ 1 file changed, 518 insertions(+)
+
+--- a/tools/testing/selftests/bpf/progs/iters.c
++++ b/tools/testing/selftests/bpf/progs/iters.c
+@@ -14,6 +14,13 @@ int my_pid;
+ int arr[256];
+ int small_arr[16] SEC(".data.small_arr");
++struct {
++      __uint(type, BPF_MAP_TYPE_HASH);
++      __uint(max_entries, 10);
++      __type(key, int);
++      __type(value, int);
++} amap SEC(".maps");
++
+ #ifdef REAL_TEST
+ #define MY_PID_GUARD() if (my_pid != (bpf_get_current_pid_tgid() >> 32)) return 0
+ #else
+@@ -716,4 +723,515 @@ int iter_pass_iter_ptr_to_subprog(const
+       return 0;
+ }
++SEC("?raw_tp")
++__failure
++__msg("R1 type=scalar expected=fp")
++__naked int delayed_read_mark(void)
++{
++      /* This is equivalent to the C program below.
++       * The call to bpf_iter_num_next() is reachable with r7 values &fp[-16] and 0xdead.
++       * State with r7=&fp[-16] is visited first and follows the "r6 != 42 ... continue" branch.
++       * At this point the iterator next() call is reached with an r7 that has no read mark.
++       * The loop body with r7=0xdead would only be visited if the verifier decided to continue
++       * with a second loop iteration. The absence of a read mark on r7 might affect the state
++       * equivalence logic used for iterator convergence tracking.
++       *
++       * r7 = &fp[-16]
++       * fp[-16] = 0
++       * r6 = bpf_get_prandom_u32()
++       * bpf_iter_num_new(&fp[-8], 0, 10)
++       * while (bpf_iter_num_next(&fp[-8])) {
++       *   r6++
++       *   if (r6 != 42) {
++       *     r7 = 0xdead
++       *     continue;
++       *   }
++       *   bpf_probe_read_user(r7, 8, 0xdeadbeef); // this is not safe
++       * }
++       * bpf_iter_num_destroy(&fp[-8])
++       * return 0
++       */
++      asm volatile (
++              "r7 = r10;"
++              "r7 += -16;"
++              "r0 = 0;"
++              "*(u64 *)(r7 + 0) = r0;"
++              "call %[bpf_get_prandom_u32];"
++              "r6 = r0;"
++              "r1 = r10;"
++              "r1 += -8;"
++              "r2 = 0;"
++              "r3 = 10;"
++              "call %[bpf_iter_num_new];"
++      "1:"
++              "r1 = r10;"
++              "r1 += -8;"
++              "call %[bpf_iter_num_next];"
++              "if r0 == 0 goto 2f;"
++              "r6 += 1;"
++              "if r6 != 42 goto 3f;"
++              "r7 = 0xdead;"
++              "goto 1b;"
++      "3:"
++              "r1 = r7;"
++              "r2 = 8;"
++              "r3 = 0xdeadbeef;"
++              "call %[bpf_probe_read_user];"
++              "goto 1b;"
++      "2:"
++              "r1 = r10;"
++              "r1 += -8;"
++              "call %[bpf_iter_num_destroy];"
++              "r0 = 0;"
++              "exit;"
++              :
++              : __imm(bpf_get_prandom_u32),
++                __imm(bpf_iter_num_new),
++                __imm(bpf_iter_num_next),
++                __imm(bpf_iter_num_destroy),
++                __imm(bpf_probe_read_user)
++              : __clobber_all
++      );
++}
++
++SEC("?raw_tp")
++__failure
++__msg("math between fp pointer and register with unbounded")
++__naked int delayed_precision_mark(void)
++{
++      /* This is equivalent to the C program below.
++       * The test is similar to delayed_read_mark but verifies that incomplete
++       * precision marks don't fool the verifier.
++       * The call to bpf_iter_num_next() is reachable with r7 values -16 and -32.
++       * State with r7=-16 is visited first and follows the "r6 != 42 ... continue" branch.
++       * At this point the iterator next() call is reached with an r7 that has no read
++       * or precision mark.
++       * The loop body with r7=-32 would only be visited if the verifier decided to continue
++       * with a second loop iteration. The absence of a precision mark on r7 might affect the
++       * state equivalence logic used for iterator convergence tracking.
++       *
++       * r8 = 0
++       * fp[-16] = 0
++       * r7 = -16
++       * r6 = bpf_get_prandom_u32()
++       * bpf_iter_num_new(&fp[-8], 0, 10)
++       * while (bpf_iter_num_next(&fp[-8])) {
++       *   if (r6 != 42) {
++       *     r7 = -32
++       *     r6 = bpf_get_prandom_u32()
++       *     continue;
++       *   }
++       *   r0 = r10
++       *   r0 += r7
++       *   r8 = *(u64 *)(r0 + 0)           // this is not safe
++       *   r6 = bpf_get_prandom_u32()
++       * }
++       * bpf_iter_num_destroy(&fp[-8])
++       * return r8
++       */
++      asm volatile (
++              "r8 = 0;"
++              "*(u64 *)(r10 - 16) = r8;"
++              "r7 = -16;"
++              "call %[bpf_get_prandom_u32];"
++              "r6 = r0;"
++              "r1 = r10;"
++              "r1 += -8;"
++              "r2 = 0;"
++              "r3 = 10;"
++              "call %[bpf_iter_num_new];"
++      "1:"
++              "r1 = r10;"
++              "r1 += -8;\n"
++              "call %[bpf_iter_num_next];"
++              "if r0 == 0 goto 2f;"
++              "if r6 != 42 goto 3f;"
++              "r7 = -32;"
++              "call %[bpf_get_prandom_u32];"
++              "r6 = r0;"
++              "goto 1b;\n"
++      "3:"
++              "r0 = r10;"
++              "r0 += r7;"
++              "r8 = *(u64 *)(r0 + 0);"
++              "call %[bpf_get_prandom_u32];"
++              "r6 = r0;"
++              "goto 1b;\n"
++      "2:"
++              "r1 = r10;"
++              "r1 += -8;"
++              "call %[bpf_iter_num_destroy];"
++              "r0 = r8;"
++              "exit;"
++              :
++              : __imm(bpf_get_prandom_u32),
++                __imm(bpf_iter_num_new),
++                __imm(bpf_iter_num_next),
++                __imm(bpf_iter_num_destroy),
++                __imm(bpf_probe_read_user)
++              : __clobber_all
++      );
++}
++
++SEC("?raw_tp")
++__failure
++__msg("math between fp pointer and register with unbounded")
++__flag(BPF_F_TEST_STATE_FREQ)
++__naked int loop_state_deps1(void)
++{
++      /* This is equivalent to the C program below.
++       *
++       * The case turns out to be tricky in the sense that:
++       * - states with c=-25 are explored only on a second iteration
++       *   of the outer loop;
++       * - states with read+precise mark on c are explored only on
++       *   second iteration of the inner loop and in a state which
++       *   is pushed to states stack first.
++       *
++       * Depending on the details of the iterator convergence logic the
++       * verifier might stop states traversal too early and miss the
++       * unsafe c=-25 memory access.
++       *
++       *   j = iter_new();             // fp[-16]
++       *   a = 0;                      // r6
++       *   b = 0;                      // r7
++       *   c = -24;                    // r8
++       *   while (iter_next(j)) {
++       *     i = iter_new();           // fp[-8]
++       *     a = 0;                    // r6
++       *     b = 0;                    // r7
++       *     while (iter_next(i)) {
++       *       if (a == 1) {
++       *         a = 0;
++       *         b = 1;
++       *       } else if (a == 0) {
++       *         a = 1;
++       *         if (random() == 42)
++       *           continue;
++       *         if (b == 1) {
++       *           *(r10 + c) = 7;  // this is not safe
++       *           iter_destroy(i);
++       *           iter_destroy(j);
++       *           return;
++       *         }
++       *       }
++       *     }
++       *     iter_destroy(i);
++       *     a = 0;
++       *     b = 0;
++       *     c = -25;
++       *   }
++       *   iter_destroy(j);
++       *   return;
++       */
++      asm volatile (
++              "r1 = r10;"
++              "r1 += -16;"
++              "r2 = 0;"
++              "r3 = 10;"
++              "call %[bpf_iter_num_new];"
++              "r6 = 0;"
++              "r7 = 0;"
++              "r8 = -24;"
++      "j_loop_%=:"
++              "r1 = r10;"
++              "r1 += -16;"
++              "call %[bpf_iter_num_next];"
++              "if r0 == 0 goto j_loop_end_%=;"
++              "r1 = r10;"
++              "r1 += -8;"
++              "r2 = 0;"
++              "r3 = 10;"
++              "call %[bpf_iter_num_new];"
++              "r6 = 0;"
++              "r7 = 0;"
++      "i_loop_%=:"
++              "r1 = r10;"
++              "r1 += -8;"
++              "call %[bpf_iter_num_next];"
++              "if r0 == 0 goto i_loop_end_%=;"
++      "check_one_r6_%=:"
++              "if r6 != 1 goto check_zero_r6_%=;"
++              "r6 = 0;"
++              "r7 = 1;"
++              "goto i_loop_%=;"
++      "check_zero_r6_%=:"
++              "if r6 != 0 goto i_loop_%=;"
++              "r6 = 1;"
++              "call %[bpf_get_prandom_u32];"
++              "if r0 != 42 goto check_one_r7_%=;"
++              "goto i_loop_%=;"
++      "check_one_r7_%=:"
++              "if r7 != 1 goto i_loop_%=;"
++              "r0 = r10;"
++              "r0 += r8;"
++              "r1 = 7;"
++              "*(u64 *)(r0 + 0) = r1;"
++              "r1 = r10;"
++              "r1 += -8;"
++              "call %[bpf_iter_num_destroy];"
++              "r1 = r10;"
++              "r1 += -16;"
++              "call %[bpf_iter_num_destroy];"
++              "r0 = 0;"
++              "exit;"
++      "i_loop_end_%=:"
++              "r1 = r10;"
++              "r1 += -8;"
++              "call %[bpf_iter_num_destroy];"
++              "r6 = 0;"
++              "r7 = 0;"
++              "r8 = -25;"
++              "goto j_loop_%=;"
++      "j_loop_end_%=:"
++              "r1 = r10;"
++              "r1 += -16;"
++              "call %[bpf_iter_num_destroy];"
++              "r0 = 0;"
++              "exit;"
++              :
++              : __imm(bpf_get_prandom_u32),
++                __imm(bpf_iter_num_new),
++                __imm(bpf_iter_num_next),
++                __imm(bpf_iter_num_destroy)
++              : __clobber_all
++      );
++}
++
++SEC("?raw_tp")
++__success
++__naked int triple_continue(void)
++{
++      /* This is equivalent to the C program below.
++       * A high branching factor of the loop body turned out to be
++       * problematic for one of the iterator convergence tracking
++       * algorithms explored.
++       *
++       * r6 = bpf_get_prandom_u32()
++       * bpf_iter_num_new(&fp[-8], 0, 10)
++       * while (bpf_iter_num_next(&fp[-8])) {
++       *   if (bpf_get_prandom_u32() != 42)
++       *     continue;
++       *   if (bpf_get_prandom_u32() != 42)
++       *     continue;
++       *   if (bpf_get_prandom_u32() != 42)
++       *     continue;
++       *   r0 += 0;
++       * }
++       * bpf_iter_num_destroy(&fp[-8])
++       * return 0
++       */
++      asm volatile (
++              "r1 = r10;"
++              "r1 += -8;"
++              "r2 = 0;"
++              "r3 = 10;"
++              "call %[bpf_iter_num_new];"
++      "loop_%=:"
++              "r1 = r10;"
++              "r1 += -8;"
++              "call %[bpf_iter_num_next];"
++              "if r0 == 0 goto loop_end_%=;"
++              "call %[bpf_get_prandom_u32];"
++              "if r0 != 42 goto loop_%=;"
++              "call %[bpf_get_prandom_u32];"
++              "if r0 != 42 goto loop_%=;"
++              "call %[bpf_get_prandom_u32];"
++              "if r0 != 42 goto loop_%=;"
++              "r0 += 0;"
++              "goto loop_%=;"
++      "loop_end_%=:"
++              "r1 = r10;"
++              "r1 += -8;"
++              "call %[bpf_iter_num_destroy];"
++              "r0 = 0;"
++              "exit;"
++              :
++              : __imm(bpf_get_prandom_u32),
++                __imm(bpf_iter_num_new),
++                __imm(bpf_iter_num_next),
++                __imm(bpf_iter_num_destroy)
++              : __clobber_all
++      );
++}
++
++SEC("?raw_tp")
++__success
++__naked int widen_spill(void)
++{
++      /* This is equivalent to the C program below.
++       * The counter is stored in fp[-16]; if this counter is not widened,
++       * verifier states representing loop iterations would never converge.
++       *
++       * fp[-16] = 0
++       * bpf_iter_num_new(&fp[-8], 0, 10)
++       * while (bpf_iter_num_next(&fp[-8])) {
++       *   r0 = fp[-16];
++       *   r0 += 1;
++       *   fp[-16] = r0;
++       * }
++       * bpf_iter_num_destroy(&fp[-8])
++       * return 0
++       */
++      asm volatile (
++              "r0 = 0;"
++              "*(u64 *)(r10 - 16) = r0;"
++              "r1 = r10;"
++              "r1 += -8;"
++              "r2 = 0;"
++              "r3 = 10;"
++              "call %[bpf_iter_num_new];"
++      "loop_%=:"
++              "r1 = r10;"
++              "r1 += -8;"
++              "call %[bpf_iter_num_next];"
++              "if r0 == 0 goto loop_end_%=;"
++              "r0 = *(u64 *)(r10 - 16);"
++              "r0 += 1;"
++              "*(u64 *)(r10 - 16) = r0;"
++              "goto loop_%=;"
++      "loop_end_%=:"
++              "r1 = r10;"
++              "r1 += -8;"
++              "call %[bpf_iter_num_destroy];"
++              "r0 = 0;"
++              "exit;"
++              :
++              : __imm(bpf_iter_num_new),
++                __imm(bpf_iter_num_next),
++                __imm(bpf_iter_num_destroy)
++              : __clobber_all
++      );
++}
++
++SEC("raw_tp")
++__success
++__naked int checkpoint_states_deletion(void)
++{
++      /* This is equivalent to the C program below.
++       *
++       *   int *a, *b, *c, *d, *e, *f;
++       *   int i, sum = 0;
++       *   bpf_for(i, 0, 10) {
++       *     a = bpf_map_lookup_elem(&amap, &i);
++       *     b = bpf_map_lookup_elem(&amap, &i);
++       *     c = bpf_map_lookup_elem(&amap, &i);
++       *     d = bpf_map_lookup_elem(&amap, &i);
++       *     e = bpf_map_lookup_elem(&amap, &i);
++       *     f = bpf_map_lookup_elem(&amap, &i);
++       *     if (a) sum += 1;
++       *     if (b) sum += 1;
++       *     if (c) sum += 1;
++       *     if (d) sum += 1;
++       *     if (e) sum += 1;
++       *     if (f) sum += 1;
++       *   }
++       *   return 0;
++       *
++       * The body of the loop spawns multiple simulation paths
++       * with different combinations of NULL/non-NULL information for a/b/c/d/e/f.
++       * Each combination is unique from the states_equal() point of view.
++       * An explored-states checkpoint is created after each iterator next call.
++       * Iterator convergence logic expects that eventually the current state
++       * would become equal to one of the explored states and thus loop
++       * exploration would be finished (at least for a specific path).
++       * The verifier evicts explored states with a high miss-to-hit ratio
++       * to avoid comparing the current state with too many explored
++       * states per instruction.
++       * This test is designed to "stress test" the eviction policy defined by the formula:
++       *
++       *    sl->miss_cnt > sl->hit_cnt * N + N // if true sl->state is evicted
++       *
++       * Currently N is set to 64, which allows for 6 variables in this test.
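++       * (Six lookup results give 2^6 = 64 distinct NULL/non-NULL
++       * combinations, which is presumably why 6 variables line up
++       * with N = 64 here.)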
++       */
++      asm volatile (
++              "r6 = 0;"                  /* a */
++              "r7 = 0;"                  /* b */
++              "r8 = 0;"                  /* c */
++              "*(u64 *)(r10 - 24) = r6;" /* d */
++              "*(u64 *)(r10 - 32) = r6;" /* e */
++              "*(u64 *)(r10 - 40) = r6;" /* f */
++              "r9 = 0;"                  /* sum */
++              "r1 = r10;"
++              "r1 += -8;"
++              "r2 = 0;"
++              "r3 = 10;"
++              "call %[bpf_iter_num_new];"
++      "loop_%=:"
++              "r1 = r10;"
++              "r1 += -8;"
++              "call %[bpf_iter_num_next];"
++              "if r0 == 0 goto loop_end_%=;"
++
++              "*(u64 *)(r10 - 16) = r0;"
++
++              "r1 = %[amap] ll;"
++              "r2 = r10;"
++              "r2 += -16;"
++              "call %[bpf_map_lookup_elem];"
++              "r6 = r0;"
++
++              "r1 = %[amap] ll;"
++              "r2 = r10;"
++              "r2 += -16;"
++              "call %[bpf_map_lookup_elem];"
++              "r7 = r0;"
++
++              "r1 = %[amap] ll;"
++              "r2 = r10;"
++              "r2 += -16;"
++              "call %[bpf_map_lookup_elem];"
++              "r8 = r0;"
++
++              "r1 = %[amap] ll;"
++              "r2 = r10;"
++              "r2 += -16;"
++              "call %[bpf_map_lookup_elem];"
++              "*(u64 *)(r10 - 24) = r0;"
++
++              "r1 = %[amap] ll;"
++              "r2 = r10;"
++              "r2 += -16;"
++              "call %[bpf_map_lookup_elem];"
++              "*(u64 *)(r10 - 32) = r0;"
++
++              "r1 = %[amap] ll;"
++              "r2 = r10;"
++              "r2 += -16;"
++              "call %[bpf_map_lookup_elem];"
++              "*(u64 *)(r10 - 40) = r0;"
++
++              "if r6 == 0 goto +1;"
++              "r9 += 1;"
++              "if r7 == 0 goto +1;"
++              "r9 += 1;"
++              "if r8 == 0 goto +1;"
++              "r9 += 1;"
++              "r0 = *(u64 *)(r10 - 24);"
++              "if r0 == 0 goto +1;"
++              "r9 += 1;"
++              "r0 = *(u64 *)(r10 - 32);"
++              "if r0 == 0 goto +1;"
++              "r9 += 1;"
++              "r0 = *(u64 *)(r10 - 40);"
++              "if r0 == 0 goto +1;"
++              "r9 += 1;"
++
++              "goto loop_%=;"
++      "loop_end_%=:"
++              "r1 = r10;"
++              "r1 += -8;"
++              "call %[bpf_iter_num_destroy];"
++              "r0 = 0;"
++              "exit;"
++              :
++              : __imm(bpf_map_lookup_elem),
++                __imm(bpf_iter_num_new),
++                __imm(bpf_iter_num_next),
++                __imm(bpf_iter_num_destroy),
++                __imm_addr(amap)
++              : __clobber_all
++      );
++}
++
+ char _license[] SEC("license") = "GPL";
diff --git a/queue-6.6/selftests-bpf-track-string-payload-offset-as-scalar-in-strobemeta.patch b/queue-6.6/selftests-bpf-track-string-payload-offset-as-scalar-in-strobemeta.patch
new file mode 100644 (file)
index 0000000..8a46706
--- /dev/null
@@ -0,0 +1,252 @@
+From 87eb0152bcc102ecbda866978f4e54db5a3be1ef Mon Sep 17 00:00:00 2001
+From: Eduard Zingerman <eddyz87@gmail.com>
+Date: Tue, 21 Nov 2023 04:06:52 +0200
+Subject: selftests/bpf: track string payload offset as scalar in strobemeta
+
+From: Eduard Zingerman <eddyz87@gmail.com>
+
+commit 87eb0152bcc102ecbda866978f4e54db5a3be1ef upstream.
+
+This change prepares strobemeta for an update in callbacks verification
+logic. To allow bpf_loop() verification to converge when multiple
+callback iterations are considered:
+- track the offset inside strobemeta_payload->payload directly as a
+  scalar value;
+- at each iteration make sure that the remaining
+  strobemeta_payload->payload capacity is sufficient for execution of
+  the read_{map,str}_var functions;
+- make sure that the offset is tracked as an unbound scalar between
+  iterations, otherwise the verifier won't be able to infer that the
+  bpf_loop callback reaches identical states.
+
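+Roughly, the resulting callback pattern (a sketch distilled from the
+diff below, not additional code) is:
+
+	/* on entry, make payload_off an unbound scalar again: the double
+	 * xor keeps the value but hides precise bounds from the verifier,
+	 * letting states from different iterations converge
+	 */
+	ctx->payload_off ^= index;
+	barrier_var(ctx->payload_off);
+	ctx->payload_off ^= index;
+
+	/* re-check remaining payload capacity before every write */
+	if (ctx->payload_off > sizeof(ctx->data->payload) - READ_MAP_VAR_PAYLOAD_CAP)
+		return 1;
+	ctx->payload_off = read_map_var(ctx->cfg, index, ctx->tls_base,
+					ctx->value, ctx->data, ctx->payload_off);
+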
+Acked-by: Andrii Nakryiko <andrii@kernel.org>
+Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
+Link: https://lore.kernel.org/r/20231121020701.26440-3-eddyz87@gmail.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ tools/testing/selftests/bpf/progs/strobemeta.h |   78 +++++++++++++++----------
+ 1 file changed, 48 insertions(+), 30 deletions(-)
+
+--- a/tools/testing/selftests/bpf/progs/strobemeta.h
++++ b/tools/testing/selftests/bpf/progs/strobemeta.h
+@@ -24,9 +24,11 @@ struct task_struct {};
+ #define STACK_TABLE_EPOCH_SHIFT 20
+ #define STROBE_MAX_STR_LEN 1
+ #define STROBE_MAX_CFGS 32
++#define READ_MAP_VAR_PAYLOAD_CAP                                      \
++      ((1 + STROBE_MAX_MAP_ENTRIES * 2) * STROBE_MAX_STR_LEN)
+ #define STROBE_MAX_PAYLOAD                                            \
+       (STROBE_MAX_STRS * STROBE_MAX_STR_LEN +                         \
+-      STROBE_MAX_MAPS * (1 + STROBE_MAX_MAP_ENTRIES * 2) * STROBE_MAX_STR_LEN)
++       STROBE_MAX_MAPS * READ_MAP_VAR_PAYLOAD_CAP)
+ struct strobe_value_header {
+       /*
+@@ -355,7 +357,7 @@ static __always_inline uint64_t read_str
+                                            size_t idx, void *tls_base,
+                                            struct strobe_value_generic *value,
+                                            struct strobemeta_payload *data,
+-                                           void *payload)
++                                           size_t off)
+ {
+       void *location;
+       uint64_t len;
+@@ -366,7 +368,7 @@ static __always_inline uint64_t read_str
+               return 0;
+       bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location);
+-      len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN, value->ptr);
++      len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN, value->ptr);
+       /*
+        * if bpf_probe_read_user_str returns error (<0), due to casting to
+       * unsigned int, it will become big number, so next check is
+@@ -378,14 +380,14 @@ static __always_inline uint64_t read_str
+               return 0;
+       data->str_lens[idx] = len;
+-      return len;
++      return off + len;
+ }
+-static __always_inline void *read_map_var(struct strobemeta_cfg *cfg,
+-                                        size_t idx, void *tls_base,
+-                                        struct strobe_value_generic *value,
+-                                        struct strobemeta_payload *data,
+-                                        void *payload)
++static __always_inline uint64_t read_map_var(struct strobemeta_cfg *cfg,
++                                           size_t idx, void *tls_base,
++                                           struct strobe_value_generic *value,
++                                           struct strobemeta_payload *data,
++                                           size_t off)
+ {
+       struct strobe_map_descr* descr = &data->map_descrs[idx];
+       struct strobe_map_raw map;
+@@ -397,11 +399,11 @@ static __always_inline void *read_map_va
+       location = calc_location(&cfg->map_locs[idx], tls_base);
+       if (!location)
+-              return payload;
++              return off;
+       bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location);
+       if (bpf_probe_read_user(&map, sizeof(struct strobe_map_raw), value->ptr))
+-              return payload;
++              return off;
+       descr->id = map.id;
+       descr->cnt = map.cnt;
+@@ -410,10 +412,10 @@ static __always_inline void *read_map_va
+               data->req_meta_valid = 1;
+       }
+-      len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN, map.tag);
++      len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN, map.tag);
+       if (len <= STROBE_MAX_STR_LEN) {
+               descr->tag_len = len;
+-              payload += len;
++              off += len;
+       }
+ #ifdef NO_UNROLL
+@@ -426,22 +428,22 @@ static __always_inline void *read_map_va
+                       break;
+               descr->key_lens[i] = 0;
+-              len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN,
++              len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN,
+                                             map.entries[i].key);
+               if (len <= STROBE_MAX_STR_LEN) {
+                       descr->key_lens[i] = len;
+-                      payload += len;
++                      off += len;
+               }
+               descr->val_lens[i] = 0;
+-              len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN,
++              len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN,
+                                             map.entries[i].val);
+               if (len <= STROBE_MAX_STR_LEN) {
+                       descr->val_lens[i] = len;
+-                      payload += len;
++                      off += len;
+               }
+       }
+-      return payload;
++      return off;
+ }
+ #ifdef USE_BPF_LOOP
+@@ -455,14 +457,20 @@ struct read_var_ctx {
+       struct strobemeta_payload *data;
+       void *tls_base;
+       struct strobemeta_cfg *cfg;
+-      void *payload;
++      size_t payload_off;
+       /* value gets mutated */
+       struct strobe_value_generic *value;
+       enum read_type type;
+ };
+-static int read_var_callback(__u32 index, struct read_var_ctx *ctx)
++static int read_var_callback(__u64 index, struct read_var_ctx *ctx)
+ {
++      /* lose precision info for ctx->payload_off; the verifier won't track
++       * the double xor, and barrier_var() is needed to force clang to keep both xors.
++       */
++      ctx->payload_off ^= index;
++      barrier_var(ctx->payload_off);
++      ctx->payload_off ^= index;
+       switch (ctx->type) {
+       case READ_INT_VAR:
+               if (index >= STROBE_MAX_INTS)
+@@ -472,14 +480,18 @@ static int read_var_callback(__u32 index
+       case READ_MAP_VAR:
+               if (index >= STROBE_MAX_MAPS)
+                       return 1;
+-              ctx->payload = read_map_var(ctx->cfg, index, ctx->tls_base,
+-                                          ctx->value, ctx->data, ctx->payload);
++              if (ctx->payload_off > sizeof(ctx->data->payload) - READ_MAP_VAR_PAYLOAD_CAP)
++                      return 1;
++              ctx->payload_off = read_map_var(ctx->cfg, index, ctx->tls_base,
++                                              ctx->value, ctx->data, ctx->payload_off);
+               break;
+       case READ_STR_VAR:
+               if (index >= STROBE_MAX_STRS)
+                       return 1;
+-              ctx->payload += read_str_var(ctx->cfg, index, ctx->tls_base,
+-                                           ctx->value, ctx->data, ctx->payload);
++              if (ctx->payload_off > sizeof(ctx->data->payload) - STROBE_MAX_STR_LEN)
++                      return 1;
++              ctx->payload_off = read_str_var(ctx->cfg, index, ctx->tls_base,
++                                              ctx->value, ctx->data, ctx->payload_off);
+               break;
+       }
+       return 0;
+@@ -501,7 +513,8 @@ static void *read_strobe_meta(struct tas
+       pid_t pid = bpf_get_current_pid_tgid() >> 32;
+       struct strobe_value_generic value = {0};
+       struct strobemeta_cfg *cfg;
+-      void *tls_base, *payload;
++      size_t payload_off;
++      void *tls_base;
+       cfg = bpf_map_lookup_elem(&strobemeta_cfgs, &pid);
+       if (!cfg)
+@@ -509,7 +522,7 @@ static void *read_strobe_meta(struct tas
+       data->int_vals_set_mask = 0;
+       data->req_meta_valid = 0;
+-      payload = data->payload;
++      payload_off = 0;
+       /*
+        * we don't have struct task_struct definition, it should be:
+        * tls_base = (void *)task->thread.fsbase;
+@@ -522,7 +535,7 @@ static void *read_strobe_meta(struct tas
+               .tls_base = tls_base,
+               .value = &value,
+               .data = data,
+-              .payload = payload,
++              .payload_off = 0,
+       };
+       int err;
+@@ -540,6 +553,11 @@ static void *read_strobe_meta(struct tas
+       err = bpf_loop(STROBE_MAX_MAPS, read_var_callback, &ctx, 0);
+       if (err != STROBE_MAX_MAPS)
+               return NULL;
++
++      payload_off = ctx.payload_off;
++      /* this should not really happen, here only to satisfy the verifier */
++      if (payload_off > sizeof(data->payload))
++              payload_off = sizeof(data->payload);
+ #else
+ #ifdef NO_UNROLL
+ #pragma clang loop unroll(disable)
+@@ -555,7 +573,7 @@ static void *read_strobe_meta(struct tas
+ #pragma unroll
+ #endif /* NO_UNROLL */
+       for (int i = 0; i < STROBE_MAX_STRS; ++i) {
+-              payload += read_str_var(cfg, i, tls_base, &value, data, payload);
++              payload_off = read_str_var(cfg, i, tls_base, &value, data, payload_off);
+       }
+ #ifdef NO_UNROLL
+ #pragma clang loop unroll(disable)
+@@ -563,7 +581,7 @@ static void *read_strobe_meta(struct tas
+ #pragma unroll
+ #endif /* NO_UNROLL */
+       for (int i = 0; i < STROBE_MAX_MAPS; ++i) {
+-              payload = read_map_var(cfg, i, tls_base, &value, data, payload);
++              payload_off = read_map_var(cfg, i, tls_base, &value, data, payload_off);
+       }
+ #endif /* USE_BPF_LOOP */
+@@ -571,7 +589,7 @@ static void *read_strobe_meta(struct tas
+        * return pointer right after end of payload, so it's possible to
+        * calculate exact amount of useful data that needs to be sent
+        */
+-      return payload;
++      return &data->payload[payload_off];
+ }
+ SEC("raw_tracepoint/kfree_skb")
diff --git a/queue-6.6/selftests-bpf-track-tcp-payload-offset-as-scalar-in-xdp_synproxy.patch b/queue-6.6/selftests-bpf-track-tcp-payload-offset-as-scalar-in-xdp_synproxy.patch
new file mode 100644 (file)
index 0000000..fe51c37
--- /dev/null
@@ -0,0 +1,179 @@
+From 977bc146d4eb7070118d8a974919b33bb52732b4 Mon Sep 17 00:00:00 2001
+From: Eduard Zingerman <eddyz87@gmail.com>
+Date: Tue, 21 Nov 2023 04:06:51 +0200
+Subject: selftests/bpf: track tcp payload offset as scalar in xdp_synproxy
+
+From: Eduard Zingerman <eddyz87@gmail.com>
+
+commit 977bc146d4eb7070118d8a974919b33bb52732b4 upstream.
+
+This change prepares syncookie_{tc,xdp} for an update in callbacks
+verification logic. To allow bpf_loop() verification to converge when
+multiple callback iterations are considered:
+- track the offset inside the TCP payload explicitly, not as part of
+  the pointer;
+- make sure that the offset does not exceed the MAX_PACKET_OFF limit
+  enforced by the verifier;
+- make sure that the offset is tracked as an unbound scalar between
+  iterations, otherwise the verifier won't be able to infer that the
+  bpf_loop callback reaches identical states.
+
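+Roughly, the new next() accessor at the heart of the diff below reduces
+to (sketch, not additional code):
+
+	static __always_inline u8 *next(struct tcpopt_context *ctx, __u32 sz)
+	{
+		__u64 off = ctx->off;
+		__u8 *data;
+
+		/* the verifier forbids packet access past MAX_PACKET_OFF */
+		if (off > MAX_PACKET_OFF - sz)
+			return NULL;
+
+		data = ctx->data + off;
+		barrier_var(data);
+		/* runtime bounds check against the actual packet end */
+		if (data + sz >= ctx->data_end)
+			return NULL;
+
+		ctx->off += sz;
+		return data;
+	}
+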
+Acked-by: Andrii Nakryiko <andrii@kernel.org>
+Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
+Link: https://lore.kernel.org/r/20231121020701.26440-2-eddyz87@gmail.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c |   84 +++++++++++-------
+ 1 file changed, 52 insertions(+), 32 deletions(-)
+
+--- a/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c
++++ b/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c
+@@ -53,6 +53,8 @@
+ #define DEFAULT_TTL 64
+ #define MAX_ALLOWED_PORTS 8
++#define MAX_PACKET_OFF 0xffff
++
+ #define swap(a, b) \
+       do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
+@@ -183,63 +185,76 @@ static __always_inline __u32 tcp_time_st
+ }
+ struct tcpopt_context {
+-      __u8 *ptr;
+-      __u8 *end;
++      void *data;
+       void *data_end;
+       __be32 *tsecr;
+       __u8 wscale;
+       bool option_timestamp;
+       bool option_sack;
++      __u32 off;
+ };
+-static int tscookie_tcpopt_parse(struct tcpopt_context *ctx)
++static __always_inline u8 *next(struct tcpopt_context *ctx, __u32 sz)
+ {
+-      __u8 opcode, opsize;
++      __u64 off = ctx->off;
++      __u8 *data;
+-      if (ctx->ptr >= ctx->end)
+-              return 1;
+-      if (ctx->ptr >= ctx->data_end)
+-              return 1;
++      /* Verifier forbids access to packet when offset exceeds MAX_PACKET_OFF */
++      if (off > MAX_PACKET_OFF - sz)
++              return NULL;
++
++      data = ctx->data + off;
++      barrier_var(data);
++      if (data + sz >= ctx->data_end)
++              return NULL;
+-      opcode = ctx->ptr[0];
++      ctx->off += sz;
++      return data;
++}
+-      if (opcode == TCPOPT_EOL)
+-              return 1;
+-      if (opcode == TCPOPT_NOP) {
+-              ++ctx->ptr;
+-              return 0;
+-      }
++static int tscookie_tcpopt_parse(struct tcpopt_context *ctx)
++{
++      __u8 *opcode, *opsize, *wscale, *tsecr;
++      __u32 off = ctx->off;
+-      if (ctx->ptr + 1 >= ctx->end)
++      opcode = next(ctx, 1);
++      if (!opcode)
+               return 1;
+-      if (ctx->ptr + 1 >= ctx->data_end)
+-              return 1;
+-      opsize = ctx->ptr[1];
+-      if (opsize < 2)
++
++      if (*opcode == TCPOPT_EOL)
+               return 1;
++      if (*opcode == TCPOPT_NOP)
++              return 0;
+-      if (ctx->ptr + opsize > ctx->end)
++      opsize = next(ctx, 1);
++      if (!opsize || *opsize < 2)
+               return 1;
+-      switch (opcode) {
++      switch (*opcode) {
+       case TCPOPT_WINDOW:
+-              if (opsize == TCPOLEN_WINDOW && ctx->ptr + TCPOLEN_WINDOW <= ctx->data_end)
+-                      ctx->wscale = ctx->ptr[2] < TCP_MAX_WSCALE ? ctx->ptr[2] : TCP_MAX_WSCALE;
++              wscale = next(ctx, 1);
++              if (!wscale)
++                      return 1;
++              if (*opsize == TCPOLEN_WINDOW)
++                      ctx->wscale = *wscale < TCP_MAX_WSCALE ? *wscale : TCP_MAX_WSCALE;
+               break;
+       case TCPOPT_TIMESTAMP:
+-              if (opsize == TCPOLEN_TIMESTAMP && ctx->ptr + TCPOLEN_TIMESTAMP <= ctx->data_end) {
++              tsecr = next(ctx, 4);
++              if (!tsecr)
++                      return 1;
++              if (*opsize == TCPOLEN_TIMESTAMP) {
+                       ctx->option_timestamp = true;
+                       /* Client's tsval becomes our tsecr. */
+-                      *ctx->tsecr = get_unaligned((__be32 *)(ctx->ptr + 2));
++                      *ctx->tsecr = get_unaligned((__be32 *)tsecr);
+               }
+               break;
+       case TCPOPT_SACK_PERM:
+-              if (opsize == TCPOLEN_SACK_PERM)
++              if (*opsize == TCPOLEN_SACK_PERM)
+                       ctx->option_sack = true;
+               break;
+       }
+-      ctx->ptr += opsize;
++      ctx->off = off + *opsize;
+       return 0;
+ }
+@@ -256,16 +271,21 @@ static int tscookie_tcpopt_parse_batch(_
+ static __always_inline bool tscookie_init(struct tcphdr *tcp_header,
+                                         __u16 tcp_len, __be32 *tsval,
+-                                        __be32 *tsecr, void *data_end)
++                                        __be32 *tsecr, void *data, void *data_end)
+ {
+       struct tcpopt_context loop_ctx = {
+-              .ptr = (__u8 *)(tcp_header + 1),
+-              .end = (__u8 *)tcp_header + tcp_len,
++              .data = data,
+               .data_end = data_end,
+               .tsecr = tsecr,
+               .wscale = TS_OPT_WSCALE_MASK,
+               .option_timestamp = false,
+               .option_sack = false,
++              /* Note: currently the verifier tracks .off as an unbound scalar.
++               *       If the verifier at some point gets smarter and computes
++               *       a bounded value for this var, beware that it might
++               *       hinder bpf_loop() convergence validation.
++               */
++              .off = (__u8 *)(tcp_header + 1) - (__u8 *)data,
+       };
+       u32 cookie;
+@@ -635,7 +655,7 @@ static __always_inline int syncookie_han
+       cookie = (__u32)value;
+       if (tscookie_init((void *)hdr->tcp, hdr->tcp_len,
+-                        &tsopt_buf[0], &tsopt_buf[1], data_end))
++                        &tsopt_buf[0], &tsopt_buf[1], data, data_end))
+               tsopt = tsopt_buf;
+       /* Check that there is enough space for a SYNACK. It also covers
index 463a8c54a54344d4952d6156cb277fa022f48656..baa3a013fe9fc596351c2acf7d265a5bfbc916fe 100644 (file)
@@ -143,3 +143,20 @@ ksmbd-don-t-increment-epoch-if-current-state-and-request-state-are-same.patch
 ksmbd-send-lease-break-notification-on-file_rename_information.patch
 ksmbd-add-missing-set_freezable-for-freezable-kthread.patch
 dt-bindings-net-snps-dwmac-tx-coe-unsupported.patch
+bpf-move-explored_state-closer-to-the-beginning-of-verifier.c.patch
+bpf-extract-same_callsites-as-utility-function.patch
+bpf-exact-states-comparison-for-iterator-convergence-checks.patch
+selftests-bpf-tests-with-delayed-read-precision-makrs-in-loop-body.patch
+bpf-correct-loop-detection-for-iterators-convergence.patch
+selftests-bpf-test-if-state-loops-are-detected-in-a-tricky-case.patch
+bpf-print-full-verifier-states-on-infinite-loop-detection.patch
+selftests-bpf-track-tcp-payload-offset-as-scalar-in-xdp_synproxy.patch
+selftests-bpf-track-string-payload-offset-as-scalar-in-strobemeta.patch
+bpf-extract-__check_reg_arg-utility-function.patch
+bpf-extract-setup_func_entry-utility-function.patch
+bpf-verify-callbacks-as-if-they-are-called-unknown-number-of-times.patch
+selftests-bpf-tests-for-iterating-callbacks.patch
+bpf-widening-for-callback-iterators.patch
+selftests-bpf-test-widening-for-iterating-callbacks.patch
+bpf-keep-track-of-max-number-of-bpf_loop-callback-iterations.patch
+selftests-bpf-check-if-max-number-of-bpf_loop-iterations-is-tracked.patch