perf: Add an EVENT_GUEST flag

author Kan Liang <kan.liang@linux.intel.com>

Sat, 6 Dec 2025 00:16:42 +0000 (16:16 -0800)

committer Peter Zijlstra <peterz@infradead.org>

Wed, 17 Dec 2025 12:31:05 +0000 (13:31 +0100)
author Kan Liang <kan.liang@linux.intel.com>
Sat, 6 Dec 2025 00:16:42 +0000 (16:16 -0800)
committer Peter Zijlstra <peterz@infradead.org>
Wed, 17 Dec 2025 12:31:05 +0000 (13:31 +0100)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h

index d5aa1bc3f0884b42f6d34dd62eeee986f7df179d..d9988e3fd557124332960e9c14d2d6e4ac45899a 100644 (file)
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1044,6 +1044,11 @@ struct perf_event_context {
          */
         struct perf_time_ctx            time;
  
+       /*
+        * Context clock, runs while in guest mode.
+        */
+       struct perf_time_ctx            timeguest;
+
         /*
          * These fields let us detect when two contexts have both
          * been cloned (inherited) from a common ancestor.
@@ -1176,6 +1181,7 @@ struct bpf_perf_event_data_kern {
   */
  struct perf_cgroup_info {
         struct perf_time_ctx            time;
+       struct perf_time_ctx            timeguest;
         int                             active;
  };
  
diff --git a/kernel/events/core.c b/kernel/events/core.c

index 95f118230ff5ec35b30f89ef3fa2bfa150854262..6781d39f315874aeba470085db3c29e4b28911a5 100644 (file)
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -165,7 +165,19 @@ enum event_type_t {
         /* see ctx_resched() for details */
         EVENT_CPU       = 0x10,
         EVENT_CGROUP    = 0x20,
-       EVENT_FLAGS     = EVENT_CGROUP,
+
+       /*
+        * EVENT_GUEST is set when scheduling in/out events between the host
+        * and a guest with a mediated vPMU.  Among other things, EVENT_GUEST
+        * is used:
+        *
+        * - In for_each_epc() to skip PMUs that don't support events in a
+        *   MEDIATED_VPMU guest, i.e. don't need to be context switched.
+        * - To indicate the start/end point of the events in a guest.  Guest
+        *   running time is deducted for host-only (exclude_guest) events.
+        */
+       EVENT_GUEST     = 0x40,
+       EVENT_FLAGS     = EVENT_CGROUP | EVENT_GUEST,
         /* compound helpers */
         EVENT_ALL         = EVENT_FLEXIBLE | EVENT_PINNED,
         EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN,
@@ -458,6 +470,11 @@ static cpumask_var_t perf_online_pkg_mask;
  static cpumask_var_t perf_online_sys_mask;
  static struct kmem_cache *perf_event_cache;
  
+static __always_inline bool is_guest_mediated_pmu_loaded(void)
+{
+       return false;
+}
+
  /*
   * perf event paranoia level:
   *  -1 - not paranoid at all
@@ -784,6 +801,9 @@ static bool perf_skip_pmu_ctx(struct perf_event_pmu_context *pmu_ctx,
  {
         if ((event_type & EVENT_CGROUP) && !pmu_ctx->nr_cgroups)
                 return true;
+       if ((event_type & EVENT_GUEST) &&
+           !(pmu_ctx->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU))
+               return true;
         return false;
  }
  
@@ -834,6 +854,39 @@ static inline void update_perf_time_ctx(struct perf_time_ctx *time, u64 now, boo
         WRITE_ONCE(time->offset, time->time - time->stamp);
  }
  
+static_assert(offsetof(struct perf_event_context, timeguest) -
+             offsetof(struct perf_event_context, time) ==
+             sizeof(struct perf_time_ctx));
+
+#define T_TOTAL                0
+#define T_GUEST                1
+
+static inline u64 __perf_event_time_ctx(struct perf_event *event,
+                                       struct perf_time_ctx *times)
+{
+       u64 time = times[T_TOTAL].time;
+
+       if (event->attr.exclude_guest)
+               time -= times[T_GUEST].time;
+
+       return time;
+}
+
+static inline u64 __perf_event_time_ctx_now(struct perf_event *event,
+                                           struct perf_time_ctx *times,
+                                           u64 now)
+{
+       if (is_guest_mediated_pmu_loaded() && event->attr.exclude_guest) {
+               /*
+                * (now + times[T_TOTAL].offset) - (now + times[T_GUEST].offset) :=
+                * times[T_TOTAL].offset - times[T_GUEST].offset
+                */
+               return READ_ONCE(times[T_TOTAL].offset) - READ_ONCE(times[T_GUEST].offset);
+       }
+
+       return now + READ_ONCE(times[T_TOTAL].offset);
+}
+
  #ifdef CONFIG_CGROUP_PERF
  
  static inline bool
@@ -870,12 +923,16 @@ static inline int is_cgroup_event(struct perf_event *event)
         return event->cgrp != NULL;
  }
  
+static_assert(offsetof(struct perf_cgroup_info, timeguest) -
+             offsetof(struct perf_cgroup_info, time) ==
+             sizeof(struct perf_time_ctx));
+
  static inline u64 perf_cgroup_event_time(struct perf_event *event)
  {
         struct perf_cgroup_info *t;
  
         t = per_cpu_ptr(event->cgrp->info, event->cpu);
-       return t->time.time;
+       return __perf_event_time_ctx(event, &t->time);
  }
  
  static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
@@ -884,9 +941,21 @@ static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
  
         t = per_cpu_ptr(event->cgrp->info, event->cpu);
         if (!__load_acquire(&t->active))
-               return t->time.time;
-       now += READ_ONCE(t->time.offset);
-       return now;
+               return __perf_event_time_ctx(event, &t->time);
+
+       return __perf_event_time_ctx_now(event, &t->time, now);
+}
+
+static inline void __update_cgrp_guest_time(struct perf_cgroup_info *info, u64 now, bool adv)
+{
+       update_perf_time_ctx(&info->timeguest, now, adv);
+}
+
+static inline void update_cgrp_time(struct perf_cgroup_info *info, u64 now)
+{
+       update_perf_time_ctx(&info->time, now, true);
+       if (is_guest_mediated_pmu_loaded())
+               __update_cgrp_guest_time(info, now, true);
  }
  
  static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final)
@@ -902,7 +971,7 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx,
                         cgrp = container_of(css, struct perf_cgroup, css);
                         info = this_cpu_ptr(cgrp->info);
  
-                       update_perf_time_ctx(&info->time, now, true);
+                       update_cgrp_time(info, now);
                         if (final)
                                 __store_release(&info->active, 0);
                 }
@@ -925,11 +994,11 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
          * Do not update time when cgroup is not active
          */
         if (info->active)
-               update_perf_time_ctx(&info->time, perf_clock(), true);
+               update_cgrp_time(info, perf_clock());
  }
  
  static inline void
-perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
+perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx, bool guest)
  {
         struct perf_event_context *ctx = &cpuctx->ctx;
         struct perf_cgroup *cgrp = cpuctx->cgrp;
@@ -949,8 +1018,12 @@ perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
         for (css = &cgrp->css; css; css = css->parent) {
                 cgrp = container_of(css, struct perf_cgroup, css);
                 info = this_cpu_ptr(cgrp->info);
-               update_perf_time_ctx(&info->time, ctx->time.stamp, false);
-               __store_release(&info->active, 1);
+               if (guest) {
+                       __update_cgrp_guest_time(info, ctx->time.stamp, false);
+               } else {
+                       update_perf_time_ctx(&info->time, ctx->time.stamp, false);
+                       __store_release(&info->active, 1);
+               }
         }
  }
  
@@ -1154,7 +1227,7 @@ static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
  }
  
  static inline void
-perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
+perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx, bool guest)
  {
  }
  
@@ -1566,16 +1639,24 @@ static void perf_unpin_context(struct perf_event_context *ctx)
   */
  static void __update_context_time(struct perf_event_context *ctx, bool adv)
  {
-       u64 now = perf_clock();
+       lockdep_assert_held(&ctx->lock);
+
+       update_perf_time_ctx(&ctx->time, perf_clock(), adv);
+}
  
+static void __update_context_guest_time(struct perf_event_context *ctx, bool adv)
+{
         lockdep_assert_held(&ctx->lock);
  
-       update_perf_time_ctx(&ctx->time, now, adv);
+       /* Must be called after __update_context_time(): reuses ctx->time.stamp as "now". */
+       update_perf_time_ctx(&ctx->timeguest, ctx->time.stamp, adv);
  }
  
  static void update_context_time(struct perf_event_context *ctx)
  {
         __update_context_time(ctx, true);
+       if (is_guest_mediated_pmu_loaded())
+               __update_context_guest_time(ctx, true);
  }
  
  static u64 perf_event_time(struct perf_event *event)
@@ -1588,7 +1669,7 @@ static u64 perf_event_time(struct perf_event *event)
         if (is_cgroup_event(event))
                 return perf_cgroup_event_time(event);
  
-       return ctx->time.time;
+       return __perf_event_time_ctx(event, &ctx->time);
  }
  
  static u64 perf_event_time_now(struct perf_event *event, u64 now)
@@ -1602,10 +1683,9 @@ static u64 perf_event_time_now(struct perf_event *event, u64 now)
                 return perf_cgroup_event_time_now(event, now);
  
         if (!(__load_acquire(&ctx->is_active) & EVENT_TIME))
-               return ctx->time.time;
+               return __perf_event_time_ctx(event, &ctx->time);
  
-       now += READ_ONCE(ctx->time.offset);
-       return now;
+       return __perf_event_time_ctx_now(event, &ctx->time, now);
  }
  
  static enum event_type_t get_event_type(struct perf_event *event)
@@ -2425,20 +2505,23 @@ group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx)
  }
  
  static inline void
-__ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, bool final)
+__ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx,
+                 bool final, enum event_type_t event_type)
  {
         if (ctx->is_active & EVENT_TIME) {
                 if (ctx->is_active & EVENT_FROZEN)
                         return;
+
                 update_context_time(ctx);
-               update_cgrp_time_from_cpuctx(cpuctx, final);
+               /* A mediated vPMU (EVENT_GUEST) switch must not stop cgroup time: never pass final. */
+               update_cgrp_time_from_cpuctx(cpuctx, !(event_type & EVENT_GUEST) && final);
         }
  }
  
  static inline void
  ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
  {
-       __ctx_time_update(cpuctx, ctx, false);
+       __ctx_time_update(cpuctx, ctx, false, 0);
  }
  
  /*
@@ -3510,7 +3593,7 @@ ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t
          *
          * would only update time for the pinned events.
          */
-       __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx);
+       __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx, event_type);
  
         /*
          * CPU-release for the below ->is_active store,
@@ -3536,7 +3619,18 @@ ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t
                         cpuctx->task_ctx = NULL;
         }
  
-       is_active ^= ctx->is_active; /* changed bits */
+       if (event_type & EVENT_GUEST) {
+               /*
+                * Schedule out all exclude_guest events of PMUs
+                * with PERF_PMU_CAP_MEDIATED_VPMU.
+                */
+               is_active = EVENT_ALL;
+               __update_context_guest_time(ctx, false);
+               perf_cgroup_set_timestamp(cpuctx, true);
+               barrier();
+       } else {
+               is_active ^= ctx->is_active; /* changed bits */
+       }
  
         for_each_epc(pmu_ctx, ctx, pmu, event_type)
                 __pmu_ctx_sched_out(pmu_ctx, is_active);
@@ -3995,10 +4089,15 @@ static inline void group_update_userpage(struct perf_event *group_event)
                 event_update_userpage(event);
  }
  
+struct merge_sched_data {
+       int can_add_hw;
+       enum event_type_t event_type;
+};
+
  static int merge_sched_in(struct perf_event *event, void *data)
  {
         struct perf_event_context *ctx = event->ctx;
-       int *can_add_hw = data;
+       struct merge_sched_data *msd = data;
  
         if (event->state <= PERF_EVENT_STATE_OFF)
                 return 0;
@@ -4006,13 +4105,22 @@ static int merge_sched_in(struct perf_event *event, void *data)
         if (!event_filter_match(event))
                 return 0;
  
-       if (group_can_go_on(event, *can_add_hw)) {
+       /*
+        * Don't schedule in any host events from PMUs with
+        * PERF_PMU_CAP_MEDIATED_VPMU while a guest is running.
+        */
+       if (is_guest_mediated_pmu_loaded() &&
+           event->pmu_ctx->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU &&
+           !(msd->event_type & EVENT_GUEST))
+               return 0;
+
+       if (group_can_go_on(event, msd->can_add_hw)) {
                 if (!group_sched_in(event, ctx))
                         list_add_tail(&event->active_list, get_event_list(event));
         }
  
         if (event->state == PERF_EVENT_STATE_INACTIVE) {
-               *can_add_hw = 0;
+               msd->can_add_hw = 0;
                 if (event->attr.pinned) {
                         perf_cgroup_event_disable(event, ctx);
                         perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
@@ -4035,11 +4143,15 @@ static int merge_sched_in(struct perf_event *event, void *data)
  
  static void pmu_groups_sched_in(struct perf_event_context *ctx,
                                 struct perf_event_groups *groups,
-                               struct pmu *pmu)
+                               struct pmu *pmu,
+                               enum event_type_t event_type)
  {
-       int can_add_hw = 1;
+       struct merge_sched_data msd = {
+               .can_add_hw = 1,
+               .event_type = event_type,
+       };
         visit_groups_merge(ctx, groups, smp_processor_id(), pmu,
-                          merge_sched_in, &can_add_hw);
+                          merge_sched_in, &msd);
  }
  
  static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx,
@@ -4048,9 +4160,9 @@ static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx,
         struct perf_event_context *ctx = pmu_ctx->ctx;
  
         if (event_type & EVENT_PINNED)
-               pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu);
+               pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu, event_type);
         if (event_type & EVENT_FLEXIBLE)
-               pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu);
+               pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu, event_type);
  }
  
  static void
@@ -4067,9 +4179,11 @@ ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t
                 return;
  
         if (!(is_active & EVENT_TIME)) {
+               /* EVENT_TIME should be active while the guest runs */
+               WARN_ON_ONCE(event_type & EVENT_GUEST);
                 /* start ctx time */
                 __update_context_time(ctx, false);
-               perf_cgroup_set_timestamp(cpuctx);
+               perf_cgroup_set_timestamp(cpuctx, false);
                 /*
                  * CPU-release for the below ->is_active store,
                  * see __load_acquire() in perf_event_time_now()
@@ -4085,7 +4199,23 @@ ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t
                         WARN_ON_ONCE(cpuctx->task_ctx != ctx);
         }
  
-       is_active ^= ctx->is_active; /* changed bits */
+       if (event_type & EVENT_GUEST) {
+               /*
+                * Schedule in the required exclude_guest events of PMUs
+                * with PERF_PMU_CAP_MEDIATED_VPMU.
+                */
+               is_active = event_type & EVENT_ALL;
+
+               /*
+                * Update ctx time to set the new start time for
+                * the exclude_guest events.
+                */
+               update_context_time(ctx);
+               update_cgrp_time_from_cpuctx(cpuctx, false);
+               barrier();
+       } else {
+               is_active ^= ctx->is_active; /* changed bits */
+       }
  
         /*
          * First go through the list and put on any pinned groups
@@ -4093,13 +4223,13 @@ ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t
          */
         if (is_active & EVENT_PINNED) {
                 for_each_epc(pmu_ctx, ctx, pmu, event_type)
-                       __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED);
+                       __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED | (event_type & EVENT_GUEST));
         }
  
         /* Then walk through the lower prio flexible groups */
         if (is_active & EVENT_FLEXIBLE) {
                 for_each_epc(pmu_ctx, ctx, pmu, event_type)
-                       __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE);
+                       __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE | (event_type & EVENT_GUEST));
         }
  }
  
@@ -6627,22 +6757,22 @@ void perf_event_update_userpage(struct perf_event *event)
                 goto unlock;
  
         /*
-        * compute total_time_enabled, total_time_running
-        * based on snapshot values taken when the event
-        * was last scheduled in.
+        * Disable preemption to guarantee consistent time stamps are stored to
+        * the user page.
+        */
+       preempt_disable();
+
+       /*
+        * Compute total_time_enabled, total_time_running based on snapshot
+        * values taken when the event was last scheduled in.
          *
-        * we cannot simply called update_context_time()
-        * because of locking issue as we can be called in
-        * NMI context
+        * We cannot simply call update_context_time() because doing so would
+        * lead to deadlock when called from NMI context.
          */
         calc_timer_values(event, &now, &enabled, &running);
  
         userpg = rb->user_page;
-       /*
-        * Disable preemption to guarantee consistent time stamps are stored to
-        * the user page.
-        */
-       preempt_disable();
+
         ++userpg->lock;
         barrier();
         userpg->index = perf_event_index(event);
@@ -7939,13 +8069,11 @@ static void perf_output_read(struct perf_output_handle *handle,
         u64 read_format = event->attr.read_format;
  
         /*
-        * compute total_time_enabled, total_time_running
-        * based on snapshot values taken when the event
-        * was last scheduled in.
+        * Compute total_time_enabled, total_time_running based on snapshot
+        * values taken when the event was last scheduled in.
          *
-        * we cannot simply called update_context_time()
-        * because of locking issue as we are called in
-        * NMI context
+        * We cannot simply call update_context_time() because doing so would
+        * lead to deadlock when called from NMI context.
          */
         if (read_format & PERF_FORMAT_TOTAL_TIMES)
                 calc_timer_values(event, &now, &enabled, &running);
author	Kan Liang <kan.liang@linux.intel.com>
	Sat, 6 Dec 2025 00:16:42 +0000 (16:16 -0800)
committer	Peter Zijlstra <peterz@infradead.org>
	Wed, 17 Dec 2025 12:31:05 +0000 (13:31 +0100)
include/linux/perf_event.h		patch | blob | blame | history
kernel/events/core.c		patch | blob | blame | history