fs/resctrl: Eliminate false positive lockdep warning when reading SNC counters
author     Reinette Chatre <reinette.chatre@intel.com>
           Mon, 8 Sep 2025 22:15:51 +0000 (15:15 -0700)
committer  Borislav Petkov (AMD) <bp@alien8.de>
           Tue, 9 Sep 2025 10:43:36 +0000 (12:43 +0200)
Running resctrl_tests on an SNC-2 system with lockdep debugging enabled
triggers several warnings with the following trace:

  WARNING: CPU: 0 PID: 1914 at kernel/cpu.c:528 lockdep_assert_cpus_held
  ...
  Call Trace:
  __mon_event_count
  ? __lock_acquire
  ? __pfx___mon_event_count
  mon_event_count
  ? __pfx_smp_mon_event_count
  smp_mon_event_count
  smp_call_on_cpu_callback

get_cpu_cacheinfo_level(), called from __mon_event_count(), requires the CPU
hotplug lock to be held. The hotplug lock is indeed held at that point, as
confirmed by the lockdep_assert_cpus_held() within mon_event_read(), which
calls mon_event_count() via IPI, but lockdep tracking is not able to follow
the IPI.
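
A minimal sketch of the pattern that trips the assertion (illustrative only:
remote_read() and caller() are made-up names standing in for
smp_mon_event_count() and mon_event_read()). lockdep tracks held locks per
task, so the assertion running in the remotely executed callback cannot see
the lock taken by the requesting task:

  #include <linux/cpu.h>
  #include <linux/smp.h>

  /* Runs on the target CPU from a per-CPU worker, not in the caller's task. */
  static int remote_read(void *arg)
  {
          lockdep_assert_cpus_held();     /* WARNs: this task never took the lock */
          return 0;
  }

  static void caller(unsigned int cpu)
  {
          cpus_read_lock();       /* The hotplug lock really is held here ... */
          smp_call_on_cpu(cpu, remote_read, NULL, false);
          cpus_read_unlock();     /* ... but lockdep cannot follow the remote call. */
  }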

The fresh CPU cache information lookup via get_cpu_cacheinfo_level() in
__mon_event_count() was added to support the fix for an issue where resctrl
inappropriately maintained links to L3 cache information that become stale
when the associated CPU goes offline.

Keep the cacheinfo ID in struct rdt_mon_domain to ensure that resctrl does not
maintain stale cache information when CPUs go offline. Return to using a
pointer to the L3 cache information (struct cacheinfo) in struct rmid_read,
rmid_read::ci, and initialize rmid_read::ci before the IPI where it is used.
The CPU hotplug lock is held across rmid_read::ci initialization and use to
ensure that it points to accurate cache information.
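
A condensed sketch of the resulting flow, based on the diff below
(rmid_read_sketch, reader_side() and remote_side() are illustrative stand-ins
for struct rmid_read, rdtgroup_mondata_show() and __mon_event_count(); the
header locations are assumptions):

  #include <linux/cacheinfo.h>
  #include <linux/cpumask.h>
  #include <linux/resctrl.h>
  #include <linux/smp.h>

  struct rmid_read_sketch {
          struct cacheinfo *ci;   /* resolved by the caller, used on the remote CPU */
  };

  /* Caller side: the CPU hotplug lock is held, so ci stays valid across the read. */
  static void reader_side(struct rmid_read_sketch *rr, unsigned int cpu)
  {
          struct cacheinfo *ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);

          if (!ci)
                  return;
          rr->ci = ci;            /* replaces the old rr->ci_id = d->ci_id */
          /* ... mon_event_read() then sends the request to a CPU in ci->shared_cpu_map ... */
  }

  /* Remote side: no fresh cacheinfo lookup, hence no lockdep assertion. */
  static int remote_side(struct rmid_read_sketch *rr)
  {
          if (!cpumask_test_cpu(smp_processor_id(), &rr->ci->shared_cpu_map))
                  return -EINVAL;
          /* ... sum the domains whose d->ci_id matches rr->ci->id ... */
          return 0;
  }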

Fixes: 594902c986e2 ("x86,fs/resctrl: Remove inappropriate references to cacheinfo in the resctrl subsystem")
Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
fs/resctrl/ctrlmondata.c
fs/resctrl/internal.h
fs/resctrl/monitor.c

diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c
index d98e0d2de09fd07290b2c2868f862d1fdc24a4d9..3c39cfacb2518338e40593ac32245fec62d06aa6 100644
--- a/fs/resctrl/ctrlmondata.c
+++ b/fs/resctrl/ctrlmondata.c
@@ -625,11 +625,11 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg)
                 */
                list_for_each_entry(d, &r->mon_domains, hdr.list) {
                        if (d->ci_id == domid) {
-                               rr.ci_id = d->ci_id;
                                cpu = cpumask_any(&d->hdr.cpu_mask);
                                ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
                                if (!ci)
                                        continue;
+                               rr.ci = ci;
                                mon_event_read(&rr, r, NULL, rdtgrp,
                                               &ci->shared_cpu_map, evtid, false);
                                goto checkresult;
diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h
index 0a1eedba2b03ad9eb42341252978fb053c7d12c8..9a8cf6f11151d9c8512344505388135d4196c388 100644
--- a/fs/resctrl/internal.h
+++ b/fs/resctrl/internal.h
@@ -98,7 +98,7 @@ struct mon_data {
  *        domains in @r sharing L3 @ci.id
  * @evtid: Which monitor event to read.
  * @first: Initialize MBM counter when true.
- * @ci_id: Cacheinfo id for L3. Only set when @d is NULL. Used when summing domains.
+ * @ci:    Cacheinfo for L3. Only set when @d is NULL. Used when summing domains.
  * @err:   Error encountered when reading counter.
  * @val:   Returned value of event counter. If @rgrp is a parent resource group,
  *        @val includes the sum of event counts from its child resource groups.
@@ -112,7 +112,7 @@ struct rmid_read {
        struct rdt_mon_domain   *d;
        enum resctrl_event_id   evtid;
        bool                    first;
-       unsigned int            ci_id;
+       struct cacheinfo        *ci;
        int                     err;
        u64                     val;
        void                    *arch_mon_ctx;
diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c
index f5637855c3acace6515318dfdbf88a6aac2ad88d..7326c28a7908f31660e5233afb090656dedd1057 100644
--- a/fs/resctrl/monitor.c
+++ b/fs/resctrl/monitor.c
@@ -361,7 +361,6 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
 {
        int cpu = smp_processor_id();
        struct rdt_mon_domain *d;
-       struct cacheinfo *ci;
        struct mbm_state *m;
        int err, ret;
        u64 tval = 0;
@@ -389,8 +388,7 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
        }
 
        /* Summing domains that share a cache, must be on a CPU for that cache. */
-       ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
-       if (!ci || ci->id != rr->ci_id)
+       if (!cpumask_test_cpu(cpu, &rr->ci->shared_cpu_map))
                return -EINVAL;
 
        /*
@@ -402,7 +400,7 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
         */
        ret = -EINVAL;
        list_for_each_entry(d, &rr->r->mon_domains, hdr.list) {
-               if (d->ci_id != rr->ci_id)
+               if (d->ci_id != rr->ci->id)
                        continue;
                err = resctrl_arch_rmid_read(rr->r, d, closid, rmid,
                                             rr->evtid, &tval, rr->arch_mon_ctx);