]> git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
drm/xe/gt_throttle: Avoid TOCTOU when monitoring reasons
authorLucas De Marchi <lucas.demarchi@intel.com>
Tue, 4 Nov 2025 22:20:51 +0000 (14:20 -0800)
committerLucas De Marchi <lucas.demarchi@intel.com>
Wed, 5 Nov 2025 16:52:13 +0000 (08:52 -0800)
It's currently not possible to safely monitor if there's throttling
happening and what are the reasons. The approach of reading the status
and then reading the reasons is not reliable as by the time sysadmin
reads the reason, the throttling could not be happening anymore.

Previous tentative to fix that[1] was breaking the ABI and potentially
sysadmin's scripts. This takes a different approach of adding and
documenting the additional attribute. It's still valuable, though
redundant, to provide the simpler 0/1 interface.

In order to avoid userspace knowledge on the bitmask meaning and to be
able to maintain the kernel side in sync with possible changes in
future, just walk the attribute group and check what are the masks that
match the value read.

[1] https://lore.kernel.org/intel-xe/20241025092238.167042-1-raag.jadav@intel.com/

Cc: Raag Jadav <raag.jadav@intel.com>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Reviewed-by: Raag Jadav <raag.jadav@intel.com>
Link: https://patch.msgid.link/20251104-gt-throttle-cri-v5-1-4948b060bbfd@intel.com
Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
drivers/gpu/drm/xe/xe_gt_throttle.c

index fa7068aac3344438c977351b87c1f00751a45c14..82c5fbcdfbe3e2ea420e96b34eb44dabd7e7c0d7 100644 (file)
  * Their availability depend on the platform and some may not be visible if that
  * reason is not available.
  *
+ * The ``reasons`` attribute can be used by sysadmin to monitor all possible
+ * reasons for throttling and report them. It's preferred over monitoring
+ * ``status`` and then reading the reason from individual attributes since that
+ * is racy. If there's no throttling happening, "none" is returned.
+ *
  * The following attributes are available on Crescent Island platform:
  *
- * - ``status``: Overall throttle status
+ * - ``status``: Overall throttle status (0: no throttling, 1: throttling)
+ * - ``reasons``: Array of reasons causing throttling separated by space
  * - ``reason_pl1``: package PL1
  * - ``reason_pl2``: package PL2
  * - ``reason_pl4``: package PL4
@@ -43,7 +49,8 @@
  *
  * Other platforms support the following reasons:
  *
- * - ``status``: Overall status
+ * - ``status``: Overall throttle status (0: no throttling, 1: throttling)
+ * - ``reasons``: Array of reasons causing throttling separated by space
  * - ``reason_pl1``: package PL1
  * - ``reason_pl2``: package PL2
  * - ``reason_pl4``: package PL4, Iccmax etc.
@@ -111,12 +118,57 @@ static ssize_t reason_show(struct kobject *kobj,
        return sysfs_emit(buff, "%u\n", is_throttled_by(gt, ta->mask));
 }
 
+static const struct attribute_group *get_platform_throttle_group(struct xe_device *xe);
+
+static ssize_t reasons_show(struct kobject *kobj,
+                           struct kobj_attribute *attr, char *buff)
+{
+       struct xe_gt *gt = throttle_to_gt(kobj);
+       struct xe_device *xe = gt_to_xe(gt);
+       const struct attribute_group *group;
+       struct attribute **pother;
+       ssize_t ret = 0;
+       u32 reasons;
+
+       reasons = xe_gt_throttle_get_limit_reasons(gt);
+       if (!reasons)
+               goto ret_none;
+
+       group = get_platform_throttle_group(xe);
+       for (pother = group->attrs; *pother; pother++) {
+               struct kobj_attribute *kattr = container_of(*pother, struct kobj_attribute, attr);
+               struct throttle_attribute *other_ta = kobj_attribute_to_throttle(kattr);
+
+               if (other_ta->mask != U32_MAX && reasons & other_ta->mask)
+                       ret += sysfs_emit_at(buff, ret, "%s ", (*pother)->name);
+       }
+
+       if (drm_WARN_ONCE(&xe->drm, !ret, "Unknown reason: %#x\n", reasons))
+               goto ret_none;
+
+       /* Drop extra space from last iteration above */
+       ret--;
+       ret += sysfs_emit_at(buff, ret, "\n");
+
+       return ret;
+
+ret_none:
+       return sysfs_emit(buff, "none\n");
+}
+
 #define THROTTLE_ATTR_RO(name, _mask)                          \
        struct throttle_attribute attr_##name = {               \
                .attr = __ATTR(name, 0444, reason_show, NULL),  \
                .mask = _mask,                                  \
        }
 
+#define THROTTLE_ATTR_RO_FUNC(name, _mask, _show)              \
+       struct throttle_attribute attr_##name = {               \
+               .attr = __ATTR(name, 0444, _show, NULL),        \
+               .mask = _mask,                                  \
+       }
+
+static THROTTLE_ATTR_RO_FUNC(reasons, 0, reasons_show);
 static THROTTLE_ATTR_RO(status, U32_MAX);
 static THROTTLE_ATTR_RO(reason_pl1, POWER_LIMIT_1_MASK);
 static THROTTLE_ATTR_RO(reason_pl2, POWER_LIMIT_2_MASK);
@@ -128,6 +180,7 @@ static THROTTLE_ATTR_RO(reason_vr_thermalert, VR_THERMALERT_MASK);
 static THROTTLE_ATTR_RO(reason_vr_tdc, VR_TDC_MASK);
 
 static struct attribute *throttle_attrs[] = {
+       &attr_reasons.attr.attr,
        &attr_status.attr.attr,
        &attr_reason_pl1.attr.attr,
        &attr_reason_pl2.attr.attr,
@@ -153,6 +206,7 @@ static THROTTLE_ATTR_RO(reason_psys_crit, PSYS_CRIT_MASK);
 
 static struct attribute *cri_throttle_attrs[] = {
        /* Common */
+       &attr_reasons.attr.attr,
        &attr_status.attr.attr,
        &attr_reason_pl1.attr.attr,
        &attr_reason_pl2.attr.attr,