]> git.ipfire.org Git - thirdparty/kernel/stable.git/commitdiff
drm/amd: Add Suspend/Hibernate notification callback support
author: Mario Limonciello <mario.limonciello@amd.com>
Thu, 28 Nov 2024 03:26:56 +0000 (21:26 -0600)
committer: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 22 May 2025 12:12:14 +0000 (14:12 +0200)
[ Upstream commit 2965e6355dcdf157b5fafa25a2715f00064da8bf ]

As part of the suspend sequence VRAM needs to be evicted on dGPUs.
In order to make suspend/resume more reliable we moved this into
the pmops prepare() callback so that, when suspending under high memory
usage, the suspend sequence would fail but the system could remain operational.

Another class of issues exists, though, where due to memory fragmentation
there isn't a large enough contiguous space and swap isn't accessible.

Add support for a suspend/hibernate notification callback that could
evict VRAM before tasks are frozen. This should allow paging out to swap
if necessary.

Link: https://github.com/ROCm/ROCK-Kernel-Driver/issues/174
Link: https://gitlab.freedesktop.org/drm/amd/-/issues/3476
Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/2362
Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3781
Reviewed-by: Lijo Lazar <lijo.lazar@amd.com>
Link: https://lore.kernel.org/r/20241128032656.2090059-2-superm1@kernel.org
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Stable-dep-of: d0ce1aaa8531 ("Revert "drm/amd: Stop evicting resources on APUs in suspend"")
Signed-off-by: Sasha Levin <sashal@kernel.org>
drivers/gpu/drm/amd/amdgpu/amdgpu.h
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

index ed4ebc6d32695f778b8a84530639383fc53b2afa..9cda2ecaf69b558102e881011fb695e29d3cc021 100644 (file)
@@ -788,6 +788,7 @@ struct amdgpu_device {
        bool                            need_swiotlb;
        bool                            accel_working;
        struct notifier_block           acpi_nb;
+       struct notifier_block           pm_nb;
        struct amdgpu_i2c_chan          *i2c_bus[AMDGPU_MAX_I2C_BUS];
        struct debugfs_blob_wrapper     debugfs_vbios_blob;
        struct debugfs_blob_wrapper     debugfs_discovery_blob;
index e22e2a1df730c753b0d218e8c38e7e028d1efefd..9fd9424ec8f71d1d06fcd17b6312fe6c2fa872ea 100644 (file)
@@ -142,6 +142,8 @@ const char *amdgpu_asic_name[] = {
 };
 
 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
+static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
+                                    void *data);
 
 /**
  * DOC: pcie_replay_count
@@ -3922,6 +3924,11 @@ fence_driver_init:
 
        amdgpu_device_check_iommu_direct_map(adev);
 
+       adev->pm_nb.notifier_call = amdgpu_device_pm_notifier;
+       r = register_pm_notifier(&adev->pm_nb);
+       if (r)
+               goto failed;
+
        return 0;
 
 release_ras_con:
@@ -3983,6 +3990,8 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
        flush_delayed_work(&adev->delayed_init_work);
        adev->shutdown = true;
 
+       unregister_pm_notifier(&adev->pm_nb);
+
        /* make sure IB test finished before entering exclusive mode
         * to avoid preemption on IB test
         */
@@ -4109,6 +4118,41 @@ static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
 /*
  * Suspend & resume.
  */
+/**
+ * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events
+ * @nb: notifier block; must be the pm_nb member embedded in a
+ *      struct amdgpu_device, which is recovered via container_of()
+ * @mode: PM notification event; only PM_HIBERNATION_PREPARE and
+ *        PM_SUSPEND_PREPARE are acted upon, every other value is ignored
+ * @data: unused
+ *
+ * This function is called when the system is about to suspend or hibernate.
+ * It is used to evict resources from the device before the system goes to
+ * sleep while there is still access to swap.
+ *
+ * On PM_HIBERNATION_PREPARE the device's in_s4 flag is additionally set
+ * before the eviction is attempted.
+ *
+ * Return: always NOTIFY_DONE; an eviction failure is logged as a warning
+ * and is not reported back through the return value.
+ */
+static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
+                                    void *data)
+{
+       struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb);
+       int r;
+
+       switch (mode) {
+       case PM_HIBERNATION_PREPARE:
+               adev->in_s4 = true; /* mark hibernation before evicting */
+               fallthrough; /* hibernation evicts exactly like suspend */
+       case PM_SUSPEND_PREPARE:
+               r = amdgpu_device_evict_resources(adev);
+               /*
+                * This is considered non-fatal at this time because
+                * amdgpu_device_prepare() will also fatally evict resources.
+                * See https://gitlab.freedesktop.org/drm/amd/-/issues/3781
+                */
+               if (r)
+                       drm_warn(adev_to_drm(adev), "Failed to evict resources, freeze active processes if problems occur: %d\n", r);
+               break;
+       }
+
+       return NOTIFY_DONE; /* same result for handled and unhandled events */
+}
+
+
 /**
  * amdgpu_device_prepare - prepare for device suspend
  *
@@ -4148,7 +4192,7 @@ int amdgpu_device_prepare(struct drm_device *dev)
        return 0;
 
 unprepare:
-       adev->in_s0ix = adev->in_s3 = false;
+       adev->in_s0ix = adev->in_s3 = adev->in_s4 = false;
 
        return r;
 }
index bacf2e5de2abcef8fa52d085d739a2c491db50e6..c5727c0e6ce1c7ab4dd1e0c6b14c4d250f388f86 100644 (file)
@@ -2463,7 +2463,6 @@ static int amdgpu_pmops_freeze(struct device *dev)
        struct amdgpu_device *adev = drm_to_adev(drm_dev);
        int r;
 
-       adev->in_s4 = true;
        r = amdgpu_device_suspend(drm_dev, true);
        if (r)
                return r;