]> git.ipfire.org Git - thirdparty/linux.git/commitdiff
drm/amdgpu: Estimate RAS reservation when report capacity v2
authorHawking Zhang <Hawking.Zhang@amd.com>
Tue, 28 May 2024 05:52:46 +0000 (13:52 +0800)
committerAlex Deucher <alexander.deucher@amd.com>
Wed, 5 Jun 2024 14:57:53 +0000 (10:57 -0400)
Add an estimate of how much vram we need to reserve for RAS
when calculating the total available vram.

v2: apply the change to MP0 v13_0_2 and v13_0_14

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h

index 3af00b57cd8a4e254e96f94435de1b877b1ad8c8..11672bfe4fad69deac4b6e6cee93cf599204afcc 100644 (file)
@@ -172,6 +172,8 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
 {
        uint64_t reserved_for_pt =
                ESTIMATE_PT_SIZE(amdgpu_amdkfd_total_mem_size);
+       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+       uint64_t reserved_for_ras = (con ? con->reserved_pages_in_bytes : 0);
        size_t system_mem_needed, ttm_mem_needed, vram_needed;
        int ret = 0;
        uint64_t vram_size = 0;
@@ -220,7 +222,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
            (kfd_mem_limit.ttm_mem_used + ttm_mem_needed >
             kfd_mem_limit.max_ttm_mem_limit) ||
            (adev && xcp_id >= 0 && adev->kfd.vram_used[xcp_id] + vram_needed >
-            vram_size - reserved_for_pt - atomic64_read(&adev->vram_pin_size))) {
+            vram_size - reserved_for_pt - reserved_for_ras - atomic64_read(&adev->vram_pin_size))) {
                ret = -ENOMEM;
                goto release;
        }
@@ -1673,6 +1675,8 @@ size_t amdgpu_amdkfd_get_available_memory(struct amdgpu_device *adev,
 {
        uint64_t reserved_for_pt =
                ESTIMATE_PT_SIZE(amdgpu_amdkfd_total_mem_size);
+       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+       uint64_t reserved_for_ras = (con ? con->reserved_pages_in_bytes : 0);
        ssize_t available;
        uint64_t vram_available, system_mem_available, ttm_mem_available;
 
@@ -1680,7 +1684,8 @@ size_t amdgpu_amdkfd_get_available_memory(struct amdgpu_device *adev,
        vram_available = KFD_XCP_MEMORY_SIZE(adev, xcp_id)
                - adev->kfd.vram_used_aligned[xcp_id]
                - atomic64_read(&adev->vram_pin_size)
-               - reserved_for_pt;
+               - reserved_for_pt
+               - reserved_for_ras;
 
        if (adev->flags & AMD_IS_APU) {
                system_mem_available = no_system_mem_limit ?
index db4a811cc0f5dd3922e3344ab4b9002e56c8d002..45b696524541b781fff739924419302aa836174b 100644 (file)
@@ -3298,6 +3298,24 @@ static void amdgpu_ras_event_mgr_init(struct amdgpu_device *adev)
                amdgpu_put_xgmi_hive(hive);
 }
 
+static void amdgpu_ras_init_reserved_vram_size(struct amdgpu_device *adev)
+{
+       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+       /* Set the RAS VRAM reservation; nothing to do without a RAS context or with AMD_IS_APU set. */
+       if (!con || (adev->flags & AMD_IS_APU))
+               return;
+       /* Only the MP0 IP versions listed here get a reservation; any other version is left unchanged. */
+       switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) {
+       case IP_VERSION(13, 0, 2):
+       case IP_VERSION(13, 0, 6):
+       case IP_VERSION(13, 0, 14):
+               con->reserved_pages_in_bytes = AMDGPU_RAS_RESERVED_VRAM_SIZE;
+               break;
+       default:
+               break;
+       }
+}
+
 int amdgpu_ras_init(struct amdgpu_device *adev)
 {
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
@@ -3403,6 +3421,8 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
        /* Get RAS schema for particular SOC */
        con->schema = amdgpu_get_ras_schema(adev);
 
+       amdgpu_ras_init_reserved_vram_size(adev);
+
        if (amdgpu_ras_fs_init(adev)) {
                r = -EINVAL;
                goto release_con;
index d06c01b978cd8370cc58d0448ce956296f0e6b5e..36a4de89c27d59ccc5698e0d771143b8907f8012 100644 (file)
@@ -64,6 +64,9 @@ struct amdgpu_iv_entry;
 #define AMDGPU_RAS_FEATURES_SOCKETID_SHIFT 29
 #define AMDGPU_RAS_FEATURES_SOCKETID_MASK 0xe0000000
 
+/* Reserve 8 physical dram rows for possible retirement.
+ * In the worst case, this loses 8 * 2MB of memory in the vram domain */
+#define AMDGPU_RAS_RESERVED_VRAM_SIZE  (16ULL << 20)
 /* The high three bits indicates socketid */
 #define AMDGPU_RAS_GET_FEATURES(val)  ((val) & ~AMDGPU_RAS_FEATURES_SOCKETID_MASK)
 
@@ -541,6 +544,7 @@ struct amdgpu_ras {
        struct ras_event_manager __event_mgr;
        struct ras_event_manager *event_mgr;
 
+       uint64_t reserved_pages_in_bytes;
 };
 
 struct ras_fs_data {