--- /dev/null
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#include "amdgpu.h"
+#include "amdgpu_reset.h"
+#include "amdgpu_xgmi.h"
+#include "ras_sys.h"
+#include "amdgpu_ras_mgr.h"
+#include "amdgpu_ras_cmd.h"
+#include "amdgpu_ras_process.h"
+#include "amdgpu_ras_eeprom_i2c.h"
+#include "amdgpu_ras_mp1_v13_0.h"
+#include "amdgpu_ras_nbio_v7_9.h"
+
+#define MAX_SOCKET_NUM_PER_HIVE 8
+#define MAX_AID_NUM_PER_SOCKET 4
+#define MAX_XCD_NUM_PER_AID 2
+
+/* Typical ECC bad page rate is 1 bad page per 100MB of VRAM */
+#define ESTIMATE_BAD_PAGE_THRESHOLD(size) ((size)/(100 * 1024 * 1024ULL))
+
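+/* Translate a reserved VRAM size into a bad page record budget:
+ * (size >> 21) physical 2MB dram rows, 16 EEPROM records per row.
+ */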
+#define COUNT_BAD_PAGE_THRESHOLD(size) (((size) >> 21) << 4)
+
+/* Reserve 8 physical dram rows for possible retirement.
+ * In the worst case, this loses 8 * 2MB of memory in the vram domain.
+ */
+#define RAS_RESERVED_VRAM_SIZE_DEFAULT (16ULL << 20)
+
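+/* Reset an event manager: clear the global sequence counter and mark
+ * every per-event state as having seen no event yet.
+ */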
+static void ras_mgr_init_event_mgr(struct ras_event_manager *mgr)
+{
+ struct ras_event_state *event_state;
+ int i;
+
+ memset(mgr, 0, sizeof(*mgr));
+ atomic64_set(&mgr->seqno, 0);
+
+ for (i = 0; i < ARRAY_SIZE(mgr->event_state); i++) {
+ event_state = &mgr->event_state[i];
+ event_state->last_seqno = RAS_EVENT_INVALID_ID;
+ atomic64_set(&event_state->count, 0);
+ }
+}
+
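+/* Use the event manager shared across the XGMI hive when one exists,
+ * otherwise the per-device one. It is initialized once (by node 0 on
+ * XGMI systems) and deliberately left intact during reset recovery.
+ */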
+static void amdgpu_ras_mgr_init_event_mgr(struct ras_core_context *ras_core)
+{
+ struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
+ struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+ struct ras_event_manager *event_mgr;
+ struct amdgpu_hive_info *hive;
+
+ hive = amdgpu_get_xgmi_hive(adev);
+ event_mgr = hive ? &hive->event_mgr : &ras_mgr->ras_event_mgr;
+
+	/* On an XGMI system, only node 0 initializes the shared event manager */
+ if (!amdgpu_reset_in_recovery(adev)) {
+ if (!hive || adev->gmc.xgmi.node_id == 0)
+ ras_mgr_init_event_mgr(event_mgr);
+ }
+
+ if (hive)
+ amdgpu_put_xgmi_hive(hive);
+}
+
+static int amdgpu_ras_mgr_init_aca_config(struct amdgpu_device *adev,
+ struct ras_core_config *config)
+{
+ struct ras_aca_config *aca_cfg = &config->aca_cfg;
+
+ aca_cfg->socket_num_per_hive = MAX_SOCKET_NUM_PER_HIVE;
+ aca_cfg->aid_num_per_socket = MAX_AID_NUM_PER_SOCKET;
+ aca_cfg->xcd_num_per_aid = MAX_XCD_NUM_PER_AID;
+
+ return 0;
+}
+
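+/* Bind the EEPROM backend to the device's RAS EEPROM I2C bus, pick up
+ * per-transfer length limits from the adapter quirks when present, and
+ * translate amdgpu_bad_page_threshold into a record count threshold.
+ */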
+static int amdgpu_ras_mgr_init_eeprom_config(struct amdgpu_device *adev,
+ struct ras_core_config *config)
+{
+ struct ras_eeprom_config *eeprom_cfg = &config->eeprom_cfg;
+
+ eeprom_cfg->eeprom_sys_fn = &amdgpu_ras_eeprom_i2c_sys_func;
+ eeprom_cfg->eeprom_i2c_adapter = adev->pm.ras_eeprom_i2c_bus;
+ if (eeprom_cfg->eeprom_i2c_adapter) {
+ const struct i2c_adapter_quirks *quirks =
+ ((struct i2c_adapter *)eeprom_cfg->eeprom_i2c_adapter)->quirks;
+
+ if (quirks) {
+ eeprom_cfg->max_i2c_read_len = quirks->max_read_len;
+ eeprom_cfg->max_i2c_write_len = quirks->max_write_len;
+ }
+ }
+
+ /*
+ * amdgpu_bad_page_threshold is used to config
+ * the threshold for the number of bad pages.
+ * -1: Threshold is set to default value
+ * Driver will issue a warning message when threshold is reached
+ * and continue runtime services.
+ * 0: Disable bad page retirement
+ * Driver will not retire bad pages
+ * which is intended for debugging purpose.
+ * -2: Threshold is determined by a formula
+	 * that assumes 1 bad page per 100MB of local memory.
+	 * Driver will continue runtime services when the threshold is reached.
+ * 0 < threshold < max number of bad page records in EEPROM,
+ * A user-defined threshold is set
+ * Driver will halt runtime services when this custom threshold is reached.
+ */
+ if (amdgpu_bad_page_threshold == NONSTOP_OVER_THRESHOLD)
+ eeprom_cfg->eeprom_record_threshold_count =
+ ESTIMATE_BAD_PAGE_THRESHOLD(adev->gmc.mc_vram_size);
+ else if (amdgpu_bad_page_threshold == WARN_NONSTOP_OVER_THRESHOLD)
+ eeprom_cfg->eeprom_record_threshold_count =
+ COUNT_BAD_PAGE_THRESHOLD(RAS_RESERVED_VRAM_SIZE_DEFAULT);
+ else
+ eeprom_cfg->eeprom_record_threshold_count = amdgpu_bad_page_threshold;
+
+ eeprom_cfg->eeprom_record_threshold_config = amdgpu_bad_page_threshold;
+
+ return 0;
+}
+
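+/* Hook up the MP1 (SMU) RAS callbacks for the supported 13.0.x
+ * firmware interfaces; any other version is rejected.
+ */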
+static int amdgpu_ras_mgr_init_mp1_config(struct amdgpu_device *adev,
+ struct ras_core_config *config)
+{
+ struct ras_mp1_config *mp1_cfg = &config->mp1_cfg;
+ int ret = 0;
+
+ switch (config->mp1_ip_version) {
+ case IP_VERSION(13, 0, 6):
+ case IP_VERSION(13, 0, 14):
+ case IP_VERSION(13, 0, 12):
+ mp1_cfg->mp1_sys_fn = &amdgpu_ras_mp1_sys_func_v13_0;
+ break;
+ default:
+		RAS_DEV_ERR(adev,
+			"Unsupported mp1 ip version (0x%x) for ras config!\n",
+			config->mp1_ip_version);
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+static int amdgpu_ras_mgr_init_nbio_config(struct amdgpu_device *adev,
+ struct ras_core_config *config)
+{
+ struct ras_nbio_config *nbio_cfg = &config->nbio_cfg;
+ int ret = 0;
+
+ switch (config->nbio_ip_version) {
+ case IP_VERSION(7, 9, 0):
+ nbio_cfg->nbio_sys_fn = &amdgpu_ras_nbio_sys_func_v7_9;
+ break;
+ default:
+		RAS_DEV_ERR(adev,
+			"Unsupported nbio ip version (0x%x) for ras config!\n",
+			config->nbio_ip_version);
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
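+/* Report the RAS TA state to the ras core: whether the TA is loaded,
+ * its session id, and the PSP mutex that serializes PSP commands.
+ */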
+static int amdgpu_ras_mgr_get_ras_psp_system_status(struct ras_core_context *ras_core,
+ struct ras_psp_sys_status *status)
+{
+ struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
+ struct ta_context *context = &adev->psp.ras_context.context;
+
+ status->initialized = context->initialized;
+ status->session_id = context->session_id;
+ status->psp_cmd_mutex = &adev->psp.mutex;
+
+ return 0;
+}
+
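+/* Collect the RAS TA init parameters: poison mode, dGPU vs APU mode,
+ * the active XCC/UMC masks, the number of disabled memory channels and
+ * the current NPS (memory partition) mode.
+ */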
+static int amdgpu_ras_mgr_get_ras_ta_init_param(struct ras_core_context *ras_core,
+ struct ras_ta_init_param *ras_ta_param)
+{
+ struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
+ uint32_t nps_mode;
+
+ if (amdgpu_ras_is_poison_mode_supported(adev))
+ ras_ta_param->poison_mode_en = 1;
+
+ if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu)
+ ras_ta_param->dgpu_mode = 1;
+
+ ras_ta_param->xcc_mask = adev->gfx.xcc_mask;
+ ras_ta_param->channel_dis_num = hweight32(adev->gmc.m_half_use) * 2;
+
+ ras_ta_param->active_umc_mask = adev->umc.active_mask;
+
+ if (!amdgpu_ras_mgr_get_curr_nps_mode(adev, &nps_mode))
+ ras_ta_param->nps_mode = nps_mode;
+
+ return 0;
+}
+
+const struct ras_psp_sys_func amdgpu_ras_psp_sys_func = {
+ .get_ras_psp_system_status = amdgpu_ras_mgr_get_ras_psp_system_status,
+ .get_ras_ta_init_param = amdgpu_ras_mgr_get_ras_ta_init_param,
+};
+
+static int amdgpu_ras_mgr_init_psp_config(struct amdgpu_device *adev,
+ struct ras_core_config *config)
+{
+ struct ras_psp_config *psp_cfg = &config->psp_cfg;
+
+ psp_cfg->psp_sys_fn = &amdgpu_ras_psp_sys_func;
+
+ return 0;
+}
+
+static int amdgpu_ras_mgr_init_umc_config(struct amdgpu_device *adev,
+ struct ras_core_config *config)
+{
+ struct ras_umc_config *umc_cfg = &config->umc_cfg;
+
+ umc_cfg->umc_vram_type = adev->gmc.vram_type;
+
+ return 0;
+}
+
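+/* Build a ras_core_config from the discovered IP versions and the
+ * per-block helpers above, then create the ras core context. Helper
+ * failures are not checked here, so ras_core_create() sees whatever
+ * config results.
+ */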
+static struct ras_core_context *amdgpu_ras_mgr_create_ras_core(struct amdgpu_device *adev)
+{
+ struct ras_core_config init_config;
+
+ memset(&init_config, 0, sizeof(init_config));
+
+ init_config.umc_ip_version = amdgpu_ip_version(adev, UMC_HWIP, 0);
+ init_config.mp1_ip_version = amdgpu_ip_version(adev, MP1_HWIP, 0);
+ init_config.gfx_ip_version = amdgpu_ip_version(adev, GC_HWIP, 0);
+ init_config.nbio_ip_version = amdgpu_ip_version(adev, NBIO_HWIP, 0);
+	init_config.psp_ip_version = amdgpu_ip_version(adev, MP0_HWIP, 0);
+
+ if (init_config.umc_ip_version == IP_VERSION(12, 0, 0))
+ init_config.aca_ip_version = IP_VERSION(1, 0, 0);
+
+ init_config.sys_fn = &amdgpu_ras_sys_fn;
+ init_config.ras_eeprom_supported = true;
+ init_config.poison_supported =
+ amdgpu_ras_is_poison_mode_supported(adev);
+
+ amdgpu_ras_mgr_init_aca_config(adev, &init_config);
+ amdgpu_ras_mgr_init_eeprom_config(adev, &init_config);
+ amdgpu_ras_mgr_init_mp1_config(adev, &init_config);
+ amdgpu_ras_mgr_init_nbio_config(adev, &init_config);
+ amdgpu_ras_mgr_init_psp_config(adev, &init_config);
+ amdgpu_ras_mgr_init_umc_config(adev, &init_config);
+
+ return ras_core_create(&init_config);
+}
+
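+/* Software init: allocate the manager, create the ras core context,
+ * and bring up the process layer and event manager.
+ */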
+static int amdgpu_ras_mgr_sw_init(struct amdgpu_ip_block *ip_block)
+{
+ struct amdgpu_device *adev = ip_block->adev;
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ struct amdgpu_ras_mgr *ras_mgr;
+ int ret = 0;
+
+ ras_mgr = kzalloc(sizeof(*ras_mgr), GFP_KERNEL);
+ if (!ras_mgr)
+		return -ENOMEM;
+
+ con->ras_mgr = ras_mgr;
+ ras_mgr->adev = adev;
+
+ ras_mgr->ras_core = amdgpu_ras_mgr_create_ras_core(adev);
+ if (!ras_mgr->ras_core) {
+ RAS_DEV_ERR(adev, "Failed to create ras core!\n");
+ ret = -EINVAL;
+ goto err;
+ }
+
+ ras_mgr->ras_core->dev = adev;
+
+ amdgpu_ras_process_init(adev);
+ ras_core_sw_init(ras_mgr->ras_core);
+ amdgpu_ras_mgr_init_event_mgr(ras_mgr->ras_core);
+ return 0;
+
+err:
+ kfree(ras_mgr);
+ return ret;
+}
+
+static int amdgpu_ras_mgr_sw_fini(struct amdgpu_ip_block *ip_block)
+{
+ struct amdgpu_device *adev = ip_block->adev;
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ struct amdgpu_ras_mgr *ras_mgr = (struct amdgpu_ras_mgr *)con->ras_mgr;
+
+ if (!ras_mgr)
+ return 0;
+
+ amdgpu_ras_process_fini(adev);
+ ras_core_sw_fini(ras_mgr->ras_core);
+ ras_core_destroy(ras_mgr->ras_core);
+ ras_mgr->ras_core = NULL;
+
+ kfree(con->ras_mgr);
+ con->ras_mgr = NULL;
+
+ return 0;
+}
+
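+/* Hardware init: bring up the ras core, then mark the manager ready so
+ * the interrupt entry points below stop rejecting requests.
+ */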
+static int amdgpu_ras_mgr_hw_init(struct amdgpu_ip_block *ip_block)
+{
+ struct amdgpu_device *adev = ip_block->adev;
+ struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+ int ret;
+
+ if (!ras_mgr || !ras_mgr->ras_core)
+ return -EINVAL;
+
+ ret = ras_core_hw_init(ras_mgr->ras_core);
+ if (ret) {
+ RAS_DEV_ERR(adev, "Failed to initialize ras core!\n");
+ return ret;
+ }
+
+ ras_mgr->ras_is_ready = true;
+
+	RAS_DEV_INFO(adev, "AMDGPU RAS is ready.\n");
+ return 0;
+}
+
+static int amdgpu_ras_mgr_hw_fini(struct amdgpu_ip_block *ip_block)
+{
+ struct amdgpu_device *adev = ip_block->adev;
+ struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+
+ if (!ras_mgr || !ras_mgr->ras_core)
+ return -EINVAL;
+
+ ras_core_hw_fini(ras_mgr->ras_core);
+
+ ras_mgr->ras_is_ready = false;
+
+ return 0;
+}
+
+struct amdgpu_ras_mgr *amdgpu_ras_mgr_get_context(struct amdgpu_device *adev)
+{
+ if (!adev || !adev->psp.ras_context.ras)
+ return NULL;
+
+ return (struct amdgpu_ras_mgr *)adev->psp.ras_context.ras->ras_mgr;
+}
+
+static const struct amd_ip_funcs ras_v1_0_ip_funcs = {
+ .name = "ras_v1_0",
+ .sw_init = amdgpu_ras_mgr_sw_init,
+ .sw_fini = amdgpu_ras_mgr_sw_fini,
+ .hw_init = amdgpu_ras_mgr_hw_init,
+ .hw_fini = amdgpu_ras_mgr_hw_fini,
+};
+
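+/* Enable or disable the unified ras core. Not permitted on SR-IOV
+ * virtual functions.
+ */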
+int amdgpu_enable_unified_ras(struct amdgpu_device *adev, bool enable)
+{
+ struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+
+ if (!ras_mgr || !ras_mgr->ras_core)
+ return -EPERM;
+
+ if (amdgpu_sriov_vf(adev))
+ return -EPERM;
+
+	RAS_DEV_INFO(adev, "%s amdgpu unified ras!\n",
+		     enable ? "Enable" : "Disable");
+ return ras_core_set_status(ras_mgr->ras_core, enable);
+}
+
+bool amdgpu_unified_ras_enabled(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+
+ if (!ras_mgr || !ras_mgr->ras_core)
+ return false;
+
+ if (amdgpu_sriov_vf(adev))
+ return false;
+
+ return ras_core_is_enabled(ras_mgr->ras_core);
+}
+
+static bool amdgpu_ras_mgr_is_ready(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+
+ if (ras_mgr && ras_mgr->ras_core && ras_mgr->ras_is_ready &&
+ ras_core_is_ready(ras_mgr->ras_core))
+ return true;
+
+ return false;
+}
+
+int amdgpu_ras_mgr_handle_fatal_interrupt(struct amdgpu_device *adev, void *data)
+{
+ struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+
+ if (!amdgpu_ras_mgr_is_ready(adev))
+ return -EPERM;
+
+ return ras_core_handle_nbio_irq(ras_mgr->ras_core, data);
+}
+
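+/* Generate a fresh sequence number for a RAS event. Deferred-error and
+ * poison-consumption seqnos are additionally pushed to the ras core via
+ * ras_core_put_seqno(); a failure there indicates an interrupt flood.
+ */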
+uint64_t amdgpu_ras_mgr_gen_ras_event_seqno(struct amdgpu_device *adev,
+ enum ras_seqno_type seqno_type)
+{
+ struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+ int ret;
+ uint64_t seq_no;
+
+ if (!amdgpu_ras_mgr_is_ready(adev) ||
+ (seqno_type >= RAS_SEQNO_TYPE_COUNT_MAX))
+ return 0;
+
+ seq_no = ras_core_gen_seqno(ras_mgr->ras_core, seqno_type);
+
+ if ((seqno_type == RAS_SEQNO_TYPE_DE) ||
+ (seqno_type == RAS_SEQNO_TYPE_POISON_CONSUMPTION)) {
+ ret = ras_core_put_seqno(ras_mgr->ras_core, seqno_type, seq_no);
+ if (ret)
+			RAS_DEV_WARN(adev, "There are too many ras interrupts!\n");
+ }
+
+ return seq_no;
+}
+
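+/* Controller (poison creation) interrupt entry point. UMC interrupts
+ * get a deferred-error seqno and go to the UMC handler; with poison
+ * enabled, any other block is treated as an unexpected interrupt.
+ */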
+int amdgpu_ras_mgr_handle_controller_interrupt(struct amdgpu_device *adev, void *data)
+{
+ struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+ struct ras_ih_info *ih_info = (struct ras_ih_info *)data;
+ uint64_t seq_no = 0;
+ int ret = 0;
+
+ if (!amdgpu_ras_mgr_is_ready(adev))
+ return -EPERM;
+
+ if (ih_info && (ih_info->block == AMDGPU_RAS_BLOCK__UMC)) {
+ if (ras_mgr->ras_core->poison_supported) {
+ seq_no = amdgpu_ras_mgr_gen_ras_event_seqno(adev, RAS_SEQNO_TYPE_DE);
+ RAS_DEV_INFO(adev,
+ "{%llu} RAS poison is created, no user action is needed.\n",
+ seq_no);
+ }
+
+ ret = amdgpu_ras_process_handle_umc_interrupt(adev, ih_info);
+ } else if (ras_mgr->ras_core->poison_supported) {
+ ret = amdgpu_ras_process_handle_unexpected_interrupt(adev, ih_info);
+ } else {
+ RAS_DEV_WARN(adev,
+ "No RAS interrupt handler for non-UMC block with poison disabled.\n");
+ }
+
+ return ret;
+}
+
+int amdgpu_ras_mgr_handle_consumer_interrupt(struct amdgpu_device *adev, void *data)
+{
+ if (!amdgpu_ras_mgr_is_ready(adev))
+ return -EPERM;
+
+ return amdgpu_ras_process_handle_consumption_interrupt(adev, data);
+}
+
+int amdgpu_ras_mgr_update_ras_ecc(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+
+ if (!amdgpu_ras_mgr_is_ready(adev))
+ return -EPERM;
+
+ return ras_core_update_ecc_info(ras_mgr->ras_core);
+}
+
+int amdgpu_ras_mgr_reset_gpu(struct amdgpu_device *adev, uint32_t flags)
+{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+ if (!amdgpu_ras_mgr_is_ready(adev))
+ return -EPERM;
+
+ con->gpu_reset_flags |= flags;
+ return amdgpu_ras_reset_gpu(adev);
+}
+
+bool amdgpu_ras_mgr_check_eeprom_safety_watermark(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+
+ if (!amdgpu_ras_mgr_is_ready(adev))
+ return false;
+
+ return ras_eeprom_check_safety_watermark(ras_mgr->ras_core);
+}
+
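+/* Query the current NPS (memory partition) mode from the ras core,
+ * rejecting anything outside the NPS1..NPS8 range.
+ */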
+int amdgpu_ras_mgr_get_curr_nps_mode(struct amdgpu_device *adev,
+ uint32_t *nps_mode)
+{
+ struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+ uint32_t mode;
+
+ if (!amdgpu_ras_mgr_is_ready(adev))
+ return -EINVAL;
+
+ mode = ras_core_get_curr_nps_mode(ras_mgr->ras_core);
+ if (!mode || mode > AMDGPU_NPS8_PARTITION_MODE)
+ return -EINVAL;
+
+ *nps_mode = mode;
+
+ return 0;
+}
+
+bool amdgpu_ras_mgr_check_retired_addr(struct amdgpu_device *adev,
+ uint64_t addr)
+{
+ struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+
+ if (!amdgpu_ras_mgr_is_ready(adev))
+ return false;
+
+ return ras_umc_check_retired_addr(ras_mgr->ras_core, addr);
+}
+
+bool amdgpu_ras_mgr_is_rma(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+
+ if (!ras_mgr || !ras_mgr->ras_core || !ras_mgr->ras_is_ready)
+ return false;
+
+ return ras_core_gpu_is_rma(ras_mgr->ras_core);
+}