break;
case RAS_EVENT_ID__DEVICE_RMA:
ras_log_ring_add_log_event(ras_core, RAS_LOG_EVENT_RMA, NULL, NULL);
- ret = amdgpu_dpm_send_rma_reason(ras_core->dev);
+ if (!ras_fw_eeprom_supported(ras_core))
+ ret = amdgpu_dpm_send_rma_reason(ras_core->dev);
break;
case RAS_EVENT_ID__RESET_GPU:
ret = amdgpu_ras_mgr_reset_gpu(ras_core->dev, *(uint32_t *)data);
#define GPU_RESET_CAUSE_FATAL (RAS_CORE_RESET_GPU | 0x0002)
#define GPU_RESET_CAUSE_RMA (RAS_CORE_RESET_GPU | 0x0004)
+/*
+ * GPU health state as reported by the RAS bad-page bookkeeping.
+ *
+ * ras_fw_eeprom_check_gpu_status() returns RAS_GPU_HEALTH_NONE when no
+ * record threshold is configured, RAS_GPU_RETIRED__ECC_REACH_THRESHOLD once
+ * the device has been flagged for RMA, and RAS_GPU_HEALTH_USABLE otherwise.
+ * NOTE(review): RAS_GPU_IN_BAD_STATUS has no producer in the code visible
+ * here - confirm its meaning against the other users of this enum.
+ */
+enum ras_gpu_health_status {
+ RAS_GPU_HEALTH_NONE = 0,
+ RAS_GPU_HEALTH_USABLE = 1,
+ RAS_GPU_RETIRED__ECC_REACH_THRESHOLD = 2,
+ RAS_GPU_IN_BAD_STATUS = 3,
+};
+
enum ras_core_fw_feature_flags {
RAS_CORE_FW_FEATURE_BIT__RAS_EEPROM = BIT_ULL(0),
};
(RECORD)->retired_row_pfn = tmp; \
} while (0)
-enum ras_gpu_health_status {
- RAS_GPU_HEALTH_NONE = 0,
- RAS_GPU_HEALTH_USABLE = 1,
- RAS_GPU_RETIRED__ECC_REACH_THRESHOLD = 2,
- RAS_GPU_IN_BAD_STATUS = 3,
-};
-
enum ras_eeprom_err_type {
RAS_EEPROM_ERR_NA,
RAS_EEPROM_ERR_RECOVERABLE,
return 0;
}
+
+/*
+ * ras_fw_eeprom_check_storage_status - compare the current bad page count
+ * with the configured record threshold and log the result.
+ * @ras_core: RAS core context; its ras_fw_eeprom control block supplies
+ *            record_threshold_count and record_threshold_config.
+ *
+ * When a threshold policy is configured (record_threshold_config != 0) and
+ * the count from ras_umc_get_badpage_count() is above
+ * record_threshold_count, an error is logged. For the
+ * WARN_NONSTOP_OVER_THRESHOLD and NONSTOP_OVER_THRESHOLD policies only a
+ * service warning follows; for any other policy ras_core->is_rma is set.
+ * Otherwise the record count is reported, plus a warning once it reaches
+ * 90% of the threshold.
+ *
+ * Return: always 0.
+ */
+int ras_fw_eeprom_check_storage_status(struct ras_core_context *ras_core)
+{
+	struct ras_fw_eeprom_control *control = &ras_core->ras_fw_eeprom;
+	int bad_page_count;
+
+	bad_page_count = ras_umc_get_badpage_count(ras_core);
+
+	if ((control->record_threshold_count < bad_page_count) &&
+	    (control->record_threshold_config != 0)) {
+		/* Terminate the message with '\n' like every other log here. */
+		RAS_DEV_ERR(ras_core->dev, "RAS records:%d exceed threshold:%d\n",
+				bad_page_count, control->record_threshold_count);
+		if ((control->record_threshold_config == WARN_NONSTOP_OVER_THRESHOLD) ||
+		    (control->record_threshold_config == NONSTOP_OVER_THRESHOLD)) {
+			RAS_DEV_WARN(ras_core->dev,
+				"Please consult AMD Service Action Guide (SAG) for appropriate service procedures\n");
+		} else {
+			/* Any other policy flags the device for RMA. */
+			ras_core->is_rma = true;
+			RAS_DEV_ERR(ras_core->dev,
+				"User defined threshold is set, runtime service will be halt when threshold is reached\n");
+		}
+		return 0;
+	}
+
+	RAS_DEV_INFO(ras_core->dev,
+		"Found existing EEPROM table with %d records\n",
+		bad_page_count);
+	/* Warn if we are at 90% of the threshold or above; the comparison is
+	 * cross-multiplied so no division is needed.
+	 */
+	if (10 * bad_page_count >= 9 * control->record_threshold_count)
+		/* bad_page_count is an int, so print it with %d (was %u). */
+		RAS_DEV_WARN(ras_core->dev,
+			"RAS records:%d exceeds 90%% of threshold:%d\n",
+			bad_page_count,
+			control->record_threshold_count);
+
+	return 0;
+}
+
+/*
+ * ras_fw_eeprom_check_gpu_status - derive the GPU health state from the
+ * threshold configuration and the RMA flag.
+ * @ras_core: RAS core context to inspect.
+ *
+ * Return: RAS_GPU_HEALTH_NONE when record_threshold_config is zero,
+ * RAS_GPU_RETIRED__ECC_REACH_THRESHOLD when ras_core->is_rma is set,
+ * RAS_GPU_HEALTH_USABLE in every other case.
+ */
+enum ras_gpu_health_status
+	ras_fw_eeprom_check_gpu_status(struct ras_core_context *ras_core)
+{
+	enum ras_gpu_health_status status = RAS_GPU_HEALTH_USABLE;
+
+	/* The threshold configuration is checked before the RMA flag. */
+	if (ras_core->ras_fw_eeprom.record_threshold_config == 0)
+		status = RAS_GPU_HEALTH_NONE;
+	else if (ras_core->is_rma)
+		status = RAS_GPU_RETIRED__ECC_REACH_THRESHOLD;
+
+	return status;
+}