struct queue *q, const uint32_t *restore_sdma_id);
static int reset_queues_on_hws_hang(struct device_queue_manager *dqm, bool is_sdma);
+static int resume_all_queues_mes(struct device_queue_manager *dqm);
+static int suspend_all_queues_mes(struct device_queue_manager *dqm);
+static struct queue *find_queue_by_doorbell_offset(struct device_queue_manager *dqm,
+ u32 doorbell_offset);
+static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q,
+ struct qcm_process_device *qpd);
static inline
enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type type)
return r;
}
-static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
- struct qcm_process_device *qpd)
+static int remove_queue_mes_on_reset_option(struct device_queue_manager *dqm, struct queue *q,
+ struct qcm_process_device *qpd,
+ bool is_for_reset,
+ bool flush_mes_queue)
{
struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
int r;
struct mes_remove_queue_input queue_input;
+ /* queue was already removed during reset */
+ if (q->properties.is_reset)
+ return 0;
+
if (!dqm->sched_running || dqm->sched_halt)
return 0;
if (!down_read_trylock(&adev->reset_domain->sem))
memset(&queue_input, 0x0, sizeof(struct mes_remove_queue_input));
queue_input.doorbell_offset = q->properties.doorbell_off;
queue_input.gang_context_addr = q->gang_ctx_gpu_addr;
+ queue_input.remove_queue_after_reset = flush_mes_queue;
queue_input.xcc_id = ffs(dqm->dev->xcc_mask) - 1;
amdgpu_mes_lock(&adev->mes);
amdgpu_mes_unlock(&adev->mes);
up_read(&adev->reset_domain->sem);
+ if (is_for_reset)
+ return r;
+
if (r) {
+ if (!suspend_all_queues_mes(dqm))
+ return resume_all_queues_mes(dqm);
+
dev_err(adev->dev, "failed to remove hardware queue from MES, doorbell=0x%x\n",
q->properties.doorbell_off);
dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n");
return r;
}
+static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
+ struct qcm_process_device *qpd)
+{
+ return remove_queue_mes_on_reset_option(dqm, q, qpd, false, false);
+}
+
static int remove_all_kfd_queues_mes(struct device_queue_manager *dqm)
{
struct device_process_node *cur;
return retval;
}
+static int reset_queues_mes(struct device_queue_manager *dqm)
+{
+ struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
+ int hqd_info_size = adev->mes.hung_queue_hqd_info_offset;
+ int num_hung = 0, r = 0, i, pipe, queue, queue_type;
+ u32 *hung_array = dqm->hung_db_array;
+ struct amdgpu_mes_hung_queue_hqd_info *hqd_info = dqm->hqd_info;
+ struct kfd_process_device *pdd;
+ struct queue *q;
+
+ if (!amdgpu_mes_queue_reset_by_mes_supported(adev)) {
+ r = -ENOTRECOVERABLE;
+ goto fail;
+ }
+
+ /* reset should be used only in dqm locked queue reset */
+ if (WARN_ON(dqm->detect_hang_count > 0))
+ return 0;
+
+ if (!amdgpu_gpu_recovery) {
+ r = -ENOTRECOVERABLE;
+ goto fail;
+ }
+
+ if (!hung_array || !hqd_info) {
+ r = -ENOMEM;
+ goto fail;
+ }
+
+ memset(hqd_info, 0, hqd_info_size * sizeof(struct amdgpu_mes_hung_queue_hqd_info));
+
+ /*
+ * AMDGPU_RING_TYPE_COMPUTE parameter does not matter if called
+ * post suspend_all as reset & detect will return all hung queue types.
+ *
+ * Passed parameter is for targeting queues not scheduled by MES add_queue.
+ */
+ r = amdgpu_mes_detect_and_reset_hung_queues(adev, AMDGPU_RING_TYPE_COMPUTE,
+ false, &num_hung, hung_array, ffs(dqm->dev->xcc_mask) - 1);
+
+ if (!num_hung || r) {
+ r = -ENOTRECOVERABLE;
+ goto fail;
+ }
+
+ /* MES resets queue/pipe and cleans up internally */
+ for (i = 0; i < num_hung; i++) {
+ hqd_info[i].bit0_31 = hung_array[i + hqd_info_size];
+ pipe = hqd_info[i].pipe_index;
+ queue = hqd_info[i].queue_index;
+ queue_type = hqd_info[i].queue_type;
+
+ if (queue_type != MES_QUEUE_TYPE_COMPUTE &&
+ queue_type != MES_QUEUE_TYPE_SDMA) {
+ pr_warn("Unsupported hung queue reset type: %d\n", queue_type);
+ hung_array[i] = AMDGPU_MES_INVALID_DB_OFFSET;
+ continue;
+ }
+
+ q = find_queue_by_doorbell_offset(dqm, hung_array[i]);
+ if (!q) {
+ r = -ENOTRECOVERABLE;
+ goto fail;
+ }
+
+ pdd = kfd_get_process_device_data(q->device, q->process);
+ if (!pdd) {
+ r = -ENODEV;
+ goto fail;
+ }
+
+ pr_warn("Hang detected doorbell %x pipe %d queue %d type %d\n",
+ hung_array[i], pipe, queue, queue_type);
+ /* Proceed remove_queue with reset=true */
+ remove_queue_mes_on_reset_option(dqm, q, &pdd->qpd, true, false);
+ set_queue_as_reset(dqm, q, &pdd->qpd);
+ }
+
+ dqm->detect_hang_count = num_hung;
+ kfd_signal_reset_event(dqm->dev);
+
+fail:
+ dqm->detect_hang_count = 0;
+ return r;
+}
+
static int suspend_all_queues_mes(struct device_queue_manager *dqm)
{
struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
up_read(&adev->reset_domain->sem);
if (r) {
+ if (!reset_queues_mes(dqm))
+ return 0;
+
dev_err(adev->dev, "failed to suspend gangs from MES\n");
dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n");
kfd_hws_hang(dqm);
{
struct device *dev = dqm->dev->adev->dev;
int retval, num_hw_queue_slots;
+ struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
+ int hung_array_size = amdgpu_mes_get_hung_queue_db_array_size(adev);
+ int hqd_info_size = adev->mes.hung_queue_hqd_info_offset;
dqm_lock(dqm);
goto fail_detect_hang_buffer;
}
+ dqm->hung_db_array = kzalloc(hung_array_size * sizeof(u32), GFP_KERNEL);
+ dqm->hqd_info = kzalloc(
+ hqd_info_size * sizeof(struct amdgpu_mes_hung_queue_hqd_info),
+ GFP_KERNEL);
+
dqm_unlock(dqm);
return 0;
pm_uninit(&dqm->packet_mgr);
kfree(dqm->detect_hang_info);
dqm->detect_hang_info = NULL;
+ kfree(dqm->hung_db_array);
+ kfree(dqm->hqd_info);
+
dqm_unlock(dqm);
return ret;
q->properties.queue_id, pdd->process->lead_thread->pid);
pdd->has_reset_queue = true;
+ q->properties.is_reset = true;
if (q->properties.is_active) {
q->properties.is_active = false;
decrement_queue_count(dqm, qpd, q);
return NULL;
}
+static struct queue *find_queue_by_doorbell_offset(struct device_queue_manager *dqm, u32 doorbell_offset)
+{
+ struct device_process_node *cur;
+ struct qcm_process_device *qpd;
+ struct queue *q;
+
+ list_for_each_entry(cur, &dqm->queues, list) {
+ qpd = cur->qpd;
+ list_for_each_entry(q, &qpd->queues_list, list) {
+ if (doorbell_offset == q->properties.doorbell_off)
+ return q;
+ }
+ }
+
+ return NULL;
+}
+
static int reset_hung_queues(struct device_queue_manager *dqm)
{
int r = 0, reset_count = 0, i;