#include "cik_regs.h"
#include "kfd_kernel_queue.h"
#include "amdgpu_amdkfd.h"
+#include "amdgpu_reset.h"
#include "mes_v11_api_def.h"
#include "kfd_debug.h"
/*
* Issue a GPU reset if HWS is unresponsive
*/
- dqm->is_hws_hang = true;
-
- /* It's possible we're detecting a HWS hang in the
- * middle of a GPU reset. No need to schedule another
- * reset in this case.
- */
- if (!dqm->is_resetting)
- schedule_work(&dqm->hw_exception_work);
+ schedule_work(&dqm->hw_exception_work);
}
static int convert_to_mes_queue_type(int queue_type)
int r, queue_type;
uint64_t wptr_addr_off;
- if (dqm->is_hws_hang)
+ if (!down_read_trylock(&adev->reset_domain->sem))
return -EIO;
memset(&queue_input, 0x0, sizeof(struct mes_add_queue_input));
amdgpu_mes_lock(&adev->mes);
r = adev->mes.funcs->add_hw_queue(&adev->mes, &queue_input);
amdgpu_mes_unlock(&adev->mes);
+ up_read(&adev->reset_domain->sem);
if (r) {
dev_err(adev->dev, "failed to add hardware queue to MES, doorbell=0x%x\n",
q->properties.doorbell_off);
int r;
struct mes_remove_queue_input queue_input;
- if (dqm->is_hws_hang)
+ if (!down_read_trylock(&adev->reset_domain->sem))
return -EIO;
memset(&queue_input, 0x0, sizeof(struct mes_remove_queue_input));
amdgpu_mes_lock(&adev->mes);
r = adev->mes.funcs->remove_hw_queue(&adev->mes, &queue_input);
amdgpu_mes_unlock(&adev->mes);
+ up_read(&adev->reset_domain->sem);
if (r) {
dev_err(adev->dev, "failed to remove hardware queue from MES, doorbell=0x%x\n",
}
if (dqm->dev->adev->asic_type == CHIP_HAWAII)
- pm_uninit(&dqm->packet_mgr, false);
+ pm_uninit(&dqm->packet_mgr);
dqm->sched_running = false;
dqm_unlock(dqm);
return 0;
}
-static void pre_reset(struct device_queue_manager *dqm)
-{
- dqm_lock(dqm);
- dqm->is_resetting = true;
- dqm_unlock(dqm);
-}
-
static int allocate_sdma_queue(struct device_queue_manager *dqm,
struct queue *q, const uint32_t *restore_sdma_id)
{
init_interrupts(dqm);
/* clear hang status when driver try to start the hw scheduler */
- dqm->is_hws_hang = false;
- dqm->is_resetting = false;
dqm->sched_running = true;
if (!dqm->dev->kfd->shared_resources.enable_mes)
fail_allocate_vidmem:
fail_set_sched_resources:
if (!dqm->dev->kfd->shared_resources.enable_mes)
- pm_uninit(&dqm->packet_mgr, false);
+ pm_uninit(&dqm->packet_mgr);
fail_packet_manager_init:
dqm_unlock(dqm);
return retval;
static int stop_cpsch(struct device_queue_manager *dqm)
{
- bool hanging;
-
dqm_lock(dqm);
if (!dqm->sched_running) {
dqm_unlock(dqm);
return 0;
}
- if (!dqm->is_hws_hang) {
- if (!dqm->dev->kfd->shared_resources.enable_mes)
- unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD, false);
- else
- remove_all_queues_mes(dqm);
- }
+ if (!dqm->dev->kfd->shared_resources.enable_mes)
+ unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD, false);
+ else
+ remove_all_queues_mes(dqm);
- hanging = dqm->is_hws_hang || dqm->is_resetting;
dqm->sched_running = false;
if (!dqm->dev->kfd->shared_resources.enable_mes)
kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
if (!dqm->dev->kfd->shared_resources.enable_mes)
- pm_uninit(&dqm->packet_mgr, hanging);
+ pm_uninit(&dqm->packet_mgr);
dqm_unlock(dqm);
return 0;
{
struct device *dev = dqm->dev->adev->dev;
struct mqd_manager *mqd_mgr;
- int retval = 0;
+ int retval;
if (!dqm->sched_running)
return 0;
- if (dqm->is_hws_hang || dqm->is_resetting)
- return -EIO;
if (!dqm->active_runlist)
- return retval;
+ return 0;
+ if (!down_read_trylock(&dqm->dev->adev->reset_domain->sem))
+ return -EIO;
if (grace_period != USE_DEFAULT_GRACE_PERIOD) {
retval = pm_update_grace_period(&dqm->packet_mgr, grace_period);
if (retval)
- return retval;
+ goto out;
}
retval = pm_send_unmap_queue(&dqm->packet_mgr, filter, filter_param, reset);
if (retval)
- return retval;
+ goto out;
*dqm->fence_addr = KFD_FENCE_INIT;
pm_send_query_status(&dqm->packet_mgr, dqm->fence_gpu_addr,
if (retval) {
dev_err(dev, "The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n");
kfd_hws_hang(dqm);
- return retval;
+ goto out;
}
/* In the current MEC firmware implementation, if compute queue
while (halt_if_hws_hang)
schedule();
kfd_hws_hang(dqm);
- return -ETIME;
+ retval = -ETIME;
+ goto out;
}
/* We need to reset the grace period value for this device */
pm_release_ib(&dqm->packet_mgr);
dqm->active_runlist = false;
+out:
+ up_read(&dqm->dev->adev->reset_domain->sem);
return retval;
}
{
int retval;
- if (dqm->is_hws_hang)
+ if (!down_read_trylock(&dqm->dev->adev->reset_domain->sem))
return -EIO;
retval = unmap_queues_cpsch(dqm, filter, filter_param, grace_period, false);
- if (retval)
- return retval;
-
- return map_queues_cpsch(dqm);
+ if (!retval)
+ retval = map_queues_cpsch(dqm);
+ up_read(&dqm->dev->adev->reset_domain->sem);
+ return retval;
}
static int wait_on_destroy_queue(struct device_queue_manager *dqm,
if (!dqm->dev->kfd->shared_resources.enable_mes)
retval = execute_queues_cpsch(dqm, filter, 0, USE_DEFAULT_GRACE_PERIOD);
- if ((!dqm->is_hws_hang) && (retval || qpd->reset_wavefronts)) {
+ if ((retval || qpd->reset_wavefronts) &&
+ down_read_trylock(&dqm->dev->adev->reset_domain->sem)) {
pr_warn("Resetting wave fronts (cpsch) on dev %p\n", dqm->dev);
dbgdev_wave_reset_wavefronts(dqm->dev, qpd->pqm->process);
qpd->reset_wavefronts = false;
+ up_read(&dqm->dev->adev->reset_domain->sem);
}
/* Lastly, free mqd resources.
dqm->ops.initialize = initialize_cpsch;
dqm->ops.start = start_cpsch;
dqm->ops.stop = stop_cpsch;
- dqm->ops.pre_reset = pre_reset;
dqm->ops.destroy_queue = destroy_queue_cpsch;
dqm->ops.update_queue = update_queue;
dqm->ops.register_process = register_process;
/* initialize dqm for no cp scheduling */
dqm->ops.start = start_nocpsch;
dqm->ops.stop = stop_nocpsch;
- dqm->ops.pre_reset = pre_reset;
dqm->ops.create_queue = create_queue_nocpsch;
dqm->ops.destroy_queue = destroy_queue_nocpsch;
dqm->ops.update_queue = update_queue;