From: Dave Airlie Date: Fri, 4 Jul 2025 00:06:22 +0000 (+1000) Subject: Merge tag 'amd-drm-next-6.17-2025-07-01' of https://gitlab.freedesktop.org/agd5f... X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=7e2818386aad54ba5ab70e228c555814d33bdad1;p=thirdparty%2Flinux.git Merge tag 'amd-drm-next-6.17-2025-07-01' of https://gitlab.freedesktop.org/agd5f/linux into drm-next amd-drm-next-6.17-2025-07-01: amdgpu: - FAMS2 fixes - OLED fixes - Misc cleanups - AUX fixes - DMCUB updates - SR-IOV hibernation support - RAS updates - DP tunneling fixes - DML2 fixes - Backlight improvements - Suspend improvements - Use scaling for non-native modes on eDP - SDMA 4.4.x fixes - PCIe DPM fixes - SDMA 5.x fixes - Cleaner shader updates for GC 9.x - Remove fence slab - ISP genpd support - Parition handling rework - SDMA FW checks for userq support - Add missing firmware declaration - Fix leak in amdgpu_ctx_mgr_entity_fini() - Freesync fix - Ring reset refactoring - Legacy dpm verbosity changes amdkfd: - GWS fix - mtype fix for ext coherent system memory - MMU notifier fix - gfx7/8 fix radeon: - CS validation support for additional GL extensions - Bump driver version for new CS validation checks From: Alex Deucher Link: https://lore.kernel.org/r/20250701194707.32905-1-alexander.deucher@amd.com Signed-off-by: Dave Airlie --- 7e2818386aad54ba5ab70e228c555814d33bdad1 diff --cc drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c index 8cecf25996edc,3d170060282ee..2c3547f4cea42 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c @@@ -322,11 -283,13 +283,13 @@@ bool amdgpu_fence_process(struct amdgpu */ static void amdgpu_fence_fallback(struct timer_list *t) { - struct amdgpu_ring *ring = from_timer(ring, t, - fence_drv.fallback_timer); + struct amdgpu_ring *ring = timer_container_of(ring, t, + fence_drv.fallback_timer); if (amdgpu_fence_process(ring)) - DRM_WARN("Fence fallback timer expired on ring %s\n", ring->name); + dev_warn(ring->adev->dev, + "Fence fallback timer expired on ring %s\n", + ring->name); } /** diff --cc drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c index 13c60cac4261b,cf988077a3eef..183fa33c24347 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c @@@ -729,10 -733,10 +733,10 @@@ static const struct irq_domain_ops amdg */ int amdgpu_irq_add_domain(struct amdgpu_device *adev) { - adev->irq.domain = irq_domain_add_linear(NULL, AMDGPU_MAX_IRQ_SRC_ID, - &amdgpu_hw_irqdomain_ops, adev); + adev->irq.domain = irq_domain_create_linear(NULL, AMDGPU_MAX_IRQ_SRC_ID, + &amdgpu_hw_irqdomain_ops, adev); if (!adev->irq.domain) { - DRM_ERROR("GPU irq add domain failed\n"); + dev_err(adev->dev, "GPU irq add domain failed\n"); return -ENODEV; } diff --cc drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 1e24590ae1449,f0b7080dccb8d..2b58e353cca15 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@@ -89,11 -89,9 +89,10 @@@ static enum drm_gpu_sched_stat amdgpu_j { struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched); struct amdgpu_job *job = to_amdgpu_job(s_job); + struct drm_wedge_task_info *info = NULL; struct amdgpu_task_info *ti; struct amdgpu_device *adev = ring->adev; - int idx; - int r; + int idx, r; if (!drm_dev_enter(adev_to_drm(adev), &idx)) { dev_info(adev->dev, "%s - device unplugged skipping recovery on scheduler:%s", @@@ -133,47 -133,22 +132,24 @@@ if (unlikely(adev->debug_disable_gpu_ring_reset)) { dev_err(adev->dev, "Ring reset disabled by debug mask\n"); } else if (amdgpu_gpu_recovery && ring->funcs->reset) { - bool is_guilty; - - dev_err(adev->dev, "Starting %s ring reset\n", s_job->sched->name); - /* stop the scheduler, but don't mess with the - * bad job yet because if ring reset fails - * we'll fall back to full GPU reset. - */ - drm_sched_wqueue_stop(&ring->sched); - - /* for engine resets, we need to reset the engine, - * but individual queues may be unaffected. - * check here to make sure the accounting is correct. - */ - if (ring->funcs->is_guilty) - is_guilty = ring->funcs->is_guilty(ring); - else - is_guilty = true; - - if (is_guilty) - dma_fence_set_error(&s_job->s_fence->finished, -ETIME); - - r = amdgpu_ring_reset(ring, job->vmid); + dev_err(adev->dev, "Starting %s ring reset\n", + s_job->sched->name); + r = amdgpu_ring_reset(ring, job->vmid, NULL); if (!r) { - if (amdgpu_ring_sched_ready(ring)) - drm_sched_stop(&ring->sched, s_job); - if (is_guilty) { - atomic_inc(&ring->adev->gpu_reset_counter); - amdgpu_fence_driver_force_completion(ring); - } - if (amdgpu_ring_sched_ready(ring)) - drm_sched_start(&ring->sched, 0); - dev_err(adev->dev, "Ring %s reset succeeded\n", ring->sched.name); - drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, info); + atomic_inc(&ring->adev->gpu_reset_counter); + dev_err(adev->dev, "Ring %s reset succeeded\n", + ring->sched.name); + drm_dev_wedged_event(adev_to_drm(adev), - DRM_WEDGE_RECOVERY_NONE); ++ DRM_WEDGE_RECOVERY_NONE, info); goto exit; } - dev_err(adev->dev, "Ring %s reset failure\n", ring->sched.name); + dev_err(adev->dev, "Ring %s reset failed\n", ring->sched.name); } + dma_fence_set_error(&s_job->s_fence->finished, -ETIME); + amdgpu_vm_put_task_info(ti); + if (amdgpu_device_should_recover_gpu(ring->adev)) { struct amdgpu_reset_context reset_context; memset(&reset_context, 0, sizeof(reset_context));