/*
 * Copyright 2015 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/sched.h>

#include <drm/drm_drv.h>

#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_reset.h"
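/*
 * Scheduler timedout_job callback: a submitted job did not complete within the
 * ring's timeout. Try ring soft recovery first; if that fails, report the
 * offending process and either trigger a full GPU recovery or suspend the
 * scheduler timeout (and flag TDR debugging under SR-IOV).
 */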
static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
{
        struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
        struct amdgpu_job *job = to_amdgpu_job(s_job);
        struct amdgpu_task_info ti;
        struct amdgpu_device *adev = ring->adev;
        int idx;
        int r;

        if (!drm_dev_enter(adev_to_drm(adev), &idx)) {
                DRM_INFO("%s - device unplugged skipping recovery on scheduler:%s",
                         __func__, s_job->sched->name);

                /* Effectively the job is aborted as the device is gone */
                return DRM_GPU_SCHED_STAT_ENODEV;
        }

        memset(&ti, 0, sizeof(struct amdgpu_task_info));
        adev->job_hang = true;

        if (amdgpu_gpu_recovery &&
            amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
                DRM_ERROR("ring %s timeout, but soft recovered\n",
                          s_job->sched->name);
                goto exit;
        }

        amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);
        DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n",
                  job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
                  ring->fence_drv.sync_seq);
        DRM_ERROR("Process information: process %s pid %d thread %s pid %d\n",
                  ti.process_name, ti.tgid, ti.task_name, ti.pid);

        dma_fence_set_error(&s_job->s_fence->finished, -ETIME);

        if (amdgpu_device_should_recover_gpu(ring->adev)) {
                struct amdgpu_reset_context reset_context;

                memset(&reset_context, 0, sizeof(reset_context));

                reset_context.method = AMD_RESET_METHOD_NONE;
                reset_context.reset_req_dev = adev;
                clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);

                r = amdgpu_device_gpu_recover(ring->adev, job, &reset_context);
                if (r)
                        DRM_ERROR("GPU Recovery Failed: %d\n", r);
        } else {
                drm_sched_suspend_timeout(&ring->sched);
                if (amdgpu_sriov_vf(adev))
                        adev->virt.tdr_debug = true;
        }

exit:
        adev->job_hang = false;
        drm_dev_exit(idx);
        return DRM_GPU_SCHED_STAT_NOMINAL;
}
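
/*
 * Allocate a job with room for @num_ibs IBs, create its explicit sync object
 * and, when an entity is given, initialize the underlying drm_sched_job.
 */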
int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm,
                     struct drm_sched_entity *entity, void *owner,
                     unsigned int num_ibs, struct amdgpu_job **job)
{
        if (num_ibs == 0)
                return -EINVAL;

        *job = kzalloc(struct_size(*job, ibs, num_ibs), GFP_KERNEL);
        if (!*job)
                return -ENOMEM;

        /*
         * Initialize the scheduler to at least some ring so that we always
         * have a pointer to adev.
         */
        (*job)->base.sched = &adev->rings[0]->sched;
        (*job)->vm = vm;

        amdgpu_sync_create(&(*job)->explicit_sync);
        (*job)->generation = amdgpu_vm_generation(adev, vm);
        (*job)->vm_pd_addr = AMDGPU_BO_INVALID_OFFSET;

        if (!entity)
                return 0;

        return drm_sched_job_init(&(*job)->base, entity, owner);
}
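
/*
 * Convenience wrapper around amdgpu_job_alloc() for jobs that need exactly
 * one IB of @size bytes from the given IB pool.
 */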
int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev,
                             struct drm_sched_entity *entity, void *owner,
                             size_t size, enum amdgpu_ib_pool_type pool_type,
                             struct amdgpu_job **job)
{
        int r;

        r = amdgpu_job_alloc(adev, NULL, entity, owner, 1, job);
        if (r)
                return r;

        (*job)->num_ibs = 1;
        r = amdgpu_ib_get(adev, NULL, size, pool_type, &(*job)->ibs[0]);
        if (r) {
                if (entity)
                        drm_sched_job_cleanup(&(*job)->base);
                kfree(*job);
        }

        return r;
}
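
/* Record the GPU offsets and sizes of the optional GDS, GWS and OA buffers. */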
void amdgpu_job_set_resources(struct amdgpu_job *job, struct amdgpu_bo *gds,
                              struct amdgpu_bo *gws, struct amdgpu_bo *oa)
{
        if (gds) {
                job->gds_base = amdgpu_bo_gpu_offset(gds) >> PAGE_SHIFT;
                job->gds_size = amdgpu_bo_size(gds) >> PAGE_SHIFT;
        }
        if (gws) {
                job->gws_base = amdgpu_bo_gpu_offset(gws) >> PAGE_SHIFT;
                job->gws_size = amdgpu_bo_size(gws) >> PAGE_SHIFT;
        }
        if (oa) {
                job->oa_base = amdgpu_bo_gpu_offset(oa) >> PAGE_SHIFT;
                job->oa_size = amdgpu_bo_size(oa) >> PAGE_SHIFT;
        }
}
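
/*
 * Free the job's IBs, using whichever fence (finished or hw) was initialized
 * so the IB memory is only reused once the hardware is done with it.
 */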
void amdgpu_job_free_resources(struct amdgpu_job *job)
{
        struct amdgpu_ring *ring = to_amdgpu_ring(job->base.sched);
        struct dma_fence *f;
        unsigned int i;

        /* Check if any fences were initialized */
        if (job->base.s_fence && job->base.s_fence->finished.ops)
                f = &job->base.s_fence->finished;
        else if (job->hw_fence.ops)
                f = &job->hw_fence;
        else
                f = NULL;

        for (i = 0; i < job->num_ibs; ++i)
                amdgpu_ib_free(ring->adev, &job->ibs[i], f);
}
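
/*
 * Scheduler free_job callback: clean up the drm_sched_job and the explicit
 * sync object; the job itself is freed directly unless it is embedded in the
 * hardware fence, in which case dropping that fence releases it.
 */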
static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
{
        struct amdgpu_job *job = to_amdgpu_job(s_job);

        drm_sched_job_cleanup(s_job);

        amdgpu_sync_free(&job->explicit_sync);

        /* only put the hw fence if the job has an embedded fence */
        if (!job->hw_fence.ops)
                kfree(job);
        else
                dma_fence_put(&job->hw_fence);
}
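
/*
 * Make @job part of a gang by remembering the leader's scheduled fence; the
 * leader itself does not take a reference to avoid a circular dependency.
 */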
void amdgpu_job_set_gang_leader(struct amdgpu_job *job,
                                struct amdgpu_job *leader)
{
        struct dma_fence *fence = &leader->base.s_fence->scheduled;

        WARN_ON(job->gang_submit);

        /*
         * Don't add a reference when we are the gang leader to avoid a
         * circular dependency.
         */
        if (job != leader)
                dma_fence_get(fence);
        job->gang_submit = fence;
}
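
/*
 * Free a job directly, e.g. on error paths or after direct submission,
 * releasing its resources, sync object and any fences it still holds.
 */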
void amdgpu_job_free(struct amdgpu_job *job)
{
        if (job->base.entity)
                drm_sched_job_cleanup(&job->base);

        amdgpu_job_free_resources(job);
        amdgpu_sync_free(&job->explicit_sync);
        if (job->gang_submit != &job->base.s_fence->scheduled)
                dma_fence_put(job->gang_submit);

        if (!job->hw_fence.ops)
                kfree(job);
        else
                dma_fence_put(&job->hw_fence);
}
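
/*
 * Arm the job and push it to its scheduler entity; returns a reference to the
 * finished fence which the caller is responsible for putting.
 */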
struct dma_fence *amdgpu_job_submit(struct amdgpu_job *job)
{
        struct dma_fence *f;

        drm_sched_job_arm(&job->base);
        f = dma_fence_get(&job->base.s_fence->finished);
        amdgpu_job_free_resources(job);
        drm_sched_entity_push_job(&job->base);

        return f;
}
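
/*
 * Bypass the scheduler and write the job's IBs to @ring immediately,
 * returning the hardware fence through @fence.
 */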
int amdgpu_job_submit_direct(struct amdgpu_job *job, struct amdgpu_ring *ring,
                             struct dma_fence **fence)
{
        int r;

        job->base.sched = &ring->sched;
        r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job, fence);
        if (r)
                return r;

        amdgpu_job_free(job);
        return 0;
}
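
/*
 * Scheduler prepare_job callback: return a fence the job still needs to wait
 * on (gang switch or VMID grab) or NULL once it is ready to run.
 */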
static struct dma_fence *
amdgpu_job_prepare_job(struct drm_sched_job *sched_job,
                       struct drm_sched_entity *s_entity)
{
        struct amdgpu_ring *ring = to_amdgpu_ring(s_entity->rq->sched);
        struct amdgpu_job *job = to_amdgpu_job(sched_job);
        struct dma_fence *fence = NULL;
        int r;

        /* Ignore soft recovered fences here */
        r = drm_sched_entity_error(s_entity);
        if (r && r != -ENODATA)
                goto error;

        if (!fence && job->gang_submit)
                fence = amdgpu_device_switch_gang(ring->adev, job->gang_submit);

        while (!fence && job->vm && !job->vmid) {
                r = amdgpu_vmid_grab(job->vm, ring, job, &fence);
                if (r) {
                        DRM_ERROR("Error getting VM ID (%d)\n", r);
                        goto error;
                }
        }

        return fence;

error:
        dma_fence_set_error(&job->base.s_fence->finished, r);
        return NULL;
}
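
/*
 * Scheduler run_job callback: emit the job's IBs on the ring unless the
 * finished fence already carries an error (VRAM lost or a cancelled gang
 * resubmission), and return the resulting hardware fence.
 */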
static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
{
        struct amdgpu_ring *ring = to_amdgpu_ring(sched_job->sched);
        struct amdgpu_device *adev = ring->adev;
        struct dma_fence *fence = NULL, *finished;
        struct amdgpu_job *job;
        int r = 0;

        job = to_amdgpu_job(sched_job);
        finished = &job->base.s_fence->finished;

        trace_amdgpu_sched_run_job(job);

        /* Skip job if VRAM is lost and never resubmit gangs */
        if (job->generation != amdgpu_vm_generation(adev, job->vm) ||
            (job->job_run_counter && job->gang_submit))
                dma_fence_set_error(finished, -ECANCELED);

        if (finished->error < 0) {
                DRM_INFO("Skip scheduling IBs!\n");
        } else {
                r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
                                       &fence);
                if (r)
                        DRM_ERROR("Error scheduling IBs (%d)\n", r);
        }

        job->job_run_counter++;
        amdgpu_job_free_resources(job);

        fence = r ? ERR_PTR(r) : fence;
        return fence;
}
#define to_drm_sched_job(sched_job)		\
		container_of((sched_job), struct drm_sched_job, queue_node)
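
/*
 * Abort everything on a scheduler: signal with -EHWPOISON all jobs still
 * queued on its entities as well as those already pushed to the hardware.
 */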
void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched)
{
        struct drm_sched_job *s_job;
        struct drm_sched_entity *s_entity = NULL;
        int i;

        /* Signal all jobs not yet scheduled */
        for (i = sched->num_rqs - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
                struct drm_sched_rq *rq = sched->sched_rq[i];

                spin_lock(&rq->lock);
                list_for_each_entry(s_entity, &rq->entities, list) {
                        while ((s_job = to_drm_sched_job(spsc_queue_pop(&s_entity->job_queue)))) {
                                struct drm_sched_fence *s_fence = s_job->s_fence;

                                dma_fence_signal(&s_fence->scheduled);
                                dma_fence_set_error(&s_fence->finished, -EHWPOISON);
                                dma_fence_signal(&s_fence->finished);
                        }
                }
                spin_unlock(&rq->lock);
        }

        /* Signal all jobs already scheduled to HW */
        list_for_each_entry(s_job, &sched->pending_list, list) {
                struct drm_sched_fence *s_fence = s_job->s_fence;

                dma_fence_set_error(&s_fence->finished, -EHWPOISON);
                dma_fence_signal(&s_fence->finished);
        }
}
const struct drm_sched_backend_ops amdgpu_sched_ops = {
        .prepare_job = amdgpu_job_prepare_job,
        .run_job = amdgpu_job_run,
        .timedout_job = amdgpu_job_timedout,
        .free_job = amdgpu_job_free_cb
};