/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Ben Widawsky <ben@bwidawsk.net>
 *    Michel Thierry <michel.thierry@intel.com>
 *    Thomas Daniel <thomas.daniel@intel.com>
 *    Oscar Mateo <oscar.mateo@intel.com>
 *
 */

/**
 * DOC: Logical Rings, Logical Ring Contexts and Execlists
 *
 * Motivation:
 * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
 * These expanded contexts enable a number of new abilities, especially
 * "Execlists" (also implemented in this file).
 *
 * One of the main differences with the legacy HW contexts is that logical
 * ring contexts incorporate many more things to the context's state, like
 * PDPs or ringbuffer control registers:
 *
 * The reason why PDPs are included in the context is straightforward: as
 * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
 * contained there means you don't need to do a ppgtt->switch_mm yourself,
 * instead, the GPU will do it for you on the context switch.
 *
 * But, what about the ringbuffer control registers (head, tail, etc..)?
 * shouldn't we just need a set of those per engine command streamer? This is
 * where the name "Logical Rings" starts to make sense: by virtualizing the
 * rings, the engine cs shifts to a new "ring buffer" with every context
 * switch. When you want to submit a workload to the GPU you: A) choose your
 * context, B) find its appropriate virtualized ring, C) write commands to it
 * and then, finally, D) tell the GPU to switch to that context.
 *
 * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
 * to a context is via a context execution list, ergo "Execlists".
 *
 * LRC implementation:
 * Regarding the creation of contexts, we have:
 *
 * - One global default context.
 * - One local default context for each opened fd.
 * - One local extra context for each context create ioctl call.
 *
 * Now that ringbuffers belong per-context (and not per-engine, like before)
 * and that contexts are uniquely tied to a given engine (and not reusable,
 * like before) we need:
 *
 * - One ringbuffer per-engine inside each context.
 * - One backing object per-engine inside each context.
 *
 * The global default context starts its life with these new objects fully
 * allocated and populated. The local default context for each opened fd is
 * more complex, because we don't know at creation time which engine is going
 * to use them. To handle this, we have implemented a deferred creation of LR
 * contexts:
 *
 * The local context starts its life as a hollow or blank holder, that only
 * gets populated for a given engine once we receive an execbuffer. If later
 * on we receive another execbuffer ioctl for the same context but a different
 * engine, we allocate/populate a new ringbuffer and context backing object and
 * so on.
 *
 * Finally, regarding local contexts created using the ioctl call: as they are
 * only allowed with the render ring, we can allocate & populate them right
 * away (no need to defer anything, at least for now).
 *
 * Execlists implementation:
 * Execlists are the new method by which, on gen8+ hardware, workloads are
 * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
 * This method works as follows:
 *
 * When a request is committed, its commands (the BB start and any leading or
 * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
 * for the appropriate context. The tail pointer in the hardware context is not
 * updated at this time, but instead, kept by the driver in the ringbuffer
 * structure. A structure representing this request is added to a request queue
 * for the appropriate engine: this structure contains a copy of the context's
 * tail after the request was written to the ring buffer and a pointer to the
 * context itself.
 *
 * If the engine's request queue was empty before the request was added, the
 * queue is processed immediately. Otherwise the queue will be processed during
 * a context switch interrupt. In any case, elements on the queue will get sent
 * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
 * globally unique 20-bit submission ID.
 *
 * When execution of a request completes, the GPU updates the context status
 * buffer with a context complete event and generates a context switch interrupt.
 * During the interrupt handling, the driver examines the events in the buffer:
 * for each context complete event, if the announced ID matches that on the head
 * of the request queue, then that request is retired and removed from the queue.
 *
 * After processing, if any requests were retired and the queue is not empty
 * then a new execution list can be submitted. The two requests at the front of
 * the queue are next to be submitted but since a context may not occur twice in
 * an execution list, if subsequent requests have the same ID as the first then
 * the two requests must be combined. This is done simply by discarding requests
 * at the head of the queue until either only one request is left (in which case
 * we use a NULL second context) or the first two requests have unique IDs.
 *
 * By always executing the first two requests in the queue the driver ensures
 * that the GPU is kept as busy as possible. In the case where a single context
 * completes but a second context is still executing, the request for this second
 * context will be at the head of the queue when we remove the first one. This
 * request will then be resubmitted along with a new request for a different context,
 * which will cause the hardware to continue executing the second request and queue
 * the new request (the GPU detects the condition of a context getting preempted
 * with the same context and optimizes the context switch flow by not doing
 * preemption, but just sampling the new tail pointer).
 *
 */
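
/*
 * Illustrative sketch only, not part of the driver: the pairing rule
 * described above, written as pseudocode. Names such as "queue", "ctx_id"
 * and "elsp_write" are hypothetical stand-ins for the real structures and
 * helpers used later in this file.
 *
 *	first  = pop_head(queue);
 *	second = peek_head(queue);
 *	while (second && second->ctx_id == first->ctx_id) {
 *		first = second;			(coalesce: keep only the last)
 *		pop_head(queue);		(request of the same context)
 *		second = peek_head(queue);
 *	}
 *	elsp_write(first, second);		(second may be NULL)
 */
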
#include <linux/interrupt.h>

#include <drm/drmP.h>
#include <drm/i915_drm.h>
#include "i915_drv.h"
#include "i915_gem_render_state.h"
#include "i915_vgpu.h"
#include "intel_lrc_reg.h"
#include "intel_mocs.h"
#include "intel_workarounds.h"

#define RING_EXECLIST_QFULL		(1 << 0x2)
#define RING_EXECLIST1_VALID		(1 << 0x3)
#define RING_EXECLIST0_VALID		(1 << 0x4)
#define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
#define RING_EXECLIST1_ACTIVE		(1 << 0x11)
#define RING_EXECLIST0_ACTIVE		(1 << 0x12)

#define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
#define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
#define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
#define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
#define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
#define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)

#define GEN8_CTX_STATUS_COMPLETED_MASK \
	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)

/* Typical size of the average request (2 pipecontrols and a MI_BB) */
#define EXECLISTS_REQUEST_SIZE 64 /* bytes */
#define WA_TAIL_DWORDS 2
#define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS)

static int execlists_context_deferred_alloc(struct i915_gem_context *ctx,
					    struct intel_engine_cs *engine,
					    struct intel_context *ce);
static void execlists_init_reg_state(u32 *reg_state,
				     struct i915_gem_context *ctx,
				     struct intel_engine_cs *engine,
				     struct intel_ring *ring);

static inline struct i915_priolist *to_priolist(struct rb_node *rb)
{
	return rb_entry(rb, struct i915_priolist, node);
}

static inline int rq_prio(const struct i915_request *rq)
{
	return rq->sched.attr.priority;
}

static inline bool need_preempt(const struct intel_engine_cs *engine,
				const struct i915_request *last,
				int prio)
{
	return (intel_engine_has_preemption(engine) &&
		__execlists_need_preempt(prio, rq_prio(last)) &&
		!i915_request_completed(last));
}

/*
 * The context descriptor encodes various attributes of a context,
 * including its GTT address and some flags. Because it's fairly
 * expensive to calculate, we'll just do it once and cache the result,
 * which remains valid until the context is unpinned.
 *
 * This is what a descriptor looks like, from LSB to MSB::
 *
 *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
 *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
 *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
 *      bits 53-54:    mbz, reserved for use by hardware
 *      bits 55-63:    group ID, currently unused and set to 0
 *
 * Starting from Gen11, the upper dword of the descriptor has a new format:
 *
 *      bits 32-36:    reserved
 *      bits 37-47:    SW context ID
 *      bits 48-53:    engine instance
 *      bit  54:       mbz, reserved for use by hardware
 *      bits 55-60:    SW counter
 *      bits 61-63:    engine class
 *
 * engine info, SW context ID and SW counter need to form a unique number
 * (Context ID) per lrc.
 */
static void
intel_lr_context_descriptor_update(struct i915_gem_context *ctx,
				   struct intel_engine_cs *engine,
				   struct intel_context *ce)
{
	u64 desc;

	BUILD_BUG_ON(MAX_CONTEXT_HW_ID > (BIT(GEN8_CTX_ID_WIDTH)));
	BUILD_BUG_ON(GEN11_MAX_CONTEXT_HW_ID > (BIT(GEN11_SW_CTX_ID_WIDTH)));

	desc = ctx->desc_template;				/* bits  0-11 */
	GEM_BUG_ON(desc & GENMASK_ULL(63, 12));

	desc |= i915_ggtt_offset(ce->state) + LRC_HEADER_PAGES * PAGE_SIZE;
								/* bits 12-31 */
	GEM_BUG_ON(desc & GENMASK_ULL(63, 32));

	/*
	 * The following 32bits are copied into the OA reports (dword 2).
	 * Consider updating oa_get_render_ctx_id in i915_perf.c when changing
	 * anything below.
	 */
	if (INTEL_GEN(ctx->i915) >= 11) {
		GEM_BUG_ON(ctx->hw_id >= BIT(GEN11_SW_CTX_ID_WIDTH));
		desc |= (u64)ctx->hw_id << GEN11_SW_CTX_ID_SHIFT;
								/* bits 37-47 */

		desc |= (u64)engine->instance << GEN11_ENGINE_INSTANCE_SHIFT;
								/* bits 48-53 */

		/* TODO: decide what to do with SW counter (bits 55-60) */

		desc |= (u64)engine->class << GEN11_ENGINE_CLASS_SHIFT;
								/* bits 61-63 */
	} else {
		GEM_BUG_ON(ctx->hw_id >= BIT(GEN8_CTX_ID_WIDTH));
		desc |= (u64)ctx->hw_id << GEN8_CTX_ID_SHIFT;	/* bits 32-52 */
	}

	ce->lrc_desc = desc;
}
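
/*
 * Illustrative sketch only, not part of the driver: composing a Gen8-style
 * descriptor by hand with hypothetical example values (desc_template 0x19,
 * LRCA 0xfedcb000, hw_id 5), following the layout documented above:
 *
 *	desc  = 0x19;				bits  0-11: flags
 *	desc |= 0xfedcb000;			bits 12-31: LRCA (page aligned)
 *	desc |= (u64)5 << GEN8_CTX_ID_SHIFT;	bits 32-52: ctx ID
 *
 * giving desc == 0x00000005fedcb019.
 */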

static struct i915_priolist *
lookup_priolist(struct intel_engine_cs *engine, int prio)
{
	struct intel_engine_execlists * const execlists = &engine->execlists;
	struct i915_priolist *p;
	struct rb_node **parent, *rb;
	bool first = true;

	if (unlikely(execlists->no_priolist))
		prio = I915_PRIORITY_NORMAL;

find_priolist:
	/* most positive priority is scheduled first, equal priorities fifo */
	rb = NULL;
	parent = &execlists->queue.rb_root.rb_node;
	while (*parent) {
		rb = *parent;
		p = to_priolist(rb);
		if (prio > p->priority) {
			parent = &rb->rb_left;
		} else if (prio < p->priority) {
			parent = &rb->rb_right;
			first = false;
		} else {
			return p;
		}
	}

	if (prio == I915_PRIORITY_NORMAL) {
		p = &execlists->default_priolist;
	} else {
		p = kmem_cache_alloc(engine->i915->priorities, GFP_ATOMIC);
		/* Convert an allocation failure to a priority bump */
		if (unlikely(!p)) {
			prio = I915_PRIORITY_NORMAL; /* recurses just once */

			/* To maintain ordering with all rendering, after an
			 * allocation failure we have to disable all scheduling.
			 * Requests will then be executed in fifo, and schedule
			 * will ensure that dependencies are emitted in fifo.
			 * There will be still some reordering with existing
			 * requests, so if userspace lied about their
			 * dependencies that reordering may be visible.
			 */
			execlists->no_priolist = true;
			goto find_priolist;
		}
	}

	p->priority = prio;
	INIT_LIST_HEAD(&p->requests);
	rb_link_node(&p->node, rb, parent);
	rb_insert_color_cached(&p->node, &execlists->queue, first);

	return p;
}
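
/*
 * Illustrative note only, not part of the driver: the tree above keeps one
 * i915_priolist per priority level, ordered so that rb_first_cached()
 * returns the most positive priority first; requests within a level stay
 * FIFO. A hypothetical dump of the queue might therefore look like:
 *
 *	prio  2: reqA -> reqD
 *	prio  0: reqB
 *	prio -1: reqC -> reqE -> reqF
 */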

static void unwind_wa_tail(struct i915_request *rq)
{
	rq->tail = intel_ring_wrap(rq->ring, rq->wa_tail - WA_TAIL_BYTES);
	assert_ring_tail_valid(rq->ring, rq->tail);
}

static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
{
	struct i915_request *rq, *rn;
	struct i915_priolist *uninitialized_var(p);
	int last_prio = I915_PRIORITY_INVALID;

	lockdep_assert_held(&engine->timeline.lock);

	list_for_each_entry_safe_reverse(rq, rn,
					 &engine->timeline.requests,
					 link) {
		if (i915_request_completed(rq))
			return;

		__i915_request_unsubmit(rq);
		unwind_wa_tail(rq);

		GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
		if (rq_prio(rq) != last_prio) {
			last_prio = rq_prio(rq);
			p = lookup_priolist(engine, last_prio);
		}

		GEM_BUG_ON(p->priority != rq_prio(rq));
		list_add(&rq->sched.link, &p->requests);
	}
}

void
execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
{
	struct intel_engine_cs *engine =
		container_of(execlists, typeof(*engine), execlists);
	unsigned long flags;

	spin_lock_irqsave(&engine->timeline.lock, flags);

	__unwind_incomplete_requests(engine);

	spin_unlock_irqrestore(&engine->timeline.lock, flags);
}

static inline void
execlists_context_status_change(struct i915_request *rq, unsigned long status)
{
	/*
	 * Only used when GVT-g is enabled now. When GVT-g is disabled,
	 * the compiler should eliminate this function as dead-code.
	 */
	if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
		return;

	atomic_notifier_call_chain(&rq->engine->context_status_notifier,
				   status, rq);
}

inline void
execlists_user_begin(struct intel_engine_execlists *execlists,
		     const struct execlist_port *port)
{
	execlists_set_active_once(execlists, EXECLISTS_ACTIVE_USER);
}

inline void
execlists_user_end(struct intel_engine_execlists *execlists)
{
	execlists_clear_active(execlists, EXECLISTS_ACTIVE_USER);
}

static inline void
execlists_context_schedule_in(struct i915_request *rq)
{
	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
	intel_engine_context_in(rq->engine);
}

static inline void
execlists_context_schedule_out(struct i915_request *rq, unsigned long status)
{
	intel_engine_context_out(rq->engine);
	execlists_context_status_change(rq, status);
	trace_i915_request_out(rq);
}

static void
execlists_update_context_pdps(struct i915_hw_ppgtt *ppgtt, u32 *reg_state)
{
	ASSIGN_CTX_PDP(ppgtt, reg_state, 3);
	ASSIGN_CTX_PDP(ppgtt, reg_state, 2);
	ASSIGN_CTX_PDP(ppgtt, reg_state, 1);
	ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
}

static u64 execlists_update_context(struct i915_request *rq)
{
	struct intel_context *ce = rq->hw_context;
	struct i915_hw_ppgtt *ppgtt =
		rq->gem_context->ppgtt ?: rq->i915->mm.aliasing_ppgtt;
	u32 *reg_state = ce->lrc_reg_state;

	reg_state[CTX_RING_TAIL+1] = intel_ring_set_tail(rq->ring, rq->tail);

	/* True 32b PPGTT with dynamic page allocation: update PDP
	 * registers and point the unallocated PDPs to scratch page.
	 * PML4 is allocated during ppgtt init, so this is not needed
	 * in 48-bit mode.
	 */
	if (ppgtt && !i915_vm_is_48bit(&ppgtt->vm))
		execlists_update_context_pdps(ppgtt, reg_state);

	return ce->lrc_desc;
}

static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
{
	if (execlists->ctrl_reg) {
		writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
		writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
	} else {
		writel(upper_32_bits(desc), execlists->submit_reg);
		writel(lower_32_bits(desc), execlists->submit_reg);
	}
}

static void execlists_submit_ports(struct intel_engine_cs *engine)
{
	struct intel_engine_execlists *execlists = &engine->execlists;
	struct execlist_port *port = execlists->port;
	unsigned int n;

	/*
	 * ELSQ note: the submit queue is not cleared after being submitted
	 * to the HW so we need to make sure we always clean it up. This is
	 * currently ensured by the fact that we always write the same number
	 * of elsq entries, keep this in mind before changing the loop below.
	 */
	for (n = execlists_num_ports(execlists); n--; ) {
		struct i915_request *rq;
		unsigned int count;
		u64 desc;

		rq = port_unpack(&port[n], &count);
		if (rq) {
			GEM_BUG_ON(count > !n);
			if (!count++)
				execlists_context_schedule_in(rq);
			port_set(&port[n], port_pack(rq, count));
			desc = execlists_update_context(rq);
			GEM_DEBUG_EXEC(port[n].context_id = upper_32_bits(desc));

			GEM_TRACE("%s in[%d]: ctx=%d.%d, global=%d (fence %llx:%d) (current %d), prio=%d\n",
				  engine->name, n,
				  port[n].context_id, count,
				  rq->global_seqno,
				  rq->fence.context, rq->fence.seqno,
				  intel_engine_get_seqno(engine),
				  rq_prio(rq));
		} else {
			GEM_BUG_ON(!n);
			desc = 0;
		}

		write_desc(execlists, desc, n);
	}

	/* we need to manually load the submit queue */
	if (execlists->ctrl_reg)
		writel(EL_CTRL_LOAD, execlists->ctrl_reg);

	execlists_clear_active(execlists, EXECLISTS_ACTIVE_HWACK);
}
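
/*
 * Illustrative sketch only, not part of the driver: the ordering contract
 * implemented by write_desc()/execlists_submit_ports() above. On hardware
 * without the ELSQ control register, both descriptors go to the single ELSP
 * register, upper dword first, and the write of the lower dword of element 0
 * is what triggers the submission:
 *
 *	write_desc(execlists, desc1, 1);	(second element written first)
 *	write_desc(execlists, desc0, 0);	(final lower-dword write submits)
 *
 * With ELSQ (Gen11+), the submit queue is armed instead and the trailing
 * writel(EL_CTRL_LOAD, execlists->ctrl_reg) performs the actual load.
 */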

static bool ctx_single_port_submission(const struct intel_context *ce)
{
	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
		i915_gem_context_force_single_submission(ce->gem_context));
}

static bool can_merge_ctx(const struct intel_context *prev,
			  const struct intel_context *next)
{
	if (prev != next)
		return false;

	if (ctx_single_port_submission(prev))
		return false;

	return true;
}

static void port_assign(struct execlist_port *port, struct i915_request *rq)
{
	GEM_BUG_ON(rq == port_request(port));

	if (port_isset(port))
		i915_request_put(port_request(port));

	port_set(port, port_pack(i915_request_get(rq), port_count(port)));
}

static void inject_preempt_context(struct intel_engine_cs *engine)
{
	struct intel_engine_execlists *execlists = &engine->execlists;
	struct intel_context *ce =
		to_intel_context(engine->i915->preempt_context, engine);
	unsigned int n;

	GEM_BUG_ON(execlists->preempt_complete_status !=
		   upper_32_bits(ce->lrc_desc));
	GEM_BUG_ON((ce->lrc_reg_state[CTX_CONTEXT_CONTROL + 1] &
		    _MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT |
				       CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT)) !=
		   _MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT |
				      CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT));

	/*
	 * Switch to our empty preempt context so
	 * the state of the GPU is known (idle).
	 */
	GEM_TRACE("%s\n", engine->name);
	for (n = execlists_num_ports(execlists); --n; )
		write_desc(execlists, 0, n);

	write_desc(execlists, ce->lrc_desc, n);

	/* we need to manually load the submit queue */
	if (execlists->ctrl_reg)
		writel(EL_CTRL_LOAD, execlists->ctrl_reg);

	execlists_clear_active(execlists, EXECLISTS_ACTIVE_HWACK);
	execlists_set_active(execlists, EXECLISTS_ACTIVE_PREEMPT);
}

static void complete_preempt_context(struct intel_engine_execlists *execlists)
{
	GEM_BUG_ON(!execlists_is_active(execlists, EXECLISTS_ACTIVE_PREEMPT));

	execlists_cancel_port_requests(execlists);
	__unwind_incomplete_requests(container_of(execlists,
						  struct intel_engine_cs,
						  execlists));

	execlists_clear_active(execlists, EXECLISTS_ACTIVE_PREEMPT);
}

static void execlists_dequeue(struct intel_engine_cs *engine)
{
	struct intel_engine_execlists * const execlists = &engine->execlists;
	struct execlist_port *port = execlists->port;
	const struct execlist_port * const last_port =
		&execlists->port[execlists->port_mask];
	struct i915_request *last = port_request(port);
	struct rb_node *rb;
	bool submit = false;

	/*
	 * Hardware submission is through 2 ports. Conceptually each port
	 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
	 * static for a context, and unique to each, so we only execute
	 * requests belonging to a single context from each ring. RING_HEAD
	 * is maintained by the CS in the context image, it marks the place
	 * where it got up to last time, and through RING_TAIL we tell the CS
	 * where we want to execute up to this time.
	 *
	 * In this list the requests are in order of execution. Consecutive
	 * requests from the same context are adjacent in the ringbuffer. We
	 * can combine these requests into a single RING_TAIL update:
	 *
	 *              RING_HEAD...req1...req2
	 *                                    ^- RING_TAIL
	 * since to execute req2 the CS must first execute req1.
	 *
	 * Our goal then is to point each port to the end of a consecutive
	 * sequence of requests as being the most optimal (fewest wake ups
	 * and context switches) submission.
	 */

	if (last) {
		/*
		 * Don't resubmit or switch until all outstanding
		 * preemptions (lite-restore) are seen. Then we
		 * know the next preemption status we see corresponds
		 * to this ELSP update.
		 */
		GEM_BUG_ON(!execlists_is_active(execlists,
						EXECLISTS_ACTIVE_USER));
		GEM_BUG_ON(!port_count(&port[0]));

		/*
		 * If we write to ELSP a second time before the HW has had
		 * a chance to respond to the previous write, we can confuse
		 * the HW and hit "undefined behaviour". After writing to ELSP,
		 * we must then wait until we see a context-switch event from
		 * the HW to indicate that it has had a chance to respond.
		 */
		if (!execlists_is_active(execlists, EXECLISTS_ACTIVE_HWACK))
			return;

		if (need_preempt(engine, last, execlists->queue_priority)) {
			inject_preempt_context(engine);
			return;
		}

		/*
		 * In theory, we could coalesce more requests onto
		 * the second port (the first port is active, with
		 * no preemptions pending). However, that means we
		 * then have to deal with the possible lite-restore
		 * of the second port (as we submit the ELSP, there
		 * may be a context-switch) but also we may complete
		 * the resubmission before the context-switch. Ergo,
		 * coalescing onto the second port will cause a
		 * preemption event, but we cannot predict whether
		 * that will affect port[0] or port[1].
		 *
		 * If the second port is already active, we can wait
		 * until the next context-switch before contemplating
		 * new requests. The GPU will be busy and we should be
		 * able to resubmit the new ELSP before it idles,
		 * avoiding pipeline bubbles (momentary pauses where
		 * the driver is unable to keep up the supply of new
		 * work). However, we have to double check that the
		 * priorities of the ports haven't been switched.
		 */
		if (port_count(&port[1]))
			return;

		/*
		 * WaIdleLiteRestore:bdw,skl
		 * Apply the wa NOOPs to prevent
		 * ring:HEAD == rq:TAIL as we resubmit the
		 * request. See gen8_emit_breadcrumb() for
		 * where we prepare the padding after the
		 * end of the request.
		 */
		last->tail = last->wa_tail;
	}

	while ((rb = rb_first_cached(&execlists->queue))) {
		struct i915_priolist *p = to_priolist(rb);
		struct i915_request *rq, *rn;

		list_for_each_entry_safe(rq, rn, &p->requests, sched.link) {
			/*
			 * Can we combine this request with the current port?
			 * It has to be the same context/ringbuffer and not
			 * have any exceptions (e.g. GVT saying never to
			 * combine contexts).
			 *
			 * If we can combine the requests, we can execute both
			 * by updating the RING_TAIL to point to the end of the
			 * second request, and so we never need to tell the
			 * hardware about the first.
			 */
			if (last &&
			    !can_merge_ctx(rq->hw_context, last->hw_context)) {
				/*
				 * If we are on the second port and cannot
				 * combine this request with the last, then we
				 * are done.
				 */
				if (port == last_port) {
					__list_del_many(&p->requests,
							&rq->sched.link);
					goto done;
				}

				/*
				 * If GVT overrides us we only ever submit
				 * port[0], leaving port[1] empty. Note that we
				 * also have to be careful that we don't queue
				 * the same context (even though a different
				 * request) to the second port.
				 */
				if (ctx_single_port_submission(last->hw_context) ||
				    ctx_single_port_submission(rq->hw_context)) {
					__list_del_many(&p->requests,
							&rq->sched.link);
					goto done;
				}

				GEM_BUG_ON(last->hw_context == rq->hw_context);

				if (submit)
					port_assign(port, last);
				port++;

				GEM_BUG_ON(port_isset(port));
			}

			INIT_LIST_HEAD(&rq->sched.link);
			__i915_request_submit(rq);
			trace_i915_request_in(rq, port_index(port, execlists));
			last = rq;
			submit = true;
		}

		rb_erase_cached(&p->node, &execlists->queue);
		INIT_LIST_HEAD(&p->requests);
		if (p->priority != I915_PRIORITY_NORMAL)
			kmem_cache_free(engine->i915->priorities, p);
	}

done:
	/*
	 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
	 *
	 * We choose queue_priority such that if we add a request of greater
	 * priority than this, we kick the submission tasklet to decide on
	 * the right order of submitting the requests to hardware. We must
	 * also be prepared to reorder requests as they are in-flight on the
	 * HW. We derive the queue_priority then as the first "hole" in
	 * the HW submission ports and if there are no available slots,
	 * the priority of the lowest executing request, i.e. last.
	 *
	 * When we do receive a higher priority request ready to run from the
	 * user, see queue_request(), the queue_priority is bumped to that
	 * request triggering preemption on the next dequeue (or subsequent
	 * interrupt for secondary ports).
	 */
	execlists->queue_priority =
		port != execlists->port ? rq_prio(last) : INT_MIN;

	if (submit) {
		port_assign(port, last);
		execlists_submit_ports(engine);
	}

	/* We must always keep the beast fed if we have work piled up */
	GEM_BUG_ON(rb_first_cached(&execlists->queue) &&
		   !port_isset(execlists->port));

	/* Re-evaluate the executing context setup after each preemptive kick */
	if (last)
		execlists_user_begin(execlists, execlists->port);

	/* If the engine is now idle, so should be the flag; and vice versa. */
	GEM_BUG_ON(execlists_is_active(&engine->execlists,
				       EXECLISTS_ACTIVE_USER) ==
		   !port_isset(engine->execlists.port));
}
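
/*
 * Illustrative note only, not part of the driver: the queue_priority rule
 * chosen at the end of execlists_dequeue() above, in pseudocode. "Hole"
 * means an unused submission port.
 *
 *	if (a submission port is still unused)
 *		queue_priority = INT_MIN;	(any new request kicks a dequeue)
 *	else
 *		queue_priority = rq_prio(last);	(only a higher priority preempts)
 */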

void
execlists_cancel_port_requests(struct intel_engine_execlists * const execlists)
{
	struct execlist_port *port = execlists->port;
	unsigned int num_ports = execlists_num_ports(execlists);

	while (num_ports-- && port_isset(port)) {
		struct i915_request *rq = port_request(port);

		GEM_TRACE("%s:port%u global=%d (fence %llx:%d), (current %d)\n",
			  rq->engine->name,
			  (unsigned int)(port - execlists->port),
			  rq->global_seqno,
			  rq->fence.context, rq->fence.seqno,
			  intel_engine_get_seqno(rq->engine));

		GEM_BUG_ON(!execlists->active);
		execlists_context_schedule_out(rq,
					       i915_request_completed(rq) ?
					       INTEL_CONTEXT_SCHEDULE_OUT :
					       INTEL_CONTEXT_SCHEDULE_PREEMPTED);

		i915_request_put(rq);

		memset(port, 0, sizeof(*port));
		port++;
	}

	execlists_clear_active(execlists, EXECLISTS_ACTIVE_USER);
	execlists_user_end(execlists);
}

static void reset_csb_pointers(struct intel_engine_execlists *execlists)
{
	/*
	 * After a reset, the HW starts writing into CSB entry [0]. We
	 * therefore have to set our HEAD pointer back one entry so that
	 * the *first* entry we check is entry 0. To complicate this further,
	 * as we don't wait for the first interrupt after reset, we have to
	 * fake the HW write to point back to the last entry so that our
	 * inline comparison of our cached head position against the last HW
	 * write works even before the first interrupt.
	 */
	execlists->csb_head = execlists->csb_write_reset;
	WRITE_ONCE(*execlists->csb_write, execlists->csb_write_reset);
}
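
/*
 * Illustrative note only, not part of the driver: the CSB is consumed as a
 * small ring of GEN8_CSB_ENTRIES slots. The HW advances the write pointer
 * and process_csb() below advances the cached head until it catches up:
 *
 *	head = execlists->csb_head;
 *	tail = READ_ONCE(*execlists->csb_write);
 *	while (head != tail) {
 *		if (++head == GEN8_CSB_ENTRIES)
 *			head = 0;
 *		handle(csb_status[head]);
 *	}
 *
 * which is why the reset above parks the head one entry behind entry 0.
 */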

static void nop_submission_tasklet(unsigned long data)
{
	/* The driver is wedged; don't process any more events. */
}

static void execlists_cancel_requests(struct intel_engine_cs *engine)
{
	struct intel_engine_execlists * const execlists = &engine->execlists;
	struct i915_request *rq, *rn;
	struct rb_node *rb;
	unsigned long flags;

	GEM_TRACE("%s current %d\n",
		  engine->name, intel_engine_get_seqno(engine));

	/*
	 * Before we call engine->cancel_requests(), we should have exclusive
	 * access to the submission state. This is arranged for us by the
	 * caller disabling the interrupt generation, the tasklet and other
	 * threads that may then access the same state, giving us a free hand
	 * to reset state. However, we still need to let lockdep be aware that
	 * we know this state may be accessed in hardirq context, so we
	 * disable the irq around this manipulation and we want to keep
	 * the spinlock focused on its duties and not accidentally conflate
	 * coverage to the submission's irq state. (Similarly, although we
	 * shouldn't need to disable irq around the manipulation of the
	 * submission's irq state, we also wish to remind ourselves that
	 * it is irq state.)
	 */
	spin_lock_irqsave(&engine->timeline.lock, flags);

	/* Cancel the requests on the HW and clear the ELSP tracker. */
	execlists_cancel_port_requests(execlists);

	/* Mark all executing requests as skipped. */
	list_for_each_entry(rq, &engine->timeline.requests, link) {
		GEM_BUG_ON(!rq->global_seqno);
		if (!i915_request_completed(rq))
			dma_fence_set_error(&rq->fence, -EIO);
	}

	/* Flush the queued requests to the timeline list (for retiring). */
	while ((rb = rb_first_cached(&execlists->queue))) {
		struct i915_priolist *p = to_priolist(rb);

		list_for_each_entry_safe(rq, rn, &p->requests, sched.link) {
			INIT_LIST_HEAD(&rq->sched.link);

			dma_fence_set_error(&rq->fence, -EIO);
			__i915_request_submit(rq);
		}

		rb_erase_cached(&p->node, &execlists->queue);
		INIT_LIST_HEAD(&p->requests);
		if (p->priority != I915_PRIORITY_NORMAL)
			kmem_cache_free(engine->i915->priorities, p);
	}

	/* Remaining _unready_ requests will be nop'ed when submitted */

	execlists->queue_priority = INT_MIN;
	execlists->queue = RB_ROOT_CACHED;
	GEM_BUG_ON(port_isset(execlists->port));

	GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
	execlists->tasklet.func = nop_submission_tasklet;

	spin_unlock_irqrestore(&engine->timeline.lock, flags);
}

static inline bool
reset_in_progress(const struct intel_engine_execlists *execlists)
{
	return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
}

static void process_csb(struct intel_engine_cs *engine)
{
	struct intel_engine_execlists * const execlists = &engine->execlists;
	struct execlist_port *port = execlists->port;
	const u32 * const buf = execlists->csb_status;
	u8 head, tail;

	/*
	 * Note that csb_write, csb_status may be either in HWSP or mmio.
	 * When reading from the csb_write mmio register, we have to be
	 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
	 * the low 4bits. As it happens we know the next 4bits are always
	 * zero and so we can simply mask off the low u8 of the register
	 * and treat it identically to reading from the HWSP (without having
	 * to use explicit shifting and masking, and probably bifurcating
	 * the code to handle the legacy mmio read).
	 */
	head = execlists->csb_head;
	tail = READ_ONCE(*execlists->csb_write);
	GEM_TRACE("%s cs-irq head=%d, tail=%d\n", engine->name, head, tail);
	if (unlikely(head == tail))
		return;

	/*
	 * Hopefully paired with a wmb() in HW!
	 *
	 * We must complete the read of the write pointer before any reads
	 * from the CSB, so that we do not see stale values. Without an rmb
	 * (lfence) the HW may speculatively perform the CSB[] reads *before*
	 * we perform the READ_ONCE(*csb_write).
	 */
	rmb();

	do {
		struct i915_request *rq;
		unsigned int status;
		unsigned int count;

		if (++head == GEN8_CSB_ENTRIES)
			head = 0;

		/*
		 * We are flying near dragons again.
		 *
		 * We hold a reference to the request in execlist_port[]
		 * but no more than that. We are operating in softirq
		 * context and so cannot hold any mutex or sleep. That
		 * prevents us stopping the requests we are processing
		 * in port[] from being retired simultaneously (the
		 * breadcrumb will be complete before we see the
		 * context-switch). As we only hold the reference to the
		 * request, any pointer chasing underneath the request
		 * is subject to a potential use-after-free. Thus we
		 * store all of the bookkeeping within port[] as
		 * required, and avoid using unguarded pointers beneath
		 * request itself. The same applies to the atomic
		 * status notifier.
		 */

		GEM_TRACE("%s csb[%d]: status=0x%08x:0x%08x, active=0x%x\n",
			  engine->name, head,
			  buf[2 * head + 0], buf[2 * head + 1],
			  execlists->active);

		status = buf[2 * head];
		if (status & (GEN8_CTX_STATUS_IDLE_ACTIVE |
			      GEN8_CTX_STATUS_PREEMPTED))
			execlists_set_active(execlists,
					     EXECLISTS_ACTIVE_HWACK);
		if (status & GEN8_CTX_STATUS_ACTIVE_IDLE)
			execlists_clear_active(execlists,
					       EXECLISTS_ACTIVE_HWACK);

		if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK))
			continue;

		/* We should never get a COMPLETED | IDLE_ACTIVE! */
		GEM_BUG_ON(status & GEN8_CTX_STATUS_IDLE_ACTIVE);

		if (status & GEN8_CTX_STATUS_COMPLETE &&
		    buf[2*head + 1] == execlists->preempt_complete_status) {
			GEM_TRACE("%s preempt-idle\n", engine->name);
			complete_preempt_context(execlists);
			continue;
		}

		if (status & GEN8_CTX_STATUS_PREEMPTED &&
		    execlists_is_active(execlists,
					EXECLISTS_ACTIVE_PREEMPT))
			continue;

		GEM_BUG_ON(!execlists_is_active(execlists,
						EXECLISTS_ACTIVE_USER));

		rq = port_unpack(port, &count);
		GEM_TRACE("%s out[0]: ctx=%d.%d, global=%d (fence %llx:%d) (current %d), prio=%d\n",
			  engine->name,
			  port->context_id, count,
			  rq ? rq->global_seqno : 0,
			  rq ? rq->fence.context : 0,
			  rq ? rq->fence.seqno : 0,
			  intel_engine_get_seqno(engine),
			  rq ? rq_prio(rq) : 0);

		/* Check the context/desc id for this event matches */
		GEM_DEBUG_BUG_ON(buf[2 * head + 1] != port->context_id);

		GEM_BUG_ON(count == 0);
		if (--count == 0) {
			/*
			 * On the final event corresponding to the
			 * submission of this context, we expect either
			 * an element-switch event or a completion
			 * event (and on completion, the active-idle
			 * marker). No more preemptions, lite-restore
			 * or otherwise.
			 */
			GEM_BUG_ON(status & GEN8_CTX_STATUS_PREEMPTED);
			GEM_BUG_ON(port_isset(&port[1]) &&
				   !(status & GEN8_CTX_STATUS_ELEMENT_SWITCH));
			GEM_BUG_ON(!port_isset(&port[1]) &&
				   !(status & GEN8_CTX_STATUS_ACTIVE_IDLE));

			/*
			 * We rely on the hardware being strongly
			 * ordered, that the breadcrumb write is
			 * coherent (visible from the CPU) before the
			 * user interrupt and CSB is processed.
			 */
			GEM_BUG_ON(!i915_request_completed(rq));

			execlists_context_schedule_out(rq,
						       INTEL_CONTEXT_SCHEDULE_OUT);
			i915_request_put(rq);

			GEM_TRACE("%s completed ctx=%d\n",
				  engine->name, port->context_id);

			port = execlists_port_complete(execlists, port);
			if (port_isset(port))
				execlists_user_begin(execlists, port);
			else
				execlists_user_end(execlists);
		} else {
			port_set(port, port_pack(rq, count));
		}
	} while (head != tail);

	execlists->csb_head = head;
}

static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
{
	lockdep_assert_held(&engine->timeline.lock);

	/*
	 * We can skip acquiring intel_runtime_pm_get() here as it was taken
	 * on our behalf by the request (see i915_gem_mark_busy()) and it will
	 * not be relinquished until the device is idle (see
	 * i915_gem_idle_work_handler()). As a precaution, we make sure
	 * that all ELSP are drained i.e. we have processed the CSB,
	 * before allowing ourselves to idle and calling intel_runtime_pm_put().
	 */
	GEM_BUG_ON(!engine->i915->gt.awake);

	process_csb(engine);
	if (!execlists_is_active(&engine->execlists, EXECLISTS_ACTIVE_PREEMPT))
		execlists_dequeue(engine);
}

/*
 * Check the unread Context Status Buffers and manage the submission of new
 * contexts to the ELSP accordingly.
 */
static void execlists_submission_tasklet(unsigned long data)
{
	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
	unsigned long flags;

	GEM_TRACE("%s awake?=%d, active=%x\n",
		  engine->name,
		  engine->i915->gt.awake,
		  engine->execlists.active);

	spin_lock_irqsave(&engine->timeline.lock, flags);

	if (engine->i915->gt.awake) /* we may be delayed until after we idle! */
		__execlists_submission_tasklet(engine);

	spin_unlock_irqrestore(&engine->timeline.lock, flags);
}

static void queue_request(struct intel_engine_cs *engine,
			  struct i915_sched_node *node,
			  int prio)
{
	list_add_tail(&node->link,
		      &lookup_priolist(engine, prio)->requests);
}

static void __update_queue(struct intel_engine_cs *engine, int prio)
{
	engine->execlists.queue_priority = prio;
}

static void __submit_queue_imm(struct intel_engine_cs *engine)
{
	struct intel_engine_execlists * const execlists = &engine->execlists;

	if (reset_in_progress(execlists))
		return; /* defer until we restart the engine following reset */

	if (execlists->tasklet.func == execlists_submission_tasklet)
		__execlists_submission_tasklet(engine);
	else
		tasklet_hi_schedule(&execlists->tasklet);
}

static void submit_queue(struct intel_engine_cs *engine, int prio)
{
	if (prio > engine->execlists.queue_priority) {
		__update_queue(engine, prio);
		__submit_queue_imm(engine);
	}
}

static void execlists_submit_request(struct i915_request *request)
{
	struct intel_engine_cs *engine = request->engine;
	unsigned long flags;

	/* Will be called from irq-context when using foreign fences. */
	spin_lock_irqsave(&engine->timeline.lock, flags);

	queue_request(engine, &request->sched, rq_prio(request));

	GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
	GEM_BUG_ON(list_empty(&request->sched.link));

	submit_queue(engine, rq_prio(request));

	spin_unlock_irqrestore(&engine->timeline.lock, flags);
}

static struct i915_request *sched_to_request(struct i915_sched_node *node)
{
	return container_of(node, struct i915_request, sched);
}

static struct intel_engine_cs *
sched_lock_engine(struct i915_sched_node *node, struct intel_engine_cs *locked)
{
	struct intel_engine_cs *engine = sched_to_request(node)->engine;

	GEM_BUG_ON(!locked);

	if (engine != locked) {
		spin_unlock(&locked->timeline.lock);
		spin_lock(&engine->timeline.lock);
	}

	return engine;
}

static void execlists_schedule(struct i915_request *request,
			       const struct i915_sched_attr *attr)
{
	struct i915_priolist *uninitialized_var(pl);
	struct intel_engine_cs *engine, *last;
	struct i915_dependency *dep, *p;
	struct i915_dependency stack;
	const int prio = attr->priority;
	LIST_HEAD(dfs);

	GEM_BUG_ON(prio == I915_PRIORITY_INVALID);

	if (i915_request_completed(request))
		return;

	if (prio <= READ_ONCE(request->sched.attr.priority))
		return;

	/* Need BKL in order to use the temporary link inside i915_dependency */
	lockdep_assert_held(&request->i915->drm.struct_mutex);

	stack.signaler = &request->sched;
	list_add(&stack.dfs_link, &dfs);

	/*
	 * Recursively bump all dependent priorities to match the new request.
	 *
	 * A naive approach would be to use recursion:
	 * static void update_priorities(struct i915_sched_node *node, prio) {
	 *	list_for_each_entry(dep, &node->signalers_list, signal_link)
	 *		update_priorities(dep->signal, prio)
	 *	queue_request(node);
	 * }
	 * but that may have unlimited recursion depth and so runs a very
	 * real risk of overrunning the kernel stack. Instead, we build
	 * a flat list of all dependencies starting with the current request.
	 * As we walk the list of dependencies, we add all of its dependencies
	 * to the end of the list (this may include an already visited
	 * request) and continue to walk onwards onto the new dependencies. The
	 * end result is a topological list of requests in reverse order, the
	 * last element in the list is the request we must execute first.
	 */
	list_for_each_entry(dep, &dfs, dfs_link) {
		struct i915_sched_node *node = dep->signaler;

		/*
		 * Within an engine, there can be no cycle, but we may
		 * refer to the same dependency chain multiple times
		 * (redundant dependencies are not eliminated) and across
		 * engines.
		 */
		list_for_each_entry(p, &node->signalers_list, signal_link) {
			GEM_BUG_ON(p == dep); /* no cycles! */

			if (i915_sched_node_signaled(p->signaler))
				continue;

			GEM_BUG_ON(p->signaler->attr.priority < node->attr.priority);
			if (prio > READ_ONCE(p->signaler->attr.priority))
				list_move_tail(&p->dfs_link, &dfs);
		}
	}

	/*
	 * If we didn't need to bump any existing priorities, and we haven't
	 * yet submitted this request (i.e. there is no potential race with
	 * execlists_submit_request()), we can set our own priority and skip
	 * acquiring the engine locks.
	 */
	if (request->sched.attr.priority == I915_PRIORITY_INVALID) {
		GEM_BUG_ON(!list_empty(&request->sched.link));
		request->sched.attr = *attr;
		if (stack.dfs_link.next == stack.dfs_link.prev)
			return;
		__list_del_entry(&stack.dfs_link);
	}

	last = NULL;
	engine = request->engine;
	spin_lock_irq(&engine->timeline.lock);

	/* Fifo and depth-first replacement ensure our deps execute before us */
	list_for_each_entry_safe_reverse(dep, p, &dfs, dfs_link) {
		struct i915_sched_node *node = dep->signaler;

		INIT_LIST_HEAD(&dep->dfs_link);

		engine = sched_lock_engine(node, engine);

		if (prio <= node->attr.priority)
			continue;

		node->attr.priority = prio;
		if (!list_empty(&node->link)) {
			if (last != engine) {
				pl = lookup_priolist(engine, prio);
				last = engine;
			}
			GEM_BUG_ON(pl->priority != prio);
			list_move_tail(&node->link, &pl->requests);
		}

		if (prio > engine->execlists.queue_priority &&
		    i915_sw_fence_done(&sched_to_request(node)->submit)) {
			/* defer submission until after all of our updates */
			__update_queue(engine, prio);
			tasklet_hi_schedule(&engine->execlists.tasklet);
		}
	}

	spin_unlock_irq(&engine->timeline.lock);
}
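
/*
 * Illustrative note only, not part of the driver: the effect of the
 * priority bump above on a small dependency chain. With hypothetical
 * requests A -> B -> C (C depends on B, B on A), all initially at
 * priority 0, calling execlists_schedule(C, {.priority = 2}) walks the
 * signalers depth-first and leaves:
 *
 *	A.prio == 2, B.prio == 2, C.prio == 2
 *
 * so that the dependencies of C are never outrun by C itself.
 */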

static void execlists_context_destroy(struct intel_context *ce)
{
	GEM_BUG_ON(ce->pin_count);

	if (!ce->state)
		return;

	intel_ring_free(ce->ring);

	GEM_BUG_ON(i915_gem_object_is_active(ce->state->obj));
	i915_gem_object_put(ce->state->obj);
}

static void execlists_context_unpin(struct intel_context *ce)
{
	intel_ring_unpin(ce->ring);

	ce->state->obj->pin_global--;
	i915_gem_object_unpin_map(ce->state->obj);
	i915_vma_unpin(ce->state);

	i915_gem_context_put(ce->gem_context);
}

static int __context_pin(struct i915_gem_context *ctx, struct i915_vma *vma)
{
	unsigned int flags;
	int err;

	/*
	 * Clear this page out of any CPU caches for coherent swap-in/out.
	 * We only want to do this on the first bind so that we do not stall
	 * on an active context (which by nature is already on the GPU).
	 */
	if (!(vma->flags & I915_VMA_GLOBAL_BIND)) {
		err = i915_gem_object_set_to_gtt_domain(vma->obj, true);
		if (err)
			return err;
	}

	flags = PIN_GLOBAL | PIN_HIGH;
	if (ctx->ggtt_offset_bias)
		flags |= PIN_OFFSET_BIAS | ctx->ggtt_offset_bias;

	return i915_vma_pin(vma, 0, GEN8_LR_CONTEXT_ALIGN, flags);
}

static struct intel_context *
__execlists_context_pin(struct intel_engine_cs *engine,
			struct i915_gem_context *ctx,
			struct intel_context *ce)
{
	void *vaddr;
	int ret;

	ret = execlists_context_deferred_alloc(ctx, engine, ce);
	if (ret)
		goto err;
	GEM_BUG_ON(!ce->state);

	ret = __context_pin(ctx, ce->state);
	if (ret)
		goto err;

	vaddr = i915_gem_object_pin_map(ce->state->obj, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		ret = PTR_ERR(vaddr);
		goto unpin_vma;
	}

	ret = intel_ring_pin(ce->ring, ctx->i915, ctx->ggtt_offset_bias);
	if (ret)
		goto unpin_map;

	intel_lr_context_descriptor_update(ctx, engine, ce);

	ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
	ce->lrc_reg_state[CTX_RING_BUFFER_START+1] =
		i915_ggtt_offset(ce->ring->vma);
	GEM_BUG_ON(!intel_ring_offset_valid(ce->ring, ce->ring->head));
	ce->lrc_reg_state[CTX_RING_HEAD+1] = ce->ring->head;

	ce->state->obj->pin_global++;
	i915_gem_context_get(ctx);
	return ce;

unpin_map:
	i915_gem_object_unpin_map(ce->state->obj);
unpin_vma:
	__i915_vma_unpin(ce->state);
err:
	ce->pin_count = 0;
	return ERR_PTR(ret);
}

static const struct intel_context_ops execlists_context_ops = {
	.unpin = execlists_context_unpin,
	.destroy = execlists_context_destroy,
};

static struct intel_context *
execlists_context_pin(struct intel_engine_cs *engine,
		      struct i915_gem_context *ctx)
{
	struct intel_context *ce = to_intel_context(ctx, engine);

	lockdep_assert_held(&ctx->i915->drm.struct_mutex);

	if (likely(ce->pin_count++))
		return ce;
	GEM_BUG_ON(!ce->pin_count); /* no overflow please! */

	ce->ops = &execlists_context_ops;

	return __execlists_context_pin(engine, ctx, ce);
}

static int execlists_request_alloc(struct i915_request *request)
{
	int ret;

	GEM_BUG_ON(!request->hw_context->pin_count);

	/* Flush enough space to reduce the likelihood of waiting after
	 * we start building the request - in which case we will just
	 * have to repeat work.
	 */
	request->reserved_space += EXECLISTS_REQUEST_SIZE;

	ret = intel_ring_wait_for_space(request->ring, request->reserved_space);
	if (ret)
		return ret;

	/* Note that after this point, we have committed to using
	 * this request as it is being used to both track the
	 * state of engine initialisation and liveness of the
	 * golden renderstate above. Think twice before you try
	 * to cancel/unwind this request now.
	 */

	request->reserved_space -= EXECLISTS_REQUEST_SIZE;
	return 0;
}

/*
 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
 * PIPE_CONTROL instruction. This is required for the flush to happen correctly
 * but there is a slight complication as this is applied in WA batch where the
 * values are only initialized once so we cannot take register value at the
 * beginning and reuse it further; hence we save its value to memory, upload a
 * constant value with bit21 set and then we restore it back with the saved value.
 * To simplify the WA, a constant value is formed by using the default value
 * of this register. This shouldn't be a problem because we are only modifying
 * it for a short period and this batch is non-preemptible. We can of course
 * use additional instructions that read the actual value of the register
 * at that time and set our bit of interest but it makes the WA complicated.
 *
 * This WA is also required for Gen9 so extracting as a function avoids
 * code duplication.
 */
static u32 *
gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
{
	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = i915_ggtt_offset(engine->scratch) + 256;
	*batch++ = 0;

	*batch++ = MI_LOAD_REGISTER_IMM(1);
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;

	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_DC_FLUSH_ENABLE,
				       0);

	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = i915_ggtt_offset(engine->scratch) + 256;
	*batch++ = 0;

	return batch;
}

6e5248b5
DV
1452/*
1453 * Typically we only have one indirect_ctx and per_ctx batch buffer which are
1454 * initialized at the beginning and shared across all contexts, but this field
1455 * helps us to have multiple batches at different offsets and select them based
1456 * on some criterion. At the moment each batch always starts at the beginning of
1457 * the page and we don't have multiple wa_ctx batch buffers.
4d78c8dc 1458 *
6e5248b5
DV
1459 * The number of WAs applied is not known in advance; we use this field
1460 * to return the number of DWORDs written.
17ee950d 1461 *
6e5248b5
DV
1462 * Note that this batch does not contain MI_BATCH_BUFFER_END,
1463 * so it adds NOOPs as padding to make it cacheline aligned.
1464 * MI_BATCH_BUFFER_END will be added to the per-ctx batch, and the two together
1465 * make a complete batch buffer.
17ee950d 1466 */
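/*
 * The indirect_ctx size recorded by the caller is later programmed into
 * CTX_RCS_INDIRECT_CTX in units of cachelines (see execlists_init_reg_state()),
 * which is another reason the batch must end on a CACHELINE_BYTES boundary.
 */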
097d4f1c 1467static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
17ee950d 1468{
7ad00d1a 1469 /* WaDisableCtxRestoreArbitration:bdw,chv */
097d4f1c 1470 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
17ee950d 1471
c82435bb 1472 /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
097d4f1c
TU
1473 if (IS_BROADWELL(engine->i915))
1474 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
c82435bb 1475
0160f055
AS
1476 /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
1477 /* Actual scratch location is at 128 bytes offset */
9f235dfa
TU
1478 batch = gen8_emit_pipe_control(batch,
1479 PIPE_CONTROL_FLUSH_L3 |
1480 PIPE_CONTROL_GLOBAL_GTT_IVB |
1481 PIPE_CONTROL_CS_STALL |
1482 PIPE_CONTROL_QW_WRITE,
1483 i915_ggtt_offset(engine->scratch) +
1484 2 * CACHELINE_BYTES);
0160f055 1485
beecec90
CW
1486 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1487
17ee950d 1488 /* Pad to end of cacheline */
097d4f1c
TU
1489 while ((unsigned long)batch % CACHELINE_BYTES)
1490 *batch++ = MI_NOOP;
17ee950d
AS
1491
1492 /*
1493 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
1494 * execution depends on the length specified in terms of cache lines
1495 * in the register CTX_RCS_INDIRECT_CTX
1496 */
1497
097d4f1c 1498 return batch;
17ee950d
AS
1499}
1500
5ee4a7a6
CW
1501struct lri {
1502 i915_reg_t reg;
1503 u32 value;
1504};
1505
1506static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
0504cffc 1507{
5ee4a7a6 1508 GEM_BUG_ON(!count || count > 63);
beecec90 1509
5ee4a7a6
CW
1510 *batch++ = MI_LOAD_REGISTER_IMM(count);
1511 do {
1512 *batch++ = i915_mmio_reg_offset(lri->reg);
1513 *batch++ = lri->value;
1514 } while (lri++, --count);
1515 *batch++ = MI_NOOP;
a4106a78 1516
5ee4a7a6
CW
1517 return batch;
1518}
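/*
 * emit_lri() consumes 2 * count + 2 dwords: the MI_LOAD_REGISTER_IMM header,
 * count (register, value) pairs and a trailing MI_NOOP as padding.
 */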
b77422f8 1519
5ee4a7a6
CW
1520static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1521{
1522 static const struct lri lri[] = {
1523 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
1524 {
1525 COMMON_SLICE_CHICKEN2,
1526 __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
1527 0),
1528 },
1529
1530 /* BSpec: 11391 */
1531 {
1532 FF_SLICE_CHICKEN,
1533 __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
1534 FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
1535 },
1536
1537 /* BSpec: 11299 */
1538 {
1539 _3D_CHICKEN3,
1540 __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
1541 _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
1542 }
1543 };
b77422f8 1544
5ee4a7a6 1545 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
b77422f8 1546
5ee4a7a6
CW
1547 /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
1548 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
b77422f8 1549
5ee4a7a6 1550 batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
873e8171 1551
066d4628
MK
1552 /* WaClearSlmSpaceAtContextSwitch:kbl */
1553 /* Actual scratch location is at 128 bytes offset */
097d4f1c 1554 if (IS_KBL_REVID(engine->i915, 0, KBL_REVID_A0)) {
9f235dfa
TU
1555 batch = gen8_emit_pipe_control(batch,
1556 PIPE_CONTROL_FLUSH_L3 |
1557 PIPE_CONTROL_GLOBAL_GTT_IVB |
1558 PIPE_CONTROL_CS_STALL |
1559 PIPE_CONTROL_QW_WRITE,
1560 i915_ggtt_offset(engine->scratch)
1561 + 2 * CACHELINE_BYTES);
066d4628 1562 }
3485d99e 1563
9fb5026f 1564 /* WaMediaPoolStateCmdInWABB:bxt,glk */
3485d99e
TG
1565 if (HAS_POOLED_EU(engine->i915)) {
1566 /*
1567 * EU pool configuration is set up along with the golden context
1568 * during context initialization. This value depends on
1569 * device type (2x6 or 3x6) and needs to be updated based
1570 * on which subslice is disabled, especially for 2x6
1571 * devices. However, it is safe to load the default
1572 * configuration of a 3x6 device instead of masking off
1573 * the corresponding bits, because HW ignores bits of a disabled
1574 * subslice and drops down to the appropriate config. Please
1575 * see render_state_setup() in i915_gem_render_state.c for
1576 * possible configurations; to avoid duplication they are
1577 * not shown here again.
1578 */
097d4f1c
TU
1579 *batch++ = GEN9_MEDIA_POOL_STATE;
1580 *batch++ = GEN9_MEDIA_POOL_ENABLE;
1581 *batch++ = 0x00777000;
1582 *batch++ = 0;
1583 *batch++ = 0;
1584 *batch++ = 0;
3485d99e
TG
1585 }
1586
beecec90
CW
1587 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1588
0504cffc 1589 /* Pad to end of cacheline */
097d4f1c
TU
1590 while ((unsigned long)batch % CACHELINE_BYTES)
1591 *batch++ = MI_NOOP;
0504cffc 1592
097d4f1c 1593 return batch;
0504cffc
AS
1594}
1595
4b6ce681
RA
1596static u32 *
1597gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1598{
1599 int i;
1600
1601 /*
1602 * WaPipeControlBefore3DStateSamplePattern: cnl
1603 *
1604 * Ensure the engine is idle prior to programming a
1605 * 3DSTATE_SAMPLE_PATTERN during a context restore.
1606 */
1607 batch = gen8_emit_pipe_control(batch,
1608 PIPE_CONTROL_CS_STALL,
1609 0);
1610 /*
1611 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
1612 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
1613 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
1614 * confusing. Since gen8_emit_pipe_control() already advances the
1615 * batch by 6 dwords, we advance the other 10 here, completing a
1616 * cacheline. It's not clear if the workaround requires this padding
1617 * before other commands, or if it's just the regular padding we would
1618 * already have for the workaround bb, so leave it here for now.
1619 */
1620 for (i = 0; i < 10; i++)
1621 *batch++ = MI_NOOP;
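	/*
	 * At this point the batch holds 6 + 10 = 16 dwords, i.e. 64 bytes or
	 * exactly one cacheline, so the alignment loop below is expected to be
	 * a no-op given the caller hands us a cacheline-aligned start.
	 */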
1622
1623 /* Pad to end of cacheline */
1624 while ((unsigned long)batch % CACHELINE_BYTES)
1625 *batch++ = MI_NOOP;
1626
1627 return batch;
1628}
1629
097d4f1c
TU
1630#define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
1631
1632static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
17ee950d 1633{
48bb74e4
CW
1634 struct drm_i915_gem_object *obj;
1635 struct i915_vma *vma;
1636 int err;
17ee950d 1637
097d4f1c 1638 obj = i915_gem_object_create(engine->i915, CTX_WA_BB_OBJ_SIZE);
48bb74e4
CW
1639 if (IS_ERR(obj))
1640 return PTR_ERR(obj);
17ee950d 1641
82ad6443 1642 vma = i915_vma_instance(obj, &engine->i915->ggtt.vm, NULL);
48bb74e4
CW
1643 if (IS_ERR(vma)) {
1644 err = PTR_ERR(vma);
1645 goto err;
17ee950d
AS
1646 }
1647
48bb74e4
CW
1648 err = i915_vma_pin(vma, 0, PAGE_SIZE, PIN_GLOBAL | PIN_HIGH);
1649 if (err)
1650 goto err;
1651
1652 engine->wa_ctx.vma = vma;
17ee950d 1653 return 0;
48bb74e4
CW
1654
1655err:
1656 i915_gem_object_put(obj);
1657 return err;
17ee950d
AS
1658}
1659
097d4f1c 1660static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
17ee950d 1661{
19880c4a 1662 i915_vma_unpin_and_release(&engine->wa_ctx.vma);
17ee950d
AS
1663}
1664
097d4f1c
TU
1665typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
1666
0bc40be8 1667static int intel_init_workaround_bb(struct intel_engine_cs *engine)
17ee950d 1668{
48bb74e4 1669 struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
097d4f1c
TU
1670 struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
1671 &wa_ctx->per_ctx };
1672 wa_bb_func_t wa_bb_fn[2];
17ee950d 1673 struct page *page;
097d4f1c
TU
1674 void *batch, *batch_ptr;
1675 unsigned int i;
48bb74e4 1676 int ret;
17ee950d 1677
10bde236 1678 if (GEM_WARN_ON(engine->id != RCS))
097d4f1c 1679 return -EINVAL;
17ee950d 1680
097d4f1c 1681 switch (INTEL_GEN(engine->i915)) {
cc38cae7
OM
1682 case 11:
1683 return 0;
90007bca 1684 case 10:
4b6ce681
RA
1685 wa_bb_fn[0] = gen10_init_indirectctx_bb;
1686 wa_bb_fn[1] = NULL;
1687 break;
097d4f1c
TU
1688 case 9:
1689 wa_bb_fn[0] = gen9_init_indirectctx_bb;
b8aa2233 1690 wa_bb_fn[1] = NULL;
097d4f1c
TU
1691 break;
1692 case 8:
1693 wa_bb_fn[0] = gen8_init_indirectctx_bb;
3ad7b52d 1694 wa_bb_fn[1] = NULL;
097d4f1c
TU
1695 break;
1696 default:
1697 MISSING_CASE(INTEL_GEN(engine->i915));
5e60d790 1698 return 0;
0504cffc 1699 }
5e60d790 1700
097d4f1c 1701 ret = lrc_setup_wa_ctx(engine);
17ee950d
AS
1702 if (ret) {
1703 DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret);
1704 return ret;
1705 }
1706
48bb74e4 1707 page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0);
097d4f1c 1708 batch = batch_ptr = kmap_atomic(page);
17ee950d 1709
097d4f1c
TU
1710 /*
1711 * Emit the two workaround batch buffers, recording the offset from the
1712 * start of the workaround batch buffer object for each and their
1713 * respective sizes.
1714 */
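	/*
	 * Both batches share the single CTX_WA_BB_OBJ_SIZE (one page) object
	 * allocated in lrc_setup_wa_ctx(); the BUG_ON() after this loop guards
	 * against the emitters ever outgrowing it.
	 */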
1715 for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
1716 wa_bb[i]->offset = batch_ptr - batch;
1d2a19c2
CW
1717 if (GEM_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
1718 CACHELINE_BYTES))) {
097d4f1c
TU
1719 ret = -EINVAL;
1720 break;
1721 }
604a8f6f
CW
1722 if (wa_bb_fn[i])
1723 batch_ptr = wa_bb_fn[i](engine, batch_ptr);
097d4f1c 1724 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
17ee950d
AS
1725 }
1726
097d4f1c
TU
1727 BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
1728
17ee950d
AS
1729 kunmap_atomic(batch);
1730 if (ret)
097d4f1c 1731 lrc_destroy_wa_ctx(engine);
17ee950d
AS
1732
1733 return ret;
1734}
1735
f3c9d407 1736static void enable_execlists(struct intel_engine_cs *engine)
9b1136d5 1737{
c033666a 1738 struct drm_i915_private *dev_priv = engine->i915;
f3c9d407
CW
1739
1740 I915_WRITE(RING_HWSTAM(engine->mmio_base), 0xffffffff);
225701fc
KG
1741
1742 /*
1743 * Make sure we're not enabling the new 12-deep CSB
1744 * FIFO as that requires a slightly updated handling
1745 * in the ctx switch irq. Since we're currently
1746 * using only 2 elements of the enhanced execlists, the
1747 * deeper FIFO is not needed and it's not worth adding
1748 * more statements to the irq handler to support it.
1749 */
1750 if (INTEL_GEN(dev_priv) >= 11)
1751 I915_WRITE(RING_MODE_GEN7(engine),
1752 _MASKED_BIT_DISABLE(GEN11_GFX_DISABLE_LEGACY_MODE));
1753 else
1754 I915_WRITE(RING_MODE_GEN7(engine),
1755 _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE));
1756
9a4dc803
CW
1757 I915_WRITE(RING_MI_MODE(engine->mmio_base),
1758 _MASKED_BIT_DISABLE(STOP_RING));
1759
f3c9d407
CW
1760 I915_WRITE(RING_HWS_PGA(engine->mmio_base),
1761 engine->status_page.ggtt_offset);
1762 POSTING_READ(RING_HWS_PGA(engine->mmio_base));
1763}
1764
9a4dc803
CW
1765static bool unexpected_starting_state(struct intel_engine_cs *engine)
1766{
1767 struct drm_i915_private *dev_priv = engine->i915;
1768 bool unexpected = false;
1769
1770 if (I915_READ(RING_MI_MODE(engine->mmio_base)) & STOP_RING) {
1771 DRM_DEBUG_DRIVER("STOP_RING still set in RING_MI_MODE\n");
1772 unexpected = true;
1773 }
1774
1775 return unexpected;
1776}
1777
f3c9d407
CW
1778static int gen8_init_common_ring(struct intel_engine_cs *engine)
1779{
821ed7df
CW
1780 int ret;
1781
1782 ret = intel_mocs_init_engine(engine);
1783 if (ret)
1784 return ret;
9b1136d5 1785
ad07dfcd 1786 intel_engine_reset_breadcrumbs(engine);
821ed7df 1787
9a4dc803
CW
1788 if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
1789 struct drm_printer p = drm_debug_printer(__func__);
1790
1791 intel_engine_dump(engine, &p, NULL);
1792 }
1793
f3c9d407 1794 enable_execlists(engine);
9b1136d5 1795
821ed7df 1796 return 0;
9b1136d5
OM
1797}
1798
0bc40be8 1799static int gen8_init_render_ring(struct intel_engine_cs *engine)
9b1136d5 1800{
c033666a 1801 struct drm_i915_private *dev_priv = engine->i915;
9b1136d5
OM
1802 int ret;
1803
0bc40be8 1804 ret = gen8_init_common_ring(engine);
9b1136d5
OM
1805 if (ret)
1806 return ret;
1807
f4ecfbfc 1808 intel_whitelist_workarounds_apply(engine);
59b449d5 1809
9b1136d5
OM
1810 /* We need to disable the AsyncFlip performance optimisations in order
1811 * to use MI_WAIT_FOR_EVENT within the CS. It should already be
1812 * programmed to '1' on all products.
1813 *
1814 * WaDisableAsyncFlipPerfMode:snb,ivb,hsw,vlv,bdw,chv
1815 */
1816 I915_WRITE(MI_MODE, _MASKED_BIT_ENABLE(ASYNC_FLIP_PERF_DISABLE));
1817
9b1136d5
OM
1818 I915_WRITE(INSTPM, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING));
1819
59b449d5 1820 return 0;
9b1136d5
OM
1821}
1822
0bc40be8 1823static int gen9_init_render_ring(struct intel_engine_cs *engine)
82ef822e
DL
1824{
1825 int ret;
1826
0bc40be8 1827 ret = gen8_init_common_ring(engine);
82ef822e
DL
1828 if (ret)
1829 return ret;
1830
f4ecfbfc 1831 intel_whitelist_workarounds_apply(engine);
59b449d5
OM
1832
1833 return 0;
82ef822e
DL
1834}
1835
5adfb772
CW
1836static struct i915_request *
1837execlists_reset_prepare(struct intel_engine_cs *engine)
1838{
1839 struct intel_engine_execlists * const execlists = &engine->execlists;
63572937 1840 struct i915_request *request, *active;
9512f985 1841 unsigned long flags;
5adfb772
CW
1842
1843 GEM_TRACE("%s\n", engine->name);
1844
1845 /*
1846 * Prevent request submission to the hardware until we have
1847 * completed the reset in i915_gem_reset_finish(). If a request
1848 * is completed by one engine, it may then queue a request
1849 * to a second via its execlists->tasklet *just* as we are
1850 * calling engine->init_hw() and also writing the ELSP.
1851 * Turning off the execlists->tasklet until the reset is over
1852 * prevents the race.
1853 */
1854 __tasklet_disable_sync_once(&execlists->tasklet);
1855
9512f985
CW
1856 spin_lock_irqsave(&engine->timeline.lock, flags);
1857
63572937
CW
1858 /*
1859 * We want to flush the pending context switches; having disabled
1860 * the tasklet above, we can assume exclusive access to the execlists.
1861 * This allows us to catch up with an in-flight preemption event,
1862 * and avoid blaming an innocent request if the stall was due to the
1863 * preemption itself.
1864 */
fd8526e5 1865 process_csb(engine);
63572937
CW
1866
1867 /*
1868 * The last active request can then be no later than the last request
1869 * now in ELSP[0]. So search backwards from there, so that if the GPU
1870 * has advanced beyond the last CSB update, it will be pardoned.
1871 */
1872 active = NULL;
1873 request = port_request(execlists->port);
1874 if (request) {
3f6e9822
CW
1875 /*
1876 * Prevent the breadcrumb from advancing before we decide
1877 * which request is currently active.
1878 */
1879 intel_engine_stop_cs(engine);
1880
63572937
CW
1881 list_for_each_entry_from_reverse(request,
1882 &engine->timeline.requests,
1883 link) {
1884 if (__i915_request_completed(request,
1885 request->global_seqno))
1886 break;
1887
1888 active = request;
1889 }
63572937
CW
1890 }
1891
9512f985
CW
1892 spin_unlock_irqrestore(&engine->timeline.lock, flags);
1893
63572937 1894 return active;
5adfb772
CW
1895}
1896
1897static void execlists_reset(struct intel_engine_cs *engine,
1898 struct i915_request *request)
821ed7df 1899{
b620e870 1900 struct intel_engine_execlists * const execlists = &engine->execlists;
221ab971 1901 unsigned long flags;
5692251c 1902 u32 *regs;
cdb6ded4 1903
0c5c7df3
TU
1904 GEM_TRACE("%s request global=%x, current=%d\n",
1905 engine->name, request ? request->global_seqno : 0,
1906 intel_engine_get_seqno(engine));
42232213 1907
d8857d54 1908 spin_lock_irqsave(&engine->timeline.lock, flags);
221ab971 1909
cdb6ded4
CW
1910 /*
1911 * Catch up with any missed context-switch interrupts.
1912 *
1913 * Ideally we would just read the remaining CSB entries now that we
1914 * know the gpu is idle. However, the CSB registers are sometimes^W
1915 * often trashed across a GPU reset! Instead we have to rely on
1916 * guessing the missed context-switch events by looking at what
1917 * requests were completed.
1918 */
a4598d17 1919 execlists_cancel_port_requests(execlists);
cdb6ded4 1920
221ab971 1921 /* Push back any incomplete requests for replay after the reset. */
a4598d17 1922 __unwind_incomplete_requests(engine);
cdb6ded4 1923
c3160da9 1924 /* Following the reset, we need to reload the CSB read/write pointers */
f4b58f04 1925 reset_csb_pointers(&engine->execlists);
c3160da9 1926
d8857d54 1927 spin_unlock_irqrestore(&engine->timeline.lock, flags);
aebbc2d7 1928
a3e38836
CW
1929 /*
1930 * If the request was innocent, we leave the request in the ELSP
c0dcb203
CW
1931 * and will try to replay it on restarting. The context image may
1932 * have been corrupted by the reset, in which case we may have
1933 * to service a new GPU hang, but more likely we can continue on
1934 * without impact.
1935 *
1936 * If the request was guilty, we presume the context is corrupt
1937 * and have to at least restore the RING register in the context
1938 * image back to the expected values to skip over the guilty request.
1939 */
221ab971 1940 if (!request || request->fence.error != -EIO)
c0dcb203 1941 return;
821ed7df 1942
a3e38836
CW
1943 /*
1944 * We want a simple context + ring to execute the breadcrumb update.
a3aabe86
CW
1945 * We cannot rely on the context being intact across the GPU hang,
1946 * so clear it and rebuild just what we need for the breadcrumb.
1947 * All pending requests for this context will be zapped, and any
1948 * future request will be after userspace has had the opportunity
1949 * to recreate its own state.
1950 */
1fc44d9b 1951 regs = request->hw_context->lrc_reg_state;
fe0c4935
CW
1952 if (engine->pinned_default_state) {
1953 memcpy(regs, /* skip restoring the vanilla PPHWSP */
1954 engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
1955 engine->context_size - PAGE_SIZE);
5692251c 1956 }
4e0d64db
CW
1957 execlists_init_reg_state(regs,
1958 request->gem_context, engine, request->ring);
a3aabe86 1959
821ed7df 1960 /* Move the RING_HEAD onto the breadcrumb, past the hanging batch */
5692251c 1961 regs[CTX_RING_BUFFER_START + 1] = i915_ggtt_offset(request->ring->vma);
a3aabe86 1962
41d37680
CW
1963 request->ring->head = intel_ring_wrap(request->ring, request->postfix);
1964 regs[CTX_RING_HEAD + 1] = request->ring->head;
1965
821ed7df
CW
1966 intel_ring_update_space(request->ring);
1967
a3aabe86 1968 /* Reset WaIdleLiteRestore:bdw,skl as well */
7e4992ac 1969 unwind_wa_tail(request);
821ed7df
CW
1970}
1971
5adfb772
CW
1972static void execlists_reset_finish(struct intel_engine_cs *engine)
1973{
5db1d4ea
CW
1974 struct intel_engine_execlists * const execlists = &engine->execlists;
1975
1976 /* After a GPU reset, we may have requests to replay */
655250a8 1977 if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
5db1d4ea
CW
1978 tasklet_schedule(&execlists->tasklet);
1979
fe25f304
CW
1980 /*
1981 * Flush the tasklet while we still have the forcewake to be sure
1982 * that it is not allowed to sleep before we restart and reload a
1983 * context.
1984 *
1985 * As before (with execlists_reset_prepare) we rely on the caller
1986 * serialising multiple attempts to reset so that we know that we
1987 * are the only one manipulating tasklet state.
1988 */
5db1d4ea 1989 __tasklet_enable_sync_once(&execlists->tasklet);
5adfb772
CW
1990
1991 GEM_TRACE("%s\n", engine->name);
1992}
1993
e61e0f51 1994static int intel_logical_ring_emit_pdps(struct i915_request *rq)
7a01a0a2 1995{
4e0d64db 1996 struct i915_hw_ppgtt *ppgtt = rq->gem_context->ppgtt;
e61e0f51 1997 struct intel_engine_cs *engine = rq->engine;
e7167769 1998 const int num_lri_cmds = GEN8_3LVL_PDPES * 2;
73dec95e
TU
1999 u32 *cs;
2000 int i;
7a01a0a2 2001
e61e0f51 2002 cs = intel_ring_begin(rq, num_lri_cmds * 2 + 2);
73dec95e
TU
2003 if (IS_ERR(cs))
2004 return PTR_ERR(cs);
7a01a0a2 2005
73dec95e 2006 *cs++ = MI_LOAD_REGISTER_IMM(num_lri_cmds);
e7167769 2007 for (i = GEN8_3LVL_PDPES - 1; i >= 0; i--) {
7a01a0a2
MT
2008 const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
2009
73dec95e
TU
2010 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(engine, i));
2011 *cs++ = upper_32_bits(pd_daddr);
2012 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(engine, i));
2013 *cs++ = lower_32_bits(pd_daddr);
7a01a0a2
MT
2014 }
2015
73dec95e 2016 *cs++ = MI_NOOP;
e61e0f51 2017 intel_ring_advance(rq, cs);
7a01a0a2
MT
2018
2019 return 0;
2020}
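/*
 * With GEN8_3LVL_PDPES == 4 the loop above emits eight register writes,
 * i.e. 1 + 8 * 2 + 1 = 18 dwords including the trailing MI_NOOP, which is
 * exactly the num_lri_cmds * 2 + 2 reserved via intel_ring_begin().
 */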
2021
e61e0f51 2022static int gen8_emit_bb_start(struct i915_request *rq,
803688ba 2023 u64 offset, u32 len,
54af56db 2024 const unsigned int flags)
15648585 2025{
73dec95e 2026 u32 *cs;
15648585
OM
2027 int ret;
2028
7a01a0a2
MT
2029 /* Don't rely on hw updating PDPs, especially in lite-restore.
2030 * Ideally, we should set Force PD Restore in ctx descriptor,
2031 * but we can't. Force Restore would be a second option, but
2032 * it is unsafe in case of lite-restore (because the ctx is
2dba3239
MT
2033 * not idle). PML4 is allocated during ppgtt init, so this is
2034 * not needed in 48-bit. */
4e0d64db
CW
2035 if (rq->gem_context->ppgtt &&
2036 (intel_engine_flag(rq->engine) & rq->gem_context->ppgtt->pd_dirty_rings) &&
82ad6443 2037 !i915_vm_is_48bit(&rq->gem_context->ppgtt->vm) &&
e61e0f51
CW
2038 !intel_vgpu_active(rq->i915)) {
2039 ret = intel_logical_ring_emit_pdps(rq);
54af56db
MK
2040 if (ret)
2041 return ret;
7a01a0a2 2042
4e0d64db 2043 rq->gem_context->ppgtt->pd_dirty_rings &= ~intel_engine_flag(rq->engine);
7a01a0a2
MT
2044 }
2045
74f94741 2046 cs = intel_ring_begin(rq, 6);
73dec95e
TU
2047 if (IS_ERR(cs))
2048 return PTR_ERR(cs);
15648585 2049
279f5a00
CW
2050 /*
2051 * WaDisableCtxRestoreArbitration:bdw,chv
2052 *
2053 * We don't need to perform MI_ARB_ENABLE as often as we do (in
2054 * particular on all the gens that do not need the w/a at all!): if we
2055 * took care to make sure that on every switch into this context
2056 * (both ordinary and for preemption) arbitration was enabled,
2057 * we would be fine. However, there doesn't seem to be a downside to
2058 * being paranoid and making sure it is set before each batch and
2059 * every context-switch.
2060 *
2061 * Note that if we fail to enable arbitration before the request
2062 * is complete, then we do not see the context-switch interrupt and
2063 * the engine hangs (with RING_HEAD == RING_TAIL).
2064 *
2065 * That satisfies both the GPGPU w/a and our heavy-handed paranoia.
2066 */
3ad7b52d
CW
2067 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
2068
15648585 2069 /* FIXME(BDW): Address space and security selectors. */
54af56db
MK
2070 *cs++ = MI_BATCH_BUFFER_START_GEN8 |
2071 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8)) |
2072 (flags & I915_DISPATCH_RS ? MI_BATCH_RESOURCE_STREAMER : 0);
73dec95e
TU
2073 *cs++ = lower_32_bits(offset);
2074 *cs++ = upper_32_bits(offset);
74f94741
CW
2075
2076 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
2077 *cs++ = MI_NOOP;
e61e0f51 2078 intel_ring_advance(rq, cs);
15648585
OM
2079
2080 return 0;
2081}
2082
31bb59cc 2083static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
73d477f6 2084{
c033666a 2085 struct drm_i915_private *dev_priv = engine->i915;
31bb59cc
CW
2086 I915_WRITE_IMR(engine,
2087 ~(engine->irq_enable_mask | engine->irq_keep_mask));
2088 POSTING_READ_FW(RING_IMR(engine->mmio_base));
73d477f6
OM
2089}
2090
31bb59cc 2091static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
73d477f6 2092{
c033666a 2093 struct drm_i915_private *dev_priv = engine->i915;
31bb59cc 2094 I915_WRITE_IMR(engine, ~engine->irq_keep_mask);
73d477f6
OM
2095}
2096
e61e0f51 2097static int gen8_emit_flush(struct i915_request *request, u32 mode)
4712274c 2098{
73dec95e 2099 u32 cmd, *cs;
4712274c 2100
73dec95e
TU
2101 cs = intel_ring_begin(request, 4);
2102 if (IS_ERR(cs))
2103 return PTR_ERR(cs);
4712274c
OM
2104
2105 cmd = MI_FLUSH_DW + 1;
2106
f0a1fb10
CW
2107 /* We always require a command barrier so that subsequent
2108 * commands, such as breadcrumb interrupts, are strictly ordered
2109 * wrt the contents of the write cache being flushed to memory
2110 * (and thus being coherent from the CPU).
2111 */
2112 cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
2113
7c9cf4e3 2114 if (mode & EMIT_INVALIDATE) {
f0a1fb10 2115 cmd |= MI_INVALIDATE_TLB;
1dae2dfb 2116 if (request->engine->id == VCS)
f0a1fb10 2117 cmd |= MI_INVALIDATE_BSD;
4712274c
OM
2118 }
2119
73dec95e
TU
2120 *cs++ = cmd;
2121 *cs++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
2122 *cs++ = 0; /* upper addr */
2123 *cs++ = 0; /* value */
2124 intel_ring_advance(request, cs);
4712274c
OM
2125
2126 return 0;
2127}
2128
e61e0f51 2129static int gen8_emit_flush_render(struct i915_request *request,
7c9cf4e3 2130 u32 mode)
4712274c 2131{
b5321f30 2132 struct intel_engine_cs *engine = request->engine;
bde13ebd
CW
2133 u32 scratch_addr =
2134 i915_ggtt_offset(engine->scratch) + 2 * CACHELINE_BYTES;
0b2d0934 2135 bool vf_flush_wa = false, dc_flush_wa = false;
73dec95e 2136 u32 *cs, flags = 0;
0b2d0934 2137 int len;
4712274c
OM
2138
2139 flags |= PIPE_CONTROL_CS_STALL;
2140
7c9cf4e3 2141 if (mode & EMIT_FLUSH) {
4712274c
OM
2142 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
2143 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
965fd602 2144 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
40a24488 2145 flags |= PIPE_CONTROL_FLUSH_ENABLE;
4712274c
OM
2146 }
2147
7c9cf4e3 2148 if (mode & EMIT_INVALIDATE) {
4712274c
OM
2149 flags |= PIPE_CONTROL_TLB_INVALIDATE;
2150 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
2151 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
2152 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
2153 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
2154 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
2155 flags |= PIPE_CONTROL_QW_WRITE;
2156 flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
4712274c 2157
1a5a9ce7
BW
2158 /*
2159 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
2160 * pipe control.
2161 */
c033666a 2162 if (IS_GEN9(request->i915))
1a5a9ce7 2163 vf_flush_wa = true;
0b2d0934
MK
2164
2165 /* WaForGAMHang:kbl */
2166 if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0))
2167 dc_flush_wa = true;
1a5a9ce7 2168 }
9647ff36 2169
0b2d0934
MK
2170 len = 6;
2171
2172 if (vf_flush_wa)
2173 len += 6;
2174
2175 if (dc_flush_wa)
2176 len += 12;
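	/*
	 * Each gen8_emit_pipe_control() below is 6 dwords: vf_flush_wa adds
	 * one empty PIPE_CONTROL and dc_flush_wa brackets the main flush with
	 * two more (a DC flush before, a CS stall after), hence +6 and +12.
	 */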
2177
73dec95e
TU
2178 cs = intel_ring_begin(request, len);
2179 if (IS_ERR(cs))
2180 return PTR_ERR(cs);
4712274c 2181
9f235dfa
TU
2182 if (vf_flush_wa)
2183 cs = gen8_emit_pipe_control(cs, 0, 0);
9647ff36 2184
9f235dfa
TU
2185 if (dc_flush_wa)
2186 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
2187 0);
0b2d0934 2188
9f235dfa 2189 cs = gen8_emit_pipe_control(cs, flags, scratch_addr);
0b2d0934 2190
9f235dfa
TU
2191 if (dc_flush_wa)
2192 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
0b2d0934 2193
73dec95e 2194 intel_ring_advance(request, cs);
4712274c
OM
2195
2196 return 0;
2197}
2198
7c17d377
CW
2199/*
2200 * Reserve space for 2 NOOPs at the end of each request to be
2201 * used as a workaround for not being allowed to do lite
2202 * restore with HEAD==TAIL (WaIdleLiteRestore).
2203 */
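/*
 * gen8_emit_wa_tail() emits MI_ARB_CHECK + MI_NOOP, i.e. two dwords, which is
 * what the WA_TAIL_DWORDS term in the gen8_emit_breadcrumb*_sz constants is
 * there to cover.
 */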
e61e0f51 2204static void gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
4da46e1e 2205{
beecec90
CW
2206 /* Ensure there's always at least one preemption point per-request. */
2207 *cs++ = MI_ARB_CHECK;
73dec95e
TU
2208 *cs++ = MI_NOOP;
2209 request->wa_tail = intel_ring_offset(request, cs);
caddfe71 2210}
4da46e1e 2211
e61e0f51 2212static void gen8_emit_breadcrumb(struct i915_request *request, u32 *cs)
caddfe71 2213{
7c17d377
CW
2214 /* w/a: bit 5 needs to be zero for MI_FLUSH_DW address. */
2215 BUILD_BUG_ON(I915_GEM_HWS_INDEX_ADDR & (1 << 5));
4da46e1e 2216
df77cd83
MW
2217 cs = gen8_emit_ggtt_write(cs, request->global_seqno,
2218 intel_hws_seqno_address(request->engine));
73dec95e 2219 *cs++ = MI_USER_INTERRUPT;
74f94741 2220 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
73dec95e 2221 request->tail = intel_ring_offset(request, cs);
ed1501d4 2222 assert_ring_tail_valid(request->ring, request->tail);
caddfe71 2223
73dec95e 2224 gen8_emit_wa_tail(request, cs);
7c17d377 2225}
98f29e8d
CW
2226static const int gen8_emit_breadcrumb_sz = 6 + WA_TAIL_DWORDS;
2227
e61e0f51 2228static void gen8_emit_breadcrumb_rcs(struct i915_request *request, u32 *cs)
7c17d377 2229{
ce81a65c
MW
2230 /* We're using qword write, seqno should be aligned to 8 bytes. */
2231 BUILD_BUG_ON(I915_GEM_HWS_INDEX & 1);
2232
df77cd83
MW
2233 cs = gen8_emit_ggtt_write_rcs(cs, request->global_seqno,
2234 intel_hws_seqno_address(request->engine));
73dec95e 2235 *cs++ = MI_USER_INTERRUPT;
74f94741 2236 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
73dec95e 2237 request->tail = intel_ring_offset(request, cs);
ed1501d4 2238 assert_ring_tail_valid(request->ring, request->tail);
caddfe71 2239
73dec95e 2240 gen8_emit_wa_tail(request, cs);
4da46e1e 2241}
df77cd83 2242static const int gen8_emit_breadcrumb_rcs_sz = 8 + WA_TAIL_DWORDS;
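/*
 * The _sz constants mirror the emitters above: MI_USER_INTERRUPT and
 * MI_ARB_ENABLE account for two dwords in each case, which implies the ggtt
 * write helper expands to four dwords in its MI_FLUSH_DW form and six in the
 * PIPE_CONTROL form used on rcs, with WA_TAIL_DWORDS covering the tail.
 */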
98f29e8d 2243
e61e0f51 2244static int gen8_init_rcs_context(struct i915_request *rq)
e7778be1
TD
2245{
2246 int ret;
2247
59b449d5 2248 ret = intel_ctx_workarounds_emit(rq);
e7778be1
TD
2249 if (ret)
2250 return ret;
2251
e61e0f51 2252 ret = intel_rcs_context_init_mocs(rq);
3bbaba0c
PA
2253 /*
2254 * Failing to program the MOCS is non-fatal. The system will not
2255 * run at peak performance, so generate an error and carry on.
2256 */
2257 if (ret)
2258 DRM_ERROR("MOCS failed to program: expect performance issues.\n");
2259
e61e0f51 2260 return i915_gem_render_state_emit(rq);
e7778be1
TD
2261}
2262
73e4d07f
OM
2263/**
2264 * intel_logical_ring_cleanup() - deallocate the Engine Command Streamer
14bb2c11 2265 * @engine: Engine Command Streamer.
73e4d07f 2266 */
0bc40be8 2267void intel_logical_ring_cleanup(struct intel_engine_cs *engine)
454afebd 2268{
6402c330 2269 struct drm_i915_private *dev_priv;
9832b9da 2270
27af5eea
TU
2271 /*
2272 * The tasklet cannot be active at this point due to intel_mark_active/idle,
2273 * so this is just for documentation.
2274 */
c6dce8f1
SAK
2275 if (WARN_ON(test_bit(TASKLET_STATE_SCHED,
2276 &engine->execlists.tasklet.state)))
2277 tasklet_kill(&engine->execlists.tasklet);
27af5eea 2278
c033666a 2279 dev_priv = engine->i915;
6402c330 2280
0bc40be8 2281 if (engine->buffer) {
0bc40be8 2282 WARN_ON((I915_READ_MODE(engine) & MODE_IDLE) == 0);
b0366a54 2283 }
48d82387 2284
0bc40be8
TU
2285 if (engine->cleanup)
2286 engine->cleanup(engine);
48d82387 2287
e8a9c58f 2288 intel_engine_cleanup_common(engine);
17ee950d 2289
097d4f1c 2290 lrc_destroy_wa_ctx(engine);
f3c9d407 2291
c033666a 2292 engine->i915 = NULL;
3b3f1650
AG
2293 dev_priv->engine[engine->id] = NULL;
2294 kfree(engine);
454afebd
OM
2295}
2296
ff44ad51 2297static void execlists_set_default_submission(struct intel_engine_cs *engine)
ddd66c51 2298{
ff44ad51 2299 engine->submit_request = execlists_submit_request;
27a5f61b 2300 engine->cancel_requests = execlists_cancel_requests;
ff44ad51 2301 engine->schedule = execlists_schedule;
c6dce8f1 2302 engine->execlists.tasklet.func = execlists_submission_tasklet;
aba5e278 2303
1329115c
CW
2304 engine->reset.prepare = execlists_reset_prepare;
2305
aba5e278
CW
2306 engine->park = NULL;
2307 engine->unpark = NULL;
cf669b4e
TU
2308
2309 engine->flags |= I915_ENGINE_SUPPORTS_STATS;
2a694feb
CW
2310 if (engine->i915->preempt_context)
2311 engine->flags |= I915_ENGINE_HAS_PREEMPTION;
3fed1808
CW
2312
2313 engine->i915->caps.scheduler =
2314 I915_SCHEDULER_CAP_ENABLED |
2315 I915_SCHEDULER_CAP_PRIORITY;
2a694feb 2316 if (intel_engine_has_preemption(engine))
3fed1808 2317 engine->i915->caps.scheduler |= I915_SCHEDULER_CAP_PREEMPTION;
ddd66c51
CW
2318}
2319
c9cacf93 2320static void
e1382efb 2321logical_ring_default_vfuncs(struct intel_engine_cs *engine)
c9cacf93
TU
2322{
2323 /* Default vfuncs which can be overriden by each engine. */
0bc40be8 2324 engine->init_hw = gen8_init_common_ring;
5adfb772
CW
2325
2326 engine->reset.prepare = execlists_reset_prepare;
2327 engine->reset.reset = execlists_reset;
2328 engine->reset.finish = execlists_reset_finish;
e8a9c58f
CW
2329
2330 engine->context_pin = execlists_context_pin;
f73e7399
CW
2331 engine->request_alloc = execlists_request_alloc;
2332
0bc40be8 2333 engine->emit_flush = gen8_emit_flush;
9b81d556 2334 engine->emit_breadcrumb = gen8_emit_breadcrumb;
98f29e8d 2335 engine->emit_breadcrumb_sz = gen8_emit_breadcrumb_sz;
ff44ad51
CW
2336
2337 engine->set_default_submission = execlists_set_default_submission;
ddd66c51 2338
d4ccceb0
TU
2339 if (INTEL_GEN(engine->i915) < 11) {
2340 engine->irq_enable = gen8_logical_ring_enable_irq;
2341 engine->irq_disable = gen8_logical_ring_disable_irq;
2342 } else {
2343 /*
2344 * TODO: On Gen11 interrupt masks need to be clear
2345 * to allow C6 entry. Keep interrupts enabled at
2346 * and take the hit of generating extra interrupts
2347 * until a more refined solution exists.
2348 */
2349 }
0bc40be8 2350 engine->emit_bb_start = gen8_emit_bb_start;
c9cacf93
TU
2351}
2352
d9f3af96 2353static inline void
c2c7f240 2354logical_ring_default_irqs(struct intel_engine_cs *engine)
d9f3af96 2355{
fa6f071d
DCS
2356 unsigned int shift = 0;
2357
2358 if (INTEL_GEN(engine->i915) < 11) {
2359 const u8 irq_shifts[] = {
2360 [RCS] = GEN8_RCS_IRQ_SHIFT,
2361 [BCS] = GEN8_BCS_IRQ_SHIFT,
2362 [VCS] = GEN8_VCS1_IRQ_SHIFT,
2363 [VCS2] = GEN8_VCS2_IRQ_SHIFT,
2364 [VECS] = GEN8_VECS_IRQ_SHIFT,
2365 };
2366
2367 shift = irq_shifts[engine->id];
2368 }
2369
0bc40be8
TU
2370 engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
2371 engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
d9f3af96
TU
2372}
2373
bb45438f
TU
2374static void
2375logical_ring_setup(struct intel_engine_cs *engine)
2376{
019bf277
TU
2377 intel_engine_setup_common(engine);
2378
bb45438f
TU
2379 /* Intentionally left blank. */
2380 engine->buffer = NULL;
2381
c6dce8f1
SAK
2382 tasklet_init(&engine->execlists.tasklet,
2383 execlists_submission_tasklet, (unsigned long)engine);
bb45438f 2384
bb45438f
TU
2385 logical_ring_default_vfuncs(engine);
2386 logical_ring_default_irqs(engine);
bb45438f
TU
2387}
2388
bc4237ec
CW
2389static bool csb_force_mmio(struct drm_i915_private *i915)
2390{
2391 /* Older GVT emulation depends upon intercepting CSB mmio */
2392 return intel_vgpu_active(i915) && !intel_vgpu_has_hwsp_emulation(i915);
2393}
2394
486e93f7 2395static int logical_ring_init(struct intel_engine_cs *engine)
a19d6ff2 2396{
bc4237ec
CW
2397 struct drm_i915_private *i915 = engine->i915;
2398 struct intel_engine_execlists * const execlists = &engine->execlists;
a19d6ff2
TU
2399 int ret;
2400
019bf277 2401 ret = intel_engine_init_common(engine);
a19d6ff2
TU
2402 if (ret)
2403 goto error;
2404
bc4237ec
CW
2405 if (HAS_LOGICAL_RING_ELSQ(i915)) {
2406 execlists->submit_reg = i915->regs +
05f0addd 2407 i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(engine));
bc4237ec 2408 execlists->ctrl_reg = i915->regs +
05f0addd
TD
2409 i915_mmio_reg_offset(RING_EXECLIST_CONTROL(engine));
2410 } else {
bc4237ec 2411 execlists->submit_reg = i915->regs +
05f0addd
TD
2412 i915_mmio_reg_offset(RING_ELSP(engine));
2413 }
693cfbf0 2414
bc4237ec
CW
2415 execlists->preempt_complete_status = ~0u;
2416 if (i915->preempt_context) {
ab82a063 2417 struct intel_context *ce =
bc4237ec 2418 to_intel_context(i915->preempt_context, engine);
ab82a063 2419
bc4237ec 2420 execlists->preempt_complete_status =
ab82a063
CW
2421 upper_32_bits(ce->lrc_desc);
2422 }
d6376374 2423
bc4237ec
CW
2424 execlists->csb_read =
2425 i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine));
2426 if (csb_force_mmio(i915)) {
2427 execlists->csb_status = (u32 __force *)
2428 (i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_BUF_LO(engine, 0)));
2429
2430 execlists->csb_write = (u32 __force *)execlists->csb_read;
f4b58f04
CW
2431 execlists->csb_write_reset =
2432 _MASKED_FIELD(GEN8_CSB_WRITE_PTR_MASK,
2433 GEN8_CSB_ENTRIES - 1);
bc4237ec
CW
2434 } else {
2435 execlists->csb_status =
2436 &engine->status_page.page_addr[I915_HWS_CSB_BUF0_INDEX];
2437
2438 execlists->csb_write =
2439 &engine->status_page.page_addr[intel_hws_csb_write_index(i915)];
f4b58f04 2440 execlists->csb_write_reset = GEN8_CSB_ENTRIES - 1;
bc4237ec 2441 }
f4b58f04 2442 reset_csb_pointers(execlists);
c3160da9 2443
a19d6ff2
TU
2444 return 0;
2445
2446error:
2447 intel_logical_ring_cleanup(engine);
2448 return ret;
2449}
2450
88d2ba2e 2451int logical_render_ring_init(struct intel_engine_cs *engine)
a19d6ff2
TU
2452{
2453 struct drm_i915_private *dev_priv = engine->i915;
2454 int ret;
2455
bb45438f
TU
2456 logical_ring_setup(engine);
2457
a19d6ff2
TU
2458 if (HAS_L3_DPF(dev_priv))
2459 engine->irq_keep_mask |= GT_RENDER_L3_PARITY_ERROR_INTERRUPT;
2460
2461 /* Override some for render ring. */
2462 if (INTEL_GEN(dev_priv) >= 9)
2463 engine->init_hw = gen9_init_render_ring;
2464 else
2465 engine->init_hw = gen8_init_render_ring;
2466 engine->init_context = gen8_init_rcs_context;
a19d6ff2 2467 engine->emit_flush = gen8_emit_flush_render;
df77cd83
MW
2468 engine->emit_breadcrumb = gen8_emit_breadcrumb_rcs;
2469 engine->emit_breadcrumb_sz = gen8_emit_breadcrumb_rcs_sz;
a19d6ff2 2470
f51455d4 2471 ret = intel_engine_create_scratch(engine, PAGE_SIZE);
a19d6ff2
TU
2472 if (ret)
2473 return ret;
2474
2475 ret = intel_init_workaround_bb(engine);
2476 if (ret) {
2477 /*
2478 * We continue even if we fail to initialize WA batch
2479 * because we only expect rare glitches but nothing
2480 * critical to prevent us from using GPU
2481 */
2482 DRM_ERROR("WA batch buffer initialization failed: %d\n",
2483 ret);
2484 }
2485
d038fc7e 2486 return logical_ring_init(engine);
a19d6ff2
TU
2487}
2488
88d2ba2e 2489int logical_xcs_ring_init(struct intel_engine_cs *engine)
bb45438f
TU
2490{
2491 logical_ring_setup(engine);
2492
2493 return logical_ring_init(engine);
454afebd
OM
2494}
2495
0cea6502 2496static u32
c033666a 2497make_rpcs(struct drm_i915_private *dev_priv)
0cea6502
JM
2498{
2499 u32 rpcs = 0;
2500
2501 /*
2502 * No explicit RPCS request is needed to ensure full
2503 * slice/subslice/EU enablement prior to Gen9.
2504 */
c033666a 2505 if (INTEL_GEN(dev_priv) < 9)
0cea6502
JM
2506 return 0;
2507
2508 /*
2509 * Starting in Gen9, render power gating can leave
2510 * slice/subslice/EU in a partially enabled state. We
2511 * must make an explicit request through RPCS for full
2512 * enablement.
2513 */
43b67998 2514 if (INTEL_INFO(dev_priv)->sseu.has_slice_pg) {
0cea6502 2515 rpcs |= GEN8_RPCS_S_CNT_ENABLE;
f08a0c92 2516 rpcs |= hweight8(INTEL_INFO(dev_priv)->sseu.slice_mask) <<
0cea6502
JM
2517 GEN8_RPCS_S_CNT_SHIFT;
2518 rpcs |= GEN8_RPCS_ENABLE;
2519 }
2520
43b67998 2521 if (INTEL_INFO(dev_priv)->sseu.has_subslice_pg) {
0cea6502 2522 rpcs |= GEN8_RPCS_SS_CNT_ENABLE;
8cc76693 2523 rpcs |= hweight8(INTEL_INFO(dev_priv)->sseu.subslice_mask[0]) <<
0cea6502
JM
2524 GEN8_RPCS_SS_CNT_SHIFT;
2525 rpcs |= GEN8_RPCS_ENABLE;
2526 }
2527
43b67998
ID
2528 if (INTEL_INFO(dev_priv)->sseu.has_eu_pg) {
2529 rpcs |= INTEL_INFO(dev_priv)->sseu.eu_per_subslice <<
0cea6502 2530 GEN8_RPCS_EU_MIN_SHIFT;
43b67998 2531 rpcs |= INTEL_INFO(dev_priv)->sseu.eu_per_subslice <<
0cea6502
JM
2532 GEN8_RPCS_EU_MAX_SHIFT;
2533 rpcs |= GEN8_RPCS_ENABLE;
2534 }
2535
2536 return rpcs;
2537}
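/*
 * The RPCS value computed here is only consumed for render: it is written
 * into the context image at CTX_R_PWR_CLK_STATE (GEN8_R_PWR_CLK_STATE) by
 * execlists_init_reg_state() below.
 */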
2538
0bc40be8 2539static u32 intel_lr_indirect_ctx_offset(struct intel_engine_cs *engine)
71562919
MT
2540{
2541 u32 indirect_ctx_offset;
2542
c033666a 2543 switch (INTEL_GEN(engine->i915)) {
71562919 2544 default:
c033666a 2545 MISSING_CASE(INTEL_GEN(engine->i915));
71562919 2546 /* fall through */
fd034c77
MT
2547 case 11:
2548 indirect_ctx_offset =
2549 GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
2550 break;
7bd0a2c6
MT
2551 case 10:
2552 indirect_ctx_offset =
2553 GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
2554 break;
71562919
MT
2555 case 9:
2556 indirect_ctx_offset =
2557 GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
2558 break;
2559 case 8:
2560 indirect_ctx_offset =
2561 GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
2562 break;
2563 }
2564
2565 return indirect_ctx_offset;
2566}
2567
56e51bf0 2568static void execlists_init_reg_state(u32 *regs,
a3aabe86
CW
2569 struct i915_gem_context *ctx,
2570 struct intel_engine_cs *engine,
2571 struct intel_ring *ring)
8670d6f9 2572{
a3aabe86
CW
2573 struct drm_i915_private *dev_priv = engine->i915;
2574 struct i915_hw_ppgtt *ppgtt = ctx->ppgtt ?: dev_priv->mm.aliasing_ppgtt;
56e51bf0 2575 u32 base = engine->mmio_base;
1fc44d9b 2576 bool rcs = engine->class == RENDER_CLASS;
56e51bf0
TU
2577
2578 /* A context is actually a big batch buffer with several
2579 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
2580 * values we are setting here are only for the first context restore:
2581 * on a subsequent save, the GPU will recreate this batchbuffer with new
2582 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
2583 * we are not initializing here).
2584 */
2585 regs[CTX_LRI_HEADER_0] = MI_LOAD_REGISTER_IMM(rcs ? 14 : 11) |
2586 MI_LRI_FORCE_POSTED;
2587
2588 CTX_REG(regs, CTX_CONTEXT_CONTROL, RING_CONTEXT_CONTROL(engine),
09b1a4e4
CW
2589 _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT |
2590 CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT) |
56e51bf0 2591 _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
56e51bf0
TU
2592 (HAS_RESOURCE_STREAMER(dev_priv) ?
2593 CTX_CTRL_RS_CTX_ENABLE : 0)));
2594 CTX_REG(regs, CTX_RING_HEAD, RING_HEAD(base), 0);
2595 CTX_REG(regs, CTX_RING_TAIL, RING_TAIL(base), 0);
2596 CTX_REG(regs, CTX_RING_BUFFER_START, RING_START(base), 0);
2597 CTX_REG(regs, CTX_RING_BUFFER_CONTROL, RING_CTL(base),
2598 RING_CTL_SIZE(ring->size) | RING_VALID);
2599 CTX_REG(regs, CTX_BB_HEAD_U, RING_BBADDR_UDW(base), 0);
2600 CTX_REG(regs, CTX_BB_HEAD_L, RING_BBADDR(base), 0);
2601 CTX_REG(regs, CTX_BB_STATE, RING_BBSTATE(base), RING_BB_PPGTT);
2602 CTX_REG(regs, CTX_SECOND_BB_HEAD_U, RING_SBBADDR_UDW(base), 0);
2603 CTX_REG(regs, CTX_SECOND_BB_HEAD_L, RING_SBBADDR(base), 0);
2604 CTX_REG(regs, CTX_SECOND_BB_STATE, RING_SBBSTATE(base), 0);
2605 if (rcs) {
604a8f6f
CW
2606 struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
2607
56e51bf0
TU
2608 CTX_REG(regs, CTX_RCS_INDIRECT_CTX, RING_INDIRECT_CTX(base), 0);
2609 CTX_REG(regs, CTX_RCS_INDIRECT_CTX_OFFSET,
2610 RING_INDIRECT_CTX_OFFSET(base), 0);
604a8f6f 2611 if (wa_ctx->indirect_ctx.size) {
bde13ebd 2612 u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
17ee950d 2613
56e51bf0 2614 regs[CTX_RCS_INDIRECT_CTX + 1] =
097d4f1c
TU
2615 (ggtt_offset + wa_ctx->indirect_ctx.offset) |
2616 (wa_ctx->indirect_ctx.size / CACHELINE_BYTES);
17ee950d 2617
56e51bf0 2618 regs[CTX_RCS_INDIRECT_CTX_OFFSET + 1] =
0bc40be8 2619 intel_lr_indirect_ctx_offset(engine) << 6;
604a8f6f
CW
2620 }
2621
2622 CTX_REG(regs, CTX_BB_PER_CTX_PTR, RING_BB_PER_CTX_PTR(base), 0);
2623 if (wa_ctx->per_ctx.size) {
2624 u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
17ee950d 2625
56e51bf0 2626 regs[CTX_BB_PER_CTX_PTR + 1] =
097d4f1c 2627 (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
17ee950d 2628 }
8670d6f9 2629 }
56e51bf0
TU
2630
2631 regs[CTX_LRI_HEADER_1] = MI_LOAD_REGISTER_IMM(9) | MI_LRI_FORCE_POSTED;
2632
2633 CTX_REG(regs, CTX_CTX_TIMESTAMP, RING_CTX_TIMESTAMP(base), 0);
0d925ea0 2634 /* PDP values will be assigned later if needed */
56e51bf0
TU
2635 CTX_REG(regs, CTX_PDP3_UDW, GEN8_RING_PDP_UDW(engine, 3), 0);
2636 CTX_REG(regs, CTX_PDP3_LDW, GEN8_RING_PDP_LDW(engine, 3), 0);
2637 CTX_REG(regs, CTX_PDP2_UDW, GEN8_RING_PDP_UDW(engine, 2), 0);
2638 CTX_REG(regs, CTX_PDP2_LDW, GEN8_RING_PDP_LDW(engine, 2), 0);
2639 CTX_REG(regs, CTX_PDP1_UDW, GEN8_RING_PDP_UDW(engine, 1), 0);
2640 CTX_REG(regs, CTX_PDP1_LDW, GEN8_RING_PDP_LDW(engine, 1), 0);
2641 CTX_REG(regs, CTX_PDP0_UDW, GEN8_RING_PDP_UDW(engine, 0), 0);
2642 CTX_REG(regs, CTX_PDP0_LDW, GEN8_RING_PDP_LDW(engine, 0), 0);
d7b2633d 2643
82ad6443 2644 if (ppgtt && i915_vm_is_48bit(&ppgtt->vm)) {
2dba3239
MT
2645 /* 64b PPGTT (48bit canonical)
2646 * PDP0_DESCRIPTOR contains the base address to PML4 and
2647 * other PDP Descriptors are ignored.
2648 */
56e51bf0 2649 ASSIGN_CTX_PML4(ppgtt, regs);
2dba3239
MT
2650 }
2651
56e51bf0
TU
2652 if (rcs) {
2653 regs[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1);
2654 CTX_REG(regs, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE,
2655 make_rpcs(dev_priv));
19f81df2
RB
2656
2657 i915_oa_init_reg_state(engine, ctx, regs);
8670d6f9 2658 }
a3aabe86
CW
2659}
2660
2661static int
2662populate_lr_context(struct i915_gem_context *ctx,
2663 struct drm_i915_gem_object *ctx_obj,
2664 struct intel_engine_cs *engine,
2665 struct intel_ring *ring)
2666{
2667 void *vaddr;
d2b4b979 2668 u32 *regs;
a3aabe86
CW
2669 int ret;
2670
2671 ret = i915_gem_object_set_to_cpu_domain(ctx_obj, true);
2672 if (ret) {
2673 DRM_DEBUG_DRIVER("Could not set to CPU domain\n");
2674 return ret;
2675 }
2676
2677 vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
2678 if (IS_ERR(vaddr)) {
2679 ret = PTR_ERR(vaddr);
2680 DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret);
2681 return ret;
2682 }
a4f5ea64 2683 ctx_obj->mm.dirty = true;
a3aabe86 2684
d2b4b979
CW
2685 if (engine->default_state) {
2686 /*
2687 * We only want to copy over the template context state;
2688 * skipping over the headers reserved for GuC communication,
2689 * leaving those as zero.
2690 */
2691 const unsigned long start = LRC_HEADER_PAGES * PAGE_SIZE;
2692 void *defaults;
2693
2694 defaults = i915_gem_object_pin_map(engine->default_state,
2695 I915_MAP_WB);
aaefa06a
MA
2696 if (IS_ERR(defaults)) {
2697 ret = PTR_ERR(defaults);
2698 goto err_unpin_ctx;
2699 }
d2b4b979
CW
2700
2701 memcpy(vaddr + start, defaults + start, engine->context_size);
2702 i915_gem_object_unpin_map(engine->default_state);
2703 }
2704
a3aabe86
CW
2705 /* The second page of the context object contains some fields which must
2706 * be set up prior to the first execution. */
d2b4b979
CW
2707 regs = vaddr + LRC_STATE_PN * PAGE_SIZE;
2708 execlists_init_reg_state(regs, ctx, engine, ring);
2709 if (!engine->default_state)
2710 regs[CTX_CONTEXT_CONTROL + 1] |=
2711 _MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
05f0addd 2712 if (ctx == ctx->i915->preempt_context && INTEL_GEN(engine->i915) < 11)
517aaffe
CW
2713 regs[CTX_CONTEXT_CONTROL + 1] |=
2714 _MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT |
2715 CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT);
8670d6f9 2716
aaefa06a 2717err_unpin_ctx:
7d774cac 2718 i915_gem_object_unpin_map(ctx_obj);
aaefa06a 2719 return ret;
8670d6f9
OM
2720}
2721
e2efd130 2722static int execlists_context_deferred_alloc(struct i915_gem_context *ctx,
1fc44d9b
CW
2723 struct intel_engine_cs *engine,
2724 struct intel_context *ce)
ede7d42b 2725{
8c857917 2726 struct drm_i915_gem_object *ctx_obj;
bf3783e5 2727 struct i915_vma *vma;
8c857917 2728 uint32_t context_size;
7e37f889 2729 struct intel_ring *ring;
a89d1f92 2730 struct i915_timeline *timeline;
8c857917
OM
2731 int ret;
2732
1d2a19c2
CW
2733 if (ce->state)
2734 return 0;
ede7d42b 2735
63ffbcda 2736 context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
8c857917 2737
0b29c75a
MT
2738 /*
2739 * Before the actual start of the context image, we insert a few pages
2740 * for our own use and for sharing with the GuC.
2741 */
2742 context_size += LRC_HEADER_PAGES * PAGE_SIZE;
d1675198 2743
12d79d78 2744 ctx_obj = i915_gem_object_create(ctx->i915, context_size);
467d3578
CW
2745 if (IS_ERR(ctx_obj))
2746 return PTR_ERR(ctx_obj);
8c857917 2747
82ad6443 2748 vma = i915_vma_instance(ctx_obj, &ctx->i915->ggtt.vm, NULL);
bf3783e5
CW
2749 if (IS_ERR(vma)) {
2750 ret = PTR_ERR(vma);
2751 goto error_deref_obj;
2752 }
2753
a89d1f92
CW
2754 timeline = i915_timeline_create(ctx->i915, ctx->name);
2755 if (IS_ERR(timeline)) {
2756 ret = PTR_ERR(timeline);
2757 goto error_deref_obj;
2758 }
2759
2760 ring = intel_engine_create_ring(engine, timeline, ctx->ring_size);
2761 i915_timeline_put(timeline);
dca33ecc
CW
2762 if (IS_ERR(ring)) {
2763 ret = PTR_ERR(ring);
e84fe803 2764 goto error_deref_obj;
8670d6f9
OM
2765 }
2766
dca33ecc 2767 ret = populate_lr_context(ctx, ctx_obj, engine, ring);
8670d6f9
OM
2768 if (ret) {
2769 DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret);
dca33ecc 2770 goto error_ring_free;
84c2377f
OM
2771 }
2772
dca33ecc 2773 ce->ring = ring;
bf3783e5 2774 ce->state = vma;
ede7d42b
OM
2775
2776 return 0;
8670d6f9 2777
dca33ecc 2778error_ring_free:
7e37f889 2779 intel_ring_free(ring);
e84fe803 2780error_deref_obj:
f8c417cd 2781 i915_gem_object_put(ctx_obj);
8670d6f9 2782 return ret;
ede7d42b 2783}
3e5b6f05 2784
821ed7df 2785void intel_lr_context_resume(struct drm_i915_private *dev_priv)
3e5b6f05 2786{
e2f80391 2787 struct intel_engine_cs *engine;
bafb2f7d 2788 struct i915_gem_context *ctx;
3b3f1650 2789 enum intel_engine_id id;
bafb2f7d
CW
2790
2791 /* Because we emit WA_TAIL_DWORDS there may be a disparity
2792 * between our bookkeeping in ce->ring->head and ce->ring->tail and
2793 * that stored in context. As we only write new commands from
2794 * ce->ring->tail onwards, everything before that is junk. If the GPU
2795 * starts reading from its RING_HEAD from the context, it may try to
2796 * execute that junk and die.
2797 *
2798 * So to avoid that we reset the context images upon resume. For
2799 * simplicity, we just zero everything out.
2800 */
829a0af2 2801 list_for_each_entry(ctx, &dev_priv->contexts.list, link) {
3b3f1650 2802 for_each_engine(engine, dev_priv, id) {
ab82a063
CW
2803 struct intel_context *ce =
2804 to_intel_context(ctx, engine);
bafb2f7d 2805 u32 *reg;
3e5b6f05 2806
bafb2f7d
CW
2807 if (!ce->state)
2808 continue;
7d774cac 2809
bafb2f7d
CW
2810 reg = i915_gem_object_pin_map(ce->state->obj,
2811 I915_MAP_WB);
2812 if (WARN_ON(IS_ERR(reg)))
2813 continue;
3e5b6f05 2814
bafb2f7d
CW
2815 reg += LRC_STATE_PN * PAGE_SIZE / sizeof(*reg);
2816 reg[CTX_RING_HEAD+1] = 0;
2817 reg[CTX_RING_TAIL+1] = 0;
3e5b6f05 2818
a4f5ea64 2819 ce->state->obj->mm.dirty = true;
bafb2f7d 2820 i915_gem_object_unpin_map(ce->state->obj);
3e5b6f05 2821
e6ba9992 2822 intel_ring_reset(ce->ring, 0);
bafb2f7d 2823 }
3e5b6f05
TD
2824 }
2825}
2c66555e
CW
2826
2827#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
2828#include "selftests/intel_lrc.c"
2829#endif