/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Ben Widawsky <ben@bwidawsk.net>
 *    Michel Thierry <michel.thierry@intel.com>
 *    Thomas Daniel <thomas.daniel@intel.com>
 *    Oscar Mateo <oscar.mateo@intel.com>
 *
 */

/**
 * DOC: Logical Rings, Logical Ring Contexts and Execlists
 *
 * Motivation:
 * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
 * These expanded contexts enable a number of new abilities, especially
 * "Execlists" (also implemented in this file).
 *
 * One of the main differences with the legacy HW contexts is that logical
 * ring contexts incorporate many more things into the context's state, like
 * PDPs or ringbuffer control registers:
 *
 * The reason why PDPs are included in the context is straightforward: as
 * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
 * contained there means you don't need to do a ppgtt->switch_mm yourself,
 * instead, the GPU will do it for you on the context switch.
 *
 * But, what about the ringbuffer control registers (head, tail, etc.)?
 * Shouldn't we just need a set of those per engine command streamer? This is
 * where the name "Logical Rings" starts to make sense: by virtualizing the
 * rings, the engine cs shifts to a new "ring buffer" with every context
 * switch. When you want to submit a workload to the GPU you: A) choose your
 * context, B) find its appropriate virtualized ring, C) write commands to it
 * and then, finally, D) tell the GPU to switch to that context.
 *
 * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
 * to a context is via a context execution list, ergo "Execlists".
 *
 * LRC implementation:
 * Regarding the creation of contexts, we have:
 *
 * - One global default context.
 * - One local default context for each opened fd.
 * - One local extra context for each context create ioctl call.
 *
 * Now that ringbuffers belong per-context (and not per-engine, like before)
 * and that contexts are uniquely tied to a given engine (and not reusable,
 * like before) we need:
 *
 * - One ringbuffer per-engine inside each context.
 * - One backing object per-engine inside each context.
 *
 * The global default context starts its life with these new objects fully
 * allocated and populated. The local default context for each opened fd is
 * more complex, because we don't know at creation time which engine is going
 * to use them. To handle this, we have implemented a deferred creation of LR
 * contexts:
 *
 * The local context starts its life as a hollow or blank holder, that only
 * gets populated for a given engine once we receive an execbuffer. If later
 * on we receive another execbuffer ioctl for the same context but a different
 * engine, we allocate/populate a new ringbuffer and context backing object and
 * so on.
 *
 * Finally, regarding local contexts created using the ioctl call: as they are
 * only allowed with the render ring, we can allocate & populate them right
 * away (no need to defer anything, at least for now).
 *
 * Execlists implementation:
 * Execlists are the new method by which, on gen8+ hardware, workloads are
 * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
 * This method works as follows:
 *
 * When a request is committed, its commands (the BB start and any leading or
 * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
 * for the appropriate context. The tail pointer in the hardware context is not
 * updated at this time, but instead, kept by the driver in the ringbuffer
 * structure. A structure representing this request is added to a request queue
 * for the appropriate engine: this structure contains a copy of the context's
 * tail after the request was written to the ring buffer and a pointer to the
 * context itself.
 *
 * If the engine's request queue was empty before the request was added, the
 * queue is processed immediately. Otherwise the queue will be processed during
 * a context switch interrupt. In any case, elements on the queue will get sent
 * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
 * globally unique 20-bit submission ID.
 *
 * When execution of a request completes, the GPU updates the context status
 * buffer with a context complete event and generates a context switch interrupt.
 * During the interrupt handling, the driver examines the events in the buffer:
 * for each context complete event, if the announced ID matches that on the head
 * of the request queue, then that request is retired and removed from the queue.
 *
 * After processing, if any requests were retired and the queue is not empty
 * then a new execution list can be submitted. The two requests at the front of
 * the queue are next to be submitted but since a context may not occur twice in
 * an execution list, if subsequent requests have the same ID as the first then
 * the two requests must be combined. This is done simply by discarding requests
 * at the head of the queue until either only one request is left (in which case
 * we use a NULL second context) or the first two requests have unique IDs.
 *
 * By always executing the first two requests in the queue the driver ensures
 * that the GPU is kept as busy as possible. In the case where a single context
 * completes but a second context is still executing, the request for this second
 * context will be at the head of the queue when we remove the first one. This
 * request will then be resubmitted along with a new request for a different context,
 * which will cause the hardware to continue executing the second request and queue
 * the new request (the GPU detects the condition of a context getting preempted
 * with the same context and optimizes the context switch flow by not doing
 * preemption, but just sampling the new tail pointer).
 *
 */
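
/*
 * Illustrative sketch (not part of the driver) of the pairing rule described
 * above. The helper names peek_request(), drop_head() and elsp_submit() are
 * hypothetical; the real logic lives in execlists_dequeue() below.
 *
 *	first  = peek_request(queue, 0);
 *	second = peek_request(queue, 1);
 *	while (second && same_context(first, second)) {
 *		drop_head(queue);		// coalesce into one RING_TAIL update
 *		first  = second;
 *		second = peek_request(queue, 1);
 *	}
 *	elsp_submit(port[0], first);
 *	elsp_submit(port[1], second);		// may be NULL
 */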
#include <linux/interrupt.h>

#include <drm/drmP.h>
#include <drm/i915_drm.h>
#include "i915_drv.h"
#include "i915_gem_render_state.h"
#include "i915_vgpu.h"
#include "intel_lrc_reg.h"
#include "intel_mocs.h"
#include "intel_workarounds.h"

#define RING_EXECLIST_QFULL		(1 << 0x2)
#define RING_EXECLIST1_VALID		(1 << 0x3)
#define RING_EXECLIST0_VALID		(1 << 0x4)
#define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
#define RING_EXECLIST1_ACTIVE		(1 << 0x11)
#define RING_EXECLIST0_ACTIVE		(1 << 0x12)

#define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
#define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
#define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
#define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
#define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
#define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)

#define GEN8_CTX_STATUS_COMPLETED_MASK \
	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)

/* Typical size of the average request (2 pipecontrols and a MI_BB) */
#define EXECLISTS_REQUEST_SIZE 64 /* bytes */
#define WA_TAIL_DWORDS 2
#define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS)

static int execlists_context_deferred_alloc(struct i915_gem_context *ctx,
					    struct intel_engine_cs *engine,
					    struct intel_context *ce);
static void execlists_init_reg_state(u32 *reg_state,
				     struct i915_gem_context *ctx,
				     struct intel_engine_cs *engine,
				     struct intel_ring *ring);

static inline struct i915_priolist *to_priolist(struct rb_node *rb)
{
	return rb_entry(rb, struct i915_priolist, node);
}

static inline int rq_prio(const struct i915_request *rq)
{
	return rq->sched.attr.priority;
}

static inline bool need_preempt(const struct intel_engine_cs *engine,
				const struct i915_request *last,
				int prio)
{
	return (intel_engine_has_preemption(engine) &&
		__execlists_need_preempt(prio, rq_prio(last)) &&
		!i915_request_completed(last));
}

/*
 * The context descriptor encodes various attributes of a context,
 * including its GTT address and some flags. Because it's fairly
 * expensive to calculate, we'll just do it once and cache the result,
 * which remains valid until the context is unpinned.
 *
 * This is what a descriptor looks like, from LSB to MSB::
 *
 *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
 *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
 *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
 *      bits 53-54:    mbz, reserved for use by hardware
 *      bits 55-63:    group ID, currently unused and set to 0
 *
 * Starting from Gen11, the upper dword of the descriptor has a new format:
 *
 *      bits 32-36:    reserved
 *      bits 37-47:    SW context ID
 *      bits 48-53:    engine instance
 *      bit  54:       mbz, reserved for use by hardware
 *      bits 55-60:    SW counter
 *      bits 61-63:    engine class
 *
 * engine info, SW context ID and SW counter need to form a unique number
 * (Context ID) per lrc.
 */
static void
intel_lr_context_descriptor_update(struct i915_gem_context *ctx,
				   struct intel_engine_cs *engine,
				   struct intel_context *ce)
{
	u64 desc;

	BUILD_BUG_ON(MAX_CONTEXT_HW_ID > (BIT(GEN8_CTX_ID_WIDTH)));
	BUILD_BUG_ON(GEN11_MAX_CONTEXT_HW_ID > (BIT(GEN11_SW_CTX_ID_WIDTH)));

	desc = ctx->desc_template;				/* bits  0-11 */
	GEM_BUG_ON(desc & GENMASK_ULL(63, 12));

	desc |= i915_ggtt_offset(ce->state) + LRC_HEADER_PAGES * PAGE_SIZE;
								/* bits 12-31 */
	GEM_BUG_ON(desc & GENMASK_ULL(63, 32));

	/*
	 * The following 32bits are copied into the OA reports (dword 2).
	 * Consider updating oa_get_render_ctx_id in i915_perf.c when changing
	 * anything below.
	 */
	if (INTEL_GEN(ctx->i915) >= 11) {
		GEM_BUG_ON(ctx->hw_id >= BIT(GEN11_SW_CTX_ID_WIDTH));
		desc |= (u64)ctx->hw_id << GEN11_SW_CTX_ID_SHIFT;
								/* bits 37-47 */

		desc |= (u64)engine->instance << GEN11_ENGINE_INSTANCE_SHIFT;
								/* bits 48-53 */

		/* TODO: decide what to do with SW counter (bits 55-60) */

		desc |= (u64)engine->class << GEN11_ENGINE_CLASS_SHIFT;
								/* bits 61-63 */
	} else {
		GEM_BUG_ON(ctx->hw_id >= BIT(GEN8_CTX_ID_WIDTH));
		desc |= (u64)ctx->hw_id << GEN8_CTX_ID_SHIFT;	/* bits 32-52 */
	}

	ce->lrc_desc = desc;
}
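
/*
 * Worked example for the gen8 branch above, with illustrative values only
 * (the flags and hw_id below are made up): given desc_template == 0x021,
 * a context image at GGTT offset 0x00010000 and ctx->hw_id == 5, the cached
 * descriptor would be
 *
 *	desc = 0x021					   // bits  0-11
 *	     | (0x00010000 + LRC_HEADER_PAGES * PAGE_SIZE) // bits 12-31
 *	     | (u64)5 << GEN8_CTX_ID_SHIFT;		   // bits 32-52
 */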

static struct i915_priolist *
lookup_priolist(struct intel_engine_cs *engine, int prio)
{
	struct intel_engine_execlists * const execlists = &engine->execlists;
	struct i915_priolist *p;
	struct rb_node **parent, *rb;
	bool first = true;

	if (unlikely(execlists->no_priolist))
		prio = I915_PRIORITY_NORMAL;

find_priolist:
	/* most positive priority is scheduled first, equal priorities fifo */
	rb = NULL;
	parent = &execlists->queue.rb_root.rb_node;
	while (*parent) {
		rb = *parent;
		p = to_priolist(rb);
		if (prio > p->priority) {
			parent = &rb->rb_left;
		} else if (prio < p->priority) {
			parent = &rb->rb_right;
			first = false;
		} else {
			return p;
		}
	}

	if (prio == I915_PRIORITY_NORMAL) {
		p = &execlists->default_priolist;
	} else {
		p = kmem_cache_alloc(engine->i915->priorities, GFP_ATOMIC);
		/* Convert an allocation failure to a priority bump */
		if (unlikely(!p)) {
			prio = I915_PRIORITY_NORMAL; /* recurses just once */

			/* To maintain ordering with all rendering, after an
			 * allocation failure we have to disable all scheduling.
			 * Requests will then be executed in fifo, and schedule
			 * will ensure that dependencies are emitted in fifo.
			 * There will still be some reordering with existing
			 * requests, so if userspace lied about their
			 * dependencies that reordering may be visible.
			 */
			execlists->no_priolist = true;
			goto find_priolist;
		}
	}

	p->priority = prio;
	INIT_LIST_HEAD(&p->requests);
	rb_link_node(&p->node, rb, parent);
	rb_insert_color_cached(&p->node, &execlists->queue, first);

	return p;
}

static void unwind_wa_tail(struct i915_request *rq)
{
	rq->tail = intel_ring_wrap(rq->ring, rq->wa_tail - WA_TAIL_BYTES);
	assert_ring_tail_valid(rq->ring, rq->tail);
}

static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
{
	struct i915_request *rq, *rn;
	struct i915_priolist *uninitialized_var(p);
	int last_prio = I915_PRIORITY_INVALID;

	lockdep_assert_held(&engine->timeline.lock);

	list_for_each_entry_safe_reverse(rq, rn,
					 &engine->timeline.requests,
					 link) {
		if (i915_request_completed(rq))
			return;

		__i915_request_unsubmit(rq);
		unwind_wa_tail(rq);

		GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
		if (rq_prio(rq) != last_prio) {
			last_prio = rq_prio(rq);
			p = lookup_priolist(engine, last_prio);
		}

		GEM_BUG_ON(p->priority != rq_prio(rq));
		list_add(&rq->sched.link, &p->requests);
	}
}

void
execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
{
	struct intel_engine_cs *engine =
		container_of(execlists, typeof(*engine), execlists);
	unsigned long flags;

	spin_lock_irqsave(&engine->timeline.lock, flags);

	__unwind_incomplete_requests(engine);

	spin_unlock_irqrestore(&engine->timeline.lock, flags);
}

static inline void
execlists_context_status_change(struct i915_request *rq, unsigned long status)
{
	/*
	 * Only used when GVT-g is enabled now. When GVT-g is disabled,
	 * the compiler should eliminate this function as dead-code.
	 */
	if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
		return;

	atomic_notifier_call_chain(&rq->engine->context_status_notifier,
				   status, rq);
}

inline void
execlists_user_begin(struct intel_engine_execlists *execlists,
		     const struct execlist_port *port)
{
	execlists_set_active_once(execlists, EXECLISTS_ACTIVE_USER);
}

inline void
execlists_user_end(struct intel_engine_execlists *execlists)
{
	execlists_clear_active(execlists, EXECLISTS_ACTIVE_USER);
}

static inline void
execlists_context_schedule_in(struct i915_request *rq)
{
	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
	intel_engine_context_in(rq->engine);
}

static inline void
execlists_context_schedule_out(struct i915_request *rq, unsigned long status)
{
	intel_engine_context_out(rq->engine);
	execlists_context_status_change(rq, status);
	trace_i915_request_out(rq);
}

static void
execlists_update_context_pdps(struct i915_hw_ppgtt *ppgtt, u32 *reg_state)
{
	ASSIGN_CTX_PDP(ppgtt, reg_state, 3);
	ASSIGN_CTX_PDP(ppgtt, reg_state, 2);
	ASSIGN_CTX_PDP(ppgtt, reg_state, 1);
	ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
}

static u64 execlists_update_context(struct i915_request *rq)
{
	struct intel_context *ce = rq->hw_context;
	struct i915_hw_ppgtt *ppgtt =
		rq->gem_context->ppgtt ?: rq->i915->mm.aliasing_ppgtt;
	u32 *reg_state = ce->lrc_reg_state;

	reg_state[CTX_RING_TAIL+1] = intel_ring_set_tail(rq->ring, rq->tail);

	/*
	 * True 32b PPGTT with dynamic page allocation: update PDP
	 * registers and point the unallocated PDPs to scratch page.
	 * PML4 is allocated during ppgtt init, so this is not needed
	 * in 48-bit mode.
	 */
	if (ppgtt && !i915_vm_is_48bit(&ppgtt->vm))
		execlists_update_context_pdps(ppgtt, reg_state);

	/*
	 * Make sure the context image is complete before we submit it to HW.
	 *
	 * Ostensibly, writes (including the WCB) should be flushed prior to
	 * an uncached write such as our mmio register access, but the empirical
	 * evidence (esp. on Braswell) suggests that the WC write into memory
	 * may not be visible to the HW prior to the completion of the UC
	 * register write and that we may begin execution from the context
	 * before its image is complete leading to invalid PD chasing.
	 *
	 * Furthermore, Braswell, at least, wants a full mb to be sure that
	 * the writes are coherent in memory (visible to the GPU) prior to
	 * execution, and not just visible to other CPUs (as is the result of
	 * wmb).
	 */
	mb();
	return ce->lrc_desc;
}

static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
{
	if (execlists->ctrl_reg) {
		writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
		writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
	} else {
		writel(upper_32_bits(desc), execlists->submit_reg);
		writel(lower_32_bits(desc), execlists->submit_reg);
	}
}

static void execlists_submit_ports(struct intel_engine_cs *engine)
{
	struct intel_engine_execlists *execlists = &engine->execlists;
	struct execlist_port *port = execlists->port;
	unsigned int n;

	/*
	 * We can skip acquiring intel_runtime_pm_get() here as it was taken
	 * on our behalf by the request (see i915_gem_mark_busy()) and it will
	 * not be relinquished until the device is idle (see
	 * i915_gem_idle_work_handler()). As a precaution, we make sure
	 * that all ELSP are drained i.e. we have processed the CSB,
	 * before allowing ourselves to idle and calling intel_runtime_pm_put().
	 */
	GEM_BUG_ON(!engine->i915->gt.awake);

	/*
	 * ELSQ note: the submit queue is not cleared after being submitted
	 * to the HW so we need to make sure we always clean it up. This is
	 * currently ensured by the fact that we always write the same number
	 * of elsq entries, keep this in mind before changing the loop below.
	 */
	for (n = execlists_num_ports(execlists); n--; ) {
		struct i915_request *rq;
		unsigned int count;
		u64 desc;

		rq = port_unpack(&port[n], &count);
		if (rq) {
			GEM_BUG_ON(count > !n);
			if (!count++)
				execlists_context_schedule_in(rq);
			port_set(&port[n], port_pack(rq, count));
			desc = execlists_update_context(rq);
			GEM_DEBUG_EXEC(port[n].context_id = upper_32_bits(desc));

			GEM_TRACE("%s in[%d]: ctx=%d.%d, global=%d (fence %llx:%d) (current %d), prio=%d\n",
				  engine->name, n,
				  port[n].context_id, count,
				  rq->global_seqno,
				  rq->fence.context, rq->fence.seqno,
				  intel_engine_get_seqno(engine),
				  rq_prio(rq));
		} else {
			GEM_BUG_ON(!n);
			desc = 0;
		}

		write_desc(execlists, desc, n);
	}

	/* we need to manually load the submit queue */
	if (execlists->ctrl_reg)
		writel(EL_CTRL_LOAD, execlists->ctrl_reg);

	execlists_clear_active(execlists, EXECLISTS_ACTIVE_HWACK);
}

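/*
 * A note on the port[] bookkeeping used above and in process_csb(): each
 * execlist_port packs the request pointer together with a small submission
 * count, e.g.
 *
 *	port_set(&port[n], port_pack(rq, count));
 *	rq = port_unpack(&port[n], &count);
 *
 * so that a lite-restore resubmission of the same request can be told apart
 * from its original submission when the context-switch events are consumed.
 */
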
static bool ctx_single_port_submission(const struct intel_context *ce)
{
	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
		i915_gem_context_force_single_submission(ce->gem_context));
}

static bool can_merge_ctx(const struct intel_context *prev,
			  const struct intel_context *next)
{
	if (prev != next)
		return false;

	if (ctx_single_port_submission(prev))
		return false;

	return true;
}

static void port_assign(struct execlist_port *port, struct i915_request *rq)
{
	GEM_BUG_ON(rq == port_request(port));

	if (port_isset(port))
		i915_request_put(port_request(port));

	port_set(port, port_pack(i915_request_get(rq), port_count(port)));
}

static void inject_preempt_context(struct intel_engine_cs *engine)
{
	struct intel_engine_execlists *execlists = &engine->execlists;
	struct intel_context *ce =
		to_intel_context(engine->i915->preempt_context, engine);
	unsigned int n;

	GEM_BUG_ON(execlists->preempt_complete_status !=
		   upper_32_bits(ce->lrc_desc));

	/*
	 * Switch to our empty preempt context so
	 * the state of the GPU is known (idle).
	 */
	GEM_TRACE("%s\n", engine->name);
	for (n = execlists_num_ports(execlists); --n; )
		write_desc(execlists, 0, n);

	write_desc(execlists, ce->lrc_desc, n);

	/* we need to manually load the submit queue */
	if (execlists->ctrl_reg)
		writel(EL_CTRL_LOAD, execlists->ctrl_reg);

	execlists_clear_active(execlists, EXECLISTS_ACTIVE_HWACK);
	execlists_set_active(execlists, EXECLISTS_ACTIVE_PREEMPT);
}

static void complete_preempt_context(struct intel_engine_execlists *execlists)
{
	GEM_BUG_ON(!execlists_is_active(execlists, EXECLISTS_ACTIVE_PREEMPT));

	if (inject_preempt_hang(execlists))
		return;

	execlists_cancel_port_requests(execlists);
	__unwind_incomplete_requests(container_of(execlists,
						  struct intel_engine_cs,
						  execlists));
}

static void execlists_dequeue(struct intel_engine_cs *engine)
{
	struct intel_engine_execlists * const execlists = &engine->execlists;
	struct execlist_port *port = execlists->port;
	const struct execlist_port * const last_port =
		&execlists->port[execlists->port_mask];
	struct i915_request *last = port_request(port);
	struct rb_node *rb;
	bool submit = false;

	/*
	 * Hardware submission is through 2 ports. Conceptually each port
	 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
	 * static for a context, and unique to each, so we only execute
	 * requests belonging to a single context from each ring. RING_HEAD
	 * is maintained by the CS in the context image, it marks the place
	 * where it got up to last time, and through RING_TAIL we tell the CS
	 * where we want to execute up to this time.
	 *
	 * In this list the requests are in order of execution. Consecutive
	 * requests from the same context are adjacent in the ringbuffer. We
	 * can combine these requests into a single RING_TAIL update:
	 *
	 *              RING_HEAD...req1...req2
	 *                                    ^- RING_TAIL
	 * since to execute req2 the CS must first execute req1.
	 *
	 * Our goal then is to point each port to the end of a consecutive
	 * sequence of requests as being the most optimal (fewest wake ups
	 * and context switches) submission.
	 */

	if (last) {
		/*
		 * Don't resubmit or switch until all outstanding
		 * preemptions (lite-restore) are seen. Then we
		 * know the next preemption status we see corresponds
		 * to this ELSP update.
		 */
		GEM_BUG_ON(!execlists_is_active(execlists,
						EXECLISTS_ACTIVE_USER));
		GEM_BUG_ON(!port_count(&port[0]));

		/*
		 * If we write to ELSP a second time before the HW has had
		 * a chance to respond to the previous write, we can confuse
		 * the HW and hit "undefined behaviour". After writing to ELSP,
		 * we must then wait until we see a context-switch event from
		 * the HW to indicate that it has had a chance to respond.
		 */
		if (!execlists_is_active(execlists, EXECLISTS_ACTIVE_HWACK))
			return;

		if (need_preempt(engine, last, execlists->queue_priority)) {
			inject_preempt_context(engine);
			return;
		}

		/*
		 * In theory, we could coalesce more requests onto
		 * the second port (the first port is active, with
		 * no preemptions pending). However, that means we
		 * then have to deal with the possible lite-restore
		 * of the second port (as we submit the ELSP, there
		 * may be a context-switch) but also we may complete
		 * the resubmission before the context-switch. Ergo,
		 * coalescing onto the second port will cause a
		 * preemption event, but we cannot predict whether
		 * that will affect port[0] or port[1].
		 *
		 * If the second port is already active, we can wait
		 * until the next context-switch before contemplating
		 * new requests. The GPU will be busy and we should be
		 * able to resubmit the new ELSP before it idles,
		 * avoiding pipeline bubbles (momentary pauses where
		 * the driver is unable to keep up the supply of new
		 * work). However, we have to double check that the
		 * priorities of the ports haven't been switched.
		 */
		if (port_count(&port[1]))
			return;

		/*
		 * WaIdleLiteRestore:bdw,skl
		 * Apply the wa NOOPs to prevent
		 * ring:HEAD == rq:TAIL as we resubmit the
		 * request. See gen8_emit_breadcrumb() for
		 * where we prepare the padding after the
		 * end of the request.
		 */
		last->tail = last->wa_tail;
	}

	while ((rb = rb_first_cached(&execlists->queue))) {
		struct i915_priolist *p = to_priolist(rb);
		struct i915_request *rq, *rn;

		list_for_each_entry_safe(rq, rn, &p->requests, sched.link) {
			/*
			 * Can we combine this request with the current port?
			 * It has to be the same context/ringbuffer and not
			 * have any exceptions (e.g. GVT saying never to
			 * combine contexts).
			 *
			 * If we can combine the requests, we can execute both
			 * by updating the RING_TAIL to point to the end of the
			 * second request, and so we never need to tell the
			 * hardware about the first.
			 */
			if (last &&
			    !can_merge_ctx(rq->hw_context, last->hw_context)) {
				/*
				 * If we are on the second port and cannot
				 * combine this request with the last, then we
				 * are done.
				 */
				if (port == last_port) {
					__list_del_many(&p->requests,
							&rq->sched.link);
					goto done;
				}

				/*
				 * If GVT overrides us we only ever submit
				 * port[0], leaving port[1] empty. Note that we
				 * also have to be careful that we don't queue
				 * the same context (even though a different
				 * request) to the second port.
				 */
				if (ctx_single_port_submission(last->hw_context) ||
				    ctx_single_port_submission(rq->hw_context)) {
					__list_del_many(&p->requests,
							&rq->sched.link);
					goto done;
				}

				GEM_BUG_ON(last->hw_context == rq->hw_context);

				if (submit)
					port_assign(port, last);
				port++;

				GEM_BUG_ON(port_isset(port));
			}

			INIT_LIST_HEAD(&rq->sched.link);
			__i915_request_submit(rq);
			trace_i915_request_in(rq, port_index(port, execlists));
			last = rq;
			submit = true;
		}

		rb_erase_cached(&p->node, &execlists->queue);
		INIT_LIST_HEAD(&p->requests);
		if (p->priority != I915_PRIORITY_NORMAL)
			kmem_cache_free(engine->i915->priorities, p);
	}

done:
	/*
	 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
	 *
	 * We choose queue_priority such that if we add a request of greater
	 * priority than this, we kick the submission tasklet to decide on
	 * the right order of submitting the requests to hardware. We must
	 * also be prepared to reorder requests as they are in-flight on the
	 * HW. We derive the queue_priority then as the first "hole" in
	 * the HW submission ports and if there are no available slots,
	 * the priority of the lowest executing request, i.e. last.
	 *
	 * When we do receive a higher priority request ready to run from the
	 * user, see queue_request(), the queue_priority is bumped to that
	 * request triggering preemption on the next dequeue (or subsequent
	 * interrupt for secondary ports).
	 */
	execlists->queue_priority =
		port != execlists->port ? rq_prio(last) : INT_MIN;

	if (submit) {
		port_assign(port, last);
		execlists_submit_ports(engine);
	}

	/* We must always keep the beast fed if we have work piled up */
	GEM_BUG_ON(rb_first_cached(&execlists->queue) &&
		   !port_isset(execlists->port));

	/* Re-evaluate the executing context setup after each preemptive kick */
	if (last)
		execlists_user_begin(execlists, execlists->port);

	/* If the engine is now idle, so should be the flag; and vice versa. */
	GEM_BUG_ON(execlists_is_active(&engine->execlists,
				       EXECLISTS_ACTIVE_USER) ==
		   !port_isset(engine->execlists.port));
}

void
execlists_cancel_port_requests(struct intel_engine_execlists * const execlists)
{
	struct execlist_port *port = execlists->port;
	unsigned int num_ports = execlists_num_ports(execlists);

	while (num_ports-- && port_isset(port)) {
		struct i915_request *rq = port_request(port);

		GEM_TRACE("%s:port%u global=%d (fence %llx:%d), (current %d)\n",
			  rq->engine->name,
			  (unsigned int)(port - execlists->port),
			  rq->global_seqno,
			  rq->fence.context, rq->fence.seqno,
			  intel_engine_get_seqno(rq->engine));

		GEM_BUG_ON(!execlists->active);
		execlists_context_schedule_out(rq,
					       i915_request_completed(rq) ?
					       INTEL_CONTEXT_SCHEDULE_OUT :
					       INTEL_CONTEXT_SCHEDULE_PREEMPTED);

		i915_request_put(rq);

		memset(port, 0, sizeof(*port));
		port++;
	}

	execlists_clear_all_active(execlists);
}

static void reset_csb_pointers(struct intel_engine_execlists *execlists)
{
	/*
	 * After a reset, the HW starts writing into CSB entry [0]. We
	 * therefore have to set our HEAD pointer back one entry so that
	 * the *first* entry we check is entry 0. To complicate this further,
	 * as we don't wait for the first interrupt after reset, we have to
	 * fake the HW write to point back to the last entry so that our
	 * inline comparison of our cached head position against the last HW
	 * write works even before the first interrupt.
	 */
	execlists->csb_head = execlists->csb_write_reset;
	WRITE_ONCE(*execlists->csb_write, execlists->csb_write_reset);
}

static void nop_submission_tasklet(unsigned long data)
{
	/* The driver is wedged; don't process any more events. */
}

static void execlists_cancel_requests(struct intel_engine_cs *engine)
{
	struct intel_engine_execlists * const execlists = &engine->execlists;
	struct i915_request *rq, *rn;
	struct rb_node *rb;
	unsigned long flags;

	GEM_TRACE("%s current %d\n",
		  engine->name, intel_engine_get_seqno(engine));

	/*
	 * Before we call engine->cancel_requests(), we should have exclusive
	 * access to the submission state. This is arranged for us by the
	 * caller disabling the interrupt generation, the tasklet and other
	 * threads that may then access the same state, giving us a free hand
	 * to reset state. However, we still need to let lockdep be aware that
	 * we know this state may be accessed in hardirq context, so we
	 * disable the irq around this manipulation and we want to keep
	 * the spinlock focused on its duties and not accidentally conflate
	 * coverage to the submission's irq state. (Similarly, although we
	 * shouldn't need to disable irq around the manipulation of the
	 * submission's irq state, we also wish to remind ourselves that
	 * it is irq state.)
	 */
	spin_lock_irqsave(&engine->timeline.lock, flags);

	/* Cancel the requests on the HW and clear the ELSP tracker. */
	execlists_cancel_port_requests(execlists);
	execlists_user_end(execlists);

	/* Mark all executing requests as skipped. */
	list_for_each_entry(rq, &engine->timeline.requests, link) {
		GEM_BUG_ON(!rq->global_seqno);
		if (!i915_request_completed(rq))
			dma_fence_set_error(&rq->fence, -EIO);
	}

	/* Flush the queued requests to the timeline list (for retiring). */
	while ((rb = rb_first_cached(&execlists->queue))) {
		struct i915_priolist *p = to_priolist(rb);

		list_for_each_entry_safe(rq, rn, &p->requests, sched.link) {
			INIT_LIST_HEAD(&rq->sched.link);

			dma_fence_set_error(&rq->fence, -EIO);
			__i915_request_submit(rq);
		}

		rb_erase_cached(&p->node, &execlists->queue);
		INIT_LIST_HEAD(&p->requests);
		if (p->priority != I915_PRIORITY_NORMAL)
			kmem_cache_free(engine->i915->priorities, p);
	}

	/* Remaining _unready_ requests will be nop'ed when submitted */

	execlists->queue_priority = INT_MIN;
	execlists->queue = RB_ROOT_CACHED;
	GEM_BUG_ON(port_isset(execlists->port));

	GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
	execlists->tasklet.func = nop_submission_tasklet;

	spin_unlock_irqrestore(&engine->timeline.lock, flags);
}

static inline bool
reset_in_progress(const struct intel_engine_execlists *execlists)
{
	return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
}

static void process_csb(struct intel_engine_cs *engine)
{
	struct intel_engine_execlists * const execlists = &engine->execlists;
	struct execlist_port *port = execlists->port;
	const u32 * const buf = execlists->csb_status;
	u8 head, tail;

	/*
	 * Note that csb_write, csb_status may be either in HWSP or mmio.
	 * When reading from the csb_write mmio register, we have to be
	 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
	 * the low 4bits. As it happens we know the next 4bits are always
	 * zero and so we can simply mask off the low u8 of the register
	 * and treat it identically to reading from the HWSP (without having
	 * to use explicit shifting and masking, and probably bifurcating
	 * the code to handle the legacy mmio read).
	 */
	head = execlists->csb_head;
	tail = READ_ONCE(*execlists->csb_write);
	GEM_TRACE("%s cs-irq head=%d, tail=%d\n", engine->name, head, tail);
	if (unlikely(head == tail))
		return;

	/*
	 * Hopefully paired with a wmb() in HW!
	 *
	 * We must complete the read of the write pointer before any reads
	 * from the CSB, so that we do not see stale values. Without an rmb
	 * (lfence) the HW may speculatively perform the CSB[] reads *before*
	 * we perform the READ_ONCE(*csb_write).
	 */
	rmb();

	do {
		struct i915_request *rq;
		unsigned int status;
		unsigned int count;

		if (++head == GEN8_CSB_ENTRIES)
			head = 0;

		/*
		 * We are flying near dragons again.
		 *
		 * We hold a reference to the request in execlist_port[]
		 * but no more than that. We are operating in softirq
		 * context and so cannot hold any mutex or sleep. That
		 * prevents us stopping the requests we are processing
		 * in port[] from being retired simultaneously (the
		 * breadcrumb will be complete before we see the
		 * context-switch). As we only hold the reference to the
		 * request, any pointer chasing underneath the request
		 * is subject to a potential use-after-free. Thus we
		 * store all of the bookkeeping within port[] as
		 * required, and avoid using unguarded pointers beneath
		 * request itself. The same applies to the atomic
		 * status notifier.
		 */

		GEM_TRACE("%s csb[%d]: status=0x%08x:0x%08x, active=0x%x\n",
			  engine->name, head,
			  buf[2 * head + 0], buf[2 * head + 1],
			  execlists->active);

		status = buf[2 * head];
		if (status & (GEN8_CTX_STATUS_IDLE_ACTIVE |
			      GEN8_CTX_STATUS_PREEMPTED))
			execlists_set_active(execlists,
					     EXECLISTS_ACTIVE_HWACK);
		if (status & GEN8_CTX_STATUS_ACTIVE_IDLE)
			execlists_clear_active(execlists,
					       EXECLISTS_ACTIVE_HWACK);

		if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK))
			continue;

		/* We should never get a COMPLETED | IDLE_ACTIVE! */
		GEM_BUG_ON(status & GEN8_CTX_STATUS_IDLE_ACTIVE);

		if (status & GEN8_CTX_STATUS_COMPLETE &&
		    buf[2*head + 1] == execlists->preempt_complete_status) {
			GEM_TRACE("%s preempt-idle\n", engine->name);
			complete_preempt_context(execlists);
			continue;
		}

		if (status & GEN8_CTX_STATUS_PREEMPTED &&
		    execlists_is_active(execlists,
					EXECLISTS_ACTIVE_PREEMPT))
			continue;

		GEM_BUG_ON(!execlists_is_active(execlists,
						EXECLISTS_ACTIVE_USER));

		rq = port_unpack(port, &count);
		GEM_TRACE("%s out[0]: ctx=%d.%d, global=%d (fence %llx:%d) (current %d), prio=%d\n",
			  engine->name,
			  port->context_id, count,
			  rq ? rq->global_seqno : 0,
			  rq ? rq->fence.context : 0,
			  rq ? rq->fence.seqno : 0,
			  intel_engine_get_seqno(engine),
			  rq ? rq_prio(rq) : 0);

		/* Check the context/desc id for this event matches */
		GEM_DEBUG_BUG_ON(buf[2 * head + 1] != port->context_id);

		GEM_BUG_ON(count == 0);
		if (--count == 0) {
			/*
			 * On the final event corresponding to the
			 * submission of this context, we expect either
			 * an element-switch event or a completion
			 * event (and on completion, the active-idle
			 * marker). No more preemptions, lite-restore
			 * or otherwise.
			 */
			GEM_BUG_ON(status & GEN8_CTX_STATUS_PREEMPTED);
			GEM_BUG_ON(port_isset(&port[1]) &&
				   !(status & GEN8_CTX_STATUS_ELEMENT_SWITCH));
			GEM_BUG_ON(!port_isset(&port[1]) &&
				   !(status & GEN8_CTX_STATUS_ACTIVE_IDLE));

			/*
			 * We rely on the hardware being strongly
			 * ordered, that the breadcrumb write is
			 * coherent (visible from the CPU) before the
			 * user interrupt and CSB is processed.
			 */
			GEM_BUG_ON(!i915_request_completed(rq));

			execlists_context_schedule_out(rq,
						       INTEL_CONTEXT_SCHEDULE_OUT);
			i915_request_put(rq);

			GEM_TRACE("%s completed ctx=%d\n",
				  engine->name, port->context_id);

			port = execlists_port_complete(execlists, port);
			if (port_isset(port))
				execlists_user_begin(execlists, port);
			else
				execlists_user_end(execlists);
		} else {
			port_set(port, port_pack(rq, count));
		}
	} while (head != tail);

	execlists->csb_head = head;
}

static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
{
	lockdep_assert_held(&engine->timeline.lock);

	process_csb(engine);
	if (!execlists_is_active(&engine->execlists, EXECLISTS_ACTIVE_PREEMPT))
		execlists_dequeue(engine);
}

/*
 * Check the unread Context Status Buffers and manage the submission of new
 * contexts to the ELSP accordingly.
 */
static void execlists_submission_tasklet(unsigned long data)
{
	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
	unsigned long flags;

	GEM_TRACE("%s awake?=%d, active=%x\n",
		  engine->name,
		  engine->i915->gt.awake,
		  engine->execlists.active);

	spin_lock_irqsave(&engine->timeline.lock, flags);
	__execlists_submission_tasklet(engine);
	spin_unlock_irqrestore(&engine->timeline.lock, flags);
}

static void queue_request(struct intel_engine_cs *engine,
			  struct i915_sched_node *node,
			  int prio)
{
	list_add_tail(&node->link,
		      &lookup_priolist(engine, prio)->requests);
}

static void __update_queue(struct intel_engine_cs *engine, int prio)
{
	engine->execlists.queue_priority = prio;
}

static void __submit_queue_imm(struct intel_engine_cs *engine)
{
	struct intel_engine_execlists * const execlists = &engine->execlists;

	if (reset_in_progress(execlists))
		return; /* defer until we restart the engine following reset */

	if (execlists->tasklet.func == execlists_submission_tasklet)
		__execlists_submission_tasklet(engine);
	else
		tasklet_hi_schedule(&execlists->tasklet);
}

static void submit_queue(struct intel_engine_cs *engine, int prio)
{
	if (prio > engine->execlists.queue_priority) {
		__update_queue(engine, prio);
		__submit_queue_imm(engine);
	}
}

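/*
 * Putting the helpers above together, the normal (non-preempting) submission
 * path below is, roughly:
 *
 *	execlists_submit_request(rq)
 *	  -> queue_request()            // add to the priority tree
 *	  -> submit_queue()             // bump queue_priority if needed
 *	       -> __submit_queue_imm()  // run the tasklet body directly, or
 *	                                // schedule execlists_submission_tasklet()
 *	            -> __execlists_submission_tasklet()
 *	                 -> process_csb() + execlists_dequeue()
 */
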
static void execlists_submit_request(struct i915_request *request)
{
	struct intel_engine_cs *engine = request->engine;
	unsigned long flags;

	/* Will be called from irq-context when using foreign fences. */
	spin_lock_irqsave(&engine->timeline.lock, flags);

	queue_request(engine, &request->sched, rq_prio(request));

	GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
	GEM_BUG_ON(list_empty(&request->sched.link));

	submit_queue(engine, rq_prio(request));

	spin_unlock_irqrestore(&engine->timeline.lock, flags);
}

static struct i915_request *sched_to_request(struct i915_sched_node *node)
{
	return container_of(node, struct i915_request, sched);
}

static struct intel_engine_cs *
sched_lock_engine(struct i915_sched_node *node, struct intel_engine_cs *locked)
{
	struct intel_engine_cs *engine = sched_to_request(node)->engine;

	GEM_BUG_ON(!locked);

	if (engine != locked) {
		spin_unlock(&locked->timeline.lock);
		spin_lock(&engine->timeline.lock);
	}

	return engine;
}

static void execlists_schedule(struct i915_request *request,
			       const struct i915_sched_attr *attr)
{
	struct i915_priolist *uninitialized_var(pl);
	struct intel_engine_cs *engine, *last;
	struct i915_dependency *dep, *p;
	struct i915_dependency stack;
	const int prio = attr->priority;
	LIST_HEAD(dfs);

	GEM_BUG_ON(prio == I915_PRIORITY_INVALID);

	if (i915_request_completed(request))
		return;

	if (prio <= READ_ONCE(request->sched.attr.priority))
		return;

	/* Need BKL in order to use the temporary link inside i915_dependency */
	lockdep_assert_held(&request->i915->drm.struct_mutex);

	stack.signaler = &request->sched;
	list_add(&stack.dfs_link, &dfs);

	/*
	 * Recursively bump all dependent priorities to match the new request.
	 *
	 * A naive approach would be to use recursion:
	 * static void update_priorities(struct i915_sched_node *node, prio) {
	 *	list_for_each_entry(dep, &node->signalers_list, signal_link)
	 *		update_priorities(dep->signal, prio)
	 *	queue_request(node);
	 * }
	 * but that may have unlimited recursion depth and so runs a very
	 * real risk of overrunning the kernel stack. Instead, we build
	 * a flat list of all dependencies starting with the current request.
	 * As we walk the list of dependencies, we add all of its dependencies
	 * to the end of the list (this may include an already visited
	 * request) and continue to walk onwards onto the new dependencies. The
	 * end result is a topological list of requests in reverse order, the
	 * last element in the list is the request we must execute first.
	 */
	list_for_each_entry(dep, &dfs, dfs_link) {
		struct i915_sched_node *node = dep->signaler;

		/*
		 * Within an engine, there can be no cycle, but we may
		 * refer to the same dependency chain multiple times
		 * (redundant dependencies are not eliminated) and across
		 * engines.
		 */
		list_for_each_entry(p, &node->signalers_list, signal_link) {
			GEM_BUG_ON(p == dep); /* no cycles! */

			if (i915_sched_node_signaled(p->signaler))
				continue;

			GEM_BUG_ON(p->signaler->attr.priority < node->attr.priority);
			if (prio > READ_ONCE(p->signaler->attr.priority))
				list_move_tail(&p->dfs_link, &dfs);
		}
	}

	/*
	 * If we didn't need to bump any existing priorities, and we haven't
	 * yet submitted this request (i.e. there is no potential race with
	 * execlists_submit_request()), we can set our own priority and skip
	 * acquiring the engine locks.
	 */
	if (request->sched.attr.priority == I915_PRIORITY_INVALID) {
		GEM_BUG_ON(!list_empty(&request->sched.link));
		request->sched.attr = *attr;
		if (stack.dfs_link.next == stack.dfs_link.prev)
			return;
		__list_del_entry(&stack.dfs_link);
	}

	last = NULL;
	engine = request->engine;
	spin_lock_irq(&engine->timeline.lock);

	/* Fifo and depth-first replacement ensure our deps execute before us */
	list_for_each_entry_safe_reverse(dep, p, &dfs, dfs_link) {
		struct i915_sched_node *node = dep->signaler;

		INIT_LIST_HEAD(&dep->dfs_link);

		engine = sched_lock_engine(node, engine);

		if (prio <= node->attr.priority)
			continue;

		node->attr.priority = prio;
		if (!list_empty(&node->link)) {
			if (last != engine) {
				pl = lookup_priolist(engine, prio);
				last = engine;
			}
			GEM_BUG_ON(pl->priority != prio);
			list_move_tail(&node->link, &pl->requests);
		}

		if (prio > engine->execlists.queue_priority &&
		    i915_sw_fence_done(&sched_to_request(node)->submit)) {
			/* defer submission until after all of our updates */
			__update_queue(engine, prio);
			tasklet_hi_schedule(&engine->execlists.tasklet);
		}
	}

	spin_unlock_irq(&engine->timeline.lock);
}

static void execlists_context_destroy(struct intel_context *ce)
{
	GEM_BUG_ON(ce->pin_count);

	if (!ce->state)
		return;

	intel_ring_free(ce->ring);

	GEM_BUG_ON(i915_gem_object_is_active(ce->state->obj));
	i915_gem_object_put(ce->state->obj);
}

static void execlists_context_unpin(struct intel_context *ce)
{
	i915_gem_context_unpin_hw_id(ce->gem_context);

	intel_ring_unpin(ce->ring);

	ce->state->obj->pin_global--;
	i915_gem_object_unpin_map(ce->state->obj);
	i915_vma_unpin(ce->state);

	i915_gem_context_put(ce->gem_context);
}

static int __context_pin(struct i915_gem_context *ctx, struct i915_vma *vma)
{
	unsigned int flags;
	int err;

	/*
	 * Clear this page out of any CPU caches for coherent swap-in/out.
	 * We only want to do this on the first bind so that we do not stall
	 * on an active context (which by nature is already on the GPU).
	 */
	if (!(vma->flags & I915_VMA_GLOBAL_BIND)) {
		err = i915_gem_object_set_to_wc_domain(vma->obj, true);
		if (err)
			return err;
	}

	flags = PIN_GLOBAL | PIN_HIGH;
	flags |= PIN_OFFSET_BIAS | i915_ggtt_pin_bias(vma);

	return i915_vma_pin(vma, 0, 0, flags);
}

static struct intel_context *
__execlists_context_pin(struct intel_engine_cs *engine,
			struct i915_gem_context *ctx,
			struct intel_context *ce)
{
	void *vaddr;
	int ret;

	ret = execlists_context_deferred_alloc(ctx, engine, ce);
	if (ret)
		goto err;
	GEM_BUG_ON(!ce->state);

	ret = __context_pin(ctx, ce->state);
	if (ret)
		goto err;

	vaddr = i915_gem_object_pin_map(ce->state->obj,
					i915_coherent_map_type(ctx->i915) |
					I915_MAP_OVERRIDE);
	if (IS_ERR(vaddr)) {
		ret = PTR_ERR(vaddr);
		goto unpin_vma;
	}

	ret = intel_ring_pin(ce->ring);
	if (ret)
		goto unpin_map;

	ret = i915_gem_context_pin_hw_id(ctx);
	if (ret)
		goto unpin_ring;

	intel_lr_context_descriptor_update(ctx, engine, ce);

	GEM_BUG_ON(!intel_ring_offset_valid(ce->ring, ce->ring->head));

	ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
	ce->lrc_reg_state[CTX_RING_BUFFER_START+1] =
		i915_ggtt_offset(ce->ring->vma);
	ce->lrc_reg_state[CTX_RING_HEAD + 1] = ce->ring->head;
	ce->lrc_reg_state[CTX_RING_TAIL + 1] = ce->ring->tail;

	ce->state->obj->pin_global++;
	i915_gem_context_get(ctx);
	return ce;

unpin_ring:
	intel_ring_unpin(ce->ring);
unpin_map:
	i915_gem_object_unpin_map(ce->state->obj);
unpin_vma:
	__i915_vma_unpin(ce->state);
err:
	ce->pin_count = 0;
	return ERR_PTR(ret);
}

static const struct intel_context_ops execlists_context_ops = {
	.unpin = execlists_context_unpin,
	.destroy = execlists_context_destroy,
};

static struct intel_context *
execlists_context_pin(struct intel_engine_cs *engine,
		      struct i915_gem_context *ctx)
{
	struct intel_context *ce = to_intel_context(ctx, engine);

	lockdep_assert_held(&ctx->i915->drm.struct_mutex);

	if (likely(ce->pin_count++))
		return ce;
	GEM_BUG_ON(!ce->pin_count); /* no overflow please! */

	ce->ops = &execlists_context_ops;

	return __execlists_context_pin(engine, ctx, ce);
}

static int execlists_request_alloc(struct i915_request *request)
{
	int ret;

	GEM_BUG_ON(!request->hw_context->pin_count);

	/* Flush enough space to reduce the likelihood of waiting after
	 * we start building the request - in which case we will just
	 * have to repeat work.
	 */
	request->reserved_space += EXECLISTS_REQUEST_SIZE;

	ret = intel_ring_wait_for_space(request->ring, request->reserved_space);
	if (ret)
		return ret;

	/* Note that after this point, we have committed to using
	 * this request as it is being used to both track the
	 * state of engine initialisation and liveness of the
	 * golden renderstate above. Think twice before you try
	 * to cancel/unwind this request now.
	 */

	request->reserved_space -= EXECLISTS_REQUEST_SIZE;
	return 0;
}

9e000847
AS
1432/*
1433 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
1434 * PIPE_CONTROL instruction. This is required for the flush to happen correctly
1435 * but there is a slight complication as this is applied in WA batch where the
1436 * values are only initialized once so we cannot take register value at the
1437 * beginning and reuse it further; hence we save its value to memory, upload a
1438 * constant value with bit21 set and then we restore it back with the saved value.
1439 * To simplify the WA, a constant value is formed by using the default value
1440 * of this register. This shouldn't be a problem because we are only modifying
1441 * it for a short period and this batch in non-premptible. We can ofcourse
1442 * use additional instructions that read the actual value of the register
1443 * at that time and set our bit of interest but it makes the WA complicated.
1444 *
1445 * This WA is also required for Gen9 so extracting as a function avoids
1446 * code duplication.
1447 */
097d4f1c
TU
1448static u32 *
1449gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
17ee950d 1450{
fe78742d 1451 /* NB no one else is allowed to scribble over scratch + 256! */
097d4f1c
TU
1452 *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1453 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
fe78742d 1454 *batch++ = i915_scratch_offset(engine->i915) + 256;
097d4f1c
TU
1455 *batch++ = 0;
1456
1457 *batch++ = MI_LOAD_REGISTER_IMM(1);
1458 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1459 *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
1460
9f235dfa
TU
1461 batch = gen8_emit_pipe_control(batch,
1462 PIPE_CONTROL_CS_STALL |
1463 PIPE_CONTROL_DC_FLUSH_ENABLE,
1464 0);
097d4f1c
TU
1465
1466 *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1467 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
fe78742d 1468 *batch++ = i915_scratch_offset(engine->i915) + 256;
097d4f1c
TU
1469 *batch++ = 0;
1470
1471 return batch;
17ee950d
AS
1472}
1473
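/*
 * Editor's note -- illustrative sketch only, not part of the driver. Counting
 * the emits in gen8_emit_flush_coherentl3_wa() above: the SRM is 4 dwords,
 * the LRI(1) is 3, the PIPE_CONTROL is 6 (see the gen10 comment further down)
 * and the LRM is 4, i.e. 17 dwords per invocation. The TOY_* names are
 * hypothetical; compile the #if 0 block separately.
 */
#if 0
#include <stdio.h>

enum {
	TOY_SRM_DWORDS          = 4,	/* MI_STORE_REGISTER_MEM_GEN8 + reg + addr + 0 */
	TOY_LRI1_DWORDS         = 3,	/* MI_LOAD_REGISTER_IMM(1) + reg + value */
	TOY_PIPE_CONTROL_DWORDS = 6,
	TOY_LRM_DWORDS          = 4,	/* MI_LOAD_REGISTER_MEM_GEN8 + reg + addr + 0 */
};

int main(void)
{
	unsigned int total = TOY_SRM_DWORDS + TOY_LRI1_DWORDS +
			     TOY_PIPE_CONTROL_DWORDS + TOY_LRM_DWORDS;

	printf("coherent L3 flush WA: %u dwords (%u bytes)\n", total, total * 4u);
	return 0;
}
#endif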
6e5248b5
DV
1474/*
1475 * Typically we only have one indirect_ctx and per_ctx batch buffer which are
1476 * initialized at the beginning and shared across all contexts but this field
1477 * helps us to have multiple batches at different offsets and select them based
1478 * on some criterion. At the moment this batch always starts at the beginning of the page
1479 * and at this point we don't have multiple wa_ctx batch buffers.
4d78c8dc 1480 *
6e5248b5
DV
1481 * The number of WAs applied is not known at the beginning; we use this field
1482 * to return the number of DWORDS written.
17ee950d 1483 *
6e5248b5
DV
1484 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
1485 * so it adds NOOPs as padding to make it cacheline aligned.
1486 * MI_BATCH_BUFFER_END will be added to the per-ctx batch and the two together
1487 * make a complete batch buffer.
17ee950d 1488 */
097d4f1c 1489static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
17ee950d 1490{
7ad00d1a 1491 /* WaDisableCtxRestoreArbitration:bdw,chv */
097d4f1c 1492 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
17ee950d 1493
c82435bb 1494 /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
097d4f1c
TU
1495 if (IS_BROADWELL(engine->i915))
1496 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
c82435bb 1497
0160f055
AS
1498 /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
1499 /* Actual scratch location is at 128 bytes offset */
9f235dfa
TU
1500 batch = gen8_emit_pipe_control(batch,
1501 PIPE_CONTROL_FLUSH_L3 |
1502 PIPE_CONTROL_GLOBAL_GTT_IVB |
1503 PIPE_CONTROL_CS_STALL |
1504 PIPE_CONTROL_QW_WRITE,
fe78742d 1505 i915_scratch_offset(engine->i915) +
9f235dfa 1506 2 * CACHELINE_BYTES);
0160f055 1507
beecec90
CW
1508 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1509
17ee950d 1510 /* Pad to end of cacheline */
097d4f1c
TU
1511 while ((unsigned long)batch % CACHELINE_BYTES)
1512 *batch++ = MI_NOOP;
17ee950d
AS
1513
1514 /*
1515 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
1516 * execution depends on the length specified in terms of cache lines
1517 * in the register CTX_RCS_INDIRECT_CTX
1518 */
1519
097d4f1c 1520 return batch;
17ee950d
AS
1521}
1522
5ee4a7a6
CW
1523struct lri {
1524 i915_reg_t reg;
1525 u32 value;
1526};
1527
1528static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
0504cffc 1529{
5ee4a7a6 1530 GEM_BUG_ON(!count || count > 63);
beecec90 1531
5ee4a7a6
CW
1532 *batch++ = MI_LOAD_REGISTER_IMM(count);
1533 do {
1534 *batch++ = i915_mmio_reg_offset(lri->reg);
1535 *batch++ = lri->value;
1536 } while (lri++, --count);
1537 *batch++ = MI_NOOP;
a4106a78 1538
5ee4a7a6
CW
1539 return batch;
1540}
b77422f8 1541
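/*
 * Editor's note -- illustrative sketch only, not part of the driver.
 * emit_lri() above packs up to 63 registers behind one MI_LOAD_REGISTER_IMM
 * header: 1 header dword, 2 dwords per (reg, value) pair, plus a trailing
 * NOOP. The toy_* encodings and register offsets below are stand-ins, not
 * the real MI opcodes; compile the #if 0 block separately.
 */
#if 0
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct toy_lri { uint32_t reg; uint32_t value; };

/* Same shape as emit_lri() above: header, then reg/value pairs, then a NOOP. */
static uint32_t *toy_emit_lri(uint32_t *batch, const struct toy_lri *lri,
			      unsigned int count)
{
	assert(count && count <= 63);

	*batch++ = 0x11000000u | count;		/* stand-in for MI_LOAD_REGISTER_IMM(count) */
	do {
		*batch++ = lri->reg;
		*batch++ = lri->value;
	} while (lri++, --count);
	*batch++ = 0;				/* stand-in for MI_NOOP */

	return batch;
}

int main(void)
{
	static const struct toy_lri lri[] = {
		{ 0x7014, 0x1 },		/* hypothetical offsets/values */
		{ 0x2084, 0x2 },
	};
	uint32_t buf[16];
	uint32_t *end = toy_emit_lri(buf, lri, 2);

	/* 1 header + 2 pairs * 2 dwords + 1 NOOP = 6 dwords */
	printf("emitted %td dwords\n", end - buf);
	return 0;
}
#endif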
5ee4a7a6
CW
1542static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1543{
1544 static const struct lri lri[] = {
1545 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
1546 {
1547 COMMON_SLICE_CHICKEN2,
1548 __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
1549 0),
1550 },
1551
1552 /* BSpec: 11391 */
1553 {
1554 FF_SLICE_CHICKEN,
1555 __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
1556 FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
1557 },
1558
1559 /* BSpec: 11299 */
1560 {
1561 _3D_CHICKEN3,
1562 __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
1563 _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
1564 }
1565 };
b77422f8 1566
5ee4a7a6 1567 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
b77422f8 1568
5ee4a7a6
CW
1569 /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
1570 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
b77422f8 1571
5ee4a7a6 1572 batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
873e8171 1573
066d4628
MK
1574 /* WaClearSlmSpaceAtContextSwitch:kbl */
1575 /* Actual scratch location is at 128 bytes offset */
097d4f1c 1576 if (IS_KBL_REVID(engine->i915, 0, KBL_REVID_A0)) {
9f235dfa
TU
1577 batch = gen8_emit_pipe_control(batch,
1578 PIPE_CONTROL_FLUSH_L3 |
1579 PIPE_CONTROL_GLOBAL_GTT_IVB |
1580 PIPE_CONTROL_CS_STALL |
1581 PIPE_CONTROL_QW_WRITE,
fe78742d 1582 i915_scratch_offset(engine->i915)
9f235dfa 1583 + 2 * CACHELINE_BYTES);
066d4628 1584 }
3485d99e 1585
9fb5026f 1586 /* WaMediaPoolStateCmdInWABB:bxt,glk */
3485d99e
TG
1587 if (HAS_POOLED_EU(engine->i915)) {
1588 /*
1589 * EU pool configuration is set up along with the golden context
1590 * during context initialization. This value depends on
1591 * device type (2x6 or 3x6) and needs to be updated based
1592 * on which subslice is disabled especially for 2x6
1593 * devices. However, it is safe to load the default
1594 * configuration of a 3x6 device instead of masking off the
1595 * corresponding bits, because the HW ignores bits of a disabled
1596 * subslice and drops down to the appropriate config. Please
1597 * see render_state_setup() in i915_gem_render_state.c for
1598 * possible configurations, to avoid duplication they are
1599 * not shown here again.
1600 */
097d4f1c
TU
1601 *batch++ = GEN9_MEDIA_POOL_STATE;
1602 *batch++ = GEN9_MEDIA_POOL_ENABLE;
1603 *batch++ = 0x00777000;
1604 *batch++ = 0;
1605 *batch++ = 0;
1606 *batch++ = 0;
3485d99e
TG
1607 }
1608
beecec90
CW
1609 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1610
0504cffc 1611 /* Pad to end of cacheline */
097d4f1c
TU
1612 while ((unsigned long)batch % CACHELINE_BYTES)
1613 *batch++ = MI_NOOP;
0504cffc 1614
097d4f1c 1615 return batch;
0504cffc
AS
1616}
1617
4b6ce681
RA
1618static u32 *
1619gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1620{
1621 int i;
1622
1623 /*
1624 * WaPipeControlBefore3DStateSamplePattern: cnl
1625 *
1626 * Ensure the engine is idle prior to programming a
1627 * 3DSTATE_SAMPLE_PATTERN during a context restore.
1628 */
1629 batch = gen8_emit_pipe_control(batch,
1630 PIPE_CONTROL_CS_STALL,
1631 0);
1632 /*
1633 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
1634 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
1635 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
1636 * confusing. Since gen8_emit_pipe_control() already advances the
1637 * batch by 6 dwords, we advance the other 10 here, completing a
1638 * cacheline. It's not clear if the workaround requires this padding
1639 * before other commands, or if it's just the regular padding we would
1640 * already have for the workaround bb, so leave it here for now.
1641 */
1642 for (i = 0; i < 10; i++)
1643 *batch++ = MI_NOOP;
1644
1645 /* Pad to end of cacheline */
1646 while ((unsigned long)batch % CACHELINE_BYTES)
1647 *batch++ = MI_NOOP;
1648
1649 return batch;
1650}
1651
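/*
 * Editor's note -- illustrative sketch only, not part of the driver. The
 * padding worked out in gen10_init_indirectctx_bb() above is 6 PIPE_CONTROL
 * dwords + 10 NOOPs = 16 dwords = 64 bytes, i.e. exactly one cacheline,
 * assuming the usual 64-byte CACHELINE_BYTES. Compile the #if 0 block
 * separately.
 */
#if 0
#include <assert.h>
#include <stdio.h>

int main(void)
{
	const unsigned int pipe_control_dwords = 6;
	const unsigned int noop_dwords = 10;
	const unsigned int cacheline_bytes = 64;	/* assumed CACHELINE_BYTES */
	unsigned int bytes = (pipe_control_dwords + noop_dwords) * 4u;

	assert(bytes == cacheline_bytes);
	printf("WaPipeControlBefore3DStateSamplePattern pad: %u bytes\n", bytes);
	return 0;
}
#endif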
097d4f1c
TU
1652#define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
1653
1654static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
17ee950d 1655{
48bb74e4
CW
1656 struct drm_i915_gem_object *obj;
1657 struct i915_vma *vma;
1658 int err;
17ee950d 1659
097d4f1c 1660 obj = i915_gem_object_create(engine->i915, CTX_WA_BB_OBJ_SIZE);
48bb74e4
CW
1661 if (IS_ERR(obj))
1662 return PTR_ERR(obj);
17ee950d 1663
82ad6443 1664 vma = i915_vma_instance(obj, &engine->i915->ggtt.vm, NULL);
48bb74e4
CW
1665 if (IS_ERR(vma)) {
1666 err = PTR_ERR(vma);
1667 goto err;
17ee950d
AS
1668 }
1669
7a859c65 1670 err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
48bb74e4
CW
1671 if (err)
1672 goto err;
1673
1674 engine->wa_ctx.vma = vma;
17ee950d 1675 return 0;
48bb74e4
CW
1676
1677err:
1678 i915_gem_object_put(obj);
1679 return err;
17ee950d
AS
1680}
1681
097d4f1c 1682static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
17ee950d 1683{
6a2f59e4 1684 i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
17ee950d
AS
1685}
1686
097d4f1c
TU
1687typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
1688
0bc40be8 1689static int intel_init_workaround_bb(struct intel_engine_cs *engine)
17ee950d 1690{
48bb74e4 1691 struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
097d4f1c
TU
1692 struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
1693 &wa_ctx->per_ctx };
1694 wa_bb_func_t wa_bb_fn[2];
17ee950d 1695 struct page *page;
097d4f1c
TU
1696 void *batch, *batch_ptr;
1697 unsigned int i;
48bb74e4 1698 int ret;
17ee950d 1699
10bde236 1700 if (GEM_WARN_ON(engine->id != RCS))
097d4f1c 1701 return -EINVAL;
17ee950d 1702
097d4f1c 1703 switch (INTEL_GEN(engine->i915)) {
cc38cae7
OM
1704 case 11:
1705 return 0;
90007bca 1706 case 10:
4b6ce681
RA
1707 wa_bb_fn[0] = gen10_init_indirectctx_bb;
1708 wa_bb_fn[1] = NULL;
1709 break;
097d4f1c
TU
1710 case 9:
1711 wa_bb_fn[0] = gen9_init_indirectctx_bb;
b8aa2233 1712 wa_bb_fn[1] = NULL;
097d4f1c
TU
1713 break;
1714 case 8:
1715 wa_bb_fn[0] = gen8_init_indirectctx_bb;
3ad7b52d 1716 wa_bb_fn[1] = NULL;
097d4f1c
TU
1717 break;
1718 default:
1719 MISSING_CASE(INTEL_GEN(engine->i915));
5e60d790 1720 return 0;
0504cffc 1721 }
5e60d790 1722
097d4f1c 1723 ret = lrc_setup_wa_ctx(engine);
17ee950d
AS
1724 if (ret) {
1725 DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret);
1726 return ret;
1727 }
1728
48bb74e4 1729 page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0);
097d4f1c 1730 batch = batch_ptr = kmap_atomic(page);
17ee950d 1731
097d4f1c
TU
1732 /*
1733 * Emit the two workaround batch buffers, recording the offset from the
1734 * start of the workaround batch buffer object for each and their
1735 * respective sizes.
1736 */
1737 for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
1738 wa_bb[i]->offset = batch_ptr - batch;
1d2a19c2
CW
1739 if (GEM_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
1740 CACHELINE_BYTES))) {
097d4f1c
TU
1741 ret = -EINVAL;
1742 break;
1743 }
604a8f6f
CW
1744 if (wa_bb_fn[i])
1745 batch_ptr = wa_bb_fn[i](engine, batch_ptr);
097d4f1c 1746 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
17ee950d
AS
1747 }
1748
097d4f1c
TU
1749 BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
1750
17ee950d
AS
1751 kunmap_atomic(batch);
1752 if (ret)
097d4f1c 1753 lrc_destroy_wa_ctx(engine);
17ee950d
AS
1754
1755 return ret;
1756}
1757
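/*
 * Editor's note -- illustrative sketch only, not part of the driver.
 * intel_init_workaround_bb() above records, for each workaround batch, its
 * byte offset from the start of the shared page and how many bytes the
 * emitter wrote. The sketch mirrors that pointer bookkeeping with
 * hypothetical lengths; compile the #if 0 block separately.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

struct toy_wa_bb { unsigned int offset, size; };

/* Hypothetical emitter: writes n dwords and returns the advanced pointer. */
static uint32_t *toy_emit(uint32_t *batch, unsigned int n)
{
	while (n--)
		*batch++ = 0;
	return batch;
}

int main(void)
{
	uint32_t page[1024];			/* stands in for the 4KiB wa page */
	struct toy_wa_bb wa_bb[2];
	uint32_t *batch = page, *batch_ptr = page;
	unsigned int i, lens[2] = { 64, 16 };	/* arbitrary cacheline-aligned lengths */

	for (i = 0; i < 2; i++) {
		wa_bb[i].offset = (batch_ptr - batch) * 4u;
		batch_ptr = toy_emit(batch_ptr, lens[i]);
		wa_bb[i].size = (batch_ptr - batch) * 4u - wa_bb[i].offset;
		printf("wa_bb[%u]: offset=%u size=%u\n",
		       i, wa_bb[i].offset, wa_bb[i].size);
	}
	return 0;
}
#endif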
f3c9d407 1758static void enable_execlists(struct intel_engine_cs *engine)
9b1136d5 1759{
c033666a 1760 struct drm_i915_private *dev_priv = engine->i915;
f3c9d407
CW
1761
1762 I915_WRITE(RING_HWSTAM(engine->mmio_base), 0xffffffff);
225701fc
KG
1763
1764 /*
1765 * Make sure we're not enabling the new 12-deep CSB
1766 * FIFO as that requires a slightly updated handling
1767 * in the ctx switch irq. Since we're currently using
1768 * only 2 elements of the enhanced execlists, the
1769 * deeper FIFO is not needed and it's not worth adding
1770 * more statements to the irq handler to support it.
1771 */
1772 if (INTEL_GEN(dev_priv) >= 11)
1773 I915_WRITE(RING_MODE_GEN7(engine),
1774 _MASKED_BIT_DISABLE(GEN11_GFX_DISABLE_LEGACY_MODE));
1775 else
1776 I915_WRITE(RING_MODE_GEN7(engine),
1777 _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE));
1778
9a4dc803
CW
1779 I915_WRITE(RING_MI_MODE(engine->mmio_base),
1780 _MASKED_BIT_DISABLE(STOP_RING));
1781
f3c9d407
CW
1782 I915_WRITE(RING_HWS_PGA(engine->mmio_base),
1783 engine->status_page.ggtt_offset);
1784 POSTING_READ(RING_HWS_PGA(engine->mmio_base));
1785}
1786
9a4dc803
CW
1787static bool unexpected_starting_state(struct intel_engine_cs *engine)
1788{
1789 struct drm_i915_private *dev_priv = engine->i915;
1790 bool unexpected = false;
1791
1792 if (I915_READ(RING_MI_MODE(engine->mmio_base)) & STOP_RING) {
1793 DRM_DEBUG_DRIVER("STOP_RING still set in RING_MI_MODE\n");
1794 unexpected = true;
1795 }
1796
1797 return unexpected;
1798}
1799
f3c9d407
CW
1800static int gen8_init_common_ring(struct intel_engine_cs *engine)
1801{
90098efa
TU
1802 intel_engine_apply_workarounds(engine);
1803
805615da 1804 intel_mocs_init_engine(engine);
9b1136d5 1805
ad07dfcd 1806 intel_engine_reset_breadcrumbs(engine);
821ed7df 1807
9a4dc803
CW
1808 if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
1809 struct drm_printer p = drm_debug_printer(__func__);
1810
1811 intel_engine_dump(engine, &p, NULL);
1812 }
1813
f3c9d407 1814 enable_execlists(engine);
9b1136d5 1815
821ed7df 1816 return 0;
9b1136d5
OM
1817}
1818
0bc40be8 1819static int gen8_init_render_ring(struct intel_engine_cs *engine)
9b1136d5 1820{
c033666a 1821 struct drm_i915_private *dev_priv = engine->i915;
9b1136d5
OM
1822 int ret;
1823
0bc40be8 1824 ret = gen8_init_common_ring(engine);
9b1136d5
OM
1825 if (ret)
1826 return ret;
1827
f4ecfbfc 1828 intel_whitelist_workarounds_apply(engine);
59b449d5 1829
9b1136d5
OM
1830 /* We need to disable the AsyncFlip performance optimisations in order
1831 * to use MI_WAIT_FOR_EVENT within the CS. It should already be
1832 * programmed to '1' on all products.
1833 *
1834 * WaDisableAsyncFlipPerfMode:snb,ivb,hsw,vlv,bdw,chv
1835 */
1836 I915_WRITE(MI_MODE, _MASKED_BIT_ENABLE(ASYNC_FLIP_PERF_DISABLE));
1837
9b1136d5
OM
1838 I915_WRITE(INSTPM, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING));
1839
59b449d5 1840 return 0;
9b1136d5
OM
1841}
1842
0bc40be8 1843static int gen9_init_render_ring(struct intel_engine_cs *engine)
82ef822e
DL
1844{
1845 int ret;
1846
0bc40be8 1847 ret = gen8_init_common_ring(engine);
82ef822e
DL
1848 if (ret)
1849 return ret;
1850
f4ecfbfc 1851 intel_whitelist_workarounds_apply(engine);
59b449d5
OM
1852
1853 return 0;
82ef822e
DL
1854}
1855
5adfb772
CW
1856static struct i915_request *
1857execlists_reset_prepare(struct intel_engine_cs *engine)
1858{
1859 struct intel_engine_execlists * const execlists = &engine->execlists;
63572937 1860 struct i915_request *request, *active;
9512f985 1861 unsigned long flags;
5adfb772 1862
66fc8296
CW
1863 GEM_TRACE("%s: depth<-%d\n", engine->name,
1864 atomic_read(&execlists->tasklet.count));
5adfb772
CW
1865
1866 /*
1867 * Prevent request submission to the hardware until we have
1868 * completed the reset in i915_gem_reset_finish(). If a request
1869 * is completed by one engine, it may then queue a request
1870 * to a second via its execlists->tasklet *just* as we are
1871 * calling engine->init_hw() and also writing the ELSP.
1872 * Turning off the execlists->tasklet until the reset is over
1873 * prevents the race.
1874 */
1875 __tasklet_disable_sync_once(&execlists->tasklet);
1876
9512f985
CW
1877 spin_lock_irqsave(&engine->timeline.lock, flags);
1878
63572937
CW
1879 /*
1880 * We want to flush the pending context switches: having disabled
1881 * the tasklet above, we can assume exclusive access to the execlists.
1882 * This allows us to catch up with an in-flight preemption event,
1883 * and avoid blaming an innocent request if the stall was due to the
1884 * preemption itself.
1885 */
fd8526e5 1886 process_csb(engine);
63572937
CW
1887
1888 /*
1889 * The last active request can then be no later than the last request
1890 * now in ELSP[0]. So search backwards from there, so that if the GPU
1891 * has advanced beyond the last CSB update, it will be pardoned.
1892 */
1893 active = NULL;
1894 request = port_request(execlists->port);
1895 if (request) {
3f6e9822
CW
1896 /*
1897 * Prevent the breadcrumb from advancing before we decide
1898 * which request is currently active.
1899 */
1900 intel_engine_stop_cs(engine);
1901
63572937
CW
1902 list_for_each_entry_from_reverse(request,
1903 &engine->timeline.requests,
1904 link) {
1905 if (__i915_request_completed(request,
1906 request->global_seqno))
1907 break;
1908
1909 active = request;
1910 }
63572937
CW
1911 }
1912
9512f985
CW
1913 spin_unlock_irqrestore(&engine->timeline.lock, flags);
1914
63572937 1915 return active;
5adfb772
CW
1916}
1917
1918static void execlists_reset(struct intel_engine_cs *engine,
1919 struct i915_request *request)
821ed7df 1920{
b620e870 1921 struct intel_engine_execlists * const execlists = &engine->execlists;
221ab971 1922 unsigned long flags;
5692251c 1923 u32 *regs;
cdb6ded4 1924
0c5c7df3
TU
1925 GEM_TRACE("%s request global=%x, current=%d\n",
1926 engine->name, request ? request->global_seqno : 0,
1927 intel_engine_get_seqno(engine));
42232213 1928
d8857d54 1929 spin_lock_irqsave(&engine->timeline.lock, flags);
221ab971 1930
cdb6ded4
CW
1931 /*
1932 * Catch up with any missed context-switch interrupts.
1933 *
1934 * Ideally we would just read the remaining CSB entries now that we
1935 * know the gpu is idle. However, the CSB registers are sometimes^W
1936 * often trashed across a GPU reset! Instead we have to rely on
1937 * guessing the missed context-switch events by looking at what
1938 * requests were completed.
1939 */
a4598d17 1940 execlists_cancel_port_requests(execlists);
cdb6ded4 1941
221ab971 1942 /* Push back any incomplete requests for replay after the reset. */
a4598d17 1943 __unwind_incomplete_requests(engine);
cdb6ded4 1944
c3160da9 1945 /* Following the reset, we need to reload the CSB read/write pointers */
f4b58f04 1946 reset_csb_pointers(&engine->execlists);
c3160da9 1947
d8857d54 1948 spin_unlock_irqrestore(&engine->timeline.lock, flags);
aebbc2d7 1949
a3e38836
CW
1950 /*
1951 * If the request was innocent, we leave the request in the ELSP
c0dcb203
CW
1952 * and will try to replay it on restarting. The context image may
1953 * have been corrupted by the reset, in which case we may have
1954 * to service a new GPU hang, but more likely we can continue on
1955 * without impact.
1956 *
1957 * If the request was guilty, we presume the context is corrupt
1958 * and have to at least restore the RING register in the context
1959 * image back to the expected values to skip over the guilty request.
1960 */
221ab971 1961 if (!request || request->fence.error != -EIO)
c0dcb203 1962 return;
821ed7df 1963
a3e38836
CW
1964 /*
1965 * We want a simple context + ring to execute the breadcrumb update.
a3aabe86
CW
1966 * We cannot rely on the context being intact across the GPU hang,
1967 * so clear it and rebuild just what we need for the breadcrumb.
1968 * All pending requests for this context will be zapped, and any
1969 * future request will be after userspace has had the opportunity
1970 * to recreate its own state.
1971 */
1fc44d9b 1972 regs = request->hw_context->lrc_reg_state;
fe0c4935
CW
1973 if (engine->pinned_default_state) {
1974 memcpy(regs, /* skip restoring the vanilla PPHWSP */
1975 engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
1976 engine->context_size - PAGE_SIZE);
5692251c 1977 }
4e0d64db
CW
1978 execlists_init_reg_state(regs,
1979 request->gem_context, engine, request->ring);
a3aabe86 1980
821ed7df 1981 /* Move the RING_HEAD onto the breadcrumb, past the hanging batch */
5692251c 1982 regs[CTX_RING_BUFFER_START + 1] = i915_ggtt_offset(request->ring->vma);
a3aabe86 1983
41d37680
CW
1984 request->ring->head = intel_ring_wrap(request->ring, request->postfix);
1985 regs[CTX_RING_HEAD + 1] = request->ring->head;
1986
821ed7df
CW
1987 intel_ring_update_space(request->ring);
1988
a3aabe86 1989 /* Reset WaIdleLiteRestore:bdw,skl as well */
7e4992ac 1990 unwind_wa_tail(request);
821ed7df
CW
1991}
1992
5adfb772
CW
1993static void execlists_reset_finish(struct intel_engine_cs *engine)
1994{
5db1d4ea
CW
1995 struct intel_engine_execlists * const execlists = &engine->execlists;
1996
fe25f304 1997 /*
9e4fa012
CW
1998 * After a GPU reset, we may have requests to replay. Do so now while
1999 * we still have the forcewake to be sure that the GPU is not allowed
2000 * to sleep before we restart and reload a context.
fe25f304 2001 *
fe25f304 2002 */
9e4fa012
CW
2003 if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
2004 execlists->tasklet.func(execlists->tasklet.data);
5adfb772 2005
9e4fa012 2006 tasklet_enable(&execlists->tasklet);
66fc8296
CW
2007 GEM_TRACE("%s: depth->%d\n", engine->name,
2008 atomic_read(&execlists->tasklet.count));
5adfb772
CW
2009}
2010
e61e0f51 2011static int intel_logical_ring_emit_pdps(struct i915_request *rq)
7a01a0a2 2012{
4e0d64db 2013 struct i915_hw_ppgtt *ppgtt = rq->gem_context->ppgtt;
e61e0f51 2014 struct intel_engine_cs *engine = rq->engine;
e7167769 2015 const int num_lri_cmds = GEN8_3LVL_PDPES * 2;
73dec95e
TU
2016 u32 *cs;
2017 int i;
7a01a0a2 2018
e61e0f51 2019 cs = intel_ring_begin(rq, num_lri_cmds * 2 + 2);
73dec95e
TU
2020 if (IS_ERR(cs))
2021 return PTR_ERR(cs);
7a01a0a2 2022
73dec95e 2023 *cs++ = MI_LOAD_REGISTER_IMM(num_lri_cmds);
e7167769 2024 for (i = GEN8_3LVL_PDPES - 1; i >= 0; i--) {
7a01a0a2
MT
2025 const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
2026
73dec95e
TU
2027 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(engine, i));
2028 *cs++ = upper_32_bits(pd_daddr);
2029 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(engine, i));
2030 *cs++ = lower_32_bits(pd_daddr);
7a01a0a2
MT
2031 }
2032
73dec95e 2033 *cs++ = MI_NOOP;
e61e0f51 2034 intel_ring_advance(rq, cs);
7a01a0a2
MT
2035
2036 return 0;
2037}
2038
e61e0f51 2039static int gen8_emit_bb_start(struct i915_request *rq,
803688ba 2040 u64 offset, u32 len,
54af56db 2041 const unsigned int flags)
15648585 2042{
73dec95e 2043 u32 *cs;
15648585
OM
2044 int ret;
2045
7a01a0a2
MT
2046 /* Don't rely on the hw updating PDPs, especially in lite-restore.
2047 * Ideally, we should set Force PD Restore in ctx descriptor,
2048 * but we can't. Force Restore would be a second option, but
2049 * it is unsafe in case of lite-restore (because the ctx is
2dba3239
MT
2050 * not idle). PML4 is allocated during ppgtt init so this is
2051 * not needed in 48-bit. */
4e0d64db
CW
2052 if (rq->gem_context->ppgtt &&
2053 (intel_engine_flag(rq->engine) & rq->gem_context->ppgtt->pd_dirty_rings) &&
82ad6443 2054 !i915_vm_is_48bit(&rq->gem_context->ppgtt->vm) &&
e61e0f51
CW
2055 !intel_vgpu_active(rq->i915)) {
2056 ret = intel_logical_ring_emit_pdps(rq);
54af56db
MK
2057 if (ret)
2058 return ret;
7a01a0a2 2059
4e0d64db 2060 rq->gem_context->ppgtt->pd_dirty_rings &= ~intel_engine_flag(rq->engine);
7a01a0a2
MT
2061 }
2062
74f94741 2063 cs = intel_ring_begin(rq, 6);
73dec95e
TU
2064 if (IS_ERR(cs))
2065 return PTR_ERR(cs);
15648585 2066
279f5a00
CW
2067 /*
2068 * WaDisableCtxRestoreArbitration:bdw,chv
2069 *
2070 * We don't need to emit MI_ARB_ENABLE as often as we do (in
2071 * particular on all the gens that do not need the w/a at all!); if we
2072 * took care to make sure that on every switch into this context
2073 * (both ordinary and for preemption) arbitration was enabled,
2074 * we would be fine. However, there doesn't seem to be a downside to
2075 * being paranoid and making sure it is set before each batch and
2076 * every context-switch.
2077 *
2078 * Note that if we fail to enable arbitration before the request
2079 * is complete, then we do not see the context-switch interrupt and
2080 * the engine hangs (with RING_HEAD == RING_TAIL).
2081 *
2082 * That satisfies both the GPGPU w/a and our heavy-handed paranoia.
2083 */
3ad7b52d
CW
2084 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
2085
15648585 2086 /* FIXME(BDW): Address space and security selectors. */
54af56db 2087 *cs++ = MI_BATCH_BUFFER_START_GEN8 |
08e3e21a 2088 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
73dec95e
TU
2089 *cs++ = lower_32_bits(offset);
2090 *cs++ = upper_32_bits(offset);
74f94741
CW
2091
2092 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
2093 *cs++ = MI_NOOP;
e61e0f51 2094 intel_ring_advance(rq, cs);
15648585
OM
2095
2096 return 0;
2097}
2098
31bb59cc 2099static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
73d477f6 2100{
c033666a 2101 struct drm_i915_private *dev_priv = engine->i915;
31bb59cc
CW
2102 I915_WRITE_IMR(engine,
2103 ~(engine->irq_enable_mask | engine->irq_keep_mask));
2104 POSTING_READ_FW(RING_IMR(engine->mmio_base));
73d477f6
OM
2105}
2106
31bb59cc 2107static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
73d477f6 2108{
c033666a 2109 struct drm_i915_private *dev_priv = engine->i915;
31bb59cc 2110 I915_WRITE_IMR(engine, ~engine->irq_keep_mask);
73d477f6
OM
2111}
2112
e61e0f51 2113static int gen8_emit_flush(struct i915_request *request, u32 mode)
4712274c 2114{
73dec95e 2115 u32 cmd, *cs;
4712274c 2116
73dec95e
TU
2117 cs = intel_ring_begin(request, 4);
2118 if (IS_ERR(cs))
2119 return PTR_ERR(cs);
4712274c
OM
2120
2121 cmd = MI_FLUSH_DW + 1;
2122
f0a1fb10
CW
2123 /* We always require a command barrier so that subsequent
2124 * commands, such as breadcrumb interrupts, are strictly ordered
2125 * wrt the contents of the write cache being flushed to memory
2126 * (and thus being coherent from the CPU).
2127 */
2128 cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
2129
7c9cf4e3 2130 if (mode & EMIT_INVALIDATE) {
f0a1fb10 2131 cmd |= MI_INVALIDATE_TLB;
1dae2dfb 2132 if (request->engine->id == VCS)
f0a1fb10 2133 cmd |= MI_INVALIDATE_BSD;
4712274c
OM
2134 }
2135
73dec95e
TU
2136 *cs++ = cmd;
2137 *cs++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
2138 *cs++ = 0; /* upper addr */
2139 *cs++ = 0; /* value */
2140 intel_ring_advance(request, cs);
4712274c
OM
2141
2142 return 0;
2143}
2144
e61e0f51 2145static int gen8_emit_flush_render(struct i915_request *request,
7c9cf4e3 2146 u32 mode)
4712274c 2147{
b5321f30 2148 struct intel_engine_cs *engine = request->engine;
bde13ebd 2149 u32 scratch_addr =
fe78742d 2150 i915_scratch_offset(engine->i915) + 2 * CACHELINE_BYTES;
0b2d0934 2151 bool vf_flush_wa = false, dc_flush_wa = false;
73dec95e 2152 u32 *cs, flags = 0;
0b2d0934 2153 int len;
4712274c
OM
2154
2155 flags |= PIPE_CONTROL_CS_STALL;
2156
7c9cf4e3 2157 if (mode & EMIT_FLUSH) {
4712274c
OM
2158 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
2159 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
965fd602 2160 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
40a24488 2161 flags |= PIPE_CONTROL_FLUSH_ENABLE;
4712274c
OM
2162 }
2163
7c9cf4e3 2164 if (mode & EMIT_INVALIDATE) {
4712274c
OM
2165 flags |= PIPE_CONTROL_TLB_INVALIDATE;
2166 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
2167 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
2168 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
2169 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
2170 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
2171 flags |= PIPE_CONTROL_QW_WRITE;
2172 flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
4712274c 2173
1a5a9ce7
BW
2174 /*
2175 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
2176 * pipe control.
2177 */
c033666a 2178 if (IS_GEN9(request->i915))
1a5a9ce7 2179 vf_flush_wa = true;
0b2d0934
MK
2180
2181 /* WaForGAMHang:kbl */
2182 if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0))
2183 dc_flush_wa = true;
1a5a9ce7 2184 }
9647ff36 2185
0b2d0934
MK
2186 len = 6;
2187
2188 if (vf_flush_wa)
2189 len += 6;
2190
2191 if (dc_flush_wa)
2192 len += 12;
2193
73dec95e
TU
2194 cs = intel_ring_begin(request, len);
2195 if (IS_ERR(cs))
2196 return PTR_ERR(cs);
4712274c 2197
9f235dfa
TU
2198 if (vf_flush_wa)
2199 cs = gen8_emit_pipe_control(cs, 0, 0);
9647ff36 2200
9f235dfa
TU
2201 if (dc_flush_wa)
2202 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
2203 0);
0b2d0934 2204
9f235dfa 2205 cs = gen8_emit_pipe_control(cs, flags, scratch_addr);
0b2d0934 2206
9f235dfa
TU
2207 if (dc_flush_wa)
2208 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
0b2d0934 2209
73dec95e 2210 intel_ring_advance(request, cs);
4712274c
OM
2211
2212 return 0;
2213}
2214
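/*
 * Editor's note -- illustrative sketch only, not part of the driver.
 * gen8_emit_flush_render() above sizes its ring allocation as 6 dwords for
 * the main PIPE_CONTROL, +6 for the Gen9 leading null PIPE_CONTROL and +12
 * for the two extra PIPE_CONTROLs of the KBL GAM workaround; each
 * gen8_emit_pipe_control() consumes 6 dwords. The toy_* name is
 * hypothetical; compile the #if 0 block separately.
 */
#if 0
#include <stdbool.h>
#include <stdio.h>

static unsigned int toy_flush_len(bool vf_flush_wa, bool dc_flush_wa)
{
	unsigned int len = 6;			/* main PIPE_CONTROL */

	if (vf_flush_wa)
		len += 6;			/* leading null PIPE_CONTROL */
	if (dc_flush_wa)
		len += 12;			/* DC flush before + CS stall after */

	return len;
}

int main(void)
{
	printf("bdw/chv: %u dwords\n", toy_flush_len(false, false));
	printf("gen9:    %u dwords\n", toy_flush_len(true, false));
	printf("kbl A0..B0: %u dwords\n", toy_flush_len(true, true));
	return 0;
}
#endif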
7c17d377
CW
2215/*
2216 * Reserve space for 2 NOOPs at the end of each request to be
2217 * used as a workaround for not being allowed to do lite
2218 * restore with HEAD==TAIL (WaIdleLiteRestore).
2219 */
e61e0f51 2220static void gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
4da46e1e 2221{
beecec90
CW
2222 /* Ensure there's always at least one preemption point per-request. */
2223 *cs++ = MI_ARB_CHECK;
73dec95e
TU
2224 *cs++ = MI_NOOP;
2225 request->wa_tail = intel_ring_offset(request, cs);
caddfe71 2226}
4da46e1e 2227
e61e0f51 2228static void gen8_emit_breadcrumb(struct i915_request *request, u32 *cs)
caddfe71 2229{
7c17d377
CW
2230 /* w/a: bit 5 needs to be zero for MI_FLUSH_DW address. */
2231 BUILD_BUG_ON(I915_GEM_HWS_INDEX_ADDR & (1 << 5));
4da46e1e 2232
df77cd83
MW
2233 cs = gen8_emit_ggtt_write(cs, request->global_seqno,
2234 intel_hws_seqno_address(request->engine));
73dec95e 2235 *cs++ = MI_USER_INTERRUPT;
74f94741 2236 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
73dec95e 2237 request->tail = intel_ring_offset(request, cs);
ed1501d4 2238 assert_ring_tail_valid(request->ring, request->tail);
caddfe71 2239
73dec95e 2240 gen8_emit_wa_tail(request, cs);
7c17d377 2241}
98f29e8d
CW
2242static const int gen8_emit_breadcrumb_sz = 6 + WA_TAIL_DWORDS;
2243
e61e0f51 2244static void gen8_emit_breadcrumb_rcs(struct i915_request *request, u32 *cs)
7c17d377 2245{
ce81a65c
MW
2246 /* We're using qword write, seqno should be aligned to 8 bytes. */
2247 BUILD_BUG_ON(I915_GEM_HWS_INDEX & 1);
2248
df77cd83
MW
2249 cs = gen8_emit_ggtt_write_rcs(cs, request->global_seqno,
2250 intel_hws_seqno_address(request->engine));
73dec95e 2251 *cs++ = MI_USER_INTERRUPT;
74f94741 2252 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
73dec95e 2253 request->tail = intel_ring_offset(request, cs);
ed1501d4 2254 assert_ring_tail_valid(request->ring, request->tail);
caddfe71 2255
73dec95e 2256 gen8_emit_wa_tail(request, cs);
4da46e1e 2257}
df77cd83 2258static const int gen8_emit_breadcrumb_rcs_sz = 8 + WA_TAIL_DWORDS;
98f29e8d 2259
e61e0f51 2260static int gen8_init_rcs_context(struct i915_request *rq)
e7778be1
TD
2261{
2262 int ret;
2263
59b449d5 2264 ret = intel_ctx_workarounds_emit(rq);
e7778be1
TD
2265 if (ret)
2266 return ret;
2267
e61e0f51 2268 ret = intel_rcs_context_init_mocs(rq);
3bbaba0c
PA
2269 /*
2270 * Failing to program the MOCS is non-fatal. The system will not
2271 * run at peak performance. So generate an error and carry on.
2272 */
2273 if (ret)
2274 DRM_ERROR("MOCS failed to program: expect performance issues.\n");
2275
e61e0f51 2276 return i915_gem_render_state_emit(rq);
e7778be1
TD
2277}
2278
73e4d07f
OM
2279/**
2280 * intel_logical_ring_cleanup() - deallocate the Engine Command Streamer
14bb2c11 2281 * @engine: Engine Command Streamer.
73e4d07f 2282 */
0bc40be8 2283void intel_logical_ring_cleanup(struct intel_engine_cs *engine)
454afebd 2284{
6402c330 2285 struct drm_i915_private *dev_priv;
9832b9da 2286
27af5eea
TU
2287 /*
2288 * Tasklet cannot be active at this point due to intel_mark_active/idle,
2289 * so this is just for documentation.
2290 */
c6dce8f1
SAK
2291 if (WARN_ON(test_bit(TASKLET_STATE_SCHED,
2292 &engine->execlists.tasklet.state)))
2293 tasklet_kill(&engine->execlists.tasklet);
27af5eea 2294
c033666a 2295 dev_priv = engine->i915;
6402c330 2296
0bc40be8 2297 if (engine->buffer) {
0bc40be8 2298 WARN_ON((I915_READ_MODE(engine) & MODE_IDLE) == 0);
b0366a54 2299 }
48d82387 2300
0bc40be8
TU
2301 if (engine->cleanup)
2302 engine->cleanup(engine);
48d82387 2303
e8a9c58f 2304 intel_engine_cleanup_common(engine);
17ee950d 2305
097d4f1c 2306 lrc_destroy_wa_ctx(engine);
f3c9d407 2307
c033666a 2308 engine->i915 = NULL;
3b3f1650
AG
2309 dev_priv->engine[engine->id] = NULL;
2310 kfree(engine);
454afebd
OM
2311}
2312
209b7955 2313void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
ddd66c51 2314{
ff44ad51 2315 engine->submit_request = execlists_submit_request;
27a5f61b 2316 engine->cancel_requests = execlists_cancel_requests;
ff44ad51 2317 engine->schedule = execlists_schedule;
c6dce8f1 2318 engine->execlists.tasklet.func = execlists_submission_tasklet;
aba5e278 2319
1329115c
CW
2320 engine->reset.prepare = execlists_reset_prepare;
2321
aba5e278
CW
2322 engine->park = NULL;
2323 engine->unpark = NULL;
cf669b4e
TU
2324
2325 engine->flags |= I915_ENGINE_SUPPORTS_STATS;
2a694feb
CW
2326 if (engine->i915->preempt_context)
2327 engine->flags |= I915_ENGINE_HAS_PREEMPTION;
3fed1808
CW
2328
2329 engine->i915->caps.scheduler =
2330 I915_SCHEDULER_CAP_ENABLED |
2331 I915_SCHEDULER_CAP_PRIORITY;
2a694feb 2332 if (intel_engine_has_preemption(engine))
3fed1808 2333 engine->i915->caps.scheduler |= I915_SCHEDULER_CAP_PREEMPTION;
ddd66c51
CW
2334}
2335
c9cacf93 2336static void
e1382efb 2337logical_ring_default_vfuncs(struct intel_engine_cs *engine)
c9cacf93
TU
2338{
2339 /* Default vfuncs which can be overriden by each engine. */
0bc40be8 2340 engine->init_hw = gen8_init_common_ring;
5adfb772
CW
2341
2342 engine->reset.prepare = execlists_reset_prepare;
2343 engine->reset.reset = execlists_reset;
2344 engine->reset.finish = execlists_reset_finish;
e8a9c58f
CW
2345
2346 engine->context_pin = execlists_context_pin;
f73e7399
CW
2347 engine->request_alloc = execlists_request_alloc;
2348
0bc40be8 2349 engine->emit_flush = gen8_emit_flush;
9b81d556 2350 engine->emit_breadcrumb = gen8_emit_breadcrumb;
98f29e8d 2351 engine->emit_breadcrumb_sz = gen8_emit_breadcrumb_sz;
ff44ad51 2352
209b7955 2353 engine->set_default_submission = intel_execlists_set_default_submission;
ddd66c51 2354
d4ccceb0
TU
2355 if (INTEL_GEN(engine->i915) < 11) {
2356 engine->irq_enable = gen8_logical_ring_enable_irq;
2357 engine->irq_disable = gen8_logical_ring_disable_irq;
2358 } else {
2359 /*
2360 * TODO: On Gen11 interrupt masks need to be clear
2361 * to allow C6 entry. Keep interrupts enabled at
2362 * and take the hit of generating extra interrupts
2363 * until a more refined solution exists.
2364 */
2365 }
0bc40be8 2366 engine->emit_bb_start = gen8_emit_bb_start;
c9cacf93
TU
2367}
2368
d9f3af96 2369static inline void
c2c7f240 2370logical_ring_default_irqs(struct intel_engine_cs *engine)
d9f3af96 2371{
fa6f071d
DCS
2372 unsigned int shift = 0;
2373
2374 if (INTEL_GEN(engine->i915) < 11) {
2375 const u8 irq_shifts[] = {
2376 [RCS] = GEN8_RCS_IRQ_SHIFT,
2377 [BCS] = GEN8_BCS_IRQ_SHIFT,
2378 [VCS] = GEN8_VCS1_IRQ_SHIFT,
2379 [VCS2] = GEN8_VCS2_IRQ_SHIFT,
2380 [VECS] = GEN8_VECS_IRQ_SHIFT,
2381 };
2382
2383 shift = irq_shifts[engine->id];
2384 }
2385
0bc40be8
TU
2386 engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
2387 engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
d9f3af96
TU
2388}
2389
bb45438f
TU
2390static void
2391logical_ring_setup(struct intel_engine_cs *engine)
2392{
019bf277
TU
2393 intel_engine_setup_common(engine);
2394
bb45438f
TU
2395 /* Intentionally left blank. */
2396 engine->buffer = NULL;
2397
c6dce8f1
SAK
2398 tasklet_init(&engine->execlists.tasklet,
2399 execlists_submission_tasklet, (unsigned long)engine);
bb45438f 2400
bb45438f
TU
2401 logical_ring_default_vfuncs(engine);
2402 logical_ring_default_irqs(engine);
bb45438f
TU
2403}
2404
bc4237ec
CW
2405static bool csb_force_mmio(struct drm_i915_private *i915)
2406{
2407 /* Older GVT emulation depends upon intercepting CSB mmio */
2408 return intel_vgpu_active(i915) && !intel_vgpu_has_hwsp_emulation(i915);
2409}
2410
486e93f7 2411static int logical_ring_init(struct intel_engine_cs *engine)
a19d6ff2 2412{
bc4237ec
CW
2413 struct drm_i915_private *i915 = engine->i915;
2414 struct intel_engine_execlists * const execlists = &engine->execlists;
a19d6ff2
TU
2415 int ret;
2416
019bf277 2417 ret = intel_engine_init_common(engine);
a19d6ff2 2418 if (ret)
b2164e48 2419 return ret;
a19d6ff2 2420
bc4237ec
CW
2421 if (HAS_LOGICAL_RING_ELSQ(i915)) {
2422 execlists->submit_reg = i915->regs +
05f0addd 2423 i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(engine));
bc4237ec 2424 execlists->ctrl_reg = i915->regs +
05f0addd
TD
2425 i915_mmio_reg_offset(RING_EXECLIST_CONTROL(engine));
2426 } else {
bc4237ec 2427 execlists->submit_reg = i915->regs +
05f0addd
TD
2428 i915_mmio_reg_offset(RING_ELSP(engine));
2429 }
693cfbf0 2430
bc4237ec
CW
2431 execlists->preempt_complete_status = ~0u;
2432 if (i915->preempt_context) {
ab82a063 2433 struct intel_context *ce =
bc4237ec 2434 to_intel_context(i915->preempt_context, engine);
ab82a063 2435
bc4237ec 2436 execlists->preempt_complete_status =
ab82a063
CW
2437 upper_32_bits(ce->lrc_desc);
2438 }
d6376374 2439
bc4237ec
CW
2440 execlists->csb_read =
2441 i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine));
2442 if (csb_force_mmio(i915)) {
2443 execlists->csb_status = (u32 __force *)
2444 (i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_BUF_LO(engine, 0)));
2445
2446 execlists->csb_write = (u32 __force *)execlists->csb_read;
f4b58f04
CW
2447 execlists->csb_write_reset =
2448 _MASKED_FIELD(GEN8_CSB_WRITE_PTR_MASK,
2449 GEN8_CSB_ENTRIES - 1);
bc4237ec
CW
2450 } else {
2451 execlists->csb_status =
2452 &engine->status_page.page_addr[I915_HWS_CSB_BUF0_INDEX];
2453
2454 execlists->csb_write =
2455 &engine->status_page.page_addr[intel_hws_csb_write_index(i915)];
f4b58f04 2456 execlists->csb_write_reset = GEN8_CSB_ENTRIES - 1;
bc4237ec 2457 }
f4b58f04 2458 reset_csb_pointers(execlists);
c3160da9 2459
a19d6ff2 2460 return 0;
a19d6ff2
TU
2461}
2462
88d2ba2e 2463int logical_render_ring_init(struct intel_engine_cs *engine)
a19d6ff2
TU
2464{
2465 struct drm_i915_private *dev_priv = engine->i915;
2466 int ret;
2467
bb45438f
TU
2468 logical_ring_setup(engine);
2469
a19d6ff2
TU
2470 if (HAS_L3_DPF(dev_priv))
2471 engine->irq_keep_mask |= GT_RENDER_L3_PARITY_ERROR_INTERRUPT;
2472
2473 /* Override some for render ring. */
2474 if (INTEL_GEN(dev_priv) >= 9)
2475 engine->init_hw = gen9_init_render_ring;
2476 else
2477 engine->init_hw = gen8_init_render_ring;
2478 engine->init_context = gen8_init_rcs_context;
a19d6ff2 2479 engine->emit_flush = gen8_emit_flush_render;
df77cd83
MW
2480 engine->emit_breadcrumb = gen8_emit_breadcrumb_rcs;
2481 engine->emit_breadcrumb_sz = gen8_emit_breadcrumb_rcs_sz;
a19d6ff2 2482
b2164e48 2483 ret = logical_ring_init(engine);
a19d6ff2
TU
2484 if (ret)
2485 return ret;
2486
2487 ret = intel_init_workaround_bb(engine);
2488 if (ret) {
2489 /*
2490 * We continue even if we fail to initialize WA batch
2491 * because we only expect rare glitches but nothing
2492 * critical to prevent us from using GPU
2493 */
2494 DRM_ERROR("WA batch buffer initialization failed: %d\n",
2495 ret);
2496 }
2497
90098efa
TU
2498 intel_engine_init_workarounds(engine);
2499
b2164e48 2500 return 0;
a19d6ff2
TU
2501}
2502
88d2ba2e 2503int logical_xcs_ring_init(struct intel_engine_cs *engine)
bb45438f
TU
2504{
2505 logical_ring_setup(engine);
2506
2507 return logical_ring_init(engine);
454afebd
OM
2508}
2509
0cea6502 2510static u32
c033666a 2511make_rpcs(struct drm_i915_private *dev_priv)
0cea6502 2512{
b212f0a4
TU
2513 bool subslice_pg = INTEL_INFO(dev_priv)->sseu.has_subslice_pg;
2514 u8 slices = hweight8(INTEL_INFO(dev_priv)->sseu.slice_mask);
2515 u8 subslices = hweight8(INTEL_INFO(dev_priv)->sseu.subslice_mask[0]);
0cea6502
JM
2516 u32 rpcs = 0;
2517
2518 /*
2519 * No explicit RPCS request is needed to ensure full
2520 * slice/subslice/EU enablement prior to Gen9.
2521 */
c033666a 2522 if (INTEL_GEN(dev_priv) < 9)
0cea6502
JM
2523 return 0;
2524
b212f0a4
TU
2525 /*
2526 * Since the SScount bitfield in GEN8_R_PWR_CLK_STATE is only three bits
2527 * wide and Icelake has up to eight subslices, special programming is
2528 * needed in order to correctly enable all subslices.
2529 *
2530 * According to documentation software must consider the configuration
2531 * as 2x4x8 and hardware will translate this to 1x8x8.
2532 *
2533 * Furthermore, even though SScount is three bits, the maximum documented
2534 * value for it is four. From this some rules/restrictions follow:
2535 *
2536 * 1.
2537 * If enabled subslice count is greater than four, two whole slices must
2538 * be enabled instead.
2539 *
2540 * 2.
2541 * When more than one slice is enabled, hardware ignores the subslice
2542 * count altogether.
2543 *
2544 * From these restrictions it follows that it is not possible to enable
2545 * a subslice count between the SScount maximum of four and the
2546 * maximum number available on a particular SKU. Either all
2547 * subslices are enabled, or a count between one and four on the first
2548 * slice.
2549 */
2550 if (IS_GEN11(dev_priv) && slices == 1 && subslices >= 4) {
2551 GEM_BUG_ON(subslices & 1);
2552
2553 subslice_pg = false;
2554 slices *= 2;
2555 }
2556
0cea6502
JM
2557 /*
2558 * Starting in Gen9, render power gating can leave
2559 * slice/subslice/EU in a partially enabled state. We
2560 * must make an explicit request through RPCS for full
2561 * enablement.
2562 */
43b67998 2563 if (INTEL_INFO(dev_priv)->sseu.has_slice_pg) {
b212f0a4
TU
2564 u32 mask, val = slices;
2565
2566 if (INTEL_GEN(dev_priv) >= 11) {
2567 mask = GEN11_RPCS_S_CNT_MASK;
2568 val <<= GEN11_RPCS_S_CNT_SHIFT;
2569 } else {
2570 mask = GEN8_RPCS_S_CNT_MASK;
2571 val <<= GEN8_RPCS_S_CNT_SHIFT;
2572 }
2573
2574 GEM_BUG_ON(val & ~mask);
2575 val &= mask;
2576
2577 rpcs |= GEN8_RPCS_ENABLE | GEN8_RPCS_S_CNT_ENABLE | val;
0cea6502
JM
2578 }
2579
b212f0a4
TU
2580 if (subslice_pg) {
2581 u32 val = subslices;
2582
2583 val <<= GEN8_RPCS_SS_CNT_SHIFT;
2584
2585 GEM_BUG_ON(val & ~GEN8_RPCS_SS_CNT_MASK);
2586 val &= GEN8_RPCS_SS_CNT_MASK;
2587
2588 rpcs |= GEN8_RPCS_ENABLE | GEN8_RPCS_SS_CNT_ENABLE | val;
0cea6502
JM
2589 }
2590
43b67998 2591 if (INTEL_INFO(dev_priv)->sseu.has_eu_pg) {
b212f0a4
TU
2592 u32 val;
2593
2594 val = INTEL_INFO(dev_priv)->sseu.eu_per_subslice <<
2595 GEN8_RPCS_EU_MIN_SHIFT;
2596 GEM_BUG_ON(val & ~GEN8_RPCS_EU_MIN_MASK);
2597 val &= GEN8_RPCS_EU_MIN_MASK;
2598
2599 rpcs |= val;
2600
2601 val = INTEL_INFO(dev_priv)->sseu.eu_per_subslice <<
2602 GEN8_RPCS_EU_MAX_SHIFT;
2603 GEM_BUG_ON(val & ~GEN8_RPCS_EU_MAX_MASK);
2604 val &= GEN8_RPCS_EU_MAX_MASK;
2605
2606 rpcs |= val;
2607
0cea6502
JM
2608 rpcs |= GEN8_RPCS_ENABLE;
2609 }
2610
2611 return rpcs;
2612}
2613
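/*
 * Editor's note -- illustrative sketch only, not part of the driver. It
 * captures just the Gen11 adjustment described in the comment inside
 * make_rpcs() above: one slice with more than four subslices is programmed
 * as two slices, and subslice power gating is not requested, so SS_CNT is
 * left unprogrammed. The toy_* names are hypothetical; compile the #if 0
 * block separately.
 */
#if 0
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_sseu { unsigned int slices, subslices; bool subslice_pg; };

static void toy_gen11_rpcs_fixup(struct toy_sseu *s)
{
	if (s->slices == 1 && s->subslices >= 4) {
		assert(!(s->subslices & 1));
		s->subslice_pg = false;		/* SS_CNT will not be programmed */
		s->slices *= 2;			/* 1x8x8 is described as 2x4x8 */
	}
}

int main(void)
{
	struct toy_sseu s = { .slices = 1, .subslices = 8, .subslice_pg = true };

	toy_gen11_rpcs_fixup(&s);
	printf("S_CNT=%u, subslice_pg=%d\n", s.slices, s.subslice_pg);
	return 0;
}
#endif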
0bc40be8 2614static u32 intel_lr_indirect_ctx_offset(struct intel_engine_cs *engine)
71562919
MT
2615{
2616 u32 indirect_ctx_offset;
2617
c033666a 2618 switch (INTEL_GEN(engine->i915)) {
71562919 2619 default:
c033666a 2620 MISSING_CASE(INTEL_GEN(engine->i915));
71562919 2621 /* fall through */
fd034c77
MT
2622 case 11:
2623 indirect_ctx_offset =
2624 GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
2625 break;
7bd0a2c6
MT
2626 case 10:
2627 indirect_ctx_offset =
2628 GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
2629 break;
71562919
MT
2630 case 9:
2631 indirect_ctx_offset =
2632 GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
2633 break;
2634 case 8:
2635 indirect_ctx_offset =
2636 GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
2637 break;
2638 }
2639
2640 return indirect_ctx_offset;
2641}
2642
56e51bf0 2643static void execlists_init_reg_state(u32 *regs,
a3aabe86
CW
2644 struct i915_gem_context *ctx,
2645 struct intel_engine_cs *engine,
2646 struct intel_ring *ring)
8670d6f9 2647{
a3aabe86
CW
2648 struct drm_i915_private *dev_priv = engine->i915;
2649 struct i915_hw_ppgtt *ppgtt = ctx->ppgtt ?: dev_priv->mm.aliasing_ppgtt;
56e51bf0 2650 u32 base = engine->mmio_base;
1fc44d9b 2651 bool rcs = engine->class == RENDER_CLASS;
56e51bf0
TU
2652
2653 /* A context is actually a big batch buffer with several
2654 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
2655 * values we are setting here are only for the first context restore:
2656 * on a subsequent save, the GPU will recreate this batchbuffer with new
2657 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
2658 * we are not initializing here).
2659 */
2660 regs[CTX_LRI_HEADER_0] = MI_LOAD_REGISTER_IMM(rcs ? 14 : 11) |
2661 MI_LRI_FORCE_POSTED;
2662
2663 CTX_REG(regs, CTX_CONTEXT_CONTROL, RING_CONTEXT_CONTROL(engine),
ee435831 2664 _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT) |
08e3e21a 2665 _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH));
ee435831
PZ
2666 if (INTEL_GEN(dev_priv) < 11) {
2667 regs[CTX_CONTEXT_CONTROL + 1] |=
2668 _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
2669 CTX_CTRL_RS_CTX_ENABLE);
2670 }
56e51bf0
TU
2671 CTX_REG(regs, CTX_RING_HEAD, RING_HEAD(base), 0);
2672 CTX_REG(regs, CTX_RING_TAIL, RING_TAIL(base), 0);
2673 CTX_REG(regs, CTX_RING_BUFFER_START, RING_START(base), 0);
2674 CTX_REG(regs, CTX_RING_BUFFER_CONTROL, RING_CTL(base),
2675 RING_CTL_SIZE(ring->size) | RING_VALID);
2676 CTX_REG(regs, CTX_BB_HEAD_U, RING_BBADDR_UDW(base), 0);
2677 CTX_REG(regs, CTX_BB_HEAD_L, RING_BBADDR(base), 0);
2678 CTX_REG(regs, CTX_BB_STATE, RING_BBSTATE(base), RING_BB_PPGTT);
2679 CTX_REG(regs, CTX_SECOND_BB_HEAD_U, RING_SBBADDR_UDW(base), 0);
2680 CTX_REG(regs, CTX_SECOND_BB_HEAD_L, RING_SBBADDR(base), 0);
2681 CTX_REG(regs, CTX_SECOND_BB_STATE, RING_SBBSTATE(base), 0);
2682 if (rcs) {
604a8f6f
CW
2683 struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
2684
56e51bf0
TU
2685 CTX_REG(regs, CTX_RCS_INDIRECT_CTX, RING_INDIRECT_CTX(base), 0);
2686 CTX_REG(regs, CTX_RCS_INDIRECT_CTX_OFFSET,
2687 RING_INDIRECT_CTX_OFFSET(base), 0);
604a8f6f 2688 if (wa_ctx->indirect_ctx.size) {
bde13ebd 2689 u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
17ee950d 2690
56e51bf0 2691 regs[CTX_RCS_INDIRECT_CTX + 1] =
097d4f1c
TU
2692 (ggtt_offset + wa_ctx->indirect_ctx.offset) |
2693 (wa_ctx->indirect_ctx.size / CACHELINE_BYTES);
17ee950d 2694
56e51bf0 2695 regs[CTX_RCS_INDIRECT_CTX_OFFSET + 1] =
0bc40be8 2696 intel_lr_indirect_ctx_offset(engine) << 6;
604a8f6f
CW
2697 }
2698
2699 CTX_REG(regs, CTX_BB_PER_CTX_PTR, RING_BB_PER_CTX_PTR(base), 0);
2700 if (wa_ctx->per_ctx.size) {
2701 u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
17ee950d 2702
56e51bf0 2703 regs[CTX_BB_PER_CTX_PTR + 1] =
097d4f1c 2704 (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
17ee950d 2705 }
8670d6f9 2706 }
56e51bf0
TU
2707
2708 regs[CTX_LRI_HEADER_1] = MI_LOAD_REGISTER_IMM(9) | MI_LRI_FORCE_POSTED;
2709
2710 CTX_REG(regs, CTX_CTX_TIMESTAMP, RING_CTX_TIMESTAMP(base), 0);
0d925ea0 2711 /* PDP values will be assigned later if needed */
56e51bf0
TU
2712 CTX_REG(regs, CTX_PDP3_UDW, GEN8_RING_PDP_UDW(engine, 3), 0);
2713 CTX_REG(regs, CTX_PDP3_LDW, GEN8_RING_PDP_LDW(engine, 3), 0);
2714 CTX_REG(regs, CTX_PDP2_UDW, GEN8_RING_PDP_UDW(engine, 2), 0);
2715 CTX_REG(regs, CTX_PDP2_LDW, GEN8_RING_PDP_LDW(engine, 2), 0);
2716 CTX_REG(regs, CTX_PDP1_UDW, GEN8_RING_PDP_UDW(engine, 1), 0);
2717 CTX_REG(regs, CTX_PDP1_LDW, GEN8_RING_PDP_LDW(engine, 1), 0);
2718 CTX_REG(regs, CTX_PDP0_UDW, GEN8_RING_PDP_UDW(engine, 0), 0);
2719 CTX_REG(regs, CTX_PDP0_LDW, GEN8_RING_PDP_LDW(engine, 0), 0);
d7b2633d 2720
82ad6443 2721 if (ppgtt && i915_vm_is_48bit(&ppgtt->vm)) {
2dba3239
MT
2722 /* 64b PPGTT (48bit canonical)
2723 * PDP0_DESCRIPTOR contains the base address to PML4 and
2724 * other PDP Descriptors are ignored.
2725 */
56e51bf0 2726 ASSIGN_CTX_PML4(ppgtt, regs);
2dba3239
MT
2727 }
2728
56e51bf0
TU
2729 if (rcs) {
2730 regs[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1);
2731 CTX_REG(regs, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE,
2732 make_rpcs(dev_priv));
19f81df2
RB
2733
2734 i915_oa_init_reg_state(engine, ctx, regs);
8670d6f9 2735 }
d0f5cc5d
CW
2736
2737 regs[CTX_END] = MI_BATCH_BUFFER_END;
2738 if (INTEL_GEN(dev_priv) >= 10)
2739 regs[CTX_END] |= BIT(0);
a3aabe86
CW
2740}
2741
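/*
 * Editor's note -- illustrative sketch only, not part of the driver.
 * execlists_init_reg_state() above lays out the register state as
 * MI_LOAD_REGISTER_IMM headers followed by (mmio offset, value) pairs,
 * which is why fixups elsewhere in this file write regs[CTX_xxx + 1]:
 * index CTX_xxx holds the register offset and CTX_xxx + 1 its value. The
 * toy indices and mmio offsets below are illustrative only; compile the
 * #if 0 block separately.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

enum { TOY_CTX_RING_HEAD = 4, TOY_CTX_RING_TAIL = 6 };

/* Same convention as CTX_REG(regs, pos, reg, val) above. */
static void toy_ctx_reg(uint32_t *regs, unsigned int pos,
			uint32_t mmio, uint32_t val)
{
	regs[pos] = mmio;
	regs[pos + 1] = val;
}

int main(void)
{
	uint32_t regs[16] = { 0 };

	toy_ctx_reg(regs, TOY_CTX_RING_HEAD, 0x2034, 0);	/* illustrative offset */
	toy_ctx_reg(regs, TOY_CTX_RING_TAIL, 0x2030, 0);	/* illustrative offset */

	/* Later fixups (reset/resume) only touch the value slot: */
	regs[TOY_CTX_RING_HEAD + 1] = 0x40;

	printf("head reg %#x = %#x\n",
	       regs[TOY_CTX_RING_HEAD], regs[TOY_CTX_RING_HEAD + 1]);
	return 0;
}
#endif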
2742static int
2743populate_lr_context(struct i915_gem_context *ctx,
2744 struct drm_i915_gem_object *ctx_obj,
2745 struct intel_engine_cs *engine,
2746 struct intel_ring *ring)
2747{
2748 void *vaddr;
d2b4b979 2749 u32 *regs;
a3aabe86
CW
2750 int ret;
2751
2752 ret = i915_gem_object_set_to_cpu_domain(ctx_obj, true);
2753 if (ret) {
2754 DRM_DEBUG_DRIVER("Could not set to CPU domain\n");
2755 return ret;
2756 }
2757
2758 vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
2759 if (IS_ERR(vaddr)) {
2760 ret = PTR_ERR(vaddr);
2761 DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret);
2762 return ret;
2763 }
a4f5ea64 2764 ctx_obj->mm.dirty = true;
a3aabe86 2765
d2b4b979
CW
2766 if (engine->default_state) {
2767 /*
2768 * We only want to copy over the template context state;
2769 * skipping over the headers reserved for GuC communication,
2770 * leaving those as zero.
2771 */
2772 const unsigned long start = LRC_HEADER_PAGES * PAGE_SIZE;
2773 void *defaults;
2774
2775 defaults = i915_gem_object_pin_map(engine->default_state,
2776 I915_MAP_WB);
aaefa06a
MA
2777 if (IS_ERR(defaults)) {
2778 ret = PTR_ERR(defaults);
2779 goto err_unpin_ctx;
2780 }
d2b4b979
CW
2781
2782 memcpy(vaddr + start, defaults + start, engine->context_size);
2783 i915_gem_object_unpin_map(engine->default_state);
2784 }
2785
a3aabe86
CW
2786 /* The second page of the context object contains some fields which must
2787 * be set up prior to the first execution. */
d2b4b979
CW
2788 regs = vaddr + LRC_STATE_PN * PAGE_SIZE;
2789 execlists_init_reg_state(regs, ctx, engine, ring);
2790 if (!engine->default_state)
2791 regs[CTX_CONTEXT_CONTROL + 1] |=
2792 _MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
05f0addd 2793 if (ctx == ctx->i915->preempt_context && INTEL_GEN(engine->i915) < 11)
517aaffe
CW
2794 regs[CTX_CONTEXT_CONTROL + 1] |=
2795 _MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT |
2796 CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT);
8670d6f9 2797
aaefa06a 2798err_unpin_ctx:
7d774cac 2799 i915_gem_object_unpin_map(ctx_obj);
aaefa06a 2800 return ret;
8670d6f9
OM
2801}
2802
e2efd130 2803static int execlists_context_deferred_alloc(struct i915_gem_context *ctx,
1fc44d9b
CW
2804 struct intel_engine_cs *engine,
2805 struct intel_context *ce)
ede7d42b 2806{
8c857917 2807 struct drm_i915_gem_object *ctx_obj;
bf3783e5 2808 struct i915_vma *vma;
8c857917 2809 uint32_t context_size;
7e37f889 2810 struct intel_ring *ring;
a89d1f92 2811 struct i915_timeline *timeline;
8c857917
OM
2812 int ret;
2813
1d2a19c2
CW
2814 if (ce->state)
2815 return 0;
ede7d42b 2816
63ffbcda 2817 context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
8c857917 2818
0b29c75a
MT
2819 /*
2820 * Before the actual start of the context image, we insert a few pages
2821 * for our own use and for sharing with the GuC.
2822 */
2823 context_size += LRC_HEADER_PAGES * PAGE_SIZE;
d1675198 2824
12d79d78 2825 ctx_obj = i915_gem_object_create(ctx->i915, context_size);
467d3578
CW
2826 if (IS_ERR(ctx_obj))
2827 return PTR_ERR(ctx_obj);
8c857917 2828
82ad6443 2829 vma = i915_vma_instance(ctx_obj, &ctx->i915->ggtt.vm, NULL);
bf3783e5
CW
2830 if (IS_ERR(vma)) {
2831 ret = PTR_ERR(vma);
2832 goto error_deref_obj;
2833 }
2834
a89d1f92
CW
2835 timeline = i915_timeline_create(ctx->i915, ctx->name);
2836 if (IS_ERR(timeline)) {
2837 ret = PTR_ERR(timeline);
2838 goto error_deref_obj;
2839 }
2840
2841 ring = intel_engine_create_ring(engine, timeline, ctx->ring_size);
2842 i915_timeline_put(timeline);
dca33ecc
CW
2843 if (IS_ERR(ring)) {
2844 ret = PTR_ERR(ring);
e84fe803 2845 goto error_deref_obj;
8670d6f9
OM
2846 }
2847
dca33ecc 2848 ret = populate_lr_context(ctx, ctx_obj, engine, ring);
8670d6f9
OM
2849 if (ret) {
2850 DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret);
dca33ecc 2851 goto error_ring_free;
84c2377f
OM
2852 }
2853
dca33ecc 2854 ce->ring = ring;
bf3783e5 2855 ce->state = vma;
ede7d42b
OM
2856
2857 return 0;
8670d6f9 2858
dca33ecc 2859error_ring_free:
7e37f889 2860 intel_ring_free(ring);
e84fe803 2861error_deref_obj:
f8c417cd 2862 i915_gem_object_put(ctx_obj);
8670d6f9 2863 return ret;
ede7d42b 2864}
3e5b6f05 2865
dee60ca1 2866void intel_lr_context_resume(struct drm_i915_private *i915)
3e5b6f05 2867{
e2f80391 2868 struct intel_engine_cs *engine;
bafb2f7d 2869 struct i915_gem_context *ctx;
3b3f1650 2870 enum intel_engine_id id;
bafb2f7d 2871
dee60ca1
CW
2872 /*
2873 * Because we emit WA_TAIL_DWORDS there may be a disparity
bafb2f7d
CW
2874 * between our bookkeeping in ce->ring->head and ce->ring->tail and
2875 * that stored in context. As we only write new commands from
2876 * ce->ring->tail onwards, everything before that is junk. If the GPU
2877 * starts reading from its RING_HEAD from the context, it may try to
2878 * execute that junk and die.
2879 *
2880 * So to avoid that we reset the context images upon resume. For
2881 * simplicity, we just zero everything out.
2882 */
dee60ca1
CW
2883 list_for_each_entry(ctx, &i915->contexts.list, link) {
2884 for_each_engine(engine, i915, id) {
ab82a063
CW
2885 struct intel_context *ce =
2886 to_intel_context(ctx, engine);
3e5b6f05 2887
bafb2f7d
CW
2888 if (!ce->state)
2889 continue;
7d774cac 2890
dee60ca1 2891 intel_ring_reset(ce->ring, 0);
3e5b6f05 2892
dee60ca1
CW
2893 if (ce->pin_count) { /* otherwise done in context_pin */
2894 u32 *regs = ce->lrc_reg_state;
3e5b6f05 2895
dee60ca1
CW
2896 regs[CTX_RING_HEAD + 1] = ce->ring->head;
2897 regs[CTX_RING_TAIL + 1] = ce->ring->tail;
2898 }
bafb2f7d 2899 }
3e5b6f05
TD
2900 }
2901}
2c66555e
CW
2902
2903#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
2904#include "selftests/intel_lrc.c"
2905#endif